Adding upstream version 16.2.11+ds.upstream/16.2.11+ds upstream

Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
author: Daniel Baumann <daniel.baumann@progress-linux.org> 2024-04-07 18:45:59 +0000
committer: Daniel Baumann <daniel.baumann@progress-linux.org> 2024-04-07 18:45:59 +0000
commit: 19fcec84d8d7d21e796c7624e521b60d28ee21ed (patch)
tree: 42d26aa27d1e3f7c0b8bd3fd14e7d7082f5008dc /src/mds
parent: Initial commit. (diff)
download: ceph-upstream.tar.xz
ceph-upstream.zip
125 files changed, 90903 insertions, 0 deletions
diff --git a/src/mds/Anchor.cc b/src/mds/Anchor.cc
new file mode 100644
index 000000000..609b91188
--- /dev/null
+++ b/src/mds/Anchor.cc
@@ -0,0 +1,63 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2018 Red Hat
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+
+#include "mds/Anchor.h"
+
+#include "common/Formatter.h"
+
+// Serialize this anchor into bl between ENCODE_START(2, 1, ...) and
+// ENCODE_FINISH. Fields are written in the order ino, dirino, d_name,
+// d_type, frags; decode() reads them back in the same order.
+// omap_idx is not written.
+void Anchor::encode(bufferlist &bl) const
+{
+  ENCODE_START(2, 1, bl);
+  encode(ino, bl);
+  encode(dirino, bl);
+  encode(d_name, bl);
+  encode(d_type, bl);
+  encode(frags, bl);
+  ENCODE_FINISH(bl);
+}
+
+// Deserialize an anchor from bl, reading fields in the order encode()
+// writes them. frags is only read when the encoded struct version is 2 or
+// later; for older encodings it is left untouched.
+void Anchor::decode(bufferlist::const_iterator &bl)
+{
+  DECODE_START(2, bl);
+  decode(ino, bl);
+  decode(dirino, bl);
+  decode(d_name, bl);
+  decode(d_type, bl);
+  if (struct_v >= 2)
+    decode(frags, bl);
+  DECODE_FINISH(bl);
+}
+
+// Emit ino, dirino, d_name and d_type through the given Formatter.
+// NOTE(review): frags and omap_idx are not dumped here — confirm whether
+// that omission is intentional.
+void Anchor::dump(Formatter *f) const
+{
+  f->dump_unsigned("ino", ino);
+  f->dump_unsigned("dirino", dirino);
+  f->dump_string("d_name", d_name);
+  f->dump_unsigned("d_type", d_type);
+}
+
+// Append two heap-allocated sample anchors to ls: a default-constructed one
+// and one with ino, dirino, d_name and d_type filled in. The caller receives
+// the raw pointers.
+void Anchor::generate_test_instances(std::list<Anchor*>& ls)
+{
+  // First sample: all defaults.
+  ls.push_back(new Anchor);
+
+  // Second sample: a populated directory anchor.
+  Anchor *populated = new Anchor;
+  ls.push_back(populated);
+  populated->ino = 1;
+  populated->dirino = 2;
+  populated->d_name = "hello";
+  populated->d_type = DT_DIR;
+}
+
+// Print a compact one-line description of an anchor:
+//   a(<ino> <dirino>/'<d_name>' <d_type>)
+// d_type is declared as __u8; it is converted to unsigned before streaming so
+// it is printed as a number (as dump() does via dump_unsigned) instead of
+// being written as a single raw character.
+ostream& operator<<(ostream& out, const Anchor &a)
+{
+  return out << "a(" << a.ino << " " << a.dirino << "/'" << a.d_name << "' "
+             << static_cast<unsigned>(a.d_type) << ")";
+}
diff --git a/src/mds/Anchor.h b/src/mds/Anchor.h
new file mode 100644
index 000000000..9e55ec807
--- /dev/null
+++ b/src/mds/Anchor.h
@@ -0,0 +1,73 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2018 Red Hat
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+
+#ifndef CEPH_ANCHOR_H
+#define CEPH_ANCHOR_H
+
+#include <string>
+
+#include "include/types.h"
+#include "mdstypes.h"
+#include "include/buffer.h"
+
+/*
+ * Anchor represents primary linkage of an inode. When adding inode to an
+ * anchor table, MDS ensures that the table also contains inode's ancestor
+ * inodes. MDS can get inode's path by looking up anchor table recursively.
+ *
+ * encode()/decode() serialize ino, dirino, d_name, d_type and frags;
+ * omap_idx is not part of the encoding and is not compared by operator==.
+ */
+class Anchor {
+public:
+  Anchor() {}
+  // Build an anchor from an inode number, the directory inode number, the
+  // dentry name and the dentry type. frags and omap_idx keep their defaults.
+  Anchor(inodeno_t i, inodeno_t di, std::string_view str, __u8 tp) :
+    ino(i), dirino(di), d_name(str), d_type(tp) {}
+
+  void encode(bufferlist &bl) const;
+  void decode(bufferlist::const_iterator &bl);
+  void dump(Formatter *f) const;
+  static void generate_test_instances(std::list<Anchor*>& ls);
+  // Equality over the encoded fields only (omap_idx is ignored).
+  bool operator==(const Anchor &r) const {
+    return ino == r.ino && dirino == r.dirino &&
+	   d_name == r.d_name && d_type == r.d_type &&
+	   frags == r.frags;
+  }
+
+  inodeno_t ino;	// anchored ino
+  inodeno_t dirino;	// presumably the inode of the containing directory — TODO confirm
+  std::string d_name;	// dentry name
+  __u8 d_type = 0;	// dentry type (e.g. DT_DIR)
+  std::set<frag_t> frags;	// encoded only from struct version 2 on
+
+  int omap_idx = -1;	// stored in which omap object
+};
+WRITE_CLASS_ENCODER(Anchor)
+
+// An Anchor extended with a hint about which MDS rank is authoritative.
+class RecoveredAnchor : public Anchor {
+public:
+  RecoveredAnchor() = default;
+
+  // auth hint; MDS_RANK_NONE until set
+  mds_rank_t auth = MDS_RANK_NONE;
+};
+
+// An Anchor extended with a reference count. nref is mutable, so it can be
+// adjusted through a const OpenedAnchor.
+class OpenedAnchor : public Anchor {
+public:
+  // Same arguments as the Anchor constructor, plus the initial value of nref.
+  OpenedAnchor(inodeno_t i, inodeno_t di, std::string_view str, __u8 tp, int nr) :
+      Anchor(i, di, str, tp),
+      nref(nr)
+  {}
+
+  mutable int nref = 0; // how many children
+};
+
+ostream& operator<<(ostream& out, const Anchor &a);
+#endif
diff --git a/src/mds/BatchOp.cc b/src/mds/BatchOp.cc
new file mode 100644
index 000000000..c2152a69b
--- /dev/null
+++ b/src/mds/BatchOp.cc
@@ -0,0 +1,35 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2019 Red Hat
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+
+#include "common/debug.h"
+#define dout_context g_ceph_context
+#define dout_subsys ceph_subsys_mds
+
+#include "BatchOp.h"
+
+// Log (at debug level 20) that the batch is being forwarded to target,
+// appending the batch's own print() output to the same log entry, then
+// delegate to the subclass hook _forward().
+void BatchOp::forward(mds_rank_t target)
+{
+  dout(20) << __func__ << ": forwarding batch ops to " << target << ": ";
+  // print() writes into the log entry opened by dout() above; dendl ends it.
+  print(*_dout);
+  *_dout << dendl;
+  _forward(target);
+}
+
+// Log (at debug level 20) that the batch is being answered with result r,
+// appending the batch's own print() output to the same log entry, then
+// delegate to the subclass hook _respond().
+void BatchOp::respond(int r)
+{
+  dout(20) << __func__ << ": responding to batch ops with result=" << r << ": ";
+  // print() writes into the log entry opened by dout() above; dendl ends it.
+  print(*_dout);
+  *_dout << dendl;
+  _respond(r);
+}
diff --git a/src/mds/BatchOp.h b/src/mds/BatchOp.h
new file mode 100644
index 000000000..bc4e21bce
--- /dev/null
+++ b/src/mds/BatchOp.h
@@ -0,0 +1,40 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2012 Red Hat
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+
+
+#ifndef MDS_BATCHOP_H
+#define MDS_BATCHOP_H
+
+#include "common/ref.h"
+
+#include "mdstypes.h"
+
+/**
+ * Abstract interface for a batch of MDS requests.
+ *
+ * The public forward()/respond() entry points log the batch (using print())
+ * and then call the protected _forward()/_respond() hooks that subclasses
+ * implement.
+ */
+class BatchOp {
+public:
+  virtual ~BatchOp() {}
+
+  // Presumably adds another request to this batch — TODO confirm with the
+  // implementing subclasses.
+  virtual void add_request(const ceph::ref_t<class MDRequestImpl>& mdr) = 0;
+  // Presumably selects a request to take over as head of the batch — TODO
+  // confirm with the implementing subclasses.
+  virtual ceph::ref_t<class MDRequestImpl> find_new_head() = 0;
+
+  // Write a description of the batch to the stream; used by forward() and
+  // respond() for logging.
+  virtual void print(std::ostream&) = 0;
+
+  // Log, then forward the batch to the given rank via _forward().
+  void forward(mds_rank_t target);
+  // Log, then respond to the batch with result r via _respond().
+  void respond(int r);
+
+protected:
+  virtual void _forward(mds_rank_t) = 0;
+  // Takes the integer result passed to respond(), not an MDS rank.
+  virtual void _respond(int r) = 0;
+};
+
+#endif
diff --git a/src/mds/Beacon.cc b/src/mds/Beacon.cc
new file mode 100644
index 000000000..36ed489a3
--- /dev/null
+++ b/src/mds/Beacon.cc
@@ -0,0 +1,488 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2012 Red Hat
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software 
+ * Foundation.  See file COPYING.
+ * 
+ */
+
+
+#include "common/dout.h"
+#include "common/likely.h"
+#include "common/HeartbeatMap.h"
+
+#include "include/stringify.h"
+#include "include/util.h"
+
+#include "mon/MonClient.h"
+#include "mds/MDLog.h"
+#include "mds/MDSRank.h"
+#include "mds/MDSMap.h"
+#include "mds/Locker.h"
+
+#include "Beacon.h"
+
+#include <chrono>
+
+#define dout_context g_ceph_context
+#define dout_subsys ceph_subsys_mds
+#undef dout_prefix
+#define dout_prefix *_dout << "mds.beacon." << name << ' '
+
+using namespace std::chrono_literals;
+
+// Construct a beacon for the daemon called `name`. The beacon interval is
+// read from the mds_beacon_interval config value and the compat set is taken
+// from MDSMap::get_compat_set_all(). No thread is started here; see init().
+Beacon::Beacon(CephContext *cct, MonClient *monc, std::string_view name)
+  :
+    Dispatcher(cct),
+    beacon_interval(g_conf()->mds_beacon_interval),
+    monc(monc),
+    name(name),
+    compat(MDSMap::get_compat_set_all())
+{
+}
+
+// Stops and joins the sender thread (if still running) via shutdown().
+Beacon::~Beacon()
+{
+  shutdown();
+}
+
+// Mark the beacon as finished and join the sender thread. Only the first
+// call does any work; later calls see `finished` already set and return.
+void Beacon::shutdown()
+{
+  std::unique_lock<std::mutex> lock(mutex);
+  if (finished) {
+    return;
+  }
+
+  finished = true;
+  // Release the mutex before joining so the sender thread can take it,
+  // observe `finished` and exit.
+  lock.unlock();
+  if (sender.joinable()) {
+    sender.join();
+  }
+}
+
+// Record the epoch of the given MDSMap and start the sender thread.
+//
+// The sender thread loops until `finished` is set. On each pass it sends a
+// beacon if at least 90% of beacon_interval has elapsed since last_send;
+// otherwise it only waits for the remainder of the interval. If _send()
+// declines to send, the next attempt is made after 0.5s.
+//
+// The thread waits on a condition variable local to the lambda that nothing
+// notifies, so it wakes on timeout (or spuriously); shutdown() may therefore
+// wait up to the current interval for the thread to notice `finished`.
+void Beacon::init(const MDSMap &mdsmap)
+{
+  std::unique_lock lock(mutex);
+
+  _notify_mdsmap(mdsmap);
+
+  sender = std::thread([this]() {
+    std::unique_lock<std::mutex> lock(mutex);
+    std::condition_variable c; // no one wakes us
+    while (!finished) {
+      auto now = clock::now();
+      // seconds since the last beacon was sent
+      auto since = std::chrono::duration<double>(now-last_send).count();
+      auto interval = beacon_interval;
+      if (since >= interval*.90) {
+        if (!_send()) {
+          interval = 0.5; /* 500ms */
+        }
+      } else {
+        // not due yet: sleep only for what is left of the interval
+        interval -= since;
+      }
+      dout(20) << "sender thread waiting interval " << interval << "s" << dendl;
+      c.wait_for(lock, interval*1s);
+    }
+  });
+}
+
+// Only MSG_MDS_BEACON messages are eligible for fast dispatch.
+bool Beacon::ms_can_fast_dispatch2(const cref_t<Message>& m) const
+{
+  const auto msg_type = m->get_type();
+  return msg_type == MSG_MDS_BEACON;
+}
+
+// Fast-dispatch path: hand the message to ms_dispatch2() and assert that it
+// was handled.
+void Beacon::ms_fast_dispatch2(const ref_t<Message>& m)
+{
+  bool handled = ms_dispatch2(m);
+  ceph_assert(handled);
+}
+
+// Dispatch an incoming message. Returns true for every MSG_MDS_BEACON
+// message (consumed), false for anything else. A beacon message is only
+// processed when its connection's peer type is a monitor; beacons from other
+// peers are consumed without being processed.
+bool Beacon::ms_dispatch2(const ref_t<Message>& m)
+{
+  if (m->get_type() != MSG_MDS_BEACON) {
+    return false;
+  }
+
+  const bool from_mon =
+    m->get_connection()->get_peer_type() == CEPH_ENTITY_TYPE_MON;
+  if (from_mon) {
+    handle_mds_beacon(ref_cast<MMDSBeacon>(m));
+  }
+  return true;
+}
+
+
+/**
+ * Update lagginess state based on response from remote MDSMonitor
+ *
+ * The message handle is taken by const reference and is not explicitly
+ * released here.
+ *
+ * If the reply's seq matches a beacon we sent (an entry in seq_stamp), the
+ * send time of that beacon becomes last_acked_stamp, the laggy flag is
+ * cleared when the round trip is below mds_beacon_grace, all seq_stamp
+ * entries up to and including this seq are dropped, and waiters on cvar are
+ * notified. Replies with an unknown seq are logged and discarded.
+ */
+void Beacon::handle_mds_beacon(const cref_t<MMDSBeacon> &m)
+{
+  std::unique_lock lock(mutex);
+
+  version_t seq = m->get_seq();
+
+  // update last_acked_stamp
+  auto it = seq_stamp.find(seq);
+  if (it != seq_stamp.end()) {
+    auto now = clock::now();
+
+    last_acked_stamp = it->second;
+    // round-trip time in seconds: now minus the time this seq was sent
+    auto rtt = std::chrono::duration<double>(now - last_acked_stamp).count();
+
+    dout(5) << "received beacon reply " << ceph_mds_state_name(m->get_state()) << " seq " << m->get_seq() << " rtt " << rtt << dendl;
+
+    if (laggy && rtt < g_conf()->mds_beacon_grace) {
+      dout(0) << " MDS is no longer laggy" << dendl;
+      laggy = false;
+      last_laggy = now;
+    }
+
+    // clean up seq_stamp map
+    // (erases every entry from the oldest up to and including this seq)
+    seq_stamp.erase(seq_stamp.begin(), ++it);
+
+    // Wake a waiter up if present
+    cvar.notify_all();
+  } else {
+    dout(1) << "discarding unexpected beacon reply " << ceph_mds_state_name(m->get_state())
+	    << " seq " << m->get_seq() << " dne" << dendl;
+  }
+}
+
+
+// Take the beacon mutex and attempt to send a beacon now. The result of
+// _send() is ignored.
+void Beacon::send()
+{
+  std::unique_lock lock(mutex);
+  _send();
+}
+
+
+// Attempt to send a beacon, then block until no beacon with a sequence
+// number up to last_seq is still unacknowledged, or until 95% of `duration`
+// seconds has elapsed, whichever comes first.
+void Beacon::send_and_wait(const double duration)
+{
+  std::unique_lock lock(mutex);
+  _send();
+  const auto target_seq = last_seq;
+  dout(20) << __func__ << ": awaiting " << target_seq
+           << " for up to " << duration << "s" << dendl;
+
+  const auto began = clock::now();
+  // seq_stamp is ordered by seq, so its first entry is the oldest
+  // outstanding beacon.
+  while (!seq_stamp.empty() && seq_stamp.begin()->first <= target_seq) {
+    const auto current = clock::now();
+    const auto remaining =
+      duration*.95-std::chrono::duration<double>(current-began).count();
+    if (remaining < 0) {
+      break;
+    }
+    cvar.wait_for(lock, remaining*1s);
+  }
+}
+
+
+/**
+ * Call periodically, or when you have updated the desired state
+ *
+ * Builds and sends one MMDSBeacon to the monitors carrying the current
+ * want_state, epoch, health and compat set. Does not take the mutex itself;
+ * the callers in this file hold it.
+ *
+ * @return false if the beacon was skipped because the internal heartbeat
+ *         map is not healthy, true if a beacon was sent.
+ */
+bool Beacon::_send()
+{
+  auto now = clock::now();
+  auto since = std::chrono::duration<double>(now-last_acked_stamp).count();
+
+  if (!cct->get_heartbeat_map()->is_healthy()) {
+    /* If anything isn't progressing, let's avoid sending a beacon so that
+     * we will be considered laggy */
+    dout(0) << "Skipping beacon heartbeat to monitors (last acked " << since << "s ago); MDS internal heartbeat is not healthy!" << dendl;
+    return false;
+  }
+
+  ++last_seq;
+  dout(5) << "Sending beacon " << ceph_mds_state_name(want_state) << " seq " << last_seq << dendl;
+
+  // remember when this seq was sent so the reply can be matched to it
+  seq_stamp[last_seq] = now;
+
+  ceph_assert(want_state != MDSMap::STATE_NULL);
+  
+  auto beacon = make_message<MMDSBeacon>(
+      monc->get_fsid(), mds_gid_t(monc->get_global_id()),
+      name,
+      epoch,
+      want_state,
+      last_seq,
+      CEPH_FEATURES_SUPPORTED_DEFAULT);
+  beacon->set_health(health);
+  beacon->set_compat(compat);
+  beacon->set_fs(g_conf().get_val<std::string>("mds_join_fs"));
+  // piggyback the sys info on beacon msg
+  // (only done while the wanted state is STATE_BOOT)
+  if (want_state == MDSMap::STATE_BOOT) {
+    map<string, string> sys_info;
+    collect_sys_info(&sys_info, cct);
+    sys_info["addr"] = stringify(monc->get_myaddrs());
+    beacon->set_sys_info(sys_info);
+  }
+  monc->send_mon_message(beacon.detach());
+  last_send = now;
+  return true;
+}
+
+/**
+ * Call this when there is a new MDSMap available
+ *
+ * Takes the beacon mutex and records the map's epoch via _notify_mdsmap().
+ */
+void Beacon::notify_mdsmap(const MDSMap &mdsmap)
+{
+  std::unique_lock lock(mutex);
+
+  _notify_mdsmap(mdsmap);
+}
+
+// Record the epoch of the given map. The map's epoch is asserted to be no
+// older than the one already recorded. Does not take the mutex itself; the
+// callers in this file hold it.
+void Beacon::_notify_mdsmap(const MDSMap &mdsmap)
+{
+  const auto map_epoch = mdsmap.get_epoch();
+  ceph_assert(map_epoch >= epoch);
+
+  if (map_epoch >= epoch) {
+    epoch = map_epoch;
+  }
+}
+
+
+// Report whether more than mds_beacon_grace seconds have passed since the
+// send time of the last acknowledged beacon. When that is the case the laggy
+// flag is set (logging once on the transition). The flag is not cleared
+// here; handle_mds_beacon() does that.
+bool Beacon::is_laggy()
+{
+  std::unique_lock lock(mutex);
+
+  const auto current = clock::now();
+  const auto elapsed =
+    std::chrono::duration<double>(current-last_acked_stamp).count();
+  if (!(elapsed > g_conf()->mds_beacon_grace)) {
+    return false;
+  }
+
+  if (!laggy) {
+    dout(1) << "MDS connection to Monitors appears to be laggy; " << elapsed
+            << "s since last acked beacon" << dendl;
+    laggy = true;
+  }
+  return true;
+}
+
+// Record the map's epoch and the new wanted daemon state under one lock
+// acquisition. A state change is logged at debug level 5; setting the same
+// state again is a no-op apart from the epoch update.
+void Beacon::set_want_state(const MDSMap &mdsmap, MDSMap::DaemonState newstate)
+{
+  std::unique_lock lock(mutex);
+
+  // The epoch and want_state are updated together so that a beacon carrying
+  // the new want_state also carries the latest epoch, and so that once we
+  // are on the latest epoch we never send a want_state from before that
+  // MDSMap was handled.
+  _notify_mdsmap(mdsmap);
+
+  if (want_state == newstate) {
+    return;
+  }
+
+  dout(5) << __func__ << ": "
+          << ceph_mds_state_name(want_state) << " -> "
+          << ceph_mds_state_name(newstate) << dendl;
+  want_state = newstate;
+}
+
+
+/**
+ * We are 'shown' an MDS briefly in order to update
+ * some health metrics that we will send in the next
+ * beacon.
+ *
+ * The existing health.metrics list is cleared and rebuilt from scratch.
+ * Nothing is changed when mds is null. The caller must hold mds->mds_lock
+ * (asserted below).
+ */
+void Beacon::notify_health(MDSRank const *mds)
+{
+  std::unique_lock lock(mutex);
+  if (!mds) {
+    // No MDS rank held
+    return;
+  }
+
+  // I'm going to touch this MDS, so it must be locked
+  ceph_assert(ceph_mutex_is_locked_by_me(mds->mds_lock));
+
+  health.metrics.clear();
+
+  // Optional dummy error metric, controlled by mds_inject_health_dummy
+  if (unlikely(g_conf().get_val<bool>("mds_inject_health_dummy"))) {
+    MDSHealthMetric m(MDS_HEALTH_DUMMY, HEALTH_ERR, std::string("dummy"));
+    health.metrics.push_back(m);
+  }
+
+  // Detect presence of entries in DamageTable
+  if (!mds->damage_table.empty()) {
+    MDSHealthMetric m(MDS_HEALTH_DAMAGE, HEALTH_ERR, std::string(
+          "Metadata damage detected"));
+    health.metrics.push_back(m);
+  }
+
+  // Detect MDS_HEALTH_TRIM condition
+  // Indicates MDS is not trimming promptly
+  // (segment count above mds_log_max_segments * mds_log_warn_factor)
+  {
+    if (mds->mdlog->get_num_segments() > (size_t)(g_conf()->mds_log_max_segments * g_conf().get_val<double>("mds_log_warn_factor"))) {
+      CachedStackStringStream css;
+      *css << "Behind on trimming (" << mds->mdlog->get_num_segments()
+        << "/" << g_conf()->mds_log_max_segments << ")";
+
+      MDSHealthMetric m(MDS_HEALTH_TRIM, HEALTH_WARN, css->strv());
+      m.metadata["num_segments"] = stringify(mds->mdlog->get_num_segments());
+      m.metadata["max_segments"] = stringify(g_conf()->mds_log_max_segments);
+      health.metrics.push_back(m);
+    }
+  }
+
+  // Detect clients failing to respond to modifications to capabilities in
+  // CLIENT_CAPS messages.
+  {
+    auto&& late_clients = mds->locker->get_late_revoking_clients(mds->mdsmap->get_session_timeout());
+    std::vector<MDSHealthMetric> late_cap_metrics;
+
+    for (const auto& client : late_clients) {
+      // client_t is equivalent to session.info.inst.name.num
+      // Construct an entity_name_t to lookup into SessionMap
+      entity_name_t ename(CEPH_ENTITY_TYPE_CLIENT, client.v);
+      Session const *s = mds->sessionmap.get_session(ename);
+      if (s == NULL) {
+        // Shouldn't happen, but not worth crashing if it does as this is
+        // just health-reporting code.
+        derr << "Client ID without session: " << client.v << dendl;
+        continue;
+      }
+
+      CachedStackStringStream css;
+      *css << "Client " << s->get_human_name() << " failing to respond to capability release";
+      MDSHealthMetric m(MDS_HEALTH_CLIENT_LATE_RELEASE, HEALTH_WARN, css->strv());
+      m.metadata["client_id"] = stringify(client.v);
+      late_cap_metrics.emplace_back(std::move(m));
+    }
+
+    // Report per-client metrics up to mds_health_summarize_threshold;
+    // beyond that, report a single summary metric instead.
+    if (late_cap_metrics.size() <= (size_t)g_conf()->mds_health_summarize_threshold) {
+      auto&& m = late_cap_metrics;
+      health.metrics.insert(std::end(health.metrics), std::cbegin(m), std::cend(m));
+    } else {
+      CachedStackStringStream css;
+      *css << "Many clients (" << late_cap_metrics.size()
+          << ") failing to respond to capability release";
+      MDSHealthMetric m(MDS_HEALTH_CLIENT_LATE_RELEASE_MANY, HEALTH_WARN, css->strv());
+      m.metadata["client_count"] = stringify(late_cap_metrics.size());
+      health.metrics.push_back(std::move(m));
+    }
+  }
+
+  // Detect clients failing to generate cap releases from CEPH_SESSION_RECALL_STATE
+  // messages. May be due to buggy client or resource-hogging application.
+  //
+  // Detect clients failing to advance their old_client_tid
+  {
+    set<Session*> sessions;
+    mds->sessionmap.get_client_session_set(sessions);
+
+    const auto min_caps_working_set = g_conf().get_val<uint64_t>("mds_min_caps_working_set");
+    const auto recall_warning_threshold = g_conf().get_val<Option::size_t>("mds_recall_warning_threshold");
+    const auto max_completed_requests = g_conf()->mds_max_completed_requests;
+    const auto max_completed_flushes = g_conf()->mds_max_completed_flushes;
+    std::vector<MDSHealthMetric> late_recall_metrics;
+    std::vector<MDSHealthMetric> large_completed_requests_metrics;
+    for (auto& session : sessions) {
+      const uint64_t num_caps = session->get_num_caps();
+      const uint64_t recall_caps = session->get_recall_caps();
+      // Warn only when both the recall counter and the cap count are above
+      // their configured thresholds.
+      if (recall_caps > recall_warning_threshold && num_caps > min_caps_working_set) {
+        dout(2) << "Session " << *session <<
+             " is not releasing caps fast enough. Recalled caps at " << recall_caps
+          << " > " << recall_warning_threshold << " (mds_recall_warning_threshold)." << dendl;
+        CachedStackStringStream css;
+        *css << "Client " << session->get_human_name() << " failing to respond to cache pressure";
+        MDSHealthMetric m(MDS_HEALTH_CLIENT_RECALL, HEALTH_WARN, css->strv());
+        m.metadata["client_id"] = stringify(session->get_client());
+        late_recall_metrics.emplace_back(std::move(m));
+      }
+      // Warn when the session has trim warnings and its completed
+      // request (or flush) count has reached the configured maximum.
+      if ((session->get_num_trim_requests_warnings() > 0 &&
+	   session->get_num_completed_requests() >= max_completed_requests) ||
+	  (session->get_num_trim_flushes_warnings() > 0 &&
+	   session->get_num_completed_flushes() >= max_completed_flushes)) {
+	CachedStackStringStream css;
+	*css << "Client " << session->get_human_name() << " failing to advance its oldest client/flush tid. ";
+	MDSHealthMetric m(MDS_HEALTH_CLIENT_OLDEST_TID, HEALTH_WARN, css->strv());
+	m.metadata["client_id"] = stringify(session->get_client());
+	large_completed_requests_metrics.emplace_back(std::move(m));
+      }
+    }
+
+    // Per-client metrics up to the summarize threshold, otherwise one summary.
+    if (late_recall_metrics.size() <= (size_t)g_conf()->mds_health_summarize_threshold) {
+      auto&& m = late_recall_metrics;
+      health.metrics.insert(std::end(health.metrics), std::cbegin(m), std::cend(m));
+    } else {
+      CachedStackStringStream css;
+      *css << "Many clients (" << late_recall_metrics.size()
+          << ") failing to respond to cache pressure";
+      MDSHealthMetric m(MDS_HEALTH_CLIENT_RECALL_MANY, HEALTH_WARN, css->strv());
+      m.metadata["client_count"] = stringify(late_recall_metrics.size());
+      health.metrics.push_back(m);
+      late_recall_metrics.clear();
+    }
+
+    // Per-client metrics up to the summarize threshold, otherwise one summary.
+    if (large_completed_requests_metrics.size() <= (size_t)g_conf()->mds_health_summarize_threshold) {
+      auto&& m = large_completed_requests_metrics;
+      health.metrics.insert(std::end(health.metrics), std::cbegin(m), std::cend(m));
+    } else {
+      CachedStackStringStream css;
+      *css << "Many clients (" << large_completed_requests_metrics.size()
+	<< ") failing to advance their oldest client/flush tid";
+      MDSHealthMetric m(MDS_HEALTH_CLIENT_OLDEST_TID_MANY, HEALTH_WARN, css->strv());
+      m.metadata["client_count"] = stringify(large_completed_requests_metrics.size());
+      health.metrics.push_back(m);
+      large_completed_requests_metrics.clear();
+    }
+  }
+
+  // Detect MDS_HEALTH_SLOW_REQUEST condition
+  {
+    int slow = mds->get_mds_slow_req_count();
+    if (slow) {
+      dout(20) << slow << " slow request found" << dendl;
+      CachedStackStringStream css;
+      *css << slow << " slow requests are blocked > " << g_conf()->mds_op_complaint_time << " secs";
+
+      MDSHealthMetric m(MDS_HEALTH_SLOW_REQUEST, HEALTH_WARN, css->strv());
+      health.metrics.push_back(m);
+    }
+  }
+
+  // Detect MDS_HEALTH_SLOW_METADATA_IO condition: metadata IOs in flight
+  // since before (now - osd_op_complaint_time)
+  {
+    auto complaint_time = g_conf()->osd_op_complaint_time;
+    auto now = clock::now();
+    auto cutoff = now - ceph::make_timespan(complaint_time);
+
+    std::string count;
+    ceph::coarse_mono_time oldest;
+    if (MDSIOContextBase::check_ios_in_flight(cutoff, count, oldest)) {
+      dout(20) << count << " slow metadata IOs found" << dendl;
+
+      auto oldest_secs = std::chrono::duration<double>(now - oldest).count();
+      CachedStackStringStream css;
+      *css << count << " slow metadata IOs are blocked > " << complaint_time
+	  << " secs, oldest blocked for " << (int64_t)oldest_secs << " secs";
+
+      MDSHealthMetric m(MDS_HEALTH_SLOW_METADATA_IO, HEALTH_WARN, css->strv());
+      health.metrics.push_back(m);
+    }
+  }
+
+  // Report a health warning if we are readonly
+  if (mds->mdcache->is_readonly()) {
+    MDSHealthMetric m(MDS_HEALTH_READ_ONLY, HEALTH_WARN,
+                      "MDS in read-only mode");
+    health.metrics.push_back(m);
+  }
+
+  // Report if we have significantly exceeded our cache size limit
+  if (mds->mdcache->cache_overfull()) {
+    CachedStackStringStream css;
+    *css << "MDS cache is too large (" << bytes2str(mds->mdcache->cache_size())
+        << "/" << bytes2str(mds->mdcache->cache_limit_memory()) << "); "
+        << mds->mdcache->num_inodes_with_caps << " inodes in use by clients, "
+        << mds->mdcache->get_num_strays() << " stray files";
+
+    MDSHealthMetric m(MDS_HEALTH_CACHE_OVERSIZED, HEALTH_WARN, css->strv());
+    health.metrics.push_back(m);
+  }
+}
+
+// Return the currently wanted daemon state, read under the beacon mutex.
+MDSMap::DaemonState Beacon::get_want_state() const
+{
+  std::unique_lock lock(mutex);
+  return want_state;
+}
+
diff --git a/src/mds/Beacon.h b/src/mds/Beacon.h
new file mode 100644
index 000000000..8d3c33c8e
--- /dev/null
+++ b/src/mds/Beacon.h
@@ -0,0 +1,114 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2012 Red Hat
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software 
+ * Foundation.  See file COPYING.
+ * 
+ */
+
+
+#ifndef BEACON_STATE_H
+#define BEACON_STATE_H
+
+#include <mutex>
+#include <string_view>
+#include <thread>
+
+#include "include/types.h"
+#include "include/Context.h"
+#include "msg/Dispatcher.h"
+
+#include "messages/MMDSBeacon.h"
+
+class MonClient;
+class MDSRank;
+
+
+/**
+ * One of these per MDS.  Handle beacon logic in this separate class so
+ * that a busy MDS holding its own lock does not hold up sending beacon
+ * messages to the mon and cause false lagginess.
+ *
+ * So that we can continue to operate while the MDS is holding its own lock,
+ * we keep copies of the data needed to generate beacon messages.  The MDS is
+ * responsible for calling Beacon::notify_* when things change.
+ *
+ * The public methods other than the ms_handle_* stubs take `mutex`
+ * themselves; the private _notify_mdsmap() and _send() do not.
+ */
+class Beacon : public Dispatcher
+{
+public:
+  using clock = ceph::coarse_mono_clock;
+  using time = ceph::coarse_mono_time;
+
+  Beacon(CephContext *cct, MonClient *monc, std::string_view name);
+  ~Beacon() override;
+
+  // Record the map's epoch and start the sender thread.
+  void init(const MDSMap &mdsmap);
+  // Stop and join the sender thread; also called from the destructor.
+  void shutdown();
+
+  // Dispatcher interface. Only MSG_MDS_BEACON messages are handled; the
+  // connection event callbacks are no-ops.
+  bool ms_can_fast_dispatch_any() const override { return true; }
+  bool ms_can_fast_dispatch2(const cref_t<Message>& m) const override;
+  void ms_fast_dispatch2(const ref_t<Message>& m) override;
+  bool ms_dispatch2(const ref_t<Message> &m) override;
+  void ms_handle_connect(Connection *c) override {}
+  bool ms_handle_reset(Connection *c) override {return false;}
+  void ms_handle_remote_reset(Connection *c) override {}
+  bool ms_handle_refused(Connection *c) override {return false;}
+
+  // Record the epoch of a new MDSMap.
+  void notify_mdsmap(const MDSMap &mdsmap);
+  // Rebuild the health metrics that are copied into each beacon.
+  void notify_health(const MDSRank *mds);
+
+  // Process a beacon reply from the monitor.
+  void handle_mds_beacon(const cref_t<MMDSBeacon> &m);
+  // Attempt to send a beacon immediately.
+  void send();
+
+  void set_want_state(const MDSMap &mdsmap, MDSMap::DaemonState newstate);
+  MDSMap::DaemonState get_want_state() const;
+
+  /**
+   * Send a beacon, and block until the ack is received from the mon
+   * or `duration` seconds pass, whichever happens sooner.  Useful
+   * for emitting a last message on shutdown.
+   */
+  void send_and_wait(const double duration);
+
+  // True when the last acknowledged beacon is older than mds_beacon_grace.
+  bool is_laggy();
+  // Seconds elapsed since last_laggy (the time the laggy flag was last
+  // cleared; clock::zero() if it never was).
+  double last_cleared_laggy() const {
+    std::unique_lock lock(mutex);
+    return std::chrono::duration<double>(clock::now()-last_laggy).count();
+  }
+
+private:
+  // Variants of notify_mdsmap()/send() that do not take the mutex.
+  void _notify_mdsmap(const MDSMap &mdsmap);
+  bool _send();
+
+  mutable std::mutex mutex;
+  std::thread sender;              // periodic beacon sender, started by init()
+  std::condition_variable cvar;    // notified when a beacon reply is matched
+  time last_send = clock::zero();  // time of the last beacon actually sent
+  double beacon_interval = 5.0;    // seconds; overwritten from config in the constructor
+  bool finished = false;           // set by shutdown() to stop the sender thread
+  MonClient*    monc;
+
+  // Items we duplicate from the MDS to have access under our own lock
+  std::string name;
+  version_t epoch = 0;
+  CompatSet compat;
+  MDSMap::DaemonState want_state = MDSMap::STATE_BOOT;
+
+  // Internal beacon state
+  version_t last_seq = 0; // last seq sent to monitor
+  std::map<version_t,time>  seq_stamp;    // seq # -> time sent
+  time last_acked_stamp = clock::zero();  // last time we sent a beacon that got acked
+  bool laggy = false;
+  time last_laggy = clock::zero();        // when the laggy flag was last cleared
+
+  // Health status to be copied into each beacon message
+  MDSHealth health;
+};
+
+#endif // BEACON_STATE_H
diff --git a/src/mds/CDentry.cc b/src/mds/CDentry.cc
new file mode 100644
index 000000000..f0825d7fe
--- /dev/null
+++ b/src/mds/CDentry.cc
@@ -0,0 +1,645 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software 
+ * Foundation.  See file COPYING.
+ * 
+ */
+
+
+
+#include "CDentry.h"
+#include "CInode.h"
+#include "CDir.h"
+
+#include "MDSRank.h"
+#include "MDCache.h"
+#include "Locker.h"
+#include "LogSegment.h"
+
+#include "messages/MLock.h"
+
+#define dout_context g_ceph_context
+#define dout_subsys ceph_subsys_mds
+#undef dout_prefix
+#define dout_prefix *_dout << "mds." << dir->mdcache->mds->get_nodeid() << ".cache.den(" << dir->dirfrag() << " " << name << ") "
+
+
+// Write a log-line prefix for this dentry to out: the current time, the MDS
+// node id, and the containing directory's ino together with the dentry name.
+ostream& CDentry::print_db_line_prefix(ostream& out)
+{
+  out << ceph_clock_now();
+  out << " mds." << dir->mdcache->mds->get_nodeid();
+  out << ".cache.den(" << dir->ino() << " " << name << ") ";
+  return out;
+}
+
+// Definitions of CDentry's static LockType objects, one for the dentry lock
+// and one for the dentry version lock.
+LockType CDentry::lock_type(CEPH_LOCK_DN);
+LockType CDentry::versionlock_type(CEPH_LOCK_DVERSION);
+
+
+// CDentry
+
+// Print a bracketed one-line description of a dentry: its path, snap range,
+// auth/replica status, linkage kind, any lock that is not sync-and-unlocked,
+// versions, auth pins, linked inode number, state flags, pins, alternate
+// name (if any) and the object's address.
+ostream& operator<<(ostream& out, const CDentry& dn)
+{
+  filepath path;
+  dn.make_path(path);
+  
+  out << "[dentry " << path;
+  
+  // NOTE(review): the leading `true ||` makes this condition always true, so
+  // the [first,last] range is always printed.
+  if (true || dn.first != 0 || dn.last != CEPH_NOSNAP) {
+    out << " [" << dn.first << ",";
+    if (dn.last == CEPH_NOSNAP) 
+      out << "head";
+    else
+      out << dn.last;
+    out << ']';
+  }
+
+  // authoritative copy (with its replicas, if any) or replica (with the
+  // authority and replica nonce)
+  if (dn.is_auth()) {
+    out << " auth";
+    if (dn.is_replicated()) 
+      out << dn.get_replicas();
+  } else {
+    out << " rep@" << dn.authority();
+    out << "." << dn.get_replica_nonce();
+  }
+
+  if (dn.get_linkage()->is_null()) out << " NULL";
+  if (dn.get_linkage()->is_remote()) {
+    out << " REMOTE(";
+    out << dn.get_linkage()->get_remote_d_type_string();
+    out << ")";
+  }
+
+  // locks are only shown when they are not sync and unlocked
+  if (!dn.lock.is_sync_and_unlocked())
+    out << " " << dn.lock;
+  if (!dn.versionlock.is_sync_and_unlocked())
+    out << " " << dn.versionlock;
+
+  // the projected version is only shown when it differs from the version
+  if (dn.get_projected_version() != dn.get_version())
+    out << " pv=" << dn.get_projected_version();
+  out << " v=" << dn.get_version();
+
+  if (dn.get_num_auth_pins()) {
+    out << " ap=" << dn.get_num_auth_pins();
+#ifdef MDS_AUTHPIN_SET
+    dn.print_authpin_set(out);
+#endif
+  }
+
+  {
+    const CInode *inode = dn.get_linkage()->get_inode();
+    out << " ino=";
+     if (inode) {
+       out << inode->ino();
+     } else {
+       out << "(nil)";
+     }
+  }
+
+  out << " state=" << dn.get_state();
+  if (dn.is_new()) out << "|new";
+  if (dn.state_test(CDentry::STATE_BOTTOMLRU)) out << "|bottomlru";
+
+  if (dn.get_num_ref()) {
+    out << " |";
+    dn.print_pin_set(out);
+  }
+
+  if (dn.get_alternate_name().size()) {
+    out << " altname=" << binstrprint(dn.get_alternate_name(), 16);
+  }
+
+  // address of the CDentry object itself
+  out << " " << &dn;
+  out << "]";
+  return out;
+}
+
+
+// Order dentries lexicographically by (containing directory ino, name,
+// last).
+bool operator<(const CDentry& l, const CDentry& r)
+{
+  // Compare the directory inode numbers first.
+  if (l.get_dir()->ino() < r.get_dir()->ino()) {
+    return true;
+  }
+  if (!(l.get_dir()->ino() == r.get_dir()->ino())) {
+    return false;
+  }
+
+  // Same directory: compare names.
+  if (l.get_name() < r.get_name()) {
+    return true;
+  }
+  if (!(l.get_name() == r.get_name())) {
+    return false;
+  }
+
+  // Same directory and name: compare `last`.
+  return l.last < r.last;
+}
+
+
+// Write this dentry to out using operator<<(ostream&, const CDentry&).
+void CDentry::print(ostream& out)
+{
+  out << *this;
+}
+
+
+/*
+inodeno_t CDentry::get_ino()
+{
+  if (get_inode()) 
+    return get_inode()->ino();
+  return inodeno_t();
+}
+*/
+
+// A dentry's authority is that of its containing directory.
+mds_authority_t CDentry::authority() const
+{
+  return dir->authority();
+}
+
+
+// Register a waiter context for the given tag. Tags containing
+// WAIT_UNFREEZE or WAIT_SINGLEAUTH are registered on the containing
+// directory; everything else is registered on this object through
+// MDSCacheObject.
+void CDentry::add_waiter(uint64_t tag, MDSContext *c)
+{
+  const bool wait_on_dir = tag & (WAIT_UNFREEZE|WAIT_SINGLEAUTH);
+  if (wait_on_dir) {
+    dir->add_waiter(tag, c);
+  } else {
+    MDSCacheObject::add_waiter(tag, c);
+  }
+}
+
+
+// Obtain a new projected version from the containing directory's
+// pre_dirty(min), store it as this dentry's projected_version and return it.
+version_t CDentry::pre_dirty(version_t min)
+{
+  projected_version = dir->pre_dirty(min);
+  dout(10) << __func__ << " " << *this << dendl;
+  return projected_version;
+}
+
+
+// Flag this dentry dirty and, if a log segment is given, put it on that
+// segment's dirty-dentry list. On the clean -> dirty transition the dentry
+// also takes PIN_DIRTY, bumps the directory's dirty count, joins the
+// directory's dirty list, and a non-null log segment is asserted.
+void CDentry::_mark_dirty(LogSegment *ls)
+{
+  const bool newly_dirty = !state_test(STATE_DIRTY);
+  if (newly_dirty) {
+    // state+pin
+    state_set(STATE_DIRTY);
+    get(PIN_DIRTY);
+    dir->inc_num_dirty();
+    dir->dirty_dentries.push_back(&item_dir_dirty);
+    ceph_assert(ls);
+  }
+
+  if (!ls) {
+    return;
+  }
+  ls->dirty_dentries.push_back(&item_dirty);
+}
+
+// Set this dentry's version to pv (asserted not to exceed the projected
+// version), mark it dirty against log segment ls, and mark the containing
+// directory dirty as well.
+void CDentry::mark_dirty(version_t pv, LogSegment *ls) 
+{
+  dout(10) << __func__ << " " << *this << dendl;
+
+  // i now live in this new dir version
+  ceph_assert(pv <= projected_version);
+  version = pv;
+  _mark_dirty(ls);
+
+  // mark dir too
+  dir->mark_dirty(ls, pv);
+}
+
+
+// Undo the dirty marking: clear STATE_DIRTY and STATE_NEW, decrement the
+// directory's dirty count, remove this dentry from both dirty lists and drop
+// PIN_DIRTY. The dentry must currently be dirty (asserted).
+void CDentry::mark_clean() 
+{
+  dout(10) << __func__ << " " << *this << dendl;
+  ceph_assert(is_dirty());
+
+  // not always true for recalc_auth_bits during resolve finish
+  //assert(dir->get_version() == 0 || version <= dir->get_version());  // hmm?
+
+  state_clear(STATE_DIRTY|STATE_NEW);
+  dir->dec_num_dirty();
+
+  item_dir_dirty.remove_myself();
+  item_dirty.remove_myself();
+
+  put(PIN_DIRTY);
+}
+
+// Set STATE_NEW on this dentry (cleared again by mark_clean()).
+void CDentry::mark_new() 
+{
+  dout(10) << __func__ << " " << *this << dendl;
+  state_set(STATE_NEW);
+}
+
+// Build this dentry's path into s: the path string of the containing
+// directory's inode (or "???" when there is no directory), followed by "/"
+// and the dentry name.
+void CDentry::make_path_string(string& s, bool projected) const
+{
+  if (!dir) {
+    s = "???";
+  } else {
+    dir->inode->make_path_string(s, projected);
+  }
+
+  s += "/";
+  s.append(name.data(), name.length());
+}
+
+// Build this dentry's path into fp: the path of the containing directory's
+// inode with this dentry's name pushed on the end. Unlike
+// make_path_string(), a directory is required (asserted).
+void CDentry::make_path(filepath& fp, bool projected) const
+{
+  ceph_assert(dir);
+  dir->inode->make_path(fp, projected);
+  fp.push_dentry(get_name());
+}
+
+/*
+ * Attach inode 'in' to the remote linkage 'dnl'.  We only add ourselves to
+ * remote_parents when the linkage is active (no longer projected).  If the
+ * passed dnl is projected, that work is done later in pop_projected_linkage().
+ */
+void CDentry::link_remote(CDentry::linkage_t *dnl, CInode *in)
+{
+  ceph_assert(dnl->is_remote());
+  ceph_assert(in->ino() == dnl->get_remote_ino());  // 'in' must be the inode dnl refers to
+  dnl->inode = in;
+
+  if (dnl == &linkage)  // active (non-projected) linkage only
+    in->add_remote_parent(this);
+
+  // check for reintegration
+  dir->mdcache->eval_remote(this);
+}
+
+void CDentry::unlink_remote(CDentry::linkage_t *dnl)
+{
+  ceph_assert(dnl->is_remote());
+  ceph_assert(dnl->inode);  // must currently be linked to an inode
+  
+  if (dnl == &linkage)  // mirrors link_remote(): only the active linkage was registered as a remote parent
+    dnl->inode->remove_remote_parent(this);
+
+  dnl->inode = 0;  // remote_ino / remote_d_type are left untouched, so dnl stays remote
+}
+
+void CDentry::push_projected_linkage()
+{
+  _project_linkage();  // pushes a default-constructed linkage_t, i.e. a projected null linkage
+
+  if (is_auth()) {
+    CInode *diri = dir->inode;  // inode of the directory that contains this dentry
+    if (diri->is_stray())
+      diri->mdcache->notify_stray_removed();
+  }
+}
+
+
+void CDentry::push_projected_linkage(CInode *inode)
+{
+  // dirty rstat tracking is in the projected plane
+  bool dirty_rstat = inode->is_dirty_rstat();  // remembered so the flag can be restored below
+  if (dirty_rstat)
+    inode->clear_dirty_rstat();
+
+  _project_linkage()->inode = inode;  // new projected linkage with inode set and remote_ino == 0 (primary)
+  inode->push_projected_parent(this);
+
+  if (dirty_rstat)
+    inode->mark_dirty_rstat();  // re-mark once the projected parent has been pushed
+
+  if (is_auth()) {
+    CInode *diri = dir->inode;  // inode of the directory that contains this dentry
+    if (diri->is_stray())
+      diri->mdcache->notify_stray_created();
+  }
+}
+
+CDentry::linkage_t *CDentry::pop_projected_linkage()
+{
+  ceph_assert(projected.size());  // there must be something to pop
+  
+  linkage_t& n = projected.front();  // oldest projection is applied first
+
+  /*
+   * the idea here is that the link_remote_inode(), link_primary_inode(),
+   * etc. calls should make linkage identical to n (and we assert as
+   * much).
+   */
+
+  if (n.remote_ino) {
+    dir->link_remote_inode(this, n.remote_ino, n.remote_d_type);
+    if (n.inode) {
+      linkage.inode = n.inode;
+      linkage.inode->add_remote_parent(this);  // the work link_remote() skipped while n was only projected
+    }
+  } else {
+    if (n.inode) {
+      dir->link_primary_inode(this, n.inode);
+      n.inode->pop_projected_parent();
+    }
+  }
+
+  ceph_assert(n.inode == linkage.inode);
+  ceph_assert(n.remote_ino == linkage.remote_ino);
+  ceph_assert(n.remote_d_type == linkage.remote_d_type);
+
+  projected.pop_front();  // n refers to the popped element and must not be used past this point
+
+  return &linkage;
+}
+
+
+
+// ----------------------------
+// auth pins
+
+int CDentry::get_num_dir_auth_pins() const
+{
+  ceph_assert(!is_projected());
+  const linkage_t *dnl = get_linkage();  // our own pins, plus the inode's when we are its primary link
+  const int inode_pins = dnl->is_primary() ? dnl->get_inode()->get_num_auth_pins() : 0;
+  return auth_pins + inode_pins;
+}
+
+bool CDentry::can_auth_pin(int *err_ret) const
+{
+  ceph_assert(dir);
+  return dir->can_auth_pin(err_ret);  // the decision (and err_ret) comes entirely from the containing dirfrag
+}
+
+void CDentry::auth_pin(void *by)
+{
+  if (auth_pins == 0)  // the first auth pin also takes a PIN_AUTHPIN reference
+    get(PIN_AUTHPIN);
+  auth_pins++;
+
+#ifdef MDS_AUTHPIN_SET
+  auth_pin_set.insert(by);  // debug-only record of who holds pins
+#endif
+
+  dout(10) << "auth_pin by " << by << " on " << *this << " now " << auth_pins << dendl;
+
+  dir->adjust_nested_auth_pins(1, by);  // propagate the new pin to the containing dirfrag
+}
+
+void CDentry::auth_unpin(void *by)
+{
+  auth_pins--;
+
+#ifdef MDS_AUTHPIN_SET
+  {
+    auto it = auth_pin_set.find(by);
+    ceph_assert(it != auth_pin_set.end());  // 'by' must have pinned us earlier
+    auth_pin_set.erase(it);
+  }
+#endif
+
+  if (auth_pins == 0)  // last auth pin gone: drop the reference taken in auth_pin()
+    put(PIN_AUTHPIN);
+
+  dout(10) << "auth_unpin by " << by << " on " << *this << " now " << auth_pins << dendl;
+  ceph_assert(auth_pins >= 0);  // checked after the log line, so an underflow is logged before aborting
+
+  dir->adjust_nested_auth_pins(-1, by);  // propagate the release to the containing dirfrag
+}
+
+void CDentry::adjust_nested_auth_pins(int diradj, void *by)
+{
+  dir->adjust_nested_auth_pins(diradj, by);  // pure pass-through: the dentry keeps no nested count itself
+}
+
+bool CDentry::is_frozen() const
+{
+  return dir->is_frozen();  // a dentry is frozen exactly when its containing dirfrag is
+}
+
+bool CDentry::is_freezing() const
+{
+  return dir->is_freezing();  // a dentry is freezing exactly when its containing dirfrag is
+}
+
+// ----------------------------
+// locking
+
+void CDentry::set_object_info(MDSCacheObjectInfo &info)
+{
+  info.dirfrag = dir->dirfrag();  // which dirfrag we live in
+  info.dname = name;
+  info.snapid = last;  // 'last' is also the snapid used by key()
+}
+
+void CDentry::encode_lock_state(int type, bufferlist& bl)
+{
+  encode(first, bl);
+  // After 'first': a one-byte tag (1 = primary, 2 = remote) followed by
+  // the inode number.  A null linkage appends nothing at all.
+  if (linkage.is_null())
+    return;
+
+  char tag;
+  if (linkage.is_primary()) {
+    tag = 1;
+    encode(tag, bl);
+    encode(linkage.get_inode()->ino(), bl);
+  } else if (linkage.is_remote()) {
+    tag = 2;
+    encode(tag, bl);
+    encode(linkage.get_remote_ino(), bl);
+  } else {
+    ceph_abort();
+  }
+}
+
+void CDentry::decode_lock_state(int type, const bufferlist& bl)
+{  
+  auto p = bl.cbegin();
+
+  snapid_t newfirst;
+  decode(newfirst, p);
+
+  if (!is_auth() && newfirst != first) {  // only replicas adopt the sender's 'first'
+    dout(10) << __func__ << " first " << first << " -> " << newfirst << dendl;
+    ceph_assert(newfirst > first);  // 'first' may only move forward
+    first = newfirst;
+  }
+
+  if (p.end()) {
+    // nothing after 'first': the sender encoded a null linkage
+    ceph_assert(linkage.is_null());
+    return;
+  }
+
+  char c;  // tag written by encode_lock_state(): 1 = primary, 2 = remote
+  inodeno_t ino;
+  decode(c, p);
+
+  switch (c) {
+  case 1:
+  case 2:
+    decode(ino, p);  // consumed from the buffer but not otherwise used below
+    // newly linked?
+    if (linkage.is_null() && !is_auth()) {
+      // NOTE(review): "force trim from cache!" -- this branch only logs; no trim is performed here
+      dout(10) << __func__ << " replica dentry null -> non-null, must trim" << dendl;
+      //assert(get_num_ref() == 0);
+    } else {
+      // verify?  (nothing is checked at present)
+      
+    }
+    break;
+  default: 
+    ceph_abort();  // unknown tag
+  }
+}
+
+
+ClientLease *CDentry::add_client_lease(client_t c, Session *session) 
+{
+  // Return the client's existing lease if there is one (single map lookup).
+  auto it = client_lease_map.find(c);
+  if (it != client_lease_map.end())
+    return it->second;
+
+  dout(20) << __func__ << " client." << c << " on " << lock << dendl;
+  if (client_lease_map.empty()) {
+    // first lease on this dentry: pin ourselves and tell the lock
+    get(PIN_CLIENTLEASE);
+    lock.get_client_lease();
+  }
+  ClientLease *l = new ClientLease(c, this);  // owned by the map until remove_client_lease() deletes it
+  client_lease_map[c] = l;
+  l->seq = ++session->lease_seq;  // new leases take the session's next lease sequence number
+  return l;
+}
+
+void CDentry::remove_client_lease(ClientLease *l, Locker *locker) 
+{
+  ceph_assert(l->parent == this);  // the lease must belong to this dentry
+
+  bool gather = false;
+
+  dout(20) << __func__ << " client." << l->client << " on " << lock << dendl;
+
+  client_lease_map.erase(l->client);
+  l->item_lease.remove_myself();
+  l->item_session_lease.remove_myself();
+  delete l;  // l must not be touched past this point
+
+  if (client_lease_map.empty()) {  // that was the last lease: undo what add_client_lease() did for the first
+    gather = !lock.is_stable();  // sampled before the lock's lease ref is dropped
+    lock.put_client_lease();
+    put(PIN_CLIENTLEASE);
+  }
+
+  if (gather)
+    locker->eval_gather(&lock);
+}
+
+void CDentry::remove_client_leases(Locker *locker)
+{
+  while (!client_lease_map.empty())  // each call erases one map entry, so this terminates
+    remove_client_lease(client_lease_map.begin()->second, locker);
+}
+
+void CDentry::_put()
+{
+  if (get_num_ref() <= ((int)is_dirty() + 1)) {  // at most one ref besides the dirty state's contribution
+    CDentry::linkage_t *dnl = get_projected_linkage();
+    if (dnl->is_primary()) {
+      CInode *in = dnl->get_inode();
+      if (get_num_ref() == (int)is_dirty() + !!in->get_num_ref())  // NOTE(review): presumably "only the dirty pin and the inode pin remain" -- confirm
+	in->mdcache->maybe_eval_stray(in, true);
+    }
+  }
+}
+
+void CDentry::encode_remote(inodeno_t& ino, unsigned char d_type,
+                            std::string_view alternate_name,
+                            bufferlist &bl)
+{
+  bl.append('l');  // remote link marker; decode_remote() receives it as icode
+
+  // versioned payload: ino, d_type, alternate_name (only decoded for struct_v >= 2)
+  ENCODE_START(2, 1, bl);
+  encode(ino, bl);
+  encode(d_type, bl);
+  encode(alternate_name, bl);
+  ENCODE_FINISH(bl);
+}
+
+void CDentry::decode_remote(char icode, inodeno_t& ino, unsigned char& d_type,
+                            mempool::mds_co::string& alternate_name,
+                            ceph::buffer::list::const_iterator& bl)
+{
+  if (icode == 'l') {  // versioned form, as written by encode_remote()
+    DECODE_START(2, bl);
+    decode(ino, bl);
+    decode(d_type, bl);
+    if (struct_v >= 2)  // v1 payloads carry no alternate_name; the out-param is then left untouched
+      decode(alternate_name, bl);
+    DECODE_FINISH(bl);
+  } else if (icode == 'L') {  // bare ino + d_type with no version envelope
+    decode(ino, bl);
+    decode(d_type, bl);
+  } else ceph_assert(0);  // any other marker is not a remote link
+}
+
+void CDentry::dump(Formatter *f) const
+{
+  ceph_assert(f != NULL);
+
+  filepath path;
+  make_path(path);  // asserts that this dentry is attached to a dir
+
+  f->dump_string("path", path.get_path());
+  f->dump_unsigned("path_ino", path.get_ino().val);
+  f->dump_unsigned("snap_first", first);
+  f->dump_unsigned("snap_last", last);
+  
+  f->dump_bool("is_primary", get_linkage()->is_primary());  // active linkage, not the projected one
+  f->dump_bool("is_remote", get_linkage()->is_remote());
+  f->dump_bool("is_null", get_linkage()->is_null());
+  f->dump_bool("is_new", is_new());
+  if (get_linkage()->get_inode()) {
+    f->dump_unsigned("inode", get_linkage()->get_inode()->ino());
+  } else {
+    f->dump_unsigned("inode", 0);  // no inode pointer attached: report 0
+  }
+
+  if (linkage.is_remote()) {
+    f->dump_string("remote_type", linkage.get_remote_d_type_string());
+  } else {
+    f->dump_string("remote_type", "");  // key is always emitted; empty for non-remote linkages
+  }
+
+  f->dump_unsigned("version", get_version());
+  f->dump_unsigned("projected_version", get_projected_version());
+
+  f->dump_int("auth_pins", auth_pins);
+
+  MDSCacheObject::dump(f);  // base-class fields
+
+  f->open_object_section("lock");
+  lock.dump(f);
+  f->close_section();
+
+  f->open_object_section("versionlock");
+  versionlock.dump(f);
+  f->close_section();
+
+  f->open_array_section("states");
+  MDSCacheObject::dump_states(f);  // base-class states first, then the CDentry-specific ones below
+  if (state_test(STATE_NEW))
+    f->dump_string("state", "new");
+  if (state_test(STATE_FRAGMENTING))
+    f->dump_string("state", "fragmenting");
+  if (state_test(STATE_PURGING))
+    f->dump_string("state", "purging");
+  if (state_test(STATE_BADREMOTEINO))
+    f->dump_string("state", "badremoteino");
+  if (state_test(STATE_STRAY))
+    f->dump_string("state", "stray");
+  f->close_section();
+}
+
+std::string CDentry::linkage_t::get_remote_d_type_string() const
+{
+  switch (DTTOIF(remote_d_type)) {  // map the stored d_type to its S_IF* value, then to a short name
+    case S_IFSOCK: return "sock";
+    case S_IFLNK: return "lnk";
+    case S_IFREG: return "reg";
+    case S_IFBLK: return "blk";
+    case S_IFDIR: return "dir";
+    case S_IFCHR: return "chr";
+    case S_IFIFO: return "fifo";
+    default: ceph_abort(); return "";  // any other type aborts; the return presumably only satisfies the compiler
+  }
+}
+
+MEMPOOL_DEFINE_OBJECT_FACTORY(CDentry, co_dentry, mds_co);
diff --git a/src/mds/CDentry.h b/src/mds/CDentry.h
new file mode 100644
index 000000000..efd277192
--- /dev/null
+++ b/src/mds/CDentry.h
@@ -0,0 +1,403 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software 
+ * Foundation.  See file COPYING.
+ *
+ */
+
+#ifndef CEPH_CDENTRY_H
+#define CEPH_CDENTRY_H
+
+#include <string>
+#include <string_view>
+#include <set>
+
+#include "include/counter.h"
+#include "include/types.h"
+#include "include/buffer_fwd.h"
+#include "include/lru.h"
+#include "include/elist.h"
+#include "include/filepath.h"
+
+#include "BatchOp.h"
+#include "MDSCacheObject.h"
+#include "MDSContext.h"
+#include "Mutation.h"
+#include "SimpleLock.h"
+#include "LocalLockC.h"
+#include "ScrubHeader.h"
+
+class CInode;
+class CDir;
+class Locker;
+class CDentry;
+class LogSegment;
+
+class Session;
+
+// define an ordering
+bool operator<(const CDentry& l, const CDentry& r);
+
+// dentry
+class CDentry : public MDSCacheObject, public LRUObject, public Counter<CDentry> {
+public:
+  MEMPOOL_CLASS_HELPERS();
+  friend class CDir;
+
+  struct linkage_t {
+    CInode *inode = nullptr;
+    inodeno_t remote_ino = 0;  // non-zero marks the linkage as remote
+    unsigned char remote_d_type = 0;
+    
+    linkage_t() {}
+
+    // dentry type is primary || remote || null
+    // inode ptr is required for primary, optional for remote, and must be null for null
+    bool is_primary() const { return remote_ino == 0 && inode != 0; }
+    bool is_remote() const { return remote_ino > 0; }
+    bool is_null() const { return remote_ino == 0 && inode == 0; }
+
+    CInode *get_inode() { return inode; }
+    const CInode *get_inode() const { return inode; }
+    inodeno_t get_remote_ino() const { return remote_ino; }
+    unsigned char get_remote_d_type() const { return remote_d_type; }
+    std::string get_remote_d_type_string() const;
+
+    void set_remote(inodeno_t ino, unsigned char d_type) {
+      remote_ino = ino;
+      remote_d_type = d_type;
+      inode = 0;  // any previously attached inode pointer is dropped
+    }
+    void link_remote(CInode *in);
+  };
+
+
+  // -- state --
+  static const int STATE_NEW =          (1<<0);
+  static const int STATE_FRAGMENTING =  (1<<1);
+  static const int STATE_PURGING =      (1<<2);
+  static const int STATE_BADREMOTEINO = (1<<3);
+  static const int STATE_EVALUATINGSTRAY = (1<<4);
+  static const int STATE_PURGINGPINNED =  (1<<5);
+  static const int STATE_BOTTOMLRU =    (1<<6);
+  static const int STATE_UNLINKING =    (1<<7);
+  // stray dentry needs notification of releasing reference
+  static const int STATE_STRAY =	STATE_NOTIFYREF;
+  static const int MASK_STATE_IMPORT_KEPT = STATE_BOTTOMLRU;
+
+  // -- pins --
+  static const int PIN_INODEPIN =         1;  // linked inode is pinned
+  static const int PIN_FRAGMENTING =     -2;  // containing dir is refragmenting
+  static const int PIN_PURGING =          3;
+  static const int PIN_SCRUBPARENT =      4;
+  static const int PIN_WAITUNLINKSTATE  = 5;
+
+  static const unsigned EXPORT_NONCE = 1;
+
+  const static uint64_t WAIT_UNLINK_STATE  = (1<<0);
+  const static uint64_t WAIT_UNLINK_FINISH = (1<<1);
+  uint32_t replica_unlinking_ref = 0;
+
+  CDentry(std::string_view n, __u32 h,
+          mempool::mds_co::string alternate_name,
+	  snapid_t f, snapid_t l) :
+    hash(h),
+    first(f), last(l),
+    item_dirty(this),
+    lock(this, &lock_type),
+    versionlock(this, &versionlock_type),
+    name(n),
+    alternate_name(std::move(alternate_name))
+  {}
+  CDentry(std::string_view n, __u32 h,
+          mempool::mds_co::string alternate_name,
+          inodeno_t ino, unsigned char dt,
+	  snapid_t f, snapid_t l) :
+    hash(h),
+    first(f), last(l),
+    item_dirty(this),
+    lock(this, &lock_type),
+    versionlock(this, &versionlock_type),
+    name(n),
+    alternate_name(std::move(alternate_name))
+  {
+    linkage.remote_ino = ino;
+    linkage.remote_d_type = dt;
+  }
+
+  ~CDentry() override {
+    ceph_assert(batch_ops.empty());
+  }
+
+  std::string_view pin_name(int p) const override {
+    switch (p) {
+    case PIN_INODEPIN: return "inodepin";
+    case PIN_FRAGMENTING: return "fragmenting";
+    case PIN_PURGING: return "purging";
+    case PIN_SCRUBPARENT: return "scrubparent";
+    case PIN_WAITUNLINKSTATE: return "waitunlinkstate";
+    default: return generic_pin_name(p);
+    }
+  }
+
+  // -- wait --
+  //static const int WAIT_LOCK_OFFSET = 8;
+
+  void add_waiter(uint64_t tag, MDSContext *c) override;
+
+  bool is_lt(const MDSCacheObject *r) const override {
+    return *this < *static_cast<const CDentry*>(r);
+  }
+
+  dentry_key_t key() const {  // const: only reads last, name and hash, so const dentries can be keyed too
+    return dentry_key_t(last, name.c_str(), hash);
+  }
+
+  const CDir *get_dir() const { return dir; }
+  CDir *get_dir() { return dir; }
+  std::string_view get_name() const { return std::string_view(name); }
+  std::string_view get_alternate_name() const {
+    return std::string_view(alternate_name);
+  }
+  void set_alternate_name(mempool::mds_co::string altn) {
+    alternate_name = std::move(altn);
+  }
+  void set_alternate_name(std::string_view altn) {
+    alternate_name = mempool::mds_co::string(altn);
+  }
+
+  __u32 get_hash() const { return hash; }
+
+  // linkage
+  const linkage_t *get_linkage() const { return &linkage; }
+  linkage_t *get_linkage() { return &linkage; }
+
+  linkage_t *_project_linkage() {
+    projected.push_back(linkage_t());
+    return &projected.back();
+  }
+  void push_projected_linkage();
+  void push_projected_linkage(inodeno_t ino, char d_type) {
+    linkage_t *p = _project_linkage();
+    p->remote_ino = ino;
+    p->remote_d_type = d_type;
+  }
+  void push_projected_linkage(CInode *inode); 
+  linkage_t *pop_projected_linkage();
+
+  bool is_projected() const { return !projected.empty(); }
+
+  linkage_t *get_projected_linkage() {
+    if (!projected.empty())
+      return &projected.back();
+    return &linkage;
+  }
+
+  const linkage_t *get_projected_linkage() const {
+    if (!projected.empty())
+      return &projected.back();
+    return &linkage;
+  }
+
+  CInode *get_projected_inode() {
+    return get_projected_linkage()->inode;
+  }
+
+  bool use_projected(client_t client, const MutationRef& mut) const {
+    return lock.can_read_projected(client) || 
+      lock.get_xlock_by() == mut;
+  }
+  linkage_t *get_linkage(client_t client, const MutationRef& mut) {
+    return use_projected(client, mut) ? get_projected_linkage() : get_linkage();
+  }
+
+  // ref counts: pin ourselves in the LRU when we're pinned.
+  void first_get() override {
+    lru_pin();
+  }
+  void last_put() override {
+    lru_unpin();
+  }
+  void _put() override;
+
+  // auth pins
+  bool can_auth_pin(int *err_ret=nullptr) const override;
+  void auth_pin(void *by) override;
+  void auth_unpin(void *by) override;
+  void adjust_nested_auth_pins(int diradj, void *by);
+  bool is_frozen() const override;
+  bool is_freezing() const override;
+  int get_num_dir_auth_pins() const;
+  
+  // remote links
+  void link_remote(linkage_t *dnl, CInode *in);
+  void unlink_remote(linkage_t *dnl);
+  
+  // copy cons
+  CDentry(const CDentry& m);
+  const CDentry& operator= (const CDentry& right);
+
+  // misc
+  void make_path_string(std::string& s, bool projected=false) const;
+  void make_path(filepath& fp, bool projected=false) const;
+
+  // -- version --
+  version_t get_version() const { return version; }
+  void set_version(version_t v) { projected_version = version = v; }
+  version_t get_projected_version() const { return projected_version; }
+  void set_projected_version(version_t v) { projected_version = v; }
+  
+  mds_authority_t authority() const override;
+
+  version_t pre_dirty(version_t min=0);
+  void _mark_dirty(LogSegment *ls);
+  void mark_dirty(version_t pv, LogSegment *ls);
+  void mark_clean();
+
+  void mark_new();
+  bool is_new() const { return state_test(STATE_NEW); }
+  void clear_new() { state_clear(STATE_NEW); }
+  
+  // -- exporting
+  // note: this assumes the dentry already exists.  
+  // i.e., the name is already extracted... so we just need the other state.
+  void encode_export(ceph::buffer::list& bl) {
+    ENCODE_START(1, 1, bl);
+    encode(first, bl);  // field order must match decode_import()
+    encode(state, bl);
+    encode(version, bl);
+    encode(projected_version, bl);
+    encode(lock, bl);
+    encode(get_replicas(), bl);
+    get(PIN_TEMPEXPORTING);  // released by finish_export() or abort_export()
+    ENCODE_FINISH(bl);
+  }
+  void finish_export() {
+    // twiddle: turn ourselves into a non-auth copy
+    clear_replica_map();
+    replica_nonce = EXPORT_NONCE;
+    state_clear(CDentry::STATE_AUTH);
+    if (is_dirty())
+      mark_clean();  // mark_clean() asserts is_dirty(), hence the guard
+    put(PIN_TEMPEXPORTING);  // pairs with the get() in encode_export()
+  }
+  void abort_export() {
+    put(PIN_TEMPEXPORTING);  // the pin is the only state encode_export() changed, so dropping it is all that is needed
+  }
+  void decode_import(ceph::buffer::list::const_iterator& blp, LogSegment *ls) {
+    DECODE_START(1, blp);
+    decode(first, blp);  // field order must match encode_export()
+    __u32 nstate;  // exporter's state bits; only STATE_DIRTY is acted on below
+    decode(nstate, blp);
+    decode(version, blp);
+    decode(projected_version, blp);
+    decode(lock, blp);
+    decode(get_replicas(), blp);
+
+    // twiddle
+    state &= MASK_STATE_IMPORT_KEPT;  // of our local state, keep only STATE_BOTTOMLRU
+    state_set(CDentry::STATE_AUTH);
+    if (nstate & STATE_DIRTY)
+      _mark_dirty(ls);  // _mark_dirty() asserts ls != nullptr on the clean -> dirty transition
+    if (is_replicated())
+      get(PIN_REPLICATED);
+    replica_nonce = 0;
+    DECODE_FINISH(blp);
+  }
+
+  // -- locking --
+  SimpleLock* get_lock(int type) override {
+    ceph_assert(type == CEPH_LOCK_DN);
+    return &lock;
+  }
+  void set_object_info(MDSCacheObjectInfo &info) override;
+  void encode_lock_state(int type, ceph::buffer::list& bl) override;
+  void decode_lock_state(int type, const ceph::buffer::list& bl) override;
+
+  // ---------------------------------------------
+  // replicas (on clients)
+
+  bool is_any_leases() const {
+    return !client_lease_map.empty();
+  }
+  const ClientLease *get_client_lease(client_t c) const {
+    // one find() instead of count() + find(); nullptr when client c holds no lease
+    auto p = client_lease_map.find(c);
+    return p != client_lease_map.end() ? p->second : nullptr;
+  }
+  ClientLease *get_client_lease(client_t c) {
+    // one find() instead of count() + find(); nullptr when client c holds no lease
+    auto p = client_lease_map.find(c);
+    return p != client_lease_map.end() ? p->second : nullptr;
+  }
+  /**
+   * @return true iff get_client_lease(c) yields a lease for client c,
+   *         false when it yields a null pointer.
+   */
+  bool have_client_lease(client_t c) const {
+    return get_client_lease(c) != nullptr;
+  }
+
+  ClientLease *add_client_lease(client_t c, Session *session);
+  void remove_client_lease(ClientLease *r, Locker *locker);  // erases and deletes the lease (returns nothing); kicks locker->eval_gather() if the lock was unstable when the last lease went away
+  void remove_client_leases(Locker *locker);
+
+  std::ostream& print_db_line_prefix(std::ostream& out) override;
+  void print(std::ostream& out) override;
+  void dump(ceph::Formatter *f) const;
+
+  static void encode_remote(inodeno_t& ino, unsigned char d_type,
+                            std::string_view alternate_name,
+                            bufferlist &bl);
+  static void decode_remote(char icode, inodeno_t& ino, unsigned char& d_type,
+                            mempool::mds_co::string& alternate_name,
+                            ceph::buffer::list::const_iterator& bl);
+
+  __u32 hash;
+  snapid_t first, last;
+
+  elist<CDentry*>::item item_dirty, item_dir_dirty;
+  elist<CDentry*>::item item_stray;
+
+  // lock
+  static LockType lock_type;
+  static LockType versionlock_type;
+
+  SimpleLock lock; // FIXME referenced containers not in mempool
+  LocalLockC versionlock; // FIXME referenced containers not in mempool
+
+  mempool::mds_co::map<client_t,ClientLease*> client_lease_map;
+  std::map<int, std::unique_ptr<BatchOp>> batch_ops;
+
+
+protected:
+  friend class Migrator;
+  friend class Locker;
+  friend class MDCache;
+  friend class StrayManager;
+  friend class CInode;
+  friend class C_MDC_XlockRequest;
+
+  CDir *dir = nullptr;     // containing dirfrag
+  linkage_t linkage; /* durable */
+  mempool::mds_co::list<linkage_t> projected;
+
+  version_t version = 0;  // dir version when last touched.
+  version_t projected_version = 0;  // what it will be when i unlock/commit.
+
+private:
+  mempool::mds_co::string name;
+  mempool::mds_co::string alternate_name;
+};
+
+std::ostream& operator<<(std::ostream& out, const CDentry& dn);
+
+
+#endif
diff --git a/src/mds/CDir.cc b/src/mds/CDir.cc
new file mode 100644
index 000000000..e9b9c38d2
--- /dev/null
+++ b/src/mds/CDir.cc
@@ -0,0 +1,3676 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software 
+ * Foundation.  See file COPYING.
+ * 
+ */
+
+#include <string_view>
+#include <algorithm>
+
+#include "include/types.h"
+
+#include "CDir.h"
+#include "CDentry.h"
+#include "CInode.h"
+#include "Mutation.h"
+
+#include "MDSMap.h"
+#include "MDSRank.h"
+#include "MDCache.h"
+#include "Locker.h"
+#include "MDLog.h"
+#include "LogSegment.h"
+#include "MDBalancer.h"
+
+#include "common/bloom_filter.hpp"
+#include "include/Context.h"
+#include "common/Clock.h"
+
+#include "osdc/Objecter.h"
+
+#include "common/config.h"
+#include "include/ceph_assert.h"
+#include "include/compat.h"
+
+#define dout_context g_ceph_context
+#define dout_subsys ceph_subsys_mds
+#undef dout_prefix
+#define dout_prefix *_dout << "mds." << mdcache->mds->get_nodeid() << ".cache.dir(" << this->dirfrag() << ") "
+
+int CDir::num_frozen_trees = 0;
+int CDir::num_freezing_trees = 0;
+
+CDir::fnode_const_ptr CDir::empty_fnode = CDir::allocate_fnode();
+
+class CDirContext : public MDSContext
+{
+protected:
+  CDir *dir;  // dirfrag this context acts on; never null (asserted in the ctor)
+  MDSRank* get_mds() override {return dir->mdcache->mds;}  // the rank is reached through the dir's cache
+
+public:
+  explicit CDirContext(CDir *d) : dir(d) {
+    ceph_assert(dir != NULL);
+  }
+};
+
+
+class CDirIOContext : public MDSIOContextBase
+{
+protected:
+  CDir *dir;  // dirfrag this I/O context acts on; never null (asserted in the ctor)
+  MDSRank* get_mds() override {return dir->mdcache->mds;}  // the rank is reached through the dir's cache
+
+public:
+  explicit CDirIOContext(CDir *d) : dir(d) {
+    ceph_assert(dir != NULL);
+  }
+};
+
+
+// PINS
+//int cdir_pins[CDIR_NUM_PINS] = { 0,0,0,0,0,0,0,0,0,0,0,0,0,0 };
+
+
+ostream& operator<<(ostream& out, const CDir& dir)
+{
+  out << "[dir " << dir.dirfrag() << " " << dir.get_path() << "/"
+      << " [" << dir.first << ",head]";
+  if (dir.is_auth()) {  // auth copy: replicas and version information
+    out << " auth";
+    if (dir.is_replicated())
+      out << dir.get_replicas();
+
+    if (dir.is_projected())
+      out << " pv=" << dir.get_projected_version();
+    out << " v=" << dir.get_version();
+    out << " cv=" << dir.get_committing_version();
+    out << "/" << dir.get_committed_version();
+  } else {  // replica: who the authority is, plus our replica nonce
+    mds_authority_t a = dir.authority();
+    out << " rep@" << a.first;
+    if (a.second != CDIR_AUTH_UNKNOWN)
+      out << "," << a.second;
+    out << "." << dir.get_replica_nonce();
+  }
+
+  if (dir.is_rep()) out << " REP";
+
+  if (dir.get_dir_auth() != CDIR_AUTH_DEFAULT) {
+    if (dir.get_dir_auth().second == CDIR_AUTH_UNKNOWN)
+      out << " dir_auth=" << dir.get_dir_auth().first;  // second member unknown: print the first only
+    else
+      out << " dir_auth=" << dir.get_dir_auth();
+  }
+  
+  if (dir.get_auth_pins() || dir.get_dir_auth_pins()) {
+    out << " ap=" << dir.get_auth_pins() 
+	<< "+" << dir.get_dir_auth_pins();
+#ifdef MDS_AUTHPIN_SET
+    dir.print_authpin_set(out);
+#endif
+  }
+
+  out << " state=" << dir.get_state();  // raw state value, then one "|name" per recognised flag
+  if (dir.state_test(CDir::STATE_COMPLETE)) out << "|complete";
+  if (dir.state_test(CDir::STATE_FREEZINGTREE)) out << "|freezingtree";
+  if (dir.state_test(CDir::STATE_FROZENTREE)) out << "|frozentree";
+  if (dir.state_test(CDir::STATE_AUXSUBTREE)) out << "|auxsubtree";
+  if (dir.state_test(CDir::STATE_FROZENDIR)) out << "|frozendir";
+  if (dir.state_test(CDir::STATE_FREEZINGDIR)) out << "|freezingdir";
+  if (dir.state_test(CDir::STATE_EXPORTBOUND)) out << "|exportbound";
+  if (dir.state_test(CDir::STATE_IMPORTBOUND)) out << "|importbound";
+  if (dir.state_test(CDir::STATE_BADFRAG)) out << "|badfrag";
+  if (dir.state_test(CDir::STATE_FRAGMENTING)) out << "|fragmenting";
+  if (dir.state_test(CDir::STATE_CREATING)) out << "|creating";
+  if (dir.state_test(CDir::STATE_COMMITTING)) out << "|committing";
+  if (dir.state_test(CDir::STATE_FETCHING)) out << "|fetching";
+  if (dir.state_test(CDir::STATE_EXPORTING)) out << "|exporting";
+  if (dir.state_test(CDir::STATE_IMPORTING)) out << "|importing";
+  if (dir.state_test(CDir::STATE_STICKY)) out << "|sticky";
+  if (dir.state_test(CDir::STATE_DNPINNEDFRAG)) out << "|dnpinnedfrag";
+  if (dir.state_test(CDir::STATE_ASSIMRSTAT)) out << "|assimrstat";
+
+  // fragstat: "/accounted" is appended only when it differs from the current value
+  out << " " << dir.get_fnode()->fragstat;
+  if (!(dir.get_fnode()->fragstat == dir.get_fnode()->accounted_fragstat))
+    out << "/" << dir.get_fnode()->accounted_fragstat;
+  if (g_conf()->mds_debug_scatterstat && dir.is_projected()) {
+    const auto& pf = dir.get_projected_fnode();
+    out << "->" << pf->fragstat;
+    if (!(pf->fragstat == pf->accounted_fragstat))
+      out << "/" << pf->accounted_fragstat;
+  }
+  
+  // rstat: same layout as fragstat above
+  out << " " << dir.get_fnode()->rstat;
+  if (!(dir.get_fnode()->rstat == dir.get_fnode()->accounted_rstat))
+    out << "/" << dir.get_fnode()->accounted_rstat;
+  if (g_conf()->mds_debug_scatterstat && dir.is_projected()) {
+    const auto& pf = dir.get_projected_fnode();
+    out << "->" << pf->rstat;
+    if (!(pf->rstat == pf->accounted_rstat))
+      out << "/" << pf->accounted_rstat;
+ }
+
+  out << " hs=" << dir.get_num_head_items() << "+" << dir.get_num_head_null();  // head: items+null
+  out << ",ss=" << dir.get_num_snap_items() << "+" << dir.get_num_snap_null();  // snap: items+null
+  if (dir.get_num_dirty())
+    out << " dirty=" << dir.get_num_dirty();
+  
+  if (dir.get_num_ref()) {
+    out << " |";
+    dir.print_pin_set(out);
+  }
+
+  out << " " << &dir;  // object address, to tell instances apart in logs
+  return out << "]";
+}
+
+
+void CDir::print(ostream& out) 
+{
+  out << *this;  // delegates to operator<<(ostream&, const CDir&)
+}
+
+
+
+
+ostream& CDir::print_db_line_prefix(ostream& out) 
+{
+  return out << ceph_clock_now() << " mds." << mdcache->mds->get_nodeid() << ".cache.dir(" << this->dirfrag() << ") ";  // timestamp + the same text as dout_prefix in this file
+}
+
+
+
+// -------------------------------------------------------------------
+// CDir
+
+CDir::CDir(CInode *in, frag_t fg, MDCache *mdc, bool auth) :
+  mdcache(mdc), inode(in), frag(fg),
+  dirty_rstat_inodes(member_offset(CInode, dirty_rstat_item)),  // the lists below are keyed by the offset of the hook member in the listed type
+  dirty_dentries(member_offset(CDentry, item_dir_dirty)),
+  item_dirty(this), item_new(this),
+  lock_caches_with_auth_pins(member_offset(MDLockCache::DirItem, item_dir)),
+  freezing_inodes(member_offset(CInode, item_freezing_inode)),
+  dir_rep(REP_NONE),
+  pop_me(mdc->decayrate),  // all pop_* counters share the cache's decay rate
+  pop_nested(mdc->decayrate),
+  pop_auth_subtree(mdc->decayrate),
+  pop_auth_subtree_nested(mdc->decayrate),
+  pop_spread(mdc->decayrate),
+  pop_lru_subdirs(member_offset(CInode, item_pop_lru)),
+  dir_auth(CDIR_AUTH_DEFAULT)
+{
+  // auth
+  ceph_assert(in->is_dir());  // a dirfrag can only be created for a directory inode
+  if (auth)
+    state_set(STATE_AUTH);
+}
+
+/**
+ * Cross-check fnode fragstat/rstat against sums over the head dentries.
+ * No-op (returns true) unless mds_debug_scatterstat is set or scrub is true.
+ * Non-scrub: a mismatch dumps all dentries and asserts; scrub: only logs it.
+ * Returns false on mismatch, !scrub if the dir is incomplete/non-auth/frozen.
+ */
+bool CDir::check_rstats(bool scrub)
+{
+  if (!g_conf()->mds_debug_scatterstat && !scrub)
+    return true;
+
+  dout(25) << "check_rstats on " << *this << dendl;
+  if (!is_complete() || !is_auth() || is_frozen()) {
+    dout(3) << "check_rstats " << (scrub ? "(scrub) " : "")
+            << "bailing out -- incomplete or non-auth or frozen dir on " 
+            << *this << dendl;
+    return !scrub;
+  }
+
+  frag_info_t frag_info;
+  nest_info_t nest_info;
+  for (auto i = items.begin(); i != items.end(); ++i) {
+    if (i->second->last != CEPH_NOSNAP)
+      continue;
+    CDentry::linkage_t *dnl = i->second->get_linkage();
+    if (dnl->is_primary()) {
+      CInode *in = dnl->get_inode();
+      nest_info.add(in->get_inode()->accounted_rstat);
+      if (in->is_dir())
+	frag_info.nsubdirs++;
+      else
+	frag_info.nfiles++;
+    } else if (dnl->is_remote())
+      frag_info.nfiles++;
+  }
+
+  bool good = true;
+  // fragstat
+  if(!frag_info.same_sums(fnode->fragstat)) {
+    dout(1) << "mismatch between head items and fnode.fragstat! printing dentries" << dendl;
+    dout(1) << "get_num_head_items() = " << get_num_head_items()
+             << "; fnode.fragstat.nfiles=" << fnode->fragstat.nfiles
+             << " fnode.fragstat.nsubdirs=" << fnode->fragstat.nsubdirs << dendl;
+    good = false;
+  } else {
+    dout(20) << "get_num_head_items() = " << get_num_head_items()
+             << "; fnode.fragstat.nfiles=" << fnode->fragstat.nfiles
+             << " fnode.fragstat.nsubdirs=" << fnode->fragstat.nsubdirs << dendl;
+  }
+
+  // rstat
+  if (!nest_info.same_sums(fnode->rstat)) {
+    dout(1) << "mismatch between child accounted_rstats and my rstats!" << dendl;
+    dout(1) << "total of child dentries: " << nest_info << dendl;
+    dout(1) << "my rstats:              " << fnode->rstat << dendl;
+    good = false;
+  } else {
+    dout(20) << "total of child dentries: " << nest_info << dendl;
+    dout(20) << "my rstats:              " << fnode->rstat << dendl;
+  }
+
+  if (!good && !scrub) {
+    // debug mode only: dump every dentry, then fail on whichever sum disagrees
+    for (auto i = items.begin(); i != items.end(); ++i) {
+      CDentry *dn = i->second;
+      if (dn->get_linkage()->is_primary()) {
+	CInode *in = dn->get_linkage()->inode;
+	dout(1) << *dn << " rstat " << in->get_inode()->accounted_rstat << dendl;
+      } else {
+	dout(1) << *dn << dendl;
+      }
+    }
+
+    ceph_assert(frag_info.nfiles == fnode->fragstat.nfiles);
+    ceph_assert(frag_info.nsubdirs == fnode->fragstat.nsubdirs);
+    ceph_assert(nest_info.rbytes == fnode->rstat.rbytes);
+    ceph_assert(nest_info.rfiles == fnode->rstat.rfiles);
+    ceph_assert(nest_info.rsubdirs == fnode->rstat.rsubdirs);
+  }
+  dout(10) << "check_rstats complete on " << *this << dendl;
+  return good;
+}
+
+void CDir::adjust_num_inodes_with_caps(int d)
+{
+  // FIXME: smarter way to decide if adding 'this' to open file table
+  if (num_inodes_with_caps == 0 && d > 0)  // count leaves zero: register with the open file table
+    mdcache->open_file_table.add_dirfrag(this);
+  else if (num_inodes_with_caps > 0 && num_inodes_with_caps == -d)  // count returns to exactly zero: unregister
+    mdcache->open_file_table.remove_dirfrag(this);
+
+  num_inodes_with_caps += d;
+  ceph_assert(num_inodes_with_caps >= 0);  // checked after the update; the count must never go negative
+}
+
+/**
+ * Find the dentry called `name` that is valid at snapshot `snap`.
+ *
+ * The dentry map is probed with lower_bound() on a key built from
+ * (snap, name, name hash); the entry found there is accepted only if its
+ * name matches and its [first, last] range contains `snap`.
+ *
+ * @return the matching dentry, or nullptr if there is none.
+ */
+CDentry *CDir::lookup(std::string_view name, snapid_t snap)
+{
+  dout(20) << "lookup (" << snap << ", '" << name << "')" << dendl;
+
+  const dentry_key_t probe(snap, name, inode->hash_dentry_name(name));
+  auto pos = items.lower_bound(probe);
+  if (pos == items.end()) {
+    return nullptr;
+  }
+
+  CDentry *candidate = pos->second;
+  const bool name_matches = (candidate->get_name() == name);
+  const bool snap_in_range =
+    (candidate->first <= snap && candidate->last >= snap);
+  if (!(name_matches && snap_in_range)) {
+    dout(20) << "  miss -> " << pos->first << dendl;
+    return nullptr;
+  }
+
+  dout(20) << "  hit -> " << pos->first << dendl;
+  return candidate;
+}
+
+/**
+ * Find the dentry stored under exactly the key (last, name, name hash).
+ *
+ * Unlike lookup(), no snap-range matching is done: the key must be present
+ * in the dentry map as-is.
+ *
+ * @return the dentry, or nullptr if no such key exists.
+ */
+CDentry *CDir::lookup_exact_snap(std::string_view name, snapid_t last) {
+  dout(20) << __func__ << " (" << last << ", '" << name << "')" << dendl;
+  const dentry_key_t wanted(last, name, inode->hash_dentry_name(name));
+  auto found = items.find(wanted);
+  return (found != items.end()) ? found->second : nullptr;
+}
+
+/***
+ * linking fun
+ */
+
+/**
+ * Create a dentry with no linkage and insert it into this dirfrag.
+ *
+ * The new dentry is placed on the bottom LRU, stamped with the projected
+ * version of this dirfrag and counted as a head or snap null dentry
+ * depending on `last`.  The first dentry added to the dirfrag takes
+ * PIN_CHILD on it.
+ *
+ * @param dname  dentry name
+ * @param first  first snapid covered by the dentry
+ * @param last   last snapid covered (CEPH_NOSNAP for a head dentry)
+ * @return the newly allocated dentry
+ */
+CDentry* CDir::add_null_dentry(std::string_view dname,
+                               snapid_t first, snapid_t last)
+{
+  // the (dname, last) key must not be in use yet
+  ceph_assert(lookup_exact_snap(dname, last) == 0);
+
+  // create dentry
+  const auto name_hash = inode->hash_dentry_name(dname);
+  CDentry* dn = new CDentry(dname, name_hash, "", first, last);
+  if (is_auth()) {
+    dn->state_set(CDentry::STATE_AUTH);
+  }
+
+  mdcache->bottom_lru.lru_insert_mid(dn);
+  dn->state_set(CDentry::STATE_BOTTOMLRU);
+
+  dn->dir = this;
+  dn->version = get_projected_version();
+
+  // add to dir
+  ceph_assert(items.count(dn->key()) == 0);
+  //assert(null_items.count(dn->get_name()) == 0);
+
+  items[dn->key()] = dn;
+  if (last != CEPH_NOSNAP) {
+    num_snap_null++;
+  } else {
+    num_head_null++;
+  }
+
+  // a dirfrag with pinned dentries (fragmenting) pins new ones as well
+  if (state_test(CDir::STATE_DNPINNEDFRAG)) {
+    dn->get(CDentry::PIN_FRAGMENTING);
+    dn->state_set(CDentry::STATE_FRAGMENTING);
+  }
+
+  dout(12) << __func__ << " " << *dn << dendl;
+
+  // pin?
+  const bool first_child = (get_num_any() == 1);
+  if (first_child) {
+    get(PIN_CHILD);
+  }
+
+  ceph_assert(get_num_any() == items.size());
+  return dn;
+}
+
+
+/**
+ * Create a dentry primarily linked to `in` and insert it into this dirfrag.
+ *
+ * The dentry goes on the main LRU unless this dirfrag is a non-auth dirfrag
+ * of a stray inode, in which case it goes on the bottom LRU.  The inode is
+ * attached via link_inode_work() and the head/snap item counter is bumped
+ * according to the dentry's `last`.
+ *
+ * @param dname           dentry name
+ * @param in              inode to link as primary
+ * @param alternate_name  alternate name, moved into the new dentry
+ * @param first           first snapid covered by the dentry
+ * @param last            last snapid covered (CEPH_NOSNAP for head)
+ * @return the newly allocated dentry
+ */
+CDentry* CDir::add_primary_dentry(std::string_view dname, CInode *in,
+                                  mempool::mds_co::string alternate_name,
+                                  snapid_t first, snapid_t last)
+{
+  // the (dname, last) key must not be in use yet
+  ceph_assert(lookup_exact_snap(dname, last) == 0);
+
+  // create dentry
+  const auto name_hash = inode->hash_dentry_name(dname);
+  CDentry* dn = new CDentry(dname, name_hash, std::move(alternate_name), first, last);
+  if (is_auth()) {
+    dn->state_set(CDentry::STATE_AUTH);
+  }
+
+  // choose the LRU
+  if (!is_auth() && inode->is_stray()) {
+    mdcache->bottom_lru.lru_insert_mid(dn);
+    dn->state_set(CDentry::STATE_BOTTOMLRU);
+  } else {
+    mdcache->lru.lru_insert_mid(dn);
+  }
+
+  dn->dir = this;
+  dn->version = get_projected_version();
+
+  // add to dir
+  ceph_assert(items.count(dn->key()) == 0);
+  //assert(null_items.count(dn->get_name()) == 0);
+
+  items[dn->key()] = dn;
+
+  // attach the inode
+  dn->get_linkage()->inode = in;
+  link_inode_work(dn, in);
+
+  if (dn->last != CEPH_NOSNAP) {
+    num_snap_items++;
+  } else {
+    num_head_items++;
+  }
+
+  if (state_test(CDir::STATE_DNPINNEDFRAG)) {
+    dn->get(CDentry::PIN_FRAGMENTING);
+    dn->state_set(CDentry::STATE_FRAGMENTING);
+  }
+
+  dout(12) << __func__ << " " << *dn << dendl;
+
+  // pin?
+  const bool first_child = (get_num_any() == 1);
+  if (first_child) {
+    get(PIN_CHILD);
+  }
+  ceph_assert(get_num_any() == items.size());
+  return dn;
+}
+
+/**
+ * Create a dentry with a remote linkage (ino + d_type) and insert it into
+ * this dirfrag.
+ *
+ * The dentry is placed on the main LRU, stamped with the projected version
+ * of this dirfrag and counted as a head or snap item depending on `last`.
+ *
+ * @param dname           dentry name
+ * @param ino             inode number the dentry refers to
+ * @param d_type          directory entry type of the target
+ * @param alternate_name  alternate name, moved into the new dentry
+ * @param first           first snapid covered by the dentry
+ * @param last            last snapid covered (CEPH_NOSNAP for head)
+ * @return the newly allocated dentry
+ */
+CDentry* CDir::add_remote_dentry(std::string_view dname, inodeno_t ino, unsigned char d_type,
+                                 mempool::mds_co::string alternate_name,
+                                 snapid_t first, snapid_t last)
+{
+  // the (dname, last) key must not be in use yet
+  ceph_assert(lookup_exact_snap(dname, last) == 0);
+
+  // create dentry
+  const auto name_hash = inode->hash_dentry_name(dname);
+  CDentry* dn = new CDentry(dname, name_hash, std::move(alternate_name), ino, d_type, first, last);
+  if (is_auth()) {
+    dn->state_set(CDentry::STATE_AUTH);
+  }
+  mdcache->lru.lru_insert_mid(dn);
+
+  dn->dir = this;
+  dn->version = get_projected_version();
+
+  // add to dir
+  ceph_assert(items.count(dn->key()) == 0);
+  //assert(null_items.count(dn->get_name()) == 0);
+
+  items[dn->key()] = dn;
+  if (last != CEPH_NOSNAP) {
+    num_snap_items++;
+  } else {
+    num_head_items++;
+  }
+
+  if (state_test(CDir::STATE_DNPINNEDFRAG)) {
+    dn->get(CDentry::PIN_FRAGMENTING);
+    dn->state_set(CDentry::STATE_FRAGMENTING);
+  }
+
+  dout(12) << __func__ << " " << *dn << dendl;
+
+  // pin?
+  const bool first_child = (get_num_any() == 1);
+  if (first_child) {
+    get(PIN_CHILD);
+  }
+
+  ceph_assert(get_num_any() == items.size());
+  return dn;
+}
+
+
+
+/**
+ * Remove a dentry from this dirfrag and delete it.
+ *
+ * Updates the null/item counters, detaches any linked inode, erases the
+ * dentry from the dentry map, marks it clean, takes it off whichever LRU it
+ * is on and frees it.  Drops PIN_CHILD when the dirfrag becomes empty.
+ * The dentry must not have any client leases.
+ */
+void CDir::remove_dentry(CDentry *dn)
+{
+  dout(12) << __func__ << " " << *dn << dendl;
+
+  // there should be no client leases at this point!
+  ceph_assert(dn->client_lease_map.empty());
+
+  if (state_test(CDir::STATE_DNPINNEDFRAG)) {
+    dn->put(CDentry::PIN_FRAGMENTING);
+    dn->state_clear(CDentry::STATE_FRAGMENTING);
+  }
+
+  const bool is_head = (dn->last == CEPH_NOSNAP);
+  if (dn->get_linkage()->is_null()) {
+    if (is_head) {
+      num_head_null--;
+    } else {
+      num_snap_null--;
+    }
+  } else {
+    if (is_head) {
+      num_head_items--;
+    } else {
+      num_snap_items--;
+    }
+    // detach inode and dentry
+    unlink_inode_work(dn);
+  }
+
+  // remove from list
+  ceph_assert(items.count(dn->key()) == 1);
+  items.erase(dn->key());
+
+  // clean?
+  if (dn->is_dirty()) {
+    dn->mark_clean();
+  }
+
+  // take it off the LRU it currently lives on
+  auto& owning_lru = dn->state_test(CDentry::STATE_BOTTOMLRU)
+    ? mdcache->bottom_lru
+    : mdcache->lru;
+  owning_lru.lru_remove(dn);
+  delete dn;
+
+  // unpin?
+  if (get_num_any() == 0) {
+    put(PIN_CHILD);
+  }
+  ceph_assert(get_num_any() == items.size());
+}
+
+/**
+ * Link a null dentry remotely to `in`, using the inode's number and the
+ * d_type derived from the mode of its projected inode.
+ */
+void CDir::link_remote_inode(CDentry *dn, CInode *in)
+{
+  const unsigned char d_type = IFTODT(in->get_projected_inode()->mode);
+  link_remote_inode(dn, in->ino(), d_type);
+}
+
+/**
+ * Turn a null dentry into a remote link to (ino, d_type).
+ *
+ * A dentry that sat on the bottom LRU is moved to the main LRU, and the
+ * head/snap counters are shifted from "null" to "items".
+ */
+void CDir::link_remote_inode(CDentry *dn, inodeno_t ino, unsigned char d_type)
+{
+  dout(12) << __func__ << " " << *dn << " remote " << ino << dendl;
+  ceph_assert(dn->get_linkage()->is_null());
+
+  dn->get_linkage()->set_remote(ino, d_type);
+
+  const bool on_bottom_lru = dn->state_test(CDentry::STATE_BOTTOMLRU);
+  if (on_bottom_lru) {
+    mdcache->bottom_lru.lru_remove(dn);
+    mdcache->lru.lru_insert_mid(dn);
+    dn->state_clear(CDentry::STATE_BOTTOMLRU);
+  }
+
+  // the dentry stops being null and becomes an item
+  if (dn->last != CEPH_NOSNAP) {
+    num_snap_items++;
+    num_snap_null--;
+  } else {
+    num_head_items++;
+    num_head_null--;
+  }
+  ceph_assert(get_num_any() == items.size());
+}
+
+/**
+ * Turn a null dentry into the primary link of `in`.
+ *
+ * The inode is attached via link_inode_work().  A dentry on the bottom LRU
+ * is promoted to the main LRU unless this is a non-auth dirfrag of a stray
+ * inode.  The head/snap counters are shifted from "null" to "items".
+ */
+void CDir::link_primary_inode(CDentry *dn, CInode *in)
+{
+  dout(12) << __func__ << " " << *dn << " " << *in << dendl;
+  ceph_assert(dn->get_linkage()->is_null());
+
+  dn->get_linkage()->inode = in;
+  link_inode_work(dn, in);
+
+  if (dn->state_test(CDentry::STATE_BOTTOMLRU)) {
+    const bool promote = is_auth() || !inode->is_stray();
+    if (promote) {
+      mdcache->bottom_lru.lru_remove(dn);
+      mdcache->lru.lru_insert_mid(dn);
+      dn->state_clear(CDentry::STATE_BOTTOMLRU);
+    }
+  }
+
+  // the dentry stops being null and becomes an item
+  if (dn->last != CEPH_NOSNAP) {
+    num_snap_items++;
+    num_snap_null--;
+  } else {
+    num_head_items++;
+    num_head_null--;
+  }
+
+  ceph_assert(get_num_any() == items.size());
+}
+
+/**
+ * Bookkeeping shared by the primary-link paths: `dn`'s linkage must already
+ * point at `in`.
+ *
+ * Sets dn as the inode's primary parent and propagates the inode's state
+ * to the dentry and this dirfrag: inode pin, open file table notification,
+ * caps count, nested auth pins, freezing/frozen tracking and snaprealm
+ * placement.
+ */
+void CDir::link_inode_work(CDentry *dn, CInode *in)
+{
+  ceph_assert(dn->get_linkage()->get_inode() == in);
+  in->set_primary_parent(dn);
+
+  // set inode version
+  //in->inode.version = dn->get_version();
+
+  // pin dentry?
+  if (in->get_num_ref()) {
+    dn->get(CDentry::PIN_INODEPIN);
+  }
+
+  if (in->state_test(CInode::STATE_TRACKEDBYOFT)) {
+    mdcache->open_file_table.notify_link(in);
+  }
+  if (in->is_any_caps()) {
+    adjust_num_inodes_with_caps(1);
+  }
+
+  // adjust auth pin count
+  if (in->auth_pins) {
+    dn->adjust_nested_auth_pins(in->auth_pins, nullptr);
+  }
+
+  // track freezing / frozen inodes in this dirfrag
+  if (in->is_freezing_inode()) {
+    freezing_inodes.push_back(&in->item_freezing_inode);
+  } else if (in->is_frozen_inode() || in->is_frozen_auth_pin()) {
+    num_frozen_inodes++;
+  }
+
+  // verify open snaprealm parent
+  if (in->snaprealm) {
+    in->snaprealm->adjust_parent();
+  } else if (in->is_any_caps()) {
+    in->move_to_realm(inode->find_snaprealm());
+  }
+}
+
+/**
+ * Detach whatever `dn` is linked to, leaving it a null dentry.
+ *
+ * @param dn          dentry to unlink
+ * @param adjust_lru  when true and the dentry is not yet on the bottom LRU,
+ *                    move it there
+ */
+void CDir::unlink_inode(CDentry *dn, bool adjust_lru)
+{
+  if (!dn->get_linkage()->is_primary()) {
+    dout(12) << __func__ << " " << *dn << dendl;
+  } else {
+    dout(12) << __func__ << " " << *dn << " " << *dn->get_linkage()->get_inode() << dendl;
+  }
+
+  unlink_inode_work(dn);
+
+  const bool demote = adjust_lru && !dn->state_test(CDentry::STATE_BOTTOMLRU);
+  if (demote) {
+    mdcache->lru.lru_remove(dn);
+    mdcache->bottom_lru.lru_insert_mid(dn);
+    dn->state_set(CDentry::STATE_BOTTOMLRU);
+  }
+
+  // the dentry stops being an item and becomes null
+  if (dn->last != CEPH_NOSNAP) {
+    num_snap_items--;
+    num_snap_null++;
+  } else {
+    num_head_items--;
+    num_head_null++;
+  }
+  ceph_assert(get_num_any() == items.size());
+}
+
+
+/**
+ * Remove a null dentry of this dirfrag if nothing but its dirty state holds
+ * a reference on it and it is flagged new.  Otherwise do nothing.
+ */
+void CDir::try_remove_unlinked_dn(CDentry *dn)
+{
+  ceph_assert(dn->dir == this);
+  ceph_assert(dn->get_linkage()->is_null());
+
+  // no pins (besides dirty)?
+  const bool only_dirty_pin = (dn->get_num_ref() == dn->is_dirty());
+  if (!only_dirty_pin) {
+    return;
+  }
+
+  // was the dn new?
+  if (!dn->is_new()) {
+    return;
+  }
+
+  dout(10) << __func__ << " " << *dn << " in " << *this << dendl;
+  if (dn->is_dirty()) {
+    dn->mark_clean();
+  }
+  remove_dentry(dn);
+
+  // NOTE: we may not have any more dirty dentries, but the fnode
+  // still changed, so the directory must remain dirty.
+}
+
+
+/**
+ * Undo the linkage of `dn` (remote or primary) and the bookkeeping that
+ * link_inode_work() set up for a primary link.
+ *
+ * Asserts if the dentry is neither remote nor primary, i.e. is null.
+ */
+void CDir::unlink_inode_work(CDentry *dn)
+{
+  CDentry::linkage_t *dnl = dn->get_linkage();
+  CInode *in = dnl->get_inode();
+
+  if (dnl->is_remote()) {
+    // remote
+    if (in) {
+      dn->unlink_remote(dnl);
+    }
+    dnl->set_remote(0, 0);
+    return;
+  }
+
+  if (!dnl->is_primary()) {
+    ceph_assert(!dn->get_linkage()->is_null());
+    return;
+  }
+
+  // primary
+  // unpin dentry?
+  if (in->get_num_ref()) {
+    dn->put(CDentry::PIN_INODEPIN);
+  }
+
+  if (in->state_test(CInode::STATE_TRACKEDBYOFT)) {
+    mdcache->open_file_table.notify_unlink(in);
+  }
+  if (in->is_any_caps()) {
+    adjust_num_inodes_with_caps(-1);
+  }
+
+  // unlink auth_pin count
+  if (in->auth_pins) {
+    dn->adjust_nested_auth_pins(-in->auth_pins, nullptr);
+  }
+
+  if (in->is_freezing_inode()) {
+    in->item_freezing_inode.remove_myself();
+  } else if (in->is_frozen_inode() || in->is_frozen_auth_pin()) {
+    num_frozen_inodes--;
+  }
+
+  // detach inode
+  in->remove_primary_parent(dn);
+  if (in->is_dir()) {
+    in->item_pop_lru.remove_myself();
+  }
+  dnl->inode = nullptr;
+}
+
+/**
+ * Insert the name of a head dentry into this dirfrag's bloom filter,
+ * creating the filter first if it does not exist yet.
+ *
+ * No filter is created (and nothing is inserted) for an incomplete dirfrag
+ * or while the MDS is in standby replay.
+ */
+void CDir::add_to_bloom(CDentry *dn)
+{
+  ceph_assert(dn->last == CEPH_NOSNAP);
+
+  if (!bloom) {
+    /* not create bloom filter for incomplete dir that was added by log replay;
+     * also don't maintain bloom filters in standby replay (saves cycles, and
+     * avoids need to implement clearing it in EExport for #16924) */
+    if (!is_complete() || mdcache->mds->is_standby_replay()) {
+      return;
+    }
+
+    /* This size and false positive probability is completely random.*/
+    const unsigned item_count = get_num_head_items() + get_num_snap_items();
+    const unsigned size = (item_count < 100) ? 100 : item_count;
+    bloom.reset(new bloom_filter(size, 1.0 / size, 0));
+  }
+
+  const auto& name = dn->get_name();
+  bloom->insert(name.data(), name.size());
+}
+
+/**
+ * Ask the bloom filter whether it contains `name`.
+ *
+ * @return false when there is no bloom filter; otherwise the filter's
+ *         answer.
+ */
+bool CDir::is_in_bloom(std::string_view name)
+{
+  return bloom && bloom->contains(name.data(), name.size());
+}
+
+/**
+ * Remove every null, unprojected dentry from this dirfrag.
+ *
+ * Afterwards no null dentries may remain (asserted).
+ */
+void CDir::remove_null_dentries() {
+  dout(12) << __func__ << " " << *this << dendl;
+
+  for (auto p = items.begin(); p != items.end(); ) {
+    CDentry *dn = p->second;
+    // step past dn first: remove_dentry() erases its entry from items
+    ++p;
+    const bool removable = dn->get_linkage()->is_null() && !dn->is_projected();
+    if (removable) {
+      remove_dentry(dn);
+    }
+  }
+
+  ceph_assert(num_snap_null == 0);
+  ceph_assert(num_head_null == 0);
+  ceph_assert(get_num_any() == items.size());
+}
+
+/** remove dirty null dentries for deleted directory. the dirfrag will be
+ *  deleted soon, so it's safe to not commit dirty dentries.
+ *
+ *  This is called when a directory is being deleted, a prerequisite
+ *  of which is that its children have been unlinked: head dentries are
+ *  asserted to be null and unprojected.  Snapshotted dentries may still
+ *  carry a primary inode; that inode is removed from the cache together
+ *  with its dentry.
+ *
+ *  Only dentries without remaining references are removed.  Dirty state
+ *  (of dentries, snapped inodes and the dirfrag itself) is cleared only
+ *  when the directory inode has no snaprealm.
+ *
+ *  The parent dirfrag's inode must be a stray inode (asserted).
+ */
+void CDir::try_remove_dentries_for_stray()
+{
+  dout(10) << __func__ << dendl;
+  ceph_assert(get_parent_dir()->inode->is_stray());
+
+  // clear dirty only when the directory was not snapshotted
+  bool clear_dirty = !inode->snaprealm;
+
+  auto p = items.begin();
+  while (p != items.end()) {
+    CDentry *dn = p->second;
+    // advance before a possible remove_dentry(), which erases dn's map entry
+    ++p;
+    if (dn->last == CEPH_NOSNAP) {
+      // head dentry: must already be unlinked
+      ceph_assert(!dn->is_projected());
+      ceph_assert(dn->get_linkage()->is_null());
+      if (clear_dirty && dn->is_dirty())
+	dn->mark_clean();
+      // It's OK to remove lease prematurely because we will never link
+      // the dentry to inode again.
+      if (dn->is_any_leases())
+	dn->remove_client_leases(mdcache->mds->locker);
+      if (dn->get_num_ref() == 0)
+	remove_dentry(dn);
+    } else {
+      // snapshotted dentry: may still have a primary inode attached
+      ceph_assert(!dn->is_projected());
+      CDentry::linkage_t *dnl= dn->get_linkage();
+      CInode *in = NULL;
+      if (dnl->is_primary()) {
+	in = dnl->get_inode();
+	if (clear_dirty && in->is_dirty())
+	  in->mark_clean();
+      }
+      if (clear_dirty && dn->is_dirty())
+	dn->mark_clean();
+      if (dn->get_num_ref() == 0) {
+	remove_dentry(dn);
+	// the dentry is gone; drop its (formerly primary) inode as well
+	if (in)
+	  mdcache->remove_inode(in);
+      }
+    }
+  }
+
+  if (clear_dirty && is_dirty())
+    mark_clean();
+}
+
+/**
+ * Purge a snapshotted dentry (and its primary inode, if any) when no snapid
+ * in `snaps` falls inside the dentry's [first, last] range and nothing but
+ * dirty state references the dentry or the inode.
+ *
+ * @param dn     snapshotted dentry (last != CEPH_NOSNAP, asserted)
+ * @param snaps  set of snapids to check the dentry's range against
+ * @return true if the dentry was removed, false otherwise
+ */
+bool CDir::try_trim_snap_dentry(CDentry *dn, const set<snapid_t>& snaps)
+{
+  ceph_assert(dn->last != CEPH_NOSNAP);
+
+  // first snapid in snaps that is >= dn->first
+  auto next_snap = snaps.lower_bound(dn->first);
+  CDentry::linkage_t *dnl = dn->get_linkage();
+  CInode *in = dnl->is_primary() ? dnl->get_inode() : nullptr;
+
+  // a snapid inside [first, last] keeps the dentry
+  if (next_snap != snaps.end() && !(*next_snap > dn->last)) {
+    return false;
+  }
+  // any reference besides the dirty pin keeps the dentry ...
+  if (dn->get_num_ref() != dn->is_dirty()) {
+    return false;
+  }
+  // ... and likewise for the inode
+  if (in && in->get_num_ref() != in->is_dirty()) {
+    return false;
+  }
+
+  dout(10) << " purging snapped " << *dn << dendl;
+  if (in && in->is_dirty()) {
+    in->mark_clean();
+  }
+  remove_dentry(dn);
+  if (in) {
+    dout(10) << " purging snapped " << *in << dendl;
+    mdcache->remove_inode(in);
+  }
+  return true;
+}
+
+
+/**
+ * Run try_trim_snap_dentry() on every snapshotted dentry of this dirfrag;
+ * head dentries are skipped.
+ */
+void CDir::purge_stale_snap_data(const set<snapid_t>& snaps)
+{
+  dout(10) << __func__ << " " << snaps << dendl;
+
+  for (auto p = items.begin(); p != items.end(); ) {
+    // fetch and advance first: trimming may erase the dentry's map entry
+    CDentry *dn = (p++)->second;
+    if (dn->last != CEPH_NOSNAP) {
+      try_trim_snap_dentry(dn, snaps);
+    }
+  }
+}
+
+
+/**
+ * steal_dentry -- semi-violently move a dentry from one CDir to another
+ * (*) violently, in that nitems, most pins, etc. are not correctly maintained
+ * on the old CDir corpse; must call finish_old_fragment() when finished.
+ *
+ * The dentry is inserted into this dirfrag's dentry map and erased from its
+ * previous dirfrag's map.  This dirfrag's counters, fnode fragstat/rstat
+ * (head dentries only), caps count, dirty-rstat list, nested auth pins and
+ * dirty-dentry list are updated to account for it, and dn->dir is pointed
+ * at this dirfrag last.
+ */
+void CDir::steal_dentry(CDentry *dn)
+{
+  dout(15) << __func__ << " " << *dn << dendl;
+
+  items[dn->key()] = dn;
+
+  // dn->dir still refers to the old dirfrag until the end of this function
+  dn->dir->items.erase(dn->key());
+  if (dn->dir->items.empty())
+    dn->dir->put(PIN_CHILD);
+
+  // counters have not been bumped yet, so zero means this is our first dentry
+  if (get_num_any() == 0)
+    get(PIN_CHILD);
+  if (dn->get_linkage()->is_null()) {
+    if (dn->last == CEPH_NOSNAP)
+      num_head_null++;
+    else
+      num_snap_null++;
+  } else if (dn->last == CEPH_NOSNAP) {
+      num_head_items++;
+
+    auto _fnode = _get_fnode();
+
+    if (dn->get_linkage()->is_primary()) {
+      CInode *in = dn->get_linkage()->get_inode();
+      const auto& pi = in->get_projected_inode();
+      if (in->is_dir()) {
+	_fnode->fragstat.nsubdirs++;
+	if (in->item_pop_lru.is_on_list())
+	  pop_lru_subdirs.push_back(&in->item_pop_lru);
+      } else {
+	_fnode->fragstat.nfiles++;
+      }
+      // fold the inode's accounted rstat into this dirfrag's rstat
+      _fnode->rstat.rbytes += pi->accounted_rstat.rbytes;
+      _fnode->rstat.rfiles += pi->accounted_rstat.rfiles;
+      _fnode->rstat.rsubdirs += pi->accounted_rstat.rsubdirs;
+      _fnode->rstat.rsnaps += pi->accounted_rstat.rsnaps;
+      if (pi->accounted_rstat.rctime > fnode->rstat.rctime)
+	_fnode->rstat.rctime = pi->accounted_rstat.rctime;
+
+      if (in->is_any_caps())
+	adjust_num_inodes_with_caps(1);
+
+      // move dirty inode rstat to new dirfrag
+      if (in->is_dirty_rstat())
+	dirty_rstat_inodes.push_back(&in->dirty_rstat_item);
+    } else if (dn->get_linkage()->is_remote()) {
+      // remote link: only the fragstat is affected, classified by d_type
+      if (dn->get_linkage()->get_remote_d_type() == DT_DIR)
+	_fnode->fragstat.nsubdirs++;
+      else
+	_fnode->fragstat.nfiles++;
+    }
+  } else {
+    // snapshotted, non-null dentry: no fragstat/rstat accounting here
+    num_snap_items++;
+    if (dn->get_linkage()->is_primary()) {
+      CInode *in = dn->get_linkage()->get_inode();
+      if (in->is_dirty_rstat())
+	dirty_rstat_inodes.push_back(&in->dirty_rstat_item);
+    }
+  }
+
+  {
+    // transfer the dentry's dir auth pins from the old dirfrag to this one
+    int dap = dn->get_num_dir_auth_pins();
+    if (dap) {
+      adjust_nested_auth_pins(dap, NULL);
+      dn->dir->adjust_nested_auth_pins(-dap, NULL);
+    }
+  }
+
+  if (dn->is_dirty()) {
+    dirty_dentries.push_back(&dn->item_dir_dirty);
+    num_dirty++;
+  }
+
+  dn->dir = this;
+}
+
+/**
+ * Prepare a dirfrag that is about to be replaced by new fragment(s).
+ *
+ * Unless replaying, an auth dirfrag auth-pins itself for the duration of
+ * the dentry migration.  All per-dentry waiters are appended to
+ * `dentry_waiters` (keyed as in waiting_on_dentry) and removed from this
+ * dirfrag, dropping PIN_DNWAITER.
+ */
+void CDir::prepare_old_fragment(map<string_snap_t, MDSContext::vec >& dentry_waiters, bool replay)
+{
+  // auth_pin old fragment for duration so that any auth_pinning
+  // during the dentry migration doesn't trigger side effects
+  if (!replay && is_auth()) {
+    auth_pin(this);
+  }
+
+  if (waiting_on_dentry.empty()) {
+    return;
+  }
+
+  for (const auto &entry : waiting_on_dentry) {
+    auto &dest = dentry_waiters[entry.first];
+    dest.insert(dest.end(), entry.second.begin(), entry.second.end());
+  }
+  waiting_on_dentry.clear();
+  put(PIN_DNWAITER);
+}
+
+/**
+ * Set up a freshly created fragment: unless replaying, an auth fragment is
+ * frozen and marked complete.  The fragment is then registered with its
+ * inode.
+ */
+void CDir::prepare_new_fragment(bool replay)
+{
+  const bool live_auth = !replay && is_auth();
+  if (live_auth) {
+    _freeze_dir();
+    mark_complete();
+  }
+  inode->add_dirfrag(this);
+}
+
+/**
+ * Finish off a dirfrag whose dentries have all been stolen by new
+ * fragment(s).
+ *
+ * Unless replaying, all waiters are collected into `waiters` and an auth
+ * dirfrag drops the auth pin taken in prepare_old_fragment() and unfreezes.
+ * The dentry counters are zeroed, the caps count is brought back to zero
+ * and the pins that init_fragment_pins() would have taken are released.
+ * On return the only reference allowed to remain is the one accounted for
+ * by STATE_STICKY (asserted).
+ *
+ * @param waiters  receives the contexts that were waiting on this dirfrag
+ * @param replay   true during journal replay
+ */
+void CDir::finish_old_fragment(MDSContext::vec& waiters, bool replay)
+{
+  // take waiters _before_ unfreeze...
+  if (!replay) {
+    take_waiting(WAIT_ANY_MASK, waiters);
+    if (is_auth()) {
+      auth_unpin(this);  // pinned in prepare_old_fragment
+      ceph_assert(is_frozen_dir());
+      unfreeze_dir();
+    }
+  }
+
+  ceph_assert(dir_auth_pins == 0);
+  ceph_assert(auth_pins == 0);
+
+  num_head_items = num_head_null = 0;
+  num_snap_items = num_snap_null = 0;
+  adjust_num_inodes_with_caps(-num_inodes_with_caps);
+
+  // this mirrors init_fragment_pins()
+  if (is_auth())
+    clear_replica_map();
+  if (is_dirty())
+    mark_clean();
+  if (state_test(STATE_IMPORTBOUND))
+    put(PIN_IMPORTBOUND);
+  if (state_test(STATE_EXPORTBOUND))
+    put(PIN_EXPORTBOUND);
+  if (is_subtree_root())
+    put(PIN_SUBTREE);
+
+  // No PIN_AUTHPIN to release here: auth_pins was asserted to be zero above,
+  // so a former "if (auth_pins > 0) put(PIN_AUTHPIN)" could never fire.
+
+  ceph_assert(get_num_ref() == (state_test(STATE_STICKY) ? 1:0));
+}
+
+/**
+ * Take the reference pins that correspond to this fragment's current
+ * state: replicated, dirty, export bound, import bound and subtree root.
+ */
+void CDir::init_fragment_pins()
+{
+  if (is_replicated()) {
+    get(PIN_REPLICATED);
+  }
+
+  // state-flag driven pins
+  if (state_test(STATE_DIRTY)) {
+    get(PIN_DIRTY);
+  }
+  if (state_test(STATE_EXPORTBOUND)) {
+    get(PIN_EXPORTBOUND);
+  }
+  if (state_test(STATE_IMPORTBOUND)) {
+    get(PIN_IMPORTBOUND);
+  }
+
+  if (is_subtree_root()) {
+    get(PIN_SUBTREE);
+  }
+}
+
+/**
+ * Split this dirfrag into 2^bits sub-fragments.
+ *
+ * A new CDir is created for every fragment produced by frag.split(); each
+ * inherits the kept state bits, replica map, dir_auth, freeze_tree_state
+ * and a 1/2^bits share of the popularity vectors.  All dentries and
+ * per-dentry waiters are then moved to the sub-fragment chosen by
+ * inode->pick_dirfrag() for their name, the new fnodes are stamped with the
+ * current version and the inode's projected rstat/dirstat versions, and
+ * this (old) fragment is finished via finish_old_fragment().
+ *
+ * @param bits     number of bits to split by
+ * @param subs     receives the newly created sub-fragments (appended)
+ * @param waiters  receives the contexts that were waiting on this dirfrag
+ * @param replay   true during journal replay
+ */
+void CDir::split(int bits, std::vector<CDir*>* subs, MDSContext::vec& waiters, bool replay)
+{
+  dout(10) << "split by " << bits << " bits on " << *this << dendl;
+
+  ceph_assert(replay || is_complete() || !is_auth());
+
+  frag_vec_t frags;
+  frag.split(bits, frags);
+
+  vector<CDir*> subfrags(1 << bits);
+  
+  double fac = 1.0 / (double)(1 << bits);  // for scaling load vecs
+
+  version_t rstat_version = inode->get_projected_inode()->rstat.version;
+  version_t dirstat_version = inode->get_projected_inode()->dirstat.version;
+
+  // capture the not-yet-accounted stat deltas of this fragment, but only if
+  // its accounted stats are at the inode's current version
+  nest_info_t rstatdiff;
+  frag_info_t fragstatdiff;
+  if (fnode->accounted_rstat.version == rstat_version)
+    rstatdiff.add_delta(fnode->accounted_rstat, fnode->rstat);
+  if (fnode->accounted_fragstat.version == dirstat_version)
+    fragstatdiff.add_delta(fnode->accounted_fragstat, fnode->fragstat);
+  dout(10) << " rstatdiff " << rstatdiff << " fragstatdiff " << fragstatdiff << dendl;
+
+  map<string_snap_t, MDSContext::vec > dentry_waiters;
+  prepare_old_fragment(dentry_waiters, replay);
+
+  // create subfrag dirs
+  int n = 0;
+  for (const auto& fg : frags) {
+    CDir *f = new CDir(inode, fg, mdcache, is_auth());
+    f->state_set(state & (MASK_STATE_FRAGMENT_KEPT | STATE_COMPLETE));
+    f->get_replicas() = get_replicas();
+    f->pop_me = pop_me;
+    f->pop_me.scale(fac);
+
+    // FIXME; this is an approximation
+    f->pop_nested = pop_nested;
+    f->pop_nested.scale(fac);
+    f->pop_auth_subtree = pop_auth_subtree;
+    f->pop_auth_subtree.scale(fac);
+    f->pop_auth_subtree_nested = pop_auth_subtree_nested;
+    f->pop_auth_subtree_nested.scale(fac);
+
+    dout(10) << " subfrag " << fg << " " << *f << dendl;
+    subfrags[n++] = f;
+    subs->push_back(f);
+
+    f->set_dir_auth(get_dir_auth());
+    f->freeze_tree_state = freeze_tree_state;
+    f->prepare_new_fragment(replay);
+    f->init_fragment_pins();
+  }
+  
+  // repartition dentries
+  // (steal_dentry() erases the dentry from our items, so the loop terminates)
+  while (!items.empty()) {
+    auto p = items.begin();
+    
+    CDentry *dn = p->second;
+    frag_t subfrag = inode->pick_dirfrag(dn->get_name());
+    // index of the target sub-fragment; this 'n' shadows the count above
+    int n = (subfrag.value() & (subfrag.mask() ^ frag.mask())) >> subfrag.mask_shift();
+    dout(15) << " subfrag " << subfrag << " n=" << n << " for " << p->first << dendl;
+    CDir *f = subfrags[n];
+    f->steal_dentry(dn);
+  }
+
+  // hand the per-dentry waiters to the sub-fragment that now owns the name
+  for (const auto &p : dentry_waiters) {
+    frag_t subfrag = inode->pick_dirfrag(p.first.name);
+    int n = (subfrag.value() & (subfrag.mask() ^ frag.mask())) >> subfrag.mask_shift();
+    CDir *f = subfrags[n];
+
+    if (f->waiting_on_dentry.empty())
+      f->get(PIN_DNWAITER);
+    auto &e = f->waiting_on_dentry[p.first];
+    for (const auto &waiter : p.second) {
+      e.push_back(waiter);
+    }
+  }
+
+  // FIXME: handle dirty old rstat
+
+  // fix up new frag fragstats
+  // (here 'n' is the outer counter: the number of sub-fragments created)
+  for (int i = 0; i < n; i++) {
+    CDir *f = subfrags[i];
+    auto _fnode = f->_get_fnode();
+    _fnode->version = f->projected_version = get_version();
+    _fnode->rstat.version = rstat_version;
+    _fnode->accounted_rstat = _fnode->rstat;
+    _fnode->fragstat.version = dirstat_version;
+    _fnode->accounted_fragstat = _fnode->fragstat;
+    dout(10) << " rstat " << _fnode->rstat << " fragstat " << _fnode->fragstat
+	     << " on " << *f << dendl;
+
+    if (i == 0) {
+      // give any outstanding frag stat differential to first frag
+      dout(10) << " giving rstatdiff " << rstatdiff << " fragstatdiff" << fragstatdiff
+	       << " to " << *subfrags[0] << dendl;
+      _fnode->accounted_rstat.add(rstatdiff);
+      _fnode->accounted_fragstat.add(fragstatdiff);
+    }
+  }
+
+  finish_old_fragment(waiters, replay);
+}
+
+/**
+ * Merge the dirfrags in `subs` into this (new) dirfrag.
+ *
+ * dir_auth and freeze_tree_state are taken from the first sub-fragment and
+ * all others are asserted to agree.  For every sub-fragment its dentries
+ * are stolen, its replica map, version and kept state bits are merged in,
+ * its per-dentry waiters are collected, and it is finished and closed on
+ * the inode.  Finally the accounted stats of this fragment are rebuilt
+ * from the merged stats plus the outstanding deltas of the sub-fragments.
+ *
+ * @param subs     fragments to merge; must not be empty
+ * @param waiters  receives the contexts that were waiting on the
+ *                 sub-fragments
+ * @param replay   true during journal replay
+ */
+void CDir::merge(const std::vector<CDir*>& subs, MDSContext::vec& waiters, bool replay)
+{
+  dout(10) << "merge " << subs << dendl;
+
+  ceph_assert(subs.size() > 0);
+
+  set_dir_auth(subs.front()->get_dir_auth());
+  freeze_tree_state = subs.front()->freeze_tree_state;
+
+  for (const auto& dir : subs) {
+    ceph_assert(get_dir_auth() == dir->get_dir_auth());
+    ceph_assert(freeze_tree_state == dir->freeze_tree_state);
+  }
+
+  prepare_new_fragment(replay);
+
+  auto _fnode = _get_fnode();
+
+  nest_info_t rstatdiff;
+  frag_info_t fragstatdiff;
+  // only passed to add_delta() as out-parameters; never read in this function
+  bool touched_mtime, touched_chattr;
+  version_t rstat_version = inode->get_projected_inode()->rstat.version;
+  version_t dirstat_version = inode->get_projected_inode()->dirstat.version;
+
+  map<string_snap_t, MDSContext::vec > dentry_waiters;
+
+  for (const auto& dir : subs) {
+    dout(10) << " subfrag " << dir->get_frag() << " " << *dir << dendl;
+    ceph_assert(!dir->is_auth() || dir->is_complete() || replay);
+
+    // accumulate the not-yet-accounted stat deltas of fragments whose
+    // accounted stats are at the inode's current version
+    if (dir->get_fnode()->accounted_rstat.version == rstat_version)
+      rstatdiff.add_delta(dir->get_fnode()->accounted_rstat, dir->get_fnode()->rstat);
+    if (dir->get_fnode()->accounted_fragstat.version == dirstat_version)
+      fragstatdiff.add_delta(dir->get_fnode()->accounted_fragstat, dir->get_fnode()->fragstat,
+			     &touched_mtime, &touched_chattr);
+
+    dir->prepare_old_fragment(dentry_waiters, replay);
+
+    // steal dentries
+    // (steal_dentry() erases the dentry from dir->items, so this terminates)
+    while (!dir->items.empty()) 
+      steal_dentry(dir->items.begin()->second);
+    
+    // merge replica map
+    // (keep the larger value per key)
+    for (const auto &p : dir->get_replicas()) {
+      unsigned cur = get_replicas()[p.first];
+      if (p.second > cur)
+	get_replicas()[p.first] = p.second;
+    }
+
+    // merge version
+    if (dir->get_version() > _fnode->version)
+      _fnode->version = projected_version = dir->get_version();
+
+    // merge state
+    state_set(dir->get_state() & MASK_STATE_FRAGMENT_KEPT);
+
+    dir->finish_old_fragment(waiters, replay);
+    inode->close_dirfrag(dir->get_frag());
+  }
+
+  // adopt the per-dentry waiters collected from the sub-fragments
+  if (!dentry_waiters.empty()) {
+    get(PIN_DNWAITER);
+    for (const auto &p : dentry_waiters) {
+      auto &e = waiting_on_dentry[p.first];
+      for (const auto &waiter : p.second) {
+        e.push_back(waiter);
+      }
+    }
+  }
+
+  if (is_auth() && !replay)
+    mark_complete();
+
+  // FIXME: merge dirty old rstat
+  _fnode->rstat.version = rstat_version;
+  _fnode->accounted_rstat = _fnode->rstat;
+  _fnode->accounted_rstat.add(rstatdiff);
+
+  _fnode->fragstat.version = dirstat_version;
+  _fnode->accounted_fragstat = _fnode->fragstat;
+  _fnode->accounted_fragstat.add(fragstatdiff);
+
+  init_fragment_pins();
+}
+
+
+
+
+/*
+ * resync fragstat and accounted_fragstat with inode: if the projected
+ * fnode's accounted_fragstat version differs from the projected inode's
+ * dirstat version, adopt that version and reset accounted_fragstat to
+ * fragstat.
+ */
+void CDir::resync_accounted_fragstat()
+{
+  auto proj_fnode = _get_projected_fnode();
+  const auto& proj_inode = inode->get_projected_inode();
+
+  if (proj_fnode->accounted_fragstat.version == proj_inode->dirstat.version) {
+    return;  // already in sync
+  }
+
+  proj_fnode->fragstat.version = proj_inode->dirstat.version;
+  dout(10) << __func__ << " " << proj_fnode->accounted_fragstat << " -> " << proj_fnode->fragstat << dendl;
+  proj_fnode->accounted_fragstat = proj_fnode->fragstat;
+}
+
+/*
+ * resync rstat and accounted_rstat with inode: if the projected fnode's
+ * accounted_rstat version differs from the projected inode's rstat version,
+ * adopt that version, reset accounted_rstat to rstat and drop
+ * dirty_old_rstat.
+ */
+void CDir::resync_accounted_rstat()
+{
+  auto proj_fnode = _get_projected_fnode();
+  const auto& proj_inode = inode->get_projected_inode();
+
+  if (proj_fnode->accounted_rstat.version == proj_inode->rstat.version) {
+    return;  // already in sync
+  }
+
+  proj_fnode->rstat.version = proj_inode->rstat.version;
+  dout(10) << __func__ << " " << proj_fnode->accounted_rstat << " -> " << proj_fnode->rstat << dendl;
+  proj_fnode->accounted_rstat = proj_fnode->rstat;
+  dirty_old_rstat.clear();
+}
+
+/**
+ * For every non-frozen inode on the dirty_rstat_inodes list: auth-pin it in
+ * `mut`, project it with a new version and project its rstat into this
+ * dirfrag.  Sets STATE_ASSIMRSTAT so that
+ * assimilate_dirty_rstat_inodes_finish() knows there is work to complete.
+ */
+void CDir::assimilate_dirty_rstat_inodes(MutationRef& mut)
+{
+  dout(10) << __func__ << dendl;
+
+  auto p = dirty_rstat_inodes.begin_use_current();
+  while (!p.end()) {
+    CInode *in = *p;
+    ceph_assert(in->is_auth());
+
+    if (!in->is_frozen()) {
+      mut->auth_pin(in);
+
+      auto pi = in->project_inode(mut);
+      pi.inode->version = in->pre_dirty();
+
+      mdcache->project_rstat_inode_to_frag(mut, in, this, 0, 0, nullptr);
+    }
+    ++p;
+  }
+
+  state_set(STATE_ASSIMRSTAT);
+  dout(10) << __func__ << " done" << dendl;
+}
+
+/**
+ * Second half of assimilate_dirty_rstat_inodes(); a no-op unless
+ * STATE_ASSIMRSTAT is set.
+ *
+ * Clears the dirty-rstat flag of every non-frozen inode on the list and
+ * journals its projected primary dentry into `blob`.  If inodes remain on
+ * the list afterwards, the inode's nestlock is marked as an updated
+ * scatterlock.
+ */
+void CDir::assimilate_dirty_rstat_inodes_finish(EMetaBlob *blob)
+{
+  if (!state_test(STATE_ASSIMRSTAT)) {
+    return;
+  }
+  state_clear(STATE_ASSIMRSTAT);
+  dout(10) << __func__ << dendl;
+
+  for (auto p = dirty_rstat_inodes.begin_use_current(); !p.end(); ) {
+    CInode *in = *p;
+    // advance before the inode is possibly taken off the list
+    ++p;
+
+    if (!in->is_frozen()) {
+      CDentry *dn = in->get_projected_parent_dn();
+
+      in->clear_dirty_rstat();
+      blob->add_primary_dentry(dn, in, true);
+    }
+  }
+
+  if (!dirty_rstat_inodes.empty()) {
+    mdcache->mds->locker->mark_updated_scatterlock(&inode->nestlock);
+  }
+}
+
+
+
+
+/****************************************
+ * WAITING
+ */
+
+/**
+ * Queue context `c` to wait on the dentry (dname, snapid).  The first
+ * dentry waiter takes PIN_DNWAITER on this dirfrag.
+ */
+void CDir::add_dentry_waiter(std::string_view dname, snapid_t snapid, MDSContext *c)
+{
+  const bool first_waiter = waiting_on_dentry.empty();
+  if (first_waiter) {
+    get(PIN_DNWAITER);
+  }
+
+  auto &queue = waiting_on_dentry[string_snap_t(dname, snapid)];
+  queue.push_back(c);
+
+  dout(10) << __func__ << " dentry " << dname
+           << " snap " << snapid
+           << " " << c << " on " << *this << dendl;
+}
+
+/**
+ * Move all waiters registered for dentry `dname` with a snapid in
+ * [first, last] into `ls` and drop their entries.  Releases PIN_DNWAITER
+ * when no dentry waiters remain.
+ */
+void CDir::take_dentry_waiting(std::string_view dname, snapid_t first, snapid_t last,
+                               MDSContext::vec& ls)
+{
+  if (waiting_on_dentry.empty()) {
+    return;
+  }
+
+  string_snap_t lb(dname, first);
+  string_snap_t ub(dname, last);
+  for (auto it = waiting_on_dentry.lower_bound(lb);
+       it != waiting_on_dentry.end() && !(ub < it->first); ) {
+    dout(10) << __func__ << " " << dname
+             << " [" << first << "," << last << "] found waiter on snap "
+             << it->first.snapid
+             << " on " << *this << dendl;
+    ls.insert(ls.end(), it->second.begin(), it->second.end());
+    waiting_on_dentry.erase(it++);
+  }
+
+  if (waiting_on_dentry.empty()) {
+    put(PIN_DNWAITER);
+  }
+}
+
+/**
+ * Move every per-dentry waiter of this dirfrag into `ls`, clear the waiter
+ * map and release PIN_DNWAITER.  Does nothing if there are no dentry
+ * waiters.
+ */
+void CDir::take_sub_waiting(MDSContext::vec& ls)
+{
+  dout(10) << __func__ << dendl;
+  if (waiting_on_dentry.empty()) {
+    return;
+  }
+
+  for (const auto &entry : waiting_on_dentry) {
+    ls.insert(ls.end(), entry.second.begin(), entry.second.end());
+  }
+  waiting_on_dentry.clear();
+  put(PIN_DNWAITER);
+}
+
+
+
+/**
+ * Register context `c` to wait for the events in `tag`.
+ *
+ * A WAIT_ATSUBTREEROOT waiter added to a dirfrag that is not a subtree root
+ * is forwarded to the dirfrag containing this directory's parent dentry.
+ * WAIT_CREATED may only be waited for while STATE_CREATING is set.
+ */
+void CDir::add_waiter(uint64_t tag, MDSContext *c)
+{
+  // hierarchical?
+
+  // at subtree root?
+  if ((tag & WAIT_ATSUBTREEROOT) && !is_subtree_root()) {
+    // try parent
+    dout(10) << "add_waiter " << std::hex << tag << std::dec << " " << c << " should be ATSUBTREEROOT, " << *this << " is not root, trying parent" << dendl;
+    inode->parent->dir->add_waiter(tag, c);
+    return;
+  }
+
+  ceph_assert(!(tag & WAIT_CREATED) || state_test(STATE_CREATING));
+
+  MDSCacheObject::add_waiter(tag, c);
+}
+
+
+
+/*
+ * Collect the waiters matching `mask` into `ls`.
+ * NOTE: this checks dentry waiters too -- when WAIT_DENTRY is in the mask,
+ * all per-dentry waiters are taken and PIN_DNWAITER is released.
+ */
+void CDir::take_waiting(uint64_t mask, MDSContext::vec& ls)
+{
+  const bool want_dentry_waiters = (mask & WAIT_DENTRY) && !waiting_on_dentry.empty();
+  if (want_dentry_waiters) {
+    // take all dentry waiters
+    for (const auto &entry : waiting_on_dentry) {
+      dout(10) << "take_waiting dentry " << entry.first.name
+               << " snap " << entry.first.snapid << " on " << *this << dendl;
+      ls.insert(ls.end(), entry.second.begin(), entry.second.end());
+    }
+    waiting_on_dentry.clear();
+    put(PIN_DNWAITER);
+  }
+
+  // waiting
+  MDSCacheObject::take_waiting(mask, ls);
+}
+
+
+/**
+ * Take the waiters matching `mask` and complete them: with a negative
+ * `result` they are finished immediately with that result, otherwise they
+ * are queued on the MDS.
+ */
+void CDir::finish_waiting(uint64_t mask, int result)
+{
+  dout(11) << __func__ << " mask " << std::hex << mask << std::dec << " result " << result << " on " << *this << dendl;
+
+  MDSContext::vec taken;
+  take_waiting(mask, taken);
+
+  if (result >= 0) {
+    mdcache->mds->queue_waiters(taken);
+  } else {
+    finish_contexts(g_ceph_context, taken, result);
+  }
+}
+
+
+
+// dirty/clean
+
+/**
+ * Return a writable projected fnode for this dirfrag.
+ *
+ * If `mut` has already projected this dirfrag, the most recent projected
+ * fnode is returned.  Otherwise a copy of the current projected fnode is
+ * allocated, pending scrub stamps are folded into it, and it is appended to
+ * the projection queue (and recorded in `mut`, if given).
+ */
+CDir::fnode_ptr CDir::project_fnode(const MutationRef& mut)
+{
+  ceph_assert(get_version() != 0);
+
+  if (mut && mut->is_projected(this)) {
+    return std::const_pointer_cast<fnode_t>(projected_fnode.back());
+  }
+
+  auto pf = allocate_fnode(*get_projected_fnode());
+
+  const bool scrub_pending = scrub_infop && scrub_infop->last_scrub_dirty;
+  if (scrub_pending) {
+    // persist the latest scrub results with this projection
+    pf->localized_scrub_stamp = scrub_infop->last_local.time;
+    pf->localized_scrub_version = scrub_infop->last_local.version;
+    pf->recursive_scrub_stamp = scrub_infop->last_recursive.time;
+    pf->recursive_scrub_version = scrub_infop->last_recursive.version;
+    scrub_infop->last_scrub_dirty = false;
+    scrub_maybe_delete_info();
+  }
+
+  projected_fnode.emplace_back(pf);
+  if (mut) {
+    mut->add_projected_node(this);
+  }
+  dout(10) << __func__ <<  " " << pf.get() << dendl;
+  return pf;
+}
+
+/**
+ * Make the oldest projected fnode the current fnode and mark the dirfrag
+ * dirty in log segment `ls`.  The projection is also removed from `mut`,
+ * if given.
+ */
+void CDir::pop_and_dirty_projected_fnode(LogSegment *ls, const MutationRef& mut)
+{
+  ceph_assert(!projected_fnode.empty());
+
+  auto oldest = std::move(projected_fnode.front());
+  dout(15) << __func__ << " " << oldest.get() << " v" << oldest->version << dendl;
+  projected_fnode.pop_front();
+
+  if (mut) {
+    mut->remove_projected_node(this);
+  }
+
+  reset_fnode(std::move(oldest));
+  _mark_dirty(ls);
+}
+
+/**
+ * Reserve the next projected version.
+ *
+ * projected_version is first raised to `min` if it is lower, then
+ * incremented.
+ *
+ * @return the new projected version
+ */
+version_t CDir::pre_dirty(version_t min)
+{
+  if (projected_version < min) {
+    projected_version = min;
+  }
+  projected_version++;
+  dout(10) << __func__ << " " << projected_version << dendl;
+  return projected_version;
+}
+
+/**
+ * Mark this (auth) dirfrag dirty in log segment `ls`.
+ *
+ * @param ls  log segment the dirty dirfrag is attached to
+ * @param pv  projected version being applied; when non-zero it is checked
+ *            against the current and projected versions
+ */
+void CDir::mark_dirty(LogSegment *ls, version_t pv)
+{
+  ceph_assert(is_auth());
+
+  const bool check_version = (pv != 0);
+  if (check_version) {
+    ceph_assert(get_version() < pv);
+    ceph_assert(pv <= projected_version);
+    ceph_assert(!projected_fnode.empty() &&
+	        pv <= projected_fnode.front()->version);
+  }
+
+  _mark_dirty(ls);
+}
+
+/**
+ * Set the dirty flag if it is not set yet and, when a log segment is
+ * given, (re)attach this dirfrag to that segment's dirty list.  A dirfrag
+ * that has never been committed is also put on the segment's new-dirfrag
+ * list.  A clean dirfrag requires a non-null `ls` (asserted).
+ */
+void CDir::_mark_dirty(LogSegment *ls)
+{
+  if (state_test(STATE_DIRTY)) {
+    dout(10) << __func__ << " (already dirty) " << *this << " version " << get_version() << dendl;
+  } else {
+    dout(10) << __func__ << " (was clean) " << *this << " version " << get_version() << dendl;
+    _set_dirty_flag();
+    ceph_assert(ls);
+  }
+
+  if (!ls) {
+    return;
+  }
+
+  ls->dirty_dirfrags.push_back(&item_dirty);
+
+  // if i've never committed, i need to be before _any_ mention of me is trimmed from the journal.
+  const bool never_committed = (committed_version == 0);
+  if (never_committed && !item_new.is_on_list()) {
+    ls->new_dirfrags.push_back(&item_new);
+  }
+}
+
+// Put this dirfrag on `ls`'s new_dirfrags list, clear STATE_CREATING and
+// queue everything that was waiting on WAIT_CREATED.
+void CDir::mark_new(LogSegment *ls)
+{
+  ls->new_dirfrags.push_back(&item_new);
+  state_clear(STATE_CREATING);
+
+  MDSContext::vec created_waiters;
+  take_waiting(CDir::WAIT_CREATED, created_waiters);
+  mdcache->mds->queue_waiters(created_waiters);
+}
+
+// Install `ptr` as the fnode of a dirfrag that is neither projected nor
+// committing, and align projected/committing/committed versions with it.
+// A dirfrag in STATE_REJOINUNDEF is additionally reported to the cache as
+// opened.
+void CDir::set_fresh_fnode(fnode_const_ptr&& ptr)
+{
+  ceph_assert(inode->is_auth());
+  ceph_assert(!is_projected());
+  ceph_assert(!state_test(STATE_COMMITTING));
+
+  reset_fnode(std::move(ptr));
+
+  const version_t v = get_version();
+  committed_version = v;
+  committing_version = v;
+  projected_version = v;
+
+  if (!state_test(STATE_REJOINUNDEF))
+    return;
+
+  ceph_assert(mdcache->mds->is_rejoin());
+  state_clear(STATE_REJOINUNDEF);
+  mdcache->opened_undef_dirfrag(this);
+}
+
+// Drop the dirty state: unlink from the dirty/new lists, clear STATE_DIRTY
+// and release PIN_DIRTY.  No-op (apart from logging) when already clean.
+void CDir::mark_clean()
+{
+  dout(10) << __func__ << " " << *this << " version " << get_version() << dendl;
+
+  if (!state_test(STATE_DIRTY))
+    return;
+
+  item_dirty.remove_myself();
+  item_new.remove_myself();
+
+  state_clear(STATE_DIRTY);
+  put(PIN_DIRTY);
+}
+
+// Bump the fnode version and mark the dirfrag dirty in the current log
+// segment.  Does nothing if the dir is already dirty or a higher projected
+// version exists.
+// caller should hold auth pin of this
+void CDir::log_mark_dirty()
+{
+  if (is_dirty())
+    return; // noop: already dirty
+  if (projected_version > get_version())
+    return; // noop: will be dirty
+
+  auto bumped = allocate_fnode(*get_fnode());
+  bumped->version = pre_dirty();
+  reset_fnode(std::move(bumped));
+
+  mark_dirty(mdcache->mds->mdlog->get_current_segment());
+}
+
+// Flag the dirfrag as complete and reset the bloom member.
+void CDir::mark_complete()
+{
+  state_set(STATE_COMPLETE);
+  bloom.reset();
+}
+
+// Take a PIN_DIRFRAG reference on the owning inode.
+void CDir::first_get()
+{
+  this->inode->get(CInode::PIN_DIRFRAG);
+}
+
+// Release the PIN_DIRFRAG reference on the owning inode.
+void CDir::last_put()
+{
+  this->inode->put(CInode::PIN_DIRFRAG);
+}
+
+
+
+/******************************************************************************
+ * FETCH and COMMIT
+ */
+
+// -----------------------
+// FETCH
+// Convenience overload: full fetch with no particular dentry wanted.
+void CDir::fetch(MDSContext *c, bool ignore_authpinnability)
+{
+  const string no_wanted_dentry;
+  fetch(c, no_wanted_dentry, ignore_authpinnability);
+}
+
+// Fetch the whole dirfrag from its backing object.
+//
+// `c` (optional) is queued on WAIT_COMPLETE; `want_dn` (optional) is added to
+// wanted_items.  If the dirfrag cannot be auth-pinned and
+// `ignore_authpinnability` is false, the fetch is deferred by parking `c` on
+// WAIT_UNFREEZE (or silently dropped when there is no context).
+void CDir::fetch(MDSContext *c, std::string_view want_dn, bool ignore_authpinnability)
+{
+  dout(10) << "fetch on " << *this << dendl;
+
+  ceph_assert(is_auth());
+  ceph_assert(!is_complete());
+
+  const bool pinnable = can_auth_pin();
+  if (!pinnable && !ignore_authpinnability) {
+    if (!c) {
+      dout(7) << "fetch not authpinnable and no context" << dendl;
+      return;
+    }
+    dout(7) << "fetch waiting for authpinnable" << dendl;
+    add_waiter(WAIT_UNFREEZE, c);
+    return;
+  }
+
+  // unlinked directory inode shouldn't have any entry
+  const bool unlinked_dir = !inode->is_base() &&
+                            get_parent_dir()->inode->is_stray() &&
+                            !inode->snaprealm;
+  if (unlinked_dir) {
+    dout(7) << "fetch dirfrag for unlinked directory, mark complete" << dendl;
+    if (get_version() == 0) {
+      auto fresh = allocate_fnode();
+      fresh->version = 1;
+      set_fresh_fnode(std::move(fresh));
+    }
+    mark_complete();
+
+    if (c) {
+      mdcache->mds->queue_waiter(c);
+    }
+    return;
+  }
+
+  if (c) {
+    add_waiter(WAIT_COMPLETE, c);
+  }
+  if (!want_dn.empty()) {
+    wanted_items.insert(mempool::mds_co::string(want_dn));
+  }
+
+  // a fetch is already in flight; the waiter registered above will be kicked
+  if (state_test(CDir::STATE_FETCHING)) {
+    dout(7) << "already fetching; waiting" << dendl;
+    return;
+  }
+
+  auth_pin(this);
+  state_set(CDir::STATE_FETCHING);
+
+  if (mdcache->mds->logger) {
+    mdcache->mds->logger->inc(l_mds_dir_fetch);
+  }
+  mdcache->mds->balancer->hit_dir(this, META_POP_FETCH);
+
+  // an empty key set requests a full listing
+  std::set<dentry_key_t> no_keys;
+  _omap_fetch(nullptr, no_keys);
+}
+
+// Fetch only the dentries named by `keys`.
+//
+// `c` is parked on WAIT_UNFREEZE when the dirfrag cannot be auth-pinned, or
+// on WAIT_COMPLETE when a full fetch is already running; otherwise the keyed
+// read is issued with `c` as its completion.
+void CDir::fetch(MDSContext *c, const std::set<dentry_key_t>& keys)
+{
+  dout(10) << "fetch " << keys.size() << " keys on " << *this << dendl;
+
+  ceph_assert(is_auth());
+  ceph_assert(!is_complete());
+
+  if (!can_auth_pin()) {
+    dout(7) << "fetch keys waiting for authpinnable" << dendl;
+    add_waiter(WAIT_UNFREEZE, c);
+    return;
+  }
+
+  const bool full_fetch_running = state_test(CDir::STATE_FETCHING);
+  if (full_fetch_running) {
+    dout(7) << "fetch keys waiting for full fetch" << dendl;
+    add_waiter(WAIT_COMPLETE, c);
+    return;
+  }
+
+  auth_pin(this);
+
+  if (mdcache->mds->logger) {
+    mdcache->mds->logger->inc(l_mds_dir_fetch);
+  }
+  mdcache->mds->balancer->hit_dir(this, META_POP_FETCH);
+
+  _omap_fetch(c, keys);
+}
+
+/**
+ * Completion for a follow-up omap listing issued by CDir::_omap_fetch_more().
+ *
+ * `omap` carries the entries gathered by earlier reads and `omap_more`
+ * receives the batch of the read this context completes.  On completion the
+ * batches are merged; then either another read is issued (`more` set) or the
+ * accumulated result is handed to CDir::_omap_fetched().
+ */
+class C_IO_Dir_OMAP_FetchedMore : public CDirIOContext {
+  MDSContext *fin;  ///< optional completion, completed after the last batch
+public:
+  const version_t omap_version;      ///< version passed in when the listing was continued
+  bufferlist hdrbl;                  ///< omap header read by the first op
+  bool more = false;                 ///< set by the read if further keys remain
+  map<string, bufferlist> omap;      ///< carry-over from before
+  map<string, bufferlist> omap_more; ///< new batch
+  int ret;                           ///< result slot for the omap_get_vals op
+  C_IO_Dir_OMAP_FetchedMore(CDir *d, version_t v, MDSContext *f) :
+    CDirIOContext(d), fin(f), omap_version(v), ret(0) { }
+  // marked override for consistency with the sibling IO contexts in this file
+  void finish(int r) override {
+    // the dir's committed version moved past the one we started from:
+    // drop what we gathered and restart the listing from scratch
+    if (omap_version < dir->get_committed_version()) {
+      omap.clear();
+      dir->_omap_fetch(fin, {});
+      return;
+    }
+
+    // merge results
+    if (omap.empty()) {
+      omap.swap(omap_more);
+    } else {
+      omap.insert(omap_more.begin(), omap_more.end());
+    }
+    if (more) {
+      dir->_omap_fetch_more(omap_version, hdrbl, omap, fin);
+    } else {
+      // no caller-supplied completion means "treat as complete fetch"
+      dir->_omap_fetched(hdrbl, omap, !fin, r);
+      if (fin)
+	fin->complete(r);
+    }
+  }
+  void print(ostream& out) const override {
+    out << "dirfrag_fetch_more(" << dir->dirfrag() << ")";
+  }
+};
+
+/**
+ * Completion for the first omap read issued by CDir::_omap_fetch().
+ *
+ * Collects the header, the (first batch of) omap values and, optionally, the
+ * "parent" xattr used for backtrace verification.  ret1/ret2/ret3 are the
+ * per-op result slots for header, values and xattr respectively.
+ */
+class C_IO_Dir_OMAP_Fetched : public CDirIOContext {
+  MDSContext *fin;
+public:
+  const version_t omap_version;
+  bufferlist hdrbl;
+  bool more = false;
+  map<string, bufferlist> omap;
+  bufferlist btbl;
+  int ret1 = 0;
+  int ret2 = 0;
+  int ret3 = 0;
+
+  C_IO_Dir_OMAP_Fetched(CDir *d, MDSContext *f) :
+    CDirIOContext(d), fin(f),
+    omap_version(d->get_committing_version()) { }
+
+  void finish(int r) override {
+    if (r >= 0) {
+      // check the correctness of backtrace
+      if (ret3 != -CEPHFS_ECANCELED)
+        dir->inode->verify_diri_backtrace(btbl, ret3);
+      // fold the per-op results into r: header first, then values
+      r = ret1;
+      if (r >= 0)
+        r = ret2;
+    }
+
+    if (!more) {
+      dir->_omap_fetched(hdrbl, omap, !fin, r);
+      if (fin)
+        fin->complete(r);
+      return;
+    }
+
+    // more keys remain: restart if a newer version was committed meanwhile,
+    // otherwise continue the listing
+    if (omap_version < dir->get_committed_version()) {
+      omap.clear();
+      dir->_omap_fetch(fin, {});
+    } else {
+      dir->_omap_fetch_more(omap_version, hdrbl, omap, fin);
+    }
+  }
+
+  void print(ostream& out) const override {
+    out << "dirfrag_fetch(" << dir->dirfrag() << ")";
+  }
+};
+
+// Issue the object read for a fetch.
+//
+// With an empty `keys` set the first batch of a full omap listing is read
+// (`c` must be null); otherwise only the listed keys are read (`c` must be
+// set).  The omap header is always read, and the "parent" xattr is read too
+// when backtrace verification is enabled and this is the root frag.
+void CDir::_omap_fetch(MDSContext *c, const std::set<dentry_key_t>& keys)
+{
+  auto *fetched = new C_IO_Dir_OMAP_Fetched(this, c);
+
+  ObjectOperation rd;
+  rd.omap_get_header(&fetched->hdrbl, &fetched->ret1);
+
+  const bool full_listing = keys.empty();
+  if (full_listing) {
+    ceph_assert(!c);
+    rd.omap_get_vals("", "", g_conf()->mds_dir_keys_per_op,
+                     &fetched->omap, &fetched->more, &fetched->ret2);
+  } else {
+    ceph_assert(c);
+    std::set<std::string> encoded_keys;
+    for (auto k : keys) {
+      string enc;
+      k.encode(enc);
+      encoded_keys.insert(enc);
+    }
+    rd.omap_get_vals_by_keys(encoded_keys, &fetched->omap, &fetched->ret2);
+  }
+
+  // check the correctness of backtrace
+  const bool verify_backtrace =
+    g_conf()->mds_verify_backtrace > 0 && frag == frag_t();
+  if (verify_backtrace) {
+    rd.getxattr("parent", &fetched->btbl, &fetched->ret3);
+    rd.set_last_op_flags(CEPH_OSD_OP_FLAG_FAILOK);
+  } else {
+    // marks "xattr not requested" for the completion
+    fetched->ret3 = -CEPHFS_ECANCELED;
+  }
+
+  object_t oid = get_ondisk_object();
+  object_locator_t oloc(mdcache->mds->mdsmap->get_metadata_pool());
+  mdcache->mds->objecter->read(oid, oloc, rd, CEPH_NOSNAP, NULL, 0,
+                               new C_OnFinisher(fetched, mdcache->mds->finisher));
+}
+
+// Continue a full omap listing after the last key already gathered.
+// `hdrbl` and `omap` are moved/swapped into the new completion, so both are
+// left empty for the caller.
+void CDir::_omap_fetch_more(version_t omap_version, bufferlist& hdrbl,
+			    map<string, bufferlist>& omap, MDSContext *c)
+{
+  // we have more omap keys to fetch!
+  auto *next = new C_IO_Dir_OMAP_FetchedMore(this, omap_version, c);
+  next->hdrbl = std::move(hdrbl);
+  next->omap.swap(omap);
+
+  // resume after the greatest key seen so far
+  const auto& last_key = next->omap.rbegin()->first;
+
+  ObjectOperation rd;
+  rd.omap_get_vals(last_key,
+                   "", /* filter prefix */
+                   g_conf()->mds_dir_keys_per_op,
+                   &next->omap_more,
+                   &next->more,
+                   &next->ret);
+
+  object_t oid = get_ondisk_object();
+  object_locator_t oloc(mdcache->mds->mdsmap->get_metadata_pool());
+  mdcache->mds->objecter->read(oid, oloc, rd, CEPH_NOSNAP, NULL, 0,
+                               new C_OnFinisher(next, mdcache->mds->finisher));
+}
+
+CDentry *CDir::_load_dentry(
+    std::string_view key,  // raw omap key; recorded in stale_items when the entry is stale and not cached
+    std::string_view dname,  // dentry name, used for cache lookup and when adding a new dentry
+    const snapid_t last,  // last snapid covered by this entry
+    bufferlist &bl,  // encoded value: snapid_t first, a one-char type marker, then the payload
+    const int pos,  // only used in log and error messages here
+    const std::set<snapid_t> *snaps,  // when non-null: entry is stale if no member falls in [first,last]
+    double rand_threshold,  // forwarded to CInode::maybe_ephemeral_rand() for loaded inodes
+    bool *force_dirty)  // set to true when a stale key is queued in stale_items
+{  // Decode one omap value and reconcile it with the dentry cache; returns the resulting dentry (may be null)
+  auto q = bl.cbegin();
+  // throws buffer::malformed_input on an unknown type marker (see the final else branch)
+  snapid_t first;
+  decode(first, q);
+
+  // marker
+  char type;
+  decode(type, q);
+
+  dout(20) << "_fetched pos " << pos << " marker '" << type << "' dname '" << dname
+           << " [" << first << "," << last << "]"
+           << dendl;
+
+  bool stale = false;
+  if (snaps && last != CEPH_NOSNAP) {
+    set<snapid_t>::const_iterator p = snaps->lower_bound(first);  // first snap >= first
+    if (p == snaps->end() || *p > last) {
+      dout(10) << " skipping stale dentry on [" << first << "," << last << "]" << dendl;
+      stale = true;
+    }
+  }
+
+  /*
+   * look for existing dentry for _last_ snap, because unlink +
+   * create may leave a "hole" (epochs during which the dentry
+   * doesn't exist) but for which no explicit negative dentry is in
+   * the cache.
+   */
+  CDentry *dn;
+  if (stale)
+    dn = lookup_exact_snap(dname, last);
+  else
+    dn = lookup(dname, last);
+
+  if (type == 'L' || type == 'l') {
+    // hard link
+    inodeno_t ino;
+    unsigned char d_type;
+    mempool::mds_co::string alternate_name;
+
+    CDentry::decode_remote(type, ino, d_type, alternate_name, q);
+
+    if (stale) {
+      if (!dn) {
+        stale_items.insert(mempool::mds_co::string(key));  // remember the key so it can be removed later
+        *force_dirty = true;
+      }
+      return dn;
+    }
+
+    if (dn) {
+      CDentry::linkage_t *dnl = dn->get_linkage();
+      dout(12) << "_fetched had " << (dnl->is_null() ? "NEG" : "") << " dentry " << *dn << dendl;
+      if (committed_version == 0 &&
+	  dnl->is_remote() &&
+	  dn->is_dirty() &&
+	  ino == dnl->get_remote_ino() &&
+	  d_type == dnl->get_remote_d_type() &&
+          alternate_name == dn->get_alternate_name()) {
+	// see the "clean underwater item?" comment in the primary-inode branch below
+	dout(10) << "_fetched  had underwater dentry " << *dn << ", marking clean" << dendl;
+	dn->mark_clean();
+      }
+    } else {
+      // (remote) link
+      dn = add_remote_dentry(dname, ino, d_type, std::move(alternate_name), first, last);
+
+      // link to inode?
+      CInode *in = mdcache->get_inode(ino);   // we may or may not have it.
+      if (in) {
+        dn->link_remote(dn->get_linkage(), in);
+        dout(12) << "_fetched  got remote link " << ino << " which we have " << *in << dendl;
+      } else {
+        dout(12) << "_fetched  got remote link " << ino << " (don't have it)" << dendl;
+      }
+    }
+  }
+  else if (type == 'I' || type == 'i') {
+    InodeStore inode_data;
+    mempool::mds_co::string alternate_name;
+    // inode
+    // Load inode data before looking up or constructing CInode
+    if (type == 'i') {  // 'i': versioned envelope, may carry alternate_name (struct_v >= 2)
+      DECODE_START(2, q);
+      if (struct_v >= 2) {
+        decode(alternate_name, q);
+      }
+      inode_data.decode(q);
+      DECODE_FINISH(q);
+    } else {  // 'I': bare inode encoding without the envelope
+      inode_data.decode_bare(q);
+    }
+
+    if (stale) {
+      if (!dn) {
+        stale_items.insert(mempool::mds_co::string(key));  // remember the key so it can be removed later
+        *force_dirty = true;
+      }
+      return dn;
+    }
+
+    bool undef_inode = false;  // true when the cached inode is a REJOINUNDEF placeholder to be filled in
+    if (dn) {
+      CDentry::linkage_t *dnl = dn->get_linkage();
+      dout(12) << "_fetched had " << (dnl->is_null() ? "NEG" : "") << " dentry " << *dn << dendl;
+
+      if (dnl->is_primary()) {
+	CInode *in = dnl->get_inode();
+	if (in->state_test(CInode::STATE_REJOINUNDEF)) {
+	  undef_inode = true;
+	} else if (committed_version == 0 &&
+		   dn->is_dirty() &&
+		   inode_data.inode->ino == in->ino() &&
+		   inode_data.inode->version == in->get_version()) {
+	  /* clean underwater item?
+	   * Underwater item is something that is dirty in our cache from
+	   * journal replay, but was previously flushed to disk before the
+	   * mds failed.
+	   *
+	   * We only do this if committed_version == 0. that implies either
+	   * - this is a fetch after a clean/empty CDir is created
+	   *   (and has no effect, since the dn won't exist); or
+	   * - this is a fetch after _recovery_, which is what we're worried
+	   *   about.  Items that are marked dirty from the journal should be
+	   *   marked clean if they appear on disk.
+	   */
+	  dout(10) << "_fetched  had underwater dentry " << *dn << ", marking clean" << dendl;
+	  dn->mark_clean();
+	  dout(10) << "_fetched  had underwater inode " << *dnl->get_inode() << ", marking clean" << dendl;
+	  in->mark_clean();
+	}
+      }
+    }
+
+    if (!dn || undef_inode) {
+      // add inode
+      CInode *in = mdcache->get_inode(inode_data.inode->ino, last);
+      if (!in || undef_inode) {
+        if (undef_inode && in)
+          in->first = first;
+        else
+          in = new CInode(mdcache, true, first, last);
+        // populate the (new or placeholder) inode from the decoded store
+        in->reset_inode(std::move(inode_data.inode));
+        in->reset_xattrs(std::move(inode_data.xattrs));
+        // symlink?
+        if (in->is_symlink()) 
+          in->symlink = inode_data.symlink;
+        
+        in->dirfragtree.swap(inode_data.dirfragtree);
+        in->reset_old_inodes(std::move(inode_data.old_inodes));
+        if (in->is_any_old_inodes()) {
+	  snapid_t min_first = in->get_old_inodes()->rbegin()->first + 1;  // one past the greatest old_inodes key
+	  if (min_first > in->first)
+	    in->first = min_first;
+	}
+
+        in->oldest_snap = inode_data.oldest_snap;
+        in->decode_snap_blob(inode_data.snap_blob);
+        if (snaps && !in->snaprealm)
+          in->purge_stale_snap_data(*snaps);
+
+        if (!undef_inode) {
+          mdcache->add_inode(in); // add
+          dn = add_primary_dentry(dname, in, std::move(alternate_name), first, last); // link
+        }
+        dout(12) << "_fetched  got " << *dn << " " << *in << dendl;
+
+        if (in->get_inode()->is_dirty_rstat())
+          in->mark_dirty_rstat();
+
+        in->maybe_ephemeral_rand(rand_threshold);
+        //in->hack_accessed = false;
+        //in->hack_load_stamp = ceph_clock_now();
+        //num_new_inodes_loaded++;
+      } else if (g_conf().get_val<bool>("mds_hack_allow_loading_invalid_metadata")) {
+	dout(20) << "hack: adding duplicate dentry for " << *in << dendl;
+	dn = add_primary_dentry(dname, in, std::move(alternate_name), first, last);
+      } else {
+        dout(0) << "_fetched  badness: got (but i already had) " << *in
+                << " mode " << in->get_inode()->mode
+                << " mtime " << in->get_inode()->mtime << dendl;
+        string dirpath, inopath;
+        this->inode->make_path_string(dirpath);
+        in->make_path_string(inopath);
+        mdcache->mds->clog->error() << "loaded dup inode " << inode_data.inode->ino
+          << " [" << first << "," << last << "] v" << inode_data.inode->version
+          << " at " << dirpath << "/" << dname
+          << ", but inode " << in->vino() << " v" << in->get_version()
+	  << " already exists at " << inopath;
+        return dn;  // dn is null here: the duplicate is reported but not linked
+      }
+    }
+  } else {
+    CachedStackStringStream css;
+    *css << "Invalid tag char '" << type << "' pos " << pos;
+    throw buffer::malformed_input(css->str());
+  }
+
+  return dn;
+}
+
+void CDir::_omap_fetched(bufferlist& hdrbl, map<string, bufferlist>& omap,  // hdrbl: encoded fnode header; omap: key -> encoded dentry value
+			 bool complete, int r)  // complete: mark the dirfrag complete when done; r: read result (0, -ENOENT or -ENODATA)
+{  // Consume the result of an omap read: validate the header, adopt the fnode if fresh, load each dentry
+  LogChannelRef clog = mdcache->mds->clog;
+  dout(10) << "_fetched header " << hdrbl.length() << " bytes "
+	   << omap.size() << " keys for " << *this << dendl;
+
+  ceph_assert(r == 0 || r == -CEPHFS_ENOENT || r == -CEPHFS_ENODATA);
+  ceph_assert(is_auth());
+  ceph_assert(!is_frozen());
+
+  if (hdrbl.length() == 0) {  // an empty header is treated as a missing object
+    dout(0) << "_fetched missing object for " << *this << dendl;
+
+    clog->error() << "dir " << dirfrag() << " object missing on disk; some "
+                     "files may be lost (" << get_path() << ")";
+
+    go_bad(complete);
+    return;
+  }
+
+  fnode_t got_fnode;
+  {
+    auto p = hdrbl.cbegin();
+    try {
+      decode(got_fnode, p);
+    } catch (const buffer::error &err) {
+      derr << "Corrupt fnode in dirfrag " << dirfrag()
+	   << ": " << err.what() << dendl;
+      clog->warn() << "Corrupt fnode header in " << dirfrag() << ": "
+		   << err.what() << " (" << get_path() << ")";
+      go_bad(complete);
+      return;
+    }
+    if (!p.end()) {  // trailing bytes after the fnode also count as damage
+      clog->warn() << "header buffer of dir " << dirfrag() << " has "
+		  << hdrbl.length() - p.get_off() << " extra bytes ("
+                  << get_path() << ")";
+      go_bad(complete);
+      return;
+    }
+  }
+
+  dout(10) << "_fetched version " << got_fnode.version << dendl;
+  
+  // take the loaded fnode?
+  // only if we are a fresh CDir* with no prior state.
+  if (get_version() == 0) {
+    set_fresh_fnode(allocate_fnode(got_fnode));
+  }
+
+  list<CInode*> undef_inodes;  // REJOINUNDEF inodes seen during this load, finalized after the loop
+
+  // purge stale snaps?
+  bool force_dirty = false;
+  const set<snapid_t> *snaps = NULL;
+  SnapRealm *realm = inode->find_snaprealm();
+  if (fnode->snap_purged_thru < realm->get_last_destroyed()) {
+    snaps = &realm->get_snaps();
+    dout(10) << " snap_purged_thru " << fnode->snap_purged_thru
+	     << " < " << realm->get_last_destroyed()
+	     << ", snap purge based on " << *snaps << dendl;
+    if (get_num_snap_items() == 0) {  // no snap items counted: just advance snap_purged_thru in place
+      const_cast<snapid_t&>(fnode->snap_purged_thru) = realm->get_last_destroyed();
+      force_dirty = true;
+    }
+  }
+
+  int count = 0;
+  unsigned pos = omap.size() - 1;  // index of the current entry; entries are walked in reverse key order
+  double rand_threshold = get_inode()->get_ephemeral_rand();
+  for (map<string, bufferlist>::reverse_iterator p = omap.rbegin();
+       p != omap.rend();
+       ++p, --pos) {
+    string dname;
+    snapid_t last;
+    dentry_key_t::decode_helper(p->first, dname, last);
+
+    if (!(++count % mdcache->mds->heartbeat_reset_grace(2)))  // periodically reset the heartbeat during long loads
+      mdcache->mds->heartbeat_reset();
+
+    CDentry *dn = NULL;
+    try {
+      dn = _load_dentry(
+            p->first, dname, last, p->second, pos, snaps,
+            rand_threshold, &force_dirty);
+    } catch (const buffer::error &err) {
+      mdcache->mds->clog->warn() << "Corrupt dentry '" << dname << "' in "
+                                  "dir frag " << dirfrag() << ": "
+                               << err.what() << "(" << get_path() << ")";
+
+      // Remember that this dentry is damaged.  Subsequent operations
+      // that try to act directly on it will get their CEPHFS_EIOs, but this
+      // dirfrag as a whole will continue to look okay (minus the
+      // mysteriously-missing dentry)
+      go_bad_dentry(last, dname);
+
+      // Anyone who was WAIT_DENTRY for this guy will get kicked
+      // to RetryRequest, and hit the DamageTable-interrogating path.
+      // Stats will now be bogus because we will think we're complete,
+      // but have 1 or more missing dentries.
+      continue;
+    }
+
+    if (!dn)
+      continue;
+
+    CDentry::linkage_t *dnl = dn->get_linkage();
+    if (dnl->is_primary() && dnl->get_inode()->state_test(CInode::STATE_REJOINUNDEF))
+      undef_inodes.push_back(dnl->get_inode());
+
+    if (wanted_items.count(mempool::mds_co::string(dname)) > 0 || !complete) {  // keyed (partial) fetches touch every loaded dentry
+      dout(10) << " touching wanted dn " << *dn << dendl;
+      mdcache->touch_dentry(dn);
+    }
+  }
+
+  //cache->mds->logger->inc("newin", num_new_inodes_loaded);
+
+  // mark complete, !fetching
+  if (complete) {
+    wanted_items.clear();
+    mark_complete();
+    state_clear(STATE_FETCHING);
+  }
+
+  // open & force frags
+  while (!undef_inodes.empty()) {
+    CInode *in = undef_inodes.front();
+    undef_inodes.pop_front();
+    in->state_clear(CInode::STATE_REJOINUNDEF);
+    mdcache->opened_undef_inode(in);
+
+    if (!(++count % mdcache->mds->heartbeat_reset_grace()))
+      mdcache->mds->heartbeat_reset();
+  }
+
+  // dirty myself to remove stale snap dentries
+  if (force_dirty && !mdcache->is_readonly())
+    log_mark_dirty();
+
+  auth_unpin(this);  // drops the pin taken when the fetch was started
+
+  if (complete) {
+    // kick waiters
+    finish_waiting(WAIT_COMPLETE, 0);
+  }
+}
+
+// Report a damaged dentry to the damage table.  If the table declares the
+// damage fatal, the MDS is marked damaged.
+void CDir::go_bad_dentry(snapid_t last, std::string_view dname)
+{
+  dout(10) << __func__ << " " << dname << dendl;
+
+  std::string dentry_path(get_path());
+  dentry_path += "/";
+  dentry_path += dname;
+
+  const bool fatal = mdcache->mds->damage_table.notify_dentry(
+      inode->ino(), frag, last, dname, dentry_path);
+  if (!fatal)
+    return;
+
+  mdcache->mds->damaged();
+  ceph_abort();  // unreachable, damaged() respawns us
+}
+
+// Report this dirfrag as damaged and abort the running fetch.
+//
+// When `complete` is set the dirfrag is flagged STATE_BADFRAG and marked
+// complete (with a version-1 fnode if it had none).  In all non-fatal cases
+// the FETCHING state and auth pin are dropped and WAIT_COMPLETE waiters are
+// finished with -CEPHFS_EIO.
+void CDir::go_bad(bool complete)
+{
+  dout(10) << __func__ << " " << frag << dendl;
+
+  if (mdcache->mds->damage_table.notify_dirfrag(
+        inode->ino(), frag, get_path())) {
+    mdcache->mds->damaged();
+    ceph_abort();  // unreachable, damaged() respawns us
+  }
+
+  if (complete) {
+    const bool no_fnode_yet = (get_version() == 0);
+    if (no_fnode_yet) {
+      auto placeholder = allocate_fnode();
+      placeholder->version = 1;
+      reset_fnode(std::move(placeholder));
+    }
+
+    state_set(STATE_BADFRAG);
+    mark_complete();
+  }
+
+  state_clear(STATE_FETCHING);
+  auth_unpin(this);
+  finish_waiting(WAIT_COMPLETE, -CEPHFS_EIO);
+}
+
+// -----------------------
+// COMMIT
+
+/**
+ * commit
+ *
+ * Queue a commit of this dirfrag and register a completion for it.
+ *
+ * @param want - min version i want committed (0 means the current version)
+ * @param c - callback for completion; a no-op context is substituted if null
+ * @param ignore_authpinnability - skip the can_auth_pin() precondition
+ * @param op_prio - priority handed on to _commit()
+ */
+void CDir::commit(version_t want, MDSContext *c, bool ignore_authpinnability, int op_prio)
+{
+  dout(10) << "commit want " << want << " on " << *this << dendl;
+  if (want == 0) {
+    want = get_version();
+  }
+
+  // preconditions
+  ceph_assert(want <= get_version() || get_version() == 0);    // can't commit the future
+  ceph_assert(want > committed_version); // the caller is stupid
+  ceph_assert(is_auth());
+  ceph_assert(ignore_authpinnability || can_auth_pin());
+
+  // note: queue up a noop if necessary, so that we always
+  // get an auth_pin.
+  if (c == nullptr) {
+    c = new C_MDSInternalNoop;
+  }
+
+  // auth_pin on first waiter
+  const bool first_waiter = waiting_for_commit.empty();
+  if (first_waiter) {
+    auth_pin(this);
+  }
+  auto& waiters = waiting_for_commit[want];
+  waiters.push_back(c);
+
+  // ok.
+  _commit(want, op_prio);
+}
+
+// IO completion that reports a finished dirfrag commit of a given version
+// back to CDir::_committed().
+class C_IO_Dir_Committed : public CDirIOContext {
+  version_t committed_v;
+
+public:
+  C_IO_Dir_Committed(CDir *d, version_t v)
+    : CDirIOContext(d), committed_v(v)
+  {}
+
+  void finish(int r) override
+  {
+    dir->_committed(r, committed_v);
+  }
+
+  void print(ostream& out) const override
+  {
+    out << "dirfrag_committed(" << dir->dirfrag() << ")";
+  }
+};
+
+// Context carrying everything CDir::_omap_commit() prepared, so that
+// CDir::_omap_commit_ops() can run when this context is finished.
+// The metadata pool, dir version and is_new flag are sampled at construction;
+// the containers are taken over from the caller by swapping.
+class C_IO_Dir_Commit_Ops : public Context {
+public:
+  C_IO_Dir_Commit_Ops(CDir *d, int pr,
+		      vector<CDir::dentry_commit_item> &&s, bufferlist &&bl,
+		      vector<string> &&r,
+		      mempool::mds_co::compact_set<mempool::mds_co::string> &&stales)
+    : dir(d),
+      op_prio(pr),
+      metapool(d->mdcache->mds->get_metadata_pool()),
+      version(d->get_version()),
+      is_new(d->is_new())
+  {
+    to_set.swap(s);
+    dfts.swap(bl);
+    to_remove.swap(r);
+    stale_items.swap(stales);
+  }
+
+  void finish(int r) override
+  {
+    dir->_omap_commit_ops(r, op_prio, metapool, version, is_new, to_set, dfts,
+			  to_remove, stale_items);
+  }
+
+private:
+  CDir *dir;
+  int op_prio;
+  int64_t metapool;
+  version_t version;
+  bool is_new;
+  vector<CDir::dentry_commit_item> to_set;
+  bufferlist dfts;
+  vector<string> to_remove;
+  mempool::mds_co::compact_set<mempool::mds_co::string> stale_items;
+};
+
+// This does the same thing as InodeStoreBase::encode()
+// Encode the inode-store portion of a primary dentry from a commit item.
+// The item's pre-encoded dirfragtree bytes are spliced off the front of
+// `dfts` (item.dft_len bytes) into `bl` at the matching position.
+void CDir::_encode_primary_inode_base(dentry_commit_item &item, bufferlist &dfts,
+                                      bufferlist &bl)
+{
+  ENCODE_START(6, 4, bl);
+  encode(*item.inode, bl, item.features);
+
+  if (!item.symlink.empty()) {
+    encode(item.symlink, bl);
+  }
+
+  // dirfragtree
+  dfts.splice(0, item.dft_len, &bl);
+
+  // xattrs, or an empty count when there are none
+  if (!item.xattrs) {
+    encode((__u32)0, bl);
+  } else {
+    encode(*item.xattrs, bl);
+  }
+
+  // snaprealm blob; stays empty when the inode has no snaprealm
+  bufferlist snapr_bl;
+  if (item.snaprealm) {
+    encode(item.srnode, snapr_bl);
+  }
+  encode(snapr_bl, bl);
+
+  // old inodes, or an empty count when there are none
+  if (!item.old_inodes) {
+    encode((__u32)0, bl);
+  } else {
+    encode(*item.old_inodes, bl, item.features);
+  }
+
+  encode(item.oldest_snap, bl);
+  encode(item.damage_flags, bl);
+  ENCODE_FINISH(bl);
+}
+
+// This is not locked by mds_lock
+//
+// Turn the prepared removals and updates into one or more object mutations,
+// each kept below mdcache->max_dir_commit_size, and send them.  The omap
+// header (the encoded fnode) is attached to the final mutation only.
+//
+// @param r         result handed to the context; < 0 is reported as a write error
+// @param op_prio   priority set on every ObjectOperation
+// @param metapool  pool id used for the object locator
+// @param version   version reported to C_IO_Dir_Committed once all ops finish
+// @param _new      when false, each op is prefixed with a stat
+// @param to_set    dentries to write (keys are moved out of the items)
+// @param dfts      concatenated pre-encoded dirfragtrees, consumed in order
+// @param to_remove keys to delete (moved out)
+// @param stales    stale keys to delete
+void CDir::_omap_commit_ops(int r, int op_prio, int64_t metapool, version_t version, bool _new,
+			    vector<dentry_commit_item> &to_set, bufferlist &dfts,
+                            vector<string>& to_remove,
+			    mempool::mds_co::compact_set<mempool::mds_co::string> &stales)
+{
+  dout(10) << __func__ << dendl;
+
+  if (r < 0) {
+    mdcache->mds->handle_write_error_with_lock(r);
+    return;
+  }
+
+  C_GatherBuilder gather(g_ceph_context,
+                         new C_OnFinisher(new C_IO_Dir_Committed(this, version),
+			 mdcache->mds->finisher));
+
+  SnapContext snapc;
+  object_t oid = get_ondisk_object();
+  object_locator_t oloc(metapool);
+
+  // pending batch: keys to set / remove in the next mutation
+  map<string, bufferlist> _set;
+  set<string> _rm;
+
+  unsigned max_write_size = mdcache->max_dir_commit_size;
+  unsigned write_size = 0;
+
+  // send the pending batch as one mutation and start a new, empty batch
+  auto commit_one = [&](bool with_header=false) {
+    ObjectOperation op;
+
+    // don't create new dirfrag blindly
+    if (!_new)
+      op.stat(nullptr, nullptr, nullptr);
+
+    /*
+     * save the header at the last moment.. If we were to send it off before
+     * other updates, but die before sending them all, we'd think that the
+     * on-disk state was fully committed even though it wasn't! However, since
+     * the messages are strictly ordered between the MDS and the OSD, and
+     * since messages to a given PG are strictly ordered, if we simply send
+     * the message containing the header off last, we cannot get our header
+     * into an incorrect state.
+     */
+    if (with_header) {
+      bufferlist hdr_bl;
+      encode(*fnode, hdr_bl);
+      op.omap_set_header(hdr_bl);
+    }
+
+    op.priority = op_prio;
+    if (!_set.empty())
+      op.omap_set(_set);
+    if (!_rm.empty())
+      op.omap_rm_keys(_rm);
+    mdcache->mds->objecter->mutate(oid, oloc, op, snapc,
+                                   ceph::real_clock::now(),
+                                   0, gather.new_sub());
+    write_size = 0;
+    _set.clear();
+    _rm.clear();
+  };
+
+  int count = 0;
+  for (auto &key : stales) {
+    unsigned size = key.length() + sizeof(__u32);
+    if (write_size + size > max_write_size)
+      commit_one();
+
+    write_size += size;
+    _rm.emplace(key);
+
+    if (!(++count % mdcache->mds->heartbeat_reset_grace(2)))
+      mdcache->mds->heartbeat_reset();
+  }
+
+  for (auto &key : to_remove) {
+    unsigned size = key.length() + sizeof(__u32);
+    if (write_size + size > max_write_size)
+      commit_one();
+
+    write_size += size;
+    _rm.emplace(std::move(key));
+
+    if (!(++count % mdcache->mds->heartbeat_reset_grace(2)))
+      mdcache->mds->heartbeat_reset();
+  }
+
+  bufferlist bl;
+  using ceph::encode;
+  for (auto &item : to_set) {
+    encode(item.first, bl);
+    if (item.is_remote) {
+      // remote link
+      CDentry::encode_remote(item.ino, item.d_type, item.alternate_name, bl);
+    } else {
+      // marker, name, inode, [symlink string]
+      bl.append('i');         // inode
+
+      ENCODE_START(2, 1, bl);
+      encode(item.alternate_name, bl);
+      _encode_primary_inode_base(item, dfts, bl);
+      ENCODE_FINISH(bl);
+    }
+
+    unsigned size = item.key.length() + bl.length() + 2 * sizeof(__u32);
+    if (write_size + size > max_write_size)
+      commit_one();
+
+    write_size += size;
+    // swap hands the encoded value to the batch and leaves bl empty for reuse
+    _set[std::move(item.key)].swap(bl);
+
+    if (!(++count % mdcache->mds->heartbeat_reset_grace()))
+      mdcache->mds->heartbeat_reset();
+  }
+
+  // final mutation carries the header
+  commit_one(true);
+  gather.activate();
+}
+
+/**
+ * Flush out the modified dentries in this dir. Keep the bufferlist
+ * below max_write_size;
+ */
+void CDir::_omap_commit(int op_prio)  // op_prio < 0 selects CEPH_MSG_PRIO_DEFAULT
+{  // Collect the dentries to write/remove into commit items and queue a C_IO_Dir_Commit_Ops on the finisher
+  dout(10) << __func__ << dendl;
+
+  if (op_prio < 0)
+    op_prio = CEPH_MSG_PRIO_DEFAULT;
+
+  // snap purge?
+  const set<snapid_t> *snaps = NULL;
+  SnapRealm *realm = inode->find_snaprealm();
+  if (fnode->snap_purged_thru < realm->get_last_destroyed()) {
+    snaps = &realm->get_snaps();
+    dout(10) << " snap_purged_thru " << fnode->snap_purged_thru
+	     << " < " << realm->get_last_destroyed()
+	     << ", snap purge based on " << *snaps << dendl;
+    // fnode.snap_purged_thru = realm->get_last_destroyed();
+  }
+
+  size_t items_count = 0;  // upper bound on the number of entries, used only for reserve()
+  if (state_test(CDir::STATE_FRAGMENTING) && is_new()) {
+    items_count = get_num_head_items() + get_num_snap_items();
+  } else {
+    for (elist<CDentry*>::iterator it = dirty_dentries.begin(); !it.end(); ++it)
+      ++items_count;
+  }
+
+  vector<string> to_remove;
+  // reserve enough memory, which may be more than is actually needed
+  to_remove.reserve(items_count);
+
+  vector<dentry_commit_item> to_set;
+  // reserve enough memory, which may be more than is actually needed
+  to_set.reserve(items_count);
+
+  // for dir fragtrees
+  bufferlist dfts(CEPH_PAGE_SIZE);
+  // classify one dentry: queue its key for removal, or build a commit item for it
+  auto write_one = [&](CDentry *dn) {
+    string key;
+    dn->key().encode(key);
+
+    if (dn->last != CEPH_NOSNAP &&
+	snaps && try_trim_snap_dentry(dn, *snaps)) {
+      dout(10) << " rm " << key << dendl;
+      to_remove.emplace_back(std::move(key));
+      return;
+    }
+
+    if (dn->get_linkage()->is_null()) {
+      dout(10) << " rm " << dn->get_name() << " " << *dn << dendl;
+      to_remove.emplace_back(std::move(key));
+    } else {
+      dout(10) << " set " << dn->get_name() << " " << *dn << dendl;
+
+      uint64_t off = dfts.length();  // where this item's dirfragtree bytes will start
+      // try to reserve more space if less than 1/8 of a page
+      // is left
+      uint64_t left = CEPH_PAGE_SIZE - off % CEPH_PAGE_SIZE;
+      if (left < CEPH_PAGE_SIZE / 8)
+        dfts.reserve(left + CEPH_PAGE_SIZE);
+
+      auto& item = to_set.emplace_back();
+      item.key = std::move(key);
+      _parse_dentry(dn, item, snaps, dfts);
+      item.dft_len = dfts.length() - off;  // bytes _parse_dentry appended for this item
+    }
+  };
+
+  int count = 0;
+  if (state_test(CDir::STATE_FRAGMENTING) && is_new()) {  // new frag while fragmenting: walk all items, not just the dirty ones
+    assert(committed_version == 0);
+    for (auto p = items.begin(); p != items.end(); ) {
+      CDentry *dn = p->second;
+      ++p;  // advance before write_one() is called on dn
+      if (dn->get_linkage()->is_null())
+	continue;
+      write_one(dn);
+
+      if (!(++count % mdcache->mds->heartbeat_reset_grace()))
+        mdcache->mds->heartbeat_reset();
+    }
+  } else {
+    for (auto p = dirty_dentries.begin(); !p.end(); ) {
+      CDentry *dn = *p;
+      ++p;  // advance before write_one() is called on dn
+      write_one(dn);
+
+      if (!(++count % mdcache->mds->heartbeat_reset_grace()))
+        mdcache->mds->heartbeat_reset();
+    }
+  }
+
+  auto c = new C_IO_Dir_Commit_Ops(this, op_prio, std::move(to_set), std::move(dfts),
+                                   std::move(to_remove), std::move(stale_items));
+  stale_items.clear();
+  mdcache->mds->finisher->queue(c);
+}
+
+// Fill commit item `item` from dentry `dn`.
+//
+// Remote links record ino and d_type.  Primary links record the inode and its
+// associated state, and append the encoded dirfragtree to `bl`.  `snaps`
+// (optional) is used to purge stale snap data from multiversion inodes that
+// have no snaprealm of their own.  Null linkages are not allowed.
+void CDir::_parse_dentry(CDentry *dn, dentry_commit_item &item,
+			 const set<snapid_t> *snaps, bufferlist &bl)
+{
+  // clear dentry NEW flag, if any.  we can no longer silently drop it.
+  dn->clear_new();
+
+  item.first = dn->first;
+
+  auto& linkage = dn->linkage;
+  item.alternate_name = dn->get_alternate_name();
+
+  // remote link
+  if (linkage.is_remote()) {
+    item.is_remote = true;
+    item.ino = linkage.get_remote_ino();
+    item.d_type = linkage.get_remote_d_type();
+    dout(14) << " dn '" << dn->get_name() << "' remote ino " << item.ino << dendl;
+    return;
+  }
+
+  if (!linkage.is_primary()) {
+    ceph_assert(!linkage.is_null());
+    return;
+  }
+
+  // primary link
+  CInode *in = linkage.get_inode();
+  ceph_assert(in);
+
+  dout(14) << " dn '" << dn->get_name() << "' inode " << *in << dendl;
+
+  if (in->is_multiversion()) {
+    if (in->snaprealm) {
+      in->purge_stale_snap_data(in->snaprealm->get_snaps());
+    } else if (snaps) {
+      in->purge_stale_snap_data(*snaps);
+    }
+  }
+
+  if (in->snaprealm) {
+    item.snaprealm = true;
+    item.srnode = in->snaprealm->srnode;
+  }
+  item.features = mdcache->mds->mdsmap->get_up_features();
+  item.inode = in->inode;
+  if (in->inode->is_symlink()) {
+    item.symlink = in->symlink;
+  }
+
+  // the dirfragtree is encoded now, into the shared buffer
+  using ceph::encode;
+  encode(in->dirfragtree, bl);
+
+  item.xattrs = in->xattrs;
+  item.old_inodes = in->old_inodes;
+  item.oldest_snap = in->oldest_snap;
+  item.damage_flags = in->damage_flags;
+}
+
+/**
+ * Start committing this dirfrag, unless an equal or newer version is already
+ * committed or being committed, or an older commit is still in flight.
+ *
+ * @param want    version the caller needs committed
+ * @param op_prio priority passed through to _omap_commit()
+ */
+void CDir::_commit(version_t want, int op_prio)
+{
+  dout(10) << "_commit want " << want << " on " << *this << dendl;
+
+  // we can't commit things in the future.
+  // (even the projected future.)
+  ceph_assert(want <= get_version() || get_version() == 0);
+
+  // check pre+postconditions.
+  ceph_assert(is_auth());
+
+  // nothing to do: that version is already on disk.
+  if (committed_version >= want) {
+    dout(10) << "already committed " << committed_version << " >= " << want << dendl;
+    return;
+  }
+
+  // nothing to do: a commit covering that version is in flight.
+  if (committing_version >= want) {
+    dout(10) << "already committing " << committing_version << " >= " << want << dendl;
+    ceph_assert(state_test(STATE_COMMITTING));
+    return;
+  }
+
+  // already committing an older version?  then wait for it to finish.
+  if (committing_version > committed_version) {
+    dout(10) << "already committing older " << committing_version << ", waiting for that to finish" << dendl;
+    return;
+  }
+
+  // commit the current version.
+  committing_version = get_version();
+
+  // mark committing (if not already)
+  if (!state_test(STATE_COMMITTING)) {
+    dout(10) << "marking committing" << dendl;
+    state_set(STATE_COMMITTING);
+  }
+
+  // account for the commit: perf counter (if any) and balancer popularity.
+  auto *mds = mdcache->mds;
+  if (mds->logger)
+    mds->logger->inc(l_mds_dir_commit);
+  mds->balancer->hit_dir(this, META_POP_STORE);
+
+  _omap_commit(op_prio);
+}
+
+
+/**
+ * _committed
+ *
+ * Completion handler for a dirfrag commit: records the new committed
+ * version, marks the dir and any dirty dentries/inodes it now covers as
+ * clean, then wakes the waiters in waiting_for_commit (or starts another
+ * commit if some of them want a newer version).
+ *
+ * @param r result of the commit; negative values are errors
+ * @param v version i just committed
+ */
+void CDir::_committed(int r, version_t v)
+{
+  if (r < 0) {
+    // the directory could be partly purged during MDS failover
+    if (r == -CEPHFS_ENOENT && committed_version == 0 &&
+	!inode->is_base() && get_parent_dir()->inode->is_stray()) {
+      // tolerated: treat as success, but flag the inode if it has a snaprealm.
+      r = 0;
+      if (inode->snaprealm)
+	inode->state_set(CInode::STATE_MISSINGOBJS);
+    }
+    if (r < 0) {
+      // any other error is logged and handed to the MDS write-error handler;
+      // none of the bookkeeping below runs.
+      dout(1) << "commit error " << r << " v " << v << dendl;
+      mdcache->mds->clog->error() << "failed to commit dir " << dirfrag() << " object,"
+				<< " errno " << r;
+      mdcache->mds->handle_write_error(r);
+      return;
+    }
+  }
+
+  dout(10) << "_committed v " << v << " on " << *this << dendl;
+  ceph_assert(is_auth());
+
+  // sampled once up front; used below to decide whether clean null dentries
+  // can be dropped right away.
+  bool stray = inode->is_stray();
+
+  // take note.
+  ceph_assert(v > committed_version);
+  ceph_assert(v <= committing_version);
+  committed_version = v;
+
+  // _all_ commits done?
+  if (committing_version == committed_version) 
+    state_clear(CDir::STATE_COMMITTING);
+
+  // _any_ commit, even if we've been redirtied, means we're no longer new.
+  item_new.remove_myself();
+  
+  // dir clean?
+  if (committed_version == get_version()) 
+    mark_clean();
+
+  // shared by both loops below to pace heartbeat resets.
+  int count = 0;
+
+  // dentries clean?
+  for (auto p = dirty_dentries.begin(); !p.end(); ) {
+    CDentry *dn = *p;
+    // advance before touching dn: the body may mark it clean or remove it.
+    ++p;
+    
+    // inode?
+    if (dn->linkage.is_primary()) {
+      CInode *in = dn->linkage.get_inode();
+      ceph_assert(in);
+      ceph_assert(in->is_auth());
+      
+      if (committed_version >= in->get_version()) {
+	if (in->is_dirty()) {
+	  dout(15) << " dir " << committed_version << " >= inode " << in->get_version() << " now clean " << *in << dendl;
+	  in->mark_clean();
+	}
+      } else {
+	dout(15) << " dir " << committed_version << " < inode " << in->get_version() << " still dirty " << *in << dendl;
+	ceph_assert(in->is_dirty() || in->last < CEPH_NOSNAP);  // special case for cow snap items (not predirtied)
+      }
+    }
+
+    // dentry
+    if (committed_version >= dn->get_version()) {
+      dout(15) << " dir " << committed_version << " >= dn " << dn->get_version() << " now clean " << *dn << dendl;
+      dn->mark_clean();
+
+      // drop clean null stray dentries immediately
+      if (stray &&
+	  dn->get_num_ref() == 0 &&
+	  !dn->is_projected() &&
+	  dn->get_linkage()->is_null())
+	remove_dentry(dn);
+    } else {
+      dout(15) << " dir " << committed_version << " < dn " << dn->get_version() << " still dirty " << *dn << dendl;
+      ceph_assert(dn->is_dirty());
+    }
+
+    // periodically reset the heartbeat while walking a long list.
+    if (!(++count % mdcache->mds->heartbeat_reset_grace()))
+      mdcache->mds->heartbeat_reset();
+  }
+
+  // finishers?
+  bool were_waiters = !waiting_for_commit.empty();
+
+  auto it = waiting_for_commit.begin();
+  while (it != waiting_for_commit.end()) {
+    // remember the successor: the current entry is erased below.
+    auto _it = it;
+    ++_it;
+    if (it->first > committed_version) {
+      // this waiter (and everything after it) wants a newer version:
+      // kick off another commit and leave them queued.
+      dout(10) << " there are waiters for " << it->first << ", committing again" << dendl;
+      _commit(it->first, -1);
+      break;
+    }
+    MDSContext::vec t;
+    for (const auto &waiter : it->second)
+      t.push_back(waiter);
+    mdcache->mds->queue_waiters(t);
+    waiting_for_commit.erase(it);
+    it = _it;
+
+    if (!(++count % mdcache->mds->heartbeat_reset_grace()))
+      mdcache->mds->heartbeat_reset();
+  }
+
+  // try drop dentries in this dirfrag if it's about to be purged
+  if (!inode->is_base() && get_parent_dir()->inode->is_stray() &&
+      inode->snaprealm)
+    mdcache->maybe_eval_stray(inode, true);
+
+  // unpin if we kicked the last waiter.
+  if (were_waiters &&
+      waiting_for_commit.empty())
+    auth_unpin(this);
+}
+
+
+
+
+// IMPORT/EXPORT
+
+/**
+ * Resolve the export pin for this dirfrag.
+ *
+ * Starts from the inode's export pin.  The two ephemeral pin markers are
+ * turned into a concrete value by hashing: the distributed marker hashes
+ * the ino together with this frag, the random marker hashes the ino alone.
+ * Any other value is returned unchanged.
+ *
+ * @param inherit forwarded to CInode::get_export_pin()
+ */
+mds_rank_t CDir::get_export_pin(bool inherit) const
+{
+  const mds_rank_t pin = inode->get_export_pin(inherit);
+
+  if (pin == MDS_RANK_EPHEMERAL_DIST)
+    return mdcache->hash_into_rank_bucket(ino(), get_frag());
+  if (pin == MDS_RANK_EPHEMERAL_RAND)
+    return mdcache->hash_into_rank_bucket(ino());
+
+  return pin;
+}
+
+/**
+ * Whether this dirfrag may be exported to rank dest.
+ *
+ * Allowed when the export pin equals dest or when the pin is negative;
+ * a non-negative pin naming a different rank forbids the export.
+ */
+bool CDir::is_exportable(mds_rank_t dest) const
+{
+  const mds_rank_t pin = get_export_pin();
+  return pin == dest || pin < 0;
+}
+
+/**
+ * Encode this dirfrag's state into bl for export (struct version 1, compat 1).
+ *
+ * The field order here must match the decode order in decode_import().
+ * The dirfrag must not be projected.  As a side effect a PIN_TEMPEXPORTING
+ * reference is taken; finish_export() releases it.
+ */
+void CDir::encode_export(bufferlist& bl)
+{
+  ENCODE_START(1, 1, bl);
+  ceph_assert(!is_projected());
+  encode(first, bl);
+  encode(*fnode, bl);
+  encode(dirty_old_rstat, bl);
+  encode(committed_version, bl);
+
+  encode(state, bl);
+  encode(dir_rep, bl);
+
+  // popularity counters
+  encode(pop_me, bl);
+  encode(pop_auth_subtree, bl);
+
+  // replication info
+  encode(dir_rep_by, bl);  
+  encode(get_replicas(), bl);
+
+  // released in finish_export()
+  get(PIN_TEMPEXPORTING);
+  ENCODE_FINISH(bl);
+}
+
+/**
+ * Finish exporting this dirfrag: keep only the state bits in
+ * MASK_STATE_EXPORT_KEPT, remove this subtree's popularity from the nested
+ * counters and zero the local ones, drop the PIN_TEMPEXPORTING reference
+ * taken by encode_export(), and discard dirty_old_rstat.
+ */
+void CDir::finish_export()
+{
+  state &= MASK_STATE_EXPORT_KEPT;
+  // subtract before zeroing: both nested counters are reduced by the
+  // current value of pop_auth_subtree.
+  pop_nested.sub(pop_auth_subtree);
+  pop_auth_subtree_nested.sub(pop_auth_subtree);
+  pop_me.zero();
+  pop_auth_subtree.zero();
+  put(PIN_TEMPEXPORTING);
+  dirty_old_rstat.clear();
+}
+
+/**
+ * Decode dirfrag state produced by encode_export() and install it on this
+ * (now authoritative) dirfrag.
+ *
+ * Fields are read in the same order encode_export() writes them.
+ *
+ * @param blp iterator positioned at the encoded dirfrag
+ * @param ls  log segment that dirty state is attached to (passed to
+ *            _mark_dirty() and used for the dirty_dirfrag_* lists)
+ */
+void CDir::decode_import(bufferlist::const_iterator& blp, LogSegment *ls)
+{
+  DECODE_START(1, blp);
+  decode(first, blp);
+  {
+    // decode into a fresh fnode and swap it in.
+    auto _fnode = allocate_fnode();
+    decode(*_fnode, blp);
+    reset_fnode(std::move(_fnode));
+  }
+  update_projected_version();
+
+  decode(dirty_old_rstat, blp);
+  decode(committed_version, blp);
+  // nothing is in flight on the importer: committing == committed.
+  committing_version = committed_version;
+
+  // keep only our own bits in MASK_STATE_IMPORT_KEPT, then add AUTH plus
+  // the exporter's bits that fall in MASK_STATE_EXPORTED.
+  unsigned s;
+  decode(s, blp);
+  state &= MASK_STATE_IMPORT_KEPT;
+  state_set(STATE_AUTH | (s & MASK_STATE_EXPORTED));
+
+  if (is_dirty()) {
+    get(PIN_DIRTY);
+    _mark_dirty(ls);
+  }
+
+  decode(dir_rep, blp);
+
+  // popularity: add the imported subtree's load to the nested counters.
+  decode(pop_me, blp);
+  decode(pop_auth_subtree, blp);
+  pop_nested.add(pop_auth_subtree);
+  pop_auth_subtree_nested.add(pop_auth_subtree);
+
+  decode(dir_rep_by, blp);
+  decode(get_replicas(), blp);
+  if (is_replicated()) get(PIN_REPLICATED);
+
+  replica_nonce = 0;  // no longer defined
+
+  // did we import some dirty scatterlock data?
+  if (dirty_old_rstat.size() ||
+      !(fnode->rstat == fnode->accounted_rstat)) {
+    mdcache->mds->locker->mark_updated_scatterlock(&inode->nestlock);
+    ls->dirty_dirfrag_nest.push_back(&inode->item_dirty_dirfrag_nest);
+  }
+  if (!(fnode->fragstat == fnode->accounted_fragstat)) {
+    mdcache->mds->locker->mark_updated_scatterlock(&inode->filelock);
+    ls->dirty_dirfrag_dir.push_back(&inode->item_dirty_dirfrag_dir);
+  }
+  if (is_dirty_dft()) {
+    if (inode->dirfragtreelock.get_state() != LOCK_MIX &&
+	inode->dirfragtreelock.is_stable()) {
+      // clear stale dirtydft
+      state_clear(STATE_DIRTYDFT);
+    } else {
+      mdcache->mds->locker->mark_updated_scatterlock(&inode->dirfragtreelock);
+      ls->dirty_dirfrag_dirfragtree.push_back(&inode->item_dirty_dirfrag_dirfragtree);
+    }
+  }
+  DECODE_FINISH(blp);
+}
+
+/**
+ * Undo an import on this dirfrag: drop the AUTH state, the bloom filter
+ * and the replica map, reset the replica nonce to EXPORT_NONCE, mark the
+ * dirfrag clean if it was dirty, and remove its popularity from the nested
+ * counters before zeroing the local ones.
+ *
+ * Must be called while the dirfrag is still auth.
+ */
+void CDir::abort_import()
+{
+  ceph_assert(is_auth());
+  state_clear(CDir::STATE_AUTH);
+  remove_bloom();
+  clear_replica_map();
+  set_replica_nonce(CDir::EXPORT_NONCE);
+  if (is_dirty())
+    mark_clean();
+
+  // subtract before zeroing, so the nested counters lose the current value.
+  pop_nested.sub(pop_auth_subtree);
+  pop_auth_subtree_nested.sub(pop_auth_subtree);
+  pop_me.zero();
+  pop_auth_subtree.zero();
+}
+
+/**
+ * Encode a DirStat (frag, auth, dist) into bl.
+ *
+ * Sessions that have CEPHFS_FEATURE_REPLY_ENCODING get the fields wrapped
+ * in a versioned envelope (version 1, compat 1); all other sessions get the
+ * bare fields.
+ */
+void CDir::encode_dirstat(bufferlist& bl, const session_info_t& info, const DirStat& ds) {
+  // the payload is identical in both formats; only the envelope differs.
+  auto encode_fields = [&bl, &ds] {
+    encode(ds.frag, bl);
+    encode(ds.auth, bl);
+    encode(ds.dist, bl);
+  };
+
+  if (!info.has_feature(CEPHFS_FEATURE_REPLY_ENCODING)) {
+    encode_fields();
+    return;
+  }
+
+  ENCODE_START(1, 1, bl);
+  encode_fields();
+  ENCODE_FINISH(bl);
+}
+
+/********************************
+ * AUTHORITY
+ */
+
+/*
+ * a subtree root answers with its own dir_auth; every other dirfrag
+ * defers to the authority of its inode.
+ */
+mds_authority_t CDir::authority() const
+{
+  return is_subtree_root() ? dir_auth : inode->authority();
+}
+
+/** is_subtree_root()
+ * true if this is an auth delegation point.  
+ * that is, dir_auth != default (parent,unknown)
+ *
+ * some key observations:
+ *  if i am auth:
+ *    - any region bound will be an export, or frozen.
+ *
+ * note that this DOES heed dir_auth.pending
+ */
+/*
+bool CDir::is_subtree_root()
+{
+  if (dir_auth == CDIR_AUTH_DEFAULT) {
+    //dout(10) << "is_subtree_root false " << dir_auth << " != " << CDIR_AUTH_DEFAULT
+    //<< " on " << ino() << dendl;
+    return false;
+  } else {
+    //dout(10) << "is_subtree_root true " << dir_auth << " != " << CDIR_AUTH_DEFAULT
+    //<< " on " << ino() << dendl;
+    return true;
+  }
+}
+*/
+
+/** contains(x)
+ * true if we are x, or an ancestor of x
+ *
+ * ancestry is followed from each dirfrag to its inode's projected parent
+ * dir until either this dirfrag or the top (no parent) is reached.
+ * a null x is contained by nothing, so the result is false rather than a
+ * null dereference.
+ */
+bool CDir::contains(CDir *x)
+{
+  for (CDir *cur = x;
+       cur != nullptr;
+       cur = cur->get_inode()->get_projected_parent_dir()) {
+    if (cur == this)
+      return true;
+  }
+  return false;
+}
+
+/**
+ * Whether this dirfrag can be replicated (further).
+ *
+ * Always true when is_rep() is false.  Otherwise true only while the
+ * number of replicas is smaller than the number of active MDSs minus one.
+ */
+bool CDir::can_rep() const
+{
+  if (!is_rep())
+    return true;
+
+  const unsigned mds_num = mdcache->mds->get_mds_map()->get_num_mds(MDSMap::STATE_ACTIVE);
+
+  // compare as "mds_num > replicas + 1" instead of "mds_num - 1 > replicas":
+  // with zero active MDSs the unsigned subtraction would wrap around to a
+  // huge value and wrongly report that more replicas are possible.
+  return mds_num > get_replicas().size() + 1;
+}
+
+
+/** set_dir_auth
+ *
+ * Replace dir_auth with a, and fix up everything that depends on whether
+ * this dirfrag is a subtree root:
+ *  - the inode's num_subtree_roots counter,
+ *  - the auth pin that a frozen dir holds on its (auth) inode while it is
+ *    not a subtree root,
+ *  - WAIT_SINGLEAUTH waiters, which are woken when the authority goes from
+ *    ambiguous (second >= 0) to second == CDIR_AUTH_UNKNOWN.
+ */
+void CDir::set_dir_auth(const mds_authority_t &a)
+{ 
+  dout(10) << "setting dir_auth=" << a
+	   << " from " << dir_auth
+	   << " on " << *this << dendl;
+  
+  // snapshot the old status before dir_auth is overwritten.
+  bool was_subtree = is_subtree_root();
+  bool was_ambiguous = dir_auth.second >= 0;
+
+  // set it.
+  dir_auth = a;
+
+  // new subtree root?
+  if (!was_subtree && is_subtree_root()) {
+    dout(10) << " new subtree root, adjusting auth_pins" << dendl;
+
+    if (freeze_tree_state) {
+      // only by CDir::_freeze_tree()
+      ceph_assert(is_freezing_tree_root());
+    }
+
+    inode->num_subtree_roots++;   
+    
+    // unpin parent of frozen dir/tree?
+    if (inode->is_auth()) {
+      ceph_assert(!is_frozen_tree_root());
+      if (is_frozen_dir())
+	inode->auth_unpin(this);
+    }
+  } 
+  if (was_subtree && !is_subtree_root()) {
+    dout(10) << " old subtree root, adjusting auth_pins" << dendl;
+
+    inode->num_subtree_roots--;
+
+    // pin parent of frozen dir/tree?
+    if (inode->is_auth()) {
+      ceph_assert(!is_frozen_tree_root());
+      if (is_frozen_dir())
+	inode->auth_pin(this);
+    }
+  }
+
+  // newly single auth?
+  if (was_ambiguous && dir_auth.second == CDIR_AUTH_UNKNOWN) {
+    MDSContext::vec ls;
+    take_waiting(WAIT_SINGLEAUTH, ls);
+    mdcache->mds->queue_waiters(ls);
+  }
+}
+
+/*****************************************
+ * AUTH PINS and FREEZING
+ *
+ * the basic plan is that auth_pins only exist in auth regions, and they
+ * prevent a freeze (and subsequent auth change).  
+ *
+ * however, we also need to prevent a parent from freezing if a child is frozen.
+ * for that reason, the parent inode of a frozen directory is auth_pinned.
+ *
+ * the oddity is when the frozen directory is a subtree root.  if that's the case,
+ * the parent inode isn't frozen.  which means that when subtree authority is adjusted
+ * at the bounds, inodes for any frozen bound directories need to get auth_pins at that
+ * time.
+ *
+ */
+
+/**
+ * Take an auth pin on this dirfrag.
+ *
+ * The first pin also takes a PIN_AUTHPIN reference.  If the dirfrag belongs
+ * to a freezing/frozen tree (freeze_tree_state is set), the tree-wide pin
+ * counter is incremented as well.
+ *
+ * @param by identity of the pinner; logged, and recorded in auth_pin_set
+ *           only when MDS_AUTHPIN_SET is defined
+ */
+void CDir::auth_pin(void *by) 
+{
+  if (auth_pins == 0)
+    get(PIN_AUTHPIN);
+  auth_pins++;
+
+#ifdef MDS_AUTHPIN_SET
+  auth_pin_set.insert(by);
+#endif
+
+  dout(10) << "auth_pin by " << by << " on " << *this << " count now " << auth_pins << dendl;
+
+  if (freeze_tree_state)
+    freeze_tree_state->auth_pins += 1;
+}
+
+/**
+ * Release an auth pin taken with auth_pin().
+ *
+ * Dropping the last pin releases the PIN_AUTHPIN reference.  The tree-wide
+ * counter in freeze_tree_state (if any) is decremented, and a pending
+ * freeze is given the chance to complete.
+ *
+ * @param by identity of the pinner; logged, and checked against
+ *           auth_pin_set only when MDS_AUTHPIN_SET is defined
+ */
+void CDir::auth_unpin(void *by) 
+{
+  auth_pins--;
+
+#ifdef MDS_AUTHPIN_SET
+  {
+    // the pinner must have been recorded by auth_pin().
+    auto it = auth_pin_set.find(by);
+    ceph_assert(it != auth_pin_set.end());
+    auth_pin_set.erase(it);
+  }
+#endif
+  if (auth_pins == 0)
+    put(PIN_AUTHPIN);
+
+  dout(10) << "auth_unpin by " << by << " on " << *this << " count now " << auth_pins << dendl;
+  // catches unbalanced unpins (checked after the decrement above).
+  ceph_assert(auth_pins >= 0);
+
+  if (freeze_tree_state)
+    freeze_tree_state->auth_pins -= 1;
+
+  maybe_finish_freeze();  // pending freeze?
+}
+
+/**
+ * Adjust dir_auth_pins by dirinc, and mirror the change into the
+ * tree-wide counter in freeze_tree_state when one is set.
+ *
+ * @param dirinc non-zero delta (asserted); a negative delta may let a
+ *               pending freeze complete
+ * @param by     identity of the caller, used for logging only
+ */
+void CDir::adjust_nested_auth_pins(int dirinc, void *by)
+{
+  ceph_assert(dirinc);
+  dir_auth_pins += dirinc;
+  
+  dout(15) << __func__ << " " << dirinc << " on " << *this
+	   << " by " << by << " count now "
+	   << auth_pins << "/" << dir_auth_pins << dendl;
+  ceph_assert(dir_auth_pins >= 0);
+
+  if (freeze_tree_state)
+    freeze_tree_state->auth_pins += dirinc;
+
+  // only a decrease can unblock a freeze.
+  if (dirinc < 0)
+    maybe_finish_freeze();  // pending freeze?
+}
+
+#ifdef MDS_VERIFY_FRAGSTAT
+/**
+ * Debug check (compiled only with MDS_VERIFY_FRAGSTAT): recount the
+ * subdirectories and files linked in this dirfrag and abort if the counts
+ * disagree with fnode->fragstat.
+ *
+ * Requires a complete dirfrag; stray directories are not checked.
+ * NOTE(review): this is behind an #ifdef and uses dn->inode / dn->is_null()
+ * directly -- confirm it still compiles against the current CDentry API.
+ */
+void CDir::verify_fragstat()
+{
+  ceph_assert(is_complete());
+  if (inode->is_stray())
+    return;
+
+  // c accumulates the counts we observe.
+  frag_info_t c;
+  memset(&c, 0, sizeof(c));
+
+  for (auto it = items.begin();
+       it != items.end();
+       ++it) {
+    CDentry *dn = it->second;
+    if (dn->is_null())
+      continue;
+    
+    dout(10) << " " << *dn << dendl;
+    if (dn->is_primary())
+      dout(10) << "     " << *dn->inode << dendl;
+
+    // primary links are classified by the inode itself ...
+    if (dn->is_primary()) {
+      if (dn->inode->is_dir())
+	c.nsubdirs++;
+      else
+	c.nfiles++;
+    }
+    // ... remote links by the d_type stored in the dentry.
+    if (dn->is_remote()) {
+      if (dn->get_remote_d_type() == DT_DIR)
+	c.nsubdirs++;
+      else
+	c.nfiles++;
+    }
+  }
+
+  if (c.nsubdirs != fnode->fragstat.nsubdirs ||
+      c.nfiles != fnode->fragstat.nfiles) {
+    dout(0) << "verify_fragstat failed " << fnode->fragstat << " on " << *this << dendl;
+    dout(0) << "               i count " << c << dendl;
+    ceph_abort();
+  } else {
+    dout(0) << "verify_fragstat ok " << fnode->fragstat << " on " << *this << dendl;
+  }
+}
+#endif
+
+/*****************************************************************************
+ * FREEZING
+ */
+
+// FREEZE TREE
+
+/**
+ * Breadth-first walk over the dirfrags nested beneath this one.
+ *
+ * For every directory inode linked by a primary dentry, callback is invoked
+ * on each dirfrag returned by get_nested_dirfrags().  The walk descends into
+ * a dirfrag only if the callback returned true for it.  The callback is
+ * never invoked on this dirfrag itself.
+ */
+void CDir::_walk_tree(std::function<bool(CDir*)> callback)
+{
+  deque<CDir*> pending;
+  pending.push_back(this);
+
+  while (!pending.empty()) {
+    CDir *cur = pending.front();
+    pending.pop_front();
+
+    for (auto& entry : *cur) {
+      CDentry *dn = entry.second;
+      auto *dnl = dn->get_linkage();
+      if (!dnl->is_primary())
+	continue;
+
+      CInode *in = dnl->get_inode();
+      if (!in->is_dir())
+	continue;
+
+      auto&& children = in->get_nested_dirfrags();
+      for (auto& child : children) {
+	// the callback decides whether the walk goes below this child.
+	if (callback(child))
+	  pending.push_back(child);
+      }
+    }
+  }
+}
+
+/**
+ * Begin freezing the subtree rooted at this dirfrag.
+ *
+ * @return true if the tree could be frozen immediately; false if it is now
+ *         in STATE_FREEZINGTREE and will freeze once outstanding auth pins
+ *         are released (see maybe_finish_freeze()).
+ */
+bool CDir::freeze_tree()
+{
+  ceph_assert(!is_frozen());
+  ceph_assert(!is_freezing());
+  ceph_assert(!freeze_tree_state);
+
+  // our own pin; released right away on immediate success, otherwise kept
+  // while the tree is freezing.
+  auth_pin(this);
+
+  // Traverse the subtree to mark dirfrags as 'freezing' (set freeze_tree_state)
+  // and to accumulate auth pins and record total count in freeze_tree_state.
+  // when auth unpin a 'freezing' object, the counter in freeze_tree_state also
+  // gets decreased. Subtree becomes 'frozen' when the counter reaches zero.
+  freeze_tree_state = std::make_shared<freeze_tree_state_t>(this);
+  freeze_tree_state->auth_pins += get_auth_pins() + get_dir_auth_pins();
+  if (!lock_caches_with_auth_pins.empty())
+    mdcache->mds->locker->invalidate_lock_caches(this);
+
+  _walk_tree([this](CDir *dir) {
+      // dirfrags that already have a freeze state are left alone and
+      // not descended into.
+      if (dir->freeze_tree_state)
+	return false;
+      dir->freeze_tree_state = freeze_tree_state;
+      freeze_tree_state->auth_pins += dir->get_auth_pins() + dir->get_dir_auth_pins();
+      if (!dir->lock_caches_with_auth_pins.empty())
+	mdcache->mds->locker->invalidate_lock_caches(dir);
+      return true;
+    }
+  );
+
+  if (is_freezeable(true)) {
+    _freeze_tree();
+    auth_unpin(this);
+    return true;
+  } else {
+    state_set(STATE_FREEZINGTREE);
+    ++num_freezing_trees;
+    dout(10) << "freeze_tree waiting " << *this << dendl;
+    return false;
+  }
+}
+
+/**
+ * Complete the freeze of the subtree rooted at this dirfrag.
+ *
+ * On the auth side the freeze state was created by freeze_tree(); here the
+ * subtree's authority is made ambiguous (second = first) and nested
+ * dirfrags outside this freeze are split off via adjust_subtree_auth().
+ * On a non-auth dirfrag a fresh freeze state is created and propagated to
+ * all nested dirfrags.  Finally the state flips from FREEZINGTREE (if set)
+ * to FROZENTREE and a PIN_FROZEN reference is taken.
+ */
+void CDir::_freeze_tree()
+{
+  dout(10) << __func__ << " " << *this << dendl;
+  ceph_assert(is_freezeable(true));
+
+  if (freeze_tree_state) {
+    ceph_assert(is_auth());
+  } else {
+    ceph_assert(!is_auth());
+    freeze_tree_state = std::make_shared<freeze_tree_state_t>(this);
+  }
+  freeze_tree_state->frozen = true;
+
+  if (is_auth()) {
+    mds_authority_t auth;
+    bool was_subtree = is_subtree_root();
+    if (was_subtree) {
+      auth = get_dir_auth();
+    } else {
+      // temporarily prevent parent subtree from becoming frozen.
+      inode->auth_pin(this);
+      // create new subtree
+      auth = authority();
+    }
+
+    _walk_tree([this, &auth] (CDir *dir) {
+	// a nested dirfrag that is not part of this freeze gets the
+	// (pre-freeze) authority applied and is not descended into.
+	if (dir->freeze_tree_state != freeze_tree_state) {
+	  mdcache->adjust_subtree_auth(dir, auth);
+	  return false;
+	}
+	return true;
+      }
+    );
+
+    // the authority must be unambiguous here; make it ambiguous
+    // (second = first) for the duration of the freeze.
+    ceph_assert(auth.first >= 0);
+    ceph_assert(auth.second == CDIR_AUTH_UNKNOWN);
+    auth.second = auth.first;
+    mdcache->adjust_subtree_auth(this, auth);
+    if (!was_subtree)
+      inode->auth_unpin(this);
+  } else {
+    // importing subtree ?
+    _walk_tree([this] (CDir *dir) {
+	ceph_assert(!dir->freeze_tree_state);
+	dir->freeze_tree_state = freeze_tree_state;
+	return true;
+      }
+    );
+  }
+
+  // twiddle state
+  if (state_test(STATE_FREEZINGTREE)) {
+    state_clear(STATE_FREEZINGTREE);
+    --num_freezing_trees;
+  }
+
+  state_set(STATE_FROZENTREE);
+  ++num_frozen_trees;
+  get(PIN_FROZEN);
+}
+
+/**
+ * Unfreeze the subtree rooted at this dirfrag, or cancel a freeze that is
+ * still in progress.
+ *
+ * WAIT_UNFREEZE waiters of this dirfrag and of every nested dirfrag that
+ * shares its freeze_tree_state are collected first and queued at the very
+ * end, after all state has been reset.
+ */
+void CDir::unfreeze_tree()
+{
+  dout(10) << __func__ << " " << *this << dendl;
+
+  MDSContext::vec unfreeze_waiters;
+  take_waiting(WAIT_UNFREEZE, unfreeze_waiters);
+
+  if (freeze_tree_state) {
+    _walk_tree([this, &unfreeze_waiters](CDir *dir) {
+	// only dirfrags that belong to this freeze are detached.
+	if (dir->freeze_tree_state != freeze_tree_state)
+	  return false;
+	dir->freeze_tree_state.reset();
+	dir->take_waiting(WAIT_UNFREEZE, unfreeze_waiters);
+	return true;
+      }
+    );
+  }
+
+  if (state_test(STATE_FROZENTREE)) {
+    // frozen.  unfreeze.
+    state_clear(STATE_FROZENTREE);
+    --num_frozen_trees;
+
+    put(PIN_FROZEN);
+
+    if (is_auth()) {
+      // must be subtree
+      ceph_assert(is_subtree_root());
+      // for debug purpose, caller should ensure 'dir_auth.second == dir_auth.first'
+      mds_authority_t auth = get_dir_auth();
+      ceph_assert(auth.first >= 0);
+      ceph_assert(auth.second == auth.first);
+      // back to an unambiguous authority.
+      auth.second = CDIR_AUTH_UNKNOWN;
+      mdcache->adjust_subtree_auth(this, auth);
+    }
+    freeze_tree_state.reset();
+  } else {
+    ceph_assert(state_test(STATE_FREEZINGTREE));
+
+    // freezing.  stop it.
+    state_clear(STATE_FREEZINGTREE);
+    --num_freezing_trees;
+    freeze_tree_state.reset();
+
+    // fail anyone waiting for the freeze, then drop the pin that
+    // freeze_tree() took.
+    finish_waiting(WAIT_FROZEN, -1);
+    auth_unpin(this);
+  }
+
+  mdcache->mds->queue_waiters(unfreeze_waiters);
+}
+
+/**
+ * After a rename, detach dir (and the dirfrags nested beneath it) from this
+ * dirfrag's in-progress freeze if dir's inode now lives under a parent that
+ * is neither this dirfrag nor part of the same freeze.
+ *
+ * Nothing happens unless dir currently shares this dirfrag's
+ * freeze_tree_state.  The auth pins of every detached dirfrag are
+ * subtracted from the shared counter, and their WAIT_UNFREEZE waiters are
+ * queued.
+ */
+void CDir::adjust_freeze_after_rename(CDir *dir)
+{
+  if (!freeze_tree_state || dir->freeze_tree_state != freeze_tree_state)
+    return;
+  CDir *newdir = dir->get_inode()->get_parent_dir();
+  if (newdir == this || newdir->freeze_tree_state == freeze_tree_state)
+    return;
+
+  // only a tree that is still freezing (not yet frozen) can be adjusted.
+  ceph_assert(!freeze_tree_state->frozen);
+  ceph_assert(get_dir_auth_pins() > 0);
+
+  MDSContext::vec unfreeze_waiters;
+
+  // returns true for dirfrags that were part of this freeze, so that
+  // _walk_tree() keeps descending below them.
+  auto unfreeze = [this, &unfreeze_waiters](CDir *dir) {
+    if (dir->freeze_tree_state != freeze_tree_state)
+      return false;
+    int dec = dir->get_auth_pins() + dir->get_dir_auth_pins();
+    // shouldn't become zero because srcdn of rename was auth pinned 
+    ceph_assert(freeze_tree_state->auth_pins > dec);
+    freeze_tree_state->auth_pins -= dec;
+    dir->freeze_tree_state.reset();
+    dir->take_waiting(WAIT_UNFREEZE, unfreeze_waiters);
+    return true;
+  };
+
+  // _walk_tree() never visits its starting dirfrag, so handle dir itself first.
+  unfreeze(dir);
+  dir->_walk_tree(unfreeze);
+
+  mdcache->mds->queue_waiters(unfreeze_waiters);
+}
+
+/**
+ * Whether an auth pin may be taken on this dirfrag right now.
+ *
+ * @param err_ret optional; on failure receives ERR_NOT_AUTH,
+ *                ERR_FRAGMENTING_DIR or ERR_EXPORTING_TREE.  Left untouched
+ *                on success.
+ * @return true if the dirfrag can be auth pinned
+ */
+bool CDir::can_auth_pin(int *err_ret) const
+{
+  // checks run in this order; the first one that fails decides the code.
+  const int err = [this]() -> int {
+    if (!is_auth())
+      return ERR_NOT_AUTH;
+    if (is_freezing_dir() || is_frozen_dir())
+      return ERR_FRAGMENTING_DIR;
+    const auto tree = is_freezing_or_frozen_tree();
+    if (tree.first || tree.second)
+      return ERR_EXPORTING_TREE;
+    return 0;
+  }();
+
+  if (err == 0)
+    return true;
+
+  if (err_ret)
+    *err_ret = err;
+  return false;
+}
+
+// Context that, when finished, releases the auth pin held on a dirfrag
+// under the identity of its inode.  The result code is ignored.
+class C_Dir_AuthUnpin : public CDirContext {
+public:
+  explicit C_Dir_AuthUnpin(CDir *d)
+    : CDirContext(d)
+  {}
+
+  void finish(int /*r*/) override
+  {
+    dir->auth_unpin(dir->get_inode());
+  }
+};
+
+/**
+ * Complete a pending dir and/or tree freeze if the remaining auth pins
+ * allow it.  Called whenever pins are released.
+ *
+ * Nothing happens while dir_auth_pins is non-zero.  A freezing dir
+ * completes once the only auth pin left is the one taken by freeze_dir().
+ * A freezing tree completes once the tree-wide counter is down to one; if
+ * this dirfrag is not the root of the freeze the decision is delegated to
+ * the root.
+ */
+void CDir::maybe_finish_freeze()
+{
+  if (dir_auth_pins != 0)
+    return;
+
+  // we can freeze the _dir_ even with nested pins...
+  if (state_test(STATE_FREEZINGDIR)) {
+    // the single remaining pin is the freezer's own.
+    if (auth_pins == 1) {
+      _freeze_dir();
+      auth_unpin(this);
+      finish_waiting(WAIT_FROZEN);
+    }
+  }
+
+  if (freeze_tree_state) {
+    // already frozen, or pins other than the freezer's own remain.
+    if (freeze_tree_state->frozen ||
+	freeze_tree_state->auth_pins != 1)
+      return;
+
+    // only the root of the freeze can finish it.
+    if (freeze_tree_state->dir != this) {
+      freeze_tree_state->dir->maybe_finish_freeze();
+      return;
+    }
+
+    ceph_assert(state_test(STATE_FREEZINGTREE));
+
+    if (!is_subtree_root() && inode->is_frozen()) {
+      // NOTE(review): this logs the inode pointer, not *inode -- confirm
+      // whether that is intended.
+      dout(10) << __func__ << " !subtree root and frozen inode, waiting for unfreeze on " << inode << dendl;
+      // retake an auth_pin...
+      auth_pin(inode);
+      // and release it when the parent inode unfreezes
+      inode->add_waiter(WAIT_UNFREEZE, new C_Dir_AuthUnpin(this));
+      return;
+    }
+
+    _freeze_tree();
+    auth_unpin(this);
+    finish_waiting(WAIT_FROZEN);
+  }
+}
+
+
+
+// FREEZE DIR
+
+/**
+ * Begin freezing this single dirfrag.
+ *
+ * @return true if the dirfrag was frozen immediately; false if it is now
+ *         in STATE_FREEZINGDIR and the freeze completes later.
+ */
+bool CDir::freeze_dir()
+{
+  ceph_assert(!is_frozen());
+  ceph_assert(!is_freezing());
+
+  // our own pin; dropped again below if the freeze completes right away.
+  auth_pin(this);
+
+  if (!is_freezeable_dir(true)) {
+    // not yet: remember that we are freezing and keep the pin.
+    state_set(STATE_FREEZINGDIR);
+    if (!lock_caches_with_auth_pins.empty())
+      mdcache->mds->locker->invalidate_lock_caches(this);
+    dout(10) << "freeze_dir + wait " << *this << dendl;
+    return false;
+  }
+
+  _freeze_dir();
+  auth_unpin(this);
+  return true;
+}
+
+/**
+ * Flip this dirfrag from FREEZINGDIR to FROZENDIR and take a PIN_FROZEN
+ * reference.  If we are auth and not a subtree root, the inode is auth
+ * pinned for as long as the dirfrag stays frozen (released in
+ * unfreeze_dir()).
+ */
+void CDir::_freeze_dir()
+{
+  dout(10) << __func__ << " " << *this << dendl;
+  //assert(is_freezeable_dir(true));
+  // not always true during split because the original fragment may have frozen a while
+  // ago and we're just now getting around to breaking it up.
+
+  state_clear(STATE_FREEZINGDIR);
+  state_set(STATE_FROZENDIR);
+  get(PIN_FROZEN);
+
+  if (is_auth() && !is_subtree_root())
+    inode->auth_pin(this);  // auth_pin for duration of freeze
+}
+
+
+/**
+ * Unfreeze this dirfrag, or cancel a dir freeze that is still in progress.
+ * WAIT_UNFREEZE waiters are finished in both cases.
+ */
+void CDir::unfreeze_dir()
+{
+  dout(10) << __func__ << " " << *this << dendl;
+
+  const bool was_frozen = state_test(STATE_FROZENDIR);
+
+  if (was_frozen) {
+    state_clear(STATE_FROZENDIR);
+    put(PIN_FROZEN);
+
+    // unpin  (may => FREEZEABLE)   FIXME: is this order good?
+    if (is_auth() && !is_subtree_root())
+      inode->auth_unpin(this);
+  } else {
+    // fail anyone waiting for the freeze to complete.
+    finish_waiting(WAIT_FROZEN, -1);
+
+    // still freezing. stop.
+    ceph_assert(state_test(STATE_FREEZINGDIR));
+    state_clear(STATE_FREEZINGDIR);
+    auth_unpin(this);
+  }
+
+  finish_waiting(WAIT_UNFREEZE);
+}
+
+/**
+ * Drop one level of frozen-inode suppression.  When the count reaches
+ * zero, every inode on the freezing_inodes list gets a chance to finish
+ * its freeze.
+ */
+void CDir::enable_frozen_inode()
+{
+  ceph_assert(frozen_inode_suppressed > 0);
+
+  --frozen_inode_suppressed;
+  if (frozen_inode_suppressed != 0)
+    return;
+
+  auto it = freezing_inodes.begin();
+  while (!it.end()) {
+    CInode *in = *it;
+    // step past `in` before the call (presumably because finishing the
+    // freeze can take it off this list -- TODO confirm).
+    ++it;
+    ceph_assert(in->is_freezing_inode());
+    in->maybe_finish_freeze_inode();
+  }
+}
+
+/**
+ * Dump selected aspects of this dirfrag to a Formatter.
+ *
+ * Meant for identifying a directory and its state, so it is slightly less
+ * complete than operator<<, which is for debug output.
+ *
+ * @param f     formatter to write to; must not be null
+ * @param flags bitmask of DUMP_* values selecting which sections to emit
+ */
+void CDir::dump(Formatter *f, int flags) const
+{
+  ceph_assert(f != NULL);
+
+  if (flags & DUMP_PATH)
+    f->dump_stream("path") << get_path();
+
+  if (flags & DUMP_DIRFRAG)
+    f->dump_stream("dirfrag") << dirfrag();
+
+  if (flags & DUMP_SNAPID_FIRST)
+    f->dump_int("snapid_first", first);
+
+  if (flags & DUMP_VERSIONS) {
+    f->dump_stream("projected_version") << get_projected_version();
+    f->dump_stream("version") << get_version();
+    f->dump_stream("committing_version") << get_committing_version();
+    f->dump_stream("committed_version") << get_committed_version();
+  }
+
+  if (flags & DUMP_REP)
+    f->dump_bool("is_rep", is_rep());
+
+  if (flags & DUMP_DIR_AUTH) {
+    const auto& auth = get_dir_auth();
+    if (auth == CDIR_AUTH_DEFAULT) {
+      // default authority is shown as an empty string.
+      f->dump_string("dir_auth", "");
+    } else if (auth.second == CDIR_AUTH_UNKNOWN) {
+      // unambiguous: only the first rank is shown.
+      f->dump_stream("dir_auth") << auth.first;
+    } else {
+      f->dump_stream("dir_auth") << auth;
+    }
+  }
+
+  if (flags & DUMP_STATES) {
+    f->open_array_section("states");
+    MDSCacheObject::dump_states(f);
+
+    // emit one "state" entry per CDir-specific bit that is set.
+    auto dump_state = [this, f](unsigned bit, const char *name) {
+      if (state_test(bit))
+	f->dump_string("state", name);
+    };
+    dump_state(CDir::STATE_COMPLETE, "complete");
+    dump_state(CDir::STATE_FREEZINGTREE, "freezingtree");
+    dump_state(CDir::STATE_FROZENTREE, "frozentree");
+    dump_state(CDir::STATE_FROZENDIR, "frozendir");
+    dump_state(CDir::STATE_FREEZINGDIR, "freezingdir");
+    dump_state(CDir::STATE_EXPORTBOUND, "exportbound");
+    dump_state(CDir::STATE_IMPORTBOUND, "importbound");
+    dump_state(CDir::STATE_BADFRAG, "badfrag");
+
+    f->close_section();
+  }
+
+  if (flags & DUMP_MDS_CACHE_OBJECT)
+    MDSCacheObject::dump(f);
+
+  if (flags & DUMP_ITEMS) {
+    f->open_array_section("dentries");
+    for (const auto &entry : items) {
+      CDentry *dn = entry.second;
+      f->open_object_section("dentry");
+      dn->dump(f);
+      f->close_section();
+    }
+    f->close_section();
+  }
+}
+
+/**
+ * Dump this dirfrag's path, dirfrag id and its four popularity counters,
+ * each counter in an object section of its own.
+ */
+void CDir::dump_load(Formatter *f)
+{
+  f->dump_stream("path") << get_path();
+  f->dump_stream("dirfrag") << dirfrag();
+
+  // every counter is wrapped the same way: open, dump, close.
+  auto dump_section = [f](const char *name, auto &load) {
+    f->open_object_section(name);
+    load.dump(f);
+    f->close_section();
+  };
+
+  dump_section("pop_me", pop_me);
+  dump_section("pop_nested", pop_nested);
+  dump_section("pop_auth_subtree", pop_auth_subtree);
+  dump_section("pop_auth_subtree_nested", pop_auth_subtree_nested);
+}
+
+/****** Scrub Stuff *******/
+
+/**
+ * Create scrub_infop, seeding the last recursive/local scrub version and
+ * time from the projected fnode.  Must only be called while scrub_infop is
+ * unset.
+ */
+void CDir::scrub_info_create() const
+{
+  ceph_assert(!scrub_infop);
+
+  // logically const: this only sets up implicit initial state, so cast
+  // constness away to be able to store it.
+  CDir *self = const_cast<CDir*>(this);
+  const auto& pf = self->get_projected_fnode();
+
+  auto info = std::make_unique<scrub_info_t>();
+
+  info->last_recursive.version = pf->recursive_scrub_version;
+  info->last_recursive.time = pf->recursive_scrub_stamp;
+
+  info->last_local.version = pf->localized_scrub_version;
+  info->last_local.time = pf->localized_scrub_stamp;
+
+  self->scrub_infop = std::move(info);
+}
+
+/**
+ * Mark this dirfrag as being scrubbed under the given scrub header and
+ * bump the header's pending count.
+ *
+ * @param header scrub header; must be non-null
+ */
+void CDir::scrub_initialize(const ScrubHeaderRef& header)
+{
+  ceph_assert(header);
+  // FIXME: weird implicit construction, is someone else meant
+  // to be calling scrub_info_create first?
+  scrub_info();
+  scrub_infop->directory_scrubbing = true;
+  scrub_infop->header = header;
+  // balanced by dec_num_pending() in scrub_finished()/scrub_aborted().
+  header->inc_num_pending();
+}
+
+/**
+ * Abandon an in-progress scrub of this dirfrag: drop it from the header's
+ * pending count and discard the scrub info altogether.
+ */
+void CDir::scrub_aborted() {
+  dout(20) << __func__ << dendl;
+  ceph_assert(scrub_is_in_progress());
+
+  scrub_infop->last_scrub_dirty = false;
+  scrub_infop->directory_scrubbing = false;
+  scrub_infop->header->dec_num_pending();
+  scrub_infop.reset();
+}
+
+/**
+ * Record the successful end of a scrub of this dirfrag: stamp the local
+ * scrub with the current time and version (and the recursive one too if
+ * the header says the scrub is recursive), flag the stamps as dirty, and
+ * drop the dirfrag from the header's pending count.
+ */
+void CDir::scrub_finished()
+{
+  dout(20) << __func__ << dendl;
+  ceph_assert(scrub_is_in_progress());
+
+  auto& info = *scrub_infop;
+
+  info.last_local.time = ceph_clock_now();
+  info.last_local.version = get_version();
+  if (info.header->get_recursive())
+    info.last_recursive = info.last_local;
+
+  // the new stamps still need to be persisted.
+  info.last_scrub_dirty = true;
+
+  info.directory_scrubbing = false;
+  info.header->dec_num_pending();
+}
+
+/**
+ * Free the scrub info if it exists and is neither in use by a running
+ * scrub nor holding dirty scrub stamps.
+ */
+void CDir::scrub_maybe_delete_info()
+{
+  if (!scrub_infop)
+    return;
+
+  const bool still_needed =
+    scrub_infop->directory_scrubbing || scrub_infop->last_scrub_dirty;
+  if (!still_needed)
+    scrub_infop.reset();
+}
+
+/**
+ * Check this (complete) dirfrag's rstats.  If they are bad and the scrub
+ * header asks for repair, have the cache repair the dirfrag stats and mark
+ * the header as repaired.
+ *
+ * @return true if the rstats were good; false otherwise, even when a
+ *         repair was started
+ */
+bool CDir::scrub_local()
+{
+  ceph_assert(is_complete());
+
+  const bool good = check_rstats(true);
+  if (good)
+    return true;
+
+  if (scrub_infop->header->get_repair()) {
+    mdcache->repair_dirfrag_stats(this);
+    scrub_infop->header->set_repaired();
+  }
+  return false;
+}
+
+/**
+ * Path of this dirfrag's inode, as built by CInode::make_path_string().
+ */
+std::string CDir::get_path() const
+{
+  std::string result;
+  get_inode()->make_path_string(result, true);
+  return result;
+}
+
+/**
+ * Whether this fragment has grown past the fast-split threshold
+ * (mds_bal_split_size * mds_bal_fragment_fast_factor).
+ *
+ * Two cheap checks based on the accounted fragment size settle most cases;
+ * only in between are the dentries counted one by one.
+ */
+bool CDir::should_split_fast() const
+{
+  // Max size a fragment can be before trigger fast splitting
+  const int fast_limit = g_conf()->mds_bal_split_size * g_conf()->mds_bal_fragment_fast_factor;
+  const auto frag_size = get_frag_size();
+
+  // Cheap "no": even counting the null dentries we stay within the limit.
+  if (frag_size + get_num_head_null() <= fast_limit)
+    return false;
+
+  // Cheap "yes": the accounted size alone is already over the limit.
+  if (frag_size > fast_limit)
+    return true;
+
+  // Undecided: count the dentries whose projected linkage is not null.
+  int64_t live = 0;
+  for (const auto &entry : items) {
+    const CDentry *dn = entry.second;
+    if (dn->get_projected_linkage()->is_null())
+      continue;
+    ++live;
+  }
+
+  return live > fast_limit;
+}
+
+/**
+ * Whether this fragment is small enough to be merged.
+ *
+ * The root fragment is never merged.  For an ephemerally distributed inode
+ * with a non-zero minimum, a fragment is not merged unless it has more bits
+ * than that minimum.  Otherwise the fragment size is compared against
+ * mds_bal_merge_size.
+ */
+bool CDir::should_merge() const
+{
+  const auto fg = get_frag();
+  if (fg == frag_t())
+    return false;
+
+  if (inode->is_ephemeral_dist()) {
+    const unsigned min_bits = mdcache->get_ephemeral_dist_frag_bits();
+    if (min_bits > 0 && fg.bits() < min_bits + 1)
+      return false;
+  }
+
+  return static_cast<int>(get_frag_size()) < g_conf()->mds_bal_merge_size;
+}
+
+MEMPOOL_DEFINE_OBJECT_FACTORY(CDir, co_dir, mds_co);
+MEMPOOL_DEFINE_OBJECT_FACTORY(CDir::scrub_info_t, scrub_info_t, mds_co)
diff --git a/src/mds/CDir.h b/src/mds/CDir.h
new file mode 100644
index 000000000..4e189e273
--- /dev/null
+++ b/src/mds/CDir.h
@@ -0,0 +1,788 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software 
+ * Foundation.  See file COPYING.
+ * 
+ */
+
+
+#ifndef CEPH_CDIR_H
+#define CEPH_CDIR_H
+
+#include <iosfwd>
+#include <list>
+#include <map>
+#include <set>
+#include <string>
+#include <string_view>
+
+#include "common/bloom_filter.hpp"
+#include "common/config.h"
+#include "include/buffer_fwd.h"
+#include "include/counter.h"
+#include "include/types.h"
+
+#include "CInode.h"
+#include "MDSCacheObject.h"
+#include "MDSContext.h"
+#include "cephfs_features.h"
+#include "SessionMap.h"
+#include "messages/MClientReply.h"
+
+class CDentry;
+class MDCache;
+
+std::ostream& operator<<(std::ostream& out, const class CDir& dir);
+
+class CDir : public MDSCacheObject, public Counter<CDir> {
+public:
+  MEMPOOL_CLASS_HELPERS();
+
+  typedef mempool::mds_co::map<dentry_key_t, CDentry*> dentry_key_map;
+  typedef mempool::mds_co::set<dentry_key_t> dentry_key_set;
+
+  using fnode_ptr = std::shared_ptr<fnode_t>;
+  using fnode_const_ptr = std::shared_ptr<const fnode_t>;
+
+  template <typename ...Args>  // build a shared fnode_t; args are forwarded to its constructor
+  static fnode_ptr allocate_fnode(Args && ...args) {
+    static mempool::mds_co::pool_allocator<fnode_t> allocator;  // function-local static, reused by later calls
+    return std::allocate_shared<fnode_t>(allocator, std::forward<Args>(args)...);  // allocation goes through the mds_co pool allocator
+  }
+
+  struct dentry_commit_item {  // one dentry's data as gathered for a dirfrag commit (see _parse_dentry / _omap_commit_ops)
+    string key;
+    snapid_t first;
+    bool is_remote = false;
+
+    inodeno_t ino;
+    unsigned char d_type = 0;  // scalars are zero-initialized so a default-constructed item holds no indeterminate values
+    mempool::mds_co::string alternate_name;
+
+    bool snaprealm = false;
+    sr_t srnode;
+
+    mempool::mds_co::string symlink;
+    uint64_t features = 0;
+    uint64_t dft_len = 0;
+    CInode::inode_const_ptr inode;
+    CInode::xattr_map_const_ptr xattrs;
+    CInode::old_inode_map_const_ptr old_inodes;
+    snapid_t oldest_snap;
+    damage_flags_t damage_flags{};  // value-initialized for the same reason
+  };
+
+  // -- freezing --
+  struct freeze_tree_state_t {  // state object referenced by the dirfrags of one freezing/frozen tree
+    CDir *dir; // freezing/frozen tree root
+    int auth_pins = 0;  // NOTE(review): looks like a tree-level auth pin count; is_freezeable() compares it with CDir's auth_pins
+    bool frozen = false;  // false while freezing, true once frozen (see is_freezing_or_frozen_tree())
+    freeze_tree_state_t(CDir *d) : dir(d) {}  // d is stored as the tree root
+  };
+
+  class scrub_info_t {  // scrub bookkeeping of one dirfrag, owned through CDir::scrub_infop
+  public:
+    MEMPOOL_CLASS_HELPERS();
+    struct scrub_stamps {  // version/time pair recorded for one kind of scrub
+      version_t version = 0;
+      utime_t time;
+    };
+
+    scrub_info_t() {}
+
+    scrub_stamps last_recursive; // when we last finished a recursive scrub
+    scrub_stamps last_local; // when we last did a local scrub
+
+    bool directory_scrubbing = false; /// safety check; scrub_is_in_progress() reports this flag
+    bool last_scrub_dirty = false; /// is scrub info dirty or is it flushed to fnode?
+
+    ScrubHeaderRef header;  // scrub header handed out by get_scrub_header()
+  };
+
+  // -- pins --
+  static const int PIN_DNWAITER =     1;
+  static const int PIN_INOWAITER =    2;
+  static const int PIN_CHILD =        3;
+  static const int PIN_FROZEN =       4;
+  static const int PIN_SUBTREE =      5;
+  static const int PIN_IMPORTING =    7;
+  static const int PIN_IMPORTBOUND =  9;
+  static const int PIN_EXPORTBOUND = 10;
+  static const int PIN_STICKY =      11;
+  static const int PIN_SUBTREETEMP = 12;  // used by MDCache::trim_non_auth()
+
+  // -- state --
+  static const unsigned STATE_COMPLETE =      (1<< 0);  // the complete contents are in cache
+  static const unsigned STATE_FROZENTREE =    (1<< 1);  // root of tree (bounded by exports)
+  static const unsigned STATE_FREEZINGTREE =  (1<< 2);  // in process of freezing
+  static const unsigned STATE_FROZENDIR =     (1<< 3);
+  static const unsigned STATE_FREEZINGDIR =   (1<< 4);
+  static const unsigned STATE_COMMITTING =    (1<< 5);  // mid-commit
+  static const unsigned STATE_FETCHING =      (1<< 6);  // currently fetching
+  static const unsigned STATE_CREATING =      (1<< 7);
+  static const unsigned STATE_IMPORTBOUND =   (1<< 8);
+  static const unsigned STATE_EXPORTBOUND =   (1<< 9);
+  static const unsigned STATE_EXPORTING =     (1<<10);
+  static const unsigned STATE_IMPORTING =     (1<<11);
+  static const unsigned STATE_FRAGMENTING =   (1<<12);
+  static const unsigned STATE_STICKY =        (1<<13);  // sticky pin due to inode stickydirs
+  static const unsigned STATE_DNPINNEDFRAG =  (1<<14);  // dir is refragmenting
+  static const unsigned STATE_ASSIMRSTAT =    (1<<15);  // assimilating inode->frag rstats
+  static const unsigned STATE_DIRTYDFT =      (1<<16);  // dirty dirfragtree
+  static const unsigned STATE_BADFRAG =       (1<<17);  // bad dirfrag
+  static const unsigned STATE_TRACKEDBYOFT =  (1<<18);  // tracked by open file table
+  static const unsigned STATE_AUXSUBTREE =    (1<<19);  // no subtree merge
+
+  // common states
+  static const unsigned STATE_CLEAN =  0;
+
+  // these state bits are preserved by an import/export
+  // ...except if the directory is hashed, in which case none of them are!
+  static const unsigned MASK_STATE_EXPORTED = 
+  (STATE_COMPLETE|STATE_DIRTY|STATE_DIRTYDFT|STATE_BADFRAG);
+  static const unsigned MASK_STATE_IMPORT_KEPT = 
+  (						  
+   STATE_IMPORTING |
+   STATE_IMPORTBOUND |
+   STATE_EXPORTBOUND |
+   STATE_FROZENTREE |
+   STATE_STICKY |
+   STATE_TRACKEDBYOFT);
+  static const unsigned MASK_STATE_EXPORT_KEPT = 
+  (STATE_EXPORTING |
+   STATE_IMPORTBOUND |
+   STATE_EXPORTBOUND |
+   STATE_FROZENTREE |
+   STATE_FROZENDIR |
+   STATE_STICKY |
+   STATE_TRACKEDBYOFT);
+  static const unsigned MASK_STATE_FRAGMENT_KEPT = 
+  (STATE_DIRTY |
+   STATE_EXPORTBOUND |
+   STATE_IMPORTBOUND |
+   STATE_AUXSUBTREE |
+   STATE_REJOINUNDEF);
+
+  // -- rep spec --
+  static const int REP_NONE =     0;
+  static const int REP_ALL =      1;
+  static const int REP_LIST =     2;
+
+  static const unsigned EXPORT_NONCE  = 1;
+
+  // -- wait masks --
+  static const uint64_t WAIT_DENTRY       = (1<<0);  // wait for item to be in cache
+  static const uint64_t WAIT_COMPLETE     = (1<<1);  // wait for complete dir contents
+  static const uint64_t WAIT_FROZEN       = (1<<2);  // auth pins removed
+  static const uint64_t WAIT_CREATED	  = (1<<3);  // new dirfrag is logged
+
+  static const int WAIT_DNLOCK_OFFSET = 4;
+
+  static const uint64_t WAIT_ANY_MASK = (uint64_t)(-1);
+  static const uint64_t WAIT_ATSUBTREEROOT = (WAIT_SINGLEAUTH);
+
+  // -- dump flags --
+  static const int DUMP_PATH             = (1 << 0);
+  static const int DUMP_DIRFRAG          = (1 << 1);
+  static const int DUMP_SNAPID_FIRST     = (1 << 2);
+  static const int DUMP_VERSIONS         = (1 << 3);
+  static const int DUMP_REP              = (1 << 4);
+  static const int DUMP_DIR_AUTH         = (1 << 5);
+  static const int DUMP_STATES           = (1 << 6);
+  static const int DUMP_MDS_CACHE_OBJECT = (1 << 7);
+  static const int DUMP_ITEMS            = (1 << 8);
+  static const int DUMP_ALL              = (-1);
+  static const int DUMP_DEFAULT          = DUMP_ALL & (~DUMP_ITEMS);
+
+  CDir(CInode *in, frag_t fg, MDCache *mdc, bool auth);
+
+  std::string_view pin_name(int p) const override {  // human-readable name of pin type p
+    switch (p) {
+    case PIN_DNWAITER: return "dnwaiter";
+    case PIN_INOWAITER: return "inowaiter";
+    case PIN_CHILD: return "child";
+    case PIN_FROZEN: return "frozen";
+    case PIN_SUBTREE: return "subtree";
+    case PIN_IMPORTING: return "importing";
+    case PIN_IMPORTBOUND: return "importbound";
+    case PIN_EXPORTBOUND: return "exportbound";
+    case PIN_STICKY: return "sticky";
+    case PIN_SUBTREETEMP: return "subtreetemp";
+    default: return generic_pin_name(p);  // not one of the CDir pins listed above: defer to generic_pin_name()
+    }
+  }
+
+  bool is_lt(const MDSCacheObject *r) const override {
+    return dirfrag() < (static_cast<const CDir*>(r))->dirfrag();
+  }
+
+  void resync_accounted_fragstat();
+  void resync_accounted_rstat();
+  void assimilate_dirty_rstat_inodes(MutationRef& mut);
+  void assimilate_dirty_rstat_inodes_finish(EMetaBlob *blob);
+
+  void mark_exporting() {  // set STATE_EXPORTING and count this dirfrag on its inode
+    state_set(STATE_EXPORTING);
+    ++inode->num_exporting_dirs;
+  }
+  void clear_exporting() {  // undo mark_exporting(): clear the state bit and drop the inode's count
+    state_clear(STATE_EXPORTING);
+    --inode->num_exporting_dirs;
+  }
+
+  version_t get_version() const { return fnode->version; }
+  void update_projected_version() {
+    ceph_assert(projected_fnode.empty());
+    projected_version = fnode->version;
+  }
+  version_t get_projected_version() const { return projected_version; }
+
+  void reset_fnode(fnode_const_ptr&& ptr) {
+    fnode = std::move(ptr);
+  }
+  void set_fresh_fnode(fnode_const_ptr&& ptr);
+
+  const fnode_const_ptr& get_fnode() const {
+    return fnode;
+  }
+
+  // only used for updating newly allocated CDir; returns a mutable pointer to the fnode
+  fnode_t* _get_fnode() {
+    if (fnode == empty_fnode)  // still pointing at the static empty_fnode
+      reset_fnode(allocate_fnode());  // switch to a freshly allocated fnode before handing it out
+    return const_cast<fnode_t*>(fnode.get());  // drops the const of the fnode_const_ptr target
+  }
+
+  const fnode_const_ptr& get_projected_fnode() const {
+    // newest pending projection if there is one, else the current fnode
+    if (!projected_fnode.empty())
+      return projected_fnode.back();
+    return fnode;
+  }
+
+  // fnode should have already been projected in caller's context (asserted below)
+  fnode_t* _get_projected_fnode() {  // mutable pointer to the newest projected fnode
+    ceph_assert(!projected_fnode.empty());
+    return const_cast<fnode_t*>(projected_fnode.back().get());  // drops the const of the stored fnode_const_ptr target
+  }
+
+  fnode_ptr project_fnode(const MutationRef& mut);
+
+  void pop_and_dirty_projected_fnode(LogSegment *ls, const MutationRef& mut);
+  bool is_projected() const { return !projected_fnode.empty(); }
+  version_t pre_dirty(version_t min=0);
+  void _mark_dirty(LogSegment *ls);
+  void _set_dirty_flag() {
+    if (state_test(STATE_DIRTY))
+      return;  // already dirty: PIN_DIRTY is taken only on the clean -> dirty transition
+    state_set(STATE_DIRTY);
+    get(PIN_DIRTY);
+  }
+  void mark_dirty(LogSegment *ls, version_t pv=0);
+  void mark_clean();
+
+  bool is_new() { return item_new.is_on_list(); }
+  void mark_new(LogSegment *ls);
+
+  bool is_bad() { return state_test(STATE_BADFRAG); }
+
+  /**
+   * Call to start this CDir on a new scrub.
+   * @pre It is not currently scrubbing
+   * @pre The CDir is marked complete.
+   * @post It has set up its internal scrubbing state.
+   */
+  void scrub_initialize(const ScrubHeaderRef& header);
+  const ScrubHeaderRef& get_scrub_header() {  // header stored in scrub_infop, or a default-constructed ref
+    static const ScrubHeaderRef nullref;  // returned when scrub_infop is unset
+    return scrub_infop ? scrub_infop->header : nullref;
+  }
+
+  bool scrub_is_in_progress() const {
+    return (scrub_infop && scrub_infop->directory_scrubbing);
+  }
+
+  /**
+   * Call this once all CDentries have been scrubbed, according to
+   * scrub_dentry_next's listing. It finalizes the scrub statistics.
+   */
+  void scrub_finished();
+
+  void scrub_aborted();
+  /**
+   * Tell the CDir to do a local scrub of itself.
+   * @pre The CDir is_complete().
+   * @returns true if the rstats and directory contents match, false otherwise.
+   */
+  bool scrub_local();
+
+  const scrub_info_t *scrub_info() const {  // read-only scrub info, set up on first use
+    if (!scrub_infop)
+      scrub_info_create();  // const member; presumably populates scrub_infop -- confirm in CDir.cc
+    return scrub_infop.get();
+  }
+
+  // -- accessors --
+  inodeno_t ino()     const { return inode->ino(); }          // deprecate me?
+  frag_t    get_frag()    const { return frag; }
+  dirfrag_t dirfrag() const { return dirfrag_t(inode->ino(), frag); }
+
+  CInode *get_inode()    { return inode; }
+  const CInode *get_inode() const { return inode; }
+  CDir *get_parent_dir() { return inode->get_parent_dir(); }
+
+  dentry_key_map::iterator begin() { return items.begin(); }
+  dentry_key_map::iterator end() { return items.end(); }
+  dentry_key_map::iterator lower_bound(dentry_key_t key) { return items.lower_bound(key); }
+
+  unsigned get_num_head_items() const { return num_head_items; }
+  unsigned get_num_head_null() const { return num_head_null; }
+  unsigned get_num_snap_items() const { return num_snap_items; }
+  unsigned get_num_snap_null() const { return num_snap_null; }
+  unsigned get_num_any() const { return num_head_items + num_head_null + num_snap_items + num_snap_null; }
+  
+  bool check_rstats(bool scrub=false);
+
+  void inc_num_dirty() { num_dirty++; }
+  void dec_num_dirty() { 
+    ceph_assert(num_dirty > 0);
+    num_dirty--; 
+  }
+  int get_num_dirty() const {
+    return num_dirty;
+  }
+
+  void adjust_num_inodes_with_caps(int d);
+
+  int64_t get_frag_size() const {
+    return get_projected_fnode()->fragstat.size();
+  }
+
+  // -- dentries and inodes --
+  CDentry* lookup_exact_snap(std::string_view dname, snapid_t last);
+  CDentry* lookup(std::string_view n, snapid_t snap=CEPH_NOSNAP);
+
+  CDentry* add_null_dentry(std::string_view dname,
+			   snapid_t first=2, snapid_t last=CEPH_NOSNAP);
+  CDentry* add_primary_dentry(std::string_view dname, CInode *in, mempool::mds_co::string alternate_name,
+			      snapid_t first=2, snapid_t last=CEPH_NOSNAP);
+  CDentry* add_remote_dentry(std::string_view dname, inodeno_t ino, unsigned char d_type,
+                             mempool::mds_co::string alternate_name,
+			     snapid_t first=2, snapid_t last=CEPH_NOSNAP);
+  void remove_dentry( CDentry *dn );         // delete dentry
+  void link_remote_inode( CDentry *dn, inodeno_t ino, unsigned char d_type);
+  void link_remote_inode( CDentry *dn, CInode *in );
+  void link_primary_inode( CDentry *dn, CInode *in );
+  void unlink_inode(CDentry *dn, bool adjust_lru=true);
+  void try_remove_unlinked_dn(CDentry *dn);
+
+  void add_to_bloom(CDentry *dn);
+  bool is_in_bloom(std::string_view name);
+  bool has_bloom() { return (bloom ? true : false); }
+  void remove_bloom() {
+    bloom.reset();
+  }
+
+  void try_remove_dentries_for_stray();
+  bool try_trim_snap_dentry(CDentry *dn, const std::set<snapid_t>& snaps);
+
+  void split(int bits, std::vector<CDir*>* subs, MDSContext::vec& waiters, bool replay);
+  void merge(const std::vector<CDir*>& subs, MDSContext::vec& waiters, bool replay);
+
+  bool should_split() const {  // a split size <= 0 never reports a split
+    const auto split_size = g_conf()->mds_bal_split_size;
+    return split_size > 0 && static_cast<int>(get_frag_size()) > split_size;
+  }
+  bool should_split_fast() const;
+  bool should_merge() const;
+
+  mds_authority_t authority() const override;
+  mds_authority_t get_dir_auth() const { return dir_auth; }
+  void set_dir_auth(const mds_authority_t &a);
+  void set_dir_auth(mds_rank_t a) { set_dir_auth(mds_authority_t(a, CDIR_AUTH_UNKNOWN)); }
+  bool is_ambiguous_dir_auth() const {
+    return dir_auth.second != CDIR_AUTH_UNKNOWN;
+  }
+  bool is_full_dir_auth() const {
+    return is_auth() && !is_ambiguous_dir_auth();
+  }
+  bool is_full_dir_nonauth() const {
+    return !is_auth() && !is_ambiguous_dir_auth();
+  }
+  
+  bool is_subtree_root() const {
+    return dir_auth != CDIR_AUTH_DEFAULT;
+  }
+
+  bool contains(CDir *x);  // true if we are x or an ancestor of x 
+
+  // for giving to clients
+  void get_dist_spec(std::set<mds_rank_t>& ls, mds_rank_t auth) {
+    if (!is_auth())
+      return;  // a non-auth dirfrag leaves ls untouched
+    list_replicas(ls);
+    if (!ls.empty())
+      ls.insert(auth);  // replicas exist: list the auth rank alongside them
+  }
+
+  static void encode_dirstat(ceph::buffer::list& bl, const session_info_t& info, const DirStat& ds);
+
+  void _encode_base(ceph::buffer::list& bl) {  // append first, *fnode, dir_rep and dir_rep_by to bl
+    ENCODE_START(1, 1, bl);
+    encode(first, bl);  // field order must stay in step with _decode_base()
+    encode(*fnode, bl);
+    encode(dir_rep, bl);
+    encode(dir_rep_by, bl);
+    ENCODE_FINISH(bl);
+  }
+  void _decode_base(ceph::buffer::list::const_iterator& p) {  // reads back what _encode_base() wrote, in the same order
+    DECODE_START(1, p);
+    decode(first, p);
+    {
+      auto _fnode = allocate_fnode();  // decode into a fresh fnode; the current one is only reachable as const
+      decode(*_fnode, p);
+      reset_fnode(std::move(_fnode));  // then install it as this dirfrag's fnode
+    }
+    decode(dir_rep, p);
+    decode(dir_rep_by, p);
+    DECODE_FINISH(p);
+  }
+
+  // -- state --
+  bool is_complete() const { return state & STATE_COMPLETE; }  // const: these predicates only read `state`
+  bool is_exporting() const { return state & STATE_EXPORTING; }
+  bool is_importing() const { return state & STATE_IMPORTING; }
+  bool is_dirty_dft() const { return state & STATE_DIRTYDFT; }  // dirfragtree is dirty
+
+  int get_dir_rep() const { return dir_rep; }
+  bool is_rep() const {
+    // any replication spec other than REP_NONE counts as replicated
+    return dir_rep != REP_NONE;
+  }
+  bool can_rep() const;
+ 
+  // -- fetch --
+  object_t get_ondisk_object() { 
+    return file_object_t(ino(), frag);
+  }
+  void fetch(MDSContext *c, bool ignore_authpinnability=false);
+  void fetch(MDSContext *c, std::string_view want_dn, bool ignore_authpinnability=false);
+  void fetch(MDSContext *c, const std::set<dentry_key_t>& keys);
+
+#if 0  // unused?
+  void wait_for_commit(Context *c, version_t v=0);
+#endif
+  void commit_to(version_t want);
+  void commit(version_t want, MDSContext *c,
+	      bool ignore_authpinnability=false, int op_prio=-1);
+
+  // -- dirtyness --
+  version_t get_committing_version() const { return committing_version; }
+  version_t get_committed_version() const { return committed_version; }
+  void set_committed_version(version_t v) { committed_version = v; }
+
+  void mark_complete();
+
+  // -- reference counting --
+  void first_get() override;
+  void last_put() override;
+
+  bool is_waiting_for_dentry(std::string_view dname, snapid_t snap) {
+    return waiting_on_dentry.count(string_snap_t(dname, snap));
+  }
+  void add_dentry_waiter(std::string_view dentry, snapid_t snap, MDSContext *c);
+  void take_dentry_waiting(std::string_view dentry, snapid_t first, snapid_t last, MDSContext::vec& ls);
+  void take_sub_waiting(MDSContext::vec& ls);  // dentry or ino
+
+  void add_waiter(uint64_t mask, MDSContext *c) override;
+  void take_waiting(uint64_t mask, MDSContext::vec& ls) override;  // may include dentry waiters
+  void finish_waiting(uint64_t mask, int result = 0);    // ditto
+
+  // -- import/export --
+  mds_rank_t get_export_pin(bool inherit=true) const;
+  bool is_exportable(mds_rank_t dest) const;
+
+  void encode_export(ceph::buffer::list& bl);
+  void finish_export();
+  void abort_export() {
+    put(PIN_TEMPEXPORTING);
+  }
+  void decode_import(ceph::buffer::list::const_iterator& blp, LogSegment *ls);
+  void abort_import();
+
+  // -- auth pins --
+  bool can_auth_pin(int *err_ret=nullptr) const override;
+  int get_auth_pins() const { return auth_pins; }
+  int get_dir_auth_pins() const { return dir_auth_pins; }
+  void auth_pin(void *who) override;
+  void auth_unpin(void *who) override;
+
+  void adjust_nested_auth_pins(int dirinc, void *by);
+  void verify_fragstat();
+
+  void _walk_tree(std::function<bool(CDir*)> cb);
+
+  bool freeze_tree();
+  void _freeze_tree();
+  void unfreeze_tree();
+  void adjust_freeze_after_rename(CDir *dir);
+
+  bool freeze_dir();
+  void _freeze_dir();
+  void unfreeze_dir();
+
+  void maybe_finish_freeze();
+
+  std::pair<bool,bool> is_freezing_or_frozen_tree() const {
+    // result is (freezing, frozen); both are false when this dirfrag has
+    // no freeze_tree_state attached
+    if (!freeze_tree_state)
+      return std::make_pair(false, false);
+    const bool frozen = freeze_tree_state->frozen;
+    return std::make_pair(!frozen, frozen);
+  }
+
+  bool is_freezing() const override { return is_freezing_dir() || is_freezing_tree(); }
+  bool is_freezing_tree() const {
+    // a zero class-wide counter answers the question without consulting
+    // freeze_tree_state
+    return num_freezing_trees && is_freezing_or_frozen_tree().first;
+  }
+  bool is_freezing_tree_root() const { return state & STATE_FREEZINGTREE; }
+  bool is_freezing_dir() const { return state & STATE_FREEZINGDIR; }
+
+  bool is_frozen() const override { return is_frozen_dir() || is_frozen_tree(); }
+  bool is_frozen_tree() const {
+    // a zero class-wide counter answers the question without consulting
+    // freeze_tree_state
+    return num_frozen_trees && is_freezing_or_frozen_tree().second;
+  }
+  bool is_frozen_tree_root() const { return state & STATE_FROZENTREE; }
+  bool is_frozen_dir() const { return state & STATE_FROZENDIR; }
+
+  bool is_freezeable(bool freezing=false) const {
+    // no nested auth pins (one pin is discounted while `freezing`), and a
+    // attached freeze_tree_state must record the same pin count as we do
+    const int discount = freezing ? 1 : 0;
+    if (auth_pins - discount > 0)
+      return false;
+    if (freeze_tree_state && freeze_tree_state->auth_pins != auth_pins)
+      return false;
+
+    // inode must not be frozen, unless we are a subtree root
+    return is_subtree_root() || !inode->is_frozen();
+  }
+
+  bool is_freezeable_dir(bool freezing=false) const {
+    const int discount = freezing ? 1 : 0;  // one auth pin is tolerated while freezing
+    if (auth_pins - discount > 0 || dir_auth_pins > 0)
+      return false;
+
+    // if not subtree root, inode must not be frozen (tree--frozen_dir is okay).
+    if (is_subtree_root())
+      return true;
+    return !inode->is_frozen() || inode->is_frozen_dir();
+  }
+
+  bool is_any_freezing_or_frozen_inode() const {
+    return num_frozen_inodes || !freezing_inodes.empty();
+  }
+  bool is_auth_pinned_by_lock_cache() const {
+    return frozen_inode_suppressed;
+  }
+  void disable_frozen_inode() {
+    ceph_assert(num_frozen_inodes == 0);
+    frozen_inode_suppressed++;
+  }
+  void enable_frozen_inode();
+
+  std::ostream& print_db_line_prefix(std::ostream& out) override;
+  void print(std::ostream& out) override;
+  void dump(ceph::Formatter *f, int flags = DUMP_DEFAULT) const;
+  void dump_load(ceph::Formatter *f);
+
+  // context
+  MDCache *mdcache;
+
+  CInode *inode;  // my inode
+  frag_t frag;   // my frag
+
+  snapid_t first = 2;
+  mempool::mds_co::compact_map<snapid_t,old_rstat_t> dirty_old_rstat;  // [value.first,key]
+
+  // my inodes with dirty rstat data
+  elist<CInode*> dirty_rstat_inodes;
+
+  elist<CDentry*> dirty_dentries;
+  elist<CDir*>::item item_dirty, item_new;
+
+  // lock caches that auth-pin me
+  elist<MDLockCache::DirItem*> lock_caches_with_auth_pins;
+
+  // all dirfrags within freezing/frozen tree reference the 'state'
+  std::shared_ptr<freeze_tree_state_t> freeze_tree_state;
+
+protected:
+  // friends
+  friend class Migrator;
+  friend class CInode;
+  friend class MDCache;
+  friend class MDiscover;
+  friend class MDBalancer;
+
+  friend class CDirDiscover;
+  friend class CDirExport;
+  friend class C_IO_Dir_TMAP_Fetched;
+  friend class C_IO_Dir_OMAP_Fetched;
+  friend class C_IO_Dir_OMAP_FetchedMore;
+  friend class C_IO_Dir_Committed;
+  friend class C_IO_Dir_Commit_Ops;
+
+  void _omap_fetch(MDSContext *fin, const std::set<dentry_key_t>& keys);
+  void _omap_fetch_more(version_t omap_version, bufferlist& hdrbl,
+			map<string, bufferlist>& omap, MDSContext *fin);
+  CDentry *_load_dentry(
+      std::string_view key,
+      std::string_view dname,
+      snapid_t last,
+      ceph::buffer::list &bl,
+      int pos,
+      const std::set<snapid_t> *snaps,
+      double rand_threshold,
+      bool *force_dirty);
+
+  /**
+   * Go bad due to a damaged dentry (register with damagetable and go BADFRAG)
+   */
+  void go_bad_dentry(snapid_t last, std::string_view dname);
+
+  /**
+   * Go bad due to a damaged header (register with damagetable and go BADFRAG)
+   */
+  void go_bad(bool complete);
+
+  void _omap_fetched(ceph::buffer::list& hdrbl, std::map<std::string, ceph::buffer::list>& omap,
+		     bool complete, int r);
+
+  // -- commit --
+  void _commit(version_t want, int op_prio);
+  void _omap_commit_ops(int r, int op_prio, int64_t metapool, version_t version, bool _new,
+			vector<dentry_commit_item> &to_set, bufferlist &dfts,
+			vector<string> &to_remove,
+			mempool::mds_co::compact_set<mempool::mds_co::string> &_stale);
+  void _encode_primary_inode_base(dentry_commit_item &item, bufferlist &dfts,
+                                  bufferlist &bl);
+  void _omap_commit(int op_prio);
+  void _parse_dentry(CDentry *dn, dentry_commit_item &item,
+                     const set<snapid_t> *snaps, bufferlist &bl);
+  void _committed(int r, version_t v);
+
+  static fnode_const_ptr empty_fnode;
+  // fnode is a pointer to constant fnode_t, the constant fnode_t can be shared
+  // by CDir and log events. To update fnode, read-copy-update should be used.
+
+  fnode_const_ptr fnode = empty_fnode;
+
+  version_t projected_version = 0;
+  mempool::mds_co::list<fnode_const_ptr> projected_fnode;
+
+  std::unique_ptr<scrub_info_t> scrub_infop;
+
+  // contents of this directory
+  dentry_key_map items;       // non-null AND null
+  unsigned num_head_items = 0;
+  unsigned num_head_null = 0;
+  unsigned num_snap_items = 0;
+  unsigned num_snap_null = 0;
+
+  int num_dirty = 0;
+
+  int num_inodes_with_caps = 0;
+
+  // state
+  version_t committing_version = 0;
+  version_t committed_version = 0;
+
+  mempool::mds_co::compact_set<mempool::mds_co::string> stale_items;
+
+  // lock nesting, freeze
+  static int num_frozen_trees;
+  static int num_freezing_trees;
+
+  // freezing/frozen inodes in this dirfrag
+  int num_frozen_inodes = 0;
+  int frozen_inode_suppressed = 0;
+  elist<CInode*> freezing_inodes;
+
+  int dir_auth_pins = 0;
+
+  // cache control  (defined for authority; hints for replicas)
+  __s32      dir_rep;
+  mempool::mds_co::compact_set<__s32> dir_rep_by;      // if dir_rep == REP_LIST
+
+  // popularity
+  dirfrag_load_vec_t pop_me;
+  dirfrag_load_vec_t pop_nested;
+  dirfrag_load_vec_t pop_auth_subtree;
+  dirfrag_load_vec_t pop_auth_subtree_nested;
+
+  ceph::coarse_mono_time last_popularity_sample = ceph::coarse_mono_clock::zero();
+
+  load_spread_t pop_spread;
+
+  elist<CInode*> pop_lru_subdirs;
+
+  std::unique_ptr<bloom_filter> bloom; // XXX not part of mempool::mds_co
+  /* If you set up the bloom filter, you must keep it accurate!
+   * It's deleted when you mark_complete() and is deliberately not serialized.*/
+
+  mempool::mds_co::compact_set<mempool::mds_co::string> wanted_items;
+  mempool::mds_co::compact_map<version_t, MDSContext::vec_alloc<mempool::mds_co::pool_allocator> > waiting_for_commit;
+
+  // -- waiters --
+  mempool::mds_co::compact_map< string_snap_t, MDSContext::vec_alloc<mempool::mds_co::pool_allocator> > waiting_on_dentry; // FIXME string_snap_t not in mempool
+
+private:
+  friend std::ostream& operator<<(std::ostream& out, const class CDir& dir);
+
+  void log_mark_dirty();
+
+  /**
+   * Create a scrub_info_t struct for the scrub_infop pointer.
+   */
+  void scrub_info_create() const;
+  /**
+   * Delete scrub_infop if it no longer holds any useful data.
+   */
+  void scrub_maybe_delete_info();
+
+  void link_inode_work( CDentry *dn, CInode *in );
+  void unlink_inode_work( CDentry *dn );
+  void remove_null_dentries();
+  void purge_stale_snap_data(const std::set<snapid_t>& snaps);
+
+  void prepare_new_fragment(bool replay);
+  void prepare_old_fragment(std::map<string_snap_t, MDSContext::vec >& dentry_waiters, bool replay);
+  void steal_dentry(CDentry *dn);  // from another dir.  used by merge/split.
+  void finish_old_fragment(MDSContext::vec& waiters, bool replay);
+  void init_fragment_pins();
+  std::string get_path() const;
+
+  // -- authority --
+  /*
+   *     normal: <parent,unknown>   !subtree_root
+   * delegation: <mds,unknown>       subtree_root
+   *  ambiguous: <mds1,mds2>         subtree_root
+   *             <parent,mds2>       subtree_root
+   */
+  mds_authority_t dir_auth;
+};
+
+#endif
diff --git a/src/mds/CInode.cc b/src/mds/CInode.cc
new file mode 100644
index 000000000..07517eeb7
--- /dev/null
+++ b/src/mds/CInode.cc
@@ -0,0 +1,5494 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software 
+ * Foundation.  See file COPYING.
+ * 
+ */
+
+#include "include/int_types.h"
+#include "common/errno.h"
+
+#include <string>
+
+#include "CInode.h"
+#include "CDir.h"
+#include "CDentry.h"
+
+#include "MDSRank.h"
+#include "MDCache.h"
+#include "MDLog.h"
+#include "Locker.h"
+#include "Mutation.h"
+
+#include "events/EUpdate.h"
+
+#include "osdc/Objecter.h"
+
+#include "snap.h"
+
+#include "LogSegment.h"
+
+#include "common/Clock.h"
+
+#include "common/config.h"
+#include "global/global_context.h"
+#include "include/ceph_assert.h"
+
+#include "mds/MDSContinuation.h"
+#include "mds/InoTable.h"
+#include "cephfs_features.h"
+#include "osdc/Objecter.h"
+
+// Debug-log configuration for this file: dout() lines use the global context,
+// the mds subsystem, and a prefix naming the MDS node id and the inode number.
+// NOTE: dout_prefix references `mdcache` and `ino()`, so dout() can only be
+// used where those names resolve.
+#define dout_context g_ceph_context
+#define dout_subsys ceph_subsys_mds
+#undef dout_prefix
+#define dout_prefix *_dout << "mds." << mdcache->mds->get_nodeid() << ".cache.ino(" << ino() << ") "
+
+// Fill `op` with the writes for one backtrace commit: create the object if
+// needed, set the encoded backtrace as the "parent" xattr and, when
+// update_layout is set, the encoded layout as the "layout" xattr.
+void CInodeCommitOperation::update(ObjectOperation &op, inode_backtrace_t &bt) {
+  using ceph::encode;
+
+  op.priority = priority;
+  op.create(false);
+
+  {
+    bufferlist encoded_bt;
+    encode(bt, encoded_bt);
+    op.setxattr("parent", encoded_bt);
+  }
+
+  // for the old pool there is no need to update the layout
+  if (update_layout) {
+    bufferlist encoded_layout;
+    encode(_layout, encoded_layout, _features);
+    op.setxattr("layout", encoded_layout);
+  }
+}
+
+// Base class for I/O completion contexts that act on a single CInode.
+// The MDSRank is reached through the inode's cache.
+class CInodeIOContext : public MDSIOContextBase
+{
+protected:
+  CInode *in;
+
+  MDSRank *get_mds() override
+  {
+    return in->mdcache->mds;
+  }
+
+public:
+  explicit CInodeIOContext(CInode *in_)
+    : in(in_)
+  {
+    // a context without an inode is a programming error
+    ceph_assert(in != NULL);
+  }
+};
+
+// Sentinel pointer value (never dereferenced in this file) that marks a
+// projected node as carrying no srnode projection; distinct from a null
+// srnode, which pop_projected_snaprealm() treats as a request to merge the realm away.
+sr_t* const CInode::projected_inode::UNDEF_SRNODE = (sr_t*)(unsigned long)-1;
+
+// One shared LockType descriptor per kind of inode lock; every CInode's lock
+// members are constructed with a pointer to the matching descriptor.
+LockType CInode::versionlock_type(CEPH_LOCK_IVERSION);
+LockType CInode::authlock_type(CEPH_LOCK_IAUTH);
+LockType CInode::linklock_type(CEPH_LOCK_ILINK);
+LockType CInode::dirfragtreelock_type(CEPH_LOCK_IDFT);
+LockType CInode::filelock_type(CEPH_LOCK_IFILE);
+LockType CInode::xattrlock_type(CEPH_LOCK_IXATTR);
+LockType CInode::snaplock_type(CEPH_LOCK_ISNAP);
+LockType CInode::nestlock_type(CEPH_LOCK_INEST);
+LockType CInode::flocklock_type(CEPH_LOCK_IFLOCK);
+LockType CInode::policylock_type(CEPH_LOCK_IPOLICY);
+
+// Map a CInode pin id to its short name (as string literals, so the returned
+// view never dangles). Ids not listed here are delegated to generic_pin_name().
+std::string_view CInode::pin_name(int p) const
+{
+  switch (p) {
+    case PIN_DIRFRAG: return "dirfrag";
+    case PIN_CAPS: return "caps";
+    case PIN_IMPORTING: return "importing";
+    case PIN_OPENINGDIR: return "openingdir";
+    case PIN_REMOTEPARENT: return "remoteparent";
+    case PIN_BATCHOPENJOURNAL: return "batchopenjournal";
+    case PIN_SCATTERED: return "scattered";
+    case PIN_STICKYDIRS: return "stickydirs";
+      //case PIN_PURGING: return "purging";
+    case PIN_FREEZING: return "freezing";
+    case PIN_FROZEN: return "frozen";
+    case PIN_IMPORTINGCAPS: return "importingcaps";
+    case PIN_EXPORTINGCAPS: return "exportingcaps";
+    case PIN_PASTSNAPPARENT: return "pastsnapparent";
+    case PIN_OPENINGSNAPPARENTS: return "openingsnapparents";
+    case PIN_TRUNCATING: return "truncating";
+    case PIN_STRAY: return "stray";
+    case PIN_NEEDSNAPFLUSH: return "needsnapflush";
+    case PIN_DIRTYRSTAT: return "dirtyrstat";
+    case PIN_DIRTYPARENT: return "dirtyparent";
+    case PIN_DIRWAITER: return "dirwaiter";
+    default: return generic_pin_name(p);
+  }
+}
+
+//int cinode_pins[CINODE_NUM_PINS];  // counts
+// Write the "<time> mds.<id>.cache.ino(<ino>) " prefix to `out` and return it.
+ostream& CInode::print_db_line_prefix(ostream& out)
+{
+  out << ceph_clock_now() << " mds." << mdcache->mds->get_nodeid();
+  out << ".cache.ino(" << ino() << ") ";
+  return out;
+}
+
+/*
+ * write caps and lock ids
+ *
+ * Each entry pairs an inode lock id with a write/exclusive capability mask.
+ * num_cinode_locks is the number of entries in the table.
+ */
+struct cinode_lock_info_t cinode_lock_info[] = {
+  { CEPH_LOCK_IFILE, CEPH_CAP_ANY_FILE_WR },
+  { CEPH_LOCK_IAUTH, CEPH_CAP_AUTH_EXCL },
+  { CEPH_LOCK_ILINK, CEPH_CAP_LINK_EXCL },
+  { CEPH_LOCK_IXATTR, CEPH_CAP_XATTR_EXCL },
+};
+int num_cinode_locks = sizeof(cinode_lock_info) / sizeof(cinode_lock_info[0]);
+
+// Stream a one-line, bracketed debug description of an inode:
+// "[inode <ino> [first,last] <path> ... <address>]". Optional fields are only
+// emitted when they carry information (e.g. locks that are not sync+unlocked).
+ostream& operator<<(ostream& out, const CInode& in)
+{
+  // path is built from the projected parent chain
+  string path;
+  in.make_path_string(path, true);
+
+  out << "[inode " << in.ino();
+  out << " [" 
+      << (in.is_multiversion() ? "...":"")
+      << in.first << "," << in.last << "]";
+  out << " " << path << (in.is_dir() ? "/":"");
+
+  // authority: "auth" plus replica set, or "rep@<auth>[,<second>].<nonce>"
+  if (in.is_auth()) {
+    out << " auth";
+    if (in.is_replicated()) 
+      out << in.get_replicas();
+  } else {
+    mds_authority_t a = in.authority();
+    out << " rep@" << a.first;
+    if (a.second != CDIR_AUTH_UNKNOWN)
+      out << "," << a.second;
+    out << "." << in.get_replica_nonce();
+  }
+
+  if (in.is_symlink())
+    out << " symlink='" << in.symlink << "'";
+  if (in.is_dir() && !in.dirfragtree.empty())
+    out << " " << in.dirfragtree;
+  
+  // version, plus projected version when it is ahead
+  out << " v" << in.get_version();
+  if (in.get_projected_version() > in.get_version())
+    out << " pv" << in.get_projected_version();
+
+  if (in.get_num_auth_pins()) {
+    out << " ap=" << in.get_num_auth_pins();
+#ifdef MDS_AUTHPIN_SET
+    in.print_authpin_set(out);
+#endif
+  }
+
+  // NOTE: streams the snaprealm pointer itself, not the realm contents
+  if (in.snaprealm)
+    out << " snaprealm=" << in.snaprealm;
+
+  // state flags
+  if (in.state_test(CInode::STATE_AMBIGUOUSAUTH)) out << " AMBIGAUTH";
+  if (in.state_test(CInode::STATE_NEEDSRECOVER)) out << " NEEDSRECOVER";
+  if (in.state_test(CInode::STATE_RECOVERING)) out << " RECOVERING";
+  if (in.state_test(CInode::STATE_DIRTYPARENT)) out << " DIRTYPARENT";
+  if (in.state_test(CInode::STATE_MISSINGOBJS)) out << " MISSINGOBJS";
+  if (in.is_ephemeral_dist()) out << " DISTEPHEMERALPIN";
+  if (in.is_ephemeral_rand()) out << " RANDEPHEMERALPIN";
+  if (in.is_freezing_inode()) out << " FREEZING=" << in.auth_pin_freeze_allowance;
+  if (in.is_frozen_inode()) out << " FROZEN";
+  if (in.is_frozen_auth_pin()) out << " FROZEN_AUTHPIN";
+
+  const auto& pi = in.get_projected_inode();
+  if (pi->is_truncating())
+    out << " truncating(" << pi->truncate_from << " to " << pi->truncate_size << ")";
+
+  // directories show dirstat; everything else shows size and (if != 1) nlink
+  if (in.is_dir()) {
+    out << " " << in.get_inode()->dirstat;
+    if (g_conf()->mds_debug_scatterstat && in.is_projected()) {
+      out << "->" << pi->dirstat;
+    }
+  } else {
+    out << " s=" << in.get_inode()->size;
+    if (in.get_inode()->nlink != 1)
+      out << " nl=" << in.get_inode()->nlink;
+  }
+
+  // rstat
+  out << " " << in.get_inode()->rstat;
+  if (!(in.get_inode()->rstat == in.get_inode()->accounted_rstat))
+    out << "/" << in.get_inode()->accounted_rstat;
+  if (g_conf()->mds_debug_scatterstat && in.is_projected()) {
+    out << "->" << pi->rstat;
+    if (!(pi->rstat == pi->accounted_rstat))
+      out << "/" << pi->accounted_rstat;
+  }
+
+  if (in.is_any_old_inodes()) {
+    out << " old_inodes=" << in.get_old_inodes()->size();
+  }
+
+  if (!in.client_need_snapflush.empty())
+    out << " need_snapflush=" << in.client_need_snapflush;
+
+  // locks
+  if (!in.authlock.is_sync_and_unlocked())
+    out << " " << in.authlock;
+  if (!in.linklock.is_sync_and_unlocked())
+    out << " " << in.linklock;
+  if (in.get_inode()->is_dir()) {
+    if (!in.dirfragtreelock.is_sync_and_unlocked())
+      out << " " << in.dirfragtreelock;
+    if (!in.snaplock.is_sync_and_unlocked())
+      out << " " << in.snaplock;
+    if (!in.nestlock.is_sync_and_unlocked())
+      out << " " << in.nestlock;
+    if (!in.policylock.is_sync_and_unlocked())
+      out << " " << in.policylock;
+  } else  {
+    if (!in.flocklock.is_sync_and_unlocked())
+      out << " " << in.flocklock;
+  }
+  if (!in.filelock.is_sync_and_unlocked())
+    out << " " << in.filelock;
+  if (!in.xattrlock.is_sync_and_unlocked())
+    out << " " << in.xattrlock;
+  if (!in.versionlock.is_sync_and_unlocked())  
+    out << " " << in.versionlock;
+
+  // hack: spit out crap on which clients have caps
+  if (in.get_inode()->client_ranges.size())
+    out << " cr=" << in.get_inode()->client_ranges;
+
+  // per-client caps: <client>=<pending>[/<issued>]/<wanted>@<last_seq>
+  if (!in.get_client_caps().empty()) {
+    out << " caps={";
+    bool first = true;
+    for (const auto &p : in.get_client_caps()) {
+      if (!first) out << ",";
+      out << p.first << "="
+	  << ccap_string(p.second.pending());
+      if (p.second.issued() != p.second.pending())
+	out << "/" << ccap_string(p.second.issued());
+      out << "/" << ccap_string(p.second.wanted())
+	  << "@" << p.second.get_last_seq();
+      first = false;
+    }
+    out << "}";
+    // loner, with the wanted loner in parentheses when it differs
+    if (in.get_loner() >= 0 || in.get_wanted_loner() >= 0) {
+      out << ",l=" << in.get_loner();
+      if (in.get_loner() != in.get_wanted_loner())
+	out << "(" << in.get_wanted_loner() << ")";
+    }
+  }
+  // caps wanted by other MDS ranks: <key>=<caps>
+  if (!in.get_mds_caps_wanted().empty()) {
+    out << " mcw={";
+    bool first = true;
+    for (const auto &p : in.get_mds_caps_wanted()) {
+      if (!first)
+	out << ',';
+      out << p.first << '=' << ccap_string(p.second);
+      first = false;
+    }
+    out << '}';
+  }
+
+  if (in.get_num_ref()) {
+    out << " |";
+    in.print_pin_set(out);
+  }
+
+  if (in.get_inode()->export_pin != MDS_RANK_NONE) {
+    out << " export_pin=" << in.get_inode()->export_pin;
+  }
+  if (in.state_test(CInode::STATE_DISTEPHEMERALPIN)) {
+    out << " distepin";
+  }
+  if (in.state_test(CInode::STATE_RANDEPHEMERALPIN)) {
+    out << " randepin";
+  }
+
+  // trailing field is the object's address
+  out << " " << &in;
+  out << "]";
+  return out;
+}
+
+// Construct an inode covering snapids [f, l] in cache `c`. All list items and
+// locks are bound to this inode; the popularity counter uses the cache's decay
+// rate. STATE_AUTH is set when `auth` is true.
+CInode::CInode(MDCache *c, bool auth, snapid_t f, snapid_t l) :
+    mdcache(c), first(f), last(l),
+    item_dirty(this),
+    item_caps(this),
+    item_open_file(this),
+    item_dirty_parent(this),
+    item_dirty_dirfrag_dir(this),
+    item_dirty_dirfrag_nest(this),
+    item_dirty_dirfrag_dirfragtree(this),
+    pop(c->decayrate),
+    versionlock(this, &versionlock_type),
+    authlock(this, &authlock_type),
+    linklock(this, &linklock_type),
+    dirfragtreelock(this, &dirfragtreelock_type),
+    filelock(this, &filelock_type),
+    xattrlock(this, &xattrlock_type),
+    snaplock(this, &snaplock_type),
+    nestlock(this, &nestlock_type),
+    flocklock(this, &flocklock_type),
+    policylock(this, &policylock_type)
+{
+  if (auth)
+    state_set(STATE_AUTH);
+}
+
+// Print this inode to `out` using the CInode stream operator.
+void CInode::print(ostream& out)
+{
+  const CInode& self = *this;
+  out << self;
+}
+
+// Record that `client` still has to flush snapped state for `snapid`.
+// The first record overall pins and auth-pins this inode; the first record for
+// a given snapid auth-pins `snapin`.
+void CInode::add_need_snapflush(CInode *snapin, snapid_t snapid, client_t client)
+{
+  dout(10) << __func__ << " client." << client << " snapid " << snapid << " on " << snapin << dendl;
+
+  const bool first_record = client_need_snapflush.empty();
+  if (first_record) {
+    get(CInode::PIN_NEEDSNAPFLUSH);
+
+    // FIXME: this is non-optimal, as we'll block freezes/migrations for potentially
+    // long periods waiting for clients to flush their snaps.
+    auth_pin(this);   // pin head get_inode()->..
+  }
+
+  auto &waiting = client_need_snapflush[snapid];
+  const bool first_for_snapid = waiting.empty();
+  if (first_for_snapid) {
+    snapin->auth_pin(this);  // ...and pin snapped/old inode!
+  }
+
+  waiting.insert(client);
+}
+
+// Drop the pending-snapflush record of `client` for `snapid`. When the last
+// client for that snapid goes away `snapin` is auth-unpinned; when no records
+// remain at all, this inode's pin and auth pin are released too.
+void CInode::remove_need_snapflush(CInode *snapin, snapid_t snapid, client_t client)
+{
+  dout(10) << __func__ << " client." << client << " snapid " << snapid << " on " << snapin << dendl;
+
+  auto entry = client_need_snapflush.find(snapid);
+  if (entry == client_need_snapflush.end()) {
+    dout(10) << " snapid not found" << dendl;
+    return;
+  }
+
+  if (entry->second.erase(client) == 0) {
+    dout(10) << " client not found" << dendl;
+    return;
+  }
+
+  // other clients still pending for this snapid: nothing more to release
+  if (!entry->second.empty())
+    return;
+
+  client_need_snapflush.erase(entry);
+  snapin->auth_unpin(this);
+
+  if (!client_need_snapflush.empty())
+    return;
+
+  put(CInode::PIN_NEEDSNAPFLUSH);
+  auth_unpin(this);
+}
+
+// Re-home pending snapflush auth pins between `in` and `cowin` (presumably the
+// copy-on-write clone of `in` -- confirm against callers).
+//
+// Walks client_need_snapflush entries with snapid in [cowin->first, in->first).
+// For each one, `in` loses an auth pin; if cowin->last covers the snapid,
+// `cowin` gains one and the entry is kept, otherwise the entry is erased.
+//
+// Returns {cowin needs a flush, an entry with snapid <= in->last remains}.
+pair<bool,bool> CInode::split_need_snapflush(CInode *cowin, CInode *in)
+{
+  dout(10) << __func__ << " [" << cowin->first << "," << cowin->last << "] for " << *cowin << dendl;
+  bool cowin_need_flush = false;
+  bool orig_need_flush = false;
+  auto it = client_need_snapflush.lower_bound(cowin->first);
+  while (it != client_need_snapflush.end() && it->first < in->first) {
+    ceph_assert(!it->second.empty());
+    if (cowin->last >= it->first) {
+      // snapid is covered by cowin: move the pin there
+      cowin->auth_pin(this);
+      cowin_need_flush = true;
+      ++it;
+    } else {
+      // snapid is covered by neither inode: drop the record
+      it = client_need_snapflush.erase(it);
+    }
+    in->auth_unpin(this);
+  }
+
+  // first entry at or past in->first: still in range of `in`?
+  if (it != client_need_snapflush.end() && it->first <= in->last)
+    orig_need_flush = true;
+
+  return make_pair(cowin_need_flush, orig_need_flush);
+}
+
+// Flag this inode's rstat as dirty (no-op if already flagged). When the
+// projected parent dentry is auth, the inode is queued on the parent dir's
+// dirty_rstat_inodes list and the parent inode's nestlock is marked updated.
+void CInode::mark_dirty_rstat()
+{
+  if (state_test(STATE_DIRTYRSTAT))
+    return;
+
+  dout(10) << __func__ << dendl;
+  state_set(STATE_DIRTYRSTAT);
+  get(PIN_DIRTYRSTAT);
+
+  CDentry *pdn = get_projected_parent_dn();
+  if (!pdn->is_auth()) {
+    // under cross-MDS rename.
+    // DIRTYRSTAT flag will get cleared when rename finishes
+    ceph_assert(state_test(STATE_AMBIGUOUSAUTH));
+    return;
+  }
+
+  CDir *pdir = pdn->dir;
+  pdir->dirty_rstat_inodes.push_back(&dirty_rstat_item);
+  mdcache->mds->locker->mark_updated_scatterlock(&pdir->inode->nestlock);
+}
+// Undo mark_dirty_rstat(): clear the flag, drop the pin and unlink from the
+// dirty-rstat list. No-op when the flag is not set.
+void CInode::clear_dirty_rstat()
+{
+  if (!state_test(STATE_DIRTYRSTAT))
+    return;
+
+  dout(10) << __func__ << dendl;
+  state_clear(STATE_DIRTYRSTAT);
+  put(PIN_DIRTYRSTAT);
+  dirty_rstat_item.remove_myself();
+}
+
+// Create (or reuse) a projected, writable copy of the inode for an update.
+//
+// If `mut` has already projected this inode, the newest projected inode is
+// handed back (const cast away) with no xattr map; `xattr` and `snap` must both
+// be false in that case. Otherwise a copy of the current projected inode is
+// appended to projected_nodes, together with
+//   - a fresh copy of the projected xattrs when `xattr` is set (else the
+//     existing projected xattr map is shared), and
+//   - a new srnode from prepare_new_srnode(0) when `snap` is set (else
+//     UNDEF_SRNODE).
+// Pending scrub stamps are folded into the new projection.
+CInode::projected_inode CInode::project_inode(const MutationRef& mut,
+					      bool xattr, bool snap)
+{
+  if (mut && mut->is_projected(this)) {
+    ceph_assert(!xattr && !snap);
+    auto _inode = std::const_pointer_cast<mempool_inode>(projected_nodes.back().inode);
+    return projected_inode(std::move(_inode), xattr_map_ptr());
+  }
+
+  auto pi = allocate_inode(*get_projected_inode());
+
+  // carry over scrub results that have not been written to an inode yet
+  if (scrub_infop && scrub_infop->last_scrub_dirty) {
+    pi->last_scrub_stamp = scrub_infop->last_scrub_stamp;
+    pi->last_scrub_version = scrub_infop->last_scrub_version;
+    scrub_infop->last_scrub_dirty = false;
+    scrub_maybe_delete_info();
+  }
+
+  const auto& ox = get_projected_xattrs();
+  xattr_map_ptr px;
+  if (xattr) {
+    px = allocate_xattr_map();
+    if (ox)
+      *px = *ox;
+  }
+
+  sr_t* ps = projected_inode::UNDEF_SRNODE;
+  if (snap) {
+    ps = prepare_new_srnode(0);
+    ++num_projected_srnodes;
+  }
+
+  // pi/px are copied into the list here and moved into the return value below
+  projected_nodes.emplace_back(pi, xattr ? px : ox , ps);
+  if (mut)
+    mut->add_projected_node(this);
+  dout(15) << __func__ << " " << pi->ino << dendl;
+  return projected_inode(std::move(pi), std::move(px), ps);
+}
+
+// Make the oldest projected node the current inode state and mark it dirty in
+// log segment `ls`. Also applies the node's xattrs (if they differ) and its
+// srnode projection (if any), and re-evaluates export pinning when either
+// export pin field changed.
+void CInode::pop_and_dirty_projected_inode(LogSegment *ls, const MutationRef& mut)
+{
+  ceph_assert(!projected_nodes.empty());
+  auto front = std::move(projected_nodes.front());
+  dout(15) << __func__ << " v" << front.inode->version << dendl;
+
+  projected_nodes.pop_front();
+  if (mut)
+    mut->remove_projected_node(this);
+
+  // compare against the current inode before it is replaced below
+  bool pool_updated = get_inode()->layout.pool_id != front.inode->layout.pool_id;
+  bool pin_updated = (get_inode()->export_pin != front.inode->export_pin) ||
+		     (get_inode()->export_ephemeral_distributed_pin !=
+		      front.inode->export_ephemeral_distributed_pin);
+
+  reset_inode(std::move(front.inode));
+  if (front.xattrs != get_xattrs())
+    reset_xattrs(std::move(front.xattrs));
+
+  if (front.snapnode != projected_inode::UNDEF_SRNODE) {
+    --num_projected_srnodes;
+    pop_projected_snaprealm(front.snapnode, false);
+  }
+
+  mark_dirty(ls);
+  if (get_inode()->is_backtrace_updated())
+    mark_dirty_parent(ls, pool_updated);
+
+  if (pin_updated)
+    maybe_export_pin(true);
+}
+
+// Allocate an srnode for a new projection; the caller owns the result.
+// If a projected (or current) srnode exists it is copied. Otherwise a new one
+// is created with seq/created set to `snapid` (or the global realm's newest seq
+// when `snapid` is 0) and current_parent_since set to the oldest snap.
+sr_t *CInode::prepare_new_srnode(snapid_t snapid)
+{
+  if (const sr_t *existing = get_projected_srnode())
+    return new sr_t(*existing);
+
+  if (snapid == 0)
+    snapid = mdcache->get_global_snaprealm()->get_newest_seq();
+
+  sr_t *fresh = new sr_t();
+  fresh->seq = snapid;
+  fresh->created = snapid;
+  fresh->current_parent_since = get_oldest_snap();
+  return fresh;
+}
+
+// Return the newest projected srnode; fall back to the open snaprealm's
+// srnode, or NULL when there is neither.
+const sr_t *CInode::get_projected_srnode() const {
+  if (num_projected_srnodes > 0) {
+    // scan newest-to-oldest for a node that carries an srnode projection
+    auto node = projected_nodes.rbegin();
+    while (node != projected_nodes.rend()) {
+      if (node->snapnode != projected_inode::UNDEF_SRNODE)
+	return node->snapnode;
+      ++node;
+    }
+  }
+  if (!snaprealm)
+    return NULL;
+  return &snaprealm->srnode;
+}
+
+// Attach `new_srnode` to the newest projected node, which must not already
+// carry an srnode projection.
+void CInode::project_snaprealm(sr_t *new_srnode)
+{
+  dout(10) << __func__ << " " << new_srnode << dendl;
+  ceph_assert(projected_nodes.back().snapnode == projected_inode::UNDEF_SRNODE);
+
+  auto &newest = projected_nodes.back();
+  newest.snapnode = new_srnode;
+  num_projected_srnodes += 1;
+}
+
+// Switch `new_srnode` to the global parent realm (non-directories only).
+// The previous current_parent_since is stashed in last_destroyed so that
+// clear_snaprealm_global() can restore it.
+void CInode::mark_snaprealm_global(sr_t *new_srnode)
+{
+  ceph_assert(!is_dir());
+  // 'last_destroyed' is no longer used, use it to store origin 'current_parent_since'
+  new_srnode->last_destroyed = new_srnode->current_parent_since;
+  new_srnode->current_parent_since = mdcache->get_global_snaprealm()->get_newest_seq() + 1;
+  new_srnode->mark_parent_global();
+}
+
+// Inverse of mark_snaprealm_global(): restore current_parent_since from
+// last_destroyed, reset last_destroyed, bump seq to the global realm's newest
+// seq and clear the parent-global flag.
+void CInode::clear_snaprealm_global(sr_t *new_srnode)
+{
+  // restore 'current_parent_since'
+  new_srnode->current_parent_since = new_srnode->last_destroyed;
+  new_srnode->last_destroyed = 0;
+  new_srnode->seq = mdcache->get_global_snaprealm()->get_newest_seq();
+  new_srnode->clear_parent_global();
+}
+
+// True when a projected (or current) srnode exists and has the parent-global flag.
+bool CInode::is_projected_snaprealm_global() const
+{
+  const sr_t *projected = get_projected_srnode();
+  return projected != nullptr && projected->is_parent_global();
+}
+
+// Project a new srnode and record the switch to `newparent` in it.
+void CInode::project_snaprealm_past_parent(SnapRealm *newparent)
+{
+  record_snaprealm_past_parent(project_snaprealm(), newparent);
+}
+
+
+/* If newparent differs from the current parent realm, copy the current
+ * parent's snaps (from current_parent_since onward) into past_parent_snaps and
+ * start a new current_parent_since.
+ * If this inode has no snaprealm of its own, the current parent realm is
+ * looked up with find_snaprealm(). */
+void CInode::record_snaprealm_past_parent(sr_t *new_snap, SnapRealm *newparent)
+{
+  ceph_assert(!new_snap->is_parent_global());
+  SnapRealm *oldparent;
+  if (!snaprealm) {
+    oldparent = find_snaprealm();
+  } else {
+    oldparent = snaprealm->parent;
+  }
+
+  if (newparent != oldparent) {
+    snapid_t oldparentseq = oldparent->get_newest_seq();
+    // only relevant if the old parent has snaps at/after current_parent_since
+    if (oldparentseq + 1 > new_snap->current_parent_since) {
+      // copy old parent's snaps
+      const set<snapid_t>& snaps = oldparent->get_snaps();
+      auto p = snaps.lower_bound(new_snap->current_parent_since);
+      if (p != snaps.end())
+	new_snap->past_parent_snaps.insert(p, snaps.end());
+      if (oldparentseq > new_snap->seq)
+	new_snap->seq = oldparentseq;
+    }
+    new_snap->current_parent_since = mdcache->get_global_snaprealm()->get_newest_seq() + 1;
+  }
+}
+
+// For an srnode whose parent is the global realm: copy the snaps of the realm
+// containing `dn` into past_parent_snaps.
+//   - non-primary dentry: snaps from dn->first onward
+//   - primary dentry: snaps from the stashed current_parent_since (kept in
+//     last_destroyed) onward, then advance that stash past the global newest seq
+// `oldparent` may be null, in which case the realm is found via dn's directory inode.
+void CInode::record_snaprealm_parent_dentry(sr_t *new_snap, SnapRealm *oldparent,
+					    CDentry *dn, bool primary_dn)
+{
+  ceph_assert(new_snap->is_parent_global());
+
+  if (!oldparent)
+    oldparent = dn->get_dir()->inode->find_snaprealm();
+  auto& snaps = oldparent->get_snaps();
+
+  if (!primary_dn) {
+    auto p = snaps.lower_bound(dn->first);
+    if (p != snaps.end())
+      new_snap->past_parent_snaps.insert(p, snaps.end());
+  } else {
+    // 'last_destroyed' is used as 'current_parent_since'
+    auto p = snaps.lower_bound(new_snap->last_destroyed);
+    if (p != snaps.end())
+      new_snap->past_parent_snaps.insert(p, snaps.end());
+    new_snap->last_destroyed = mdcache->get_global_snaprealm()->get_newest_seq() + 1;
+  }
+}
+
+// Apply the oldest projected node's srnode now (ahead of the inode itself) and
+// mark the node as no longer carrying an srnode projection.
+void CInode::early_pop_projected_snaprealm()
+{
+  ceph_assert(!projected_nodes.empty());
+
+  auto &oldest = projected_nodes.front();
+  if (oldest.snapnode == projected_inode::UNDEF_SRNODE)
+    return;
+
+  pop_projected_snaprealm(oldest.snapnode, true);
+  oldest.snapnode = projected_inode::UNDEF_SRNODE;
+  --num_projected_srnodes;
+}
+
+// Apply a projected srnode to this inode's snaprealm.
+// Non-null: open the realm if needed, copy *next_snaprealm into it, delete
+// next_snaprealm (ownership is taken here), and re-parent the realm if the
+// PARENT_GLOBAL flag flipped.
+// Null: the realm must already be open; it is handed to merge_to(NULL).
+// `early` only affects the log message.
+void CInode::pop_projected_snaprealm(sr_t *next_snaprealm, bool early)
+{
+  if (next_snaprealm) {
+    dout(10) << __func__ << (early ? " (early) " : " ")
+	     << next_snaprealm << " seq " << next_snaprealm->seq << dendl;
+    if (!snaprealm)
+      open_snaprealm();
+
+    auto old_flags = snaprealm->srnode.flags;
+    snaprealm->srnode = *next_snaprealm;
+    delete next_snaprealm;
+
+    // did PARENT_GLOBAL change between the old and new srnode?
+    if ((snaprealm->srnode.flags ^ old_flags) & sr_t::PARENT_GLOBAL) {
+      snaprealm->adjust_parent();
+    }
+
+    if (snaprealm->parent)
+      dout(10) << " realm " << *snaprealm << " parent " << *snaprealm->parent << dendl;
+  } else {
+    dout(10) << __func__ << (early ? " (early) null" : " null") << dendl;
+    ceph_assert(snaprealm);
+    snaprealm->merge_to(NULL);
+  }
+}
+
+
+// ====== CInode =======
+
+// dirfrags
+
+// Shared inode instance obtained from allocate_inode() with no arguments.
+InodeStoreBase::inode_const_ptr InodeStoreBase::empty_inode = InodeStoreBase::allocate_inode();
+
+// Hash a dentry name with the directory's configured hash function
+// (dir_layout.dl_dir_hash), defaulting to CEPH_STR_HASH_LINUX when unset.
+__u32 InodeStoreBase::hash_dentry_name(std::string_view dn)
+{
+  const int configured = inode->dir_layout.dl_dir_hash;
+  int which = configured ? configured : CEPH_STR_HASH_LINUX;
+  ceph_assert(ceph_str_hash_valid(which));
+  return ceph_str_hash(which, dn.data(), dn.length());
+}
+
+// Choose the dirfrag that holds dentry name `dn`.
+frag_t InodeStoreBase::pick_dirfrag(std::string_view dn)
+{
+  if (!dirfragtree.empty()) {
+    const __u32 hash = hash_dentry_name(dn);
+    return dirfragtree[hash];
+  }
+
+  // unfragmented directory: avoid the string hash if we can.
+  return frag_t();
+}
+
+// Collect the open dirfrags that lie under `fg`.
+//
+// Returns {all, dirs}:
+//   - exact match open: all = true, dirs = { that dirfrag }
+//   - an open dirfrag at the same or a shallower depth contains fg:
+//     all = true, dirs as collected so far
+//   - otherwise dirs holds every open dirfrag contained in fg, and all tells
+//     whether those dirfrags together cover fg completely.
+std::pair<bool, std::vector<CDir*>> CInode::get_dirfrags_under(frag_t fg)
+{
+  std::pair<bool, std::vector<CDir*>> result;
+  auto& all = result.first;
+  auto& dirs = result.second;
+  all = false;
+  
+  if (auto it = dirfrags.find(fg); it != dirfrags.end()){
+    all = true;
+    dirs.push_back(it->second);
+    return result;
+  }
+  
+  int total = 0;
+  for(auto &[_fg, _dir] : dirfrags){
+    // frag_t.bits() gives the depth of a frag in the fragment tree, e.g.
+    //
+    //   *                    depth 0
+    //   0*      1*           depth 1
+    //   00* 01* 10* 11*      depth 2  (01* has bits() == 2)
+    //
+    // so fragA.bits() > fragB.bits() means fragA is deeper than fragB.
+
+    if (fg.bits() >= _fg.bits()) {
+      if (_fg.contains(fg)) {
+	all = true;
+	return result;
+      }
+    } else {
+      if (fg.contains(_fg)) {
+	dirs.push_back(_dir);
+	// Count how many depth-24 frags this dirfrag stands for: a frag at
+	// depth d splits into 2 frags at depth d+1, 4 at depth d+2, and
+	// (1 << (24 - d)) frags at depth 24.
+	total += 1 << (24 - _fg.bits());
+      }
+    }
+  }
+
+  // fg is fully covered iff the depth-24 frag counts of the collected dirfrags
+  // add up to the depth-24 frag count of fg itself.
+  all = ((1<<(24-fg.bits())) == total);
+  return result;
+}
+
+// Assert that every open dirfrag is a leaf of dirfragtree; offenders are
+// logged at level 0 before the assertion fires.
+void CInode::verify_dirfrags()
+{
+  bool bad = false;
+  for (const auto &[frag, dir] : dirfrags) {
+    if (dirfragtree.is_leaf(frag))
+      continue;
+    dout(0) << "have open dirfrag " << frag << " but not leaf in " << dirfragtree
+	    << ": " << *dir << dendl;
+    bad = true;
+  }
+  ceph_assert(!bad);
+}
+
+// If any open dirfrag is not a leaf of dirfragtree, ask the cache to force a
+// dirfrag for every leaf, then verify the result.
+void CInode::force_dirfrags()
+{
+  bool mismatch = false;
+  for (auto &[frag, dir] : dirfrags) {
+    if (dirfragtree.is_leaf(frag))
+      continue;
+    dout(0) << "have open dirfrag " << frag << " but not leaf in " << dirfragtree
+	    << ": " << *dir << dendl;
+    mismatch = true;
+  }
+
+  if (mismatch) {
+    frag_vec_t leaves;
+    dirfragtree.get_leaves(leaves);
+    for (const auto& leaf : leaves)
+      mdcache->get_force_dirfrag(dirfrag_t(ino(), leaf), true);
+  }
+
+  verify_dirfrags();
+}
+
+// Find an open dirfrag "near" `fg`: the exact frag if open, else the first
+// open frag underneath it, else the closest open ancestor. NULL if none.
+CDir *CInode::get_approx_dirfrag(frag_t fg)
+{
+  if (CDir *exact = get_dirfrag(fg))
+    return exact;
+
+  // find a child?
+  auto under = get_dirfrags_under(fg);
+  if (!under.second.empty())
+    return under.second.front();
+
+  // try parents?
+  for (frag_t up = fg; up.bits() > 0; ) {
+    up = up.parent();
+    if (CDir *ancestor = get_dirfrag(up))
+      return ancestor;
+  }
+  return NULL;
+}	
+
+// Return the open dirfrag `fg`, creating and registering it when missing.
+// Creation is only allowed on the auth MDS or during replay.
+CDir *CInode::get_or_open_dirfrag(MDCache *mdcache, frag_t fg)
+{
+  ceph_assert(is_dir());
+
+  // have it?
+  if (CDir *existing = get_dirfrag(fg))
+    return existing;
+
+  // create it.
+  ceph_assert(is_auth() || mdcache->mds->is_any_replay());
+  CDir *created = new CDir(this, fg, mdcache, is_auth());
+  add_dirfrag(created);
+  return created;
+}
+
+// Register `dir` under its frag id (which must not be open yet), apply the
+// sticky state if sticky dirs are active, and re-evaluate export pinning.
+CDir *CInode::add_dirfrag(CDir *dir)
+{
+  auto em = dirfrags.emplace(std::piecewise_construct,
+			     std::forward_as_tuple(dir->dirfrag().frag),
+			     std::forward_as_tuple(dir));
+  ceph_assert(em.second);
+
+  const bool sticky = stickydir_ref > 0;
+  if (sticky) {
+    dir->state_set(CDir::STATE_STICKY);
+    dir->get(CDir::PIN_STICKY);
+  }
+
+  maybe_export_pin();
+  return dir;
+}
+
+// Close and destroy the open dirfrag `fg`: drop null dentries, clear dirty and
+// sticky state, fix up the subtree-root count, then delete the CDir (which
+// must be unreferenced by then) and remove it from the map.
+void CInode::close_dirfrag(frag_t fg)
+{
+  dout(14) << __func__ << " " << fg << dendl;
+  ceph_assert(dirfrags.count(fg));
+
+  CDir *dir = dirfrags.find(fg)->second;
+  dir->remove_null_dentries();
+
+  // clear dirty flag
+  if (dir->is_dirty()) {
+    dir->mark_clean();
+  }
+
+  const bool sticky = stickydir_ref > 0;
+  if (sticky) {
+    dir->state_clear(CDir::STATE_STICKY);
+    dir->put(CDir::PIN_STICKY);
+  }
+
+  if (dir->is_subtree_root()) {
+    num_subtree_roots--;
+  }
+
+  // dump any remaining dentries, for debugging purposes
+  for (const auto &[key, dn] : dir->items)
+    dout(14) << __func__ << " LEFTOVER dn " << *dn << dendl;
+
+  ceph_assert(dir->get_num_ref() == 0);
+  delete dir;
+  dirfrags.erase(fg);
+}
+
+// Close every open dirfrag, one at a time, until none remain.
+void CInode::close_dirfrags()
+{
+  while (!dirfrags.empty()) {
+    const frag_t next = dirfrags.begin()->first;
+    close_dirfrag(next);
+  }
+}
+
+// Does this inode have a subtree-root dirfrag? With auth == -1 any subtree
+// root counts; otherwise its dir_auth.first must equal `auth`.
+bool CInode::has_subtree_root_dirfrag(int auth)
+{
+  if (!(num_subtree_roots > 0))
+    return false;
+  if (auth == -1)
+    return true;
+
+  for (const auto &[frag, dir] : dirfrags) {
+    if (dir->is_subtree_root() && dir->dir_auth.first == auth)
+      return true;
+  }
+  return false;
+}
+
+// True when at least one dirfrag is a subtree root or is being exported.
+bool CInode::has_subtree_or_exporting_dirfrag()
+{
+  return num_subtree_roots > 0 || num_exporting_dirs > 0;
+}
+
+// Take a sticky-dirs reference. The first reference pins this inode and marks
+// and pins every open dirfrag as sticky.
+void CInode::get_stickydirs()
+{
+  const bool first_ref = (stickydir_ref == 0);
+  if (first_ref) {
+    get(PIN_STICKYDIRS);
+    for (const auto &[frag, dir] : dirfrags) {
+      dir->state_set(CDir::STATE_STICKY);
+      dir->get(CDir::PIN_STICKY);
+    }
+  }
+  ++stickydir_ref;
+}
+
+// Release a sticky-dirs reference. Dropping the last one unpins this inode and
+// clears the sticky state and pin on every open dirfrag.
+void CInode::put_stickydirs()
+{
+  ceph_assert(stickydir_ref > 0);
+  --stickydir_ref;
+  if (stickydir_ref != 0)
+    return;
+
+  put(PIN_STICKYDIRS);
+  for (const auto &[frag, dir] : dirfrags) {
+    dir->state_clear(CDir::STATE_STICKY);
+    dir->put(CDir::PIN_STICKY);
+  }
+}
+
+
+
+
+
+// pins
+
+// Pin the parent dentry, if this inode has one.
+void CInode::first_get()
+{
+  // pin my dentry?
+  if (!parent)
+    return;
+  parent->get(CDentry::PIN_INODEPIN);
+}
+
+// Unpin the parent dentry, if this inode has one.
+void CInode::last_put()
+{
+  // unpin my dentry?
+  if (!parent)
+    return;
+  parent->put(CDentry::PIN_INODEPIN);
+}
+
+// When the reference count equals the number of dirty/dirty-parent flags that
+// are set (each converted to 0 or 1), ask the cache to evaluate this inode as a stray.
+void CInode::_put()
+{
+  if (get_num_ref() == (int)is_dirty() + (int)is_dirty_parent())
+    mdcache->maybe_eval_stray(this, true);
+}
+
+// Track `p` as a remote parent dentry; the first one takes PIN_REMOTEPARENT.
+void CInode::add_remote_parent(CDentry *p) 
+{
+  const bool was_empty = remote_parents.empty();
+  if (was_empty) {
+    get(PIN_REMOTEPARENT);
+  }
+  remote_parents.insert(p);
+}
+// Stop tracking `p`; PIN_REMOTEPARENT is put whenever the set is empty afterwards.
+void CInode::remove_remote_parent(CDentry *p) 
+{
+  remote_parents.erase(p);
+  if (!remote_parents.empty())
+    return;
+  put(PIN_REMOTEPARENT);
+}
+
+
+
+
+// Directory fragment holding the parent dentry, or NULL without a parent.
+CDir *CInode::get_parent_dir()
+{
+  if (!parent)
+    return NULL;
+  return parent->dir;
+}
+// Directory fragment holding the projected parent dentry, or NULL if none.
+CDir *CInode::get_projected_parent_dir()
+{
+  if (CDentry *pdn = get_projected_parent_dn())
+    return pdn->dir;
+  return NULL;
+}
+// Inode of the directory holding the parent dentry, or NULL without a parent.
+CInode *CInode::get_parent_inode() 
+{
+  if (!parent)
+    return NULL;
+  return parent->dir->inode;
+}
+
+// Walk up from `other` via get_oldest_parent_dn(); true if the walk reaches
+// this inode (an inode counts as its own ancestor). A null `other` gives false.
+bool CInode::is_ancestor_of(const CInode *other) const
+{
+  while (other && other != this) {
+    const CDentry *pdn = other->get_oldest_parent_dn();
+    if (!pdn) {
+      // ran out of parents: must have hit a base inode
+      ceph_assert(other->is_base());
+      return false;
+    }
+    other = pdn->get_dir()->get_inode();
+  }
+  return other != nullptr;
+}
+
+// Same walk as is_ancestor_of(), but following projected parent dentries.
+bool CInode::is_projected_ancestor_of(const CInode *other) const
+{
+  while (other && other != this) {
+    const CDentry *pdn = other->get_projected_parent_dn();
+    if (!pdn) {
+      // ran out of parents: must have hit a base inode
+      ceph_assert(other->is_base());
+      return false;
+    }
+    other = pdn->get_dir()->get_inode();
+  }
+  return other != nullptr;
+}
+
+/*
+ * Build a printable path for this inode into `s`.
+ *
+ * Because a non-directory inode may have multiple links, the use_parent
+ * argument allows selecting which parent to use for path construction. This
+ * argument is only meaningful for the final component (i.e. the first of the
+ * nested calls) because directories cannot have multiple hard links. If
+ * use_parent is NULL and projected is true, the primary parent's projected
+ * inode is used all the way up the path chain. Otherwise the primary parent
+ * stable inode is used.
+ *
+ * Without any parent dentry the result is:
+ *   - ""          for the root inode (s is overwritten)
+ *   - "~mds<N>"   for an mdsdir, N = ino - MDS_INO_MDSDIR_OFFSET (s is overwritten)
+ *   - "#<hexino>" otherwise (appended to s)
+ */
+void CInode::make_path_string(string& s, bool projected, const CDentry *use_parent) const
+{
+  if (!use_parent) {
+    use_parent = projected ? get_projected_parent_dn() : parent;
+  }
+
+  if (use_parent) {
+    use_parent->make_path_string(s, projected);
+  } else if (is_root()) {
+    s = "";
+  } else if (is_mdsdir()) {
+    char t[40];
+    uint64_t eino(ino());
+    eino -= MDS_INO_MDSDIR_OFFSET;
+    // eino is unsigned: use the unsigned conversion so the format specifier
+    // matches the argument type.
+    snprintf(t, sizeof(t), "~mds%" PRIu64, eino);
+    s = t;
+  } else {
+    char n[40];
+    uint64_t eino(ino());
+    snprintf(n, sizeof(n), "#%" PRIx64, eino);
+    s += n;
+  }
+}
+
+// Build a filepath for this inode: delegate to the (projected) parent dentry
+// when there is one, otherwise produce a path consisting of just the ino.
+void CInode::make_path(filepath& fp, bool projected) const
+{
+  const CDentry *use_parent = parent;
+  if (projected)
+    use_parent = get_projected_parent_dn();
+
+  if (!use_parent) {
+    fp = filepath(ino());
+    return;
+  }
+
+  ceph_assert(!is_base());
+  use_parent->make_path(fp, projected);
+}
+
+// Set `dname` to the stray dentry name for this inode: the ino in lowercase hex.
+void CInode::name_stray_dentry(string& dname)
+{
+  char hex[20];
+  const unsigned long long raw = (unsigned long long)ino().val;
+  snprintf(hex, sizeof(hex), "%llx", raw);
+  dname = hex;
+}
+
+// Reserve and return the version this inode will have once dirtied. With a
+// projected parent dentry the dentry allocates it; a base inode simply takes
+// projected version + 1.
+version_t CInode::pre_dirty()
+{
+  version_t pv;
+  if (CDentry *pdn = get_projected_parent_dn()) {
+    pv = pdn->pre_dirty(get_projected_version());
+    dout(10) << "pre_dirty " << pv << " (current v " << get_inode()->version << ")" << dendl;
+  } else {
+    ceph_assert(is_base());
+    pv = get_projected_version() + 1;
+  }
+
+  // force update backtrace for old format inode (see mempool_inode::decode)
+  const bool old_format = (get_inode()->backtrace_version == 0);
+  if (old_format && !projected_nodes.empty()) {
+    auto pi = _get_projected_inode();
+    if (pi->backtrace_version == 0) {
+      pi->update_backtrace(pv);
+    }
+  }
+  return pv;
+}
+
+// Set the dirty flag and pin on first use (a log segment is then mandatory),
+// and (re)attach this inode to `ls`'s dirty list when a segment is given.
+void CInode::_mark_dirty(LogSegment *ls)
+{
+  const bool newly_dirty = !state_test(STATE_DIRTY);
+  if (newly_dirty) {
+    state_set(STATE_DIRTY);
+    get(PIN_DIRTY);
+    ceph_assert(ls);
+  }
+
+  if (!ls)
+    return;
+
+  // move myself to this segment's dirty list
+  ls->dirty_inodes.push_back(&item_dirty);
+}
+
+// Mark this (auth) inode dirty in `ls` and propagate the dirtiness to the
+// parent dentry, if any.
+void CInode::mark_dirty(LogSegment *ls) {
+  dout(10) << __func__ << " " << *this << dendl;
+
+  /*
+    NOTE: I may already be dirty, but this fn _still_ needs to be called so that
+    the directory is (perhaps newly) dirtied, and so that parent_dir_version is
+    updated below.
+  */
+
+  // only auth can get dirty.  "dirty" async data in replicas is relative to
+  // filelock state, not the dirty flag.
+  ceph_assert(is_auth());
+
+  // touch my private version
+  _mark_dirty(ls);
+
+  // mark dentry too
+  if (!parent)
+    return;
+  parent->mark_dirty(get_version(), ls);
+}
+
+
+// Clear the dirty flag, drop its pin and leave the log segment's dirty list.
+// Only logs when the inode was not dirty.
+void CInode::mark_clean()
+{
+  dout(10) << __func__ << " " << *this << dendl;
+  if (!state_test(STATE_DIRTY))
+    return;
+
+  state_clear(STATE_DIRTY);
+  put(PIN_DIRTY);
+
+  // remove myself from ls dirty list
+  item_dirty.remove_myself();
+}    
+
+
+// --------------
+// per-inode storage
+// (currently for root inode only)
+
+// I/O completion for CInode::store(): forwards the result, the version that
+// was written and the caller's completion to CInode::_stored().
+struct C_IO_Inode_Stored : public CInodeIOContext {
+  version_t version;
+  Context *fin;
+
+  C_IO_Inode_Stored(CInode *i, version_t v, Context *f)
+    : CInodeIOContext(i), version(v), fin(f)
+  {}
+
+  void finish(int r) override
+  {
+    in->_stored(r, version, fin);
+  }
+
+  void print(ostream& out) const override
+  {
+    out << "inode_store(" << in->ino() << ")";
+  }
+};
+
+// Build the object name "<ino hex>.<frag as 8 hex digits><suffix>".
+// Asserts that the result fits the fixed-size local buffer.
+object_t InodeStoreBase::get_object_name(inodeno_t ino, frag_t fg, std::string_view suffix)
+{
+  char n[60];
+  const auto ino_bits = static_cast<long long unsigned>(ino);
+  const auto frag_bits = static_cast<long long unsigned>(fg);
+  snprintf(n, sizeof(n), "%llx.%08llx", ino_bits, frag_bits);
+
+  // suffix is a string_view and need not be NUL-terminated, hence the
+  // length-bounded append
+  ceph_assert(strlen(n) + suffix.size() < sizeof n);
+  strncat(n, suffix.data(), suffix.size());
+  return object_t(n);
+}
+
+/**
+ * Write this (base) inode to its own ".inode" object in the metadata pool.
+ *
+ * The payload is the on-disk magic string followed by encode_store().
+ * Completion is routed through the MDS finisher to _stored() together
+ * with the version that was current when the write was issued.
+ *
+ * @param fin completion handed on to _stored()
+ */
+void CInode::store(MDSContext *fin)
+{
+  dout(10) << __func__ << " " << get_version() << dendl;
+  ceph_assert(is_base());
+
+  if (snaprealm)
+    purge_stale_snap_data(snaprealm->get_snaps());
+
+  MDSRank *mds = mdcache->mds;
+
+  // build the payload: magic, then the encoded inode
+  bufferlist payload;
+  string magic = CEPH_FS_ONDISK_MAGIC;
+  using ceph::encode;
+  encode(magic, payload);
+  encode_store(payload, mds->mdsmap->get_up_features());
+
+  // a single full-object write
+  SnapContext snapc;
+  ObjectOperation write_op;
+  write_op.write_full(payload);
+
+  object_t oid = CInode::get_object_name(ino(), frag_t(), ".inode");
+  object_locator_t oloc(mds->get_metadata_pool());
+
+  Context *on_stored = new C_IO_Inode_Stored(this, get_version(), fin);
+  Context *newfin = new C_OnFinisher(on_stored, mds->finisher);
+  mds->objecter->mutate(oid, oloc, write_op, snapc,
+                        ceph::real_clock::now(), 0,
+                        newfin);
+}
+
+/**
+ * Completion of store().
+ *
+ * On success the inode is marked clean if the stored version still equals
+ * the projected version, and fin completes with 0.  On failure the error
+ * is logged, reported through handle_write_error() and passed to fin.
+ *
+ * @param r   result of the object write
+ * @param v   inode version that was written
+ * @param fin completion to invoke
+ */
+void CInode::_stored(int r, version_t v, Context *fin)
+{
+  if (r >= 0) {
+    dout(10) << __func__ << " " << v << " on " << *this << dendl;
+    if (v == get_projected_version())
+      mark_clean();
+
+    fin->complete(0);
+    return;
+  }
+
+  dout(1) << "store error " << r << " v " << v << " on " << *this << dendl;
+  mdcache->mds->clog->error() << "failed to store inode " << ino()
+                              << " object: " << cpp_strerror(r);
+  mdcache->mds->handle_write_error(r);
+  fin->complete(r);
+}
+
+/**
+ * Write out whatever persistent state of this inode is dirty.
+ *
+ * A dirty backtrace goes through store_backtrace().  A dirty inode is
+ * written with store() when it is a base inode, otherwise by committing
+ * the dirfrag that contains its parent dentry.  When nothing needed
+ * writing, fin completes immediately with 0.
+ *
+ * @param fin completion for the gathered writes
+ */
+void CInode::flush(MDSContext *fin)
+{
+  dout(10) << __func__ << " " << *this << dendl;
+  ceph_assert(is_auth() && can_auth_pin());
+
+  MDSGatherBuilder gather(g_ceph_context);
+
+  if (is_dirty_parent())
+    store_backtrace(gather.new_sub());
+
+  if (is_dirty()) {
+    if (!is_base())
+      parent->dir->commit(0, gather.new_sub());
+    else
+      store(gather.new_sub());
+  }
+
+  if (!gather.has_subs()) {
+    // nothing was queued
+    fin->complete(0);
+    return;
+  }
+
+  gather.set_finisher(fin);
+  gather.activate();
+}
+
+/**
+ * I/O completion for CInode::fetch(): owns the two read buffers and hands
+ * them, with the caller's completion, to CInode::_fetched().
+ */
+struct C_IO_Inode_Fetched : public CInodeIOContext {
+  bufferlist bl, bl2;  ///< bl: xattr read; bl2: ".inode" object read
+  Context *fin;        ///< caller's completion
+
+  C_IO_Inode_Fetched(CInode *inode, Context *on_finish)
+    : CInodeIOContext(inode),
+      fin(on_finish)
+  {}
+
+  void finish(int r) override {
+    // 'r' is deliberately ignored: we fetch from two places, so it is
+    // usually CEPHFS_ENOENT
+    in->_fetched(bl, bl2, fin);
+  }
+
+  void print(ostream& out) const override {
+    out << "inode_fetch(" << in->ino() << ")";
+  }
+};
+
+/**
+ * Read this inode back from the metadata pool.
+ *
+ * Two reads are issued and gathered: the "inode" xattr of the dirfrag
+ * object (old on-disk format) and the contents of the ".inode" object
+ * (current format).  Both results are delivered to _fetched() through the
+ * MDS finisher.
+ *
+ * @param fin completion handed on to _fetched()
+ */
+void CInode::fetch(MDSContext *fin)
+{
+  dout(10) << __func__  << dendl;
+
+  MDSRank *mds = mdcache->mds;
+
+  C_IO_Inode_Fetched *fetched = new C_IO_Inode_Fetched(this, fin);
+  C_GatherBuilder gather(g_ceph_context, new C_OnFinisher(fetched, mds->finisher));
+
+  object_t dirfrag_oid = CInode::get_object_name(ino(), frag_t(), "");
+  object_locator_t oloc(mds->get_metadata_pool());
+
+  // Old on-disk format: inode stored in xattr of a dirfrag
+  ObjectOperation rd;
+  rd.getxattr("inode", &fetched->bl, nullptr);
+  mds->objecter->read(dirfrag_oid, oloc, rd, CEPH_NOSNAP,
+                      (bufferlist*)nullptr, 0, gather.new_sub());
+
+  // Current on-disk format: inode stored in a .inode object
+  object_t inode_oid = CInode::get_object_name(ino(), frag_t(), ".inode");
+  mds->objecter->read(inode_oid, oloc, 0, 0, CEPH_NOSNAP,
+                      &fetched->bl2, 0, gather.new_sub());
+
+  gather.activate();
+}
+
+/**
+ * Completion of fetch(): choose a payload, check the magic, decode.
+ *
+ * bl2 (the ".inode" object) is preferred over bl (the xattr) when both
+ * hold data.  fin completes with -CEPHFS_ENOENT when neither has data,
+ * -CEPHFS_EINVAL on a magic mismatch or decode error, and 0 on success.
+ *
+ * @param bl  data read from the dirfrag xattr
+ * @param bl2 data read from the ".inode" object
+ * @param fin completion to invoke
+ */
+void CInode::_fetched(bufferlist& bl, bufferlist& bl2, Context *fin)
+{
+  dout(10) << __func__ << " got " << bl.length() << " and " << bl2.length() << dendl;
+
+  bufferlist *src = nullptr;
+  if (bl2.length())
+    src = &bl2;
+  else if (bl.length())
+    src = &bl;
+
+  if (!src) {
+    derr << "No data while reading inode " << ino() << dendl;
+    fin->complete(-CEPHFS_ENOENT);
+    return;
+  }
+
+  bufferlist::const_iterator p = src->cbegin();
+
+  using ceph::decode;
+  // Attempt decode
+  try {
+    string magic;
+    decode(magic, p);
+    dout(10) << " magic is '" << magic << "' (expecting '"
+             << CEPH_FS_ONDISK_MAGIC << "')" << dendl;
+    if (magic != CEPH_FS_ONDISK_MAGIC) {
+      dout(0) << "on disk magic '" << magic << "' != my magic '" << CEPH_FS_ONDISK_MAGIC
+              << "'" << dendl;
+      fin->complete(-CEPHFS_EINVAL);
+      return;
+    }
+
+    decode_store(p);
+    dout(10) << "_fetched " << *this << dendl;
+    fin->complete(0);
+  } catch (buffer::error &err) {
+    derr << "Corrupt inode " << ino() << ": " << err.what() << dendl;
+    fin->complete(-CEPHFS_EINVAL);
+  }
+}
+
+/**
+ * Fill bt with this inode's backtrace.
+ *
+ * Ancestors are listed from the immediate parent upward; each entry holds
+ * the containing directory's ino, the dentry name and the version of the
+ * child inode at that level.  old_pools is copied from the inode, leaving
+ * out the target pool itself.
+ *
+ * @param pool pool id recorded in the backtrace
+ * @param bt   backtrace to populate (ino, pool, ancestors and old_pools)
+ */
+void CInode::build_backtrace(int64_t pool, inode_backtrace_t& bt)
+{
+  bt.ino = ino();
+  bt.ancestors.clear();
+  bt.pool = pool;
+
+  // walk parent dentries until an inode has none
+  CInode *in = this;
+  for (CDentry *pdn = get_parent_dn(); pdn; pdn = in->get_parent_dn()) {
+    CInode *diri = pdn->get_dir()->get_inode();
+    bt.ancestors.push_back(inode_backpointer_t(diri->ino(), pdn->get_name(),
+                                               in->get_inode()->version));
+    in = diri;
+  }
+
+  const auto& pools = get_inode()->old_pools;
+  bt.old_pools.reserve(pools.size());
+  for (auto &old_pool : pools) {
+    // don't add our own pool id to old_pools to avoid looping (e.g. setlayout 0, 1, 0)
+    if (old_pool == pool)
+      continue;
+    bt.old_pools.push_back(old_pool);
+  }
+}
+
+/**
+ * I/O completion for backtrace writes: forwards the result, the backtrace
+ * version that was written and the caller's completion to
+ * CInode::_stored_backtrace().
+ */
+struct C_IO_Inode_StoredBacktrace : public CInodeIOContext {
+  version_t version;  ///< backtrace version captured at submit time
+  Context *fin;       ///< caller's completion
+
+  C_IO_Inode_StoredBacktrace(CInode *inode, version_t bt_version, Context *on_finish)
+    : CInodeIOContext(inode),
+      version(bt_version),
+      fin(on_finish)
+  {}
+
+  void finish(int r) override {
+    in->_stored_backtrace(r, version, fin);
+  }
+
+  void print(ostream& out) const override {
+    out << "backtrace_store(" << in->ino() << ")";
+  }
+};
+
+
+/**
+ * Submit one object mutation per prepared commit operation.
+ *
+ * Each operation fills an ObjectOperation from the backtrace and is sent
+ * to this inode's first object in the operation's pool.  A negative r is
+ * reported through handle_write_error_with_lock() and nothing is queued.
+ *
+ * @param r          result of the preceding step
+ * @param gather_bld gather that receives one sub per mutation
+ * @param ops_vec    operations to submit
+ * @param bt         backtrace passed to each operation's update()
+ */
+void CInode::_commit_ops(int r, C_GatherBuilder &gather_bld,
+                         std::vector<CInodeCommitOperation> &ops_vec,
+                         inode_backtrace_t &bt)
+{
+  dout(10) << __func__ << dendl;
+
+  if (r < 0) {
+    mdcache->mds->handle_write_error_with_lock(r);
+    return;
+  }
+
+  SnapContext snapc;
+  object_t target = get_object_name(ino(), frag_t(), "");
+
+  for (auto &commit_op : ops_vec) {
+    ObjectOperation obj_op;
+    object_locator_t oloc(commit_op.get_pool());
+    commit_op.update(obj_op, bt);
+    mdcache->mds->objecter->mutate(target, oloc, obj_op, snapc,
+                                   ceph::real_clock::now(),
+                                   0, gather_bld.new_sub());
+  }
+}
+
+/**
+ * Prepare the operations needed to persist this inode's backtrace.
+ *
+ * Takes an auth pin, builds the backtrace for the current backtrace pool
+ * and queues one operation for that pool.  When STATE_DIRTYPOOL is set and
+ * the inode has old pools, one further operation is queued per old pool
+ * (other than the current one) so that readers of the old backtraces see
+ * the new pool id in inode_backtrace_t::pool and go read everything else
+ * from there.
+ *
+ * @param ops_vec receives the prepared operations
+ * @param bt      receives the built backtrace
+ * @param op_prio priority; negative selects CEPH_MSG_PRIO_DEFAULT
+ */
+void CInode::_store_backtrace(std::vector<CInodeCommitOperation> &ops_vec,
+                              inode_backtrace_t &bt, int op_prio)
+{
+  dout(10) << __func__ << " on " << *this << dendl;
+  ceph_assert(is_dirty_parent());
+
+  if (op_prio < 0)
+    op_prio = CEPH_MSG_PRIO_DEFAULT;
+
+  auth_pin(this);
+
+  const int64_t pool = get_backtrace_pool();
+  build_backtrace(pool, bt);
+
+  // the authoritative copy in the current pool
+  ops_vec.emplace_back(op_prio, pool, get_inode()->layout,
+                       mdcache->mds->mdsmap->get_up_features());
+
+  const bool touch_old_pools =
+    state_test(STATE_DIRTYPOOL) && !get_inode()->old_pools.empty();
+  if (!touch_old_pools) {
+    dout(20) << __func__ << ": no dirtypool or no old pools" << dendl;
+    return;
+  }
+
+  // redirect entries in every other pool we used to live in
+  for (const auto &old_pool : get_inode()->old_pools) {
+    if (old_pool != pool) {
+      dout(20) << __func__ << ": updating old pool " << old_pool << dendl;
+      ops_vec.emplace_back(op_prio, old_pool);
+    }
+  }
+}
+
+/**
+ * Persist this inode's backtrace right away.
+ *
+ * The backtrace version is sampled before the operations are prepared;
+ * all resulting mutations are gathered and their completion is delivered
+ * to _stored_backtrace() through the MDS finisher.
+ *
+ * @param fin     completion handed on to _stored_backtrace()
+ * @param op_prio priority passed to _store_backtrace()
+ */
+void CInode::store_backtrace(MDSContext *fin, int op_prio)
+{
+  auto version = get_inode()->backtrace_version;
+
+  std::vector<CInodeCommitOperation> ops_vec;
+  inode_backtrace_t bt;
+  _store_backtrace(ops_vec, bt, op_prio);
+
+  Context *on_stored = new C_IO_Inode_StoredBacktrace(this, version, fin);
+  C_GatherBuilder gather(g_ceph_context,
+                         new C_OnFinisher(on_stored, mdcache->mds->finisher));
+
+  _commit_ops(0, gather, ops_vec, bt);
+  ceph_assert(gather.has_subs());
+  gather.activate();
+}
+
+/**
+ * Prepare, but do not submit, the backtrace operations for this inode.
+ *
+ * @param op      filled with the inode, its current backtrace version,
+ *                the prepared operations and the built backtrace
+ * @param op_prio priority passed to _store_backtrace()
+ */
+void CInode::store_backtrace(CInodeCommitOperations &op, int op_prio)
+{
+  op.in = this;
+  op.version = get_inode()->backtrace_version;
+
+  _store_backtrace(op.ops_vec, op.bt, op_prio);
+}
+
+/**
+ * Completion of a backtrace write.
+ *
+ * -CEPHFS_ENOENT is treated as success when the backtrace pool no longer
+ * exists in the OSD map.  Any other error is logged, reported through
+ * handle_write_error() and passed to fin.  On success the auth pin is
+ * dropped and dirty-parent state is cleared if the stored version is
+ * still current.
+ *
+ * NOTE(review): the error path returns before auth_unpin(this), although
+ * _store_backtrace() took an auth pin - confirm that is intended.
+ *
+ * @param r   result of the gathered writes
+ * @param v   backtrace version that was written
+ * @param fin optional completion
+ */
+void CInode::_stored_backtrace(int r, version_t v, Context *fin)
+{
+  if (r == -CEPHFS_ENOENT) {
+    const int64_t pool = get_backtrace_pool();
+    const bool pool_exists = mdcache->mds->objecter->with_osdmap(
+        [pool](const OSDMap &osd_map) {
+          return osd_map.have_pg_pool(pool);
+        });
+
+    // This CEPHFS_ENOENT is because the pool doesn't exist (the user deleted it
+    // out from under us), so the backtrace can never be written, so pretend
+    // to succeed so that the user can proceed to e.g. delete the file.
+    if (!pool_exists) {
+      dout(4) << __func__ << " got CEPHFS_ENOENT: a data pool was deleted "
+                 "beneath us!" << dendl;
+      r = 0;
+    }
+  }
+
+  if (r >= 0) {
+    dout(10) << __func__ << " v " << v <<  dendl;
+
+    auth_unpin(this);
+    if (v == get_inode()->backtrace_version)
+      clear_dirty_parent();
+    if (fin)
+      fin->complete(0);
+    return;
+  }
+
+  dout(1) << "store backtrace error " << r << " v " << v << dendl;
+  mdcache->mds->clog->error() << "failed to store backtrace on ino "
+                              << ino() << " object"
+                              << ", pool " << get_backtrace_pool()
+                              << ", errno " << r;
+  mdcache->mds->handle_write_error(r);
+  if (fin)
+    fin->complete(r);
+}
+
+/**
+ * Ask the cache to read this inode's backtrace from its backtrace pool.
+ *
+ * @param fin       completion passed through to MDCache::fetch_backtrace()
+ * @param backtrace buffer that receives the raw backtrace; must be non-null
+ */
+void CInode::fetch_backtrace(Context *fin, bufferlist *backtrace)
+{
+  bufferlist &out = *backtrace;
+  mdcache->fetch_backtrace(ino(), get_backtrace_pool(), out, fin);
+}
+
+/**
+ * Flag the backtrace as needing a rewrite.
+ *
+ * On the first transition PIN_DIRTYPARENT is taken and a segment is
+ * required.  dirty_pool additionally sets STATE_DIRTYPOOL.
+ *
+ * @param ls         segment whose dirty_parent_inodes list should hold us;
+ *                   may be null only when already dirty-parent
+ * @param dirty_pool also set STATE_DIRTYPOOL
+ */
+void CInode::mark_dirty_parent(LogSegment *ls, bool dirty_pool)
+{
+  const bool first_time = !state_test(STATE_DIRTYPARENT);
+  if (first_time) {
+    dout(10) << __func__ << dendl;
+    state_set(STATE_DIRTYPARENT);
+    get(PIN_DIRTYPARENT);
+    ceph_assert(ls);
+  }
+
+  if (dirty_pool)
+    state_set(STATE_DIRTYPOOL);
+
+  if (!ls)
+    return;
+  ls->dirty_parent_inodes.push_back(&item_dirty_parent);
+}
+
+/**
+ * Undo mark_dirty_parent(): clear STATE_DIRTYPARENT and STATE_DIRTYPOOL,
+ * drop PIN_DIRTYPARENT and unlink from the segment list.  Does nothing
+ * when the inode is not dirty-parent.
+ */
+void CInode::clear_dirty_parent()
+{
+  if (!state_test(STATE_DIRTYPARENT))
+    return;
+
+  dout(10) << __func__ << dendl;
+  state_clear(STATE_DIRTYPARENT);
+  state_clear(STATE_DIRTYPOOL);
+  put(PIN_DIRTYPARENT);
+  item_dirty_parent.remove_myself();
+}
+
+/**
+ * Check a fetched backtrace against this inode's current parent dentry.
+ *
+ * Skipped for base inodes, inodes that are already dirty-parent and
+ * non-auth inodes.  A read error, or a backtrace whose first ancestor
+ * does not match the parent dentry's name and directory, is reported to
+ * the cluster log; the inode is then marked dirty-parent on the current
+ * log segment and the log is flushed.  The assertion fires when
+ * mds_verify_backtrace is greater than 1.
+ *
+ * @param bl  encoded backtrace as read (only decoded when err is 0)
+ * @param err result of the read that produced bl
+ */
+void CInode::verify_diri_backtrace(bufferlist &bl, int err)
+{
+  if (is_base() || is_dirty_parent() || !is_auth())
+    return;
+
+  dout(10) << __func__ << dendl;
+
+  if (!err) {
+    using ceph::decode;
+    inode_backtrace_t backtrace;
+    decode(backtrace, bl);
+
+    CDentry *pdn = get_parent_dn();
+    const bool mismatch =
+      backtrace.ancestors.empty() ||
+      backtrace.ancestors[0].dname != pdn->get_name() ||
+      backtrace.ancestors[0].dirino != pdn->get_dir()->ino();
+    if (mismatch)
+      err = -CEPHFS_EINVAL;
+  }
+
+  if (!err)
+    return;
+
+  MDSRank *mds = mdcache->mds;
+  mds->clog->error() << "bad backtrace on directory inode " << ino();
+  ceph_assert(!"bad backtrace" == (g_conf()->mds_verify_backtrace > 1));
+
+  // schedule a rewrite of the backtrace
+  mark_dirty_parent(mds->mdlog->get_current_segment(), false);
+  mds->mdlog->flush();
+}
+
+// ------------------
+// parent dir
+
+
+/**
+ * Encode the xattr map into bl; when no map is allocated a 32-bit zero is
+ * written in its place.
+ */
+void InodeStoreBase::encode_xattrs(bufferlist &bl) const {
+  using ceph::encode;
+
+  if (!xattrs) {
+    encode((__u32)0, bl);
+    return;
+  }
+  encode(*xattrs, bl);
+}
+
+/**
+ * Decode an xattr map from p and install it.  An empty map is stored as a
+ * default-constructed (null) xattr_map_ptr rather than an allocated map.
+ */
+void InodeStoreBase::decode_xattrs(bufferlist::const_iterator &p) {
+  using ceph::decode;
+
+  mempool_xattr_map decoded;
+  decode_noshare(decoded, p);
+
+  if (!decoded.empty()) {
+    reset_xattrs(allocate_xattr_map(std::move(decoded)));
+  } else {
+    reset_xattrs(xattr_map_ptr());
+  }
+}
+
+/**
+ * Encode the old-inode map into bl using the given feature bits; when no
+ * map is allocated a 32-bit zero is written in its place.
+ */
+void InodeStoreBase::encode_old_inodes(bufferlist &bl, uint64_t features) const {
+  using ceph::encode;
+
+  if (!old_inodes) {
+    encode((__u32)0, bl);
+    return;
+  }
+  encode(*old_inodes, bl, features);
+}
+
+/**
+ * Decode an old-inode map from p and install it.  An empty map is stored
+ * as a default-constructed (null) old_inode_map_ptr.
+ */
+void InodeStoreBase::decode_old_inodes(bufferlist::const_iterator &p) {
+  using ceph::decode;
+
+  mempool_old_inode_map decoded;
+  decode(decoded, p);
+
+  if (!decoded.empty()) {
+    reset_old_inodes(allocate_old_inode_map(std::move(decoded)));
+  } else {
+    reset_old_inodes(old_inode_map_ptr());
+  }
+}
+
+/**
+ * Encode the inode-store payload without a version envelope.
+ *
+ * Order: inode, symlink target (symlinks only), dirfragtree, xattrs,
+ * snap blob (an empty bufferlist when snap_blob is null), old inodes,
+ * oldest_snap, damage_flags.
+ *
+ * @param bl        output buffer
+ * @param features  feature bits used for the inode and old-inode encoding
+ * @param snap_blob optional pre-encoded snap data
+ */
+void InodeStoreBase::encode_bare(bufferlist &bl, uint64_t features,
+                                 const bufferlist *snap_blob) const
+{
+  using ceph::encode;
+
+  encode(*inode, bl, features);
+  if (inode->is_symlink())
+    encode(symlink, bl);
+  encode(dirfragtree, bl);
+  encode_xattrs(bl);
+
+  if (!snap_blob)
+    encode(bufferlist(), bl);
+  else
+    encode(*snap_blob, bl);
+
+  encode_old_inodes(bl, features);
+  encode(oldest_snap, bl);
+  encode(damage_flags, bl);
+}
+
+/**
+ * Encode the inode store inside a version envelope (version 6, compat 4)
+ * whose body is produced by encode_bare().
+ *
+ * @param bl        output buffer
+ * @param features  feature bits forwarded to encode_bare()
+ * @param snap_blob optional pre-encoded snap data forwarded to encode_bare()
+ */
+void InodeStoreBase::encode(bufferlist &bl, uint64_t features,
+                            const bufferlist *snap_blob) const
+{
+  ENCODE_START(6, 4, bl);
+
+  encode_bare(bl, features, snap_blob);
+
+  ENCODE_FINISH(bl);
+}
+
+/**
+ * Encode this inode in its on-disk store format, including the snap blob
+ * produced by encode_snap_blob().
+ *
+ * NOTE(review): the `features` argument is not consulted; the feature bits
+ * are taken from the MDS map's up-features instead - confirm intended.
+ *
+ * @param bl       output buffer
+ * @param features currently unused (see note)
+ */
+void CInode::encode_store(bufferlist& bl, uint64_t features)
+{
+  bufferlist snap_blob;
+  encode_snap_blob(snap_blob);
+
+  const auto up_features = mdcache->mds->mdsmap->get_up_features();
+  InodeStoreBase::encode(bl, up_features, &snap_blob);
+}
+
+void InodeStoreBase::decode_bare(bufferlist::const_iterator &bl,
+			      bufferlist& snap_blob, __u8 struct_v)
+{ /* Decode the inode-store payload (no version envelope) from `bl`.
+   *
+   * Fields are read in this order: inode, symlink target (symlinks only),
+   * dirfragtree, xattrs, snap blob (returned through `snap_blob`), old
+   * inodes, then fields that depend on `struct_v`.  The inode is decoded
+   * into a freshly allocated object that is only installed by
+   * reset_inode() as the last step.
+   */
+  using ceph::decode;
+
+  auto _inode = allocate_inode();
+  decode(*_inode, bl);
+
+  if (_inode->is_symlink()) {
+    std::string tmp;
+    decode(tmp, bl);
+    symlink = std::string_view(tmp);
+  }
+  decode(dirfragtree, bl);
+  decode_xattrs(bl);
+  decode(snap_blob, bl);
+
+  decode_old_inodes(bl);
+  if (struct_v == 2 && _inode->is_dir()) { // version-2 directories may carry a default layout
+    bool default_layout_exists;
+    decode(default_layout_exists, bl);
+    if (default_layout_exists) {
+      decode(struct_v, bl); // this was a default_file_layout; NOTE(review): overwrites the struct_v parameter tested below - confirm intended
+      decode(_inode->layout, bl); // but we only care about the layout portion
+    }
+  }
+
+  if (struct_v >= 5) {
+    // InodeStore is embedded in dentries without proper versioning, so
+    // we consume up to the end of the buffer
+    if (!bl.end()) {
+      decode(oldest_snap, bl);
+    }
+
+    if (!bl.end()) {
+      decode(damage_flags, bl);
+    }
+  }
+
+  reset_inode(std::move(_inode)); // publish the fully decoded inode
+}
+
+
+/**
+ * Decode an inode store written by encode(): open the (legacy-compatible)
+ * version envelope and pass its struct_v on to decode_bare().
+ *
+ * @param bl        input position
+ * @param snap_blob receives the raw snap blob
+ */
+void InodeStoreBase::decode(bufferlist::const_iterator &bl, bufferlist& snap_blob)
+{
+  DECODE_START_LEGACY_COMPAT_LEN(5, 4, 4, bl);
+
+  decode_bare(bl, snap_blob, struct_v);
+
+  DECODE_FINISH(bl);
+}
+
+/**
+ * Decode this inode from its on-disk store format, then decode the snap
+ * blob that came with it.
+ *
+ * @param bl input position
+ */
+void CInode::decode_store(bufferlist::const_iterator& bl)
+{
+  bufferlist raw_snap;
+
+  InodeStoreBase::decode(bl, raw_snap);
+  decode_snap_blob(raw_snap);
+}
+
+// ------------------
+// locking
+
+/**
+ * Map a CEPH_LOCK_I* lock type to the matching lock member of this inode.
+ *
+ * @param type lock type constant
+ * @return pointer to the lock, or null for an unrecognised type
+ */
+SimpleLock* CInode::get_lock(int type)
+{
+  SimpleLock *lock = nullptr;
+
+  switch (type) {
+  case CEPH_LOCK_IVERSION: lock = &versionlock;     break;
+  case CEPH_LOCK_IFILE:    lock = &filelock;        break;
+  case CEPH_LOCK_IAUTH:    lock = &authlock;        break;
+  case CEPH_LOCK_ILINK:    lock = &linklock;        break;
+  case CEPH_LOCK_IDFT:     lock = &dirfragtreelock; break;
+  case CEPH_LOCK_IXATTR:   lock = &xattrlock;       break;
+  case CEPH_LOCK_ISNAP:    lock = &snaplock;        break;
+  case CEPH_LOCK_INEST:    lock = &nestlock;        break;
+  case CEPH_LOCK_IFLOCK:   lock = &flocklock;       break;
+  case CEPH_LOCK_IPOLICY:  lock = &policylock;      break;
+  }
+
+  return lock;
+}
+
+/**
+ * Fill info with this inode's identity: its ino and `last` as the snapid.
+ */
+void CInode::set_object_info(MDSCacheObjectInfo &info)
+{
+  info.snapid = last;
+  info.ino = ino();
+}
+
+/**
+ * Encode the authlock state: inode version, ctime, mode, uid and gid.
+ *
+ * @param bl output buffer
+ */
+void CInode::encode_lock_iauth(bufferlist& bl)
+{
+  const auto& pi = get_inode();
+
+  ENCODE_START(1, 1, bl);
+  encode(pi->version, bl);
+  encode(pi->ctime, bl);
+  encode(pi->mode, bl);
+  encode(pi->uid, bl);
+  encode(pi->gid, bl);
+  ENCODE_FINISH(bl);
+}
+
+/**
+ * Decode authlock state on a replica into a copy of the inode and install
+ * it.  ctime only ever moves forward; version, mode, uid and gid are
+ * taken as sent.
+ *
+ * @param p input position
+ */
+void CInode::decode_lock_iauth(bufferlist::const_iterator& p)
+{
+  ceph_assert(!is_auth());
+
+  auto updated = allocate_inode(*get_inode());
+
+  DECODE_START(1, p);
+  decode(updated->version, p);
+
+  utime_t new_ctime;
+  decode(new_ctime, p);
+  if (updated->ctime < new_ctime)
+    updated->ctime = new_ctime;
+
+  decode(updated->mode, p);
+  decode(updated->uid, p);
+  decode(updated->gid, p);
+  DECODE_FINISH(p);
+
+  reset_inode(std::move(updated));
+}
+
+/**
+ * Encode the linklock state: inode version, ctime and nlink.
+ *
+ * @param bl output buffer
+ */
+void CInode::encode_lock_ilink(bufferlist& bl)
+{
+  const auto& pi = get_inode();
+
+  ENCODE_START(1, 1, bl);
+  encode(pi->version, bl);
+  encode(pi->ctime, bl);
+  encode(pi->nlink, bl);
+  ENCODE_FINISH(bl);
+}
+
+/**
+ * Decode linklock state on a replica into a copy of the inode and install
+ * it.  ctime only ever moves forward; version and nlink are taken as sent.
+ *
+ * @param p input position
+ */
+void CInode::decode_lock_ilink(bufferlist::const_iterator& p)
+{
+  ceph_assert(!is_auth());
+
+  auto updated = allocate_inode(*get_inode());
+
+  DECODE_START(1, p);
+  decode(updated->version, p);
+
+  utime_t new_ctime;
+  decode(new_ctime, p);
+  if (updated->ctime < new_ctime)
+    updated->ctime = new_ctime;
+
+  decode(updated->nlink, p);
+  DECODE_FINISH(p);
+
+  reset_inode(std::move(updated));
+}
+
+/**
+ * Encode the dirfragtree lock state.
+ *
+ * The auth copy sends the inode version; a replica sends whether its
+ * dirfragtreelock is dirty or flushing.  Both then send the raw fragtree
+ * and the set of frags whose dirfrag is auth on this node.
+ *
+ * @param bl output buffer
+ */
+void CInode::encode_lock_idft(bufferlist& bl)
+{
+  ENCODE_START(1, 1, bl);
+
+  if (!is_auth()) {
+    // treat flushing as dirty when rejoining cache
+    bool dirty = dirfragtreelock.is_dirty_or_flushing();
+    encode(dirty, bl);
+  } else {
+    encode(get_inode()->version, bl);
+  }
+
+  // encode the raw tree
+  encode(dirfragtree, bl);
+
+  // also specify which frags are mine
+  set<frag_t> myfrags;
+  auto&& open_dirs = get_dirfrags();
+  for (const auto& dir : open_dirs) {
+    if (!dir->is_auth())
+      continue;
+    myfrags.insert(dir->get_frag());
+  }
+  encode(myfrags, bl);
+
+  ENCODE_FINISH(bl);
+}
+
+void CInode::decode_lock_idft(bufferlist::const_iterator& p)
+{ /* Decode dirfragtree lock state produced by encode_lock_idft().
+   *
+   * As auth: read the sender's dirty flag, then force every frag the
+   * sender lists as its own to be a leaf of our tree, marking the lock
+   * dirty when the flag is set or a frag had to be forced.
+   * As replica: read the inode version into a copy of the inode, adopt
+   * the sender's tree, keep all open dirfrags as leaves and clear
+   * STATE_DIRTYDFT on dirfrags that are auth here.
+   * The copied inode, if one was made, is installed after DECODE_FINISH.
+   */
+  inode_ptr _inode; // stays null on the auth side
+
+  DECODE_START(1, p);
+  if (is_auth()) {
+    bool replica_dirty;
+    decode(replica_dirty, p);
+    if (replica_dirty) {
+      dout(10) << __func__ << " setting dftlock dirty flag" << dendl;
+      dirfragtreelock.mark_dirty();  // ok bc we're auth and caller will handle
+    }
+  } else {
+    _inode = allocate_inode(*get_inode());
+    decode(_inode->version, p);
+  }
+  {
+    fragtree_t temp; // the sender's tree
+    decode(temp, p);
+    set<frag_t> authfrags; // frags the sender is auth for
+    decode(authfrags, p);
+    if (is_auth()) {
+      // auth.  believe replica's auth frags only.
+      for (auto fg : authfrags) {
+        if (!dirfragtree.is_leaf(fg)) {
+          dout(10) << " forcing frag " << fg << " to leaf (split|merge)" << dendl;
+          dirfragtree.force_to_leaf(g_ceph_context, fg);
+          dirfragtreelock.mark_dirty();  // ok bc we're auth and caller will handle
+        }
+      }
+    } else {
+      // replica.  take the new tree, BUT make sure any open
+      //  dirfrags remain leaves (they may have split _after_ this
+      //  dft was scattered, or we may still be waiting on the
+      //  notify from the auth)
+      dirfragtree.swap(temp);
+      for (const auto &p : dirfrags) { // note: this `p` shadows the iterator parameter inside the loop
+        if (!dirfragtree.is_leaf(p.first)) {
+          dout(10) << " forcing open dirfrag " << p.first << " to leaf (racing with split|merge)" << dendl;
+          dirfragtree.force_to_leaf(g_ceph_context, p.first);
+        }
+	if (p.second->is_auth())
+	  p.second->state_clear(CDir::STATE_DIRTYDFT);
+      }
+    }
+    if (g_conf()->mds_debug_frag)
+      verify_dirfrags();
+  }
+  DECODE_FINISH(p);
+
+  if (_inode) // replica only: publish the updated version
+    reset_inode(std::move(_inode));
+}
+
+/**
+ * Encode the filelock state.
+ *
+ * The auth copy sends version, ctime, mtime, atime and time_warp_seq,
+ * plus (for non-directories) layout, size, truncate_seq, truncate_size,
+ * client_ranges and inline_data.  A replica sends only whether its
+ * filelock is dirty or flushing.  Both then send the inode dirstat and,
+ * for every dirfrag that is auth here (or every open dirfrag when the
+ * inode is auth), the frag, its `first` snapid and the projected
+ * fragstat / accounted_fragstat, preceded by the entry count.
+ *
+ * @param bl output buffer
+ */
+void CInode::encode_lock_ifile(bufferlist& bl)
+{
+  const auto& pi = get_inode();
+  const bool auth = is_auth();
+
+  ENCODE_START(1, 1, bl);
+  if (!auth) {
+    // treat flushing as dirty when rejoining cache
+    bool dirty = filelock.is_dirty_or_flushing();
+    encode(dirty, bl);
+  } else {
+    encode(pi->version, bl);
+    encode(pi->ctime, bl);
+    encode(pi->mtime, bl);
+    encode(pi->atime, bl);
+    encode(pi->time_warp_seq, bl);
+    if (!is_dir()) {
+      encode(pi->layout, bl, mdcache->mds->mdsmap->get_up_features());
+      encode(pi->size, bl);
+      encode(pi->truncate_seq, bl);
+      encode(pi->truncate_size, bl);
+      encode(pi->client_ranges, bl);
+      encode(pi->inline_data, bl);
+    }
+  }
+
+  dout(15) << __func__ << " inode.dirstat is " << pi->dirstat << dendl;
+  encode(pi->dirstat, bl);  // only meaningful if i am auth.
+
+  // per-dirfrag entries are staged so the count can be written first
+  bufferlist frag_bl;
+  __u32 nfrags = 0;
+  for (const auto &entry : dirfrags) {
+    frag_t fg = entry.first;
+    CDir *dir = entry.second;
+    if (!auth && !dir->is_auth())
+      continue;
+
+    const auto& pf = dir->get_projected_fnode();
+    dout(15) << fg << " " << *dir << dendl;
+    dout(20) << fg << "           fragstat " << pf->fragstat << dendl;
+    dout(20) << fg << " accounted_fragstat " << pf->accounted_fragstat << dendl;
+    encode(fg, frag_bl);
+    encode(dir->first, frag_bl);
+    encode(pf->fragstat, frag_bl);
+    encode(pf->accounted_fragstat, frag_bl);
+    ++nfrags;
+  }
+  encode(nfrags, bl);
+  bl.claim_append(frag_bl);
+  ENCODE_FINISH(bl);
+}
+
+void CInode::decode_lock_ifile(bufferlist::const_iterator& p)
+{ /* Decode filelock state produced by encode_lock_ifile().
+   *
+   * As replica: read the inode-level fields into a copy of the inode
+   * (ctime only moves forward) and take the sender's dirstat.
+   * As auth: read the sender's dirty flag and mark the filelock dirty
+   * when it is set.
+   * Then process the per-dirfrag entries: the auth side overwrites each
+   * dirfrag's first/fragstat/accounted_fragstat, the replica side only
+   * updates dirfrags that are auth here and runs finish_scatter_update().
+   * The copied inode, if one was made, is installed after DECODE_FINISH.
+   */
+  inode_ptr _inode; // stays null on the auth side
+
+  DECODE_START(1, p);
+  if (!is_auth()) {
+    _inode = allocate_inode(*get_inode());
+
+    decode(_inode->version, p);
+    utime_t tm;
+    decode(tm, p);
+    if (_inode->ctime < tm) _inode->ctime = tm; // never move ctime backwards
+    decode(_inode->mtime, p);
+    decode(_inode->atime, p);
+    decode(_inode->time_warp_seq, p);
+    if (!is_dir()) {
+      decode(_inode->layout, p);
+      decode(_inode->size, p);
+      decode(_inode->truncate_seq, p);
+      decode(_inode->truncate_size, p);
+      decode(_inode->client_ranges, p);
+      decode(_inode->inline_data, p);
+    }
+  } else {
+    bool replica_dirty;
+    decode(replica_dirty, p);
+    if (replica_dirty) {
+      dout(10) << __func__ << " setting filelock dirty flag" << dendl;
+      filelock.mark_dirty();  // ok bc we're auth and caller will handle
+    }
+  }
+ 
+  frag_info_t dirstat; // always decoded; only used on the replica side
+  decode(dirstat, p);
+  if (!is_auth()) {
+    dout(10) << " taking inode dirstat " << dirstat << " for " << *this << dendl;
+    _inode->dirstat = dirstat;    // take inode summation if replica
+  }
+  __u32 n; // number of per-dirfrag entries that follow
+  decode(n, p);
+  dout(10) << " ...got " << n << " fragstats on " << *this << dendl;
+  while (n--) {
+    frag_t fg;
+    snapid_t fgfirst;
+    frag_info_t fragstat;
+    frag_info_t accounted_fragstat;
+    decode(fg, p);
+    decode(fgfirst, p);
+    decode(fragstat, p);
+    decode(accounted_fragstat, p);
+    dout(10) << fg << " [" << fgfirst << ",head] " << dendl;
+    dout(10) << fg << "           fragstat " << fragstat << dendl;
+    dout(20) << fg << " accounted_fragstat " << accounted_fragstat << dendl;
+
+    CDir *dir = get_dirfrag(fg);
+    if (is_auth()) {
+      ceph_assert(dir);                // i am auth; i had better have this dir open
+      dout(10) << fg << " first " << dir->first << " -> " << fgfirst
+               << " on " << *dir << dendl;
+      dir->first = fgfirst;
+      auto _fnode = CDir::allocate_fnode(*dir->get_fnode()); // copy, modify, then swap in
+      _fnode->fragstat = fragstat;
+      _fnode->accounted_fragstat = accounted_fragstat;
+      dir->reset_fnode(std::move(_fnode));
+      if (!(fragstat == accounted_fragstat)) {
+        dout(10) << fg << " setting filelock updated flag" << dendl;
+        filelock.mark_dirty();  // ok bc we're auth and caller will handle
+      }
+    } else {
+      if (dir && dir->is_auth()) { // replica inode, but this dirfrag is auth here
+        dout(10) << fg << " first " << dir->first << " -> " << fgfirst
+                 << " on " << *dir << dendl;
+        dir->first = fgfirst;
+        const auto& pf = dir->get_projected_fnode();
+        finish_scatter_update(&filelock, dir,
+                              _inode->dirstat.version, pf->accounted_fragstat.version);
+      }
+    }
+  }
+  DECODE_FINISH(p);
+
+  if (_inode) // replica only: publish the updated inode
+    reset_inode(std::move(_inode));
+}
+
+/**
+ * Encode the nestlock state.
+ *
+ * The auth copy sends the inode version; a replica sends whether its
+ * nestlock is dirty or flushing.  Both then send the inode rstat and, for
+ * every dirfrag that is auth here (or every open dirfrag when the inode
+ * is auth), the frag, its `first` snapid, the projected rstat and
+ * accounted_rstat and the dirfrag's dirty_old_rstat, preceded by the
+ * entry count.
+ *
+ * @param bl output buffer
+ */
+void CInode::encode_lock_inest(bufferlist& bl)
+{
+  ENCODE_START(1, 1, bl);
+  if (is_auth()) {
+    encode(get_inode()->version, bl);
+  } else {
+    // treat flushing as dirty when rejoining cache
+    bool dirty = nestlock.is_dirty_or_flushing();
+    encode(dirty, bl);
+  }
+  dout(15) << __func__ << " inode.rstat is " << get_inode()->rstat << dendl;
+  encode(get_inode()->rstat, bl);  // only meaningful if i am auth.
+
+  // per-dirfrag entries are staged so the count can be written first
+  bufferlist tmp;
+  __u32 n = 0;
+  for (const auto &p : dirfrags) {
+    frag_t fg = p.first;
+    CDir *dir = p.second;
+    if (is_auth() || dir->is_auth()) {
+      const auto& pf = dir->get_projected_fnode();
+      dout(10) << __func__ << " " << fg << " dir " << *dir << dendl;
+      dout(10) << __func__ << " " << fg << " rstat " << pf->rstat << dendl;
+      // log the value that is actually encoded as accounted_rstat below
+      dout(10) << __func__ << " " << fg << " accounted_rstat " << pf->accounted_rstat << dendl;
+      dout(10) << __func__ << " " << fg << " dirty_old_rstat " << dir->dirty_old_rstat << dendl;
+      encode(fg, tmp);
+      encode(dir->first, tmp);
+      encode(pf->rstat, tmp);
+      encode(pf->accounted_rstat, tmp);
+      encode(dir->dirty_old_rstat, tmp);
+      n++;
+    }
+  }
+  encode(n, bl);
+  bl.claim_append(tmp);
+  ENCODE_FINISH(bl);
+}
+
+void CInode::decode_lock_inest(bufferlist::const_iterator& p)
+{ /* Decode nestlock state produced by encode_lock_inest().
+   *
+   * As auth: read the sender's dirty flag and mark the nestlock dirty
+   * when it is set.
+   * As replica: read the inode version into a copy of the inode and take
+   * the sender's inode rstat.
+   * Then process the per-dirfrag entries: the auth side overwrites each
+   * dirfrag's first/rstat/accounted_rstat/dirty_old_rstat, the replica
+   * side only updates dirfrags that are auth here and runs
+   * finish_scatter_update().
+   * The copied inode, if one was made, is installed after DECODE_FINISH.
+   */
+  inode_ptr _inode; // stays null on the auth side
+
+  DECODE_START(1, p);
+  if (is_auth()) {
+    bool replica_dirty;
+    decode(replica_dirty, p);
+    if (replica_dirty) {
+      dout(10) << __func__ << " setting nestlock dirty flag" << dendl;
+      nestlock.mark_dirty();  // ok bc we're auth and caller will handle
+    }
+  } else {
+    _inode = allocate_inode(*get_inode());
+    decode(_inode->version, p);
+  }
+  nest_info_t rstat; // inode-level rstat; always decoded, only used on the replica side
+  decode(rstat, p);
+  if (!is_auth()) {
+    dout(10) << __func__ << " taking inode rstat " << rstat << " for " << *this << dendl;
+    _inode->rstat = rstat;    // take inode summation if replica
+  }
+  __u32 n; // number of per-dirfrag entries that follow
+  decode(n, p);
+  while (n--) {
+    frag_t fg;
+    snapid_t fgfirst;
+    nest_info_t rstat; // per-dirfrag value; shadows the inode-level rstat above
+    nest_info_t accounted_rstat;
+    decltype(CDir::dirty_old_rstat) dirty_old_rstat;
+    decode(fg, p);
+    decode(fgfirst, p);
+    decode(rstat, p);
+    decode(accounted_rstat, p);
+    decode(dirty_old_rstat, p);
+    dout(10) << __func__ << " " << fg << " [" << fgfirst << ",head]" << dendl;
+    dout(10) << __func__ << " " << fg << " rstat " << rstat << dendl;
+    dout(10) << __func__ << " " << fg << " accounted_rstat " << accounted_rstat << dendl;
+    dout(10) << __func__ << " " << fg << " dirty_old_rstat " << dirty_old_rstat << dendl;
+    CDir *dir = get_dirfrag(fg);
+    if (is_auth()) {
+      ceph_assert(dir);                // i am auth; i had better have this dir open
+      dout(10) << fg << " first " << dir->first << " -> " << fgfirst
+               << " on " << *dir << dendl;
+      dir->first = fgfirst;
+      auto _fnode = CDir::allocate_fnode(*dir->get_fnode()); // copy, modify, then swap in
+      _fnode->rstat = rstat;
+      _fnode->accounted_rstat = accounted_rstat;
+      dir->reset_fnode(std::move(_fnode));
+      dir->dirty_old_rstat.swap(dirty_old_rstat);
+      if (!(rstat == accounted_rstat) || !dir->dirty_old_rstat.empty()) {
+        dout(10) << fg << " setting nestlock updated flag" << dendl;
+        nestlock.mark_dirty();  // ok bc we're auth and caller will handle
+      }
+    } else {
+      if (dir && dir->is_auth()) { // replica inode, but this dirfrag is auth here
+        dout(10) << fg << " first " << dir->first << " -> " << fgfirst
+                 << " on " << *dir << dendl;
+        dir->first = fgfirst;
+        const auto& pf = dir->get_projected_fnode();
+        finish_scatter_update(&nestlock, dir,
+                              _inode->rstat.version, pf->accounted_rstat.version);
+      }
+    }
+  }
+  DECODE_FINISH(p);
+
+  if (_inode) // replica only: publish the updated inode
+    reset_inode(std::move(_inode));
+}
+
+/**
+ * Encode the xattrlock state (encoding version 2): inode version, ctime,
+ * the xattr map and xattr_version.
+ *
+ * @param bl output buffer
+ */
+void CInode::encode_lock_ixattr(bufferlist& bl)
+{
+  const auto& pi = get_inode();
+
+  ENCODE_START(2, 1, bl);
+  encode(pi->version, bl);
+  encode(pi->ctime, bl);
+  encode_xattrs(bl);
+  encode(pi->xattr_version, bl);
+  ENCODE_FINISH(bl);
+}
+
+/**
+ * Decode xattrlock state on a replica.  ctime only ever moves forward;
+ * the xattr map is replaced; xattr_version is present from encoding
+ * version 2 onward.
+ *
+ * @param p input position
+ */
+void CInode::decode_lock_ixattr(bufferlist::const_iterator& p)
+{
+  ceph_assert(!is_auth());
+
+  auto updated = allocate_inode(*get_inode());
+
+  DECODE_START(2, p);
+  decode(updated->version, p);
+
+  utime_t new_ctime;
+  decode(new_ctime, p);
+  if (updated->ctime < new_ctime) {
+    updated->ctime = new_ctime;
+  }
+
+  decode_xattrs(p);
+  if (struct_v >= 2)
+    decode(updated->xattr_version, p);
+  DECODE_FINISH(p);
+
+  reset_inode(std::move(updated));
+}
+
+/**
+ * Encode the snaplock state: inode version, ctime and the snap data
+ * written by encode_snap().
+ *
+ * @param bl output buffer
+ */
+void CInode::encode_lock_isnap(bufferlist& bl)
+{
+  const auto& pi = get_inode();
+
+  ENCODE_START(1, 1, bl);
+  encode(pi->version, bl);
+  encode(pi->ctime, bl);
+  encode_snap(bl);
+  ENCODE_FINISH(bl);
+}
+
+/**
+ * Decode snaplock state on a replica.  ctime only ever moves forward;
+ * the snap data is handed to decode_snap().
+ *
+ * @param p input position
+ */
+void CInode::decode_lock_isnap(bufferlist::const_iterator& p)
+{
+  ceph_assert(!is_auth());
+
+  auto updated = allocate_inode(*get_inode());
+
+  DECODE_START(1, p);
+  decode(updated->version, p);
+
+  utime_t new_ctime;
+  decode(new_ctime, p);
+  if (updated->ctime < new_ctime)
+    updated->ctime = new_ctime;
+
+  decode_snap(p);
+  DECODE_FINISH(p);
+
+  reset_inode(std::move(updated));
+}
+
+/**
+ * Encode the flocklock state: inode version followed by the file locks
+ * written by _encode_file_locks().
+ *
+ * @param bl output buffer
+ */
+void CInode::encode_lock_iflock(bufferlist& bl)
+{
+  const auto& pi = get_inode();
+
+  ENCODE_START(1, 1, bl);
+  encode(pi->version, bl);
+  _encode_file_locks(bl);
+  ENCODE_FINISH(bl);
+}
+
+/**
+ * Decode flocklock state on a replica: the inode version, then the file
+ * locks via _decode_file_locks().
+ *
+ * @param p input position
+ */
+void CInode::decode_lock_iflock(bufferlist::const_iterator& p)
+{
+  ceph_assert(!is_auth());
+
+  auto updated = allocate_inode(*get_inode());
+
+  DECODE_START(1, p);
+  decode(updated->version, p);
+  _decode_file_locks(p);
+  DECODE_FINISH(p);
+
+  reset_inode(std::move(updated));
+}
+
+/**
+ * Encode the policylock state (encoding version 2).
+ *
+ * For directories: inode version, ctime, layout, quota, export_pin and
+ * the two ephemeral pin settings.  For anything else the envelope is
+ * written with an empty body.
+ *
+ * @param bl output buffer
+ */
+void CInode::encode_lock_ipolicy(bufferlist& bl)
+{
+  const auto& pi = get_inode();
+
+  ENCODE_START(2, 1, bl);
+  if (is_dir()) {
+    encode(pi->version, bl);
+    encode(pi->ctime, bl);
+    encode(pi->layout, bl, mdcache->mds->mdsmap->get_up_features());
+    encode(pi->quota, bl);
+    encode(pi->export_pin, bl);
+    encode(pi->export_ephemeral_distributed_pin, bl);
+    encode(pi->export_ephemeral_random_pin, bl);
+  }
+  ENCODE_FINISH(bl);
+}
+
+/**
+ * Decode policylock state on a replica.
+ *
+ * For directories this reads version, ctime (which only moves forward),
+ * layout, quota and export_pin, plus the two ephemeral pin settings when
+ * the sender used encoding version 2 or later.  After installing the
+ * updated inode, maybe_export_pin() is told whether export_pin or
+ * export_ephemeral_distributed_pin changed.
+ *
+ * @param p input position
+ */
+void CInode::decode_lock_ipolicy(bufferlist::const_iterator& p)
+{
+  ceph_assert(!is_auth());
+  auto _inode = allocate_inode(*get_inode());
+  // This decoder understands encoding version 2 (see the struct_v test
+  // below), matching ENCODE_START(2, 1, ...) in encode_lock_ipolicy().
+  DECODE_START(2, p);
+  if (is_dir()) {
+    decode(_inode->version, p);
+    utime_t tm;
+    decode(tm, p);
+    if (_inode->ctime < tm)
+      _inode->ctime = tm;
+    decode(_inode->layout, p);
+    decode(_inode->quota, p);
+    decode(_inode->export_pin, p);
+    if (struct_v >= 2) {
+      decode(_inode->export_ephemeral_distributed_pin, p);
+      decode(_inode->export_ephemeral_random_pin, p);
+    }
+  }
+  DECODE_FINISH(p);
+
+  // compare against the inode we are about to replace
+  bool pin_updated = (get_inode()->export_pin != _inode->export_pin) ||
+                     (get_inode()->export_ephemeral_distributed_pin !=
+                      _inode->export_ephemeral_distributed_pin);
+  reset_inode(std::move(_inode));
+  maybe_export_pin(pin_updated);
+}
+
+/**
+ * Encode the state protected by one inode lock.
+ *
+ * The payload starts with this inode's `first` snapid and, for non-base
+ * inodes, the parent dentry's `first`, followed by the per-lock encoding.
+ * An unknown lock type aborts.
+ *
+ * @param type CEPH_LOCK_I* lock type
+ * @param bl   output buffer
+ */
+void CInode::encode_lock_state(int type, bufferlist& bl)
+{
+  ENCODE_START(1, 1, bl);
+
+  encode(first, bl);
+  if (!is_base())
+    encode(parent->first, bl);
+
+  switch (type) {
+  case CEPH_LOCK_IAUTH:   encode_lock_iauth(bl);   break;
+  case CEPH_LOCK_ILINK:   encode_lock_ilink(bl);   break;
+  case CEPH_LOCK_IDFT:    encode_lock_idft(bl);    break;
+  case CEPH_LOCK_IFILE:   encode_lock_ifile(bl);   break;
+  case CEPH_LOCK_INEST:   encode_lock_inest(bl);   break;
+  case CEPH_LOCK_IXATTR:  encode_lock_ixattr(bl);  break;
+  case CEPH_LOCK_ISNAP:   encode_lock_isnap(bl);   break;
+  case CEPH_LOCK_IFLOCK:  encode_lock_iflock(bl);  break;
+  case CEPH_LOCK_IPOLICY: encode_lock_ipolicy(bl); break;
+  default:
+    ceph_abort();
+  }
+
+  ENCODE_FINISH(bl);
+}
+
+/* for more info on scatterlocks, see comments by Locker::scatter_writebehind */
+
+/**
+ * Decode the state protected by one inode lock, as written by
+ * encode_lock_state().
+ *
+ * The leading `first` snapid is adopted when this inode is not auth; for
+ * non-base inodes the parent dentry's `first` is adopted when that dentry
+ * is not auth.  The rest is handed to the per-lock decoder.  An unknown
+ * lock type aborts.
+ *
+ * @param type CEPH_LOCK_I* lock type
+ * @param bl   encoded lock state
+ */
+void CInode::decode_lock_state(int type, const bufferlist& bl)
+{
+  auto p = bl.cbegin();
+
+  DECODE_START(1, p);
+
+  snapid_t newfirst;
+  using ceph::decode;
+  decode(newfirst, p);
+  if (!is_auth() && newfirst != first) {
+    dout(10) << __func__ << " first " << first << " -> " << newfirst << dendl;
+    first = newfirst;
+  }
+  if (!is_base()) {
+    decode(newfirst, p);
+    if (!parent->is_auth() && newfirst != parent->first) {
+      // report the dentry's old value, not the inode's
+      dout(10) << __func__ << " parent first " << parent->first << " -> " << newfirst << dendl;
+      parent->first = newfirst;
+    }
+  }
+
+  switch (type) {
+  case CEPH_LOCK_IAUTH:
+    decode_lock_iauth(p);
+    break;
+
+  case CEPH_LOCK_ILINK:
+    decode_lock_ilink(p);
+    break;
+
+  case CEPH_LOCK_IDFT:
+    decode_lock_idft(p);
+    break;
+
+  case CEPH_LOCK_IFILE:
+    decode_lock_ifile(p);
+    break;
+
+  case CEPH_LOCK_INEST:
+    decode_lock_inest(p);
+    break;
+
+  case CEPH_LOCK_IXATTR:
+    decode_lock_ixattr(p);
+    break;
+
+  case CEPH_LOCK_ISNAP:
+    decode_lock_isnap(p);
+    break;
+
+  case CEPH_LOCK_IFLOCK:
+    decode_lock_iflock(p);
+    break;
+
+  case CEPH_LOCK_IPOLICY:
+    decode_lock_ipolicy(p);
+    break;
+
+  default:
+    ceph_abort();
+  }
+  DECODE_FINISH(p);
+}
+
+
+/**
+ * @return true when any of the filelock, nestlock or dirfragtreelock is
+ *         dirty or flushing
+ */
+bool CInode::is_dirty_scattered()
+{
+  if (filelock.is_dirty_or_flushing())
+    return true;
+  if (nestlock.is_dirty_or_flushing())
+    return true;
+  return dirfragtreelock.is_dirty_or_flushing();
+}
+
+// Remove the dirty mark from the file, nest and dirfragtree locks.
+void CInode::clear_scatter_dirty()
+{
+  ScatterLock *const scatter_locks[] = {&filelock, &nestlock, &dirfragtreelock};
+  for (ScatterLock *lk : scatter_locks)
+    lk->remove_dirty();
+}
+
+// Take this directory inode off the dirty-dirfrag list matching the given
+// lock type (IFILE, INEST or IDFT).  Any other type aborts.
+void CInode::clear_dirty_scattered(int type)
+{
+  dout(10) << __func__ << " " << type << " on " << *this << dendl;
+  ceph_assert(is_dir());
+
+  if (type == CEPH_LOCK_IFILE) {
+    item_dirty_dirfrag_dir.remove_myself();
+  } else if (type == CEPH_LOCK_INEST) {
+    item_dirty_dirfrag_nest.remove_myself();
+  } else if (type == CEPH_LOCK_IDFT) {
+    item_dirty_dirfrag_dirfragtree.remove_myself();
+  } else {
+    ceph_abort();
+  }
+}
+
+
+/*
+ * when we initially scatter a lock, we need to check if any of the dirfrags
+ * have out of date accounted_rstat/fragstat.  if so, mark the lock stale.
+ */
+/* for more info on scatterlocks, see comments by Locker::scatter_writebehind */
+void CInode::start_scatter(ScatterLock *lock)
+{
+  dout(10) << __func__ << " " << *lock << " on " << *this << dendl;
+  ceph_assert(is_auth());
+
+  const auto& pi = get_projected_inode();
+  const int lock_type = lock->get_type();
+
+  for (const auto &entry : dirfrags) {
+    frag_t fg = entry.first;
+    CDir *frag_dir = entry.second;
+    const auto& pf = frag_dir->get_projected_fnode();
+    dout(20) << fg << " " << *frag_dir << dendl;
+
+    // dirfrags we are not auth for are left alone
+    if (!frag_dir->is_auth())
+      continue;
+
+    if (lock_type == CEPH_LOCK_IFILE) {
+      // compare the inode's dirstat version with the frag's accounted one
+      finish_scatter_update(lock, frag_dir, pi->dirstat.version,
+                            pf->accounted_fragstat.version);
+    } else if (lock_type == CEPH_LOCK_INEST) {
+      // same check, for the recursive stats
+      finish_scatter_update(lock, frag_dir, pi->rstat.version,
+                            pf->accounted_rstat.version);
+    } else if (lock_type == CEPH_LOCK_IDFT) {
+      frag_dir->state_clear(CDir::STATE_DIRTYDFT);
+    }
+    // other lock types: nothing to do
+  }
+}
+
+
+// Log-completion context: on finish, hands the stored dirfrag and mutation
+// to CInode::_finish_frag_update() on the stored inode.
+class C_Inode_FragUpdate : public MDSLogContextBase {
+public:
+  C_Inode_FragUpdate(CInode *i, CDir *d, MutationRef& m)
+    : in(i), dir(d), mut(m)
+  {}
+
+protected:
+  MDSRank *get_mds() override {
+    return in->mdcache->mds;
+  }
+
+  // the completion code r is not consulted
+  void finish(int r) override {
+    in->_finish_frag_update(dir, mut);
+  }
+
+  CInode *in;
+  CDir *dir;
+  MutationRef mut;
+};
+
+/*
+ * Bring one auth dirfrag's accounted scatter-stat version in line with the
+ * inode's version for the given scatterlock, journaling the change.
+ *
+ * Only logging happens when the dirfrag is frozen, has version 0, or its
+ * accounted version already equals inode_version.  On the journaling path
+ * only IFILE and INEST locks are handled; any other lock type aborts.
+ *
+ *  lock                   scatterlock being scattered
+ *  dir                    dirfrag to update; must be auth
+ *  inode_version          stat version currently on the projected inode
+ *  dir_accounted_version  accounted stat version currently on the dirfrag
+ */
+void CInode::finish_scatter_update(ScatterLock *lock, CDir *dir,
+				   version_t inode_version, version_t dir_accounted_version)
+{
+  frag_t fg = dir->get_frag();
+  ceph_assert(dir->is_auth());
+
+  if (dir->is_frozen()) {
+    // no state is changed here; the frag is only reported as stale
+    dout(10) << __func__ << " " << fg << " frozen, marking " << *lock << " stale " << *dir << dendl;
+  } else if (dir->get_version() == 0) {
+    // likewise: log only
+    dout(10) << __func__ << " " << fg << " not loaded, marking " << *lock << " stale " << *dir << dendl;
+  } else {
+    if (dir_accounted_version != inode_version) {
+      dout(10) << __func__ << " " << fg << " journaling accounted scatterstat update v" << inode_version << dendl;
+
+      // fresh mutation tied to the current log segment
+      MDLog *mdlog = mdcache->mds->mdlog;
+      MutationRef mut(new MutationImpl());
+      mut->ls = mdlog->get_current_segment();
+
+      auto pf = dir->project_fnode(mut);
+
+      // stamp the projected fnode with the inode's version and mark the
+      // current stat as accounted
+      std::string_view ename;
+      switch (lock->get_type()) {
+      case CEPH_LOCK_IFILE:
+	pf->fragstat.version = inode_version;
+	pf->accounted_fragstat = pf->fragstat;
+	ename = "lock ifile accounted scatter stat update";
+	break;
+      case CEPH_LOCK_INEST:
+	pf->rstat.version = inode_version;
+	pf->accounted_rstat = pf->rstat;
+	ename = "lock inest accounted scatter stat update";
+
+	// non-auth inode with the lock in MIX: also pull in dirty rstat inodes
+	if (!is_auth() && lock->get_state() == LOCK_MIX) {
+	  dout(10) << __func__ << " try to assimilate dirty rstat on " 
+	    << *dir << dendl; 
+	  dir->assimilate_dirty_rstat_inodes(mut);
+       }
+
+	break;
+      default:
+	ceph_abort();
+      }
+	
+      EUpdate *le = new EUpdate(mdlog, ename);
+      mdlog->start_entry(le);
+      le->metablob.add_dir_context(dir);
+      le->metablob.add_dir(dir, true);
+      
+      ceph_assert(!dir->is_frozen());
+      mut->auth_pin(dir);
+
+      // second half of the assimilation started in the INEST case above
+      if (lock->get_type() == CEPH_LOCK_INEST && 
+	  !is_auth() && lock->get_state() == LOCK_MIX) {
+        dout(10) << __func__ << " finish assimilating dirty rstat on " 
+          << *dir << dendl; 
+        dir->assimilate_dirty_rstat_inodes_finish(&le->metablob);
+
+        // rstat now differs from accounted_rstat: mark nestlock updated and
+        // queue this inode on the segment's dirty nest list
+        if (!(pf->rstat == pf->accounted_rstat)) {
+          if (!mut->is_wrlocked(&nestlock)) {
+            mdcache->mds->locker->wrlock_force(&nestlock, mut);
+          }
+
+          mdcache->mds->locker->mark_updated_scatterlock(&nestlock);
+          mut->ls->dirty_dirfrag_nest.push_back(&item_dirty_dirfrag_nest);
+        }
+      }
+
+      pf->version = dir->pre_dirty();
+      
+      // C_Inode_FragUpdate calls _finish_frag_update() when it finishes
+      mdlog->submit_entry(le, new C_Inode_FragUpdate(this, dir, mut));
+    } else {
+      dout(10) << __func__ << " " << fg << " accounted " << *lock
+	       << " scatter stat unchanged at v" << dir_accounted_version << dendl;
+    }
+  }
+}
+
+// Completion half of finish_scatter_update(): apply the mutation, drop its
+// locks, then clean it up.
+void CInode::_finish_frag_update(CDir *dir, MutationRef& mut)
+{
+  dout(10) << __func__ << " on " << *dir << dendl;
+
+  auto *mutation = mut.get();
+  mutation->apply();
+  mdcache->mds->locker->drop_locks(mutation);
+  mutation->cleanup();
+}
+
+
+/*
+ * when we gather a lock, we need to assimilate dirfrag changes into the inode
+ * state.  it's possible we can't update the dirfrag accounted_rstat/fragstat
+ * because the frag is auth and frozen, or that the replica couldn't for the same
+ * reason.  hopefully it will get updated the next time the lock cycles.
+ *
+ * we have two dimensions of behavior:
+ *  - we may be (auth and !frozen), and able to update, or not.
+ *  - the frag may be stale, or not.
+ *
+ * if the frag is non-stale, we want to assimilate the diff into the
+ * inode, regardless of whether it's auth or updateable.
+ *
+ * if we update the frag, we want to set accounted_fragstat = frag,
+ * both if we took the diff or it was stale and we are making it
+ * un-stale.
+ */
+/* for more info on scatterlocks, see comments by Locker::scatter_writebehind */
+/*
+ * type: CEPH_LOCK_IFILE folds dirfrag fragstats into the projected inode's
+ *       dirstat, CEPH_LOCK_INEST folds dirfrag rstats into its rstat,
+ *       CEPH_LOCK_IDFT does nothing; any other value aborts.
+ * mut:  mutation that auth-pins and projects every dirfrag updated here.
+ */
+void CInode::finish_scatter_gather_update(int type, MutationRef& mut)
+{
+  LogChannelRef clog = mdcache->mds->clog;
+
+  dout(10) << __func__ << " " << type << " on " << *this << dendl;
+  ceph_assert(is_auth());
+
+  switch (type) {
+  case CEPH_LOCK_IFILE:
+    {
+      // tmpdft/dirstat accumulate what the in-memory dirfrags add up to,
+      // for the cross-check after the loop
+      fragtree_t tmpdft = dirfragtree;
+      struct frag_info_t dirstat;
+      bool dirstat_valid = true;
+
+      // adjust summation
+      ceph_assert(is_auth());
+      auto pi = _get_projected_inode();
+
+      bool touched_mtime = false, touched_chattr = false;
+      dout(20) << "  orig dirstat " << pi->dirstat << dendl;
+      pi->dirstat.version++;
+      for (const auto &p : dirfrags) {
+	frag_t fg = p.first;
+	CDir *dir = p.second;
+	dout(20) << fg << " " << *dir << dendl;
+
+	// a frag is updated only if it has a non-zero version, is auth and is
+	// not frozen; a version-0 frag also invalidates the cross-check
+	bool update;
+	if (dir->get_version() != 0) {
+	  update = dir->is_auth() && !dir->is_frozen();
+	} else {
+	  update = false;
+	  dirstat_valid = false;
+	}
+
+	CDir::fnode_const_ptr pf;
+	if (update) {
+	  mut->auth_pin(dir);
+	  pf = dir->project_fnode(mut);
+	} else {
+	  pf = dir->get_projected_fnode();
+	}
+
+	// the frag is non-stale if it was accounted at the previous version
+	if (pf->accounted_fragstat.version == pi->dirstat.version - 1) {
+	  dout(20) << fg << "           fragstat " << pf->fragstat << dendl;
+	  dout(20) << fg << " accounted_fragstat " << pf->accounted_fragstat << dendl;
+	  pi->dirstat.add_delta(pf->fragstat, pf->accounted_fragstat, &touched_mtime, &touched_chattr);
+	} else {
+	  dout(20) << fg << " skipping STALE accounted_fragstat " << pf->accounted_fragstat << dendl;
+	}
+
+	if (pf->fragstat.nfiles < 0 ||
+	    pf->fragstat.nsubdirs < 0) {
+	  clog->error() << "bad/negative dir size on "
+			<< dir->dirfrag() << " " << pf->fragstat;
+	  // !"..." is false, so this assert fails only if mds_verify_scatter is set
+	  ceph_assert(!"bad/negative fragstat" == g_conf()->mds_verify_scatter);
+
+	  // clamp negative counts to zero, writing through the const pointer
+	  auto _pf = const_cast<fnode_t*>(pf.get());
+	  if (pf->fragstat.nfiles < 0)
+	    _pf->fragstat.nfiles = 0;
+	  if (pf->fragstat.nsubdirs < 0)
+	    _pf->fragstat.nsubdirs = 0;
+	}
+
+	if (update) {
+	  // mark the frag's current stat as accounted at the new version
+	  auto _pf = const_cast<fnode_t*>(pf.get());
+	  _pf->accounted_fragstat = _pf->fragstat;
+	  _pf->fragstat.version = _pf->accounted_fragstat.version = pi->dirstat.version;
+	  _pf->version = dir->pre_dirty();
+	  dout(10) << fg << " updated accounted_fragstat " << pf->fragstat << " on " << *dir << dendl;
+	}
+
+	tmpdft.force_to_leaf(g_ceph_context, fg);
+	dirstat.add(pf->fragstat);
+      }
+      if (touched_mtime)
+	pi->mtime = pi->ctime = pi->dirstat.mtime;
+      if (touched_chattr)
+	pi->change_attr++;
+
+      dout(20) << " final dirstat " << pi->dirstat << dendl;
+
+      // cross-check: the dirfrag sum should match the inode, but only if
+      // every leaf of the frag tree is present in dirfrags
+      if (dirstat_valid && !dirstat.same_sums(pi->dirstat)) {
+        frag_vec_t leaves;
+        tmpdft.get_leaves_under(frag_t(), leaves);
+	for (const auto& leaf : leaves) {
+	  if (!dirfrags.count(leaf)) {
+	    dirstat_valid = false;
+	    break;
+	  }
+        }
+	if (dirstat_valid) {
+	  if (state_test(CInode::STATE_REPAIRSTATS)) {
+	    dout(20) << " dirstat mismatch, fixing" << dendl;
+	  } else {
+	    clog->error() << "unmatched fragstat on " << ino() << ", inode has "
+			  << pi->dirstat << ", dirfrags have " << dirstat;
+	    // fails only if mds_verify_scatter is set
+	    ceph_assert(!"unmatched fragstat" == g_conf()->mds_verify_scatter);
+	  }
+	  // trust the dirfrags for now
+	  // (the inode's version, and the larger mtime/change_attr, are kept)
+	  version_t v = pi->dirstat.version;
+	  if (pi->dirstat.mtime > dirstat.mtime)
+	    dirstat.mtime = pi->dirstat.mtime;
+	  if (pi->dirstat.change_attr > dirstat.change_attr)
+	    dirstat.change_attr = pi->dirstat.change_attr;
+	  pi->dirstat = dirstat;
+	  pi->dirstat.version = v;
+	}
+      }
+
+      if (pi->dirstat.nfiles < 0 || pi->dirstat.nsubdirs < 0) {
+        std::string path;
+        make_path_string(path);
+	clog->error() << "Inconsistent statistics detected: fragstat on inode "
+                      << ino() << " (" << path << "), inode has " << pi->dirstat;
+	// fails only if mds_verify_scatter is set
+	ceph_assert(!"bad/negative fragstat" == g_conf()->mds_verify_scatter);
+
+	if (pi->dirstat.nfiles < 0)
+	  pi->dirstat.nfiles = 0;
+	if (pi->dirstat.nsubdirs < 0)
+	  pi->dirstat.nsubdirs = 0;
+      }
+    }
+    break;
+
+  case CEPH_LOCK_INEST:
+    {
+      // adjust summation
+      ceph_assert(is_auth());
+
+      fragtree_t tmpdft = dirfragtree;
+      nest_info_t rstat;
+      bool rstat_valid = true;
+
+      // the expected sum starts at one subdir, plus the snap count of the
+      // projected srnode when there is one
+      rstat.rsubdirs = 1;
+      if (const sr_t *srnode = get_projected_srnode(); srnode)
+	rstat.rsnaps = srnode->snaps.size();
+
+      auto pi = _get_projected_inode();
+      dout(20) << "  orig rstat " << pi->rstat << dendl;
+      pi->rstat.version++;
+      for (const auto &p : dirfrags) {
+	frag_t fg = p.first;
+	CDir *dir = p.second;
+	dout(20) << fg << " " << *dir << dendl;
+
+	// same update criteria as the IFILE case
+	bool update;
+	if (dir->get_version() != 0) {
+	  update = dir->is_auth() && !dir->is_frozen();
+	} else {
+	  update = false;
+	  rstat_valid = false;
+	}
+
+	CDir::fnode_const_ptr pf;
+	if (update) {
+	  mut->auth_pin(dir);
+	  pf = dir->project_fnode(mut);
+	} else {
+	  pf = dir->get_projected_fnode();
+	}
+
+	if (pf->accounted_rstat.version == pi->rstat.version-1) {
+	  // only pull this frag's dirty rstat inodes into the frag if
+	  // the frag is non-stale and updateable.  if it's stale,
+	  // that info will just get thrown out!
+	  if (update)
+	    dir->assimilate_dirty_rstat_inodes(mut);
+
+	  dout(20) << fg << "           rstat " << pf->rstat << dendl;
+	  dout(20) << fg << " accounted_rstat " << pf->accounted_rstat << dendl;
+	  dout(20) << fg << " dirty_old_rstat " << dir->dirty_old_rstat << dendl;
+	  // project the head rstat, then each dirty_old_rstat entry
+	  mdcache->project_rstat_frag_to_inode(pf->rstat, pf->accounted_rstat,
+					       dir->first, CEPH_NOSNAP, this, true);
+	  for (auto &p : dir->dirty_old_rstat) {
+	    mdcache->project_rstat_frag_to_inode(p.second.rstat, p.second.accounted_rstat,
+						 p.second.first, p.first, this, true);
+          }
+	  if (update)  // dir contents not valid if frozen or non-auth
+	    dir->check_rstats();
+	} else {
+	  dout(20) << fg << " skipping STALE accounted_rstat " << pf->accounted_rstat << dendl;
+	}
+	if (update) {
+	  // mark the frag's current rstat as accounted at the new version and
+	  // drop the dirty_old_rstat entries
+	  auto _pf = const_cast<fnode_t*>(pf.get());
+	  _pf->accounted_rstat = pf->rstat;
+	  _pf->rstat.version = _pf->accounted_rstat.version = pi->rstat.version;
+	  _pf->version = dir->pre_dirty();
+	  dir->dirty_old_rstat.clear();
+	  dir->check_rstats();
+	  dout(10) << fg << " updated accounted_rstat " << pf->rstat << " on " << *dir << dendl;
+	}
+
+	tmpdft.force_to_leaf(g_ceph_context, fg);
+	rstat.add(pf->rstat);
+      }
+      dout(20) << " final rstat " << pi->rstat << dendl;
+
+      // cross-check against the dirfrag sum, as in the IFILE case
+      if (rstat_valid && !rstat.same_sums(pi->rstat)) {
+        frag_vec_t leaves;
+        tmpdft.get_leaves_under(frag_t(), leaves);
+        for (const auto& leaf : leaves) {
+          if (!dirfrags.count(leaf)) {
+	    rstat_valid = false;
+	    break;
+	  }
+        }
+	if (rstat_valid) {
+	  if (state_test(CInode::STATE_REPAIRSTATS)) {
+	    dout(20) << " rstat mismatch, fixing" << dendl;
+	  } else {
+	    clog->error() << "inconsistent rstat on inode " << ino()
+                          << ", inode has " << pi->rstat
+                          << ", directory fragments have " << rstat;
+	    // fails only if mds_verify_scatter is set
+	    ceph_assert(!"unmatched rstat" == g_conf()->mds_verify_scatter);
+	  }
+	  // trust the dirfrag for now
+	  // (the inode's version, and the larger rctime, are kept)
+	  version_t v = pi->rstat.version;
+	  if (pi->rstat.rctime > rstat.rctime)
+	    rstat.rctime = pi->rstat.rctime;
+	  pi->rstat = rstat;
+	  pi->rstat.version = v;
+	}
+      }
+
+      mdcache->broadcast_quota_to_client(this);
+    }
+    break;
+
+  case CEPH_LOCK_IDFT:
+    break;
+
+  default:
+    ceph_abort();
+  }
+}
+
+// Journal into metablob every dirfrag that is auth, has a non-zero version
+// and is not frozen.  Nothing is journaled for the IDFT lock type.
+void CInode::finish_scatter_gather_update_accounted(int type, EMetaBlob *metablob)
+{
+  dout(10) << __func__ << " " << type << " on " << *this << dendl;
+  ceph_assert(is_auth());
+
+  for (const auto &entry : dirfrags) {
+    CDir *dir = entry.second;
+
+    const bool updatable =
+      dir->is_auth() && dir->get_version() != 0 && !dir->is_frozen();
+    if (!updatable)
+      continue;
+
+    if (type == CEPH_LOCK_IDFT) {
+      // nothing to do.
+      continue;
+    }
+
+    if (type == CEPH_LOCK_INEST) {
+      dir->assimilate_dirty_rstat_inodes_finish(metablob);
+    }
+
+    dout(10) << " journaling updated frag accounted_ on " << *dir << dendl;
+    ceph_assert(dir->is_projected());
+    metablob->add_dir(dir, true);
+  }
+}
+
+// waiting
+
+// Frozen if the inode itself is frozen or its parent dentry's dir is frozen.
+bool CInode::is_frozen() const
+{
+  return is_frozen_inode() ||
+         (parent && parent->dir->is_frozen());
+}
+
+// True if there is a parent dentry and its dir reports is_frozen_dir().
+bool CInode::is_frozen_dir() const
+{
+  return parent && parent->dir->is_frozen_dir();
+}
+
+// Freezing if the inode itself is freezing or its parent dentry's dir is.
+bool CInode::is_freezing() const
+{
+  return is_freezing_inode() ||
+         (parent && parent->dir->is_freezing());
+}
+
+// Queue context c on dirfrag fg.  The first waiter overall takes the
+// PIN_DIRWAITER pin.
+void CInode::add_dir_waiter(frag_t fg, MDSContext *c)
+{
+  const bool first_waiter = waiting_on_dir.empty();
+  if (first_waiter) {
+    get(PIN_DIRWAITER);
+  }
+
+  auto& queue = waiting_on_dir[fg];
+  queue.push_back(c);
+  dout(10) << __func__ << " frag " << fg << " " << c << " on " << *this << dendl;
+}
+
+// Move all waiters queued on dirfrag fg to the end of ls.  Drops the
+// PIN_DIRWAITER pin once no dirfrag has waiters left.
+void CInode::take_dir_waiting(frag_t fg, MDSContext::vec& ls)
+{
+  if (waiting_on_dir.empty())
+    return;
+
+  auto found = waiting_on_dir.find(fg);
+  if (found == waiting_on_dir.end())
+    return;
+
+  dout(10) << __func__ << " frag " << fg << " on " << *this << dendl;
+  auto& queued = found->second;
+  ls.insert(ls.end(), queued.begin(), queued.end());
+  waiting_on_dir.erase(found);
+
+  if (waiting_on_dir.empty()) {
+    put(PIN_DIRWAITER);
+  }
+}
+
+// Register waiter c for the events in tag, either on this inode or on the
+// parent dentry's dir when the condition being waited on is not caused by
+// the inode itself.
+void CInode::add_waiter(uint64_t tag, MDSContext *c)
+{
+  const bool ambiguous = state_test(STATE_AMBIGUOUSAUTH);
+  const bool frozen = is_frozen_inode();
+  const bool freezing = is_freezing_inode();
+
+  dout(10) << __func__ << " tag " << std::hex << tag << std::dec << " " << c
+           << " !ambig " << !ambiguous
+           << " !frozen " << !frozen
+           << " !freezing " << !freezing
+           << dendl;
+
+  // wait on the directory?
+  //  make sure its not the inode that is explicitly ambiguous|freezing|frozen
+  const bool singleauth_from_above = (tag & WAIT_SINGLEAUTH) && !ambiguous;
+  const bool unfreeze_from_above =
+    (tag & WAIT_UNFREEZE) && !frozen && !freezing && !is_frozen_auth_pin();
+
+  if (singleauth_from_above || unfreeze_from_above) {
+    dout(15) << "passing waiter up tree" << dendl;
+    parent->dir->add_waiter(tag, c);
+    return;
+  }
+
+  dout(15) << "taking waiter here" << dendl;
+  MDSCacheObject::add_waiter(tag, c);
+}
+
+// Collect waiters matching mask into ls.  With WAIT_DIR set, every
+// per-dirfrag waiter list is drained first and the PIN_DIRWAITER pin is
+// released; the base class then handles the remaining waiters.
+void CInode::take_waiting(uint64_t mask, MDSContext::vec& ls)
+{
+  const bool drain_dir_waiters = (mask & WAIT_DIR) && !waiting_on_dir.empty();
+  if (drain_dir_waiters) {
+    // take all dentry waiters, always starting from the first entry
+    do {
+      auto front = waiting_on_dir.begin();
+      dout(10) << __func__ << " dirfrag " << front->first << " on " << *this << dendl;
+      auto& queued = front->second;
+      ls.insert(ls.end(), queued.begin(), queued.end());
+      waiting_on_dir.erase(front);
+    } while (!waiting_on_dir.empty());
+
+    put(PIN_DIRWAITER);
+  }
+
+  // waiting
+  MDSCacheObject::take_waiting(mask, ls);
+}
+
+// Complete a pending freeze once auth_pins has dropped to the recorded
+// allowance and the parent dir is not suppressing inode freezes.
+void CInode::maybe_finish_freeze_inode()
+{
+  CDir *dir = get_parent_dir();
+
+  const bool ready =
+    auth_pins <= auth_pin_freeze_allowance && !dir->frozen_inode_suppressed;
+  if (!ready)
+    return;
+
+  dout(10) << "maybe_finish_freeze_inode - frozen" << dendl;
+  ceph_assert(auth_pins == auth_pin_freeze_allowance);
+
+  // swap the FREEZING pin and state for the FROZEN ones
+  get(PIN_FROZEN);
+  put(PIN_FREEZING);
+  state_clear(STATE_FREEZING);
+  state_set(STATE_FROZEN);
+
+  item_freezing_inode.remove_myself();
+  ++dir->num_frozen_inodes;
+
+  finish_waiting(WAIT_FROZEN);
+}
+
+// Try to freeze the inode, allowing auth_pin_allowance auth pins to remain.
+// Returns true if the inode is frozen on return, false if it was only put
+// into the freezing state.
+bool CInode::freeze_inode(int auth_pin_allowance)
+{
+  CDir *dir = get_parent_dir();
+  ceph_assert(dir);
+
+  ceph_assert(auth_pin_allowance > 0);  // otherwise we need to adjust parent's nested_auth_pins
+  ceph_assert(auth_pins >= auth_pin_allowance);
+
+  const bool can_freeze_now =
+    auth_pins == auth_pin_allowance && !dir->frozen_inode_suppressed;
+  if (can_freeze_now) {
+    dout(10) << "freeze_inode - frozen" << dendl;
+    if (!state_test(STATE_FROZEN)) {
+      get(PIN_FROZEN);
+      state_set(STATE_FROZEN);
+      ++dir->num_frozen_inodes;
+    }
+    return true;
+  }
+
+  dout(10) << "freeze_inode - waiting for auth_pins to drop to " << auth_pin_allowance << dendl;
+  auth_pin_freeze_allowance = auth_pin_allowance;
+  dir->freezing_inodes.push_back(&item_freezing_inode);
+
+  get(PIN_FREEZING);
+  state_set(STATE_FREEZING);
+
+  if (!dir->lock_caches_with_auth_pins.empty()) {
+    mdcache->mds->locker->invalidate_lock_caches(dir);
+  }
+
+  // zero-terminated list of the inode lock types to check
+  const static int lock_types[] = {
+    CEPH_LOCK_IVERSION, CEPH_LOCK_IFILE, CEPH_LOCK_IAUTH, CEPH_LOCK_ILINK, CEPH_LOCK_IDFT,
+    CEPH_LOCK_IXATTR, CEPH_LOCK_ISNAP, CEPH_LOCK_INEST, CEPH_LOCK_IFLOCK, CEPH_LOCK_IPOLICY, 0
+  };
+  for (const int *type = lock_types; *type; ++type) {
+    auto lock = get_lock(*type);
+    if (lock->is_cached()) {
+      mdcache->mds->locker->invalidate_lock_caches(lock);
+    }
+  }
+
+  // invalidate_lock_caches() may decrease dir->frozen_inode_suppressed
+  // and finish freezing the inode
+  return state_test(STATE_FROZEN);
+}
+
+// Undo a freeze that is either still in progress or already complete, and
+// collect the WAIT_UNFREEZE waiters into finished.  Aborts if the inode is
+// neither freezing nor frozen.
+void CInode::unfreeze_inode(MDSContext::vec& finished)
+{
+  dout(10) << __func__ << dendl;
+
+  const bool was_freezing = state_test(STATE_FREEZING);
+  if (!was_freezing && !state_test(STATE_FROZEN))
+    ceph_abort();
+
+  if (was_freezing) {
+    state_clear(STATE_FREEZING);
+    put(PIN_FREEZING);
+    item_freezing_inode.remove_myself();
+  } else {
+    state_clear(STATE_FROZEN);
+    put(PIN_FROZEN);
+    --get_parent_dir()->num_frozen_inodes;
+  }
+
+  take_waiting(WAIT_UNFREEZE, finished);
+}
+
+// Convenience overload: unfreeze and queue the released waiters on the MDS.
+void CInode::unfreeze_inode()
+{
+  MDSContext::vec released;
+  unfreeze_inode(released);
+  mdcache->mds->queue_waiters(released);
+}
+
+// Set FROZENAUTHPIN on an already-frozen inode; the parent dir's frozen
+// inode count is incremented for it as well.
+void CInode::freeze_auth_pin()
+{
+  ceph_assert(state_test(CInode::STATE_FROZEN));
+
+  state_set(CInode::STATE_FROZENAUTHPIN);
+  ++get_parent_dir()->num_frozen_inodes;
+}
+
+// Clear FROZENAUTHPIN.  If the inode is neither freezing nor frozen
+// afterwards, the WAIT_UNFREEZE waiters are queued on the MDS.
+void CInode::unfreeze_auth_pin()
+{
+  ceph_assert(state_test(CInode::STATE_FROZENAUTHPIN));
+
+  state_clear(CInode::STATE_FROZENAUTHPIN);
+  --get_parent_dir()->num_frozen_inodes;
+
+  if (state_test(STATE_FREEZING|STATE_FROZEN))
+    return;
+
+  MDSContext::vec released;
+  take_waiting(WAIT_UNFREEZE, released);
+  mdcache->mds->queue_waiters(released);
+}
+
+// Clear AMBIGUOUSAUTH (which must be set) and collect the WAIT_SINGLEAUTH
+// waiters into finished.
+void CInode::clear_ambiguous_auth(MDSContext::vec& finished)
+{
+  ceph_assert(state_test(CInode::STATE_AMBIGUOUSAUTH));
+
+  state_clear(CInode::STATE_AMBIGUOUSAUTH);
+  take_waiting(CInode::WAIT_SINGLEAUTH, finished);
+}
+
+// Convenience overload: clear the flag and queue the released waiters.
+void CInode::clear_ambiguous_auth()
+{
+  MDSContext::vec released;
+  clear_ambiguous_auth(released);
+  mdcache->mds->queue_waiters(released);
+}
+
+// auth_pins
+// Whether this inode may be auth-pinned.  On refusal the reason is stored
+// in *err_ret when err_ret is non-null.  If the inode itself has no
+// objection and has a parent dentry, the parent decides.
+bool CInode::can_auth_pin(int *err_ret) const {
+  int err = 0;
+
+  if (!is_auth()) {
+    err = ERR_NOT_AUTH;
+  } else if (is_freezing_inode() || is_frozen_inode() || is_frozen_auth_pin()) {
+    err = ERR_EXPORTING_INODE;
+  } else if (parent) {
+    return parent->can_auth_pin(err_ret);
+  }
+
+  if (!err)
+    return true;
+
+  if (err_ret) {
+    *err_ret = err;
+  }
+  return false;
+}
+
+// Take one auth pin on behalf of `by`.  The first pin also takes the
+// PIN_AUTHPIN reference; the parent dentry, if any, is told about the
+// nested pin.
+void CInode::auth_pin(void *by)
+{
+  const bool first_pin = (auth_pins == 0);
+  if (first_pin) {
+    get(PIN_AUTHPIN);
+  }
+  ++auth_pins;
+
+#ifdef MDS_AUTHPIN_SET
+  auth_pin_set.insert(by);
+#endif
+
+  dout(10) << "auth_pin by " << by << " on " << *this << " now " << auth_pins << dendl;
+
+  if (parent) {
+    parent->adjust_nested_auth_pins(1, this);
+  }
+}
+
+// Release one auth pin held by `by`.  The last pin drops the PIN_AUTHPIN
+// reference.  If a freeze is pending, check whether it can now complete.
+void CInode::auth_unpin(void *by)
+{
+  --auth_pins;
+
+#ifdef MDS_AUTHPIN_SET
+  {
+    auto it = auth_pin_set.find(by);
+    ceph_assert(it != auth_pin_set.end());
+    auth_pin_set.erase(it);
+  }
+#endif
+
+  const bool last_pin = (auth_pins == 0);
+  if (last_pin) {
+    put(PIN_AUTHPIN);
+  }
+
+  dout(10) << "auth_unpin by " << by << " on " << *this << " now " << auth_pins << dendl;
+
+  ceph_assert(auth_pins >= 0);
+
+  if (parent) {
+    parent->adjust_nested_auth_pins(-1, by);
+  }
+
+  if (is_freezing_inode()) {
+    maybe_finish_freeze_inode();
+  }
+}
+
+// authority
+
+// Resolve the authority for this inode: an explicit inode_auth wins, then
+// the parent dentry's dir, then the first projected parent's dir.
+mds_authority_t CInode::authority() const
+{
+  mds_authority_t result = CDIR_AUTH_UNDEF;
+
+  if (inode_auth.first >= 0) {
+    result = inode_auth;
+  } else if (parent) {
+    result = parent->dir->authority();
+  } else if (!projected_parent.empty()) {
+    // new items that are not yet linked in (in the committed plane) belong
+    // to their first parent.
+    result = projected_parent.front()->dir->authority();
+  }
+
+  return result;
+}
+
+
+// SNAP
+
+// Smaller of oldest_snap and the start of the earliest range held here:
+// the first old_inode's `first` if any old inodes exist, else `first`.
+snapid_t CInode::get_oldest_snap()
+{
+  snapid_t earliest = first;
+  if (is_any_old_inodes()) {
+    const auto& oldest_entry = *get_old_inodes()->begin();
+    earliest = oldest_entry.second.first;
+  }
+  return std::min(earliest, oldest_snap);
+}
+
+/*
+ * Record a copy of the inode (and xattrs, if any) as an old_inode covering
+ * [first, follows], then advance `first` to follows+1.
+ *
+ *  follows   last snapid covered by the new old_inode; must be >= first
+ *  cow_head  true: copy the projected inode/xattrs;
+ *            false: copy the previous projected inode/xattrs
+ *
+ * Returns a reference to the new entry inside the freshly installed
+ * old_inodes map.
+ */
+const CInode::mempool_old_inode& CInode::cow_old_inode(snapid_t follows, bool cow_head)
+{
+  ceph_assert(follows >= first);
+
+  const auto& pi = cow_head ? get_projected_inode() : get_previous_projected_inode();
+  const auto& px = cow_head ? get_projected_xattrs() : get_previous_projected_xattrs();
+
+  // build a new map seeded with the existing entries; it replaces the
+  // current one at the end
+  auto _old_inodes = allocate_old_inode_map();
+  if (old_inodes)
+    *_old_inodes = *old_inodes;
+
+  mempool_old_inode &old = (*_old_inodes)[follows];
+  old.first = first;
+  old.inode = *pi;
+  if (px) {
+    dout(10) << " " << px->size() << " xattrs cowed, " << *px << dendl;
+    old.xattrs = *px;
+  }
+
+  if (first < oldest_snap)
+    oldest_snap = first;
+
+  old.inode.trim_client_ranges(follows);
+
+  // remember this snapid if the copied rstat differs from its accounted rstat
+  if (g_conf()->mds_snap_rstat &&
+      !(old.inode.rstat == old.inode.accounted_rstat))
+    dirty_old_rstats.insert(follows);
+  
+  first = follows+1;
+
+  dout(10) << __func__ << " " << (cow_head ? "head" : "previous_head" )
+	   << " to [" << old.first << "," << follows << "] on "
+	   << *this << dendl;
+
+  reset_old_inodes(std::move(_old_inodes));
+  return old;
+}
+
+// COW the head inode up to the global snaprealm's newest seq, unless
+// `first` is already past it.
+void CInode::pre_cow_old_inode()
+{
+  const snapid_t newest = mdcache->get_global_snaprealm()->get_newest_seq();
+  if (first > newest)
+    return;
+  cow_old_inode(newest, true);
+}
+
+// True if snapid falls in [first, last] or in the range covered by one of
+// the old_inodes.
+bool CInode::has_snap_data(snapid_t snapid)
+{
+  if (snapid >= first && snapid <= last)
+    return true;
+  if (!is_any_old_inodes())
+    return false;
+
+  // first old_inode whose key (range end) is >= snapid
+  auto it = old_inodes->lower_bound(snapid);
+  if (it == old_inodes->end())
+    return false;
+
+  // if that range starts after snapid, look at the preceding entry instead
+  if (it->second.first > snapid && it != old_inodes->begin())
+    --it;
+
+  return it->second.first <= snapid && snapid <= it->first;
+}
+
+/*
+ * Drop every old_inode whose range contains no snapid from snaps.
+ *
+ * An entry keyed by `id` covers [p.second.first, id]; it is purged when the
+ * first element of snaps that is >= p.second.first is missing or greater
+ * than `id`.  If all entries are purged the map is reset to an empty
+ * pointer; otherwise a pruned copy of the map is installed.
+ */
+void CInode::purge_stale_snap_data(const set<snapid_t>& snaps)
+{
+  dout(10) << __func__ << " " << snaps << dendl;
+
+  if (!get_old_inodes())
+    return;
+
+  std::vector<snapid_t> to_remove;
+  // iterate by reference: each element carries a full inode copy and xattrs
+  for (const auto& p : *get_old_inodes()) {
+    const snapid_t &id = p.first;
+    const auto s = snaps.lower_bound(p.second.first);
+    if (s == snaps.end() || *s > id) {
+      dout(10) << " purging old_inode [" << p.second.first << "," << id << "]" << dendl;
+      to_remove.push_back(id);
+    }
+  }
+
+  if (to_remove.size() == get_old_inodes()->size()) {
+    reset_old_inodes(old_inode_map_ptr());
+  } else if (!to_remove.empty()) {
+    auto _old_inodes = allocate_old_inode_map(*get_old_inodes());
+    for (auto id : to_remove)
+      _old_inodes->erase(id);
+    reset_old_inodes(std::move(_old_inodes));
+  }
+}
+
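+/*
+ * pick the old_inode whose snap range covers the given snap; returns its
+ * key (the last snapid of that range), or 0 if no old_inode covers it
+ */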
+snapid_t CInode::pick_old_inode(snapid_t snap) const
+{
+  if (is_any_old_inodes()) {
+    // candidate is the first entry whose key is >= snap
+    auto candidate = old_inodes->lower_bound(snap);
+    const bool covers =
+      candidate != old_inodes->end() && candidate->second.first <= snap;
+    if (covers) {
+      dout(10) << __func__ << " snap " << snap << " -> [" << candidate->second.first << "," << candidate->first << "]" << dendl;
+      return candidate->first;
+    }
+  }
+
+  dout(10) << __func__ << " snap " << snap << " -> nothing" << dendl;
+  return 0;
+}
+
+// Create this inode's SnapRealm if it does not have one yet and attach it
+// under the realm found by find_snaprealm().  Unless nosplit is set, the
+// parent realm is split at the new realm.
+void CInode::open_snaprealm(bool nosplit)
+{
+  if (snaprealm)
+    return;
+
+  // look up the enclosing realm before our own exists
+  SnapRealm *parent_realm = find_snaprealm();
+  snaprealm = new SnapRealm(mdcache, this);
+  if (!parent_realm)
+    return;
+
+  dout(10) << __func__ << " " << snaprealm
+           << " parent is " << parent_realm
+           << dendl;
+  dout(30) << " siblings are " << parent_realm->open_children << dendl;
+
+  snaprealm->parent = parent_realm;
+  if (!nosplit) {
+    parent_realm->split_at(snaprealm);
+  }
+  parent_realm->open_children.insert(snaprealm);
+}
+// Detach this inode's SnapRealm from its parent realm and delete it.
+// nojoin is currently not consulted (the join call is commented out).
+void CInode::close_snaprealm(bool nojoin)
+{
+  if (!snaprealm)
+    return;
+
+  dout(15) << __func__ << " " << *snaprealm << dendl;
+
+  SnapRealm *parent_realm = snaprealm->parent;
+  if (parent_realm) {
+    parent_realm->open_children.erase(snaprealm);
+    //if (!nojoin)
+    //snaprealm->parent->join(snaprealm);
+  }
+
+  delete snaprealm;
+  snaprealm = nullptr;
+}
+
+// Walk up through the oldest parent dentries, starting at this inode, and
+// return the first snaprealm found; null if the walk runs out of parents.
+SnapRealm *CInode::find_snaprealm() const
+{
+  for (const CInode *in = this; ; ) {
+    if (in->snaprealm)
+      return in->snaprealm;
+
+    const CDentry *dn = in->get_oldest_parent_dn();
+    if (!dn)
+      return nullptr;
+
+    in = dn->get_dir()->get_inode();
+  }
+}
+
+// Append the snaprealm's srnode to snapbl; leaves snapbl untouched when the
+// inode has no snaprealm.
+void CInode::encode_snap_blob(bufferlist &snapbl)
+{
+  if (!snaprealm)
+    return;
+
+  using ceph::encode;
+  encode(snaprealm->srnode, snapbl);
+  dout(20) << __func__ << " " << *snaprealm << dendl;
+}
+// Load the srnode from snapbl, opening the snaprealm if needed.  An empty
+// blob on a non-root, non-mdsdir inode that has a snaprealm merges that
+// realm away; this path asserts the MDS is in a replay state.
+void CInode::decode_snap_blob(const bufferlist& snapbl)
+{
+  using ceph::decode;
+
+  if (!snapbl.length()) {
+    if (snaprealm &&
+        !is_root() && !is_mdsdir()) { // see https://tracker.ceph.com/issues/42675
+      ceph_assert(mdcache->mds->is_any_replay());
+      snaprealm->merge_to(NULL);
+    }
+    return;
+  }
+
+  open_snaprealm();
+  auto prev_flags = snaprealm->srnode.flags;
+  auto p = snapbl.cbegin();
+  decode(snaprealm->srnode, p);
+
+  // re-parent when the PARENT_GLOBAL bit changed (never for base inodes)
+  if (!is_base() &&
+      ((snaprealm->srnode.flags ^ prev_flags) & sr_t::PARENT_GLOBAL)) {
+    snaprealm->adjust_parent();
+  }
+  dout(20) << __func__ << " " << *snaprealm << dendl;
+}
+
+// Encode the snap state into bl: the snap blob (empty when there is no
+// snaprealm) as a nested bufferlist, followed by oldest_snap.
+void CInode::encode_snap(bufferlist& bl)
+{
+  ENCODE_START(1, 1, bl);
+  bufferlist snapbl;
+  encode_snap_blob(snapbl);
+  encode(snapbl, bl);
+  encode(oldest_snap, bl);
+  ENCODE_FINISH(bl);
+}
+
+// Decode what encode_snap() wrote: the nested snap blob, then oldest_snap.
+void CInode::decode_snap(bufferlist::const_iterator& p)
+{
+  DECODE_START(1, p);
+  bufferlist snapbl;
+  decode(snapbl, p);
+  decode(oldest_snap, p);
+  // the blob is applied only after oldest_snap has been read
+  decode_snap_blob(snapbl);
+  DECODE_FINISH(p);
+}
+
+// =============================================
+
+// Pick the single client that qualifies as loner, or -1 if there is none
+// or more than one.  Always -1 when the cache is read-only or any MDS has
+// caps wanted on this inode.
+client_t CInode::calc_ideal_loner()
+{
+  if (mdcache->is_readonly())
+    return -1;
+  if (!get_mds_caps_wanted().empty())
+    return -1;
+
+  bool have_candidate = false;
+  client_t candidate = -1;
+  for (const auto &entry : client_caps) {
+    const auto &cap = entry.second;
+    if (cap.is_stale())
+      continue;
+
+    // dirs qualify unless they have a subtree/exporting dirfrag; other
+    // inodes qualify when the client wants write or file-read caps
+    bool eligible;
+    if (is_dir()) {
+      eligible = !has_subtree_or_exporting_dirfrag();
+    } else {
+      eligible = (cap.wanted() & (CEPH_CAP_ANY_WR|CEPH_CAP_FILE_RD));
+    }
+    if (!eligible)
+      continue;
+
+    // a second qualifying client means there is no loner
+    if (have_candidate)
+      return -1;
+    have_candidate = true;
+    candidate = entry.first;
+  }
+  return candidate;
+}
+
+/*
+ * Recompute want_loner_cap and move loner_cap toward it.
+ *
+ * Returns true if loner_cap changed.  Returns false if nothing changed or
+ * if the current loner could not be dropped.
+ */
+bool CInode::choose_ideal_loner()
+{
+  want_loner_cap = calc_ideal_loner();
+  bool changed = false;
+  if (loner_cap >= 0 && loner_cap != want_loner_cap) {
+    // the existing loner has to go before a new one can be set
+    if (!try_drop_loner())
+      return false;
+    changed = true;
+  }
+
+  if (want_loner_cap >= 0) {
+    if (loner_cap < 0) {
+      set_loner_cap(want_loner_cap);
+      changed = true;
+    } else
+      ceph_assert(loner_cap == want_loner_cap);
+  }
+  return changed;
+}
+
+// Make want_loner_cap the loner unless a different loner is already set.
+// Returns whether the loner is want_loner_cap afterwards.
+bool CInode::try_set_loner()
+{
+  ceph_assert(want_loner_cap >= 0);
+
+  const bool other_loner_set = loner_cap >= 0 && loner_cap != want_loner_cap;
+  if (other_loner_set)
+    return false;
+
+  set_loner_cap(want_loner_cap);
+  return true;
+}
+
+// Record l as the loner and set it as the excl client of the auth, file,
+// link and xattr locks.
+void CInode::set_loner_cap(client_t l)
+{
+  loner_cap = l;
+
+  SimpleLock *const excl_locks[] = {&authlock, &filelock, &linklock, &xattrlock};
+  for (SimpleLock *lk : excl_locks)
+    lk->set_excl_client(loner_cap);
+}
+
+// Clear the loner if that client has no cap or holds nothing beyond what
+// any client is allowed.  Returns true if there is no loner afterwards.
+bool CInode::try_drop_loner()
+{
+  if (loner_cap < 0)
+    return true;
+
+  const int other_allowed = get_caps_allowed_by_type(CAP_ANY);
+  Capability *cap = get_client_cap(loner_cap);
+
+  const bool holds_extra = cap && (cap->issued() & ~other_allowed) != 0;
+  if (holds_extra)
+    return false;
+
+  set_loner_cap(-1);
+  return true;
+}
+
+
+// choose new lock state during recovery, based on issued caps
+// Pick a state for lock from the caps in allissued.  Only an auth inode
+// changes the state, and only when the lock is neither xlocked nor in MIX.
+void CInode::choose_lock_state(SimpleLock *lock, int allissued)
+{
+  const int shift = lock->get_cap_shift();
+  const int issued = (allissued >> shift) & lock->get_cap_mask();
+
+  if (!is_auth()) {
+    // our states have already been chosen during rejoin.
+    if (lock->is_xlocked())
+      ceph_assert(lock->get_state() == LOCK_LOCK);
+    return;
+  }
+
+  if (lock->is_xlocked()) {
+    // do nothing here
+    return;
+  }
+  if (lock->get_state() == LOCK_MIX)
+    return;
+
+  if (issued & (CEPH_CAP_GEXCL | CEPH_CAP_GBUFFER)) {
+    lock->set_state(LOCK_EXCL);
+    return;
+  }
+  if (issued & CEPH_CAP_GWR) {
+    const bool cache_or_shared = issued & (CEPH_CAP_GCACHE | CEPH_CAP_GSHARED);
+    lock->set_state(cache_or_shared ? LOCK_EXCL : LOCK_MIX);
+    return;
+  }
+  if (lock->is_dirty()) {
+    lock->set_state(is_replicated() ? LOCK_MIX : LOCK_LOCK);
+    return;
+  }
+  lock->set_state(LOCK_SYNC);
+}
+ 
+// Choose states for the file, nest, dirfragtree, auth, xattr and link
+// locks from the issued caps plus dirty_caps.  An auth inode with any
+// EXCL/WR cap outstanding re-evaluates its loner first.
+void CInode::choose_lock_states(int dirty_caps)
+{
+  const int issued = get_caps_issued() | dirty_caps;
+
+  const bool has_excl_or_wr = issued & (CEPH_CAP_ANY_EXCL|CEPH_CAP_ANY_WR);
+  if (is_auth() && has_excl_or_wr) {
+    choose_ideal_loner();
+  }
+
+  SimpleLock *const locks[] = {
+    &filelock, &nestlock, &dirfragtreelock, &authlock, &xattrlock, &linklock
+  };
+  for (SimpleLock *lk : locks)
+    choose_lock_state(lk, issued);
+}
+
+// Number of client caps that are not stale.
+int CInode::count_nonstale_caps()
+{
+  int live = 0;
+  for (auto it = client_caps.begin(); it != client_caps.end(); ++it) {
+    if (it->second.is_stale())
+      continue;
+    ++live;
+  }
+  return live;
+}
+
+// True as soon as a second non-stale client cap is found.
+bool CInode::multiple_nonstale_caps()
+{
+  bool seen_one = false;
+  for (const auto &entry : client_caps) {
+    if (entry.second.is_stale())
+      continue;
+    if (seen_one)
+      return true;
+    seen_one = true;
+  }
+  return false;
+}
+
+// Swap mds_caps_wanted with m (m receives the old contents) and adjust the
+// notable-caps count if the map went from empty to non-empty or back.
+void CInode::set_mds_caps_wanted(mempool::mds_co::compact_map<int32_t,int32_t>& m)
+{
+  const bool was_empty = mds_caps_wanted.empty();
+  mds_caps_wanted.swap(m);
+  const bool now_empty = mds_caps_wanted.empty();
+
+  if (was_empty == now_empty)
+    return;
+
+  adjust_num_caps_notable(was_empty ? 1 : -1);
+}
+
+// Set or, when wanted is 0, remove the caps wanted by one MDS rank, and
+// adjust the notable-caps count when the map becomes non-empty or empty.
+void CInode::set_mds_caps_wanted(mds_rank_t mds, int32_t wanted)
+{
+  const bool was_empty = mds_caps_wanted.empty();
+
+  if (wanted) {
+    mds_caps_wanted[mds] = wanted;
+    if (was_empty) {
+      adjust_num_caps_notable(1);
+    }
+    return;
+  }
+
+  // nothing to remove from an empty map
+  if (was_empty)
+    return;
+
+  mds_caps_wanted.erase(mds);
+  if (mds_caps_wanted.empty()) {
+    adjust_num_caps_notable(-1);
+  }
+}
+
+// Create a Capability for client on this head inode and return it.  The
+// first cap on the inode pins it and joins it to a snap realm: conrealm if
+// given, otherwise the one found by find_snaprealm().
+Capability *CInode::add_client_cap(client_t client, Session *session,
+                                   SnapRealm *conrealm, bool new_inode)
+{
+  ceph_assert(last == CEPH_NOSNAP);
+
+  const bool first_cap = client_caps.empty();
+  if (first_cap) {
+    get(PIN_CAPS);
+    containing_realm = conrealm ? conrealm : find_snaprealm();
+    containing_realm->inodes_with_caps.push_back(&item_caps);
+    dout(10) << __func__ << " first cap, joining realm " << *containing_realm << dendl;
+
+    ++mdcache->num_inodes_with_caps;
+    if (parent) {
+      parent->dir->adjust_num_inodes_with_caps(1);
+    }
+  }
+
+  // a new inode gets cap id 1; otherwise take the next id from the cache
+  uint64_t cap_id;
+  if (new_inode) {
+    cap_id = 1;
+  } else {
+    cap_id = ++mdcache->last_cap_id;
+  }
+
+  auto ret = client_caps.emplace(std::piecewise_construct,
+                                 std::forward_as_tuple(client),
+                                 std::forward_as_tuple(this, session, cap_id));
+  ceph_assert(ret.second == true);
+
+  Capability *new_cap = &ret.first->second;
+  new_cap->client_follows = first-1;
+  containing_realm->add_cap(client, new_cap);
+
+  return new_cap;
+}
+
+/**
+ * Drop the capability held by 'client' (which must exist; asserted).
+ *
+ * The cap is unlinked from its session/revoking lists and from the containing
+ * realm, the loner is cleared if it was this client, and when the last cap
+ * goes away the inode leaves its realm and releases PIN_CAPS.  Finally any
+ * fcntl/flock advisory locks owned by the client are removed and, if some
+ * were, WAIT_FLOCK waiters are queued.
+ */
+void CInode::remove_client_cap(client_t client)
+{
+  auto pos = client_caps.find(client);
+  ceph_assert(pos != client_caps.end());
+  Capability *victim = &pos->second;
+
+  victim->item_session_caps.remove_myself();
+  victim->item_revoking_caps.remove_myself();
+  victim->item_client_revoking_caps.remove_myself();
+  containing_realm->remove_cap(client, victim);
+
+  if (client == loner_cap)
+    loner_cap = -1;
+
+  if (victim->is_wanted_notable())
+    adjust_num_caps_notable(-1);
+
+  client_caps.erase(pos);
+
+  const bool was_last = client_caps.empty();
+  if (was_last) {
+    dout(10) << __func__ << " last cap, leaving realm " << *containing_realm << dendl;
+    put(PIN_CAPS);
+    item_caps.remove_myself();
+    containing_realm = nullptr;
+    mdcache->num_inodes_with_caps--;
+    if (parent)
+      parent->dir->adjust_num_inodes_with_caps(-1);
+  }
+
+  // clean up advisory locks
+  bool fcntl_removed = false;
+  if (fcntl_locks)
+    fcntl_removed = fcntl_locks->remove_all_from(client);
+  bool flock_removed = false;
+  if (flock_locks)
+    flock_removed = flock_locks->remove_all_from(client);
+
+  if (!fcntl_removed && !flock_removed)
+    return;
+
+  MDSContext::vec waiters;
+  take_waiting(CInode::WAIT_FLOCK, waiters);
+  mdcache->mds->queue_waiters(waiters);
+}
+
+// Re-home this inode and all of its client caps from the current containing
+// realm to 'realm'.
+void CInode::move_to_realm(SnapRealm *realm)
+{
+  dout(10) << __func__ << " joining realm " << *realm
+	   << ", leaving realm " << *containing_realm << dendl;
+
+  for (auto& entry : client_caps) {
+    Capability *cap = &entry.second;
+    containing_realm->remove_cap(entry.first, cap);
+    realm->add_cap(entry.first, cap);
+  }
+
+  item_caps.remove_myself();
+  realm->inodes_with_caps.push_back(&item_caps);
+  containing_realm = realm;
+}
+
+// Re-establish a client's capability from the reconnect record 'icr'.
+// If no cap exists yet one is created and seeded with the reconnect's
+// cap_id/wanted/issued; an existing cap is merged with the reconnect's
+// wanted/issued.  Either way the last-issue stamp is refreshed.
+Capability *CInode::reconnect_cap(client_t client, const cap_reconnect_t& icr, Session *session)
+{
+  const auto& info = icr.capinfo;
+
+  Capability *cap = get_client_cap(client);
+  if (!cap) {
+    cap = add_client_cap(client, session);
+    cap->set_cap_id(info.cap_id);
+    cap->set_wanted(info.wanted);
+    cap->issue_norevoke(info.issued);
+    cap->reset_seq();
+  } else {
+    // FIXME?
+    cap->merge(info.wanted, info.issued);
+  }
+
+  cap->set_last_issue_stamp(ceph_clock_now());
+  return cap;
+}
+
+// Remove every client cap, reset both loner fields, and empty the
+// mds_caps_wanted map if it has entries.
+void CInode::clear_client_caps_after_export()
+{
+  for (auto it = client_caps.begin();
+       it != client_caps.end();
+       it = client_caps.begin()) {
+    remove_client_cap(it->first);
+  }
+
+  loner_cap = -1;
+  want_loner_cap = -1;
+
+  if (get_mds_caps_wanted().empty())
+    return;
+
+  mempool::mds_co::compact_map<int32_t,int32_t> nothing;
+  set_mds_caps_wanted(nothing);
+}
+
+// Store an export record (Capability::make_export()) for every client cap
+// into 'cl', keyed by client.
+void CInode::export_client_caps(map<client_t,Capability::Export>& cl)
+{
+  for (const auto &entry : client_caps) {
+    const client_t &who = entry.first;
+    cl[who] = entry.second.make_export();
+  }
+}
+
+  // caps allowed
+// Return the "liked" cap mask for this inode; it depends only on whether the
+// inode is a directory.
+int CInode::get_caps_liked() const
+{
+  if (!is_dir())
+    return CEPH_CAP_ANY & ~CEPH_CAP_FILE_LAZYIO;
+
+  // directories: but not, say, FILE_RD|WR|WRBUFFER
+  return CEPH_CAP_PIN | CEPH_CAP_ANY_EXCL | CEPH_CAP_ANY_SHARED;
+}
+
+// Return the caps that could ever be allowed on this inode: the per-type
+// mask (directory vs. everything else) intersected with CEPH_CAP_PIN plus
+// what the file/auth/xattr/link locks report via gcaps_allowed_ever().
+int CInode::get_caps_allowed_ever() const
+{
+  const int by_type = is_dir()
+    ? (CEPH_CAP_PIN | CEPH_CAP_ANY_EXCL | CEPH_CAP_ANY_SHARED)
+    : CEPH_CAP_ANY;
+
+  int by_locks = CEPH_CAP_PIN;
+  by_locks |= filelock.gcaps_allowed_ever() << filelock.get_cap_shift();
+  by_locks |= authlock.gcaps_allowed_ever() << authlock.get_cap_shift();
+  by_locks |= xattrlock.gcaps_allowed_ever() << xattrlock.get_cap_shift();
+  by_locks |= linklock.gcaps_allowed_ever() << linklock.get_cap_shift();
+
+  return by_type & by_locks;
+}
+
+// Return CEPH_CAP_PIN plus the caps each of the file/auth/xattr/link locks
+// allows for the given holder 'type' (callers in this file pass CAP_ANY,
+// CAP_LONER or CAP_XLOCKER), each shifted into that lock's cap position.
+int CInode::get_caps_allowed_by_type(int type) const
+{
+  int caps = CEPH_CAP_PIN;
+  caps |= filelock.gcaps_allowed(type) << filelock.get_cap_shift();
+  caps |= authlock.gcaps_allowed(type) << authlock.get_cap_shift();
+  caps |= xattrlock.gcaps_allowed(type) << xattrlock.get_cap_shift();
+  caps |= linklock.gcaps_allowed(type) << linklock.get_cap_shift();
+  return caps;
+}
+
+// Combine gcaps_careful() of the file/auth/xattr/link locks into one mask,
+// each shifted into that lock's cap position.
+int CInode::get_caps_careful() const
+{
+  int caps = 0;
+  caps |= filelock.gcaps_careful() << filelock.get_cap_shift();
+  caps |= authlock.gcaps_careful() << authlock.get_cap_shift();
+  caps |= xattrlock.gcaps_careful() << xattrlock.get_cap_shift();
+  caps |= linklock.gcaps_careful() << linklock.get_cap_shift();
+  return caps;
+}
+
+// Combine gcaps_xlocker_mask(client) of the file/auth/xattr/link locks into
+// one mask, each shifted into that lock's cap position.
+int CInode::get_xlocker_mask(client_t client) const
+{
+  int mask = 0;
+  mask |= filelock.gcaps_xlocker_mask(client) << filelock.get_cap_shift();
+  mask |= authlock.gcaps_xlocker_mask(client) << authlock.get_cap_shift();
+  mask |= xattrlock.gcaps_xlocker_mask(client) << xattrlock.get_cap_shift();
+  mask |= linklock.gcaps_xlocker_mask(client) << linklock.get_cap_shift();
+  return mask;
+}
+
+/**
+ * Compute the caps the given client may currently hold on this inode.
+ *
+ * @param session  client session (client id; its connection's feature bits
+ *                 are consulted when 'cap' is null)
+ * @param cap      the client's capability, or nullptr
+ * @param file_i   inode version whose inline_data/layout are checked; only
+ *                 dereferenced for non-directories
+ * @return the allowed cap mask
+ */
+int CInode::get_caps_allowed_for_client(Session *session, Capability *cap,
+					const mempool_inode *file_i) const
+{
+  const client_t client = session->get_client();
+
+  int allowed;
+  if (client != get_loner()) {
+    allowed = get_caps_allowed_by_type(CAP_ANY);
+  } else {
+    // as the loner, we get the loner_caps AND any xlocker_caps for things we have xlocked
+    const int as_loner = get_caps_allowed_by_type(CAP_LONER);
+    const int as_xlocker = get_caps_allowed_by_type(CAP_XLOCKER) & get_xlocker_mask(client);
+    allowed = as_loner | as_xlocker;
+  }
+
+  if (is_dir()) {
+    allowed &= ~CEPH_CAP_ANY_DIR_OPS;
+    if (cap && (allowed & CEPH_CAP_FILE_EXCL))
+      allowed |= cap->get_lock_cache_allowed();
+    return allowed;
+  }
+
+  const bool has_inline = file_i->inline_data.version != CEPH_INLINE_NONE;
+  const bool has_pool_ns = !file_i->layout.pool_ns.empty();
+  if (has_inline || has_pool_ns) {
+    // The file uses inline data and/or a pool namespace: strip FILE_RD|WR
+    // when the cap (or, without a cap, the connection) cannot handle the
+    // feature in use.
+    bool unsupported;
+    if (cap) {
+      unsupported = (has_inline && cap->is_noinline()) ||
+		    (has_pool_ns && cap->is_nopoolns());
+    } else {
+      auto& conn = session->get_connection();
+      unsupported =
+	(has_inline && !conn->has_feature(CEPH_FEATURE_MDS_INLINE_DATA)) ||
+	(has_pool_ns && !conn->has_feature(CEPH_FEATURE_FS_FILE_LAYOUT_V2));
+    }
+    if (unsupported)
+      allowed &= ~(CEPH_CAP_FILE_RD | CEPH_CAP_FILE_WR);
+  }
+  return allowed;
+}
+
+// caps issued, wanted
+/**
+ * OR together the caps issued to all clients and return them, shifted right
+ * by 'shift' and masked with 'mask'.
+ *
+ * Optional outputs (each may be null) receive the same shifted/masked view of
+ * the caps issued to the loner (*ploner), to everybody else (*pother), and of
+ * issued caps that fall inside the holder's xlocker mask (*pxlocker).
+ *
+ * Note: resets loner_cap to -1 when this inode is not auth.
+ */
+int CInode::get_caps_issued(int *ploner, int *pother, int *pxlocker,
+			    int shift, int mask)
+{
+  if (!is_auth())
+    loner_cap = -1;
+
+  int all = 0;
+  int from_loner = 0;
+  int from_others = 0;
+  int from_xlockers = 0;
+
+  for (const auto &entry : client_caps) {
+    const int held = entry.second.issued();
+    all |= held;
+    if (entry.first == loner_cap) {
+      from_loner |= held;
+    } else {
+      from_others |= held;
+    }
+    from_xlockers |= get_xlocker_mask(entry.first) & held;
+  }
+
+  if (ploner)
+    *ploner = (from_loner >> shift) & mask;
+  if (pother)
+    *pother = (from_others >> shift) & mask;
+  if (pxlocker)
+    *pxlocker = (from_xlockers >> shift) & mask;
+  return (all >> shift) & mask;
+}
+
+// Return true if any client cap on this inode has a non-zero wanted mask.
+bool CInode::is_any_caps_wanted() const
+{
+  bool any = false;
+  for (const auto &entry : client_caps) {
+    if (entry.second.wanted()) {
+      any = true;
+      break;
+    }
+  }
+  return any;
+}
+
+/**
+ * OR together the caps wanted by all non-stale client caps and, when this
+ * inode is auth, by the entries of mds_caps_wanted; return the result shifted
+ * right by 'shift' and masked with 'mask'.
+ *
+ * Optional outputs (each may be null) receive the same shifted/masked view of
+ * what the loner wants (*ploner) and what everybody else, including the
+ * mds_caps_wanted entries, wants (*pother).
+ */
+int CInode::get_caps_wanted(int *ploner, int *pother, int shift, int mask) const
+{
+  int all = 0;
+  int from_loner = 0;
+  int from_others = 0;
+
+  for (const auto &entry : client_caps) {
+    if (entry.second.is_stale())
+      continue;
+    const int want = entry.second.wanted();
+    all |= want;
+    if (entry.first == loner_cap) {
+      from_loner |= want;
+    } else {
+      from_others |= want;
+    }
+  }
+
+  if (is_auth()) {
+    for (const auto &entry : mds_caps_wanted) {
+      all |= entry.second;
+      from_others |= entry.second;
+    }
+  }
+
+  if (ploner)
+    *ploner = (from_loner >> shift) & mask;
+  if (pother)
+    *pother = (from_others >> shift) & mask;
+  return (all >> shift) & mask;
+}
+
+// Return true if, for the cap bits governed by 'lock', any holder class
+// (loner, others, xlockers) has been issued bits that the lock does not
+// currently allow for that class.
+bool CInode::issued_caps_need_gather(SimpleLock *lock)
+{
+  int by_loner, by_others, by_xlockers;
+  get_caps_issued(&by_loner, &by_others, &by_xlockers,
+		  lock->get_cap_shift(), lock->get_cap_mask());
+
+  const bool excess =
+    (by_loner & ~lock->gcaps_allowed(CAP_LONER)) ||
+    (by_others & ~lock->gcaps_allowed(CAP_ANY)) ||
+    (by_xlockers & ~lock->gcaps_allowed(CAP_XLOCKER));
+  return excess;
+}
+
+// Add 'd' to num_caps_notable (the result must stay >= 0; asserted).
+// Unless the inode is client-writeable, it is added to the open file table
+// when the count leaves zero and removed when the count returns to zero.
+void CInode::adjust_num_caps_notable(int d)
+{
+  if (!is_clientwriteable()) {
+    const bool leaves_zero = (num_caps_notable == 0) && (d > 0);
+    const bool returns_to_zero = (num_caps_notable > 0) && (num_caps_notable == -d);
+    if (leaves_zero) {
+      mdcache->open_file_table.add_inode(this);
+    } else if (returns_to_zero) {
+      mdcache->open_file_table.remove_inode(this);
+    }
+  }
+
+  num_caps_notable += d;
+  ceph_assert(num_caps_notable >= 0);
+}
+
+// Set STATE_CLIENTWRITEABLE on a head inode (last == CEPH_NOSNAP) if it is
+// not already set.  When there are no notable caps, the inode is added to
+// the open file table first.
+void CInode::mark_clientwriteable()
+{
+  if (last != CEPH_NOSNAP || state_test(STATE_CLIENTWRITEABLE))
+    return;
+
+  if (num_caps_notable == 0)
+    mdcache->open_file_table.add_inode(this);
+  state_set(STATE_CLIENTWRITEABLE);
+}
+
+// Clear STATE_CLIENTWRITEABLE if it is set.  When there are no notable caps,
+// the inode is removed from the open file table first.
+void CInode::clear_clientwriteable()
+{
+  if (!state_test(STATE_CLIENTWRITEABLE))
+    return;
+
+  if (num_caps_notable == 0)
+    mdcache->open_file_table.remove_inode(this);
+  state_clear(STATE_CLIENTWRITEABLE);
+}
+
+// =============================================
+
+/**
+ * Encode this inode's stat information, together with the capability issued
+ * to the requesting client, into 'bl' (layout matches MClientReply::InodeStat).
+ *
+ * For the head inode (snapid == CEPH_NOSNAP) this may create a new client
+ * capability and issue caps on it; for snapped inodes the cap state is not
+ * adjusted.
+ *
+ * @param bl           output buffer the encoding is appended to
+ * @param session      requesting client's session (client id, stale state,
+ *                     feature bits that select the encoding)
+ * @param dir_realm    if non-null and different from this inode's realm, no
+ *                     caps are issued
+ * @param snapid       snapshot being stat'ed (asserted non-zero);
+ *                     CEPH_NOSNAP selects the head inode
+ * @param max_bytes    if non-zero, upper bound on the estimated encoded size
+ * @param getattr_caps caps requested by the client; used to decide whether
+ *                     inline data and xattrs are included
+ * @return -CEPHFS_ENOSPC if the size estimate exceeds max_bytes; otherwise
+ *         the 'valid' flag (false only when a snapped inode is requested and
+ *         this MDS is not auth)
+ */
+int CInode::encode_inodestat(bufferlist& bl, Session *session,
+			     SnapRealm *dir_realm,
+			     snapid_t snapid,
+			     unsigned max_bytes,
+			     int getattr_caps)
+{
+  client_t client = session->get_client();
+  ceph_assert(snapid);
+
+  bool valid = true;
+
+  // pick a version!
+  const mempool_inode *oi = get_inode().get();
+  const mempool_inode *pi = get_projected_inode().get();
+
+  const mempool_xattr_map *pxattrs = nullptr;
+
+  if (snapid != CEPH_NOSNAP) {
+
+    // for now at least, old_inodes is only defined/valid on the auth
+    if (!is_auth())
+      valid = false;
+
+    if (is_any_old_inodes()) {
+      auto it = old_inodes->lower_bound(snapid);
+      if (it != old_inodes->end()) {
+	if (it->second.first > snapid) {
+	  if  (it != old_inodes->begin())
+	    --it;
+	}
+	if (it->second.first <= snapid && snapid <= it->first) {
+	  dout(15) << __func__ << " snapid " << snapid
+		   << " to old_inode [" << it->second.first << "," << it->first << "]"
+		   << " " << it->second.inode.rstat
+		   << dendl;
+	  pi = oi = &it->second.inode;
+	  pxattrs = &it->second.xattrs;
+	} else {
+	  // a snapshotted remote dentry can lead to this
+	  dout(0) << __func__ << " old_inode for snapid " << snapid
+		  << " not found" << dendl;
+	}
+      }
+    } else if (snapid < first || snapid > last) {
+      // a snapshotted remote dentry can lead to this
+      dout(0) << __func__ << " [" << first << "," << last << "]"
+	      << " not match snapid " << snapid << dendl;
+    }
+  }
+
+  utime_t snap_btime;
+  std::map<std::string, std::string> snap_metadata;
+  SnapRealm *realm = find_snaprealm();
+  if (snapid != CEPH_NOSNAP && realm) {
+    // add snapshot timestamp vxattr
+    map<snapid_t,const SnapInfo*> infomap;
+    realm->get_snap_info(infomap,
+                         snapid,  // min
+                         snapid); // max
+    if (!infomap.empty()) {
+      ceph_assert(infomap.size() == 1);
+      const SnapInfo *si = infomap.begin()->second;
+      snap_btime = si->stamp;
+      snap_metadata = si->metadata;
+    }
+  }
+
+
+  // conditions under which no caps are issued to the client
+  bool no_caps = !valid ||
+		 session->is_stale() ||
+		 (dir_realm && realm != dir_realm) ||
+		 is_frozen() ||
+		 state_test(CInode::STATE_EXPORTINGCAPS);
+  if (no_caps)
+    dout(20) << __func__ << " no caps"
+	     << (!valid?", !valid":"")
+	     << (session->is_stale()?", session stale ":"")
+	     << ((dir_realm && realm != dir_realm)?", snaprealm differs ":"")
+	     << (is_frozen()?", frozen inode":"")
+	     << (state_test(CInode::STATE_EXPORTINGCAPS)?", exporting caps":"")
+	     << dendl;
+
+
+  // "fake" a version that is old (stable) version, +1 if projected.
+  version_t version = (oi->version * 2) + is_projected();
+
+  // For each lock-governed group of fields, the client sees the projected
+  // inode (pi) if it holds the xlock or is the loner; otherwise the stable
+  // inode (oi).
+  Capability *cap = get_client_cap(client);
+  bool pfile = filelock.is_xlocked_by_client(client) || get_loner() == client;
+  //(cap && (cap->issued() & CEPH_CAP_FILE_EXCL));
+  bool pauth = authlock.is_xlocked_by_client(client) || get_loner() == client;
+  bool plink = linklock.is_xlocked_by_client(client) || get_loner() == client;
+  bool pxattr = xattrlock.is_xlocked_by_client(client) || get_loner() == client;
+
+  bool plocal = versionlock.get_last_wrlock_client() == client;
+  bool ppolicy = policylock.is_xlocked_by_client(client) || get_loner()==client;
+
+  const mempool_inode *any_i = (pfile|pauth|plink|pxattr|plocal) ? pi : oi;
+
+  dout(20) << " pfile " << pfile << " pauth " << pauth
+	   << " plink " << plink << " pxattr " << pxattr
+	   << " plocal " << plocal
+	   << " mtime " << any_i->mtime
+	   << " ctime " << any_i->ctime
+	   << " change_attr " << any_i->change_attr
+	   << " valid=" << valid << dendl;
+
+  // file
+  const mempool_inode *file_i = pfile ? pi:oi;
+  file_layout_t layout;
+  if (is_dir()) {
+    layout = (ppolicy ? pi : oi)->layout;
+  } else {
+    layout = file_i->layout;
+  }
+
+  // max_size is min of projected, actual
+  uint64_t max_size =
+    std::min(oi->get_client_range(client),
+	     pi->get_client_range(client));
+
+  // inline data
+  version_t inline_version = 0;
+  bufferlist inline_data;
+  if (file_i->inline_data.version == CEPH_INLINE_NONE) {
+    inline_version = CEPH_INLINE_NONE;
+  } else if ((!cap && !no_caps) ||
+	     (cap && cap->client_inline_version < file_i->inline_data.version) ||
+	     (getattr_caps & CEPH_CAP_FILE_RD)) { // client requests inline data
+    inline_version = file_i->inline_data.version;
+    if (file_i->inline_data.length() > 0)
+      file_i->inline_data.get_data(inline_data);
+  }
+
+  // nest (do same as file... :/)
+  if (cap) {
+    cap->last_rbytes = file_i->rstat.rbytes;
+    cap->last_rsize = file_i->rstat.rsize();
+  }
+
+  // auth
+  const mempool_inode *auth_i = pauth ? pi:oi;
+
+  // link
+  const mempool_inode *link_i = plink ? pi:oi;
+
+  // xattr
+  const mempool_inode *xattr_i = pxattr ? pi:oi;
+
+  using ceph::encode;
+  // xattr
+  version_t xattr_version;
+  if ((!cap && !no_caps) ||
+      (cap && cap->client_xattr_version < xattr_i->xattr_version) ||
+      (getattr_caps & CEPH_CAP_XATTR_SHARED)) { // client requests xattrs
+    if (!pxattrs)
+      pxattrs = pxattr ? get_projected_xattrs().get() : get_xattrs().get();
+    xattr_version = xattr_i->xattr_version;
+  } else {
+    xattr_version = 0;
+  }
+
+  // do we have room?
+  if (max_bytes) {
+    unsigned bytes =
+      8 + 8 + 4 + 8 + 8 + sizeof(ceph_mds_reply_cap) +
+      sizeof(struct ceph_file_layout) +
+      sizeof(struct ceph_timespec) * 3 + 4 + // ctime ~ time_warp_seq
+      8 + 8 + 8 + 4 + 4 + 4 + 4 + 4 + // size ~ nlink
+      8 + 8 + 8 + 8 + 8 + sizeof(struct ceph_timespec) + // dirstat.nfiles ~ rstat.rctime
+      sizeof(__u32) + sizeof(__u32) * 2 * dirfragtree._splits.size() + // dirfragtree
+      sizeof(__u32) + symlink.length() + // symlink
+      sizeof(struct ceph_dir_layout); // dir_layout
+
+    if (xattr_version) {
+      bytes += sizeof(__u32) + sizeof(__u32); // xattr buffer len + number entries
+      if (pxattrs) {
+	for (const auto &p : *pxattrs)
+	  bytes += sizeof(__u32) * 2 + p.first.length() + p.second.length();
+      }
+    } else {
+      bytes += sizeof(__u32); // xattr buffer len
+    }
+    bytes +=
+      sizeof(version_t) + sizeof(__u32) + inline_data.length() + // inline data
+      1 + 1 + 8 + 8 + 4 + // quota
+      4 + layout.pool_ns.size() + // pool ns
+      sizeof(struct ceph_timespec) + 8; // btime + change_attr
+
+    if (bytes > max_bytes)
+      return -CEPHFS_ENOSPC;
+  }
+
+
+  // encode caps
+  struct ceph_mds_reply_cap ecap;
+  if (snapid != CEPH_NOSNAP) {
+    /*
+     * snapped inodes (files or dirs) only get read-only caps.  always
+     * issue everything possible, since it is read only.
+     *
+     * if a snapped inode has caps, limit issued caps based on the
+     * lock state.
+     *
+     * if it is a live inode, limit issued caps based on the lock
+     * state.
+     *
+     * do NOT adjust cap issued state, because the client always
+     * tracks caps per-snap and the mds does either per-interval or
+     * multiversion.
+     */
+    ecap.caps = valid ? get_caps_allowed_by_type(CAP_ANY) : CEPH_STAT_CAP_INODE;
+    if (last == CEPH_NOSNAP || is_any_caps())
+      ecap.caps = ecap.caps & get_caps_allowed_for_client(session, nullptr, file_i);
+    // No capability object backs a snapped reply, so there is no cap id or
+    // wanted mask to report.  Zero them explicitly: the whole struct is
+    // encoded below and these fields would otherwise carry uninitialized
+    // stack bytes onto the wire.
+    ecap.cap_id = 0;
+    ecap.wanted = 0;
+    ecap.seq = 0;
+    ecap.mseq = 0;
+    ecap.realm = 0;
+  } else {
+    if (!no_caps && !cap) {
+      // add a new cap
+      cap = add_client_cap(client, session, realm);
+      if (is_auth())
+	choose_ideal_loner();
+    }
+
+    int issue = 0;
+    if (!no_caps && cap) {
+      int likes = get_caps_liked();
+      int allowed = get_caps_allowed_for_client(session, cap, file_i);
+      issue = (cap->wanted() | likes) & allowed;
+      cap->issue_norevoke(issue, true);
+      issue = cap->pending();
+      dout(10) << "encode_inodestat issuing " << ccap_string(issue)
+	       << " seq " << cap->get_last_seq() << dendl;
+    } else if (cap && cap->is_new() && !dir_realm) {
+      // always issue new caps to client, otherwise the caps get lost
+      ceph_assert(cap->is_stale());
+      ceph_assert(!cap->pending());
+      issue = CEPH_CAP_PIN;
+      cap->issue_norevoke(issue, true);
+      dout(10) << "encode_inodestat issuing " << ccap_string(issue)
+	       << " seq " << cap->get_last_seq()
+	       << "(stale&new caps)" << dendl;
+    }
+
+    if (issue) {
+      cap->set_last_issue();
+      cap->set_last_issue_stamp(ceph_clock_now());
+      ecap.caps = issue;
+      ecap.wanted = cap->wanted();
+      ecap.cap_id = cap->get_cap_id();
+      ecap.seq = cap->get_last_seq();
+      ecap.mseq = cap->get_mseq();
+      ecap.realm = realm->inode->ino();
+    } else {
+      ecap.cap_id = 0;
+      ecap.caps = 0;
+      ecap.seq = 0;
+      ecap.mseq = 0;
+      ecap.realm = 0;
+      ecap.wanted = 0;
+    }
+  }
+  ecap.flags = is_auth() ? CEPH_CAP_FLAG_AUTH : 0;
+  dout(10) << "encode_inodestat caps " << ccap_string(ecap.caps)
+	   << " seq " << ecap.seq << " mseq " << ecap.mseq
+	   << " xattrv " << xattr_version << dendl;
+
+  // include the inline data?  only if the client ends up with (or asked for)
+  // FILE_SHARED; otherwise drop it again.
+  if (inline_data.length() && cap) {
+    if ((cap->pending() | getattr_caps) & CEPH_CAP_FILE_SHARED) {
+      dout(10) << "including inline version " << inline_version << dendl;
+      cap->client_inline_version = inline_version;
+    } else {
+      dout(10) << "dropping inline version " << inline_version << dendl;
+      inline_version = 0;
+      inline_data.clear();
+    }
+  }
+
+  // include those xattrs?
+  if (xattr_version && cap) {
+    if ((cap->pending() | getattr_caps) & CEPH_CAP_XATTR_SHARED) {
+      dout(10) << "including xattrs version " << xattr_version << dendl;
+      cap->client_xattr_version = xattr_version;
+    } else {
+      dout(10) << "dropping xattrs version " << xattr_version << dendl;
+      xattr_version = 0;
+    }
+  }
+
+  // The end result of encode_xattrs() is equivalent to:
+  // {
+  //   bufferlist xbl;
+  //   if (xattr_version) {
+  //     if (pxattrs)
+  //       encode(*pxattrs, xbl);
+  //     else
+  //       encode((__u32)0, xbl);
+  //   }
+  //   encode(xbl, bl);
+  // }
+  //
+  // But encoding xattrs into the 'xbl' requires a memory allocation.
+  // The 'bl' should have enough pre-allocated memory in most cases.
+  // Encoding xattrs directly into it can avoid the extra allocation.
+  auto encode_xattrs = [xattr_version, pxattrs, &bl]() {
+    using ceph::encode;
+    if (xattr_version) {
+      // reserve room for the length prefix, encode the payload, then go
+      // back and fill in the payload length
+      ceph_le32 xbl_len;
+      auto filler = bl.append_hole(sizeof(xbl_len));
+      const auto starting_bl_len = bl.length();
+      if (pxattrs)
+	encode(*pxattrs, bl);
+      else
+	encode((__u32)0, bl);
+      xbl_len = bl.length() - starting_bl_len;
+      filler.copy_in(sizeof(xbl_len), (char *)&xbl_len);
+    } else {
+      encode((__u32)0, bl);
+    }
+  };
+
+  /*
+   * note: encoding matches MClientReply::InodeStat
+   */
+  if (session->info.has_feature(CEPHFS_FEATURE_REPLY_ENCODING)) {
+    // versioned encoding: every field is always present
+    ENCODE_START(6, 1, bl);
+    encode(oi->ino, bl);
+    encode(snapid, bl);
+    encode(oi->rdev, bl);
+    encode(version, bl);
+    encode(xattr_version, bl);
+    encode(ecap, bl);
+    {
+      ceph_file_layout legacy_layout;
+      layout.to_legacy(&legacy_layout);
+      encode(legacy_layout, bl);
+    }
+    encode(any_i->ctime, bl);
+    encode(file_i->mtime, bl);
+    encode(file_i->atime, bl);
+    encode(file_i->time_warp_seq, bl);
+    encode(file_i->size, bl);
+    encode(max_size, bl);
+    encode(file_i->truncate_size, bl);
+    encode(file_i->truncate_seq, bl);
+    encode(auth_i->mode, bl);
+    encode((uint32_t)auth_i->uid, bl);
+    encode((uint32_t)auth_i->gid, bl);
+    encode(link_i->nlink, bl);
+    encode(file_i->dirstat.nfiles, bl);
+    encode(file_i->dirstat.nsubdirs, bl);
+    encode(file_i->rstat.rbytes, bl);
+    encode(file_i->rstat.rfiles, bl);
+    encode(file_i->rstat.rsubdirs, bl);
+    encode(file_i->rstat.rctime, bl);
+    dirfragtree.encode(bl);
+    encode(symlink, bl);
+    encode(file_i->dir_layout, bl);
+    encode_xattrs();
+    encode(inline_version, bl);
+    encode(inline_data, bl);
+    const mempool_inode *policy_i = ppolicy ? pi : oi;
+    encode(policy_i->quota, bl);
+    encode(layout.pool_ns, bl);
+    encode(any_i->btime, bl);
+    encode(any_i->change_attr, bl);
+    encode(file_i->export_pin, bl);
+    encode(snap_btime, bl);
+    encode(file_i->rstat.rsnaps, bl);
+    encode(snap_metadata, bl);
+    encode(file_i->fscrypt, bl);
+    ENCODE_FINISH(bl);
+  }
+  else {
+    // legacy encoding: optional trailing fields are gated on the
+    // connection's feature bits
+    ceph_assert(session->get_connection());
+
+    encode(oi->ino, bl);
+    encode(snapid, bl);
+    encode(oi->rdev, bl);
+    encode(version, bl);
+    encode(xattr_version, bl);
+    encode(ecap, bl);
+    {
+      ceph_file_layout legacy_layout;
+      layout.to_legacy(&legacy_layout);
+      encode(legacy_layout, bl);
+    }
+    encode(any_i->ctime, bl);
+    encode(file_i->mtime, bl);
+    encode(file_i->atime, bl);
+    encode(file_i->time_warp_seq, bl);
+    encode(file_i->size, bl);
+    encode(max_size, bl);
+    encode(file_i->truncate_size, bl);
+    encode(file_i->truncate_seq, bl);
+    encode(auth_i->mode, bl);
+    encode((uint32_t)auth_i->uid, bl);
+    encode((uint32_t)auth_i->gid, bl);
+    encode(link_i->nlink, bl);
+    encode(file_i->dirstat.nfiles, bl);
+    encode(file_i->dirstat.nsubdirs, bl);
+    encode(file_i->rstat.rbytes, bl);
+    encode(file_i->rstat.rfiles, bl);
+    encode(file_i->rstat.rsubdirs, bl);
+    encode(file_i->rstat.rctime, bl);
+    dirfragtree.encode(bl);
+    encode(symlink, bl);
+    auto& conn = session->get_connection();
+    if (conn->has_feature(CEPH_FEATURE_DIRLAYOUTHASH)) {
+      encode(file_i->dir_layout, bl);
+    }
+    encode_xattrs();
+    if (conn->has_feature(CEPH_FEATURE_MDS_INLINE_DATA)) {
+      encode(inline_version, bl);
+      encode(inline_data, bl);
+    }
+    if (conn->has_feature(CEPH_FEATURE_MDS_QUOTA)) {
+      const mempool_inode *policy_i = ppolicy ? pi : oi;
+      encode(policy_i->quota, bl);
+    }
+    if (conn->has_feature(CEPH_FEATURE_FS_FILE_LAYOUT_V2)) {
+      encode(layout.pool_ns, bl);
+    }
+    if (conn->has_feature(CEPH_FEATURE_FS_BTIME)) {
+      encode(any_i->btime, bl);
+      encode(any_i->change_attr, bl);
+    }
+  }
+
+  return valid;
+}
+
+/**
+ * Fill the cap message 'm' with this inode's attributes as the holder of
+ * 'cap' should see them.
+ *
+ * For each lock-governed group of fields the projected inode is used when the
+ * client holds the matching xlock (for file fields also when FILE_EXCL is
+ * issued); otherwise the stable inode is used.  Inline data and xattrs are
+ * attached only when newer than what the cap records the client as having,
+ * and the cap's recorded versions are updated accordingly.
+ */
+void CInode::encode_cap_message(const ref_t<MClientCaps> &m, Capability *cap)
+{
+  ceph_assert(cap);
+
+  client_t client = cap->get_client();
+
+  bool pfile = filelock.is_xlocked_by_client(client) || (cap->issued() & CEPH_CAP_FILE_EXCL);
+  bool pauth = authlock.is_xlocked_by_client(client);
+  bool plink = linklock.is_xlocked_by_client(client);
+  bool pxattr = xattrlock.is_xlocked_by_client(client);
+
+  const mempool_inode *oi = get_inode().get();
+  const mempool_inode *pi = get_projected_inode().get();
+
+  // version used only for the debug line below
+  const mempool_inode *any_i = (pfile|pauth|plink|pxattr) ? pi : oi;
+  dout(20) << __func__ << " pfile " << pfile
+	   << " pauth " << pauth << " plink " << plink << " pxattr " << pxattr
+	   << " mtime " << any_i->mtime << " ctime " << any_i->ctime << " change_attr " << any_i->change_attr << dendl;
+
+  // file-lock governed fields
+  const mempool_inode *file_i = pfile ? pi : oi;
+  m->set_layout(file_i->layout);
+  m->size = file_i->size;
+  m->truncate_seq = file_i->truncate_seq;
+  m->truncate_size = file_i->truncate_size;
+  m->mtime = file_i->mtime;
+  m->atime = file_i->atime;
+  m->ctime = file_i->ctime;
+  m->btime = file_i->btime;
+  m->change_attr = file_i->change_attr;
+  m->time_warp_seq = file_i->time_warp_seq;
+  m->nfiles = file_i->dirstat.nfiles;
+  m->nsubdirs = file_i->dirstat.nsubdirs;
+
+  if (cap->client_inline_version < file_i->inline_data.version) {
+    m->inline_version = cap->client_inline_version = file_i->inline_data.version;
+    if (file_i->inline_data.length() > 0)
+      file_i->inline_data.get_data(m->inline_data);
+  } else {
+    m->inline_version = 0;
+  }
+
+  // max_size is min of projected, actual.
+  const uint64_t stable_max = oi->get_client_range(client);
+  const uint64_t projected_max = pi->get_client_range(client);
+  m->max_size = std::min(stable_max, projected_max);
+
+  // auth-lock governed fields
+  const mempool_inode *auth_i = pauth ? pi : oi;
+  m->head.mode = auth_i->mode;
+  m->head.uid = auth_i->uid;
+  m->head.gid = auth_i->gid;
+
+  // link-lock governed fields
+  const mempool_inode *link_i = plink ? pi : oi;
+  m->head.nlink = link_i->nlink;
+
+  // xattrs
+  using ceph::encode;
+  const mempool_inode *xattr_i = pxattr ? pi : oi;
+  const auto& ix = pxattr ? get_projected_xattrs() : get_xattrs();
+
+  const bool send_xattrs =
+    (cap->pending() & CEPH_CAP_XATTR_SHARED) &&
+    xattr_i->xattr_version > cap->client_xattr_version;
+  if (!send_xattrs)
+    return;
+
+  dout(10) << "    including xattrs v " << xattr_i->xattr_version << dendl;
+  if (ix)
+    encode(*ix, m->xattrbl);
+  else
+    encode((__u32)0, m->xattrbl);
+  m->head.xattr_version = xattr_i->xattr_version;
+  cap->client_xattr_version = xattr_i->xattr_version;
+}
+
+
+
+// Encode the inode's base state into 'bl' as a versioned (v1) section:
+// first snapid, the inode itself (feature-dependent encoding), symlink,
+// dirfragtree, xattrs, old inodes, damage flags and snap info.
+// The field order here must stay in sync with _decode_base().
+void CInode::_encode_base(bufferlist& bl, uint64_t features)
+{
+  ENCODE_START(1, 1, bl);
+  encode(first, bl);
+  encode(*get_inode(), bl, features);
+  encode(symlink, bl);
+  encode(dirfragtree, bl);
+  encode_xattrs(bl);
+  encode_old_inodes(bl, features);
+  encode(damage_flags, bl);
+  encode_snap(bl);
+  ENCODE_FINISH(bl);
+}
+// Decode the base state written by _encode_base(), in the same field order.
+void CInode::_decode_base(bufferlist::const_iterator& p)
+{
+  DECODE_START(1, p);
+  decode(first, p);
+  {
+    // decode into a freshly allocated inode, then install it
+    auto _inode = allocate_inode();
+    decode(*_inode, p);
+    reset_inode(std::move(_inode));
+  }
+  {
+    // decode into a temporary std::string and assign to 'symlink' through a
+    // string_view
+    std::string tmp;
+    decode(tmp, p);
+    symlink = std::string_view(tmp);
+  }
+  decode(dirfragtree, p);
+  decode_xattrs(p);
+  decode_old_inodes(p);
+  decode(damage_flags, p);
+  decode_snap(p);
+  DECODE_FINISH(p);
+}
+
+// Encode the full state of the inode's locks (auth, link, dirfragtree, file,
+// xattr, snap, nest, flock, policy -- in that order) followed by loner_cap.
+// The order must stay in sync with _decode_locks_full().
+void CInode::_encode_locks_full(bufferlist& bl)
+{
+  using ceph::encode;
+  encode(authlock, bl);
+  encode(linklock, bl);
+  encode(dirfragtreelock, bl);
+  encode(filelock, bl);
+  encode(xattrlock, bl);
+  encode(snaplock, bl);
+  encode(nestlock, bl);
+  encode(flocklock, bl);
+  encode(policylock, bl);
+
+  encode(loner_cap, bl);
+}
+// Decode the lock state written by _encode_locks_full(), in the same order,
+// then restore the loner: the decoded value is applied via set_loner_cap()
+// and also copied into want_loner_cap.
+void CInode::_decode_locks_full(bufferlist::const_iterator& p)
+{
+  using ceph::decode;
+  decode(authlock, p);
+  decode(linklock, p);
+  decode(dirfragtreelock, p);
+  decode(filelock, p);
+  decode(xattrlock, p);
+  decode(snaplock, p);
+  decode(nestlock, p);
+  decode(flocklock, p);
+  decode(policylock, p);
+
+  decode(loner_cap, p);
+  set_loner_cap(loner_cap);
+  want_loner_cap = loner_cap;  // for now, we'll eval() shortly.
+}
+
+// Encode the replica view of every lock's state as a versioned (v1) section,
+// followed by the 'need_recover' flag.  The lock order (auth, link,
+// dirfragtree, file, nest, xattr, snap, flock, policy) must stay in sync with
+// _decode_locks_state_for_replica().
+void CInode::_encode_locks_state_for_replica(bufferlist& bl, bool need_recover)
+{
+  ENCODE_START(1, 1, bl);
+  authlock.encode_state_for_replica(bl);
+  linklock.encode_state_for_replica(bl);
+  dirfragtreelock.encode_state_for_replica(bl);
+  filelock.encode_state_for_replica(bl);
+  nestlock.encode_state_for_replica(bl);
+  xattrlock.encode_state_for_replica(bl);
+  snaplock.encode_state_for_replica(bl);
+  flocklock.encode_state_for_replica(bl);
+  policylock.encode_state_for_replica(bl);
+  encode(need_recover, bl);
+  ENCODE_FINISH(bl);
+}
+
+// Encode lock state for a rejoin (no version header).  The dirfragtree, file
+// and nest locks use encode_state_for_rejoin() with 'rep'; all other locks
+// use the plain replica encoding.  The lock order matches the decode order in
+// _decode_locks_rejoin().
+void CInode::_encode_locks_state_for_rejoin(bufferlist& bl, int rep)
+{
+  authlock.encode_state_for_replica(bl);
+  linklock.encode_state_for_replica(bl);
+  dirfragtreelock.encode_state_for_rejoin(bl, rep);
+  filelock.encode_state_for_rejoin(bl, rep);
+  nestlock.encode_state_for_rejoin(bl, rep);
+  xattrlock.encode_state_for_replica(bl);
+  snaplock.encode_state_for_replica(bl);
+  flocklock.encode_state_for_replica(bl);
+  policylock.encode_state_for_replica(bl);
+}
+
+// Decode the lock state written by _encode_locks_state_for_replica(), in the
+// same lock order.  'is_new' is forwarded to each lock's decode_state(); when
+// the encoded 'need_recover' flag is set and is_new is true, every lock is
+// additionally marked as needing recovery.
+void CInode::_decode_locks_state_for_replica(bufferlist::const_iterator& p, bool is_new)
+{
+  DECODE_START(1, p);
+  authlock.decode_state(p, is_new);
+  linklock.decode_state(p, is_new);
+  dirfragtreelock.decode_state(p, is_new);
+  filelock.decode_state(p, is_new);
+  nestlock.decode_state(p, is_new);
+  xattrlock.decode_state(p, is_new);
+  snaplock.decode_state(p, is_new);
+  flocklock.decode_state(p, is_new);
+  policylock.decode_state(p, is_new);
+
+  bool need_recover;
+  decode(need_recover, p);
+  if (need_recover && is_new) {
+    // Auth mds replicated this inode while it's recovering. Auth mds may take xlock on the lock
+    // and change the object when replaying unsafe requests.
+    authlock.mark_need_recover();
+    linklock.mark_need_recover();
+    dirfragtreelock.mark_need_recover();
+    filelock.mark_need_recover();
+    nestlock.mark_need_recover();
+    xattrlock.mark_need_recover();
+    snaplock.mark_need_recover();
+    flocklock.mark_need_recover();
+    policylock.mark_need_recover();
+  }
+  DECODE_FINISH(p);
+}
+// Decode rejoin lock state for every lock (same order as
+// _encode_locks_state_for_rejoin()), passing 'waiters' and 'survivor' to each
+// lock's decode_state_rejoin().  Afterwards the dirfragtree, file and nest
+// locks are appended to 'eval_locks' if they are neither stable nor
+// wrlocked.
+void CInode::_decode_locks_rejoin(bufferlist::const_iterator& p, MDSContext::vec& waiters,
+				  list<SimpleLock*>& eval_locks, bool survivor)
+{
+  authlock.decode_state_rejoin(p, waiters, survivor);
+  linklock.decode_state_rejoin(p, waiters, survivor);
+  dirfragtreelock.decode_state_rejoin(p, waiters, survivor);
+  filelock.decode_state_rejoin(p, waiters, survivor);
+  nestlock.decode_state_rejoin(p, waiters, survivor);
+  xattrlock.decode_state_rejoin(p, waiters, survivor);
+  snaplock.decode_state_rejoin(p, waiters, survivor);
+  flocklock.decode_state_rejoin(p, waiters, survivor);
+  policylock.decode_state_rejoin(p, waiters, survivor);
+
+  // queue the unstable, non-wrlocked locks for the caller to evaluate
+  if (!dirfragtreelock.is_stable() && !dirfragtreelock.is_wrlocked())
+    eval_locks.push_back(&dirfragtreelock);
+  if (!filelock.is_stable() && !filelock.is_wrlocked())
+    eval_locks.push_back(&filelock);
+  if (!nestlock.is_stable() && !nestlock.is_wrlocked())
+    eval_locks.push_back(&nestlock);
+}
+
+
+// IMPORT/EXPORT
+
+// Encode everything needed to export this inode into 'bl' as a versioned
+// section: base state, state flags, popularity, replica map, fragstat/rstat
+// of export-bound dirfrags, full lock state and file locks.  The layout must
+// stay in sync with decode_import().
+//
+// Takes PIN_TEMPEXPORTING on the inode; finish_export() drops it.
+void CInode::encode_export(bufferlist& bl)
+{
+  ENCODE_START(5, 4, bl);
+  _encode_base(bl, mdcache->mds->mdsmap->get_up_features());
+
+  encode(state, bl);
+
+  encode(pop, bl);
+
+  encode(get_replicas(), bl);
+
+  // include scatterlock info for any bounding CDirs
+  // (collected in a separate bufferlist that is then encoded as one blob)
+  bufferlist bounding;
+  if (get_inode()->is_dir())
+    for (const auto &p : dirfrags) {
+      CDir *dir = p.second;
+      if (dir->state_test(CDir::STATE_EXPORTBOUND)) {
+	encode(p.first, bounding);
+	encode(dir->get_fnode()->fragstat, bounding);
+	encode(dir->get_fnode()->accounted_fragstat, bounding);
+	encode(dir->get_fnode()->rstat, bounding);
+	encode(dir->get_fnode()->accounted_rstat, bounding);
+	dout(10) << " encoded fragstat/rstat info for " << *dir << dendl;
+      }
+    }
+  encode(bounding, bl);
+
+  _encode_locks_full(bl);
+
+  _encode_file_locks(bl);
+
+  ENCODE_FINISH(bl);
+
+  get(PIN_TEMPEXPORTING);
+}
+
+// Finish exporting this inode: keep only the state bits in
+// MASK_STATE_EXPORT_KEPT, zero the popularity counters, forget the loner and
+// drop PIN_TEMPEXPORTING.
+void CInode::finish_export()
+{
+  state = state & MASK_STATE_EXPORT_KEPT;
+  pop.zero();
+  loner_cap = -1;
+  put(PIN_TEMPEXPORTING);
+}
+
+// Decode an inode exported by encode_export() and make this MDS its auth.
+//
+// Decodes, in order: base state, state flags (masked with
+// MASK_STATE_EXPORTED), popularity, replica map, the fragstat/rstat blob for
+// bounding dirfrags, full lock state and file locks.  Dirty / dirty-parent
+// state carried in the flags is re-registered against log segment 'ls'.
+void CInode::decode_import(bufferlist::const_iterator& p,
+			   LogSegment *ls)
+{
+  DECODE_START(5, p);
+
+  _decode_base(p);
+
+  {
+    // keep only the exported state bits and mark ourselves auth
+    unsigned s;
+    decode(s, p);
+    s &= MASK_STATE_EXPORTED;
+
+    set_ephemeral_pin((s & STATE_DISTEPHEMERALPIN),
+		      (s & STATE_RANDEPHEMERALPIN));
+    state_set(STATE_AUTH | s);
+  }
+
+  if (is_dirty()) {
+    get(PIN_DIRTY);
+    _mark_dirty(ls);
+  }
+  if (is_dirty_parent()) {
+    get(PIN_DIRTYPARENT);
+    mark_dirty_parent(ls);
+  }
+
+  decode(pop, p);
+
+  decode(get_replicas(), p);
+  if (is_replicated())
+    get(PIN_REPLICATED);
+  replica_nonce = 0;
+
+  // decode fragstat info on bounding cdirs
+  bufferlist bounding;
+  decode(bounding, p);
+  auto q = bounding.cbegin();
+  while (!q.end()) {
+    frag_t fg;
+    decode(fg, q);
+    CDir *dir = get_dirfrag(fg);
+    ceph_assert(dir);  // we should have all bounds open
+
+    // Only take the remote's fragstat/rstat if we are non-auth for
+    // this dirfrag AND the lock is NOT in a scattered (MIX) state.
+    // We know lock is stable, and MIX is the only state in which
+    // the inode auth (who sent us this data) may not have the best
+    // info.
+
+    // HMM: Are there cases where dir->is_auth() is an insufficient
+    // check because the dirfrag is under migration?  That implies
+    // it is frozen (and in a SYNC or LOCK state).  FIXME.
+
+    // work on a copy of the dirfrag's fnode; it is installed at the end of
+    // the loop body
+    auto _fnode = CDir::allocate_fnode(*dir->get_fnode());
+    if (dir->is_auth() ||
+        filelock.get_state() == LOCK_MIX) {
+      dout(10) << " skipped fragstat info for " << *dir << dendl;
+      // still consume both encoded values so the iterator stays aligned
+      frag_info_t f;
+      decode(f, q);
+      decode(f, q);
+    } else {
+      decode(_fnode->fragstat, q);
+      decode(_fnode->accounted_fragstat, q);
+      dout(10) << " took fragstat info for " << *dir << dendl;
+    }
+    if (dir->is_auth() ||
+        nestlock.get_state() == LOCK_MIX) {
+      dout(10) << " skipped rstat info for " << *dir << dendl;
+      // still consume both encoded values so the iterator stays aligned
+      nest_info_t n;
+      decode(n, q);
+      decode(n, q);
+    } else {
+      decode(_fnode->rstat, q);
+      decode(_fnode->accounted_rstat, q);
+      dout(10) << " took rstat info for " << *dir << dendl;
+    }
+    dir->reset_fnode(std::move(_fnode));
+  }
+
+  _decode_locks_full(p);
+
+  _decode_file_locks(p);
+
+  DECODE_FINISH(p);
+}
+
+// Dump inode, symlink, xattrs, dirfragtree, old_inodes, oldest_snap and damage_flags to f.
+void InodeStoreBase::dump(Formatter *f) const
+{
+  inode->dump(f);
+  f->dump_string("symlink", symlink);
+
+  f->open_array_section("xattrs");  // emitted even when there are no xattrs
+  if (xattrs) {
+    for (const auto& [key, val] : *xattrs) {
+      f->open_object_section("xattr");
+      f->dump_string("key", key);
+      std::string v(val.c_str(), val.length());  // length-based copy: not cut at a NUL byte
+      f->dump_string("val", v);
+      f->close_section();
+    }
+  }
+  f->close_section();
+  f->open_object_section("dirfragtree");
+  dirfragtree.dump(f);
+  f->close_section(); // dirfragtree
+  
+  f->open_array_section("old_inodes");  // emitted even when old_inodes is unset
+  if (old_inodes) {
+    for (const auto &p : *old_inodes) {
+      f->open_object_section("old_inode");
+      // The key is the last snapid, the first is in the mempool_old_inode
+      f->dump_int("last", p.first);
+      p.second.dump(f);
+      f->close_section();  // old_inode
+    }
+  }
+  f->close_section();  // old_inodes
+
+  f->dump_unsigned("oldest_snap", oldest_snap);
+  f->dump_unsigned("damage_flags", damage_flags);
+}
+
+template <>
+void decode_json_obj(mempool::mds_co::string& t, JSONObj *obj){
+  // Specialization for mempool strings: assign t from the object's data.
+  t = mempool::mds_co::string(std::string_view(obj->get_data()));
+}
+// Populate this store from JSON; dirfragtree, old_inodes and snap_blob are not restored.
+void InodeStoreBase::decode_json(JSONObj *obj)
+{
+  {
+    auto _inode = allocate_inode();
+    _inode->decode_json(obj);
+    reset_inode(std::move(_inode));
+  }
+
+  JSONDecoder::decode_json("symlink", symlink, obj, true);
+  // JSONDecoder::decode_json("dirfragtree", dirfragtree, obj, true); // cannot decode it yet
+  //
+  //
+  {
+    mempool_xattr_map tmp;
+    JSONDecoder::decode_json("xattrs", tmp, xattrs_cb, obj, true);
+    if (tmp.empty())
+      reset_xattrs(xattr_map_ptr());  // nothing decoded: install a default-constructed pointer
+    else
+      reset_xattrs(allocate_xattr_map(std::move(tmp)));
+  }
+  // JSONDecoder::decode_json("old_inodes", old_inodes, InodeStoreBase::old_indoes_cb, obj, true); // cannot decode old_inodes yet
+  JSONDecoder::decode_json("oldest_snap", oldest_snap.val, obj, true);
+  JSONDecoder::decode_json("damage_flags", damage_flags, obj, true);
+  //sr_t srnode;
+  //JSONDecoder::decode_json("snap_blob", srnode, obj, true);   // cannot decode it yet
+  //snap_blob = srnode;
+}
+
+void InodeStoreBase::xattrs_cb(InodeStoreBase::mempool_xattr_map& c, JSONObj *obj){
+  // JSON callback for one xattr object: reads the same "key"/"val" pair
+  // that InodeStoreBase::dump() writes, and stores it into c.
+  string name, payload;
+  JSONDecoder::decode_json("key", name, obj, true);
+  JSONDecoder::decode_json("val", payload, obj, true);
+  c[name.c_str()] = buffer::copy(payload.c_str(), payload.size());
+}
+
+void InodeStoreBase::old_indoes_cb(InodeStoreBase::mempool_old_inode_map& c, JSONObj *obj){
+  // JSON callback for one old_inodes entry: only the "last" snapid key is read.
+  snapid_t s;
+  JSONDecoder::decode_json("last", s.val, obj, true);
+  InodeStoreBase::mempool_old_inode i;
+  // i.decode_json(obj); // cannot decode the body yet, so i stays default-constructed
+  c[s] = i;
+}
+
+void InodeStore::generate_test_instances(std::list<InodeStore*> &ls)
+{
+  // One populated sample; the raw pointer is handed to ls (presumably freed by the caller).
+  ls.push_back(new InodeStore);
+  ls.back()->get_inode()->ino = 0xdeadbeef;
+  ls.back()->symlink = "rhubarb";
+}
+
+void InodeStoreBare::generate_test_instances(std::list<InodeStoreBare*> &ls)
+{
+  // One populated sample; the raw pointer is handed to ls (presumably freed by the caller).
+  ls.push_back(new InodeStoreBare);
+  ls.back()->get_inode()->ino = 0xdeadbeef;
+  ls.back()->symlink = "rhubarb";
+}
+// Scrub helper: validate backtrace, on-disk inode and dirfrag stats into *results in stages, then complete fin.
+void CInode::validate_disk_state(CInode::validated_data *results,
+                                 MDSContext *fin)
+{
+  class ValidationContinuation : public MDSContinuation {
+  public:
+    MDSContext *fin;
+    CInode *in;
+    CInode::validated_data *results;
+    bufferlist bl;  // receives the "parent" xattr read by fetch_backtrace_and_tag()
+    CInode *shadow_in;  // created on demand for base inodes; deleted in the destructor
+
+    enum {
+      START = 0,
+      BACKTRACE,
+      INODE,
+      DIRFRAGS,
+      SNAPREALM,  // declared, but no callback is registered for it below
+    };
+
+    ValidationContinuation(CInode *i,
+                           CInode::validated_data *data_r,
+                           MDSContext *fin_) :
+                             MDSContinuation(i->mdcache->mds->server),
+                             fin(fin_),
+                             in(i),
+                             results(data_r),
+                             shadow_in(NULL) {
+      set_callback(START, static_cast<Continuation::stagePtr>(&ValidationContinuation::_start));
+      set_callback(BACKTRACE, static_cast<Continuation::stagePtr>(&ValidationContinuation::_backtrace));
+      set_callback(INODE, static_cast<Continuation::stagePtr>(&ValidationContinuation::_inode_disk));
+      set_callback(DIRFRAGS, static_cast<Continuation::stagePtr>(&ValidationContinuation::_dirfrags));
+    }
+
+    ~ValidationContinuation() override {
+      if (shadow_in) {
+	delete shadow_in;
+	in->mdcache->num_shadow_inodes--;
+      }
+    }
+
+    /**
+     * Fetch backtrace, and also write the scrub tag unless is_internal is set
+     */
+    void fetch_backtrace_and_tag(CInode *in,
+                                 std::string_view tag, bool is_internal,
+                                 Context *fin, int *bt_r, bufferlist *bt)
+    {
+      const int64_t pool = in->get_backtrace_pool();
+      object_t oid = CInode::get_object_name(in->ino(), frag_t(), "");
+
+      ObjectOperation fetch;
+      fetch.getxattr("parent", bt, bt_r);
+      in->mdcache->mds->objecter->read(oid, object_locator_t(pool), fetch, CEPH_NOSNAP,
+				       NULL, 0, fin);
+      if (in->mdcache->mds->logger) {
+        in->mdcache->mds->logger->inc(l_mds_openino_backtrace_fetch);
+        in->mdcache->mds->logger->inc(l_mds_scrub_backtrace_fetch);
+      }
+
+      using ceph::encode;
+      if (!is_internal) {
+        ObjectOperation scrub_tag;
+        bufferlist tag_bl;
+        encode(tag, tag_bl);
+        scrub_tag.setxattr("scrub_tag", tag_bl);
+        SnapContext snapc;
+        in->mdcache->mds->objecter->mutate(oid, object_locator_t(pool), scrub_tag, snapc,
+					   ceph::real_clock::now(),
+					   0, NULL);
+        if (in->mdcache->mds->logger)
+          in->mdcache->mds->logger->inc(l_mds_scrub_set_tag);
+      }
+    }
+
+    bool _start(int rval) {  // rval is not used by this stage
+      ceph_assert(in->can_auth_pin());
+      in->auth_pin(this);  // released in _done()
+
+      if (in->is_dirty()) {
+	MDCache *mdcache = in->mdcache;  // For the benefit of dout
+	auto ino = [this]() { return in->ino(); }; // For the benefit of dout
+	dout(20) << "validating a dirty CInode; results will be inconclusive"
+	  << dendl;
+      }
+
+      C_OnFinisher *conf = new C_OnFinisher(get_io_callback(BACKTRACE),
+					    in->mdcache->mds->finisher);
+
+      std::string_view tag = in->scrub_infop->header->get_tag();
+      bool is_internal = in->scrub_infop->header->is_internal_tag();
+      // Rather than using the usual CInode::fetch_backtrace,
+      // use a special variant that optionally writes a tag in the same
+      // operation.
+      fetch_backtrace_and_tag(in, tag, is_internal, conf, &results->backtrace.ondisk_read_retval, &bl);
+      return false;  // NOTE(review): presumably "not done; resume at the BACKTRACE callback" — confirm
+    }
+
+    bool _backtrace(int rval) {
+      // set up basic result reporting and make sure we got the data
+      results->performed_validation = true; // at least, some of it!
+      results->backtrace.checked = true;
+
+      const int64_t pool = in->get_backtrace_pool();
+      inode_backtrace_t& memory_backtrace = results->backtrace.memory_value;
+      in->build_backtrace(pool, memory_backtrace);
+      bool equivalent, divergent;
+      int memory_newer;
+
+      MDCache *mdcache = in->mdcache;  // For the benefit of dout
+      auto ino = [this]() { return in->ino(); }; // For the benefit of dout
+
+      // Ignore rval because it's the result of a FAILOK operation
+      // from fetch_backtrace_and_tag: the real result is in
+      // backtrace.ondisk_read_retval
+      dout(20) << "ondisk_read_retval: " << results->backtrace.ondisk_read_retval << dendl;
+      if (results->backtrace.ondisk_read_retval != 0) {
+        results->backtrace.error_str << "failed to read off disk; see retval";
+        // we probably have a new unwritten file!
+        // so skip the backtrace scrub for this entry and say that all's well
+        if (in->is_dirty_parent()) {
+          dout(20) << "forcing backtrace as passed since inode is dirty parent" << dendl;
+          results->backtrace.passed = true;
+        }
+        goto next;
+      }
+
+      // extract the backtrace, and compare it to a newly-constructed one
+      try {
+        auto p = bl.cbegin();
+	using ceph::decode;
+        decode(results->backtrace.ondisk_value, p);
+        dout(10) << "decoded " << bl.length() << " bytes of backtrace successfully" << dendl;
+      } catch (buffer::error&) {
+        if (results->backtrace.ondisk_read_retval == 0 && rval != 0) {
+          // Cases where something has clearly gone wrong with the overall
+          // fetch op, though we didn't get a nonzero rc from the getxattr
+          // operation.  e.g. object missing.
+          results->backtrace.ondisk_read_retval = rval;
+        }
+        results->backtrace.error_str << "failed to decode on-disk backtrace ("
+                                     << bl.length() << " bytes)!";
+        // we probably have a new unwritten file!
+        // so skip the backtrace scrub for this entry and say that all's well
+        if (in->is_dirty_parent()) {
+          dout(20) << "decode failed; forcing backtrace as passed since "
+                      "inode is dirty parent" << dendl;
+          results->backtrace.passed = true;
+        }
+
+	goto next;
+      }
+
+      memory_newer = memory_backtrace.compare(results->backtrace.ondisk_value,
+					      &equivalent, &divergent);
+
+      if (divergent || memory_newer < 0) {
+        // we're divergent, or on-disk version is newer
+        results->backtrace.error_str << "On-disk backtrace is divergent or newer";
+        /* if the backtraces are divergent and the link count is 0, then
+         * most likely its a stray entry that's being purged and things are
+         * well and there's no reason for alarm
+         */
+        if (divergent && (in->is_dirty_parent() || in->get_inode()->nlink == 0)) {
+          results->backtrace.passed = true;
+          dout(20) << "divergent backtraces are acceptable when dn "
+                      "is being purged or has been renamed or moved to a "
+                      "different directory " << *in << dendl;
+        }
+      } else {
+        results->backtrace.passed = true;
+      }
+next:
+
+      if (!results->backtrace.passed && in->scrub_infop->header->get_repair()) {
+        std::string path;
+        in->make_path_string(path);
+        in->mdcache->mds->clog->warn() << "bad backtrace on inode " << in->ino()
+                                       << "(" << path << "), rewriting it";
+        in->mark_dirty_parent(in->mdcache->mds->mdlog->get_current_segment(),
+                           false);
+        // Flag that we repaired this BT so that it won't go into damagetable
+        results->backtrace.repaired = true;
+        if (in->mdcache->mds->logger)
+          in->mdcache->mds->logger->inc(l_mds_scrub_backtrace_repaired);
+      }
+
+      // If the inode's number was free in the InoTable, fix that
+      // (#15619)
+      {
+        InoTable *inotable = mdcache->mds->inotable;
+
+        dout(10) << "scrub: inotable ino = " << in->ino() << dendl;
+        dout(10) << "scrub: inotable free says "
+          << inotable->is_marked_free(in->ino()) << dendl;
+
+        if (inotable->is_marked_free(in->ino())) {
+          LogChannelRef clog = in->mdcache->mds->clog;
+          clog->error() << "scrub: inode wrongly marked free: " << in->ino();
+
+          if (in->scrub_infop->header->get_repair()) {
+            bool repaired = inotable->repair(in->ino());
+            if (repaired) {
+              clog->error() << "inode table repaired for inode: " << in->ino();
+
+              inotable->save();
+              if (in->mdcache->mds->logger)
+                in->mdcache->mds->logger->inc(l_mds_scrub_inotable_repaired);
+            } else {
+              clog->error() << "Cannot repair inotable while other operations"
+                " are in progress";
+            }
+          }
+        }
+      }
+
+
+      if (in->is_dir()) {
+        if (in->mdcache->mds->logger)
+          in->mdcache->mds->logger->inc(l_mds_scrub_dir_inodes);
+	return validate_directory_data();
+      } else {
+        if (in->mdcache->mds->logger)
+          in->mdcache->mds->logger->inc(l_mds_scrub_file_inodes);
+	// TODO: validate on-disk inode for normal files
+	return true;
+      }
+    }
+
+    bool validate_directory_data() {
+      ceph_assert(in->is_dir());
+
+      if (in->is_base()) {
+	if (!shadow_in) {
+	  shadow_in = new CInode(in->mdcache);
+	  in->mdcache->create_unlinked_system_inode(shadow_in, in->ino(), in->get_inode()->mode);
+	  in->mdcache->num_shadow_inodes++;
+	}
+        shadow_in->fetch(get_internal_callback(INODE));  // continues in _inode_disk()
+        if (in->mdcache->mds->logger)
+          in->mdcache->mds->logger->inc(l_mds_scrub_dir_base_inodes);
+        return false;
+      } else {
+	// TODO: validate on-disk inode for non-base directories
+        if (in->mdcache->mds->logger)
+          in->mdcache->mds->logger->inc(l_mds_scrub_dirfrag_rstats);
+	results->inode.passed = true;
+	return check_dirfrag_rstats();
+      }
+    }
+
+    bool _inode_disk(int rval) {
+      const auto& si = shadow_in->get_inode();
+      const auto& i = in->get_inode();
+
+      results->inode.checked = true;
+      results->inode.ondisk_read_retval = rval;
+      results->inode.ondisk_value = *si;
+      results->inode.memory_value = *i;
+
+      if (si->version > i->version) {
+        // uh, what?
+        results->inode.error_str << "On-disk inode is newer than in-memory one; ";
+	goto next;
+      } else {
+        bool divergent = false;
+        int r = i->compare(*si, &divergent);
+        results->inode.passed = !divergent && r >= 0;
+        if (!results->inode.passed) {
+          results->inode.error_str <<
+              "On-disk inode is divergent or newer than in-memory one; ";
+	  goto next;
+        }
+      }
+next:
+      return check_dirfrag_rstats();  // reached on every path above
+    }
+
+    bool check_dirfrag_rstats() {
+      if (in->has_subtree_root_dirfrag()) {
+	in->mdcache->rdlock_dirfrags_stats(in, get_internal_callback(DIRFRAGS));
+	return false;
+      } else {
+	return immediate(DIRFRAGS, 0);
+      }
+    }
+
+    bool _dirfrags(int rval) {
+      // basic reporting setup
+      results->raw_stats.checked = true;
+      results->raw_stats.ondisk_read_retval = rval;
+
+      results->raw_stats.memory_value.dirstat = in->get_inode()->dirstat;
+      results->raw_stats.memory_value.rstat = in->get_inode()->rstat;
+      frag_info_t& dir_info = results->raw_stats.ondisk_value.dirstat;
+      nest_info_t& nest_info = results->raw_stats.ondisk_value.rstat;
+
+      if (rval != 0) {
+        results->raw_stats.error_str << "Failed to read dirfrags off disk";
+	goto next;
+      }
+
+      // check each dirfrag...
+      for (const auto &p : in->dirfrags) {
+	CDir *dir = p.second;
+	ceph_assert(dir->get_version() > 0);
+	nest_info.add(dir->get_fnode()->accounted_rstat);
+	dir_info.add(dir->get_fnode()->accounted_fragstat);
+      }
+      nest_info.rsubdirs++; // it gets one to account for self
+      if (const sr_t *srnode = in->get_projected_srnode(); srnode)
+	nest_info.rsnaps += srnode->snaps.size();
+
+      // ...and that their sum matches our inode settings
+      if (!dir_info.same_sums(in->get_inode()->dirstat) ||
+	  !nest_info.same_sums(in->get_inode()->rstat)) {
+	if (in->scrub_infop->header->get_repair()) {
+	  results->raw_stats.error_str
+	    << "freshly-calculated rstats don't match existing ones (will be fixed)";
+	  in->mdcache->repair_inode_stats(in);
+          results->raw_stats.repaired = true;
+	} else {
+	  results->raw_stats.error_str
+	    << "freshly-calculated rstats don't match existing ones";
+	}
+        if (in->is_dirty()) {
+          MDCache *mdcache = in->mdcache; // for dout()
+          auto ino = [this]() { return in->ino(); }; // for dout()
+          dout(20) << "raw stats most likely wont match since inode is dirty; "
+                      "please rerun scrub when system is stable; "
+                      "assuming passed for now;" << dendl;
+          results->raw_stats.passed = true;
+        }
+	goto next;
+      }
+
+      results->raw_stats.passed = true;
+      {
+        MDCache *mdcache = in->mdcache; // for dout()
+        auto ino = [this]() { return in->ino(); }; // for dout()
+        dout(20) << "raw stats check passed on " << *in << dendl;
+      }
+
+next:
+      return true;
+    }
+
+    void _done() override {
+      if ((!results->raw_stats.checked || results->raw_stats.passed) &&
+	  (!results->backtrace.checked || results->backtrace.passed) &&
+	  (!results->inode.checked || results->inode.passed))
+	results->passed_validation = true;
+
+      // Flag that we did some repair work so that our repair operation
+      // can be flushed at end of scrub
+      if (results->backtrace.repaired ||
+	  results->inode.repaired ||
+	  results->raw_stats.repaired)
+	in->scrub_infop->header->set_repaired();
+      if (fin)
+	fin->complete(get_rval());
+
+      in->auth_unpin(this);  // pairs with auth_pin(this) in _start()
+    }
+  };
+
+
+  dout(10) << "scrub starting validate_disk_state on " << *this << dendl;
+  ValidationContinuation *vc = new ValidationContinuation(this,
+                                                          results,
+                                                          fin);
+  vc->begin();  // NOTE(review): vc is not deleted here; presumably it frees itself when done — confirm
+}
+// Dump scrub results to f; the inode member has no section here and only feeds return_code.
+void CInode::validated_data::dump(Formatter *f) const
+{
+  f->open_object_section("results");
+  {
+    f->dump_bool("performed_validation", performed_validation);
+    f->dump_bool("passed_validation", passed_validation);
+    f->open_object_section("backtrace");
+    {
+      f->dump_bool("checked", backtrace.checked);
+      f->dump_bool("passed", backtrace.passed);
+      f->dump_int("read_ret_val", backtrace.ondisk_read_retval);
+      f->dump_stream("ondisk_value") << backtrace.ondisk_value;
+      f->dump_stream("memoryvalue") << backtrace.memory_value;  // key has no underscore, unlike raw_stats
+      f->dump_string("error_str", backtrace.error_str.str());
+    }
+    f->close_section(); // backtrace
+    f->open_object_section("raw_stats");
+    {
+      f->dump_bool("checked", raw_stats.checked);
+      f->dump_bool("passed", raw_stats.passed);
+      f->dump_int("read_ret_val", raw_stats.ondisk_read_retval);
+      f->dump_stream("ondisk_value.dirstat") << raw_stats.ondisk_value.dirstat;
+      f->dump_stream("ondisk_value.rstat") << raw_stats.ondisk_value.rstat;
+      f->dump_stream("memory_value.dirstat") << raw_stats.memory_value.dirstat;
+      f->dump_stream("memory_value.rstat") << raw_stats.memory_value.rstat;
+      f->dump_string("error_str", raw_stats.error_str.str());
+    }
+    f->close_section(); // raw_stats
+    // dump failure return code: the last non-zero retval of backtrace, inode, raw_stats wins
+    int rc = 0;
+    if (backtrace.checked && backtrace.ondisk_read_retval)
+      rc = backtrace.ondisk_read_retval;
+    if (inode.checked && inode.ondisk_read_retval)
+      rc = inode.ondisk_read_retval;
+    if (raw_stats.checked && raw_stats.ondisk_read_retval)
+      rc = raw_stats.ondisk_read_retval;
+    f->dump_int("return_code", rc);
+  }
+  f->close_section(); // results
+}
+
+bool CInode::validated_data::all_damage_repaired() const
+{
+  // A member is still damaged if it was checked, did not pass, and was not repaired.
+  if (raw_stats.checked && !raw_stats.passed && !raw_stats.repaired)
+    return false;
+  if (backtrace.checked && !backtrace.passed && !backtrace.repaired)
+    return false;
+  if (inode.checked && !inode.passed && !inode.repaired)
+    return false;
+  return true;  // nothing that was checked is left both failing and unrepaired
+}
+// Dump the sections of this inode selected by the DUMP_* bits in flags to f.
+void CInode::dump(Formatter *f, int flags) const
+{
+  if (flags & DUMP_PATH) {
+    std::string path;
+    make_path_string(path, true);
+    if (path.empty())
+      path = "/";  // an empty path is reported as "/"
+    f->dump_string("path", path);
+  }
+
+  if (flags & DUMP_INODE_STORE_BASE)
+    InodeStoreBase::dump(f);
+  
+  if (flags & DUMP_MDS_CACHE_OBJECT)
+    MDSCacheObject::dump(f);
+
+  if (flags & DUMP_LOCKS) {  // one object section per lock member
+    f->open_object_section("versionlock");
+    versionlock.dump(f);
+    f->close_section();
+
+    f->open_object_section("authlock");
+    authlock.dump(f);
+    f->close_section();
+
+    f->open_object_section("linklock");
+    linklock.dump(f);
+    f->close_section();
+
+    f->open_object_section("dirfragtreelock");
+    dirfragtreelock.dump(f);
+    f->close_section();
+
+    f->open_object_section("filelock");
+    filelock.dump(f);
+    f->close_section();
+
+    f->open_object_section("xattrlock");
+    xattrlock.dump(f);
+    f->close_section();
+
+    f->open_object_section("snaplock");
+    snaplock.dump(f);
+    f->close_section();
+
+    f->open_object_section("nestlock");
+    nestlock.dump(f);
+    f->close_section();
+
+    f->open_object_section("flocklock");
+    flocklock.dump(f);
+    f->close_section();
+
+    f->open_object_section("policylock");
+    policylock.dump(f);
+    f->close_section();
+  }
+
+  if (flags & DUMP_STATE) {
+    f->open_array_section("states");
+    MDSCacheObject::dump_states(f);  // base-class states first, then the CInode ones below
+    if (state_test(STATE_EXPORTING))
+      f->dump_string("state", "exporting");
+    if (state_test(STATE_OPENINGDIR))
+      f->dump_string("state", "openingdir");
+    if (state_test(STATE_FREEZING))
+      f->dump_string("state", "freezing");
+    if (state_test(STATE_FROZEN))
+      f->dump_string("state", "frozen");
+    if (state_test(STATE_AMBIGUOUSAUTH))
+      f->dump_string("state", "ambiguousauth");
+    if (state_test(STATE_EXPORTINGCAPS))
+      f->dump_string("state", "exportingcaps");
+    if (state_test(STATE_NEEDSRECOVER))
+      f->dump_string("state", "needsrecover");
+    if (state_test(STATE_PURGING))
+      f->dump_string("state", "purging");
+    if (state_test(STATE_DIRTYPARENT))
+      f->dump_string("state", "dirtyparent");
+    if (state_test(STATE_DIRTYRSTAT))
+      f->dump_string("state", "dirtyrstat");
+    if (state_test(STATE_STRAYPINNED))
+      f->dump_string("state", "straypinned");
+    if (state_test(STATE_FROZENAUTHPIN))
+      f->dump_string("state", "frozenauthpin");
+    if (state_test(STATE_DIRTYPOOL))
+      f->dump_string("state", "dirtypool");
+    if (state_test(STATE_ORPHAN))
+      f->dump_string("state", "orphan");
+    if (state_test(STATE_MISSINGOBJS))
+      f->dump_string("state", "missingobjs");
+    f->close_section();
+  }
+
+  if (flags & DUMP_CAPS) {
+    f->open_array_section("client_caps");
+    for (const auto &p : client_caps) {
+      auto &client = p.first;
+      auto cap = &p.second;
+      f->open_object_section("client_cap");
+      f->dump_int("client_id", client.v);
+      f->dump_string("pending", ccap_string(cap->pending()));
+      f->dump_string("issued", ccap_string(cap->issued()));
+      f->dump_string("wanted", ccap_string(cap->wanted()));
+      f->dump_int("last_sent", cap->get_last_seq());
+      f->close_section();
+    }
+    f->close_section();
+
+    f->dump_int("loner", loner_cap.v);
+    f->dump_int("want_loner", want_loner_cap.v);
+
+    f->open_array_section("mds_caps_wanted");
+    for (const auto &p : mds_caps_wanted) {
+      f->open_object_section("mds_cap_wanted");
+      f->dump_int("rank", p.first);
+      f->dump_string("cap", ccap_string(p.second));
+      f->close_section();
+    }
+    f->close_section();
+  }
+
+  if (flags & DUMP_DIRFRAGS) {
+    f->open_array_section("dirfrags");
+    auto&& dfs = get_dirfrags();
+    for(const auto &dir: dfs) {
+      f->open_object_section("dir");
+      dir->dump(f, CDir::DUMP_DEFAULT | CDir::DUMP_ITEMS);
+      dir->check_rstats();  // NOTE(review): a check call on the dump path — confirm it has no side effects
+      f->close_section();
+    }
+    f->close_section();
+  }
+}
+
+/****** Scrub Stuff *****/
+void CInode::scrub_info_create() const
+{
+  dout(25) << __func__ << dendl;
+  ceph_assert(!scrub_infop);
+
+  // This method is const, so const_cast is needed to install the record.
+  // The record starts from the projected inode's last scrub stamp/version.
+  CInode *self = const_cast<CInode*>(this);
+  const auto& projected = self->get_projected_inode();
+
+  std::unique_ptr<scrub_info_t> fresh(new scrub_info_t());
+  fresh->last_scrub_version = projected->last_scrub_version;
+  fresh->last_scrub_stamp = projected->last_scrub_stamp;
+  self->scrub_infop.swap(fresh);  // fresh now holds the old (asserted-null) pointer
+}
+
+void CInode::scrub_maybe_delete_info()
+{
+  // Keep the record if there is none, a scrub is running, or last_scrub_dirty is set.
+  if (!scrub_infop ||
+      scrub_infop->scrub_in_progress || scrub_infop->last_scrub_dirty)
+    return;
+  scrub_infop.reset();
+}
+// Begin a scrub of this inode under header and count it as pending on that header.
+void CInode::scrub_initialize(ScrubHeaderRef& header)
+{
+  dout(20) << __func__ << " with scrub_version " << get_version() << dendl;
+
+  scrub_info();  // NOTE(review): result unused; presumably ensures scrub_infop exists — confirm
+  scrub_infop->scrub_in_progress = true;
+  scrub_infop->queued_frags.clear();
+  scrub_infop->header = header;
+  header->inc_num_pending();  // undone by scrub_aborted() or scrub_finished()
+  // right now we don't handle remote inodes
+}
+// Abort an in-progress scrub: clear the flag, un-count it on the header, maybe free the record.
+void CInode::scrub_aborted() {
+  dout(20) << __func__ << dendl;
+  ceph_assert(scrub_is_in_progress());
+
+  scrub_infop->scrub_in_progress = false;
+  scrub_infop->header->dec_num_pending();
+  scrub_maybe_delete_info();  // frees the record unless last_scrub_dirty is set
+}
+// Record a completed scrub (version, time, dirty flag) and un-count it on the header.
+void CInode::scrub_finished() {
+  dout(20) << __func__ << dendl;
+  ceph_assert(scrub_is_in_progress());
+
+  scrub_infop->last_scrub_version = get_version();
+  scrub_infop->last_scrub_stamp = ceph_clock_now();
+  scrub_infop->last_scrub_dirty = true;  // also keeps scrub_maybe_delete_info() from freeing the record
+  scrub_infop->scrub_in_progress = false;
+  scrub_infop->header->dec_num_pending();
+}
+
+int64_t CInode::get_backtrace_pool() const
+{
+  // Directories: the backtrace pool is the MDS metadata pool.
+  if (is_dir())
+    return mdcache->mds->get_metadata_pool();
+
+  // Anything else is required to carry an explicit layout that names a
+  // pool, so an unset pool id (-1) trips the assert.
+  ceph_assert(get_inode()->layout.pool_id != -1);
+  return get_inode()->layout.pool_id;
+}
+// Queue this inode on mdcache->export_pin_queue if an auth dirfrag needs work for export_pin.
+void CInode::queue_export_pin(mds_rank_t export_pin)
+{
+  if (state_test(CInode::STATE_QUEUEDEXPORTPIN))
+    return;  // already queued
+
+  mds_rank_t target;
+  if (export_pin >= 0)
+    target = export_pin;
+  else if (export_pin == MDS_RANK_EPHEMERAL_RAND)
+    target = mdcache->hash_into_rank_bucket(ino());
+  else
+    target = MDS_RANK_NONE;  // for MDS_RANK_EPHEMERAL_DIST this is replaced per dirfrag below
+
+  unsigned min_frag_bits = mdcache->get_ephemeral_dist_frag_bits();
+  bool queue = false;
+  for (auto& p : dirfrags) {
+    CDir *dir = p.second;
+    if (!dir->is_auth())
+      continue;  // only auth dirfrags are considered
+
+    if (export_pin == MDS_RANK_EPHEMERAL_DIST) {
+      if (dir->get_frag().bits() < min_frag_bits) {
+	// needs split
+	queue = true;
+	break;
+      }
+      target = mdcache->hash_into_rank_bucket(ino(), dir->get_frag());
+    }
+
+    if (target != MDS_RANK_NONE) {
+      if (dir->is_subtree_root()) {
+	// set auxsubtree bit or export it
+	if (!dir->state_test(CDir::STATE_AUXSUBTREE) ||
+	    target != dir->get_dir_auth().first)
+	  queue = true;
+      } else {
+	// create aux subtree or export it
+	queue = true;
+      }
+    } else {
+      // clear aux subtrees ?
+      queue = dir->state_test(CDir::STATE_AUXSUBTREE);
+    }
+
+    if (queue)
+      break;  // one dirfrag needing work is enough
+  }
+  if (queue) {
+    state_set(CInode::STATE_QUEUEDEXPORTPIN);
+    mdcache->export_pin_queue.insert(this);
+  }
+}
+
+void CInode::maybe_export_pin(bool update)
+{
+  // Applies only to normal directories, and only while mds_bal_export_pin is on.
+  const bool eligible =
+    g_conf()->mds_bal_export_pin && is_dir() && is_normal();
+  if (!eligible)
+    return;
+
+  dout(15) << __func__ << " update=" << update << " " << *this << dendl;
+
+  const mds_rank_t pin = get_export_pin(false);
+  if (pin != MDS_RANK_NONE || update) {  // an unpinned inode is only processed on update
+    check_pin_policy(pin);
+    queue_export_pin(pin);
+  }
+}
+
+void CInode::set_ephemeral_pin(bool dist, bool rand)
+{
+  // Translate the two flags into the matching state bits.
+  const unsigned wanted = (dist ? STATE_DISTEPHEMERALPIN : 0) |
+			  (rand ? STATE_RANDEPHEMERALPIN : 0);
+  if (wanted == 0)
+    return;  // neither kind was requested
+  if (state_test(wanted) == wanted)
+    return;  // every requested bit is already set
+
+  dout(10) << "set ephemeral (" << (dist ? "dist" : "")
+	   << (rand ? " rand" : "") << ") pin on " << *this << dendl;
+  // No ephemeral pin yet: add this inode to the cache's set, which must
+  // not already contain it.
+  if (!is_ephemerally_pinned()) {
+    auto inserted = mdcache->export_ephemeral_pins.insert(this);
+    ceph_assert(inserted.second);
+  }
+  state_set(wanted);
+}
+
+void CInode::clear_ephemeral_pin(bool dist, bool rand)
+{
+  // Translate the two flags into the matching state bits.
+  const unsigned doomed = (dist ? STATE_DISTEPHEMERALPIN : 0) |
+			  (rand ? STATE_RANDEPHEMERALPIN : 0);
+  if (!state_test(doomed))
+    return;  // none of the selected bits is currently set
+
+  dout(10) << "clear ephemeral (" << (dist ? "dist" : "")
+	   << (rand ? " rand" : "") << ") pin on " << *this << dendl;
+  state_clear(doomed);
+  // No ephemeral pin left: remove this inode from the cache's set, where
+  // exactly one entry is expected.
+  if (!is_ephemerally_pinned()) {
+    auto erased = mdcache->export_ephemeral_pins.erase(this);
+    ceph_assert(erased == 1);
+  }
+}
+// Maybe apply a random ephemeral export pin; a negative threshold means "look it up here".
+void CInode::maybe_ephemeral_rand(double threshold)
+{
+  if (!mdcache->get_export_ephemeral_random_config()) {
+    dout(15) << __func__ << " config false: cannot ephemeral random pin " << *this << dendl;
+    clear_ephemeral_pin(false, true);
+    return;
+  } else if (!is_dir() || !is_normal()) {
+    dout(15) << __func__ << " !dir or !normal: cannot ephemeral random pin " << *this << dendl;
+    clear_ephemeral_pin(false, true);
+    return;
+  } else if (get_inode()->nlink == 0) {
+    dout(15) << __func__ << " unlinked directory: cannot ephemeral random pin " << *this << dendl;
+    clear_ephemeral_pin(false, true);
+    return;
+  } else if (state_test(CInode::STATE_RANDEPHEMERALPIN)) {
+    dout(10) << __func__ << " already ephemeral random pinned: requeueing " << *this << dendl;
+    queue_export_pin(MDS_RANK_EPHEMERAL_RAND);
+    return;
+  }
+
+  /* not precomputed? */
+  if (threshold < 0.0) {
+    threshold = get_ephemeral_rand();
+  }
+  if (threshold <= 0.0) {
+    return;  // no random pinning applies
+  }
+  double n = ceph::util::generate_random_number(0.0, 1.0);
+
+  dout(15) << __func__ << " rand " << n << " <?= " << threshold
+           << " " << *this << dendl;
+
+  if (n <= threshold) {
+    dout(10) << __func__ << " randomly export pinning " << *this << dendl;
+    set_ephemeral_pin(false, true);
+    queue_export_pin(MDS_RANK_EPHEMERAL_RAND);
+  }
+}
+// Store probability as the projected inode's export_ephemeral_random_pin (directories only).
+void CInode::setxattr_ephemeral_rand(double probability)
+{
+  ceph_assert(is_dir());
+  _get_projected_inode()->export_ephemeral_random_pin = probability;
+}
+// Store val as the projected inode's export_ephemeral_distributed_pin (directories only).
+void CInode::setxattr_ephemeral_dist(bool val)
+{
+  ceph_assert(is_dir());
+  _get_projected_inode()->export_ephemeral_distributed_pin = val;
+}
+// Set the projected inode's export_pin to rank, then re-evaluate pinning (directories only).
+void CInode::set_export_pin(mds_rank_t rank)
+{
+  ceph_assert(is_dir());
+  _get_projected_inode()->export_pin = rank;
+  maybe_export_pin(true);  // update=true: processed even if the pin resolves to MDS_RANK_NONE
+}
+// Resolve the export pin for this inode, walking up through parents when inherit is true.
+mds_rank_t CInode::get_export_pin(bool inherit) const
+{
+  if (!g_conf()->mds_bal_export_pin)
+    return MDS_RANK_NONE;
+
+  /* An inode that is export pinned may not necessarily be a subtree root, we
+   * need to traverse the parents. A base or system inode cannot be pinned.
+   * N.B. inodes not yet linked into a dir (i.e. anonymous inodes) will not
+   * have a parent yet.
+   */
+  mds_rank_t r_target = MDS_RANK_NONE;  // set once a random-pinned inode is seen (inherit only)
+  const CInode *in = this;
+  const CDir *dir = nullptr;  // the dir stepped up through last; set before in != this
+  while (true) {
+    if (in->is_system())
+      break;
+    const CDentry *pdn = in->get_parent_dn();
+    if (!pdn)
+      break;
+    if (in->get_inode()->nlink == 0) {
+      // ignore export pin for unlinked directory
+      break;
+    }
+
+    if (in->get_inode()->export_pin >= 0) {
+      return in->get_inode()->export_pin;
+    } else if (in->get_inode()->export_ephemeral_distributed_pin &&
+	       mdcache->get_export_ephemeral_distributed_config()) {
+      if (in != this)
+	return mdcache->hash_into_rank_bucket(in->ino(), dir->get_frag());
+      return MDS_RANK_EPHEMERAL_DIST;
+    } else if (r_target != MDS_RANK_NONE && in->get_inode()->export_ephemeral_random_pin > 0.0) {
+      return r_target;
+    } else if (r_target == MDS_RANK_NONE && in->is_ephemeral_rand() &&
+	       mdcache->get_export_ephemeral_random_config()) {
+      /* If a parent overrides a grandparent ephemeral pin policy with an export pin, we use that export pin instead. */
+      if (!inherit)
+	return MDS_RANK_EPHEMERAL_RAND;
+      if (in == this)
+	r_target = MDS_RANK_EPHEMERAL_RAND;
+      else
+	r_target = mdcache->hash_into_rank_bucket(in->ino());
+    }
+
+    if (!inherit)
+      break;  // only this inode itself is examined
+    dir = pdn->get_dir();
+    in = dir->inode;
+  }
+  return MDS_RANK_NONE;  // every break above lands here, whatever r_target holds
+}
+// Bring the ephemeral-pin state bits in line with the given export_pin value.
+void CInode::check_pin_policy(mds_rank_t export_pin)
+{
+  if (export_pin == MDS_RANK_EPHEMERAL_DIST) {
+    set_ephemeral_pin(true, false);  // dist on ...
+    clear_ephemeral_pin(false, true);  // ... rand off
+  } else if (export_pin == MDS_RANK_EPHEMERAL_RAND) {
+    set_ephemeral_pin(false, true);  // rand on ...
+    clear_ephemeral_pin(true, false);  // ... dist off
+  } else if (is_ephemerally_pinned()) {
+    // export_pin >= 0 || export_pin == MDS_RANK_NONE
+    clear_ephemeral_pin(true, true);
+    if (export_pin != get_inode()->export_pin) // inherited export_pin
+      queue_export_pin(MDS_RANK_NONE);
+  }
+}
+// Walk up from this inode to find the random-pin probability that applies; 0.0 if none.
+double CInode::get_ephemeral_rand() const
+{
+  /* N.B. inodes not yet linked into a dir (i.e. anonymous inodes) will not
+   * have a parent yet.
+   */
+  const CInode *in = this;
+  double max = mdcache->export_ephemeral_random_max;  // upper bound on the returned value
+  while (true) {
+    if (in->is_system())
+      break;
+    const CDentry *pdn = in->get_parent_dn();
+    if (!pdn)
+      break;
+    // ignore export pin for unlinked directory
+    if (in->get_inode()->nlink == 0)
+      break;
+
+    if (in->get_inode()->export_ephemeral_random_pin > 0.0)
+      return std::min(in->get_inode()->export_ephemeral_random_pin, max);
+
+    /* An export_pin overrides only if no closer parent (incl. this one) has a
+     * random pin set.
+     */
+    if (in->get_inode()->export_pin >= 0 ||
+	in->get_inode()->export_ephemeral_distributed_pin)
+      return 0.0;
+
+    in = pdn->get_dir()->inode;  // step up to the inode of the containing dir
+  }
+  return 0.0;
+}
+
+void CInode::get_nested_dirfrags(std::vector<CDir*>& v) const
+{
+  // Appends (v is not cleared) every entry of dirfrags that is not a subtree root.
+  for (const auto& entry : dirfrags) {
+    if (CDir *nested = entry.second; !nested->is_subtree_root())
+      v.push_back(nested);
+  }
+}
+
+void CInode::get_subtree_dirfrags(std::vector<CDir*>& v) const
+{
+  // Appends (v is not cleared) every entry of dirfrags that is a subtree root.
+  for (const auto& entry : dirfrags) {
+    if (CDir *root = entry.second; root->is_subtree_root())
+      v.push_back(root);
+  }
+}
+
+MEMPOOL_DEFINE_OBJECT_FACTORY(CInode, co_inode, mds_co);  // presumably ties CInode allocation to the mds_co mempool — confirm
diff --git a/src/mds/CInode.h b/src/mds/CInode.h
new file mode 100644
index 000000000..fcf215c70
--- /dev/null
+++ b/src/mds/CInode.h
@@ -0,0 +1,1250 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software 
+ * Foundation.  See file COPYING.
+ * 
+ */
+
+#ifndef CEPH_CINODE_H
+#define CEPH_CINODE_H
+
+#include <list>
+#include <map>
+#include <set>
+#include <string_view>
+
+#include "common/config.h"
+#include "common/RefCountedObj.h"
+#include "include/compat.h"
+#include "include/counter.h"
+#include "include/elist.h"
+#include "include/types.h"
+#include "include/lru.h"
+#include "include/compact_set.h"
+
+#include "MDSCacheObject.h"
+#include "MDSContext.h"
+#include "flock.h"
+
+#include "BatchOp.h"
+#include "CDentry.h"
+#include "SimpleLock.h"
+#include "ScatterLock.h"
+#include "LocalLockC.h"
+#include "Capability.h"
+#include "SnapRealm.h"
+#include "Mutation.h"
+
+#include "messages/MClientCaps.h"
+
+#define dout_context g_ceph_context
+
+class Context;
+class CDir;
+class CInode;
+class MDCache;
+class LogSegment;
+struct SnapRealm;
+class Session;
+struct ObjectOperation;
+class EMetaBlob;
+
+struct cinode_lock_info_t {  // plain record of two ints describing one inode lock entry
+  int lock;     // NOTE(review): presumably a lock type id — confirm against users
+  int wr_caps;  // NOTE(review): presumably write-cap bits tied to that lock — confirm
+};
+
+/// A single commit operation targeting one pool, optionally carrying a layout update.
+struct CInodeCommitOperation {
+  /// Operation without a layout update (update_layout stays false).
+  CInodeCommitOperation(int prio, int64_t po)
+    : pool(po), priority(prio) {}
+  /// Operation that also carries layout @p l and feature bits @p f.
+  CInodeCommitOperation(int prio, int64_t po, file_layout_t l, uint64_t f)
+    : pool(po), priority(prio), update_layout(true), _layout(l), _features(f) {}
+
+  void update(ObjectOperation &op, inode_backtrace_t &bt);
+  int64_t get_pool() const { return pool; }  // const so it is callable on const objects
+
+private:
+  int64_t pool;     ///< pool id
+  int priority;
+  bool update_layout = false;
+  file_layout_t _layout;
+  // zero-initialised so the two-argument constructor never leaves it indeterminate
+  uint64_t _features = 0;
+};
+
+struct CInodeCommitOperations {  // a set of commit operations together with their backtrace
+  std::vector<CInodeCommitOperation> ops_vec;  ///< the individual operations
+  inode_backtrace_t bt;                        ///< backtrace that accompanies the operations
+  version_t version = 0;                       ///< initialised so a fresh object holds no indeterminate value
+  CInode *in = nullptr;                        ///< null until set; presumably the inode being committed — confirm
+};
+
+/**
+ * Base class for CInode, containing the backing store data and
+ * serialization methods.  This exists so that we can read and
+ * handle CInodes from the backing store without hitting all
+ * the business logic in CInode proper.
+ */
+class InodeStoreBase {  // holds inode, xattrs and old_inodes pointers plus their (de)serialization entry points
+public:
+  using mempool_inode = inode_t<mempool::mds_co::pool_allocator>;
+  using inode_ptr = std::shared_ptr<mempool_inode>;
+  using inode_const_ptr = std::shared_ptr<const mempool_inode>;
+
+  template <typename ...Args>
+  static inode_ptr allocate_inode(Args && ...args) {  // shared mempool_inode built from the forwarded args
+    static mempool::mds_co::pool_allocator<mempool_inode> allocator;
+    return std::allocate_shared<mempool_inode>(allocator, std::forward<Args>(args)...);
+  }
+  
+  using mempool_xattr_map = xattr_map<mempool::mds_co::pool_allocator>; // FIXME bufferptr not in mempool
+  using xattr_map_ptr = std::shared_ptr<mempool_xattr_map>;
+  using xattr_map_const_ptr = std::shared_ptr<const mempool_xattr_map>;
+
+  template <typename ...Args>
+  static xattr_map_ptr allocate_xattr_map(Args && ...args) {  // shared xattr map built from the forwarded args
+    static mempool::mds_co::pool_allocator<mempool_xattr_map> allocator;
+    return std::allocate_shared<mempool_xattr_map>(allocator, std::forward<Args>(args)...);
+  }
+
+  using mempool_old_inode = old_inode_t<mempool::mds_co::pool_allocator>;
+  using mempool_old_inode_map = mempool::mds_co::map<snapid_t, mempool_old_inode>;
+  using old_inode_map_ptr = std::shared_ptr<mempool_old_inode_map>;
+  using old_inode_map_const_ptr = std::shared_ptr<const mempool_old_inode_map>;
+
+  template <typename ...Args>
+  static old_inode_map_ptr allocate_old_inode_map(Args && ...args) {  // shared old-inode map built from the forwarded args
+    static mempool::mds_co::pool_allocator<mempool_old_inode_map> allocator;
+    return std::allocate_shared<mempool_old_inode_map>(allocator, std::forward<Args>(args)...);
+  }
+
+  void reset_inode(inode_const_ptr&& ptr) {  // replace the current inode pointer by moving from ptr
+    inode = std::move(ptr);
+  }
+
+  void reset_xattrs(xattr_map_const_ptr&& ptr) {  // replace the current xattr map pointer by moving from ptr
+    xattrs = std::move(ptr);
+  }
+
+  void reset_old_inodes(old_inode_map_const_ptr&& ptr) {  // replace the current old-inode map pointer by moving from ptr
+    old_inodes = std::move(ptr);
+  }
+
+  void encode_xattrs(bufferlist &bl) const;
+  void decode_xattrs(bufferlist::const_iterator &p);
+  void encode_old_inodes(bufferlist &bl, uint64_t features) const;
+  void decode_old_inodes(bufferlist::const_iterator &p);
+
+  /* Helpers */
+  static object_t get_object_name(inodeno_t ino, frag_t fg, std::string_view suffix);
+
+  /* Full serialization for use in ".inode" root inode objects */
+  void encode(ceph::buffer::list &bl, uint64_t features, const ceph::buffer::list *snap_blob=NULL) const;
+  void decode(ceph::buffer::list::const_iterator &bl, ceph::buffer::list& snap_blob);
+
+  /* Serialization without ENCODE_START/FINISH blocks for use embedded in dentry */
+  void encode_bare(ceph::buffer::list &bl, uint64_t features, const ceph::buffer::list *snap_blob=NULL) const;
+  void decode_bare(ceph::buffer::list::const_iterator &bl, ceph::buffer::list &snap_blob, __u8 struct_v=5);
+
+  /* For test/debug output */
+  void dump(ceph::Formatter *f) const;
+
+  void decode_json(JSONObj *obj);
+  static void xattrs_cb(InodeStoreBase::mempool_xattr_map& c, JSONObj *obj);
+  static void old_indoes_cb(InodeStoreBase::mempool_old_inode_map& c, JSONObj *obj);  // NOTE(review): "indoes" looks like a typo of "inodes"; the name must match its definition elsewhere
+  
+  /* For use by offline tools */
+  __u32 hash_dentry_name(std::string_view dn);
+  frag_t pick_dirfrag(std::string_view dn);
+
+  mempool::mds_co::string	symlink;      // symlink dest, if symlink
+  fragtree_t			dirfragtree;  // dir frag tree, if any.  always consistent with our dirfrag map.
+  snapid_t 			oldest_snap = CEPH_NOSNAP;
+  damage_flags_t 		damage_flags = 0;
+
+protected:
+  static inode_const_ptr	empty_inode;  // shared placeholder that 'inode' points at by default
+
+  // Following members are pointers to constant data, the constant data can
+  // be shared by CInode and log events. To update these members in CInode,
+  // read-copy-update should be used.
+  inode_const_ptr		inode = empty_inode;
+  xattr_map_const_ptr		xattrs;
+  old_inode_map_const_ptr	old_inodes;   // key = last, value.first = first
+};
+
+inline void decode_noshare(InodeStoreBase::mempool_xattr_map& xattrs,
+                          ceph::buffer::list::const_iterator &p)
+{  // convenience overload: forwards to the template with the mds_co pool allocator
+  decode_noshare<mempool::mds_co::pool_allocator>(xattrs, p);
+}
+
+class InodeStore : public InodeStoreBase {  // InodeStoreBase plus an owned snap_blob and mutable accessors
+public:
+  mempool_inode* get_inode() {  // mutable inode; first swaps the shared empty_inode for a fresh allocation
+    if (inode == empty_inode)
+      reset_inode(allocate_inode());
+    return const_cast<mempool_inode*>(inode.get());
+  }
+  mempool_xattr_map* get_xattrs() { return const_cast<mempool_xattr_map*>(xattrs.get()); }  // null while xattrs is unset
+
+  void encode(ceph::buffer::list &bl, uint64_t features) const {  // base encode, passing our snap_blob
+    InodeStoreBase::encode(bl, features, &snap_blob);
+  }
+  void decode(ceph::buffer::list::const_iterator &bl) {  // base decode, filling our snap_blob
+    InodeStoreBase::decode(bl, snap_blob);
+  }
+  void encode_bare(ceph::buffer::list &bl, uint64_t features) const {  // bare variant of encode()
+    InodeStoreBase::encode_bare(bl, features, &snap_blob);
+  }
+  void decode_bare(ceph::buffer::list::const_iterator &bl) {  // bare variant of decode(); uses the base's default struct_v
+    InodeStoreBase::decode_bare(bl, snap_blob);
+  }
+
+  static void generate_test_instances(std::list<InodeStore*>& ls);
+
+  using InodeStoreBase::inode;  // re-exported as public (protected in the base)
+  using InodeStoreBase::xattrs;
+  using InodeStoreBase::old_inodes;
+
+  // FIXME bufferlist not part of mempool
+  ceph::buffer::list snap_blob;  // Encoded copy of SnapRealm, because we can't
+			 // rehydrate it without full MDCache
+};
+WRITE_CLASS_ENCODER_FEATURES(InodeStore)
+
+// just for ceph-dencoder
+class InodeStoreBare : public InodeStore {  // same data as InodeStore, but encode/decode use the bare format
+public:
+  void encode(ceph::buffer::list &bl, uint64_t features) const {  // delegates to encode_bare
+    InodeStore::encode_bare(bl, features);
+  }
+  void decode(ceph::buffer::list::const_iterator &bl) {  // delegates to decode_bare
+    InodeStore::decode_bare(bl);
+  }
+  static void generate_test_instances(std::list<InodeStoreBare*>& ls);
+};
+WRITE_CLASS_ENCODER_FEATURES(InodeStoreBare)
+
+// cached inode wrapper
+class CInode : public MDSCacheObject, public InodeStoreBase, public Counter<CInode> {
+ public:
+  MEMPOOL_CLASS_HELPERS();
+
+  using mempool_cap_map = mempool::mds_co::map<client_t, Capability>;
+  /**
+   * @defgroup Scrubbing and fsck
+   */
+
+  /**
+   * Report the results of validation against a particular inode.
+   * Each member is a member_status record.
+   * <member>.checked represents if validation was performed against the member.
+   * <member>.passed represents if the member passed validation.
+   * performed_validation is set to true if the validation was actually
+   * run. It might not be run if, for instance, the inode is marked as dirty.
+   * passed_validation is set to true if everything that was checked
+   * passed its validation.
+   */
+  struct validated_data {
+    template<typename T>struct member_status {  // per-member validation record
+      bool checked = false;   // validation was performed against this member
+      bool passed = false;    // the member passed validation
+      bool repaired = false;  // NOTE(review): presumably set once damage was fixed — confirm
+      int ondisk_read_retval = 0;
+      T ondisk_value;         // NOTE(review): presumably the value read from disk — confirm
+      T memory_value;         // NOTE(review): presumably the in-memory value it is compared with — confirm
+      std::stringstream error_str;
+    };
+
+    struct raw_stats_t {  // pairs a frag_info_t with a nest_info_t
+      frag_info_t dirstat;
+      nest_info_t rstat;
+    };
+
+    validated_data() {}
+
+    void dump(ceph::Formatter *f) const;
+
+    bool all_damage_repaired() const;
+
+    bool performed_validation = false;
+    bool passed_validation = false;
+
+    member_status<inode_backtrace_t> backtrace;
+    member_status<mempool_inode> inode; // XXX should not be in mempool; wait for pmr
+    member_status<raw_stats_t> raw_stats;
+  };
+
+  // friends
+  friend class Server;
+  friend class Locker;
+  friend class Migrator;
+  friend class MDCache;
+  friend class StrayManager;
+  friend class CDir;
+  friend std::ostream& operator<<(std::ostream&, const CInode&);
+
+  class scrub_info_t {
+  public:
+    scrub_info_t() {}
+
+    version_t last_scrub_version = 0;
+    utime_t last_scrub_stamp;
+
+    bool last_scrub_dirty = false; /// are our stamps dirty with respect to disk state?
+    bool scrub_in_progress = false; /// are we currently scrubbing?
+
+    fragset_t queued_frags;
+
+    ScrubHeaderRef header;
+  };
+
+  // -- pins --
+  static const int PIN_DIRFRAG =         -1; 
+  static const int PIN_CAPS =             2;  // client caps
+  static const int PIN_IMPORTING =       -4;  // importing
+  static const int PIN_OPENINGDIR =       7;
+  static const int PIN_REMOTEPARENT =     8;
+  static const int PIN_BATCHOPENJOURNAL = 9;
+  static const int PIN_SCATTERED =        10;
+  static const int PIN_STICKYDIRS =       11;
+  //static const int PIN_PURGING =         -12;	
+  static const int PIN_FREEZING =         13;
+  static const int PIN_FROZEN =           14;
+  static const int PIN_IMPORTINGCAPS =   -15;
+  static const int PIN_PASTSNAPPARENT =  -16;
+  static const int PIN_OPENINGSNAPPARENTS = 17;
+  static const int PIN_TRUNCATING =       18;
+  static const int PIN_STRAY =            19;  // we pin our stray inode while active
+  static const int PIN_NEEDSNAPFLUSH =    20;
+  static const int PIN_DIRTYRSTAT =       21;
+  static const int PIN_EXPORTINGCAPS =    22;
+  static const int PIN_DIRTYPARENT =      23;
+  static const int PIN_DIRWAITER =        24;
+
+  // -- dump flags --
+  static const int DUMP_INODE_STORE_BASE = (1 << 0);
+  static const int DUMP_MDS_CACHE_OBJECT = (1 << 1);
+  static const int DUMP_LOCKS =            (1 << 2);
+  static const int DUMP_STATE =            (1 << 3);
+  static const int DUMP_CAPS =             (1 << 4);
+  static const int DUMP_PATH =             (1 << 5);
+  static const int DUMP_DIRFRAGS =         (1 << 6);
+  static const int DUMP_ALL =              (-1);
+  static const int DUMP_DEFAULT = DUMP_ALL & (~DUMP_PATH) & (~DUMP_DIRFRAGS);
+
+  // -- state --
+  static const int STATE_EXPORTING 		= (1<<0);   // on nonauth bystander.
+  static const int STATE_OPENINGDIR		= (1<<1);
+  static const int STATE_FREEZING		= (1<<2);
+  static const int STATE_FROZEN			= (1<<3);
+  static const int STATE_AMBIGUOUSAUTH		= (1<<4);
+  static const int STATE_EXPORTINGCAPS		= (1<<5);
+  static const int STATE_NEEDSRECOVER		= (1<<6);
+  static const int STATE_RECOVERING		= (1<<7);
+  static const int STATE_PURGING		= (1<<8);
+  static const int STATE_DIRTYPARENT		= (1<<9);
+  static const int STATE_DIRTYRSTAT		= (1<<10);
+  static const int STATE_STRAYPINNED		= (1<<11);
+  static const int STATE_FROZENAUTHPIN		= (1<<12);
+  static const int STATE_DIRTYPOOL		= (1<<13);
+  static const int STATE_REPAIRSTATS		= (1<<14);
+  static const int STATE_MISSINGOBJS		= (1<<15);
+  static const int STATE_EVALSTALECAPS		= (1<<16);
+  static const int STATE_QUEUEDEXPORTPIN	= (1<<17);
+  static const int STATE_TRACKEDBYOFT		= (1<<18);  // tracked by open file table
+  static const int STATE_DELAYEDEXPORTPIN	= (1<<19);
+  static const int STATE_DISTEPHEMERALPIN       = (1<<20);
+  static const int STATE_RANDEPHEMERALPIN       = (1<<21);
+  static const int STATE_CLIENTWRITEABLE	= (1<<22);
+
+  // orphan inode needs notification of releasing reference
+  static const int STATE_ORPHAN =	STATE_NOTIFYREF;
+
+  static const int MASK_STATE_EXPORTED =
+    (STATE_DIRTY|STATE_NEEDSRECOVER|STATE_DIRTYPARENT|STATE_DIRTYPOOL|
+    STATE_DISTEPHEMERALPIN|STATE_RANDEPHEMERALPIN);
+  static const int MASK_STATE_EXPORT_KEPT =
+    (STATE_FROZEN|STATE_AMBIGUOUSAUTH|STATE_EXPORTINGCAPS|
+     STATE_QUEUEDEXPORTPIN|STATE_TRACKEDBYOFT|STATE_DELAYEDEXPORTPIN|
+     STATE_DISTEPHEMERALPIN|STATE_RANDEPHEMERALPIN);
+
+  /* These are for "permanent" state markers that are passed around between
+   * MDS. Nothing protects/updates it like a typical MDS lock.
+   *
+   * Currently, we just use this for REPLICATED inodes. The reason we need to
+   * replicate the random epin state is because the directory inode is still
+   * under the authority of the parent subtree. So it's not exported normally
+   * and we can't pass around the state that way. The importer of the dirfrags
+   * still needs to know that the inode is random pinned though otherwise it
+   * doesn't know that the dirfrags are pinned.
+   */
+  static const int MASK_STATE_REPLICATED = STATE_RANDEPHEMERALPIN;
+
+  // -- waiters --
+  static const uint64_t WAIT_DIR         = (1<<0);
+  static const uint64_t WAIT_FROZEN      = (1<<1);
+  static const uint64_t WAIT_TRUNC       = (1<<2);
+  static const uint64_t WAIT_FLOCK       = (1<<3);
+  
+  static const uint64_t WAIT_ANY_MASK	= (uint64_t)(-1);
+
+  // misc
+  static const unsigned EXPORT_NONCE = 1; // nonce given to replicas created by export
+
+  // ---------------------------
+  CInode() = delete;
+  CInode(MDCache *c, bool auth=true, snapid_t f=2, snapid_t l=CEPH_NOSNAP);
+  ~CInode() override {  // close dirfrags and snaprealm, clear file locks, then assert nothing is outstanding
+    close_dirfrags();
+    close_snaprealm();
+    clear_file_locks();
+    ceph_assert(num_projected_srnodes == 0);  // each counter below must be zero at destruction
+    ceph_assert(num_caps_notable == 0);
+    ceph_assert(num_subtree_roots == 0);
+    ceph_assert(num_exporting_dirs == 0);
+    ceph_assert(batch_ops.empty());  // no batch operations may remain registered
+  }
+
+  std::map<int, std::unique_ptr<BatchOp>> batch_ops;
+
+  std::string_view pin_name(int p) const override;
+
+  std::ostream& print_db_line_prefix(std::ostream& out) override;
+
+  const scrub_info_t *scrub_info() const {  // read-only scrub info; calls scrub_info_create() when scrub_infop is unset
+    if (!scrub_infop)
+      scrub_info_create();  // NOTE(review): presumably populates scrub_infop — confirm in the definition
+    return scrub_infop.get();
+  }
+
+  const ScrubHeaderRef& get_scrub_header() {
+    static const ScrubHeaderRef nullref;
+    return scrub_infop ? scrub_infop->header : nullref;
+  }
+
+  bool scrub_is_in_progress() const {
+    return (scrub_infop && scrub_infop->scrub_in_progress);
+  }
+  /**
+   * Start scrubbing on this inode. That could be very short if it's
+   * a file, or take a long time if we're recursively scrubbing a directory.
+   * @pre It is not currently scrubbing
+   * @post it has set up internal scrubbing state
+   * @param header The scrub header for this scrub operation (see
+   * scrub_info_t::header and get_scrub_header())
+   */
+  void scrub_initialize(ScrubHeaderRef& header);
+  /**
+   * Call this once the scrub has been completed, whether it's a full
+   * recursive scrub on a directory or simply the data on a file (or
+   * anything in between).
+   * This method takes no parameters and returns nothing; there is no
+   * completion Context for the caller to complete.
+   */
+  void scrub_finished();
+
+  void scrub_aborted();
+
+  fragset_t& scrub_queued_frags() {
+    ceph_assert(scrub_infop);
+    return scrub_infop->queued_frags;
+  }
+
+  bool is_multiversion() const {
+    return snaprealm ||  // other snaprealms will link to me
+      get_inode()->is_dir() ||  // links to me in other snaps
+      get_inode()->nlink > 1 || // there are remote links, possibly snapped, that will need to find me
+      is_any_old_inodes(); // once multiversion, always multiversion.  until old_inodes gets cleaned out.
+  }
+  snapid_t get_oldest_snap();
+
+  bool is_dirty_rstat() {
+    return state_test(STATE_DIRTYRSTAT);
+  }
+  void mark_dirty_rstat();
+  void clear_dirty_rstat();
+
+  //bool hack_accessed = false;
+  //utime_t hack_load_stamp;
+
+  /**
+   * Projection methods, used to store inode changes until they have been journaled,
+   * at which point they are popped.
+   * Usage:
+   * Call project_inode as needed. If you're changing xattrs or sr_t, pass true
+   * for the matching argument and then change the xattrs/snapnode member as
+   * needed. (Dirty exception: project_snaprealm_past_parent allows you to
+   * project the snapnode after doing project_inode, i.e. you don't need to
+   * pass snap=true.)
+   *
+   * Then, journal. Once journaling is done, pop_and_dirty_projected_inode.
+   * This function will take care of the inode itself, the xattrs, and the snaprealm.
+   */
+
+  struct projected_inode {
+    static sr_t* const UNDEF_SRNODE;
+
+    inode_ptr const inode;
+    xattr_map_ptr const xattrs;
+    sr_t* const snapnode;
+
+    projected_inode() = delete;
+    explicit projected_inode(inode_ptr&& i, xattr_map_ptr&& x, sr_t *s=nullptr) :
+      inode(std::move(i)), xattrs(std::move(x)), snapnode(s) {}
+  };
+  projected_inode project_inode(const MutationRef& mut,
+				bool xattr = false, bool snap = false);
+
+  void pop_and_dirty_projected_inode(LogSegment *ls, const MutationRef& mut);
+
+  version_t get_projected_version() const {
+    // the newest projection wins; otherwise use the current inode's version
+    if (!projected_nodes.empty())
+      return projected_nodes.back().inode->version;
+    return get_inode()->version;
+  }
+  bool is_projected() const {
+    return !projected_nodes.empty();
+  }
+
+  const inode_const_ptr& get_projected_inode() const {
+    if (projected_nodes.empty())
+      return get_inode();
+    else
+      return projected_nodes.back().inode;
+  }
+  // inode should have already been projected in caller's context
+  mempool_inode* _get_projected_inode() {
+    ceph_assert(!projected_nodes.empty());
+    return const_cast<mempool_inode*>(projected_nodes.back().inode.get());
+  }
+  // Inode as it was before the newest projection (the current inode if only one exists).
+  const inode_const_ptr& get_previous_projected_inode() const {
+    ceph_assert(!projected_nodes.empty());
+    auto prev = projected_nodes.rbegin();
+    ++prev;  // step past the newest projection
+    if (prev != projected_nodes.rend())
+      return prev->inode;
+    return get_inode();
+  }
+
+  const xattr_map_const_ptr& get_projected_xattrs() {
+    if (projected_nodes.empty())
+      return xattrs;
+    else
+      return projected_nodes.back().xattrs;
+  }
+  // Xattrs as they were before the newest projection (the current xattrs if only one exists).
+  const xattr_map_const_ptr& get_previous_projected_xattrs() {
+    ceph_assert(!projected_nodes.empty());
+    auto prev = projected_nodes.rbegin();
+    ++prev;  // step past the newest projection
+    if (prev != projected_nodes.rend())
+      return prev->xattrs;
+    return xattrs;
+  }
+
+  sr_t *prepare_new_srnode(snapid_t snapid);
+  void project_snaprealm(sr_t *new_srnode);
+  sr_t *project_snaprealm(snapid_t snapid=0) {
+    sr_t* new_srnode = prepare_new_srnode(snapid);
+    project_snaprealm(new_srnode);
+    return new_srnode;
+  }
+  const sr_t *get_projected_srnode() const;
+
+  void mark_snaprealm_global(sr_t *new_srnode);
+  void clear_snaprealm_global(sr_t *new_srnode);
+  bool is_projected_snaprealm_global() const;
+
+  void record_snaprealm_past_parent(sr_t *new_snap, SnapRealm *newparent);
+  void record_snaprealm_parent_dentry(sr_t *new_snap, SnapRealm *newparent,
+				      CDentry *dn, bool primary_dn);
+  void project_snaprealm_past_parent(SnapRealm *newparent);
+  void early_pop_projected_snaprealm();
+
+  const mempool_old_inode& cow_old_inode(snapid_t follows, bool cow_head);
+  void split_old_inode(snapid_t snap);
+  snapid_t pick_old_inode(snapid_t last) const;
+  void pre_cow_old_inode();
+  bool has_snap_data(snapid_t s);
+  void purge_stale_snap_data(const std::set<snapid_t>& snaps);
+
+  size_t get_num_dirfrags() const { return dirfrags.size(); }
+  // Exact lookup: the CDir stored under fg, or nullptr when there is none.
+  CDir* get_dirfrag(frag_t fg) {
+    auto found = dirfrags.find(fg);
+    //assert(g_conf()->debug_mds < 2 || dirfragtree.is_leaf(fg)); // performance hack FIXME
+    if (found != dirfrags.end())
+      return found->second;
+    return nullptr;
+  }
+  std::pair<bool, std::vector<CDir*>> get_dirfrags_under(frag_t fg);
+  CDir* get_approx_dirfrag(frag_t fg);
+
+  template<typename Container>
+  void get_dirfrags(Container& ls) const {
+    // all dirfrags
+    if constexpr (std::is_same_v<Container, std::vector<CDir*>>)
+      ls.reserve(ls.size() + dirfrags.size());
+    for (const auto &p : dirfrags)
+      ls.push_back(p.second);
+  }
+
+  auto get_dirfrags() const {
+    std::vector<CDir*> result;
+    get_dirfrags(result);
+    return result;
+  }
+
+  void get_nested_dirfrags(std::vector<CDir*>&) const;
+  std::vector<CDir*> get_nested_dirfrags() const {
+    std::vector<CDir*> v;
+    get_nested_dirfrags(v);
+    return v;
+  }
+  void get_subtree_dirfrags(std::vector<CDir*>&) const;
+  std::vector<CDir*> get_subtree_dirfrags() const {
+    std::vector<CDir*> v;
+    get_subtree_dirfrags(v);
+    return v;
+  }
+  int get_num_subtree_roots() const {
+    return num_subtree_roots;
+  }
+
+  CDir *get_or_open_dirfrag(MDCache *mdcache, frag_t fg);
+  CDir *add_dirfrag(CDir *dir);
+  void close_dirfrag(frag_t fg);
+  void close_dirfrags();
+  bool has_subtree_root_dirfrag(int auth=-1);
+  bool has_subtree_or_exporting_dirfrag();
+
+  void force_dirfrags();
+  void verify_dirfrags();
+
+  void get_stickydirs();
+  void put_stickydirs();
+
+  void add_need_snapflush(CInode *snapin, snapid_t snapid, client_t client);
+  void remove_need_snapflush(CInode *snapin, snapid_t snapid, client_t client);
+  std::pair<bool,bool> split_need_snapflush(CInode *cowin, CInode *in);
+
+  // -- accessors --
+
+  inodeno_t ino() const { return get_inode()->ino; }
+  vinodeno_t vino() const { return vinodeno_t(ino(), last); }
+  int d_type() const { return IFTODT(get_inode()->mode); }
+  bool is_root() const { return ino() == CEPH_INO_ROOT; }
+  bool is_stray() const { return MDS_INO_IS_STRAY(ino()); }
+  mds_rank_t get_stray_owner() const {
+    return (mds_rank_t)MDS_INO_STRAY_OWNER(ino());
+  }
+  bool is_mdsdir() const { return MDS_INO_IS_MDSDIR(ino()); }
+  bool is_base() const { return MDS_INO_IS_BASE(ino()); }
+  bool is_system() const { return ino() < MDS_INO_SYSTEM_BASE; }
+  bool is_normal() const { return !(is_base() || is_system() || is_stray()); }
+  bool is_file() const    { return get_inode()->is_file(); }
+  bool is_symlink() const { return get_inode()->is_symlink(); }
+  bool is_dir() const     { return get_inode()->is_dir(); }
+
+  bool is_head() const { return last == CEPH_NOSNAP; }
+
+  // note: this overloads MDSCacheObject
+  bool is_ambiguous_auth() const {
+    return state_test(STATE_AMBIGUOUSAUTH) ||
+      MDSCacheObject::is_ambiguous_auth();
+  }
+  void set_ambiguous_auth() {
+    state_set(STATE_AMBIGUOUSAUTH);
+  }
+  void clear_ambiguous_auth(MDSContext::vec& finished);
+  void clear_ambiguous_auth();
+
+  const inode_const_ptr& get_inode() const {
+    return inode;
+  }
+
+  // only used for updating newly allocated CInode
+  mempool_inode* _get_inode() {
+    if (inode == empty_inode)
+      reset_inode(allocate_inode());
+    return const_cast<mempool_inode*>(inode.get());
+  }
+
+  const xattr_map_const_ptr& get_xattrs() const { return xattrs; }
+
+  bool is_any_old_inodes() const { return old_inodes && !old_inodes->empty(); }
+  const old_inode_map_const_ptr& get_old_inodes() const { return old_inodes; }
+
+  CDentry* get_parent_dn() { return parent; }
+  const CDentry* get_parent_dn() const { return parent; }
+  CDentry* get_projected_parent_dn() { return !projected_parent.empty() ? projected_parent.back() : parent; }
+  const CDentry* get_projected_parent_dn() const { return !projected_parent.empty() ? projected_parent.back() : parent; }
+  // the linked parent if set, else the front of projected_parent, else nullptr
+  const CDentry* get_oldest_parent_dn() const {
+    if (parent) return parent;
+    return projected_parent.empty() ? nullptr : projected_parent.front();
+  }
+  CDir *get_parent_dir();
+  const CDir *get_projected_parent_dir() const;
+  CDir *get_projected_parent_dir();
+  CInode *get_parent_inode();
+  
+  bool is_lt(const MDSCacheObject *r) const override {
+    const CInode *o = static_cast<const CInode*>(r);
+    return ino() < o->ino() ||
+      (ino() == o->ino() && last < o->last);
+  }
+
+  // -- misc -- 
+  bool is_ancestor_of(const CInode *other) const;
+  bool is_projected_ancestor_of(const CInode *other) const;
+
+  void make_path_string(std::string& s, bool projected=false, const CDentry *use_parent=NULL) const;
+  void make_path(filepath& s, bool projected=false) const;
+  void name_stray_dentry(std::string& dname);
+  
+  // -- dirtyness --
+  version_t get_version() const { return get_inode()->version; }
+
+  version_t pre_dirty();
+  void _mark_dirty(LogSegment *ls);
+  void mark_dirty(LogSegment *ls);
+  void mark_clean();
+
+  void store(MDSContext *fin);
+  void _stored(int r, version_t cv, Context *fin);
+  /**
+   * Flush a CInode to disk. This includes the backtrace, the parent
+   * directory's link, and the Inode object itself (if a base directory).
+   * @pre is_auth() on both the inode and its containing directory
+   * @pre can_auth_pin()
+   * @param fin The Context to call when the flush is completed.
+   */
+  void flush(MDSContext *fin);
+  void fetch(MDSContext *fin);
+  void _fetched(ceph::buffer::list& bl, ceph::buffer::list& bl2, Context *fin);  
+
+  void _commit_ops(int r, C_GatherBuilder &gather_bld,
+                   std::vector<CInodeCommitOperation> &ops_vec,
+                   inode_backtrace_t &bt);
+  void build_backtrace(int64_t pool, inode_backtrace_t& bt);
+  void _store_backtrace(std::vector<CInodeCommitOperation> &ops_vec,
+                        inode_backtrace_t &bt, int op_prio);
+  void store_backtrace(CInodeCommitOperations &op, int op_prio);
+  void store_backtrace(MDSContext *fin, int op_prio=-1);
+  void _stored_backtrace(int r, version_t v, Context *fin);
+  void fetch_backtrace(Context *fin, ceph::buffer::list *backtrace);
+
+  void mark_dirty_parent(LogSegment *ls, bool dirty_pool=false);
+  void clear_dirty_parent();
+  void verify_diri_backtrace(ceph::buffer::list &bl, int err);
+  bool is_dirty_parent() { return state_test(STATE_DIRTYPARENT); }
+  bool is_dirty_pool() { return state_test(STATE_DIRTYPOOL); }
+
+  void encode_snap_blob(ceph::buffer::list &bl);
+  void decode_snap_blob(const ceph::buffer::list &bl);
+  void encode_store(ceph::buffer::list& bl, uint64_t features);
+  void decode_store(ceph::buffer::list::const_iterator& bl);
+
+  void add_dir_waiter(frag_t fg, MDSContext *c);
+  void take_dir_waiting(frag_t fg, MDSContext::vec& ls);
+  bool is_waiting_for_dir(frag_t fg) {
+    return waiting_on_dir.count(fg);
+  }
+  void add_waiter(uint64_t tag, MDSContext *c) override;
+  void take_waiting(uint64_t tag, MDSContext::vec& ls) override;
+
+  // -- encode/decode helpers --
+  void _encode_base(ceph::buffer::list& bl, uint64_t features);
+  void _decode_base(ceph::buffer::list::const_iterator& p);
+  void _encode_locks_full(ceph::buffer::list& bl);
+  void _decode_locks_full(ceph::buffer::list::const_iterator& p);
+  void _encode_locks_state_for_replica(ceph::buffer::list& bl, bool need_recover);
+  void _encode_locks_state_for_rejoin(ceph::buffer::list& bl, int rep);
+  void _decode_locks_state_for_replica(ceph::buffer::list::const_iterator& p, bool is_new);
+  void _decode_locks_rejoin(ceph::buffer::list::const_iterator& p, MDSContext::vec& waiters,
+			    std::list<SimpleLock*>& eval_locks, bool survivor);
+
+  // -- import/export --
+  void encode_export(ceph::buffer::list& bl);
+  void finish_export();
+  void abort_export() {  // drops PIN_TEMPEXPORTING and PIN_EXPORTINGCAPS and clears STATE_EXPORTINGCAPS
+    put(PIN_TEMPEXPORTING);
+    ceph_assert(state_test(STATE_EXPORTINGCAPS));  // caller must have set the exporting-caps state first
+    state_clear(STATE_EXPORTINGCAPS);
+    put(PIN_EXPORTINGCAPS);
+  }
+  void decode_import(ceph::buffer::list::const_iterator& p, LogSegment *ls);
+  
+  // for giving to clients
+  int encode_inodestat(ceph::buffer::list& bl, Session *session, SnapRealm *realm,
+		       snapid_t snapid=CEPH_NOSNAP, unsigned max_bytes=0,
+		       int getattr_wants=0);
+  void encode_cap_message(const ceph::ref_t<MClientCaps> &m, Capability *cap);
+
+  SimpleLock* get_lock(int type) override;
+
+  void set_object_info(MDSCacheObjectInfo &info) override;
+
+  void encode_lock_state(int type, ceph::buffer::list& bl) override;
+  void decode_lock_state(int type, const ceph::buffer::list& bl) override;
+  void encode_lock_iauth(ceph::buffer::list& bl);
+  void decode_lock_iauth(ceph::buffer::list::const_iterator& p);
+  void encode_lock_ilink(ceph::buffer::list& bl);
+  void decode_lock_ilink(ceph::buffer::list::const_iterator& p);
+  void encode_lock_idft(ceph::buffer::list& bl);
+  void decode_lock_idft(ceph::buffer::list::const_iterator& p);
+  void encode_lock_ifile(ceph::buffer::list& bl);
+  void decode_lock_ifile(ceph::buffer::list::const_iterator& p);
+  void encode_lock_inest(ceph::buffer::list& bl);
+  void decode_lock_inest(ceph::buffer::list::const_iterator& p);
+  void encode_lock_ixattr(ceph::buffer::list& bl);
+  void decode_lock_ixattr(ceph::buffer::list::const_iterator& p);
+  void encode_lock_isnap(ceph::buffer::list& bl);
+  void decode_lock_isnap(ceph::buffer::list::const_iterator& p);
+  void encode_lock_iflock(ceph::buffer::list& bl);
+  void decode_lock_iflock(ceph::buffer::list::const_iterator& p);
+  void encode_lock_ipolicy(ceph::buffer::list& bl);
+  void decode_lock_ipolicy(ceph::buffer::list::const_iterator& p);
+
+  void _finish_frag_update(CDir *dir, MutationRef& mut);
+
+  void clear_dirty_scattered(int type) override;
+  bool is_dirty_scattered();
+  void clear_scatter_dirty();  // on rejoin ack
+
+  void start_scatter(ScatterLock *lock);
+  void finish_scatter_update(ScatterLock *lock, CDir *dir,
+			     version_t inode_version, version_t dir_accounted_version);
+  void finish_scatter_gather_update(int type, MutationRef& mut);
+  void finish_scatter_gather_update_accounted(int type, EMetaBlob *metablob);
+
+  // -- snap --
+  void open_snaprealm(bool no_split=false);
+  void close_snaprealm(bool no_join=false);
+  SnapRealm *find_snaprealm() const;
+  void encode_snap(ceph::buffer::list& bl);
+  void decode_snap(ceph::buffer::list::const_iterator& p);
+
+  client_t get_loner() const { return loner_cap; }
+  client_t get_wanted_loner() const { return want_loner_cap; }
+
+  // this is the loner state our locks should aim for
+  client_t get_target_loner() const {
+    // the current loner counts only when it is also the wanted one; -1 otherwise
+    if (loner_cap == want_loner_cap)
+      return loner_cap;
+    return -1;
+  }
+
+  client_t calc_ideal_loner();
+  void set_loner_cap(client_t l);
+  bool choose_ideal_loner();
+  bool try_set_loner();
+  bool try_drop_loner();
+
+  // choose new lock state during recovery, based on issued caps
+  void choose_lock_state(SimpleLock *lock, int allissued);
+  void choose_lock_states(int dirty_caps);
+
+  int count_nonstale_caps();
+  bool multiple_nonstale_caps();
+
+  bool is_any_caps() { return !client_caps.empty(); }
+  bool is_any_nonstale_caps() { return count_nonstale_caps(); }
+
+  const mempool::mds_co::compact_map<int32_t,int32_t>& get_mds_caps_wanted() const { return mds_caps_wanted; }
+  void set_mds_caps_wanted(mempool::mds_co::compact_map<int32_t,int32_t>& m);
+  void set_mds_caps_wanted(mds_rank_t mds, int32_t wanted);
+
+  const mempool_cap_map& get_client_caps() const { return client_caps; }
+  Capability *get_client_cap(client_t client) {
+    // Fetch the cap record kept for this client in client_caps;
+    // a client without an entry yields nullptr.
+    const auto it = client_caps.find(client);
+    return it != client_caps.end() ? &it->second : nullptr;
+  }
+  int get_client_cap_pending(client_t client) const {
+    // Pending bits of the client's cap record; a client without a record
+    // in client_caps reports 0.
+    const auto it = client_caps.find(client);
+    if (it == client_caps.end())
+      return 0;
+    return it->second.pending();
+  }
+
+  int get_num_caps_notable() const { return num_caps_notable; }
+  void adjust_num_caps_notable(int d);
+
+  Capability *add_client_cap(client_t client, Session *session,
+			     SnapRealm *conrealm=nullptr, bool new_inode=false);
+  void remove_client_cap(client_t client);
+  void move_to_realm(SnapRealm *realm);
+
+  Capability *reconnect_cap(client_t client, const cap_reconnect_t& icr, Session *session);
+  void clear_client_caps_after_export();
+  void export_client_caps(std::map<client_t,Capability::Export>& cl);
+
+  // caps allowed
+  int get_caps_liked() const;
+  int get_caps_allowed_ever() const;
+  int get_caps_allowed_by_type(int type) const;
+  int get_caps_careful() const;
+  int get_xlocker_mask(client_t client) const;
+  int get_caps_allowed_for_client(Session *s, Capability *cap,
+				  const mempool_inode *file_i) const;
+
+  // caps issued, wanted
+  int get_caps_issued(int *ploner = 0, int *pother = 0, int *pxlocker = 0,
+		      int shift = 0, int mask = -1);
+  bool is_any_caps_wanted() const;
+  int get_caps_wanted(int *ploner = 0, int *pother = 0, int shift = 0, int mask = -1) const;
+  bool issued_caps_need_gather(SimpleLock *lock);
+
+  // client writeable
+  bool is_clientwriteable() const { return state & STATE_CLIENTWRITEABLE; }
+  void mark_clientwriteable();
+  void clear_clientwriteable();
+
+  // -- authority --
+  mds_authority_t authority() const override;
+
+  // -- auth pins --
+  bool can_auth_pin(int *err_ret=nullptr) const override;
+  void auth_pin(void *by) override;
+  void auth_unpin(void *by) override;
+
+  // -- freeze --
+  bool is_freezing_inode() const { return state_test(STATE_FREEZING); }
+  bool is_frozen_inode() const { return state_test(STATE_FROZEN); }
+  bool is_frozen_auth_pin() const { return state_test(STATE_FROZENAUTHPIN); }
+  bool is_frozen() const override;
+  bool is_frozen_dir() const;
+  bool is_freezing() const override;
+
+  /* Freeze the inode. auth_pin_allowance lets the caller account for any
+   * auth_pins it is itself holding/responsible for. */
+  bool freeze_inode(int auth_pin_allowance=0);
+  void unfreeze_inode(MDSContext::vec& finished);
+  void unfreeze_inode();
+
+  void freeze_auth_pin();
+  void unfreeze_auth_pin();
+
+  // -- reference counting --
+  void bad_put(int by) override {
+    generic_dout(0) << " bad put " << *this << " by " << by << " " << pin_name(by) << " was " << ref
+#ifdef MDS_REF_SET
+		    << " (" << ref_map << ")"
+#endif
+		    << dendl;
+#ifdef MDS_REF_SET
+    ceph_assert(ref_map[by] > 0);
+#endif
+    ceph_assert(ref > 0);
+  }
+  void bad_get(int by) override {
+    generic_dout(0) << " bad get " << *this << " by " << by << " " << pin_name(by) << " was " << ref
+#ifdef MDS_REF_SET
+		    << " (" << ref_map << ")"
+#endif
+		    << dendl;
+#ifdef MDS_REF_SET
+    ceph_assert(ref_map[by] >= 0);
+#endif
+  }
+  void first_get() override;
+  void last_put() override;
+  void _put() override;
+
+  // -- hierarchy stuff --
+  void set_primary_parent(CDentry *p) {
+    // overwriting an existing primary link is only tolerated when the hack option is set
+    ceph_assert(!parent || g_conf().get_val<bool>("mds_hack_allow_loading_invalid_metadata"));
+    parent = p;
+  }
+  void remove_primary_parent(CDentry *dn) {
+    ceph_assert(parent == dn);  // only the current primary link may be removed
+    parent = nullptr;
+  }
+  void add_remote_parent(CDentry *p);
+  void remove_remote_parent(CDentry *p);
+  int num_remote_parents() {
+    return static_cast<int>(remote_parents.size());  // entries in the remote_parents set
+  }
+
+  void push_projected_parent(CDentry *dn) {
+    projected_parent.push_back(dn);
+  }
+  void pop_projected_parent() {
+    ceph_assert(!projected_parent.empty());
+    parent = projected_parent.front();  // oldest projection (pushed at the back) becomes primary
+    projected_parent.pop_front();
+  }
+  bool is_parent_projected() const {
+    return !projected_parent.empty();
+  }
+
+  mds_rank_t get_export_pin(bool inherit=true) const;
+  void check_pin_policy(mds_rank_t target);
+  void set_export_pin(mds_rank_t rank);
+  void queue_export_pin(mds_rank_t target);
+  void maybe_export_pin(bool update=false);
+
+  void set_ephemeral_pin(bool dist, bool rand);
+  void clear_ephemeral_pin(bool dist, bool rand);
+
+  void setxattr_ephemeral_dist(bool val=false);
+  bool is_ephemeral_dist() const {
+    return state_test(STATE_DISTEPHEMERALPIN);
+  }
+
+  double get_ephemeral_rand() const;
+  void maybe_ephemeral_rand(double threshold=-1.0);
+  void setxattr_ephemeral_rand(double prob=0.0);
+  bool is_ephemeral_rand() const {
+    return state_test(STATE_RANDEPHEMERALPIN);
+  }
+
+  bool has_ephemeral_policy() const {
+    return get_inode()->export_ephemeral_random_pin > 0.0 ||
+           get_inode()->export_ephemeral_distributed_pin;
+  }
+  bool is_ephemerally_pinned() const {
+    return state_test(STATE_DISTEPHEMERALPIN) ||
+           state_test(STATE_RANDEPHEMERALPIN);
+  }
+
+  void print(std::ostream& out) override;
+  void dump(ceph::Formatter *f, int flags = DUMP_DEFAULT) const;
+
+  /**
+   * Validate that the on-disk state of an inode matches what
+   * we expect from our memory state. Currently this checks that:
+   * 1) The backtrace associated with the file data exists and is correct
+   * 2) For directories, the actual inode metadata matches our memory state,
+   * 3) For directories, the rstats match
+   *
+   * @param results A freshly-created validated_data struct, with values set
+   * as described in the struct documentation.
+   * @param fin Context to call back on completion (or NULL). There is no
+   * request (mdr) parameter in the declaration; only results and fin are
+   * passed in.
+   */
+  void validate_disk_state(validated_data *results,
+                           MDSContext *fin);
+  static void dump_validation_results(const validated_data& results,
+                                      ceph::Formatter *f);
+
+  //bool hack_accessed = false;
+  //utime_t hack_load_stamp;
+
+  MDCache *mdcache;
+
+  SnapRealm        *snaprealm = nullptr;
+  SnapRealm        *containing_realm = nullptr;
+  snapid_t          first, last;
+  mempool::mds_co::compact_set<snapid_t> dirty_old_rstats;
+
+  uint64_t last_journaled = 0;       // log offset for the last time i was journaled
+  //loff_t last_open_journaled;  // log offset for the last journaled EOpen
+  utime_t last_dirstat_prop;
+
+  // list item node for when we have unpropagated rstat data
+  elist<CInode*>::item dirty_rstat_item;
+
+  mempool::mds_co::set<client_t> client_snap_caps;
+  mempool::mds_co::compact_map<snapid_t, mempool::mds_co::set<client_t> > client_need_snapflush;
+
+  // LogSegment lists i (may) belong to
+  elist<CInode*>::item item_dirty;
+  elist<CInode*>::item item_caps;
+  elist<CInode*>::item item_open_file;
+  elist<CInode*>::item item_dirty_parent;
+  elist<CInode*>::item item_dirty_dirfrag_dir;
+  elist<CInode*>::item item_dirty_dirfrag_nest;
+  elist<CInode*>::item item_dirty_dirfrag_dirfragtree;
+
+  // also update RecoveryQueue::RecoveryQueue() if you change this
+  elist<CInode*>::item& item_recover_queue = item_dirty_dirfrag_dir;
+  elist<CInode*>::item& item_recover_queue_front = item_dirty_dirfrag_nest;
+
+  inode_load_vec_t pop;
+  elist<CInode*>::item item_pop_lru;
+
+  // -- locks --
+  static LockType versionlock_type;
+  static LockType authlock_type;
+  static LockType linklock_type;
+  static LockType dirfragtreelock_type;
+  static LockType filelock_type;
+  static LockType xattrlock_type;
+  static LockType snaplock_type;
+  static LockType nestlock_type;
+  static LockType flocklock_type;
+  static LockType policylock_type;
+
+  // FIXME not part of mempool
+  LocalLockC  versionlock;
+  SimpleLock authlock;
+  SimpleLock linklock;
+  ScatterLock dirfragtreelock;
+  ScatterLock filelock;
+  SimpleLock xattrlock;
+  SimpleLock snaplock;
+  ScatterLock nestlock;
+  SimpleLock flocklock;
+  SimpleLock policylock;
+
+  // -- caps -- (new)
+  // client caps
+  client_t loner_cap = -1, want_loner_cap = -1;
+
+protected:
+  ceph_lock_state_t *get_fcntl_lock_state() {
+    if (fcntl_locks == nullptr)  // created on first use
+      fcntl_locks = new ceph_lock_state_t(g_ceph_context, CEPH_LOCK_FCNTL);
+    return fcntl_locks;
+  }
+  void clear_fcntl_lock_state() {
+    delete fcntl_locks;  // deleting a null pointer is a no-op
+    fcntl_locks = nullptr;
+  }
+  ceph_lock_state_t *get_flock_lock_state() {
+    if (flock_locks == nullptr)  // created on first use
+      flock_locks = new ceph_lock_state_t(g_ceph_context, CEPH_LOCK_FLOCK);
+    return flock_locks;
+  }
+  void clear_flock_lock_state() {
+    delete flock_locks;  // deleting a null pointer is a no-op
+    flock_locks = nullptr;
+  }
+  void clear_file_locks() {
+    clear_fcntl_lock_state();
+    clear_flock_lock_state();
+  }
+  void _encode_file_locks(ceph::buffer::list& bl) const {
+    using ceph::encode;
+    // Each lock table goes out as a presence flag, followed by the table
+    // itself only when it exists and is non-empty; fcntl first, then flock.
+    for (ceph_lock_state_t *locks : {fcntl_locks, flock_locks}) {
+      const bool present = locks && !locks->empty();
+      encode(present, bl);
+      if (present)
+	encode(*locks, bl);
+    }
+  }
+  void _decode_file_locks(ceph::buffer::list::const_iterator& p) {
+    using ceph::decode;
+    // a presence flag precedes each table; a cleared flag drops the table held
+    bool present;
+    decode(present, p);
+    if (!present)
+      clear_fcntl_lock_state();
+    else
+      decode(*get_fcntl_lock_state(), p);
+    decode(present, p);
+    if (!present)
+      clear_flock_lock_state();
+    else
+      decode(*get_flock_lock_state(), p);
+  }
+
+  /**
+   * Return the pool ID where we currently write backtraces for
+   * this inode (in addition to inode.old_pools)
+   *
+   * @returns a pool ID >=0
+   */
+  int64_t get_backtrace_pool() const;
+
+  // parent dentries in cache
+  CDentry         *parent = nullptr;             // primary link
+  mempool::mds_co::compact_set<CDentry*>    remote_parents;     // if hard linked
+
+  mempool::mds_co::list<CDentry*>   projected_parent;   // for in-progress rename, (un)link, etc.
+
+  mds_authority_t inode_auth = CDIR_AUTH_DEFAULT;
+
+  // -- distributed state --
+  // file capabilities
+  mempool_cap_map client_caps; // client -> caps
+  mempool::mds_co::compact_map<int32_t, int32_t> mds_caps_wanted;     // [auth] mds -> caps wanted
+  int replica_caps_wanted = 0; // [replica] what i've requested from auth
+  int num_caps_notable = 0;
+
+  ceph_lock_state_t *fcntl_locks = nullptr;
+  ceph_lock_state_t *flock_locks = nullptr;
+
+  // -- waiting --
+  mempool::mds_co::compact_map<frag_t, MDSContext::vec > waiting_on_dir;
+
+
+  // -- freezing inode --
+  int auth_pin_freeze_allowance = 0;
+  elist<CInode*>::item item_freezing_inode;
+  void maybe_finish_freeze_inode();
+private:
+
+  friend class ValidationContinuation;
+
+  /**
+   * Create a scrub_info_t struct for the scrub_infop pointer.
+   */
+  void scrub_info_create() const;
+  /**
+   * Delete the scrub_info_t struct if it's not got any useful data
+   */
+  void scrub_maybe_delete_info();
+
+  void pop_projected_snaprealm(sr_t *next_snaprealm, bool early);
+
+  bool _validate_disk_state(class ValidationContinuation *c,
+                            int rval, int stage);
+
+  struct projected_const_node {
+    inode_const_ptr inode;
+    xattr_map_const_ptr xattrs;
+    sr_t *snapnode;
+
+    projected_const_node() = delete;
+    projected_const_node(projected_const_node&&) = default;
+    explicit projected_const_node(const inode_const_ptr& i, const xattr_map_const_ptr& x, sr_t *s) :
+      inode(i), xattrs(x), snapnode(s) {}
+  };
+
+  mempool::mds_co::list<projected_const_node> projected_nodes;   // projected values (only defined while dirty)
+  size_t num_projected_srnodes = 0;
+
+  // -- cache infrastructure --
+  mempool::mds_co::compact_map<frag_t,CDir*> dirfrags; // cached dir fragments under this Inode
+
+  //for the purpose of quickly determining whether there's a subtree root or exporting dir
+  int num_subtree_roots = 0;
+  int num_exporting_dirs = 0;
+
+  int stickydir_ref = 0;
+  std::unique_ptr<scrub_info_t> scrub_infop;
+  /** @} Scrubbing and fsck */
+};
+
+std::ostream& operator<<(std::ostream& out, const CInode& in);
+
+extern cinode_lock_info_t cinode_lock_info[];
+extern int num_cinode_locks;
+#undef dout_context
+#endif
diff --git a/src/mds/CMakeLists.txt b/src/mds/CMakeLists.txt
new file mode 100644
index 000000000..a12898f38
--- /dev/null
+++ b/src/mds/CMakeLists.txt
@@ -0,0 +1,53 @@
+set(mds_srcs
+  BatchOp.cc
+  Capability.cc
+  MDSDaemon.cc
+  MDSRank.cc
+  Beacon.cc
+  flock.cc
+  locks.c
+  journal.cc
+  Server.cc
+  Mutation.cc
+  MDCache.cc
+  RecoveryQueue.cc
+  StrayManager.cc
+  PurgeQueue.cc
+  Locker.cc
+  Migrator.cc
+  MDBalancer.cc
+  CDentry.cc
+  CDir.cc
+  CInode.cc
+  LogEvent.cc
+  MDSTable.cc
+  InoTable.cc
+  JournalPointer.cc
+  MDSTableClient.cc
+  MDSTableServer.cc
+  ScrubStack.cc
+  DamageTable.cc
+  SimpleLock.cc
+  SnapRealm.cc
+  SnapServer.cc
+  SnapClient.cc
+  snap.cc
+  SessionMap.cc
+  MDSContext.cc
+  MDSAuthCaps.cc
+  MDLog.cc
+  MDSCacheObject.cc
+  Mantle.cc
+  Anchor.cc
+  OpenFileTable.cc
+  MDSPinger.cc
+  MetricAggregator.cc
+  MetricsHandler.cc
+  ${CMAKE_SOURCE_DIR}/src/common/TrackedOp.cc
+  ${CMAKE_SOURCE_DIR}/src/common/MemoryModel.cc
+  ${CMAKE_SOURCE_DIR}/src/osdc/Journaler.cc
+  ${CMAKE_SOURCE_DIR}/src/mgr/MDSPerfMetricTypes.cc)
+add_library(mds STATIC ${mds_srcs})
+target_link_libraries(mds PRIVATE
+  heap_profiler cpu_profiler osdc ${LUA_LIBRARIES})
+target_include_directories(mds PRIVATE "${LUA_INCLUDE_DIR}")
diff --git a/src/mds/Capability.cc b/src/mds/Capability.cc
new file mode 100644
index 000000000..b6258e466
--- /dev/null
+++ b/src/mds/Capability.cc
@@ -0,0 +1,295 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software 
+ * Foundation.  See file COPYING.
+ * 
+ */
+
+#include "Capability.h"
+#include "CInode.h"
+#include "SessionMap.h"
+
+#include "common/Formatter.h"
+
+
+/*
+ * Capability::Export
+ */
+
+void Capability::Export::encode(ceph::buffer::list &bl) const
+{
+  ENCODE_START(3, 2, bl);
+  encode(cap_id, bl);
+  encode(wanted, bl);
+  encode(issued, bl);
+  encode(pending, bl);
+  encode(client_follows, bl);
+  encode(seq, bl);
+  encode(mseq, bl);
+  encode(last_issue_stamp, bl);
+  encode(state, bl);
+  ENCODE_FINISH(bl);
+}
+
+void Capability::Export::decode(ceph::buffer::list::const_iterator &p)
+{
+  DECODE_START_LEGACY_COMPAT_LEN(3, 2, 2, p);
+  decode(cap_id, p);
+  decode(wanted, p);
+  decode(issued, p);
+  decode(pending, p);
+  decode(client_follows, p);
+  decode(seq, p);
+  decode(mseq, p);
+  decode(last_issue_stamp, p);
+  if (struct_v >= 3)
+    decode(state, p);
+  DECODE_FINISH(p);
+}
+
+void Capability::Export::dump(ceph::Formatter *f) const
+{
+  f->dump_unsigned("cap_id", cap_id);
+  f->dump_stream("wanted") << ccap_string(wanted);
+  f->dump_stream("issued") << ccap_string(issued);
+  f->dump_stream("pending") << ccap_string(pending);
+  f->dump_unsigned("client_follows", client_follows);
+  f->dump_unsigned("seq", seq);
+  f->dump_unsigned("migrate_seq", mseq);
+  f->dump_stream("last_issue_stamp") << last_issue_stamp;
+}
+
+void Capability::Export::generate_test_instances(std::list<Capability::Export*>& ls)
+{
+  // First instance: every field left at its default.
+  ls.push_back(new Export);
+
+  // Second instance: distinct values in wanted, issued, pending,
+  // client_follows, mseq and last_issue_stamp; cap_id, seq and state stay 0.
+  ls.push_back(new Export(/*id*/ 0, /*wanted*/ 1, /*issued*/ 2, /*pending*/ 3,
+			  /*client_follows*/ 4, /*seq*/ 0, /*mseq*/ 5,
+			  utime_t(6, 7), /*state*/ 0));
+}
+
+void Capability::Import::encode(ceph::buffer::list &bl) const
+{
+  ENCODE_START(1, 1, bl);
+  encode(cap_id, bl);
+  encode(issue_seq, bl);
+  encode(mseq, bl);
+  ENCODE_FINISH(bl);
+}
+
+void Capability::Import::decode(ceph::buffer::list::const_iterator &bl)
+{
+  DECODE_START(1, bl);
+  decode(cap_id, bl);
+  decode(issue_seq, bl);
+  decode(mseq, bl);
+  DECODE_FINISH(bl);
+}
+
+void Capability::Import::dump(ceph::Formatter *f) const
+{
+  f->dump_unsigned("cap_id", cap_id);
+  f->dump_unsigned("issue_seq", issue_seq);
+  f->dump_unsigned("migrate_seq", mseq);
+}
+
+/*
+ * Capability::revoke_info
+ */
+
+void Capability::revoke_info::encode(ceph::buffer::list& bl) const
+{
+  ENCODE_START(2, 2, bl);  // ';' added to match the other encoders in this file
+  encode(before, bl);      // fields are written in the order decode() reads them
+  encode(seq, bl);
+  encode(last_issue, bl);
+  ENCODE_FINISH(bl);
+}
+
+void Capability::revoke_info::decode(ceph::buffer::list::const_iterator& bl)
+{
+  DECODE_START_LEGACY_COMPAT_LEN(2, 2, 2, bl);
+  decode(before, bl);
+  decode(seq, bl);
+  decode(last_issue, bl);
+  DECODE_FINISH(bl);
+}
+
+void Capability::revoke_info::dump(ceph::Formatter *f) const
+{
+  f->dump_unsigned("before", before);
+  f->dump_unsigned("seq", seq);
+  f->dump_unsigned("last_issue", last_issue);
+}
+
+void Capability::revoke_info::generate_test_instances(std::list<Capability::revoke_info*>& ls)
+{
+  // one default-constructed instance ...
+  ls.push_back(new revoke_info);
+
+  // ... and one with before=1, seq=2, last_issue=3
+  ls.push_back(new revoke_info(1, 2, 3));
+}
+
+
+/*
+ * Capability
+ */
+Capability::Capability(CInode *i, Session *s, uint64_t id) :
+  item_session_caps(this), item_snaprealm_caps(this),
+  item_revoking_caps(this), item_client_revoking_caps(this),
+  lock_caches(member_offset(MDLockCache, item_cap_lock_cache)),
+  inode(i), session(s), cap_id(id)
+{
+  if (!session) {
+    cap_gen = 0;  // session-less cap: nothing else to set up
+    return;
+  }
+  session->touch_cap_bottom(this);
+  cap_gen = session->get_cap_gen();
+  if (session->is_stale())
+    --cap_gen; // differs from the session's generation, so is_valid() is false
+  // Note in state every feature the client connection does not advertise.
+  auto& conn = session->get_connection();
+  if (!conn)
+    return;
+  if (!conn->has_feature(CEPH_FEATURE_MDS_INLINE_DATA))
+    state |= STATE_NOINLINE;
+  if (!conn->has_feature(CEPH_FEATURE_FS_FILE_LAYOUT_V2))
+    state |= STATE_NOPOOLNS;
+  if (!conn->has_feature(CEPH_FEATURE_MDS_QUOTA))
+    state |= STATE_NOQUOTA;
+}
+
+client_t Capability::get_client() const {
+  if (!session) return client_t(-1);  // no session to ask
+  return session->get_client();
+}
+
+bool Capability::is_stale() const {
+  // a cap without a session is never reported stale
+  return session && session->is_stale();
+}
+
+bool Capability::is_valid() const {
+  if (!session) return true;  // no session generation to compare against
+  return cap_gen == session->get_cap_gen();
+}
+
+void Capability::revalidate() {
+  if (is_valid())
+    return;  // also taken for a session-less cap, so session is set below
+  cap_gen = session->get_cap_gen();
+}
+
+void Capability::mark_notable() {
+  state |= STATE_NOTABLE;
+  if (session)  // may be null (constructor default); then only the flag is set
+    session->touch_cap(this);
+}
+
+void Capability::maybe_clear_notable() {
+  // Stay notable while issued and pending differ, the cap is client
+  // writeable, or the wanted bits are themselves notable.
+  if (_issued != _pending || is_clientwriteable() || is_wanted_notable(_wanted))
+    return;
+  ceph_assert(is_notable());
+  state &= ~STATE_NOTABLE;
+  if (session)  // may be null (constructor default); no session to update then
+    session->touch_cap_bottom(this);
+}
+
+void Capability::set_wanted(int w) {
+  // Store w before reacting: maybe_clear_notable() re-reads _wanted and could
+  // never drop the flag while it still saw the old, notable value.
+  const bool was_notable = is_wanted_notable(_wanted);
+  _wanted = w;
+  CInode *in = get_inode();
+  if (!in || was_notable == is_wanted_notable(w))
+    return;  // no inode to account on, or notability unchanged
+  in->adjust_num_caps_notable(was_notable ? -1 : 1);
+  if (was_notable)
+    maybe_clear_notable();
+  else if (!is_notable())
+    mark_notable();
+}
+
+void Capability::encode(ceph::buffer::list& bl) const
+{
+  ENCODE_START(2, 2, bl);  // ';' added to match the other encoders in this file
+  encode(last_sent, bl);
+  encode(last_issue_stamp, bl);
+  // _issued is not written; decode() rebuilds it through calc_issued()
+  encode(_wanted, bl);
+  encode(_pending, bl);
+  encode(_revokes, bl);
+  ENCODE_FINISH(bl);
+}
+
+void Capability::decode(ceph::buffer::list::const_iterator &bl)
+{
+  DECODE_START_LEGACY_COMPAT_LEN(2, 2, 2, bl);  // ';' added, as in the other decoders here
+  decode(last_sent, bl);
+  decode(last_issue_stamp, bl);
+  // wanted is applied through set_wanted() rather than written to _wanted directly
+  __u32 tmp_wanted;
+  decode(tmp_wanted, bl);
+  set_wanted(tmp_wanted);
+  decode(_pending, bl);
+  decode(_revokes, bl);
+  DECODE_FINISH(bl);
+  // _issued is not part of the encoding; derive it from _pending and _revokes
+  calc_issued();
+}
+
+void Capability::dump(ceph::Formatter *f) const
+{
+  if (inode)
+    f->dump_stream("ino") << inode->ino();
+  f->dump_unsigned("last_sent", last_sent);
+  f->dump_stream("last_issue_stamp") << last_issue_stamp;
+  f->dump_stream("wanted") << ccap_string(_wanted);
+  f->dump_stream("pending") << ccap_string(_pending);
+
+  f->open_array_section("revokes");
+  for (const auto &r : _revokes) {
+    f->open_object_section("revoke");
+    r.dump(f);
+    f->close_section();
+  }
+  f->close_section();
+}
+
+void Capability::generate_test_instances(std::list<Capability*>& ls)
+{
+  // Instance 1: default-constructed, i.e. no inode, no session and all
+  // counters and cap masks at zero.
+  ls.push_back(new Capability);
+
+  // Instance 2: distinct values in the encoded fields plus a revoke
+  // history of two records.
+  Capability *cap = new Capability;
+  cap->last_sent = 11;
+  cap->last_issue_stamp = utime_t(12, 13);
+  // no inode is attached, so set_wanted() only stores the value
+  cap->set_wanted(14);
+  cap->_pending = 15;
+
+  // revoke_info(before, seq, last_issue)
+  cap->_revokes.emplace_back(16, 17, 18);
+  cap->_revokes.emplace_back(19, 20, 21);
+
+  ls.push_back(cap);
+}
+
+MEMPOOL_DEFINE_OBJECT_FACTORY(Capability, co_cap, mds_co);
diff --git a/src/mds/Capability.h b/src/mds/Capability.h
new file mode 100644
index 000000000..f7119f002
--- /dev/null
+++ b/src/mds/Capability.h
@@ -0,0 +1,423 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software 
+ * Foundation.  See file COPYING.
+ * 
+ */
+
+
+#ifndef CEPH_CAPABILITY_H
+#define CEPH_CAPABILITY_H
+
+#include "include/buffer_fwd.h"
+#include "include/counter.h"
+#include "include/mempool.h"
+#include "include/xlist.h"
+#include "include/elist.h"
+
+#include "common/config.h"
+
+#include "mdstypes.h"
+
+
+/*
+
+  Capability protocol notes.
+
+- two types of cap events from mds -> client:
+  - cap "issue" in a MClientReply, or an MClientCaps IMPORT op.
+  - cap "update" (revocation or grant) .. an MClientCaps message.
+- if client has cap, the mds should have it too.
+
+- if client has no dirty data, it can release it without waiting for an mds ack.
+  - client may thus get a cap _update_ and not have the cap.  ignore it.
+
+- mds should track seq of last issue.  any release
+  attempt will only succeed if the client has seen the latest.
+
+- a UPDATE updates the clients issued caps, wanted, etc.  it may also flush dirty metadata.
+  - 'caps' are which caps the client retains.
+    - if 0, client wishes to release the cap
+  - 'wanted' is which caps the client wants.
+  - 'dirty' is which metadata is to be written.
+    - client gets a FLUSH_ACK with matching dirty flags indicating which caps were written.
+
+- a FLUSH_ACK acks a FLUSH.
+  - 'dirty' is the _original_ FLUSH's dirty (i.e., which metadata was written back)
+  - 'seq' is the _original_ FLUSH's seq.
+  - 'caps' is the _original_ FLUSH's caps (not actually important)
+  - client can conclude that (dirty & ~caps) bits were successfully cleaned.
+
+- a FLUSHSNAP flushes snapshot metadata.
+  - 'dirty' indicates which caps, were dirty, if any.
+  - mds writes metadata.  if dirty!=0, replies with FLUSHSNAP_ACK.
+
+ */
+
+class CInode;
+class Session;
+class MDLockCache;
+
+namespace ceph {
+  class Formatter;
+}
+
+class Capability : public Counter<Capability> {
+public:
+  MEMPOOL_CLASS_HELPERS();
+
+  struct Export {
+    Export() {}
+    Export(int64_t id, int w, int i, int p, snapid_t cf,
+	   ceph_seq_t s, ceph_seq_t m, utime_t lis, unsigned st) :
+      cap_id(id), wanted(w), issued(i), pending(p), client_follows(cf),
+      seq(s), mseq(m), last_issue_stamp(lis), state(st) {}
+    void encode(ceph::buffer::list &bl) const;
+    void decode(ceph::buffer::list::const_iterator &p);
+    void dump(ceph::Formatter *f) const;
+    static void generate_test_instances(std::list<Export*>& ls);
+
+    int64_t cap_id = 0;
+    int32_t wanted = 0;
+    int32_t issued = 0;
+    int32_t pending = 0;
+    snapid_t client_follows;
+    ceph_seq_t seq = 0;
+    ceph_seq_t mseq = 0;
+    utime_t last_issue_stamp;
+    uint32_t state = 0;
+  };
+  struct Import {
+    Import() {}
+    Import(int64_t i, ceph_seq_t s, ceph_seq_t m) : cap_id(i), issue_seq(s), mseq(m) {}
+    void encode(ceph::buffer::list &bl) const;
+    void decode(ceph::buffer::list::const_iterator &p);
+    void dump(ceph::Formatter *f) const;
+
+    int64_t cap_id = 0;
+    ceph_seq_t issue_seq = 0;
+    ceph_seq_t mseq = 0;
+  };
+  struct revoke_info {
+    revoke_info() {}
+    revoke_info(__u32 b, ceph_seq_t s, ceph_seq_t li) : before(b), seq(s), last_issue(li) {}
+    void encode(ceph::buffer::list& bl) const;
+    void decode(ceph::buffer::list::const_iterator& bl);
+    void dump(ceph::Formatter *f) const;
+    static void generate_test_instances(std::list<revoke_info*>& ls);
+
+    __u32 before = 0;
+    ceph_seq_t seq = 0;
+    ceph_seq_t last_issue = 0;
+  };
+
+  const static unsigned STATE_NOTABLE		= (1<<0);
+  const static unsigned STATE_NEW		= (1<<1);
+  const static unsigned STATE_IMPORTING		= (1<<2);
+  const static unsigned STATE_NEEDSNAPFLUSH	= (1<<3);
+  const static unsigned STATE_CLIENTWRITEABLE	= (1<<4);
+  const static unsigned STATE_NOINLINE		= (1<<5);
+  const static unsigned STATE_NOPOOLNS		= (1<<6);
+  const static unsigned STATE_NOQUOTA		= (1<<7);
+
+  const static unsigned MASK_STATE_EXPORTED =
+    (STATE_CLIENTWRITEABLE | STATE_NOINLINE | STATE_NOPOOLNS | STATE_NOQUOTA);
+
+  Capability(CInode *i=nullptr, Session *s=nullptr, uint64_t id=0);
+  Capability(const Capability& other) = delete;
+
+  const Capability& operator=(const Capability& other) = delete;
+
+  int pending() const {
+    return _pending;
+  }
+  int issued() const {
+    return _issued;
+  }
+  int revoking() const {
+    return _issued & ~_pending;
+  }
+  ceph_seq_t issue(unsigned c, bool reval=false) {  // make c the pending set; returns the new last_sent
+    if (reval)
+      revalidate();
+
+    if (_pending & ~c) {
+      // c drops pending bits (and may add others): log the pre-revoke state in _revokes
+      _revokes.emplace_back(_pending, last_sent, last_issue);
+      _pending = c;
+      _issued |= c;
+      if (!is_notable())
+	mark_notable();
+    } else if (~_pending & c) {
+      // c only adds bits on top of _pending, nothing is revoked
+      _pending |= c;
+      _issued |= c;
+      // pop trailing _revokes whose 'before' bits are all pending again
+      while (!_revokes.empty() &&
+	     (_revokes.back().before & ~_pending) == 0)
+	_revokes.pop_back();
+    } else {
+      // c has neither extra nor missing bits, i.e. it equals _pending
+      ceph_assert(_pending == c);
+    }
+    // the sequence number advances on every call, including the no-change case
+    inc_last_seq();
+    return last_sent;
+  }
+  ceph_seq_t issue_norevoke(unsigned c, bool reval=false) {
+    if (reval)
+      revalidate();
+
+    _pending |= c;
+    _issued |= c;
+    clear_new();
+
+    inc_last_seq();
+    return last_sent;
+  }
+  int confirm_receipt(ceph_seq_t seq, unsigned caps) {  // note that message seq was received with caps held
+    int was_revoking = (_issued & ~_pending);  // bits issued but no longer pending on entry
+    if (seq == last_sent) {
+      _revokes.clear();
+      _issued = caps;
+      // caps can only shrink _pending here, never add bits to it
+      _pending &= caps;
+    } else {
+      // not the latest message: forget revocations recorded before seq
+      while (!_revokes.empty() && _revokes.front().seq < seq)
+	_revokes.pop_front();
+      if (!_revokes.empty()) {
+	if (_revokes.front().seq == seq)
+	  _revokes.begin()->before = caps;
+	calc_issued();
+      } else {
+	// seq < last_sent: count the still-pending bits as issued on top of caps
+	_issued = caps | _pending;
+      }
+    }
+
+    if (was_revoking && _issued == _pending) {  // nothing left to revoke: leave the revoking lists
+      item_revoking_caps.remove_myself();
+      item_client_revoking_caps.remove_myself();
+      maybe_clear_notable();
+    }
+    return was_revoking & ~_issued; // bits that were being revoked and are no longer issued
+  }
+  // we may get a release racing with revocations, which means our revokes will be ignored
+  // by the client.  clean them out of our _revokes history so we don't wait on them.
+  void clean_revoke_from(ceph_seq_t li) {
+    // nothing at the head of the history has last_issue <= li: leave all as is
+    if (_revokes.empty() || _revokes.front().last_issue > li)
+      return;
+    do {
+      _revokes.pop_front();
+    } while (!_revokes.empty() && _revokes.front().last_issue <= li);
+
+    const bool was_revoking = (_issued & ~_pending);
+    calc_issued();
+    if (was_revoking && _issued == _pending) {
+      item_revoking_caps.remove_myself();
+      item_client_revoking_caps.remove_myself();
+      maybe_clear_notable();
+    }
+  }
+  ceph_seq_t get_mseq() const { return mseq; }
+  void inc_mseq() { mseq++; }
+
+  utime_t get_last_issue_stamp() const { return last_issue_stamp; }
+  utime_t get_last_revoke_stamp() const { return last_revoke_stamp; }
+
+  void set_last_issue() { last_issue = last_sent; }
+  void set_last_issue_stamp(utime_t t) { last_issue_stamp = t; }
+  void set_last_revoke_stamp(utime_t t) { last_revoke_stamp = t; }
+  void reset_num_revoke_warnings() { num_revoke_warnings = 0; }
+  void inc_num_revoke_warnings() { ++num_revoke_warnings; }
+  unsigned get_num_revoke_warnings() const { return num_revoke_warnings; }
+
+  void set_cap_id(uint64_t i) { cap_id = i; }
+  uint64_t get_cap_id() const { return cap_id; }
+
+  //ceph_seq_t get_last_issue() { return last_issue; }
+
+  bool is_suppress() const { return suppress > 0; }
+  void inc_suppress() { suppress++; }
+  void dec_suppress() { suppress--; }
+
+  static bool is_wanted_notable(int wanted) {
+    return wanted & (CEPH_CAP_ANY_WR|CEPH_CAP_FILE_WR|CEPH_CAP_FILE_RD);
+  }
+  bool is_wanted_notable() const {
+    return is_wanted_notable(wanted());
+  }
+  bool is_notable() const { return state & STATE_NOTABLE; }
+
+  bool is_stale() const;
+  bool is_valid() const;
+  bool is_new() const { return state & STATE_NEW; }
+  void mark_new() { state |= STATE_NEW; }
+  void clear_new() { state &= ~STATE_NEW; }
+  bool is_importing() const { return state & STATE_IMPORTING; }
+  void mark_importing() { state |= STATE_IMPORTING; }
+  void clear_importing() { state &= ~STATE_IMPORTING; }
+  bool need_snapflush() const { return state & STATE_NEEDSNAPFLUSH; }
+  void mark_needsnapflush() { state |= STATE_NEEDSNAPFLUSH; }
+  void clear_needsnapflush() { state &= ~STATE_NEEDSNAPFLUSH; }
+
+  bool is_clientwriteable() const { return state & STATE_CLIENTWRITEABLE; }
+  void mark_clientwriteable() {
+    if (is_clientwriteable())
+      return;  // flag already set
+    state |= STATE_CLIENTWRITEABLE;
+    if (!is_notable())
+      mark_notable();
+  }
+  void clear_clientwriteable() {
+    if (!is_clientwriteable())
+      return;  // flag not set, nothing to clear
+    state &= ~STATE_CLIENTWRITEABLE;
+    maybe_clear_notable();
+  }
+
+  bool is_noinline() const { return state & STATE_NOINLINE; }
+  bool is_nopoolns() const { return state & STATE_NOPOOLNS; }
+  bool is_noquota() const { return state & STATE_NOQUOTA; }
+
+  CInode *get_inode() const { return inode; }
+  Session *get_session() const { return session; }
+  client_t get_client() const;
+
+  // caps this client wants to hold
+  int wanted() const { return _wanted; }
+  void set_wanted(int w);
+
+  void inc_last_seq() { last_sent++; }
+  ceph_seq_t get_last_seq() const {
+    return last_sent;
+  }
+  ceph_seq_t get_last_issue() const { return last_issue; }
+
+  void reset_seq() {
+    last_sent = 0;
+    last_issue = 0;
+  }
+  
+  // -- exports --
+  Export make_export() const {  // the export carries mseq+1, this cap's own mseq is untouched
+    return Export(cap_id, _wanted, _issued, _pending, client_follows, last_sent, mseq + 1, last_issue_stamp, state);
+  }
+  void merge(const Export& other, bool auth_cap) {
+    // issued + pending: OR-ing in other.issued is a no-op when newpending
+    // already covers it, so a single issue() call handles both cases
+    const int newpending = other.pending | pending();
+    issue(other.issued | newpending);
+    last_issue_stamp = other.last_issue_stamp;
+    client_follows = other.client_follows;
+
+    // take over the exported state bits; a client-writeable cap is notable
+    state |= other.state & MASK_STATE_EXPORTED;
+    const bool writeable = other.state & STATE_CLIENTWRITEABLE;
+    if (writeable && !is_notable())
+      mark_notable();
+
+    // wanted
+    set_wanted(wanted() | other.wanted);
+    // mseq is only adopted from the export when auth_cap is set
+    if (auth_cap)
+      mseq = other.mseq;
+  }
+  void merge(int otherwanted, int otherissued) {
+    // issued + pending: bits of otherissued that are pending already add
+    // nothing to the union, so one unconditional issue() covers both the
+    // "new bits" and the "nothing new" case
+    const int newpending = pending();
+    issue(otherissued | newpending);
+
+    // wanted: keep everything either side asked for
+    // (set_wanted() also updates the notable accounting)
+    set_wanted(wanted() | otherwanted);
+  }
+
+  int revoke() {
+    // With a revocation outstanding, act as if the latest message had been
+    // received with just the pending bits held; otherwise there is nothing to do.
+    return revoking() ? confirm_receipt(last_sent, pending()) : 0;
+  }
+
+  // serializers
+  void encode(ceph::buffer::list &bl) const;
+  void decode(ceph::buffer::list::const_iterator &bl);
+  void dump(ceph::Formatter *f) const;
+  static void generate_test_instances(std::list<Capability*>& ls);
+
+  snapid_t client_follows = 0;
+  version_t client_xattr_version = 0;
+  version_t client_inline_version = 0;
+  int64_t last_rbytes = 0;
+  int64_t last_rsize = 0;
+
+  xlist<Capability*>::item item_session_caps;
+  xlist<Capability*>::item item_snaprealm_caps;
+  xlist<Capability*>::item item_revoking_caps;
+  xlist<Capability*>::item item_client_revoking_caps;
+
+  elist<MDLockCache*> lock_caches;
+  int get_lock_cache_allowed() const { return lock_cache_allowed; }
+  void set_lock_cache_allowed(int c) { lock_cache_allowed |= c; }
+  void clear_lock_cache_allowed(int c) { lock_cache_allowed &= ~c; }
+
+private:
+  void calc_issued() {
+    // issued = the pending bits plus the 'before' bits of every recorded revoke
+    __u32 all = _pending;
+    for (const auto &rev : _revokes)
+      all |= rev.before;
+    _issued = all;
+  }
+
+  void revalidate();
+
+  void mark_notable();
+  void maybe_clear_notable();
+
+  CInode *inode;
+  Session *session;
+
+  uint64_t cap_id;
+  uint32_t cap_gen;
+
+  __u32 _wanted = 0;     // what the client wants (ideally)
+
+  utime_t last_issue_stamp;
+  utime_t last_revoke_stamp;
+  unsigned num_revoke_warnings = 0;
+
+  // track in-flight caps --------------
+  //  - add new caps to _pending
+  //  - track revocations in _revokes list
+  __u32 _pending = 0, _issued = 0;
+  mempool::mds_co::list<revoke_info> _revokes;
+
+  ceph_seq_t last_sent = 0;
+  ceph_seq_t last_issue = 0;
+  ceph_seq_t mseq = 0;
+
+  int suppress = 0;
+  unsigned state = 0;
+
+  int lock_cache_allowed = 0;
+};
+
+WRITE_CLASS_ENCODER(Capability::Export)
+WRITE_CLASS_ENCODER(Capability::Import)
+WRITE_CLASS_ENCODER(Capability::revoke_info)
+WRITE_CLASS_ENCODER(Capability)
+
+
+
+#endif
diff --git a/src/mds/DamageTable.cc b/src/mds/DamageTable.cc
new file mode 100644
index 000000000..22802079d
--- /dev/null
+++ b/src/mds/DamageTable.cc
@@ -0,0 +1,275 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+
+#include "common/debug.h"
+
+#include "mds/CDir.h"
+
+#include "DamageTable.h"
+
+#define dout_context g_ceph_context
+#define dout_subsys ceph_subsys_mds
+#undef dout_prefix
+#define dout_prefix *_dout << "mds." << rank << ".damage " << __func__ << " "
+
+namespace {
+/**
+ * Damage entry for an entire directory fragment.  Any dentry stored in
+ * the fragment is implicitly covered by this record.
+ */
+class DirFragDamage : public DamageEntry {
+public:
+  // Directory inode and the fragment of it that is damaged.
+  inodeno_t ino;
+  frag_t frag;
+
+  DirFragDamage(inodeno_t ino_, frag_t frag_) : ino(ino_), frag(frag_) {}
+
+  // Type tag; DamageTable::erase() checks it before downcasting.
+  damage_entry_type_t get_type() const override { return DAMAGE_ENTRY_DIRFRAG; }
+
+  // Write this entry as a single "dir_frag_damage" object section.
+  void dump(Formatter *f) const override {
+    f->open_object_section("dir_frag_damage");
+    f->dump_string("damage_type", "dir_frag");
+    f->dump_int("id", id);
+    f->dump_int("ino", ino);
+    f->dump_stream("frag") << frag;
+    f->dump_string("path", path);
+    f->close_section();
+  }
+};
+
+
+/**
+ * Damage entry for one dentry (name + snapshot id) inside a specific
+ * directory fragment.
+ */
+class DentryDamage : public DamageEntry {
+public:
+  // Directory inode / fragment that contains the dentry.
+  inodeno_t ino;
+  frag_t frag;
+  // Name and snapshot id identifying the dentry within the fragment.
+  std::string dname;
+  snapid_t snap_id;
+
+  DentryDamage(inodeno_t ino_, frag_t frag_,
+               std::string_view dname_, snapid_t snap_id_)
+    : ino(ino_), frag(frag_), dname(dname_), snap_id(snap_id_) {}
+
+  // Type tag; DamageTable::erase() checks it before downcasting.
+  damage_entry_type_t get_type() const override { return DAMAGE_ENTRY_DENTRY; }
+
+  // Write this entry as a single "dentry_damage" object section.
+  void dump(Formatter *f) const override {
+    f->open_object_section("dentry_damage");
+    f->dump_string("damage_type", "dentry");
+    f->dump_int("id", id);
+    f->dump_int("ino", ino);
+    f->dump_stream("frag") << frag;
+    f->dump_string("dname", dname);
+    f->dump_stream("snap_id") << snap_id;
+    f->dump_string("path", path);
+    f->close_section();
+  }
+};
+
+
+/**
+ * Damage entry recording that an inode could not be looked up by its
+ * number.
+ */
+class BacktraceDamage : public DamageEntry {
+public:
+  // The inode number that failed to resolve.
+  inodeno_t ino;
+
+  BacktraceDamage(inodeno_t ino_) : ino(ino_) {}
+
+  // Type tag; DamageTable::erase() checks it before downcasting.
+  damage_entry_type_t get_type() const override { return DAMAGE_ENTRY_BACKTRACE; }
+
+  // Write this entry as a single "backtrace_damage" object section.
+  void dump(Formatter *f) const override {
+    f->open_object_section("backtrace_damage");
+    f->dump_string("damage_type", "backtrace");
+    f->dump_int("id", id);
+    f->dump_int("ino", ino);
+    f->dump_string("path", path);
+    f->close_section();
+  }
+};
+}
+
+// Out-of-line definition of the virtual destructor declared in DamageTable.h;
+// there is nothing to release beyond the members themselves.
+DamageEntry::~DamageEntry() = default;
+
+/**
+ * Record that a single dentry could not be loaded.
+ *
+ * @param ino      inode number of the directory containing the dentry
+ * @param frag     fragment of that directory holding the dentry
+ * @param snap_id  snapshot id of the damaged dentry
+ * @param dname    name of the damaged dentry
+ * @param path     advisory path stored on the new entry
+ * @return true if the condition is fatal (the table is over its size limit,
+ *         or the dentry lives in this rank's own mdsdir/stray directory);
+ *         false once the damage is recorded or was already known
+ */
+bool DamageTable::notify_dentry(
+    inodeno_t ino, frag_t frag,
+    snapid_t snap_id, std::string_view dname, std::string_view path)
+{
+  if (oversized()) {
+    return true;
+  }
+
+  // Special cases: damage to these dirfrags is considered fatal to
+  // the MDS rank that owns them.
+  if (
+      (MDS_INO_IS_MDSDIR(ino) && MDS_INO_MDSDIR_OWNER(ino) == rank)
+      ||
+      (MDS_INO_IS_STRAY(ino) && MDS_INO_STRAY_OWNER(ino) == rank)
+     ) {
+    // Leading space before "is fatal" keeps the ino and the text apart.
+    derr << "Damage to dentries in fragment " << frag << " of ino " << ino
+         << " is fatal because it is a system directory for this rank" << dendl;
+    return true;
+  }
+
+  // operator[] creates the per-dirfrag map on first use.
+  auto& df_dentries = dentries[DirFragIdent(ino, frag)];
+  if (auto [it, inserted] = df_dentries.try_emplace(DentryIdent(dname, snap_id)); inserted) {
+    // First report for this dentry: index it both by location and by id.
+    auto entry = std::make_shared<DentryDamage>(ino, frag, dname, snap_id);
+    entry->path = path;
+    it->second = entry;
+    by_id[entry->id] = std::move(entry);
+  }
+
+  return false;
+}
+
+/**
+ * Record that a whole directory fragment could not be loaded.
+ *
+ * @param ino   inode number of the directory
+ * @param frag  the damaged fragment
+ * @param path  advisory path stored on the new entry
+ * @return true if fatal (root or this rank's stray directory, or the table
+ *         is over its size limit); false otherwise
+ */
+bool DamageTable::notify_dirfrag(inodeno_t ino, frag_t frag,
+                                 std::string_view path)
+{
+  // Damage to this rank's stray directories or to the root is always fatal.
+  const bool own_stray =
+    MDS_INO_IS_STRAY(ino) && MDS_INO_STRAY_OWNER(ino) == rank;
+  if (own_stray || (ino == CEPH_INO_ROOT)) {
+    derr << "Damage to fragment " << frag << " of ino " << ino
+         << " is fatal because it is a system directory for this rank" << dendl;
+    return true;
+  }
+
+  if (oversized()) {
+    return true;
+  }
+
+  auto [slot, is_new] = dirfrags.try_emplace(DirFragIdent(ino, frag));
+  if (!is_new) {
+    // Already known; nothing more to record.
+    return false;
+  }
+
+  DamageEntryRef damage = std::make_shared<DirFragDamage>(ino, frag);
+  damage->path = path;
+  slot->second = damage;
+  const damage_entry_id_t new_id = damage->id;
+  by_id[new_id] = std::move(damage);
+  return false;
+}
+
+/**
+ * Record that an inode could not be resolved by number.
+ *
+ * @param ino   the inode number that failed to resolve
+ * @param path  advisory path stored on the new entry
+ * @return true if the table is over its size limit (fatal); false otherwise
+ */
+bool DamageTable::notify_remote_damaged(inodeno_t ino, std::string_view path)
+{
+  if (oversized()) {
+    return true;
+  }
+
+  auto [slot, is_new] = remotes.try_emplace(ino);
+  if (!is_new) {
+    // Already known; nothing more to record.
+    return false;
+  }
+
+  auto damage = std::make_shared<BacktraceDamage>(ino);
+  damage->path = path;
+  slot->second = damage;
+  const damage_entry_id_t new_id = damage->id;
+  by_id[new_id] = std::move(damage);
+  return false;
+}
+
+// True once the number of recorded entries exceeds the configured
+// mds_damage_table_max_entries.
+bool DamageTable::oversized() const
+{
+  const auto limit =
+    static_cast<size_t>(g_conf()->mds_damage_table_max_entries);
+  return by_id.size() > limit;
+}
+
+/**
+ * Check whether damage has been recorded for one dentry.
+ *
+ * @param dir_frag  directory fragment that would hold the dentry
+ * @param dname     dentry name
+ * @param snap_id   dentry snapshot id
+ * @return true if a matching dentry damage entry exists
+ */
+bool DamageTable::is_dentry_damaged(
+        const CDir *dir_frag,
+        std::string_view dname,
+        const snapid_t snap_id) const
+{
+  const auto df_it = dentries.find(
+      DirFragIdent(dir_frag->inode->ino(), dir_frag->frag));
+  if (df_it == dentries.end()) {
+    // No dentry damage at all in this dirfrag.
+    return false;
+  }
+
+  const std::map<DentryIdent, DamageEntryRef> &frag_dentries = df_it->second;
+  return frag_dentries.count(DentryIdent(dname, snap_id)) > 0;
+}
+
+// True if damage has been recorded for the whole of the given dirfrag.
+bool DamageTable::is_dirfrag_damaged(
+    const CDir *dir_frag) const
+{
+  const DirFragIdent key(dir_frag->inode->ino(), dir_frag->frag);
+  return dirfrags.count(key) > 0;
+}
+
+// True if the given inode number has been recorded as unresolvable.
+bool DamageTable::is_remote_damaged(
+    const inodeno_t ino) const
+{
+  const bool known = remotes.count(ino) > 0;
+  return known;
+}
+
+// Dump every recorded entry, in by_id order, inside one "damage_table"
+// array section.
+void DamageTable::dump(Formatter *f) const
+{
+  f->open_array_section("damage_table");
+  for (const auto &kv : by_id) {
+    const DamageEntryRef &entry = kv.second;
+    entry->dump(f);
+  }
+  f->close_section();
+}
+
+/**
+ * Remove one damage entry, identified by its ID, from the by-ID index and
+ * from the type-specific map that also references it.  Unknown IDs are
+ * silently ignored.
+ *
+ * @param damage_id  ID of the entry to drop
+ */
+void DamageTable::erase(damage_entry_id_t damage_id)
+{
+  auto by_id_entry = by_id.find(damage_id);
+  if (by_id_entry == by_id.end()) {
+    return;
+  }
+
+  DamageEntryRef entry = by_id_entry->second;
+  ceph_assert(entry->id == damage_id);  // Sanity
+
+  // get_type() tells us which concrete class the entry is, so the
+  // static downcasts below are safe.
+  const auto type = entry->get_type();
+  if (type == DAMAGE_ENTRY_DIRFRAG) {
+    auto dirfrag_entry = std::static_pointer_cast<DirFragDamage>(entry);
+    dirfrags.erase(DirFragIdent(dirfrag_entry->ino, dirfrag_entry->frag));
+  } else if (type == DAMAGE_ENTRY_DENTRY) {
+    auto dentry_entry = std::static_pointer_cast<DentryDamage>(entry);
+    // Drop only this dentry.  Erasing the whole per-dirfrag map would
+    // forget sibling dentries that are still listed in by_id.
+    auto df_it = dentries.find(
+        DirFragIdent(dentry_entry->ino, dentry_entry->frag));
+    if (df_it != dentries.end()) {
+      df_it->second.erase(
+          DentryIdent(dentry_entry->dname, dentry_entry->snap_id));
+      if (df_it->second.empty()) {
+        // Last damaged dentry in this dirfrag: drop the empty inner map.
+        dentries.erase(df_it);
+      }
+    }
+  } else if (type == DAMAGE_ENTRY_BACKTRACE) {
+    auto backtrace_entry = std::static_pointer_cast<BacktraceDamage>(entry);
+    remotes.erase(backtrace_entry->ino);
+  } else {
+    derr << "Invalid type " << type << dendl;
+    ceph_abort();
+  }
+
+  by_id.erase(by_id_entry);
+}
+
diff --git a/src/mds/DamageTable.h b/src/mds/DamageTable.h
new file mode 100644
index 000000000..18a61e08b
--- /dev/null
+++ b/src/mds/DamageTable.h
@@ -0,0 +1,196 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+
+
+#ifndef DAMAGE_TABLE_H_
+#define DAMAGE_TABLE_H_
+
+#include <string_view>
+
+#include "mdstypes.h"
+#include "include/random.h"
+
+class CDir;
+
+// Identifier under which a damage entry is indexed in DamageTable::by_id.
+typedef uint64_t damage_entry_id_t;
+
+// Discriminator returned by DamageEntry::get_type(); one value per concrete
+// entry class defined in DamageTable.cc.
+typedef enum
+{
+  DAMAGE_ENTRY_DIRFRAG,    // damage to a whole directory fragment
+  DAMAGE_ENTRY_DENTRY,     // damage to a single dentry
+  DAMAGE_ENTRY_BACKTRACE   // inode could not be looked up by number
+
+} damage_entry_type_t;
+
+/**
+ * Abstract base for one record in the DamageTable.  Concrete subclasses
+ * report their kind through get_type() and know how to dump themselves.
+ */
+class DamageEntry {
+public:
+  // A new entry draws a random id (generator bounds 0 and 0xffffffff) and
+  // is stamped with the current time.
+  DamageEntry()
+    : id(ceph::util::generate_random_number<damage_entry_id_t>(0, 0xffffffff)),
+      reported_at(ceph_clock_now())
+  {}
+
+  virtual ~DamageEntry();
+
+  virtual damage_entry_type_t get_type() const = 0;
+  virtual void dump(Formatter *f) const = 0;
+
+  damage_entry_id_t id;
+  utime_t reported_at;
+
+  // Optional and advisory only: gives the administrator an idea of which
+  // part of the tree the damage affects.
+  std::string path;
+};
+
+using DamageEntryRef = std::shared_ptr<DamageEntry>;
+
+// Map key identifying a directory fragment: (inode number, fragment).
+class DirFragIdent {
+public:
+  DirFragIdent(inodeno_t ino_, frag_t frag_) : ino(ino_), frag(frag_) {}
+
+  // Order by inode number first; fragments break ties.
+  bool operator<(const DirFragIdent &rhs) const {
+    return (ino == rhs.ino) ? (frag < rhs.frag) : (ino < rhs.ino);
+  }
+
+  inodeno_t ino;
+  frag_t frag;
+};
+
+// Map key identifying a dentry within a dirfrag: (name, snapshot id).
+class DentryIdent {
+public:
+  DentryIdent(std::string_view dname_, snapid_t snap_id_)
+    : dname(dname_), snap_id(snap_id_) {}
+
+  // Order by name first; snapshot ids break ties.
+  bool operator<(const DentryIdent &rhs) const {
+    return (dname == rhs.dname) ? (snap_id < rhs.snap_id)
+                                : (dname < rhs.dname);
+  }
+
+  std::string dname;
+  snapid_t snap_id;
+};
+
+/**
+ * Registry of in-RADOS metadata damage identified
+ * during forward scrub or during normal fetches.
+ *
+ * Used to indicate damage to the administrator, and
+ * to cache known-bad paths so that we don't hit them
+ * repeatedly.
+ *
+ * Callers notifying damage must check the return code; if
+ * a fatal condition is indicated then they should mark the MDS
+ * rank damaged.
+ *
+ * An artificial limit on the number of damage entries
+ * is imposed to avoid this structure growing indefinitely.  If
+ * a notification causes the limit to be exceeded, the fatal
+ * condition will be indicated in the return code and the MDS
+ * rank should be marked damaged.
+ *
+ * Protected by MDS::mds_lock
+ */
+class DamageTable
+{
+  public:
+    /**
+     * @param rank_ the MDS rank this table belongs to; must not be
+     *              MDS_RANK_NONE (asserted)
+     */
+    explicit DamageTable(const mds_rank_t rank_)
+      : rank(rank_)
+    {
+      ceph_assert(rank_ != MDS_RANK_NONE);
+    }
+
+    /**
+     * Return true if no damage entries exist
+     */
+    bool empty() const
+    {
+      return by_id.empty();
+    }
+
+    /**
+     * Indicate that a dirfrag cannot be loaded.
+     *
+     * @return true if fatal
+     */
+    bool notify_dirfrag(inodeno_t ino, frag_t frag, std::string_view path);
+
+    /**
+     * Indicate that a particular dentry cannot be loaded.
+     *
+     * @return true if fatal
+     */
+    bool notify_dentry(
+      inodeno_t ino, frag_t frag,
+      snapid_t snap_id, std::string_view dname, std::string_view path);
+
+    /**
+     * Indicate that a particular Inode could not be loaded by number
+     *
+     * @return true if fatal
+     */
+    bool notify_remote_damaged(inodeno_t ino, std::string_view path);
+
+    /**
+     * Return true if damage was recorded for this dentry of dir_frag.
+     */
+    bool is_dentry_damaged(
+      const CDir *dir_frag,
+      std::string_view dname,
+      const snapid_t snap_id) const;
+
+    /**
+     * Return true if damage was recorded for dir_frag as a whole.
+     */
+    bool is_dirfrag_damaged(const CDir *dir_frag) const;
+
+    /**
+     * Return true if ino was recorded as unresolvable by number.
+     */
+    bool is_remote_damaged(const inodeno_t ino) const;
+
+    /**
+     * Dump all entries as a "damage_table" array.
+     */
+    void dump(Formatter *f) const;
+
+    /**
+     * Remove the entry with the given id, if present.
+     */
+    void erase(damage_entry_id_t damage_id);
+
+  protected:
+    // I need to know my MDS rank so that I can check if
+    // metadata items are part of my mydir.
+    const mds_rank_t rank;
+
+    // True once by_id holds more entries than the configured maximum.
+    bool oversized() const;
+
+    // Map of all dirfrags reported damaged
+    std::map<DirFragIdent, DamageEntryRef> dirfrags;
+
+    // Store dentries in a map per dirfrag, so that we can
+    // readily look up all the bad dentries in a particular
+    // dirfrag
+    std::map<DirFragIdent, std::map<DentryIdent, DamageEntryRef> > dentries;
+
+    // Map of all inodes which could not be resolved remotely
+    // (i.e. have probably/possibly missing backtraces)
+    std::map<inodeno_t, DamageEntryRef> remotes;
+
+    // All damage, by ID.  This is a secondary index
+    // to the dirfrag, dentry, remote maps.  It exists
+    // to enable external tools to unambiguously operate
+    // on particular entries.
+    std::map<damage_entry_id_t, DamageEntryRef> by_id;
+};
+#endif // DAMAGE_TABLE_H_
diff --git a/src/mds/FSMap.cc b/src/mds/FSMap.cc
new file mode 100644
index 000000000..d16ffd3f0
--- /dev/null
+++ b/src/mds/FSMap.cc
@@ -0,0 +1,1211 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software 
+ * Foundation.  See file COPYING.
+ * 
+ */
+
+#include <ostream>
+
+#include "FSMap.h"
+
+#include "common/StackStringStream.h"
+
+#ifdef WITH_SEASTAR
+#include "crimson/common/config_proxy.h"
+#else
+#include "common/config_proxy.h"
+#endif
+#include "global/global_context.h"
+#include "mon/health_check.h"
+
+using std::list;
+using std::pair;
+using std::ostream;
+using std::string;
+
+using ceph::bufferlist;
+using ceph::Formatter;
+
+// Append this ClusterInfo to bl inside an ENCODE_START(1, 1)/ENCODE_FINISH
+// envelope.  Field order (client_name, cluster_name, fs_name) must match
+// decode().
+void ClusterInfo::encode(ceph::buffer::list &bl) const {
+  ENCODE_START(1, 1, bl);
+  encode(client_name, bl);
+  encode(cluster_name, bl);
+  encode(fs_name, bl);
+  ENCODE_FINISH(bl);
+}
+
+// Read a ClusterInfo from iter, consuming the fields in the order written
+// by encode(): client_name, cluster_name, fs_name.
+void ClusterInfo::decode(ceph::buffer::list::const_iterator &iter) {
+  DECODE_START(1, iter);
+  decode(client_name, iter);
+  decode(cluster_name, iter);
+  decode(fs_name, iter);
+  DECODE_FINISH(iter);
+}
+
+// Emit the three fields as strings into whatever section the caller has
+// open on f; no section is opened here.
+void ClusterInfo::dump(ceph::Formatter *f) const {
+  f->dump_string("client_name", client_name);
+  f->dump_string("cluster_name", cluster_name);
+  f->dump_string("fs_name", fs_name);
+}
+
+// Write a one-line, bracketed, human-readable form followed by std::endl.
+void ClusterInfo::print(std::ostream& out) const
+{
+  out << "[client_name=" << client_name;
+  out << ", cluster_name=" << cluster_name;
+  out << ", fs_name=" << fs_name << "]";
+  out << std::endl;
+}
+
+// Append this Peer to bl inside an ENCODE_START(1, 1)/ENCODE_FINISH
+// envelope: uuid first, then remote.  Order must match decode().
+void Peer::encode(ceph::buffer::list &bl) const {
+  ENCODE_START(1, 1, bl);
+  encode(uuid, bl);
+  encode(remote, bl);
+  ENCODE_FINISH(bl);
+}
+
+// Read a Peer from iter in the order written by encode(): uuid, remote.
+void Peer::decode(ceph::buffer::list::const_iterator &iter) {
+  DECODE_START(1, iter);
+  decode(uuid, iter);
+  decode(remote, iter);
+  DECODE_FINISH(iter);
+}
+
+// Dump this peer as an object section whose name is the peer's uuid and
+// whose only member is the "remote" object.
+void Peer::dump(ceph::Formatter *f) const {
+  f->open_object_section(uuid);
+  f->dump_object("remote", remote);
+  f->close_section();
+}
+
+// Write a one-line, bracketed, human-readable form followed by std::endl.
+void Peer::print(std::ostream& out) const
+{
+  out << "[uuid=" << uuid;
+  out << ", remote=" << remote << "]";
+  out << std::endl;
+}
+
+// Append this MirrorInfo to bl inside an ENCODE_START(1, 1)/ENCODE_FINISH
+// envelope: the mirrored flag first, then the peers.  Order must match
+// decode().
+void MirrorInfo::encode(ceph::buffer::list &bl) const {
+  ENCODE_START(1, 1, bl);
+  encode(mirrored, bl);
+  encode(peers, bl);
+  ENCODE_FINISH(bl);
+}
+
+// Read a MirrorInfo from iter in the order written by encode(): mirrored,
+// peers.
+void MirrorInfo::decode(ceph::buffer::list::const_iterator &iter) {
+  DECODE_START(1, iter);
+  decode(mirrored, iter);
+  decode(peers, iter);
+  DECODE_FINISH(iter);
+}
+
+// Dump every peer inside a single "peers" object section.  The mirrored
+// flag itself is not emitted here.
+void MirrorInfo::dump(ceph::Formatter *f) const
+{
+  f->open_object_section("peers");
+  for (const auto &p : peers) {
+    p.dump(f);
+  }
+  f->close_section(); // peers
+}
+
+// Write the peer set in bracketed, human-readable form followed by
+// std::endl.
+void MirrorInfo::print(std::ostream& out) const
+{
+  out << "[peers=" << peers << "]";
+  out << std::endl;
+}
+
+// Dump the filesystem: its MDSMap under "mdsmap", its id, and -- only when
+// mirroring is enabled -- a "mirror_info" section.
+void Filesystem::dump(Formatter *f) const
+{
+  f->open_object_section("mdsmap");
+  mds_map.dump(f);
+  f->close_section();
+
+  f->dump_int("id", fscid);
+
+  if (!mirror_info.is_mirrored()) {
+    return;
+  }
+
+  f->open_object_section("mirror_info");
+  mirror_info.dump(f);
+  f->close_section(); // mirror_info
+}
+
+// Dump the whole map: epoch, default fscid, compat set, feature flags,
+// the standby daemons (each with the epoch stored for it) and every
+// filesystem.
+void FSMap::dump(Formatter *f) const
+{
+  f->dump_int("epoch", epoch);
+  // Use 'default' naming to match 'set-default' CLI
+  f->dump_int("default_fscid", legacy_client_fscid);
+
+  f->open_object_section("compat");
+  default_compat.dump(f);
+  f->close_section();
+
+  f->open_object_section("feature_flags");
+  f->dump_bool("enable_multiple", enable_multiple);
+  f->dump_bool("ever_enabled_multiple", ever_enabled_multiple);
+  f->close_section();
+
+  f->open_array_section("standbys");
+  for (const auto &standby : standby_daemons) {
+    const auto &gid = standby.first;
+    f->open_object_section("info");
+    standby.second.dump(f);
+    // at() throws if no epoch was stored for this gid.
+    f->dump_int("epoch", standby_epochs.at(gid));
+    f->close_section();
+  }
+  f->close_section();
+
+  f->open_array_section("filesystems");
+  for (const auto &kv : filesystems) {
+    const auto &fs = kv.second;
+    f->open_object_section("filesystem");
+    fs->dump(f);
+    f->close_section();
+  }
+  f->close_section();
+}
+
+/**
+ * Copy-assign from another FSMap.  Scalar state and the daemon maps are
+ * copied member-wise; every Filesystem is deep-copied so the two maps do
+ * not share Filesystem objects afterwards.
+ *
+ * @param rhs map to copy from
+ * @return *this
+ */
+FSMap &FSMap::operator=(const FSMap &rhs)
+{
+  // Self-assignment must be a no-op: clearing `filesystems` below would
+  // otherwise empty rhs.filesystems before it is iterated.
+  if (this == &rhs) {
+    return *this;
+  }
+
+  epoch = rhs.epoch;
+  next_filesystem_id = rhs.next_filesystem_id;
+  legacy_client_fscid = rhs.legacy_client_fscid;
+  default_compat = rhs.default_compat;
+  enable_multiple = rhs.enable_multiple;
+  // Encoded, decoded and dumped alongside enable_multiple, so it has to
+  // travel with a copy as well.
+  ever_enabled_multiple = rhs.ever_enabled_multiple;
+  mds_roles = rhs.mds_roles;
+  standby_daemons = rhs.standby_daemons;
+  standby_epochs = rhs.standby_epochs;
+
+  filesystems.clear();
+  for (const auto &i : rhs.filesystems) {
+    const auto &fs = i.second;
+    filesystems[fs->fscid] = std::make_shared<Filesystem>(*fs);
+  }
+
+  return *this;
+}
+
+// Build one FSMap for tests: every MDSMap test instance becomes a
+// filesystem, with fscids assigned consecutively starting at 20.  The
+// FSMap appended to ls is heap-allocated with new.
+void FSMap::generate_test_instances(std::list<FSMap*>& ls)
+{
+  FSMap *m = new FSMap();
+
+  std::list<MDSMap*> mds_map_instances;
+  MDSMap::generate_test_instances(mds_map_instances);
+
+  int next_fscid = 20;
+  for (MDSMap *mdsmap : mds_map_instances) {
+    auto fs = Filesystem::create();
+    fs->fscid = next_fscid++;
+    fs->mds_map = *mdsmap;
+    // The MDSMap has been copied; free the instance handed to us.
+    delete mdsmap;
+    m->filesystems[fs->fscid] = fs;
+  }
+  mds_map_instances.clear();
+
+  ls.push_back(m);
+}
+
+// Print a multi-line human-readable description: header fields, then each
+// filesystem, then the standby daemons.
+void FSMap::print(ostream& out) const
+{
+  out << "e" << epoch << std::endl;
+  out << "enable_multiple, ever_enabled_multiple: " << enable_multiple << ","
+      << ever_enabled_multiple << std::endl;
+  out << "default compat: " << default_compat << std::endl;
+  out << "legacy client fscid: " << legacy_client_fscid << std::endl;
+  out << " " << std::endl;
+
+  if (filesystems.empty()) {
+    out << "No filesystems configured" << std::endl;
+  } else {
+    for (const auto &kv : filesystems) {
+      kv.second->print(out);
+      out << " " << std::endl << " " << std::endl;  // Space out a bit
+    }
+  }
+
+  if (standby_daemons.empty()) {
+    return;
+  }
+
+  out << "Standby daemons:" << std::endl << " " << std::endl;
+  for (const auto &kv : standby_daemons) {
+    out << kv.second << std::endl;
+  }
+}
+
+// One-line daemon count summary, summed over all filesystems.
+void FSMap::print_daemon_summary(ostream& out) const
+{
+  // this appears in the "services:" section of "ceph status"
+  int up = 0;
+  int in = 0;
+  int failed = 0;
+  int standby_replay = 0;
+  for (const auto &kv : filesystems) {
+    const auto &mdsmap = kv.second->mds_map;
+    up += mdsmap.get_num_up_mds();
+    in += mdsmap.get_num_in_mds();
+    failed += mdsmap.get_num_failed_mds();
+    standby_replay += mdsmap.get_num_standby_replay_mds();
+  }
+  const int standby = standby_daemons.size();
+
+  out << up << "/" << in << " daemons up";
+  if (failed) {
+    out << " (" << failed << " failed)";
+  }
+  if (standby) {
+    out << ", " << standby << " standby";
+  }
+  if (standby_replay) {
+    out << ", " << standby_replay << " hot standby";
+  }
+}
+
+// One-line per-volume health summary.  Prints nothing when there are no
+// filesystems.
+void FSMap::print_fs_summary(ostream& out) const
+{
+  // this appears in the "data:" section of "ceph status"
+  if (filesystems.empty()) {
+    return;
+  }
+
+  int healthy = 0;
+  int recovering = 0;
+  int failed = 0;
+  int stopped = 0;
+  int damaged = 0;
+  for (const auto &kv : filesystems) {
+    const auto &mdsmap = kv.second->mds_map;
+
+    // "damaged" is counted independently of the other buckets.
+    if (mdsmap.is_any_damaged()) {
+      ++damaged;
+    }
+
+    // Each filesystem lands in exactly one of these four buckets.
+    if (mdsmap.is_any_failed()) {
+      ++failed;
+    } else if (mdsmap.is_degraded()) {
+      ++recovering;
+    } else if (mdsmap.get_max_mds() == 0) {
+      ++stopped;
+    } else {
+      ++healthy;
+    }
+  }
+
+  out << "    volumes: "
+      << healthy << "/" << filesystems.size() << " healthy";
+  if (recovering) {
+    out << ", " << recovering << " recovering";
+  }
+  if (failed) {
+    out << ", " << failed << " failed";
+  }
+  if (stopped) {
+    out << ", " << stopped << " stopped";
+  }
+  if (damaged) {
+    out << "; " << damaged << " damaged";
+  }
+  out << "\n";
+}
+
+/**
+ * Summarize the map either structurally or as plain text.
+ *
+ * @param f    if non-null, all output goes to this Formatter and out is
+ *             never touched
+ * @param out  used (and dereferenced unconditionally) when f is null
+ *
+ * The function is laid out in phases, each branching on f:
+ *  1. per-filesystem counts,
+ *  2. per-daemon rank/state (the "by_rank" array in formatter mode),
+ *  3. standby count (formatter mode only, as a separate field),
+ *  4. totals of failed and damaged ranks.
+ */
+void FSMap::print_summary(Formatter *f, ostream *out) const
+{
+  if (f) {
+    f->dump_unsigned("epoch", get_epoch());
+    for (const auto &p : filesystems) {
+      auto& fs = p.second;
+      f->dump_unsigned("id", fs->fscid);
+      f->dump_unsigned("up", fs->mds_map.up.size());
+      f->dump_unsigned("in", fs->mds_map.in.size());
+      f->dump_unsigned("max", fs->mds_map.max_mds);
+    }
+  } else {
+    auto count = filesystems.size();
+    if (count <= 3) {
+      // Few filesystems: list each one by name, space separated.
+      bool first = true;
+      for (const auto& p : filesystems) {
+        const auto& fs = p.second;
+        if (!first) {
+          *out << " ";
+        }
+        if (fs->mds_map.is_degraded()) {
+          *out << fs->mds_map.fs_name << ":" << fs->mds_map.up.size() << "/" << fs->mds_map.in.size();
+        } else {
+          *out << fs->mds_map.fs_name << ":" << fs->mds_map.in.size();
+        }
+        first = false;
+      }
+    } else {
+      // Many filesystems: print only the count, plus the degraded ones
+      // (named individually if there are at most three of them).
+      *out << count << " fs";
+      unsigned degraded = 0;
+      CachedStackStringStream css;
+      *css << " (degraded: ";
+      for (const auto& p : filesystems) {
+        const auto& fs = p.second;
+        if (fs->mds_map.is_degraded()) {
+          degraded++;
+          if (degraded <= 3) {
+            // NOTE(review): no separator is written between consecutive
+            // entries here -- confirm whether that is intended.
+            *css << fs->mds_map.fs_name << ":" << fs->mds_map.up.size() << "/" << fs->mds_map.in.size();
+          }
+        }
+      }
+      if (degraded > 0) {
+        if (degraded <= 3) {
+          *css << ")";
+          *out << css->strv();
+        } else {
+          *out << " (degraded: " << degraded << " fs)";
+        }
+      }
+    }
+  }
+
+  if (f) {
+    f->open_array_section("by_rank");
+  }
+
+  // by_state: daemon count per state.  by_rank: text-mode only, the state
+  // and "name=status" string for each (fscid, rank) role.
+  std::map<MDSMap::DaemonState,unsigned> by_state;
+  std::map<mds_role_t, std::pair<MDSMap::DaemonState, std::string>> by_rank;
+  by_state[MDSMap::DaemonState::STATE_STANDBY] = standby_daemons.size();
+  for (const auto& [gid, fscid] : mds_roles) {
+    // Daemons not attached to any filesystem were counted as standbys above.
+    if (fscid == FS_CLUSTER_ID_NONE)
+      continue;
+
+    const auto& info = filesystems.at(fscid)->mds_map.get_info_gid(gid);
+    auto s = std::string(ceph_mds_state_name(info.state));
+    if (info.laggy()) {
+      s += "(laggy or crashed)";
+    }
+
+    if (f) {
+      f->open_object_section("mds");
+      f->dump_unsigned("filesystem_id", fscid);
+      f->dump_unsigned("rank", info.rank);
+      f->dump_string("name", info.name);
+      f->dump_string("status", s);
+      f->dump_unsigned("gid", gid);
+      f->close_section();
+    } else if (info.state != MDSMap::DaemonState::STATE_STANDBY_REPLAY) {
+      by_rank[mds_role_t(fscid, info.rank)] = std::make_pair(info.state, info.name + "=" + s);
+    }
+    by_state[info.state]++;
+  }
+
+  if (f) {
+    f->close_section();
+  } else {
+    // Only spell out individual ranks when there are fewer than five.
+    if (0 < by_rank.size() && by_rank.size() < 5) {
+      if (filesystems.size() > 1) {
+        // Disambiguate filesystems
+        std::map<std::string, std::string> pretty;
+        for (const auto& [role,status] : by_rank) {
+          const auto &fs_name = filesystems.at(role.fscid)->mds_map.fs_name;
+          CachedStackStringStream css;
+          *css << fs_name << ":" << role.rank;
+          pretty.emplace(std::piecewise_construct, std::forward_as_tuple(css->strv()), std::forward_as_tuple(status.second));
+          --by_state[status.first]; /* already printed! */
+        }
+        *out << " " << pretty;
+      } else {
+        // Omit FSCID in output when only one filesystem exists
+        std::map<mds_rank_t, std::string> shortened;
+        for (const auto& [role,status] : by_rank) {
+          shortened[role.rank] = status.second;
+          --by_state[status.first]; /* already printed! */
+        }
+        *out << " " << shortened;
+      }
+    }
+    // Whatever was not printed per-rank is summarized as "<count> <state>".
+    for (const auto& [state, count] : by_state) {
+      if (count > 0) {
+        auto s = std::string_view(ceph_mds_state_name(state));
+        *out << " " << count << " " << s;
+      }
+    }
+  }
+
+  if (f) {
+    // Standby count, keyed by the standby state's name.
+    const auto state = MDSMap::DaemonState::STATE_STANDBY;
+    auto&& name = ceph_mds_state_name(state);
+    auto count = standby_daemons.size();
+    f->dump_unsigned(name, count);
+  }
+
+  // Totals over all filesystems; each is reported only when non-zero.
+  size_t failed = 0;
+  size_t damaged = 0;
+  for (const auto& p : filesystems) {
+    auto& fs = p.second;
+    failed += fs->mds_map.failed.size();
+    damaged += fs->mds_map.damaged.size();
+  }
+
+  if (failed > 0) {
+    if (f) {
+      f->dump_unsigned("failed", failed);
+    } else {
+      *out << ", " << failed << " failed";
+    }
+  }
+
+  if (damaged > 0) {
+    if (f) {
+      f->dump_unsigned("damaged", damaged);
+    } else {
+      *out << ", " << damaged << " damaged";
+    }
+  }
+  //if (stopped.size())
+  //out << ", " << stopped.size() << " stopped";
+}
+
+/**
+ * Find a standby-replay daemon following the same rank as `who`.
+ *
+ * @param who gid of the daemon whose rank is of interest
+ * @return gid of the first standby-replay daemon with that rank, or
+ *         MDS_GID_NONE if there is none
+ */
+mds_gid_t Filesystem::get_standby_replay(mds_gid_t who) const
+{
+  for (const auto &kv : mds_map.mds_info) {
+    const auto &candidate = kv.second;
+    if (candidate.state != MDSMap::STATE_STANDBY_REPLAY) {
+      continue;
+    }
+    // `who` is only looked up (via at()) once a standby-replay candidate
+    // has been seen.
+    if (candidate.rank == mds_map.mds_info.at(who).rank) {
+      return candidate.global_id;
+    }
+  }
+  return MDS_GID_NONE;
+}
+
+/**
+ * Create a filesystem and register it in this map.
+ *
+ * @param name           filesystem name
+ * @param metadata_pool  pool id stored as the metadata pool
+ * @param data_pool      pool id stored as the first data pool
+ * @param features       not used by this function
+ * @param fscid          requested id, or FS_CLUSTER_ID_NONE to allocate the
+ *                       next free one
+ * @param recover        if true, rank 0 is pre-populated as in+failed and
+ *                       the filesystem is flagged not joinable
+ * @return the newly created filesystem
+ */
+Filesystem::ref FSMap::create_filesystem(std::string_view name,
+    int64_t metadata_pool, int64_t data_pool, uint64_t features,
+    fs_cluster_id_t fscid, bool recover)
+{
+  auto fs = Filesystem::create();
+  auto &mdsmap = fs->mds_map;
+
+  mdsmap.epoch = epoch;
+  mdsmap.fs_name = name;
+  mdsmap.data_pools.push_back(data_pool);
+  mdsmap.metadata_pool = metadata_pool;
+  mdsmap.cas_pool = -1;
+  mdsmap.compat = default_compat;
+  mdsmap.created = ceph_clock_now();
+  mdsmap.modified = ceph_clock_now();
+  mdsmap.enabled = true;
+
+  if (fscid != FS_CLUSTER_ID_NONE) {
+    // Caller chose the id; keep the allocator ahead of it.
+    fs->fscid = fscid;
+    next_filesystem_id = std::max(fscid,  (fs_cluster_id_t)next_filesystem_id) + 1;
+  } else {
+    fs->fscid = next_filesystem_id++;
+  }
+
+  if (recover) {
+    // Populate rank 0 as existing (so don't go into CREATING)
+    // but failed (so that next available MDS is assigned the rank)
+    mdsmap.in.insert(mds_rank_t(0));
+    mdsmap.failed.insert(mds_rank_t(0));
+
+    mdsmap.set_flag(CEPH_MDSMAP_NOT_JOINABLE);
+  }
+
+  // File system's ID can be FS_CLUSTER_ID_ANONYMOUS if we're recovering
+  // a legacy file system by passing FS_CLUSTER_ID_ANONYMOUS as the desired
+  // file system ID
+  if (fscid != FS_CLUSTER_ID_ANONYMOUS) {
+    // ANONYMOUS is only for upgrades from legacy mdsmaps, we should
+    // have initialized next_filesystem_id such that it's never used here.
+    ceph_assert(fs->fscid != FS_CLUSTER_ID_ANONYMOUS);
+  }
+  filesystems[fs->fscid] = fs;
+
+  // The first filesystem ever created becomes the one legacy clients use.
+  if (filesystems.size() == 1) {
+    legacy_client_fscid = fs->fscid;
+  }
+
+  return fs;
+}
+
+// Look a filesystem up by name; returns nullptr when no filesystem has
+// that name.
+Filesystem::const_ref FSMap::get_filesystem(std::string_view name) const
+{
+  for (const auto &kv : filesystems) {
+    const auto &fs = kv.second;
+    if (fs->mds_map.fs_name == name) {
+      return fs;
+    }
+  }
+  return nullptr;
+}
+
+// Return const references to all filesystems, in map order.
+std::vector<Filesystem::const_ref> FSMap::get_filesystems(void) const
+{
+  std::vector<Filesystem::const_ref> all;
+  for (const auto &kv : filesystems) {
+    all.emplace_back(kv.second);
+  }
+  return all;
+}
+
+/**
+ * Replace the filesystem `fscid` with a fresh one that keeps its identity
+ * (id, name, pools and a few settings) but starts with rank 0 marked
+ * in+failed.
+ */
+void FSMap::reset_filesystem(fs_cluster_id_t fscid)
+{
+  auto fs = get_filesystem(fscid);
+  auto new_fs = Filesystem::create();
+
+  const auto &old_map = fs->mds_map;
+  auto &new_map = new_fs->mds_map;
+
+  // Populate rank 0 as existing (so don't go into CREATING)
+  // but failed (so that next available MDS is assigned the rank)
+  new_map.in.insert(mds_rank_t(0));
+  new_map.failed.insert(mds_rank_t(0));
+
+  // Carry forward what makes sense
+  new_fs->fscid = fs->fscid;
+  new_map.inline_data_enabled = old_map.inline_data_enabled;
+  new_map.data_pools = old_map.data_pools;
+  new_map.metadata_pool = old_map.metadata_pool;
+  new_map.cas_pool = old_map.cas_pool;
+  new_map.fs_name = old_map.fs_name;
+  new_map.compat = default_compat;
+  new_map.created = ceph_clock_now();
+  new_map.modified = ceph_clock_now();
+  new_map.standby_count_wanted = old_map.standby_count_wanted;
+  new_map.enabled = true;
+
+  // Remember mds ranks that have ever started. (They should load old inotable
+  // instead of creating new one if they start again.)
+  new_map.stopped.insert(old_map.in.begin(), old_map.in.end());
+  new_map.stopped.insert(old_map.stopped.begin(), old_map.stopped.end());
+  new_map.stopped.erase(mds_rank_t(0));
+
+  // Install the replacement under the same id.
+  filesystems[new_fs->fscid] = new_fs;
+}
+
+/**
+ * Collect health messages from every filesystem's MDSMap and add one
+ * warning if any filesystem wants more standby daemons than exist.
+ *
+ * @param summary receives the messages
+ * @param detail  passed through to MDSMap::get_health
+ */
+void FSMap::get_health(list<pair<health_status_t,string> >& summary,
+			list<pair<health_status_t,string> > *detail) const
+{
+  const auto num_standbys = (mds_rank_t)standby_daemons.size();
+
+  mds_rank_t standby_count_wanted = 0;
+  for (const auto &kv : filesystems) {
+    const auto &mdsmap = kv.second->mds_map;
+
+    // TODO: move get_health up into here so that we can qualify
+    // all the messages with what filesystem they're talking about
+    mdsmap.get_health(summary, detail);
+
+    // Keep the largest shortfall reported by any filesystem.
+    standby_count_wanted = std::max(standby_count_wanted, mdsmap.get_standby_count_wanted(num_standbys));
+  }
+
+  if (!standby_count_wanted) {
+    return;
+  }
+
+  CachedStackStringStream css;
+  *css << "insufficient standby daemons available: have " << standby_daemons.size() << "; want " << standby_count_wanted << " more";
+  summary.emplace_back(HEALTH_WARN, css->str());
+}
+
+// Run MDSMap::check_health on every filesystem, passing the number of
+// standby daemons.  Returns true if any of the calls returned true.
+bool FSMap::check_health(void)
+{
+  const auto num_standbys = (mds_rank_t)standby_daemons.size();
+
+  bool changed = false;
+  for (auto &kv : filesystems) {
+    // Every filesystem is checked, even after one has reported a change.
+    if (kv.second->mds_map.check_health(num_standbys)) {
+      changed = true;
+    }
+  }
+  return changed;
+}
+
+/**
+ * Fill `checks` with structured health checks: the per-filesystem MDSMap
+ * checks, FS_WITH_FAILED_MDS for filesystems that have failed ranks with
+ * no replacement, and MDS_INSUFFICIENT_STANDBY when any filesystem wants
+ * more standbys than exist.
+ */
+void FSMap::get_health_checks(health_check_map_t *checks) const
+{
+  const auto num_standbys = (mds_rank_t)standby_daemons.size();
+  mds_rank_t standby_count_wanted = 0;
+
+  for (const auto &kv : filesystems) {
+    const auto &fs = kv.second;
+
+    health_check_map_t fschecks;
+    fs->mds_map.get_health_checks(&fschecks);
+
+    // Some of the failed ranks might be transient (i.e. there are standbys
+    // ready to replace them).  We will report only on "stuck" failed, i.e.
+    // ranks which are failed and have no standby replacement available.
+    std::set<mds_rank_t> stuck_failed;
+    for (const auto &rank : fs->mds_map.failed) {
+      if (!find_replacement_for({fs->fscid, rank})) {
+        stuck_failed.insert(rank);
+      }
+    }
+
+    // FS_WITH_FAILED_MDS
+    if (!stuck_failed.empty()) {
+      health_check_t& fscheck = checks->get_or_add(
+        "FS_WITH_FAILED_MDS", HEALTH_WARN,
+        "%num% filesystem%plurals% %hasorhave% a failed mds daemon", 1);
+      CachedStackStringStream css;
+      *css << "fs " << fs->mds_map.fs_name << " has " << stuck_failed.size()
+           << " failed mds" << (stuck_failed.size() > 1 ? "s" : "");
+      fscheck.detail.push_back(css->str());
+    }
+
+    checks->merge(fschecks);
+
+    // Keep the largest shortfall reported by any filesystem.
+    standby_count_wanted = std::max(
+      standby_count_wanted,
+      fs->mds_map.get_standby_count_wanted(num_standbys));
+  }
+
+  // MDS_INSUFFICIENT_STANDBY
+  if (!standby_count_wanted) {
+    return;
+  }
+
+  CachedStackStringStream css1, css2;
+  *css1 << "insufficient standby MDS daemons available";
+  auto& d = checks->get_or_add("MDS_INSUFFICIENT_STANDBY", HEALTH_WARN, css1->str(), 1);
+  *css2 << "have " << standby_daemons.size() << "; want " << standby_count_wanted
+        << " more";
+  d.detail.push_back(css2->str());
+}
+
+/**
+ * Serialize the map into bl inside an ENCODE_START(STRUCT_VERSION, 6)
+ * envelope.
+ *
+ * @param bl        destination buffer
+ * @param features  passed on to the encoders of the filesystems and of
+ *                  standby_daemons
+ *
+ * Field order must stay in sync with decode().
+ */
+void FSMap::encode(bufferlist& bl, uint64_t features) const
+{
+  ENCODE_START(STRUCT_VERSION, 6, bl);
+  encode(epoch, bl);
+  encode(next_filesystem_id, bl);
+  encode(legacy_client_fscid, bl);
+  encode(default_compat, bl);
+  encode(enable_multiple, bl);
+  {
+    // Filesystems are written as a vector of refs in map order; decode()
+    // rebuilds the map, keyed by each filesystem's fscid.
+    std::vector<Filesystem::ref> v;
+    v.reserve(filesystems.size());
+    for (auto& p : filesystems) v.emplace_back(p.second);
+    encode(v, bl, features);
+  }
+  encode(mds_roles, bl);
+  encode(standby_daemons, bl, features);
+  encode(standby_epochs, bl);
+  encode(ever_enabled_multiple, bl);
+  ENCODE_FINISH(bl);
+}
+
+/**
+ * Populate this FSMap from the encoded form at `p`.
+ *
+ * `struct_version` is zeroed up front and only takes the decoded struct
+ * version once DECODE_START/DECODE_OLDEST have run.
+ * NOTE(review): DECODE_OLDEST(7) presumably rejects encodings older than
+ * v7 - confirm against the encoding macros.
+ */
+void FSMap::decode(bufferlist::const_iterator& p)
+{
+  struct_version = 0;
+  DECODE_START(STRUCT_VERSION, p);
+  DECODE_OLDEST(7);
+  struct_version = struct_v;
+  decode(epoch, p);
+  decode(next_filesystem_id, p);
+  decode(legacy_client_fscid, p);
+  decode(default_compat, p);
+  decode(enable_multiple, p);
+
+  // Rebuild the fscid -> Filesystem index from the flat vector on the wire.
+  std::vector<Filesystem::ref> decoded;
+  decode(decoded, p);
+  filesystems.clear();
+  for (auto& fs_ref : decoded) {
+    // read the key before the ref is moved into the map
+    const fs_cluster_id_t id = fs_ref->fscid;
+    auto em = filesystems.emplace(id, std::move(fs_ref));
+    // a second filesystem with the same fscid would not be inserted
+    ceph_assert(em.second);
+  }
+
+  decode(mds_roles, p);
+  decode(standby_daemons, p);
+  decode(standby_epochs, p);
+  if (struct_v >= 7) {
+    decode(ever_enabled_multiple, p);
+  }
+  DECODE_FINISH(p);
+}
+
+/**
+ * Call MDSMap::sanitize() on every filesystem's map, handing each one the
+ * same `pool_exists` predicate.
+ */
+void FSMap::sanitize(const std::function<bool(int64_t pool)>& pool_exists)
+{
+  for (auto& [fscid, filesystem] : filesystems) {
+    filesystem->mds_map.sanitize(pool_exists);
+  }
+}
+
+/**
+ * Serialize this filesystem: its fscid, its MDSMap and its mirror info.
+ *
+ * The MDSMap is first encoded (with `features`) into a buffer of its own,
+ * and that buffer is then encoded into `bl` as a single field.
+ */
+void Filesystem::encode(bufferlist& bl, uint64_t features) const
+{
+  ENCODE_START(2, 1, bl);
+  encode(fscid, bl);
+
+  bufferlist encoded_mdsmap;
+  mds_map.encode(encoded_mdsmap, features);
+  encode(encoded_mdsmap, bl);
+
+  encode(mirror_info, bl);
+  ENCODE_FINISH(bl);
+}
+
+/**
+ * Inverse of Filesystem::encode().
+ *
+ * The MDSMap is read as a nested buffer and decoded from that buffer.
+ * `mirror_info` is only read when the encoded struct version is 2 or newer;
+ * otherwise it is left as it was.
+ */
+void Filesystem::decode(bufferlist::const_iterator& p)
+{
+  DECODE_START(2, p);
+  decode(fscid, p);
+
+  bufferlist encoded_mdsmap;
+  decode(encoded_mdsmap, p);
+  auto mdsmap_it = encoded_mdsmap.cbegin();
+  mds_map.decode(mdsmap_it);
+
+  if (struct_v >= 2) {
+    decode(mirror_info, p);
+  }
+  DECODE_FINISH(p);
+}
+
+/**
+ * Resolve `ns_str` to a filesystem.
+ *
+ * The string is first tried as a decimal fscid.  If it does not parse, or
+ * no filesystem has that id, it is compared against the filesystem names.
+ *
+ * @param ns_str  filesystem id or filesystem name
+ * @param result  receives the matching filesystem on success
+ * @return 0 on success, -CEPHFS_ENOENT when nothing matches
+ */
+int FSMap::parse_filesystem(
+      std::string_view ns_str,
+      Filesystem::const_ref* result
+      ) const
+{
+  std::string parse_err;
+  const std::string wanted(ns_str);
+  const fs_cluster_id_t id = strict_strtol(wanted.c_str(), 10, &parse_err);
+
+  // Numeric form naming an existing filesystem: done.
+  if (parse_err.empty() && filesystems.count(id) != 0) {
+    *result = get_filesystem(id);
+    return 0;
+  }
+
+  // Otherwise treat the whole string as a filesystem name.
+  for (const auto& [fscid, candidate] : filesystems) {
+    if (candidate->mds_map.fs_name == wanted) {
+      *result = std::const_pointer_cast<const Filesystem>(candidate);
+      return 0;
+    }
+  }
+  return -CEPHFS_ENOENT;
+}
+
+/**
+ * Write a description of this filesystem to `out`: a header line with the
+ * name and fscid, the MDSMap, and the mirror info when mirroring is on.
+ */
+void Filesystem::print(std::ostream &out) const
+{
+  out << "Filesystem '" << mds_map.fs_name << "' (" << fscid << ")"
+      << std::endl;
+  mds_map.print(out);
+  if (!mirror_info.is_mirrored()) {
+    return;
+  }
+  mirror_info.print(out);
+}
+
+/**
+ * @return true if the MDSMap of at least one filesystem reports
+ *         is_degraded(); false otherwise (also when there are none).
+ */
+bool FSMap::is_any_degraded() const
+{
+  return std::any_of(filesystems.begin(), filesystems.end(),
+                     [](const auto& entry) {
+                       return entry.second->mds_map.is_degraded();
+                     });
+}
+
+/**
+ * Build a gid -> info map covering every daemon known to this FSMap: the
+ * standbys plus the daemons of each filesystem.  The result is a copy.
+ */
+std::map<mds_gid_t, MDSMap::mds_info_t> FSMap::get_mds_info() const
+{
+  // Start from the standbys, then layer each filesystem's daemons on top.
+  std::map<mds_gid_t, mds_info_t> all_daemons(standby_daemons);
+
+  for (const auto& [fscid, fs] : filesystems) {
+    for (const auto& [gid, info] : fs->mds_map.get_mds_info()) {
+      all_daemons[gid] = info;
+    }
+  }
+
+  return all_daemons;
+}
+
+/**
+ * Pick a standby daemon that could take over a rank in `fs`.
+ *
+ * A standby is skipped when it is laggy or frozen, when its compat set
+ * cannot write the filesystem's compat set, or when promoting it would
+ * change the filesystem's compat set while `fs` is not upgradeable.
+ *
+ * Among the remaining standbys, in iteration order:
+ *  - the first one whose join_fscid equals fs.fscid is returned at once;
+ *  - otherwise the last one with no join_fscid wins;
+ *  - otherwise the first one that prefers some other filesystem is used.
+ *
+ * @return pointer into standby_daemons, or nullptr if nothing qualifies
+ */
+const MDSMap::mds_info_t* FSMap::get_available_standby(const Filesystem& fs) const
+{
+  const bool upgradeable = fs.is_upgradeable();
+  const mds_info_t* best = nullptr;
+
+  for (const auto& [gid, info] : standby_daemons) {
+    ceph_assert(info.rank == MDS_RANK_NONE);
+    ceph_assert(info.state == MDSMap::STATE_STANDBY);
+
+    const bool usable =
+      !(info.laggy() || info.is_frozen())
+      /* standby must be compatible with this fs */
+      && info.compat.writeable(fs.mds_map.compat)
+      /* promotion must not change fs.mds_map.compat unless upgradeable */
+      && (upgradeable || fs.mds_map.compat.writeable(info.compat));
+    if (!usable) {
+      continue;
+    }
+
+    if (info.join_fscid == fs.fscid) {
+      return &info;  /* explicitly wants this fs */
+    }
+    /* a vanilla standby always replaces the current pick; a standby for
+     * another fs is only taken as a last resort */
+    if (info.join_fscid == FS_CLUSTER_ID_NONE || best == nullptr) {
+      best = &info;
+    }
+  }
+  return best;
+}
+
+/**
+ * Look up a daemon by name among all daemons returned by get_mds_info().
+ *
+ * @return the gid of the first daemon (in gid order) named `s`, or
+ *         MDS_GID_NONE when no daemon has that name
+ */
+mds_gid_t FSMap::find_mds_gid_by_name(std::string_view s) const
+{
+  const auto all_daemons = get_mds_info();
+  for (const auto& [gid, daemon] : all_daemons) {
+    if (daemon.name == s) {
+      return gid;
+    }
+  }
+  return MDS_GID_NONE;
+}
+
+/**
+ * Resolve a daemon name to its info.
+ *
+ * Standbys are searched first, then the daemons of each filesystem.
+ *
+ * @param name  daemon name to look for
+ * @return pointer to the first matching info, or nullptr if no daemon has
+ *         that name.  The pointer refers to an entry stored in this FSMap
+ *         (standby_daemons, or the map returned by a filesystem's
+ *         MDSMap::get_mds_info()); no copy is made.
+ */
+const MDSMap::mds_info_t* FSMap::find_by_name(std::string_view name) const
+{
+  for (const auto& [gid, info] : standby_daemons) {
+    if (info.name == name) {
+      return &info;
+    }
+  }
+
+  for (const auto& [fscid, fs] : filesystems) {
+    const auto& fs_info = fs->mds_map.get_mds_info();
+    for (const auto& [gid, info] : fs_info) {
+      if (info.name == name) {
+        return &info;
+      }
+    }
+  }
+
+  return nullptr;
+}
+
+/**
+ * Find a daemon that could take over `role`.
+ *
+ * A standby-replay daemon already following the rank is preferred.  If it
+ * is frozen, nullptr is returned and no other standby is considered.  With
+ * no standby-replay follower, the choice falls to get_available_standby().
+ *
+ * @return the replacement's info, or nullptr if there is none
+ */
+const MDSMap::mds_info_t* FSMap::find_replacement_for(mds_role_t role) const
+{
+  const Filesystem::const_ref fs = get_filesystem(role.fscid);
+
+  // First see if we have a STANDBY_REPLAY
+  for (const auto& [gid, info] : fs->mds_map.mds_info) {
+    const bool follows_rank =
+      info.rank == role.rank && info.state == MDSMap::STATE_STANDBY_REPLAY;
+    if (!follows_rank) {
+      continue;
+    }
+    if (info.is_frozen()) {
+      /* the standby-replay is frozen, do nothing! */
+      return nullptr;
+    }
+    ceph_assert(info.compat.writeable(fs->mds_map.compat));
+    return &info;
+  }
+
+  return get_available_standby(*fs);
+}
+
+/**
+ * Assert that the bookkeeping tables of this FSMap agree with each other:
+ * filesystems, each filesystem's MDSMap, mds_roles, standby_daemons and
+ * standby_epochs.  Any inconsistency trips a ceph_assert.
+ *
+ * @param pending  when true, a STANDBY_REPLAY daemon additionally requires
+ *                 its filesystem to allow standby replay
+ */
+void FSMap::sanity(bool pending) const
+{
+  /* Only do some sanity checks on **new** FSMaps. Older versions may not be
+   * compliant.
+   */
+
+  if (legacy_client_fscid != FS_CLUSTER_ID_NONE) {
+    ceph_assert(filesystems.count(legacy_client_fscid) == 1);
+  }
+
+  for (const auto& [fscid, fs] : filesystems) {
+    ceph_assert(fscid  == fs->fscid);
+    // Every daemon stored in a filesystem has a rank, is recorded in
+    // mds_roles under this fscid, and is absent from the standby tables.
+    for (const auto& [gid, info] : fs->mds_map.mds_info) {
+      ceph_assert(info.rank != MDS_RANK_NONE);
+      ceph_assert(mds_roles.at(gid) == fscid);
+      ceph_assert(standby_daemons.count(gid) == 0);
+      ceph_assert(standby_epochs.count(gid) == 0);
+      if (info.state != MDSMap::STATE_STANDBY_REPLAY) {
+        // A rank holder is the `up` entry of its rank, and that rank is
+        // neither failed nor damaged.
+        ceph_assert(fs->mds_map.up.at(info.rank) == gid);
+        ceph_assert(fs->mds_map.failed.count(info.rank) == 0);
+        ceph_assert(fs->mds_map.damaged.count(info.rank) == 0);
+      } else {
+        ceph_assert(!pending || fs->mds_map.allows_standby_replay());
+      }
+      ceph_assert(info.compat.writeable(fs->mds_map.compat));
+    }
+
+    // Every up rank is `in`, and its gid has an info entry.
+    for (const auto &j : fs->mds_map.up) {
+      mds_rank_t rank = j.first;
+      ceph_assert(fs->mds_map.in.count(rank) == 1);
+      mds_gid_t gid = j.second;
+      ceph_assert(fs->mds_map.mds_info.count(gid) == 1);
+    }
+  }
+
+  // Standbys: rankless, keyed by their own gid, with an epoch entry and a
+  // FS_CLUSTER_ID_NONE role.
+  for (const auto &i : standby_daemons) {
+    ceph_assert(i.second.state == MDSMap::STATE_STANDBY);
+    ceph_assert(i.second.rank == MDS_RANK_NONE);
+    ceph_assert(i.second.global_id == i.first);
+    ceph_assert(standby_epochs.count(i.first) == 1);
+    ceph_assert(mds_roles.count(i.first) == 1);
+    ceph_assert(mds_roles.at(i.first) == FS_CLUSTER_ID_NONE);
+  }
+
+  // No epoch entry without a matching standby.
+  for (const auto &i : standby_epochs) {
+    ceph_assert(standby_daemons.count(i.first) == 1);
+  }
+
+  // Every role points at a standby entry or at a daemon inside an existing
+  // filesystem.
+  for (const auto &i : mds_roles) {
+    if (i.second == FS_CLUSTER_ID_NONE) {
+      ceph_assert(standby_daemons.count(i.first) == 1);
+    } else {
+      ceph_assert(filesystems.count(i.second) == 1);
+      ceph_assert(filesystems.at(i.second)->mds_map.mds_info.count(i.first) == 1);
+    }
+  }
+}
+
+/**
+ * Give `assigned_rank` of `filesystem` to the daemon `standby_gid`.
+ *
+ * The daemon is either a plain standby (its info is copied from
+ * standby_daemons into the filesystem and the standby entry is removed) or
+ * a standby-replay daemon already stored in the filesystem and following
+ * `assigned_rank`.
+ *
+ * The new state depends on the rank: STARTING if the rank was in `stopped`,
+ * CREATING if the rank was not yet `in`, REPLAY otherwise (in which case
+ * the rank is also taken out of `failed`).
+ */
+void FSMap::promote(
+    mds_gid_t standby_gid,
+    Filesystem& filesystem,
+    mds_rank_t assigned_rank)
+{
+  ceph_assert(gid_exists(standby_gid));
+  const bool is_standby_replay =
+    mds_roles.at(standby_gid) != FS_CLUSTER_ID_NONE;
+
+  MDSMap &mds_map = filesystem.mds_map;
+
+  if (is_standby_replay) {
+    // Already lives in the filesystem; it must be following this rank.
+    ceph_assert(mds_map.mds_info.count(standby_gid));
+    ceph_assert(mds_map.mds_info.at(standby_gid).state == MDSMap::STATE_STANDBY_REPLAY);
+    ceph_assert(mds_map.mds_info.at(standby_gid).rank == assigned_rank);
+  } else {
+    ceph_assert(standby_daemons.count(standby_gid));
+    ceph_assert(standby_daemons.at(standby_gid).state == MDSMap::STATE_STANDBY);
+    // Insert daemon state to Filesystem
+    mds_map.mds_info[standby_gid] = standby_daemons.at(standby_gid);
+  }
+  auto& info = mds_map.mds_info.at(standby_gid);
+
+  // The daemon may bring compat bits the filesystem does not have yet.
+  if (!mds_map.compat.writeable(info.compat)) {
+    ceph_assert(filesystem.is_upgradeable());
+    mds_map.compat.merge(info.compat);
+  }
+
+  if (mds_map.stopped.erase(assigned_rank)) {
+    // The cluster is being expanded with a stopped rank
+    info.state = MDSMap::STATE_STARTING;
+  } else if (!mds_map.is_in(assigned_rank)) {
+    // The cluster is being expanded with a new rank
+    info.state = MDSMap::STATE_CREATING;
+  } else {
+    // An existing rank is being assigned to a replacement
+    info.state = MDSMap::STATE_REPLAY;
+    mds_map.failed.erase(assigned_rank);
+  }
+  info.rank = assigned_rank;
+  info.inc = epoch;
+  mds_roles.at(standby_gid) = filesystem.fscid;
+
+  // Update the rank state in Filesystem
+  mds_map.in.insert(assigned_rank);
+  mds_map.up[assigned_rank] = standby_gid;
+
+  // A plain standby no longer belongs in the standby tables.
+  if (!is_standby_replay) {
+    standby_daemons.erase(standby_gid);
+    standby_epochs.erase(standby_gid);
+  }
+
+  // Indicate that Filesystem has been modified
+  mds_map.epoch = epoch;
+}
+
+/**
+ * Turn the standby `standby_gid` into a standby-replay follower of
+ * `leader_rank` in filesystem `leader_ns`.
+ *
+ * The daemon's info moves from standby_daemons into the filesystem's
+ * MDSMap with the leader's rank and state STANDBY_REPLAY.  Its role is set
+ * to `leader_ns` and the filesystem's epoch is bumped to the current epoch.
+ *
+ * `standby_gid` must name an existing plain standby.  `leader_ns` must be
+ * an existing filesystem, otherwise filesystems.at() throws.
+ */
+void FSMap::assign_standby_replay(
+    const mds_gid_t standby_gid,
+    const fs_cluster_id_t leader_ns,
+    const mds_rank_t leader_rank)
+{
+  // Check existence first: mds_roles.at() throws std::out_of_range for an
+  // unknown gid, which would pre-empt the assertion meant to report it.
+  ceph_assert(gid_exists(standby_gid));
+  ceph_assert(mds_roles.at(standby_gid) == FS_CLUSTER_ID_NONE);
+  ceph_assert(!gid_has_rank(standby_gid));
+  ceph_assert(standby_daemons.count(standby_gid));
+
+  // Insert to the filesystem (one lookup, then edit the copy in place)
+  auto& fs = filesystems.at(leader_ns);
+  const auto& standby = standby_daemons.at(standby_gid);
+  auto& info = fs->mds_map.mds_info[standby_gid];
+  info = standby;
+  info.rank = leader_rank;
+  info.state = MDSMap::STATE_STANDBY_REPLAY;
+  mds_roles[standby_gid] = leader_ns;
+
+  // Remove from the list of standbys
+  standby_daemons.erase(standby_gid);
+  standby_epochs.erase(standby_gid);
+
+  // Indicate that Filesystem has been modified
+  fs->mds_map.epoch = epoch;
+}
+
+/**
+ * Forget the daemon `who`.
+ *
+ * A standby is simply dropped from the standby tables.  For a daemon
+ * stored in a filesystem, its info is removed and the filesystem's
+ * last_failure_osd_epoch and epoch are updated.  If it held a rank (was
+ * not standby-replay), the rank leaves `up` and is either removed from
+ * `in` (daemon was still CREATING) or added to `failed`.
+ *
+ * Throws std::out_of_range (from mds_roles.at) if `who` is unknown.
+ */
+void FSMap::erase(mds_gid_t who, epoch_t blocklist_epoch)
+{
+  const fs_cluster_id_t owner = mds_roles.at(who);
+
+  if (owner != FS_CLUSTER_ID_NONE) {
+    auto &fs = filesystems.at(owner);
+    const auto &info = fs->mds_map.mds_info.at(who);
+    const bool holds_rank = info.state != MDSMap::STATE_STANDBY_REPLAY;
+    if (holds_rank) {
+      if (info.state == MDSMap::STATE_CREATING) {
+        // If this gid didn't make it past CREATING, then forget
+        // the rank ever existed so that next time it's handed out
+        // to a gid it'll go back into CREATING.
+        fs->mds_map.in.erase(info.rank);
+      } else {
+        // Put this rank into the failed list so that the next available
+        // STANDBY will pick it up.
+        fs->mds_map.failed.insert(info.rank);
+      }
+      ceph_assert(fs->mds_map.up.at(info.rank) == info.global_id);
+      fs->mds_map.up.erase(info.rank);
+    }
+    // `info` must not be used past this point: its entry is erased here.
+    fs->mds_map.mds_info.erase(who);
+    fs->mds_map.last_failure_osd_epoch = blocklist_epoch;
+    fs->mds_map.epoch = epoch;
+  } else {
+    standby_daemons.erase(who);
+    standby_epochs.erase(who);
+  }
+
+  mds_roles.erase(who);
+}
+
+/**
+ * Mark the rank held by `who` as damaged and forget the daemon.
+ *
+ * `who` must be stored in a filesystem (not a standby).  The daemon is
+ * removed via erase(); afterwards its rank is in `damaged` and not in
+ * `failed`.
+ */
+void FSMap::damaged(mds_gid_t who, epoch_t blocklist_epoch)
+{
+  ceph_assert(mds_roles.at(who) != FS_CLUSTER_ID_NONE);
+
+  // Look up the filesystem and the rank first: erase() below drops both
+  // the gid's role and its info.
+  const fs_cluster_id_t owner = mds_roles.at(who);
+  auto fs = filesystems.at(owner);
+  const mds_rank_t rank = fs->mds_map.mds_info.at(who).rank;
+
+  erase(who, blocklist_epoch);
+
+  // erase() may have listed the rank as failed; it is damaged instead.
+  auto& mdsmap = fs->mds_map;
+  mdsmap.failed.erase(rank);
+  mdsmap.damaged.insert(rank);
+
+  ceph_assert(fs->mds_map.epoch == epoch);
+}
+
+/**
+ * Update to indicate that the rank `rank` is to be removed
+ * from the damaged list of the filesystem `fscid`.
+ *
+ * When the rank was listed as damaged it is moved to the failed list and
+ * the filesystem's epoch is set to the current epoch.
+ *
+ * @return true if the rank was damaged, false if nothing changed
+ */
+bool FSMap::undamaged(const fs_cluster_id_t fscid, const mds_rank_t rank)
+{
+  auto fs = filesystems.at(fscid);
+
+  const bool was_damaged = fs->mds_map.damaged.erase(rank) != 0;
+  if (!was_damaged) {
+    return false;
+  }
+
+  fs->mds_map.failed.insert(rank);
+  fs->mds_map.epoch = epoch;
+  return true;
+}
+
+/**
+ * Register a new daemon as a standby.
+ *
+ * `new_info` must be in STATE_STANDBY with no rank.  A copy is stored in
+ * standby_daemons, the role is set to FS_CLUSTER_ID_NONE, and the standby
+ * epoch is set to the current epoch.  An empty compat set in the stored
+ * copy is replaced by the v16.2.4 compat set, and the INLINE incompat
+ * feature is always added.
+ */
+void FSMap::insert(const MDSMap::mds_info_t &new_info)
+{
+  static const CompatSet empty;
+
+  ceph_assert(new_info.state == MDSMap::STATE_STANDBY);
+  ceph_assert(new_info.rank == MDS_RANK_NONE);
+
+  const auto gid = new_info.global_id;
+  mds_roles[gid] = FS_CLUSTER_ID_NONE;
+
+  auto& stored = standby_daemons[gid];
+  stored = new_info;
+
+  const bool compat_unset = (empty.compare(stored.compat) == 0);
+  if (compat_unset) {
+    // bootstrap old compat: boot beacon contains empty compat on old (v16.2.4
+    // or older) MDS.
+    stored.compat = MDSMap::get_compat_set_v16_2_4();
+  }
+  /* TODO remove after R is released
+   * Insert INLINE; see comment in MDSMap::decode.
+   */
+  stored.compat.incompat.insert(MDS_FEATURE_INCOMPAT_INLINE);
+
+  standby_epochs[gid] = epoch;
+}
+
+/**
+ * A daemon reports that it is STATE_STOPPED: remove it and the rank it
+ * held.  The rank leaves `up` and `in` and is recorded in `stopped`.
+ *
+ * Standby-replay daemons that were following the same rank are removed as
+ * well, via erase() with a blocklist epoch of 0.
+ *
+ * @param who  gid of the stopping daemon; must be stored in a filesystem
+ * @return the gids of the standby-replay daemons removed as a side effect
+ */
+std::vector<mds_gid_t> FSMap::stop(mds_gid_t who)
+{
+  ceph_assert(mds_roles.at(who) != FS_CLUSTER_ID_NONE);
+  auto fs = filesystems.at(mds_roles.at(who));
+  const auto &info = fs->mds_map.mds_info.at(who);
+  fs->mds_map.up.erase(info.rank);
+  fs->mds_map.in.erase(info.rank);
+  fs->mds_map.stopped.insert(info.rank);
+
+  // Also drop any standby replays that were following this rank.
+  // Collect them first and erase afterwards: erase() removes entries from
+  // mds_info, and removing the element a range-for is currently visiting
+  // invalidates the loop's iterator.
+  std::vector<mds_gid_t> standbys;
+  for (const auto& [other_gid, other_info] : fs->mds_map.mds_info) {
+    if (other_info.rank == info.rank
+        && other_info.state == MDSMap::STATE_STANDBY_REPLAY) {
+      standbys.push_back(other_gid);
+    }
+  }
+  for (const auto other_gid : standbys) {
+    erase(other_gid, 0);
+  }
+
+  // `info` refers to the entry erased on the next line; do not use it below.
+  fs->mds_map.mds_info.erase(who);
+  mds_roles.erase(who);
+
+  fs->mds_map.epoch = epoch;
+
+  return standbys;
+}
+
+
+/**
+ * Given one of the following forms:
+ *   <fs name>:<rank>
+ *   <fs id>:<rank>
+ *   <rank>
+ *
+ * Parse into a mds_role_t.  The rank-only form is only valid
+ * if legacy_client_fscid is set.
+ *
+ * This overload additionally requires the name of the resolved filesystem
+ * to be listed in `filter`.  An empty `filter` accepts every filesystem.
+ *
+ * @param role_str  text to parse
+ * @param role      receives the parsed role when parsing succeeds
+ * @param ss        receives a human-readable reason on failure
+ * @param filter    allowed filesystem names; empty means "no restriction"
+ * @return 0 on success; the negative error of the unfiltered parse_role();
+ *         or -CEPHFS_ENOENT when the filesystem is not in `filter`
+ */
+int FSMap::parse_role(
+    std::string_view role_str,
+    mds_role_t *role,
+    std::ostream &ss,
+    const std::vector<string> &filter) const
+{
+  const int r = parse_role(role_str, role, ss);
+  if (r < 0) return r;
+
+  string_view fs_name = get_filesystem(role->fscid)->mds_map.get_fs_name();
+
+  const bool allowed = filter.empty() ||
+      std::find(filter.begin(), filter.end(), fs_name) != filter.end();
+  if (!allowed) {
+    // Parsing succeeded (r >= 0 here), so nothing has been written to
+    // `ss` yet by the unfiltered overload.
+    ss << "Invalid file system";
+    return -CEPHFS_ENOENT;
+  }
+
+  return r;
+}
+
+/**
+ * Parse `role_str` into `*role`.
+ *
+ * With a ':' the part before it is resolved through parse_filesystem() and
+ * the part after it is the rank.  Without a ':' the whole string is the
+ * rank and the filesystem is the one named by legacy_client_fscid.
+ *
+ * @param role_str  "<fs name>:<rank>", "<fs id>:<rank>" or "<rank>"
+ * @param role      receives {fscid, rank} on success
+ * @param ss        receives a human-readable reason on failure
+ * @return 0 on success; -CEPHFS_ENOENT if the filesystem cannot be
+ *         determined or the rank is not `in` for it; -CEPHFS_EINVAL if the
+ *         rank is negative or not a number
+ */
+int FSMap::parse_role(
+    std::string_view role_str,
+    mds_role_t *role,
+    std::ostream &ss) const
+{
+  Filesystem::const_ref fs;
+  size_t rank_pos = 0;
+
+  const size_t colon_pos = role_str.find(":");
+  if (colon_pos != std::string::npos) {
+    if (parse_filesystem(role_str.substr(0, colon_pos), &fs) < 0) {
+      ss << "Invalid filesystem";
+      return -CEPHFS_ENOENT;
+    }
+    rank_pos = colon_pos+1;
+  } else if (legacy_client_fscid != FS_CLUSTER_ID_NONE) {
+    // rank-only form: fall back to the legacy filesystem
+    fs = get_filesystem(legacy_client_fscid);
+  } else {
+    ss << "No filesystem selected";
+    return -CEPHFS_ENOENT;
+  }
+
+  std::string err;
+  const std::string rank_str(role_str.substr(rank_pos));
+  const long rank_i = strict_strtol(rank_str.c_str(), 10, &err);
+  if (rank_i < 0 || !err.empty()) {
+    ss << "Invalid rank '" << rank_str << "'";
+    return -CEPHFS_EINVAL;
+  }
+  const mds_rank_t rank = rank_i;
+
+  if (fs->mds_map.in.count(rank) == 0) {
+    ss << "Rank '" << rank << "' not found";
+    return -CEPHFS_ENOENT;
+  }
+
+  *role = {fs->fscid, rank};
+
+  return 0;
+}
+
+/**
+ * @return true if `poolid` is a data pool or the metadata pool of any
+ *         filesystem in this map
+ */
+bool FSMap::pool_in_use(int64_t poolid) const
+{
+  return std::any_of(filesystems.begin(), filesystems.end(),
+                     [poolid](const auto& entry) {
+                       const auto& mdsmap = entry.second->mds_map;
+                       return mdsmap.is_data_pool(poolid)
+                           || mdsmap.metadata_pool == poolid;
+                     });
+}
+
+/**
+ * Remove the filesystem `fscid` from the map.
+ *
+ * Afterwards every remaining daemon (standby or assigned to another
+ * filesystem) whose join_fscid pointed at the removed filesystem has it
+ * reset to FS_CLUSTER_ID_NONE.  The reset goes through modify_daemon().
+ */
+void FSMap::erase_filesystem(fs_cluster_id_t fscid)
+{
+  filesystems.erase(fscid);
+
+  // Shared mutation: the daemon no longer prefers any filesystem.
+  const auto forget_join = [](auto& daemon) {
+    daemon.join_fscid = FS_CLUSTER_ID_NONE;
+  };
+
+  for (auto& [gid, info] : standby_daemons) {
+    if (info.join_fscid == fscid) {
+      modify_daemon(gid, forget_join);
+    }
+  }
+
+  for (auto& [other_fscid, fs] : filesystems) {
+    for (auto& [gid, info] : fs->mds_map.get_mds_info()) {
+      if (info.join_fscid == fscid) {
+        modify_daemon(gid, forget_join);
+      }
+    }
+  }
+}
diff --git a/src/mds/FSMap.h b/src/mds/FSMap.h
new file mode 100644
index 000000000..f6c24e4d4
--- /dev/null
+++ b/src/mds/FSMap.h
@@ -0,0 +1,628 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software 
+ * Foundation.  See file COPYING.
+ * 
+ */
+
+
+#ifndef CEPH_FSMAP_H
+#define CEPH_FSMAP_H
+
+#include <map>
+#include <memory>
+#include <set>
+#include <string>
+#include <string_view>
+
+#include <errno.h>
+
+#include "include/types.h"
+#include "common/Clock.h"
+#include "mds/MDSMap.h"
+
+#include "include/CompatSet.h"
+#include "include/ceph_features.h"
+#include "include/common_fwd.h"
+#include "common/Formatter.h"
+#include "mds/mdstypes.h"
+
+#if __cplusplus <= 201703L
+/**
+ * Remove every element of the map `c` for which `pred` returns true.
+ * Only defined when building as C++17 or older.
+ *
+ * @param c     map to prune in place
+ * @param pred  called with each element (a key/value pair)
+ * @return number of elements removed
+ */
+template<class Key, class T, class Compare, class Alloc, class Pred>
+typename std::map<Key, T, Compare, Alloc>::size_type
+erase_if(std::map<Key, T, Compare, Alloc>& c, Pred pred) {
+  const auto size_before = c.size();
+  auto it = c.begin();
+  while (it != c.end()) {
+    // erase() hands back the iterator following the removed element
+    it = pred(*it) ? c.erase(it) : std::next(it);
+  }
+  return size_before - c.size();
+}
+#endif
+
+class health_check_map_t;
+
+/**
+ * A triple of names: client, cluster and file system.
+ * Used by Peer to describe its remote side.
+ */
+struct ClusterInfo {
+  ClusterInfo() = default;
+  ClusterInfo(std::string_view client_name, std::string_view cluster_name,
+              std::string_view fs_name)
+    : client_name(client_name), cluster_name(cluster_name), fs_name(fs_name) {}
+
+  std::string client_name;
+  std::string cluster_name;
+  std::string fs_name;
+
+  /// Equal only when all three names match.
+  bool operator==(const ClusterInfo &rhs) const {
+    if (client_name != rhs.client_name) {
+      return false;
+    }
+    if (cluster_name != rhs.cluster_name) {
+      return false;
+    }
+    return fs_name == rhs.fs_name;
+  }
+
+  void dump(ceph::Formatter *f) const;
+  void print(std::ostream& out) const;
+
+  void encode(ceph::buffer::list &bl) const;
+  void decode(ceph::buffer::list::const_iterator &iter);
+};
+
+/// Stream a ClusterInfo as "{client_name=..., cluster_name=..., fs_name=...}".
+inline std::ostream& operator<<(std::ostream& out, const ClusterInfo &cluster_info) {
+  return out << "{client_name=" << cluster_info.client_name
+             << ", cluster_name=" << cluster_info.cluster_name
+             << ", fs_name=" << cluster_info.fs_name << "}";
+}
+
+/**
+ * A mirroring peer: a uuid plus the ClusterInfo of the remote side.
+ *
+ * Identity and ordering use the uuid only; `remote` takes no part in
+ * operator== or operator<.
+ */
+struct Peer {
+  Peer() = default;
+  // Deliberately not explicit: MirrorInfo::peer_remove() passes a
+  // string_view where a Peer is expected.
+  Peer(std::string_view uuid) : uuid(uuid) {}
+  Peer(std::string_view uuid, const ClusterInfo &remote)
+    : uuid(uuid), remote(remote) {}
+
+  std::string uuid;
+  ClusterInfo remote;
+
+  bool operator==(const Peer &other) const {
+    return other.uuid == uuid;
+  }
+
+  bool operator<(const Peer &other) const {
+    return other.uuid > uuid;
+  }
+
+  void dump(ceph::Formatter *f) const;
+  void print(std::ostream& out) const;
+
+  void encode(ceph::buffer::list &bl) const;
+  void decode(ceph::buffer::list::const_iterator &iter);
+};
+
+/// Set of peers, ordered (and de-duplicated) by Peer::operator<, i.e. by uuid.
+using Peers = std::set<Peer>;
+
+/// Stream a Peer as "{uuid=..., remote_cluster=...}".
+inline std::ostream& operator<<(std::ostream& out, const Peer &peer) {
+  return out << "{uuid=" << peer.uuid
+             << ", remote_cluster=" << peer.remote << "}";
+}
+
+/**
+ * Mirroring state of a filesystem: an on/off flag and the set of peers.
+ */
+struct MirrorInfo {
+  MirrorInfo() = default;
+
+  bool is_mirrored() const { return mirrored; }
+
+  void enable_mirroring() { mirrored = true; }
+
+  /// Turns mirroring off and drops all peers.
+  void disable_mirroring() {
+    peers.clear();
+    mirrored = false;
+  }
+
+  /// @return true if a peer with this uuid is present
+  bool has_peer(std::string_view uuid) const {
+    return peers.count(Peer(uuid)) != 0;
+  }
+
+  /// @return true if some peer's remote matches all three names
+  bool has_peer(std::string_view client_name,
+                std::string_view cluster_name,
+                std::string_view fs_name) const {
+    const ClusterInfo wanted(client_name, cluster_name, fs_name);
+    for (const auto &candidate : peers) {
+      if (candidate.remote == wanted) {
+        return true;
+      }
+    }
+    return false;
+  }
+
+  bool has_peers() const { return !peers.empty(); }
+
+  /// Adds a peer; the set is left unchanged if the uuid is already present.
+  void peer_add(std::string_view uuid,
+                std::string_view client_name,
+                std::string_view cluster_name,
+                std::string_view fs_name) {
+    peers.emplace(uuid, ClusterInfo(client_name, cluster_name, fs_name));
+  }
+
+  /// Removes the peer with this uuid, if any.
+  void peer_remove(std::string_view uuid) {
+    peers.erase(Peer(uuid));
+  }
+
+  bool mirrored = false;
+  Peers peers;
+
+  void dump(ceph::Formatter *f) const;
+  void print(std::ostream& out) const;
+
+  void encode(ceph::buffer::list &bl) const;
+  void decode(ceph::buffer::list::const_iterator &iter);
+};
+
+/// Stream a MirrorInfo as "{peers=...}"; the mirrored flag is not printed.
+inline std::ostream& operator<<(std::ostream& out, const MirrorInfo &mirror_info) {
+  return out << "{peers=" << mirror_info.peers << "}";
+}
+
+WRITE_CLASS_ENCODER(ClusterInfo)
+WRITE_CLASS_ENCODER(Peer)
+WRITE_CLASS_ENCODER(MirrorInfo)
+
+/**
+ * The MDSMap and any additional fields describing a particular
+ * filesystem (a unique fs_cluster_id_t).
+ */
+class Filesystem
+{
+public:
+  using ref = std::shared_ptr<Filesystem>;
+  using const_ref = std::shared_ptr<Filesystem const>;
+
+  /// Build a shared Filesystem, forwarding `args` to its constructor.
+  template<typename... Args>
+  static ref create(Args&&... args)
+  {
+    return std::make_shared<Filesystem>(std::forward<Args>(args)...);
+  }
+
+  void encode(ceph::buffer::list& bl, uint64_t features) const;
+  void decode(ceph::buffer::list::const_iterator& p);
+
+  void dump(ceph::Formatter *f) const;
+  void print(std::ostream& out) const;
+
+  /**
+   * True when no MDS is "in" or, for a filesystem that does not allow
+   * standby replay, when at most one is.
+   */
+  bool is_upgradeable() const {
+    return mds_map.allows_standby_replay()
+      ? mds_map.get_num_in_mds() == 0
+      : mds_map.get_num_in_mds() <= 1;
+  }
+
+  /**
+   * Return true if a daemon is already assigned as
+   * STANDBY_REPLAY for the gid `who`
+   */
+  bool has_standby_replay(mds_gid_t who) const
+  {
+    return get_standby_replay(who) != MDS_GID_NONE;
+  }
+  mds_gid_t get_standby_replay(mds_gid_t who) const;
+
+  /// Return true if `who` is in this filesystem's map in STANDBY_REPLAY state.
+  bool is_standby_replay(mds_gid_t who) const
+  {
+    const auto it = mds_map.mds_info.find(who);
+    return it != mds_map.mds_info.end() &&
+           it->second.state == MDSMap::STATE_STANDBY_REPLAY;
+  }
+
+  fs_cluster_id_t fscid = FS_CLUSTER_ID_NONE;
+  MDSMap mds_map;
+  MirrorInfo mirror_info;
+};
+WRITE_CLASS_ENCODER_FEATURES(Filesystem)
+
+/**
+ * The cluster-wide map of CephFS filesystems and MDS daemons.
+ *
+ * Holds one Filesystem (with its MDSMap) per fscid, the standby daemons
+ * that are not assigned to any filesystem, and `mds_roles`, which records
+ * for every known daemon gid where its info is stored.
+ */
+class FSMap {
+public:
+  friend class MDSMonitor;
+  friend class PaxosFSMap;
+  using mds_info_t = MDSMap::mds_info_t;
+
+  static const version_t STRUCT_VERSION = 7;
+  static const version_t STRUCT_VERSION_TRIM_TO = 7;
+
+  FSMap() : default_compat(MDSMap::get_compat_set_default()) {}
+
+  /**
+   * Deep copy: every Filesystem is duplicated, so the copy does not share
+   * Filesystem objects with `rhs`.
+   */
+  FSMap(const FSMap &rhs)
+    :
+      epoch(rhs.epoch),
+      next_filesystem_id(rhs.next_filesystem_id),
+      legacy_client_fscid(rhs.legacy_client_fscid),
+      default_compat(rhs.default_compat),
+      enable_multiple(rhs.enable_multiple),
+      ever_enabled_multiple(rhs.ever_enabled_multiple),
+      mds_roles(rhs.mds_roles),
+      standby_daemons(rhs.standby_daemons),
+      standby_epochs(rhs.standby_epochs),
+      struct_version(rhs.struct_version)
+  {
+    // `filesystems` starts out empty here, so there is nothing to clear.
+    for (const auto& [fscid, fs] : rhs.filesystems) {
+      filesystems[fs->fscid] = std::make_shared<Filesystem>(*fs);
+    }
+  }
+
+  FSMap &operator=(const FSMap &rhs);
+
+  const CompatSet &get_default_compat() const {return default_compat;}
+
+  /**
+   * Restrict the map to the filesystems named in `allowed`.
+   *
+   * An empty `allowed` leaves the map untouched.  Otherwise filesystems
+   * whose name is not listed are removed, and so is every mds_roles entry
+   * whose fs_name_from_gid() is not listed.  fs_name_from_gid() yields an
+   * empty name for standbys, so their roles are dropped too unless ""
+   * is listed.  standby_daemons and standby_epochs are not modified here.
+   */
+  void filter(const std::vector<string>& allowed)
+  {
+    if (allowed.empty()) {
+      return;
+    }
+
+    erase_if(filesystems, [&](const auto& f) {
+      return std::find(allowed.begin(), allowed.end(), f.second->mds_map.get_fs_name()) == allowed.end();
+    });
+
+    erase_if(mds_roles, [&](const auto& r) {
+      return std::find(allowed.begin(), allowed.end(), fs_name_from_gid(r.first)) == allowed.end();
+    });
+  }
+
+  /**
+   * Set enable_multiple.  Enabling it also latches ever_enabled_multiple,
+   * which is never reset by this function.
+   */
+  void set_enable_multiple(const bool v)
+  {
+    enable_multiple = v;
+    if (v) {
+      ever_enabled_multiple = true;
+    }
+  }
+
+  bool get_enable_multiple() const
+  {
+    return enable_multiple;
+  }
+
+  /// `fscid` must be FS_CLUSTER_ID_NONE or name an existing filesystem.
+  void set_legacy_client_fscid(fs_cluster_id_t fscid)
+  {
+    ceph_assert(fscid == FS_CLUSTER_ID_NONE || filesystems.count(fscid));
+    legacy_client_fscid = fscid;
+  }
+
+  fs_cluster_id_t get_legacy_client_fscid() const
+  {
+    return legacy_client_fscid;
+  }
+
+  size_t get_num_standby() const {
+    return standby_daemons.size();
+  }
+
+  bool is_any_degraded() const;
+
+  /**
+   * Get state of all daemons (for all filesystems, including all standbys)
+   */
+  std::map<mds_gid_t, mds_info_t> get_mds_info() const;
+
+  const mds_info_t* get_available_standby(const Filesystem& fs) const;
+
+  /**
+   * Resolve daemon name to GID
+   */
+  mds_gid_t find_mds_gid_by_name(std::string_view s) const;
+
+  /**
+   * Resolve daemon name to status
+   */
+  const mds_info_t* find_by_name(std::string_view name) const;
+
+  /**
+   * Does a daemon exist with this GID?
+   *
+   * When `in` is non-empty the daemon must additionally belong to a
+   * filesystem whose name is listed in `in`.
+   */
+  bool gid_exists(mds_gid_t gid,
+                  const std::vector<string>& in = {}) const
+  {
+    try {
+      string_view m = fs_name_from_gid(gid);
+      return in.empty() || std::find(in.begin(), in.end(), m) != in.end();
+    } catch (const std::out_of_range&) {
+      // fs_name_from_gid() uses mds_roles.at(): unknown gid
+      return false;
+    }
+  }
+
+  /**
+   * Does a daemon with this GID exist, *and* have an MDS rank assigned?
+   */
+  bool gid_has_rank(mds_gid_t gid) const
+  {
+    return gid_exists(gid) && mds_roles.at(gid) != FS_CLUSTER_ID_NONE;
+  }
+
+  /**
+   * Which filesystem owns this GID?
+   */
+  fs_cluster_id_t fscid_from_gid(mds_gid_t gid) const {
+    if (!gid_exists(gid)) {
+      return FS_CLUSTER_ID_NONE;
+    }
+    return mds_roles.at(gid);
+  }
+
+  /**
+   * Insert a new MDS daemon, as a standby
+   */
+  void insert(const MDSMap::mds_info_t &new_info);
+
+  /**
+   * Assign an MDS cluster standby replay rank to a standby daemon
+   */
+  void assign_standby_replay(
+      const mds_gid_t standby_gid,
+      const fs_cluster_id_t leader_ns,
+      const mds_rank_t leader_rank);
+
+  /**
+   * Assign an MDS cluster rank to a standby daemon
+   */
+  void promote(
+      mds_gid_t standby_gid,
+      Filesystem& filesystem,
+      mds_rank_t assigned_rank);
+
+  /**
+   * A daemon reports that it is STATE_STOPPED: remove it,
+   * and the rank it held.
+   *
+   * @returns a list of any additional GIDs that were removed from the map
+   * as a side effect (like standby replays)
+   */
+  std::vector<mds_gid_t> stop(mds_gid_t who);
+
+  /**
+   * The rank held by 'who', if any, is to be relinquished, and
+   * the state for the daemon GID is to be forgotten.
+   */
+  void erase(mds_gid_t who, epoch_t blocklist_epoch);
+
+  /**
+   * Update to indicate that the rank held by 'who' is damaged
+   */
+  void damaged(mds_gid_t who, epoch_t blocklist_epoch);
+
+  /**
+   * Update to indicate that the rank `rank` is to be removed
+   * from the damaged list of the filesystem `fscid`
+   */
+  bool undamaged(const fs_cluster_id_t fscid, const mds_rank_t rank);
+
+  /**
+   * Initialize a Filesystem and assign a fscid.  Update legacy_client_fscid
+   * to point to the new filesystem if it's the only one.
+   *
+   * Caller must already have validated all arguments vs. the existing
+   * FSMap and OSDMap contents.
+   */
+  Filesystem::ref create_filesystem(
+      std::string_view name, int64_t metadata_pool,
+      int64_t data_pool, uint64_t features,
+      fs_cluster_id_t fscid, bool recover);
+
+  /**
+   * Remove the filesystem (it must exist).  Caller should already
+   * have failed out any MDSs that were assigned to the filesystem.
+   */
+  void erase_filesystem(fs_cluster_id_t fscid);
+
+  /**
+   * Reset all the state information (not configuration information)
+   * in a particular filesystem.  Caller must have verified that
+   * the filesystem already exists.
+   */
+  void reset_filesystem(fs_cluster_id_t fscid);
+
+  /**
+   * Mutator helper for Filesystem objects: expose a non-const
+   * Filesystem pointer to `fn` and update epochs appropriately.
+   */
+  template<typename T>
+  void modify_filesystem(fs_cluster_id_t fscid, T&& fn)
+  {
+    auto& fs = filesystems.at(fscid);
+    fn(fs);
+    fs->mds_map.epoch = epoch;
+  }
+
+  /**
+   * Apply a mutation to the mds_info_t structure for a particular
+   * daemon (identified by GID), and make appropriate updates to epochs.
+   *
+   * For a standby, `fn` must leave the daemon in STATE_STANDBY.
+   */
+  template<typename T>
+  void modify_daemon(mds_gid_t who, T&& fn)
+  {
+    const auto& fscid = mds_roles.at(who);
+    if (fscid == FS_CLUSTER_ID_NONE) {
+      auto& info = standby_daemons.at(who);
+      fn(info);
+      ceph_assert(info.state == MDSMap::STATE_STANDBY);
+      standby_epochs[who] = epoch;
+    } else {
+      auto& fs = filesystems.at(fscid);
+      auto& info = fs->mds_map.mds_info.at(who);
+      fn(info);
+      fs->mds_map.epoch = epoch;
+    }
+  }
+
+  /**
+   * Given that gid exists in a filesystem or as a standby, return
+   * a reference to its info.
+   */
+  const mds_info_t& get_info_gid(mds_gid_t gid) const
+  {
+    auto fscid = mds_roles.at(gid);
+    if (fscid == FS_CLUSTER_ID_NONE) {
+      return standby_daemons.at(gid);
+    } else {
+      return filesystems.at(fscid)->mds_map.mds_info.at(gid);
+    }
+  }
+
+  /**
+   * Name of the filesystem that owns `gid`.  Returns an empty view for a
+   * standby or when the recorded filesystem no longer exists.  Throws
+   * std::out_of_range (from mds_roles.at) for an unknown gid.
+   */
+  std::string_view fs_name_from_gid(mds_gid_t gid) const
+  {
+    auto fscid = mds_roles.at(gid);
+    if (fscid == FS_CLUSTER_ID_NONE or !filesystem_exists(fscid)) {
+      return std::string_view();
+    } else {
+      return get_filesystem(fscid)->mds_map.get_fs_name();
+    }
+  }
+
+  bool is_standby_replay(mds_gid_t who) const
+  {
+    return filesystems.at(mds_roles.at(who))->is_standby_replay(who);
+  }
+
+  mds_gid_t get_standby_replay(mds_gid_t who) const
+  {
+    return filesystems.at(mds_roles.at(who))->get_standby_replay(who);
+  }
+
+  /// The filesystem named by legacy_client_fscid, or nullptr if unset.
+  Filesystem::const_ref get_legacy_filesystem() const
+  {
+    if (legacy_client_fscid == FS_CLUSTER_ID_NONE) {
+      return nullptr;
+    } else {
+      return filesystems.at(legacy_client_fscid);
+    }
+  }
+
+  /**
+   * A daemon has informed us of its offload targets
+   */
+  void update_export_targets(mds_gid_t who, const std::set<mds_rank_t> &targets)
+  {
+    auto fscid = mds_roles.at(who);
+    modify_filesystem(fscid, [who, &targets](auto&& fs) {
+      fs->mds_map.mds_info.at(who).export_targets = targets;
+    });
+  }
+
+  epoch_t get_epoch() const { return epoch; }
+  void inc_epoch() { epoch++; }
+
+  /// Struct version seen by the last decode() (0 if never decoded).
+  version_t get_struct_version() const { return struct_version; }
+  bool is_struct_old() const {
+    return struct_version < STRUCT_VERSION_TRIM_TO;
+  }
+
+  size_t filesystem_count() const {return filesystems.size();}
+  bool filesystem_exists(fs_cluster_id_t fscid) const {return filesystems.count(fscid) > 0;}
+  Filesystem::const_ref get_filesystem(fs_cluster_id_t fscid) const {return std::const_pointer_cast<const Filesystem>(filesystems.at(fscid));}
+  Filesystem::ref get_filesystem(fs_cluster_id_t fscid) {return filesystems.at(fscid);}
+  Filesystem::ref get_filesystem(mds_gid_t gid) {
+    return filesystems.at(mds_roles.at(gid));
+  }
+  /// First filesystem in fscid order.  Dereferences filesystems.begin(),
+  /// so the map must not be empty.
+  Filesystem::const_ref get_filesystem(void) const {return std::const_pointer_cast<const Filesystem>(filesystems.begin()->second);}
+  Filesystem::const_ref get_filesystem(std::string_view name) const;
+  Filesystem::const_ref get_filesystem(mds_gid_t gid) const {
+    return filesystems.at(mds_roles.at(gid));
+  }
+
+  std::vector<Filesystem::const_ref> get_filesystems(void) const;
+
+  int parse_filesystem(
+      std::string_view ns_str,
+      Filesystem::const_ref *result
+      ) const;
+
+  int parse_role(
+      std::string_view role_str,
+      mds_role_t *role,
+      std::ostream &ss,
+      const std::vector<string> &filter) const;
+
+  int parse_role(
+      std::string_view role_str,
+      mds_role_t *role,
+      std::ostream &ss) const;
+
+  /**
+   * Return true if this pool is in use by any of the filesystems
+   */
+  bool pool_in_use(int64_t poolid) const;
+
+  const mds_info_t* find_replacement_for(mds_role_t role) const;
+
+  void get_health(std::list<std::pair<health_status_t,std::string> >& summary,
+                  std::list<std::pair<health_status_t,std::string> > *detail) const;
+
+  void get_health_checks(health_check_map_t *checks) const;
+
+  bool check_health(void);
+
+  /**
+   * Assert that the FSMap, Filesystem, MDSMap, mds_info_t relations are
+   * all self-consistent.
+   */
+  void sanity(bool pending=false) const;
+
+  void encode(ceph::buffer::list& bl, uint64_t features) const;
+  void decode(ceph::buffer::list::const_iterator& p);
+  /// Convenience overload: decode from the start of `bl`.
+  void decode(ceph::buffer::list& bl) {
+    auto p = bl.cbegin();
+    decode(p);
+  }
+  void sanitize(const std::function<bool(int64_t pool)>& pool_exists);
+
+  void print(std::ostream& out) const;
+  void print_summary(ceph::Formatter *f, std::ostream *out) const;
+  void print_daemon_summary(std::ostream& out) const;
+  void print_fs_summary(std::ostream& out) const;
+
+  void dump(ceph::Formatter *f) const;
+  static void generate_test_instances(std::list<FSMap*>& ls);
+
+protected:
+  epoch_t epoch = 0;
+  uint64_t next_filesystem_id = FS_CLUSTER_ID_ANONYMOUS + 1;
+  fs_cluster_id_t legacy_client_fscid = FS_CLUSTER_ID_NONE;
+  CompatSet default_compat;
+  bool enable_multiple = true;
+  bool ever_enabled_multiple = true; // < the cluster had multiple FS enabled once
+
+  std::map<fs_cluster_id_t, Filesystem::ref> filesystems;
+
+  // Remember which Filesystem an MDS daemon's info is stored in
+  // (or in standby_daemons for FS_CLUSTER_ID_NONE)
+  std::map<mds_gid_t, fs_cluster_id_t> mds_roles;
+
+  // For MDS daemons not yet assigned to a Filesystem
+  std::map<mds_gid_t, mds_info_t> standby_daemons;
+  std::map<mds_gid_t, epoch_t> standby_epochs;
+
+private:
+  // Struct version of the encoding this map was last decoded from.  Typed
+  // as version_t to match STRUCT_VERSION and get_struct_version().
+  version_t struct_version = 0;
+};
+WRITE_CLASS_ENCODER_FEATURES(FSMap)
+
+/// Stream an FSMap via print_summary(), with no Formatter.
+inline std::ostream& operator<<(std::ostream& out, const FSMap& m) {
+  m.print_summary(nullptr, &out);
+  return out;
+}
+
+#endif
diff --git a/src/mds/FSMapUser.cc b/src/mds/FSMapUser.cc
new file mode 100644
index 000000000..63a58acc8
--- /dev/null
+++ b/src/mds/FSMapUser.cc
@@ -0,0 +1,81 @@
+#include "FSMapUser.h"
+
+// Encoded fields, in order: epoch, legacy_client_fscid, then all fs_info_t values.
+void FSMapUser::encode(ceph::buffer::list& bl, uint64_t features) const {
+  ENCODE_START(1, 1, bl);
+  encode(epoch, bl);
+  encode(legacy_client_fscid, bl);
+  std::vector<fs_info_t> infos;  // map values, in map-iteration order
+  for (const auto& entry : filesystems)
+    infos.push_back(entry.second);
+  encode(infos, bl, features);
+  ENCODE_FINISH(bl);
+}
+
+// Inverse of encode(): rebuilds `filesystems`, keyed by each decoded entry's cid.
+void FSMapUser::decode(ceph::buffer::list::const_iterator& p) {
+  DECODE_START(1, p);
+  decode(epoch, p);
+  decode(legacy_client_fscid, p);
+  std::vector<fs_info_t> infos;
+  decode(infos, p);
+  filesystems.clear();  // previous contents are discarded before repopulating
+  for (const auto& info : infos)
+    filesystems[info.cid] = info;
+  DECODE_FINISH(p);
+}
+
+void FSMapUser::fs_info_t::encode(ceph::buffer::list& bl, uint64_t features) const
+{
+  ENCODE_START(1, 1, bl);  // `features` is not passed to any call in this body
+  encode(cid, bl);  // field order: cid, then name -- fs_info_t::decode() reads them back the same way
+  encode(name, bl);
+  ENCODE_FINISH(bl);
+}
+
+void FSMapUser::fs_info_t::decode(ceph::buffer::list::const_iterator& p)
+{
+  DECODE_START(1, p);
+  decode(cid, p);  // field order: cid, then name -- mirrors fs_info_t::encode()
+  decode(name, p);
+  DECODE_FINISH(p);
+}
+
+// Appends one heap-allocated sample (epoch 2, two named filesystems) to `ls`.
+void FSMapUser::generate_test_instances(std::list<FSMapUser*>& ls) {
+  auto *sample = new FSMapUser();
+  sample->epoch = 2;
+  sample->legacy_client_fscid = 1;
+  fs_info_t& fs1 = sample->filesystems[1];
+  fs1.cid = 1;  fs1.name = "cephfs1";
+  fs_info_t& fs2 = sample->filesystems[2];
+  fs2.cid = 2;  fs2.name = "cephfs2";
+  ls.push_back(sample);
+}
+
+
+// Text dump: epoch line, legacy fscid line, then one " id N name S" line per filesystem.
+void FSMapUser::print(std::ostream& out) const {
+  out << "e" << epoch << std::endl
+      << "legacy_client_fscid: " << legacy_client_fscid << std::endl;
+  for (const auto& entry : filesystems)
+    out << " id " << entry.second.cid << " name " << entry.second.name << std::endl;
+}
+
+// Summarize the map.  With a Formatter, dump "epoch" and then an "id"/"name"
+// pair per filesystem through `f`; otherwise write "e<epoch>:" followed by
+// " name(id)" per filesystem to `*out`, which must be non-null in that case.
+void FSMapUser::print_summary(ceph::Formatter *f, std::ostream *out)
+{
+  if (f) {
+    f->dump_unsigned("epoch", get_epoch());
+    for (auto &p : filesystems) {
+      f->dump_unsigned("id", p.second.cid);
+      f->dump_string("name", p.second.name);
+    }
+  } else {
+    *out << "e" << get_epoch() << ":";
+    for (auto &p : filesystems)
+      *out << " " << p.second.name << "(" << p.second.cid << ")";
+  }
+}
diff --git a/src/mds/FSMapUser.h b/src/mds/FSMapUser.h
new file mode 100644
index 000000000..a0be8e714
--- /dev/null
+++ b/src/mds/FSMapUser.h
@@ -0,0 +1,64 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ */
+
+#ifndef CEPH_FSMAPCOMPACT_H
+#define CEPH_FSMAPCOMPACT_H
+
+#include <map>
+#include <string>
+#include <string_view>
+
+#include "mds/mdstypes.h"
+
+class FSMapUser {
+public:
+  struct fs_info_t {  // one filesystem: its name and cluster id
+    fs_info_t() {}
+    void encode(ceph::buffer::list& bl, uint64_t features) const;
+    void decode(ceph::buffer::list::const_iterator &bl);
+    std::string name;
+    fs_cluster_id_t cid = FS_CLUSTER_ID_NONE;
+  };
+
+  FSMapUser() {}
+
+  epoch_t get_epoch() const { return epoch; }
+
+  // Key of the first entry whose name equals `name`; FS_CLUSTER_ID_NONE if absent.
+  fs_cluster_id_t get_fs_cid(std::string_view name) const {
+    for (const auto& [fscid, info] : filesystems)
+      if (info.name == name)
+	return fscid;
+    return FS_CLUSTER_ID_NONE;
+  }
+
+  void encode(ceph::buffer::list& bl, uint64_t features) const;
+  void decode(ceph::buffer::list::const_iterator& bl);
+
+  void print(std::ostream& out) const;
+  void print_summary(ceph::Formatter *f, std::ostream *out);
+
+  static void generate_test_instances(std::list<FSMapUser*>& ls);
+
+  std::map<fs_cluster_id_t, fs_info_t> filesystems;
+  fs_cluster_id_t legacy_client_fscid = FS_CLUSTER_ID_NONE;
+  epoch_t epoch = 0;
+};
+WRITE_CLASS_ENCODER_FEATURES(FSMapUser::fs_info_t)
+WRITE_CLASS_ENCODER_FEATURES(FSMapUser)
+
+// Stream insertion: writes the plain-text print_summary() form (no Formatter).
+inline std::ostream& operator<<(std::ostream& out, FSMapUser& m) {
+  m.print_summary(nullptr, &out); return out;
+}
+#endif
diff --git a/src/mds/InoTable.cc b/src/mds/InoTable.cc
new file mode 100644
index 000000000..87d7f5959
--- /dev/null
+++ b/src/mds/InoTable.cc
@@ -0,0 +1,235 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software 
+ * Foundation.  See file COPYING.
+ * 
+ */
+
+#include "InoTable.h"
+#include "MDSRank.h"
+
+#include "include/types.h"
+
+#include "common/config.h"
+
+#define dout_context g_ceph_context
+#define dout_subsys ceph_subsys_mds
+#undef dout_prefix
+#define dout_prefix *_dout << "mds." << rank << "." << table_name << ": "
+
+// Rebuild the table from scratch: `free` becomes the single range of 2^40 ids
+// that starts at (rank + 1) << 40, and projected_free is reset to match.
+// The original author marked this fixed per-rank range with a FIXME and left a
+// disabled variant for 32-bit builds that shifted by 25 bits instead.
+void InoTable::reset_state()
+{
+  constexpr unsigned range_shift = 40;
+  const uint64_t range_len = uint64_t{1} << range_shift;
+  const uint64_t range_begin = static_cast<uint64_t>(rank + 1) << range_shift;
+
+  // drop whatever was there, then install the fresh range
+  free.clear();
+  free.insert(range_begin, range_len);
+
+  projected_free = free;
+}
+
+// Reserve one id in the projected set and bump projected_version.
+// An id of 0 means "choose for me": the start of projected_free is used.
+inodeno_t InoTable::project_alloc_id(inodeno_t id) {
+  dout(10) << "project_alloc_id " << id << " to " << projected_free << "/" << free << dendl;
+  ceph_assert(is_active());
+  const inodeno_t chosen = !id ? projected_free.range_start() : id;
+  projected_free.erase(chosen);
+  ++projected_version;
+  return chosen;
+}
+void InoTable::apply_alloc_id(inodeno_t id)
+{
+  dout(10) << "apply_alloc_id " << id << " to " << projected_free << "/" << free << dendl;
+  free.erase(id);  // committed set only; projected_free is not touched here
+  ++version;  // advances version, not projected_version
+}
+
+// Move `want` ids from projected_free into `ids`, taking them range by range
+// from the front of the projected set, then bump projected_version once.
+void InoTable::project_alloc_ids(interval_set<inodeno_t>& ids, int want) {
+  ceph_assert(is_active());
+  for (int remaining = want; remaining > 0; ) {
+    const inodeno_t first = projected_free.range_start();
+    const inodeno_t avail = projected_free.end_after(first) - first;
+    // never take more from this range than is still needed
+    const inodeno_t take = (avail > (inodeno_t)remaining) ? inodeno_t(remaining) : avail;
+    projected_free.erase(first, take);
+    ids.insert(first, take);
+    remaining -= take;
+  }
+  dout(10) << "project_alloc_ids " << ids << " to " << projected_free << "/" << free << dendl;
+  ++projected_version;
+}
+void InoTable::apply_alloc_ids(interval_set<inodeno_t>& ids)
+{
+  dout(10) << "apply_alloc_ids " << ids << " to " << projected_free << "/" << free << dendl;
+  free.subtract(ids);  // committed set only; projected_free is not touched here
+  ++version;  // advances version, not projected_version
+}
+
+
+void InoTable::project_release_ids(const interval_set<inodeno_t>& ids) 
+{
+  dout(10) << "project_release_ids " << ids << " to " << projected_free << "/" << free << dendl;
+  projected_free.insert(ids);  // projected set only; `free` is not touched here
+  ++projected_version;  // advances projected_version, not version
+}
+void InoTable::apply_release_ids(const interval_set<inodeno_t>& ids) 
+{
+  dout(10) << "apply_release_ids " << ids << " to " << projected_free << "/" << free << dendl;
+  free.insert(ids);  // committed set only; projected_free is not touched here
+  ++version;  // advances version, not projected_version
+}
+
+
+//
+
+// Journal replay of a single allocation.  If `id` is not currently in free the
+// mismatch is reported to the cluster log and both sets are left alone; either
+// way version advances and projected_version is set equal to it.
+void InoTable::replay_alloc_id(inodeno_t id) {
+  ceph_assert(mds);  // Only usable in online mode
+  dout(10) << "replay_alloc_id " << id << dendl;
+  if (!free.contains(id)) {
+    mds->clog->error() << "journal replay alloc " << id << " not in free " << free;
+  } else {
+    free.erase(id);
+    projected_free.erase(id);
+  }
+  projected_version = ++version;
+}
+// Journal replay of a batch allocation.  Only the part of `ids` that is still
+// in free is removed (from both sets); any shortfall is reported to the
+// cluster log.  version advances and projected_version is set equal to it.
+void InoTable::replay_alloc_ids(interval_set<inodeno_t>& ids) {
+  ceph_assert(mds);  // Only usable in online mode
+  dout(10) << "replay_alloc_ids " << ids << dendl;
+  interval_set<inodeno_t> still_free;
+  still_free.intersection_of(free, ids);
+  const bool all_present = (still_free == ids);
+  if (!all_present)
+    mds->clog->error() << "journal replay alloc " << ids << ", only "
+		       << still_free << " is in free " << free;
+  free.subtract(still_free);
+  projected_free.subtract(still_free);
+  projected_version = ++version;
+}
+void InoTable::replay_release_ids(interval_set<inodeno_t>& ids) 
+{
+  dout(10) << "replay_release_ids " << ids << dendl;
+  free.insert(ids);  // replay returns the ids to both sets in one step
+  projected_free.insert(ids);
+  projected_version = ++version;  // bump version and set projected_version equal to it
+}
+
+
+void InoTable::replay_reset()
+{
+  dout(10) << "replay_reset " << free << dendl;
+  skip_inos(inodeno_t(10000000));  // a lot! (skip_inos itself resyncs projected_free and bumps version)
+  projected_free = free;
+  projected_version = ++version;  // second bump: skip_inos() above already advanced version once
+}
+
+
+// Discard the window of `i` ids beginning at the first free id: whatever part of
+// that window is free is removed, then the projection and versions are resynced.
+void InoTable::skip_inos(inodeno_t i) {
+  dout(10) << "skip_inos was " << free << dendl;
+  interval_set<inodeno_t> window;
+  window.insert(free.range_start(), i);
+  window.intersection_of(free);  // keep only the ids that are actually free
+  free.subtract(window);
+  projected_free = free;
+  projected_version = ++version;
+  dout(10) << "skip_inos now " << free << dendl;
+}
+
+// Emit an "inotable" object holding two arrays, "projected_free" followed by
+// "free".  Each array element is a "range" object with "start" and "len"
+// fields taken from one interval of the corresponding set.
+// Both arrays are produced by the same local helper.
+void InoTable::dump(Formatter *f) const
+{
+  // Writes one array section of {start, len} objects for the given set.
+  const auto dump_ranges = [f](const char *section,
+			       const interval_set<inodeno_t>& ranges) {
+    f->open_array_section(section);
+    for (auto it = ranges.begin(); it != ranges.end(); ++it) {
+      f->open_object_section("range");
+      f->dump_int("start", (*it).first);
+      f->dump_int("len", (*it).second);
+      f->close_section();
+    }
+    f->close_section();
+  };
+
+  f->open_object_section("inotable");
+  dump_ranges("projected_free", projected_free);
+  dump_ranges("free", free);
+  f->close_section();
+}
+
+
+void InoTable::generate_test_instances(std::list<InoTable*>& ls)
+{
+  ls.push_back(new InoTable());  // one default-constructed table; the raw pointer is handed to the list
+}
+
+
+bool InoTable::is_marked_free(inodeno_t id) const
+{
+  return free.contains(id) || projected_free.contains(id);  // free in either set counts
+}
+
+// Does `other` overlap the committed free set?  When `intersection` is
+// non-null it receives a copy of the overlapping ranges.
+bool InoTable::intersects_free(const interval_set<inodeno_t> &other,
+			       interval_set<inodeno_t> *intersection) {
+  interval_set<inodeno_t> overlap;
+  overlap.intersection_of(free, other);
+  const bool has_overlap = !overlap.empty();
+  if (intersection)
+    *intersection = overlap;
+  return has_overlap;
+}
+
+// Mark `id` as used in both sets.  Refused (returns false) while projected and
+// committed versions differ; otherwise `id` is asserted to be marked free.
+bool InoTable::repair(inodeno_t id) {
+  const bool updates_in_flight = (projected_version != version);
+  if (updates_in_flight)
+    return false;  // Can't do the repair while other things are in flight
+
+  ceph_assert(is_marked_free(id));
+  dout(10) << "repair: before status. ino = " << id << " pver =" << projected_version << " ver= " << version << dendl;
+  free.erase(id);
+  projected_free.erase(id);
+  projected_version = ++version;
+  dout(10) << "repair: after status. ino = " << id << " pver =" << projected_version << " ver= " << version << dendl;
+  return true;
+}
+
+// Skip from the first free id through `ino`; false (no change) if that id is already above `ino`.
+bool InoTable::force_consume_to(inodeno_t ino) {
+  const inodeno_t lowest_free = free.range_start();
+  if (lowest_free <= ino) {
+    skip_inos(inodeno_t(ino + 1 - lowest_free));
+    return true;
+  }
+  return false;
+}
diff --git a/src/mds/InoTable.h b/src/mds/InoTable.h
new file mode 100644
index 000000000..d5e0f4d94
--- /dev/null
+++ b/src/mds/InoTable.h
@@ -0,0 +1,104 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software 
+ * Foundation.  See file COPYING.
+ * 
+ */
+
+
+#ifndef CEPH_INOTABLE_H
+#define CEPH_INOTABLE_H
+
+#include "MDSTable.h"
+#include "include/interval_set.h"
+
+class MDSRank;
+
+class InoTable : public MDSTable {
+ public:
+  explicit InoTable(MDSRank *m) : MDSTable(m, "inotable", true) {}
+  InoTable() : MDSTable(NULL, "inotable", true) {}  // no MDSRank supplied; presumably for dencoder/tool use -- confirm
+
+  inodeno_t project_alloc_id(inodeno_t id=0);  // id == 0: take the first projected-free id
+  void apply_alloc_id(inodeno_t id);
+
+  void project_alloc_ids(interval_set<inodeno_t>& inos, int want);  // adds `want` ids to `inos`
+  void apply_alloc_ids(interval_set<inodeno_t>& inos);
+
+  void project_release_ids(const interval_set<inodeno_t>& inos);
+  void apply_release_ids(const interval_set<inodeno_t>& inos);
+
+  void replay_alloc_id(inodeno_t ino);  // both replay_alloc_* functions assert that mds is set
+  void replay_alloc_ids(interval_set<inodeno_t>& inos);
+  void replay_release_ids(interval_set<inodeno_t>& inos);
+  void replay_reset();
+  bool repair(inodeno_t id);  // returns false when projected and committed versions differ
+  bool is_marked_free(inodeno_t id) const;  // free in either the committed or the projected set
+  bool intersects_free(
+      const interval_set<inodeno_t> &other,
+      interval_set<inodeno_t> *intersection);  // `intersection` may be null
+
+  void reset_state() override;
+  void encode_state(bufferlist& bl) const override {
+    ENCODE_START(2, 2, bl);
+    encode(free, bl);  // only the committed set is written
+    ENCODE_FINISH(bl);
+  }
+  void decode_state(bufferlist::const_iterator& bl) override {
+    DECODE_START_LEGACY_COMPAT_LEN(2, 2, 2, bl);
+    decode(free, bl);
+    projected_free = free;  // projection restarts equal to the decoded committed set
+    DECODE_FINISH(bl);
+  }
+
+  // To permit enc/decoding in isolation in dencoder
+  void encode(bufferlist& bl) const {
+    encode_state(bl);
+  }
+  void decode(bufferlist::const_iterator& bl) {
+    decode_state(bl);
+  }
+  void dump(Formatter *f) const;
+  static void generate_test_instances(std::list<InoTable*>& ls);
+
+  void skip_inos(inodeno_t i);  // drops up to `i` ids starting at the first free id
+
+  /**
+   * If the specified inode is marked as free, mark it as used.
+   * For use in tools, not normal operations.
+   *
+   * @returns true if the inode was previously marked as free
+   */
+  bool force_consume(inodeno_t ino)
+  {
+    if (free.contains(ino)) {
+      free.erase(ino);  // note: projected_free and version are left unchanged here
+      return true;
+    } else {
+      return false;
+    }
+  }
+
+  /**
+   * If this ino is in this rank's range, consume up to and including it.
+   * For use in tools, when we know the max ino in use and want to make
+   * sure we're only allocating new inodes from above it.
+   *
+   * @return true if the table was modified
+   */
+  bool force_consume_to(inodeno_t ino);
+
+ private:
+  interval_set<inodeno_t> free;   // unused ids
+  interval_set<inodeno_t> projected_free;  // `free` plus the effect of project_* calls not yet applied
+};
+WRITE_CLASS_ENCODER(InoTable)
+
+#endif
diff --git a/src/mds/JournalPointer.cc b/src/mds/JournalPointer.cc
new file mode 100644
index 000000000..8a446a20f
--- /dev/null
+++ b/src/mds/JournalPointer.cc
@@ -0,0 +1,122 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software 
+ * Foundation.  See file COPYING.
+ * 
+ */
+
+
+#include "common/debug.h"
+#include "common/errno.h"
+#include "common/Cond.h"
+#include "osdc/Objecter.h"
+#include "mds/mdstypes.h"
+#include "msg/Messenger.h"
+
+#include "mds/JournalPointer.h"
+
+
+#define dout_context g_ceph_context
+#define dout_subsys ceph_subsys_journaler
+#undef dout_prefix
+#define dout_prefix *_dout << objecter->messenger->get_myname() << ".journalpointer "
+
+
+// Object name: hex of (MDS_INO_LOG_POINTER_OFFSET + node_id), then ".00000000".
+std::string JournalPointer::get_object_id() const {
+  const inodeno_t pointer_ino = MDS_INO_LOG_POINTER_OFFSET + node_id;
+  char name[32];
+  snprintf(name, sizeof(name), "%llx.%08llx",
+	   static_cast<unsigned long long>(pointer_ino), 0ULL);
+  return name;
+}
+
+
+/**
+ * Blocking read of this MDS's JournalPointer object.
+ * @return 0 on success, the read's result if that is non-zero, or
+ *         -CEPHFS_EINVAL when the object's contents fail to decode
+ */
+int JournalPointer::load(Objecter *objecter)
+{
+  ceph_assert(objecter != NULL);
+
+  const std::string oid = get_object_id();
+  dout(4) << "Reading journal pointer '" << oid << "'" << dendl;
+  bufferlist payload;
+  C_SaferCond done;
+  objecter->read_full(object_t(oid), object_locator_t(pool_id),
+		      CEPH_NOSNAP, &payload, 0, &done);
+  const int r = done.wait();  // blocks until the read completes
+  if (r != 0) {
+    dout(1) << "Journal pointer '" << oid << "' read failed: " << cpp_strerror(r) << dendl;
+    return r;
+  }
+
+  auto it = payload.cbegin();
+  try {
+    decode(it);
+  } catch (const buffer::error &) {
+    return -CEPHFS_EINVAL;
+  }
+  return r;
+}
+
+
+/**
+ * Blocking write of JournalPointer for this MDS.  Asserts that `objecter`
+ * is set and that this pointer is not null (see is_null()).
+ *
+ * @return objecter write op status code
+ */
+int JournalPointer::save(Objecter *objecter) const
+{
+  ceph_assert(objecter != NULL);
+  ceph_assert(!is_null());  // It is not valid to persist a null pointer
+
+  // Serialize JournalPointer object
+  bufferlist payload;
+  encode(payload);
+
+  const std::string oid = get_object_id();
+  dout(4) << "Writing pointer object '" << oid << "': 0x"
+	  << std::hex << front << ":0x" << back << std::dec << dendl;
+
+  // Write to RADOS and wait for durability
+  C_SaferCond done;
+  objecter->write_full(object_t(oid), object_locator_t(pool_id),
+		       SnapContext(), payload,
+		       ceph::real_clock::now(), 0,
+		       &done);
+  const int rc = done.wait();
+  if (rc < 0) {
+    derr << "Error writing pointer object '" << oid << "': " << cpp_strerror(rc) << dendl;
+  }
+  return rc;
+}
+
+
+/**
+ * Non-blocking variant of save(): the write is issued with `completion` as its
+ * callback.  Assumes objecter lock already held by caller; no is_null() check.
+ */
+void JournalPointer::save(Objecter *objecter, Context *completion) const
+{
+  ceph_assert(objecter != NULL);
+
+  bufferlist payload;
+  encode(payload);
+  const object_t oid(get_object_id());
+  const object_locator_t locator(pool_id);
+  objecter->write_full(oid, locator, SnapContext(), payload,
+		       ceph::real_clock::now(), 0,
+		       completion);
+}
+
diff --git a/src/mds/JournalPointer.h b/src/mds/JournalPointer.h
new file mode 100644
index 000000000..8d4708e1d
--- /dev/null
+++ b/src/mds/JournalPointer.h
@@ -0,0 +1,85 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+
+
+#ifndef JOURNAL_POINTER_H
+#define JOURNAL_POINTER_H
+
+#include "include/encoding.h"
+#include "mdstypes.h"
+
+class Objecter;
+
+// This always lives in the same location for a given MDS
+// instance, it tells the daemon where to look for the journal.
+class JournalPointer {
+  public:
+  JournalPointer(int node_id_, int64_t pool_id_) : node_id(node_id_), pool_id(pool_id_) {}  // front/back keep their 0 defaults
+  JournalPointer() {}  // node_id and pool_id keep their -1 defaults
+
+  void encode(bufferlist &bl) const {
+    ENCODE_START(1, 1, bl);
+    encode(front, bl);  // only front and back are serialized, not node_id/pool_id
+    encode(back, bl);
+    ENCODE_FINISH(bl);
+  }
+
+  void decode(bufferlist::const_iterator &bl) {
+    DECODE_START(1, bl);
+    decode(front, bl);  // same order as encode()
+    decode(back, bl);
+    DECODE_FINISH(bl);
+  }
+
+  int load(Objecter *objecter);
+  int save(Objecter *objecter) const;
+  void save(Objecter *objecter, Context *completion) const;
+
+  bool is_null() const {
+    return front == 0 && back == 0;  // null means neither journal is set
+  }
+
+  void dump(Formatter *f) const {
+    f->open_object_section("journal_pointer");
+    {
+      f->dump_unsigned("front", front);
+      f->dump_unsigned("back", back);
+    }
+    f->close_section(); // journal_pointer
+  }
+
+  static void generate_test_instances(std::list<JournalPointer*> &ls)
+  {
+    ls.push_back(new JournalPointer());  // first instance: all defaults, so is_null() holds
+    ls.push_back(new JournalPointer());  // second instance: given non-zero front/back below
+    ls.back()->front = 0xdeadbeef;
+    ls.back()->back = 0xfeedbead;
+  }
+
+  // The currently active journal
+  inodeno_t front = 0;
+  // The backup journal, if any (may be 0)
+  inodeno_t back = 0;
+
+  private:
+  // MDS rank
+  int node_id = -1;
+  // Metadata pool ID
+  int64_t pool_id = -1;
+
+  std::string get_object_id() const;  // name of the object that load()/save() read and write
+};
+WRITE_CLASS_ENCODER(JournalPointer)
+
+#endif // JOURNAL_POINTER_H
diff --git a/src/mds/LocalLockC.h b/src/mds/LocalLockC.h
new file mode 100644
index 000000000..96cea93eb
--- /dev/null
+++ b/src/mds/LocalLockC.h
@@ -0,0 +1,64 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software 
+ * Foundation.  See file COPYING.
+ * 
+ */
+
+
+#ifndef CEPH_LOCALLOCK_H
+#define CEPH_LOCALLOCK_H
+
+#include "SimpleLock.h"
+
+// SimpleLock set to LOCK_LOCK at construction; remembers the last wrlock client.
+class LocalLockC : public SimpleLock {
+public:
+  LocalLockC(MDSCacheObject *o, LockType *t) : SimpleLock(o, t) {
+    set_state(LOCK_LOCK); // always.
+  }
+
+  bool is_locallock() const override { return true; }
+  // Local xlock is possible only with no wrlocks and no current xlock holder.
+  bool can_xlock_local() const {
+    if (is_wrlocked())
+      return false;
+    return get_xlock_by() == MutationRef();
+  }
+
+  // wrlocks are refused only while xlocked.
+  bool can_wrlock() const { return !is_xlocked(); }
+  // Take a wrlock (asserting it is permitted) and record the requesting client.
+  void get_wrlock(client_t client) {
+    ceph_assert(can_wrlock());
+    SimpleLock::get_wrlock();
+    last_wrlock_client = client;
+  }
+  // Drop a wrlock; once none remain, reset the recorded client to the default.
+  void put_wrlock() {
+    SimpleLock::put_wrlock();
+    if (!get_num_wrlocks())
+      last_wrlock_client = client_t();
+  }
+  client_t get_last_wrlock_client() const { return last_wrlock_client; }
+
+  // Prints "(" + base lock text + optional " last_client=N" (only when >= 0) + ")".
+  void print(std::ostream& out) const override {
+    out << "(";
+    _print(out);
+    if (last_wrlock_client >= 0)
+      out << " last_client=" << last_wrlock_client;
+    out << ")";
+  }
+
+private:
+  client_t last_wrlock_client;
+};
+#endif
diff --git a/src/mds/Locker.cc b/src/mds/Locker.cc
new file mode 100644
index 000000000..b515bcb30
--- /dev/null
+++ b/src/mds/Locker.cc
@@ -0,0 +1,5964 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software 
+ * Foundation.  See file COPYING.
+ * 
+ */
+
+
+#include "CDir.h"
+#include "CDentry.h"
+#include "CInode.h"
+#include "common/config.h"
+#include "events/EOpen.h"
+#include "events/EUpdate.h"
+#include "Locker.h"
+#include "MDBalancer.h"
+#include "MDCache.h"
+#include "MDLog.h"
+#include "MDSRank.h"
+#include "MDSMap.h"
+#include "messages/MInodeFileCaps.h"
+#include "messages/MMDSPeerRequest.h"
+#include "Migrator.h"
+#include "msg/Messenger.h"
+#include "osdc/Objecter.h"
+
+#define dout_subsys ceph_subsys_mds
+#undef dout_prefix
+#define dout_context g_ceph_context
+#define dout_prefix _prefix(_dout, mds)
+static std::ostream& _prefix(std::ostream *_dout, MDSRank *mds) {  // dout line prefix for this file
+  return *_dout << "mds." << mds->get_nodeid() << ".locker ";
+}
+
+
+// MDSContext for Locker callbacks; get_mds() is answered through the
+// owning Locker, which is asserted to be non-null at construction.
+class LockerContext : public MDSContext {
+public:
+  explicit LockerContext(Locker *locker_) : locker(locker_) {
+    ceph_assert(locker != NULL);
+  }
+
+protected:
+  Locker *locker;
+
+  MDSRank *get_mds() override { return locker->mds; }
+};
+
+// MDSLogContextBase for Locker callbacks; get_mds() is answered through the
+// owning Locker, which is asserted to be non-null at construction.
+class LockerLogContext : public MDSLogContextBase {
+public:
+  explicit LockerLogContext(Locker *locker_) : locker(locker_) {
+    ceph_assert(locker != NULL);
+  }
+
+protected:
+  Locker *locker;
+
+  MDSRank *get_mds() override { return locker->mds; }
+};
+
+Locker::Locker(MDSRank *m, MDCache *c) :  // stores both pointers; nothing else is done here
+  need_snapflush_inodes(member_offset(CInode, item_caps)), mds(m), mdcache(c) {}  // list is set up from the offset of CInode::item_caps
+
+
+// Route an incoming message to the handler for its type.  A type this
+// switch does not know is logged via derr and then aborts.
+void Locker::dispatch(const cref_t<Message> &m)
+{
+  const auto msg_type = m->get_type();
+
+  switch (msg_type) {
+  case MSG_MDS_LOCK:  // inter-mds locking
+    handle_lock(ref_cast<MLock>(m));
+    break;
+  case MSG_MDS_INODEFILECAPS:  // inter-mds caps
+    handle_inode_file_caps(ref_cast<MInodeFileCaps>(m));
+    break;
+  case CEPH_MSG_CLIENT_CAPS:  // client sync
+    handle_client_caps(ref_cast<MClientCaps>(m));
+    break;
+  case CEPH_MSG_CLIENT_CAPRELEASE:
+    handle_client_cap_release(ref_cast<MClientCapRelease>(m));
+    break;
+  case CEPH_MSG_CLIENT_LEASE:
+    handle_client_lease(ref_cast<MClientLease>(m));
+    break;
+  default:
+    derr << "locker unknown message " << msg_type << dendl;
+    ceph_abort_msg("locker unknown message");
+  }
+}
+
+void Locker::tick()
+{
+  scatter_tick();  // tick work is delegated to these two helpers, in this order
+  caps_tick();
+}
+
+/*
+ * locks vs rejoin
+ *
+ * 
+ *
+ */
+
+// MLock(`msg`) to each replica of the lock's parent; while degraded, skip replicas below REJOIN.
+void Locker::send_lock_message(SimpleLock *lock, int msg) {
+  for (const auto &replica : lock->get_parent()->get_replicas()) {
+    if (!mds->is_cluster_degraded() ||
+	mds->mdsmap->get_state(replica.first) >= MDSMap::STATE_REJOIN) {
+      auto m = make_message<MLock>(lock, msg, mds->get_nodeid());
+      mds->send_message_mds(m, replica.first);
+    }
+  }
+}
+
+// As the two-argument overload, but every MLock also carries `data` as its payload.
+void Locker::send_lock_message(SimpleLock *lock, int msg, const bufferlist &data) {
+  for (const auto &replica : lock->get_parent()->get_replicas()) {
+    if (!mds->is_cluster_degraded() ||
+	mds->mdsmap->get_state(replica.first) >= MDSMap::STATE_REJOIN) {
+      auto m = make_message<MLock>(lock, msg, mds->get_nodeid());
+      m->set_data(data);
+      mds->send_message_mds(m, replica.first);
+    }
+  }
+}
+
+bool Locker::try_rdlock_snap_layout(CInode *in, MDRequestRef& mdr,
+				    int n, bool want_layout)
+{
+  dout(10) << __func__ << " " << *mdr << " " << *in << dendl;
+  // rdlock ancestor snaps
+  inodeno_t root;  // set when the walk reaches an inode with no projected parent dentry
+  int depth = -1;  // incremented at the top of each iteration, so `in` itself counts as 0
+  bool found_locked = false;
+  bool found_layout = false;
+
+  if (want_layout)
+    ceph_assert(n == 0);  // a layout lookup is only accepted together with slot 0
+
+  client_t client = mdr->get_client();
+
+  CInode *t = in;
+  while (true) {  // walk from `in` upward through projected parent dentries
+    ++depth;
+    if (!found_locked && mdr->is_rdlocked(&t->snaplock))
+      found_locked = true;  // once set, no further snaplocks are taken on the way up
+
+    if (!found_locked) {
+      if (!t->snaplock.can_rdlock(client)) {
+	t->snaplock.add_waiter(SimpleLock::WAIT_RD, new C_MDS_RetryRequest(mdcache, mdr));
+	goto failed;  // a retry waiter is queued before bailing out
+      }
+      t->snaplock.get_rdlock();
+      mdr->locks.emplace(&t->snaplock, MutationImpl::LockOp::RDLOCK);
+      dout(20) << " got rdlock on " << t->snaplock << " " << *t << dendl;
+    }
+    if (want_layout && !found_layout) {  // policylocks are only taken until a layout is found
+      if (!mdr->is_rdlocked(&t->policylock)) {
+	if (!t->policylock.can_rdlock(client)) {
+	  t->policylock.add_waiter(SimpleLock::WAIT_RD, new C_MDS_RetryRequest(mdcache, mdr));
+	  goto failed;  // a retry waiter is queued before bailing out
+	}
+	t->policylock.get_rdlock();
+	mdr->locks.emplace(&t->policylock, MutationImpl::LockOp::RDLOCK);
+	dout(20) << " got rdlock on " << t->policylock << " " << *t << dendl;
+      }
+      if (t->get_projected_inode()->has_layout()) {
+	mdr->dir_layout = t->get_projected_inode()->layout;  // first layout found walking upward wins
+	found_layout = true;
+      }
+    }
+    CDentry* pdn = t->get_projected_parent_dn();
+    if (!pdn) {
+      root = t->ino();  // no parent dentry: this inode ends the walk
+      break;
+    }
+    t = pdn->get_dir()->get_inode();
+  }
+
+  mdr->dir_root[n] = root;  // results are recorded in slot n of the request
+  mdr->dir_depth[n] = depth;
+  return true;
+
+failed:
+  dout(10) << __func__ << " failed" << dendl;
+
+  drop_locks(mdr.get(), nullptr);  // drop the request's locks and local auth pins before returning
+  mdr->drop_local_auth_pins();
+  return false;
+}
+
+// On destruction, records `message` as an event on the referenced request
+// unless mark_event has been cleared.  Both `message` and `mark_event` are
+// plain public members, so they can be changed before the object goes away.
+struct MarkEventOnDestruct {
+  MDRequestRef& mdr;
+  std::string_view message;
+  bool mark_event = true;
+  MarkEventOnDestruct(MDRequestRef& _mdr, std::string_view _message)
+    : mdr(_mdr), message(_message) {}
+  ~MarkEventOnDestruct() {
+    if (mark_event) mdr->mark_event(message);
+  }
+};
+
+/* If this function returns false, the mdr has been placed
+ * on the appropriate wait list */
+bool Locker::acquire_locks(MDRequestRef& mdr,
+			   MutationImpl::LockOpVec& lov,
+			   CInode *auth_pin_freeze,
+			   bool auth_pin_nonblocking)
+{
+  dout(10) << "acquire_locks " << *mdr << dendl;
+
+  MarkEventOnDestruct marker(mdr, "failed to acquire_locks");
+
+  client_t client = mdr->get_client();
+
+  set<MDSCacheObject*> mustpin;  // items to authpin
+  if (auth_pin_freeze)
+    mustpin.insert(auth_pin_freeze);
+
+  // xlocks
+  for (size_t i = 0; i < lov.size(); ++i) {
+    auto& p = lov[i];
+    SimpleLock *lock = p.lock;
+    MDSCacheObject *object = lock->get_parent();
+
+    if (p.is_xlock()) {
+      if ((lock->get_type() == CEPH_LOCK_ISNAP ||
+	   lock->get_type() == CEPH_LOCK_IPOLICY) &&
+	  mds->is_cluster_degraded() &&
+	  mdr->is_leader() &&
+	  !mdr->is_queued_for_replay()) {
+	// waiting for recovering mds, to guarantee replayed requests and mksnap/setlayout
+	// get processed in proper order.
+	bool wait = false;
+	if (object->is_auth()) {
+	  if (!mdr->is_xlocked(lock)) {
+	    set<mds_rank_t> ls;
+	    object->list_replicas(ls);
+	    for (auto m : ls) {
+	      if (mds->mdsmap->get_state(m) < MDSMap::STATE_ACTIVE) {
+		wait = true;
+		break;
+	      }
+	    }
+	  }
+	} else {
+	  // if the lock is the latest locked one, it's possible that peer mds got the lock
+	  // while there are recovering mds.
+	  if (!mdr->is_xlocked(lock) || mdr->is_last_locked(lock))
+	    wait = true;
+	}
+	if (wait) {
+	  dout(10) << " must xlock " << *lock << " " << *object
+		   << ", waiting for cluster recovered" << dendl;
+	  mds->locker->drop_locks(mdr.get(), NULL);
+	  mdr->drop_local_auth_pins();
+	  mds->wait_for_cluster_recovered(new C_MDS_RetryRequest(mdcache, mdr));
+	  return false;
+	}
+      }
+
+      dout(20) << " must xlock " << *lock << " " << *object << dendl;
+
+      mustpin.insert(object);
+
+      // augment xlock with a versionlock?
+      if (lock->get_type() == CEPH_LOCK_DN) {
+	CDentry *dn = static_cast<CDentry*>(object);
+	if (!dn->is_auth())
+	  continue;
+	if (mdr->is_leader()) {
+	  // leader.  wrlock versionlock so we can pipeline dentry updates to journal.
+	  lov.add_wrlock(&dn->versionlock, i + 1);
+	} else {
+	  // peer.  exclusively lock the dentry version (i.e. block other journal updates).
+	  // this makes rollback safe.
+	  lov.add_xlock(&dn->versionlock, i + 1);
+	}
+      }
+      if (lock->get_type() >= CEPH_LOCK_IFIRST && lock->get_type() != CEPH_LOCK_IVERSION) {
+	// inode version lock?
+	CInode *in = static_cast<CInode*>(object);
+	if (!in->is_auth())
+	  continue;
+	if (mdr->is_leader()) {
+	  // leader.  wrlock versionlock so we can pipeline inode updates to journal.
+	  lov.add_wrlock(&in->versionlock, i + 1);
+	} else {
+	  // peer.  exclusively lock the inode version (i.e. block other journal updates).
+	  // this makes rollback safe.
+	  lov.add_xlock(&in->versionlock, i + 1);
+	}
+      }
+    } else if (p.is_wrlock()) {
+      dout(20) << " must wrlock " << *lock << " " << *object << dendl;
+      client_t _client = p.is_state_pin() ? lock->get_excl_client() : client;
+      if (object->is_auth()) {
+	mustpin.insert(object);
+      } else if (!object->is_auth() &&
+		 !lock->can_wrlock(_client) &&  // we might have to request a scatter
+		 !mdr->is_peer()) {           // if we are peer (remote_wrlock), the leader already authpinned
+	dout(15) << " will also auth_pin " << *object
+		 << " in case we need to request a scatter" << dendl;
+	mustpin.insert(object);
+      }
+    } else if (p.is_remote_wrlock()) {
+      dout(20) << " must remote_wrlock on mds." << p.wrlock_target << " "
+	       << *lock << " " << *object << dendl;
+      mustpin.insert(object);
+    } else if (p.is_rdlock()) {
+
+      dout(20) << " must rdlock " << *lock << " " << *object << dendl;
+      if (object->is_auth()) {
+	mustpin.insert(object);
+      } else if (!object->is_auth() &&
+		 !lock->can_rdlock(client)) {      // we might have to request an rdlock
+	dout(15) << " will also auth_pin " << *object
+		 << " in case we need to request a rdlock" << dendl;
+	mustpin.insert(object);
+      }
+    } else {
+      ceph_assert(0 == "locker unknown lock operation");
+    }
+  }
+
+  lov.sort_and_merge();
+ 
+  // AUTH PINS
+  map<mds_rank_t, set<MDSCacheObject*> > mustpin_remote;  // mds -> (object set)
+  
+  // can i auth pin them all now?
+  marker.message = "failed to authpin local pins";
+  for (const auto &p : mustpin) {
+    MDSCacheObject *object = p;
+
+    dout(10) << " must authpin " << *object << dendl;
+
+    if (mdr->is_auth_pinned(object)) {
+      if (object != (MDSCacheObject*)auth_pin_freeze)
+	continue;
+      if (mdr->more()->is_remote_frozen_authpin) {
+	if (mdr->more()->rename_inode == auth_pin_freeze)
+	  continue;
+	// unfreeze auth pin for the wrong inode
+	mustpin_remote[mdr->more()->rename_inode->authority().first].size();
+      }
+    }
+    
+    if (!object->is_auth()) {
+      if (mdr->lock_cache) { // debug
+	ceph_assert(mdr->lock_cache->opcode == CEPH_MDS_OP_UNLINK);
+	CDentry *dn = mdr->dn[0].back();
+	ceph_assert(dn->get_projected_linkage()->is_remote());
+      }
+
+      if (object->is_ambiguous_auth()) {
+	// wait
+	dout(10) << " ambiguous auth, waiting to authpin " << *object << dendl;
+	mdr->disable_lock_cache();
+	drop_locks(mdr.get());
+	mdr->drop_local_auth_pins();
+	marker.message = "waiting for single auth, object is being migrated";
+	object->add_waiter(MDSCacheObject::WAIT_SINGLEAUTH, new C_MDS_RetryRequest(mdcache, mdr));
+	return false;
+      }
+      mustpin_remote[object->authority().first].insert(object);
+      continue;
+    }
+    int err = 0;
+    if (!object->can_auth_pin(&err)) {
+      if (mdr->lock_cache) {
+	CDir *dir;
+	if (CInode *in = dynamic_cast<CInode*>(object)) {
+	  ceph_assert(!in->is_frozen_inode() && !in->is_frozen_auth_pin());
+	  dir = in->get_projected_parent_dir();
+	} else if (CDentry *dn = dynamic_cast<CDentry*>(object)) {
+	  dir = dn->get_dir();
+	} else {
+	  ceph_assert(0 == "unknown type of lock parent");
+	}
+	if (dir->get_inode() == mdr->lock_cache->get_dir_inode()) {
+	  // forcibly auth pin if there is lock cache on parent dir
+	  continue;
+	}
+
+	{ // debug
+	  ceph_assert(mdr->lock_cache->opcode == CEPH_MDS_OP_UNLINK);
+	  CDentry *dn = mdr->dn[0].back();
+	  ceph_assert(dn->get_projected_linkage()->is_remote());
+	}
+      }
+
+      // wait
+      mdr->disable_lock_cache();
+      drop_locks(mdr.get());
+      mdr->drop_local_auth_pins();
+      if (auth_pin_nonblocking) {
+	dout(10) << " can't auth_pin (freezing?) " << *object << ", nonblocking" << dendl;
+	mdr->aborted = true;
+	return false;
+      }
+      if (err == MDSCacheObject::ERR_EXPORTING_TREE) {
+	marker.message = "failed to authpin, subtree is being exported";
+      } else if (err == MDSCacheObject::ERR_FRAGMENTING_DIR) {
+	marker.message = "failed to authpin, dir is being fragmented";
+      } else if (err == MDSCacheObject::ERR_EXPORTING_INODE) {
+	marker.message = "failed to authpin, inode is being exported";
+      }
+      dout(10) << " can't auth_pin (freezing?), waiting to authpin " << *object << dendl;
+      object->add_waiter(MDSCacheObject::WAIT_UNFREEZE, new C_MDS_RetryRequest(mdcache, mdr));
+
+      if (mdr->is_any_remote_auth_pin())
+	notify_freeze_waiter(object);
+
+      return false;
+    }
+  }
+
+  // ok, grab local auth pins
+  for (const auto& p : mustpin) {
+    MDSCacheObject *object = p;
+    if (mdr->is_auth_pinned(object)) {
+      dout(10) << " already auth_pinned " << *object << dendl;
+    } else if (object->is_auth()) {
+      dout(10) << " auth_pinning " << *object << dendl;
+      mdr->auth_pin(object);
+    }
+  }
+
+  // request remote auth_pins
+  if (!mustpin_remote.empty()) {
+    marker.message = "requesting remote authpins";
+    for (const auto& p : mdr->object_states) {
+      if (p.second.remote_auth_pinned == MDS_RANK_NONE)
+	continue;
+      ceph_assert(p.second.remote_auth_pinned == p.first->authority().first);
+      auto q = mustpin_remote.find(p.second.remote_auth_pinned);
+      if (q != mustpin_remote.end())
+	q->second.insert(p.first);
+    }
+
+    for (auto& p : mustpin_remote) {
+      dout(10) << "requesting remote auth_pins from mds." << p.first << dendl;
+
+      // wait for active auth
+      if (mds->is_cluster_degraded() &&
+	  !mds->mdsmap->is_clientreplay_or_active_or_stopping(p.first)) {
+	dout(10) << " mds." << p.first << " is not active" << dendl;
+	if (mdr->more()->waiting_on_peer.empty())
+	  mds->wait_for_active_peer(p.first, new C_MDS_RetryRequest(mdcache, mdr));
+	return false;
+      }
+      
+      auto req = make_message<MMDSPeerRequest>(mdr->reqid, mdr->attempt,
+						MMDSPeerRequest::OP_AUTHPIN);
+      for (auto& o : p.second) {
+	dout(10) << " req remote auth_pin of " << *o << dendl;
+	MDSCacheObjectInfo info;
+	o->set_object_info(info);
+	req->get_authpins().push_back(info);
+	if (o == auth_pin_freeze)
+	  o->set_object_info(req->get_authpin_freeze());
+	mdr->pin(o);
+      }
+      if (auth_pin_nonblocking)
+	req->mark_nonblocking();
+      else if (!mdr->locks.empty())
+	req->mark_notify_blocking();
+
+      mds->send_message_mds(req, p.first);
+
+      // put in waiting list
+      auto ret = mdr->more()->waiting_on_peer.insert(p.first);
+      ceph_assert(ret.second);
+    }
+    return false;
+  }
+
+  // caps i'll need to issue
+  set<CInode*> issue_set;
+  bool result = false;
+
+  // acquire locks.
+  // make sure they match currently acquired locks.
+  for (const auto& p : lov) {
+    auto lock = p.lock;
+    if (p.is_xlock()) {
+      if (mdr->is_xlocked(lock)) {
+	dout(10) << " already xlocked " << *lock << " " << *lock->get_parent() << dendl;
+	continue;
+      }
+      if (mdr->locking && lock != mdr->locking)
+	cancel_locking(mdr.get(), &issue_set);
+      if (!xlock_start(lock, mdr)) {
+	marker.message = "failed to xlock, waiting";
+	goto out;
+      }
+      dout(10) << " got xlock on " << *lock << " " << *lock->get_parent() << dendl;
+    } else if (p.is_wrlock() || p.is_remote_wrlock()) {
+      auto it = mdr->locks.find(lock);
+      if (p.is_remote_wrlock()) {
+	if (it != mdr->locks.end() && it->is_remote_wrlock()) {
+	  dout(10) << " already remote_wrlocked " << *lock << " " << *lock->get_parent() << dendl;
+	} else {
+	  if (mdr->locking && lock != mdr->locking)
+	    cancel_locking(mdr.get(), &issue_set);
+	  marker.message = "waiting for remote wrlocks";
+	  remote_wrlock_start(lock, p.wrlock_target, mdr);
+	  goto out;
+	}
+      }
+      if (p.is_wrlock()) {
+	if (it != mdr->locks.end() && it->is_wrlock()) {
+	  dout(10) << " already wrlocked " << *lock << " " << *lock->get_parent() << dendl;
+	  continue;
+	}
+	client_t _client = p.is_state_pin() ? lock->get_excl_client() : client;
+	if (p.is_remote_wrlock()) {
+	  // nowait if we have already gotten remote wrlock
+	  if (!wrlock_try(lock, mdr, _client)) {
+	    marker.message = "failed to wrlock, dropping remote wrlock and waiting";
+	    // can't take the wrlock because the scatter lock is gathering. need to
+	    // release the remote wrlock, so that the gathering process can finish.
+	    ceph_assert(it != mdr->locks.end());
+	    remote_wrlock_finish(it, mdr.get());
+	    remote_wrlock_start(lock, p.wrlock_target, mdr);
+	    goto out;
+	  }
+	} else {
+	  if (!wrlock_start(p, mdr)) {
+	    ceph_assert(!p.is_remote_wrlock());
+	    marker.message = "failed to wrlock, waiting";
+	    goto out;
+	  }
+	}
+	dout(10) << " got wrlock on " << *lock << " " << *lock->get_parent() << dendl;
+      }
+    } else {
+      if (mdr->is_rdlocked(lock)) {
+	dout(10) << " already rdlocked " << *lock << " " << *lock->get_parent() << dendl;
+	continue;
+      }
+
+      ceph_assert(mdr->is_leader());
+      if (lock->needs_recover()) {
+	if (mds->is_cluster_degraded()) {
+	  if (!mdr->is_queued_for_replay()) {
+	    // see comments in SimpleLock::set_state_rejoin() and
+	    // ScatterLock::encode_state_for_rejoin()
+	    drop_locks(mdr.get());
+	    mds->wait_for_cluster_recovered(new C_MDS_RetryRequest(mdcache, mdr));
+	    dout(10) << " rejoin recovering " << *lock << " " << *lock->get_parent()
+		     << ", waiting for cluster recovered" << dendl;
+	    marker.message = "rejoin recovering lock, waiting for cluster recovered";
+	    return false;
+	  }
+	} else {
+	  lock->clear_need_recover();
+	}
+      }
+
+      if (!rdlock_start(lock, mdr)) {
+	marker.message = "failed to rdlock, waiting";
+	goto out;
+      }
+      dout(10) << " got rdlock on " << *lock << " " << *lock->get_parent() << dendl;
+    }
+  }
+
+  mdr->set_mds_stamp(ceph_clock_now());
+  result = true;
+  marker.message = "acquired locks";
+
+ out:
+  issue_caps_set(issue_set);
+  return result;
+}
+
+/**
+ * Tell the freeze machinery that a request is waiting on a freezing dirfrag.
+ *
+ * The dirfrag is derived from @p o: the parent dir of a non-root inode, the
+ * containing dir of a dentry, or @p o itself when it is a CDir (asserted).
+ * If that dirfrag is freezing as a dir, the fragment-freeze waiter count is
+ * bumped; if it is part of a freezing tree, the export-freeze waiter count
+ * is bumped on the freezing tree root.
+ */
+void Locker::notify_freeze_waiter(MDSCacheObject *o)
+{
+  CDir *frag = nullptr;
+  if (auto *inode = dynamic_cast<CInode*>(o)) {
+    // the root inode has no parent dirfrag to notify
+    frag = inode->is_root() ? nullptr : inode->get_parent_dir();
+  } else if (auto *dentry = dynamic_cast<CDentry*>(o)) {
+    frag = dentry->get_dir();
+  } else {
+    frag = dynamic_cast<CDir*>(o);
+    ceph_assert(frag);
+  }
+  if (!frag)
+    return;
+
+  if (frag->is_freezing_dir())
+    mdcache->fragment_freeze_inc_num_waiters(frag);
+
+  if (!frag->is_freezing_tree())
+    return;
+
+  // walk up to the dirfrag that is the root of the freezing tree
+  CDir *tree_root = frag;
+  while (!tree_root->is_freezing_tree_root())
+    tree_root = tree_root->get_parent_dir();
+  mdcache->migrator->export_freeze_inc_num_waiters(tree_root);
+}
+
+/**
+ * Mark every xlock held by @p mut as done.
+ *
+ * Each xlocked object must be auth (asserted). When @p skip_dentry is true,
+ * locks of type CEPH_LOCK_DN and CEPH_LOCK_DVERSION are left untouched.
+ */
+void Locker::set_xlocks_done(MutationImpl *mut, bool skip_dentry)
+{
+  for (const auto &op : mut->locks) {
+    if (!op.is_xlock())
+      continue;
+
+    auto *held = op.lock;
+    MDSCacheObject *parent = held->get_parent();
+    ceph_assert(parent->is_auth());
+
+    if (skip_dentry) {
+      const auto type = held->get_type();
+      if (type == CEPH_LOCK_DN || type == CEPH_LOCK_DVERSION)
+	continue;
+    }
+
+    dout(10) << "set_xlocks_done on " << *held << " " << *parent << dendl;
+    held->set_xlock_done();
+  }
+}
+
+/**
+ * Release the locks held by @p mut.
+ *
+ * xlocks, wrlocks and remote wrlocks are always released; rdlocks (and the
+ * mutation's lock cache reference) are released only when @p drop_rdlocks is
+ * true. Inodes for which a finish helper reports need_issue are added to
+ * @p pneed_issue, which must be non-null. Every peer rank on which a remote
+ * xlock or remote wrlock was held is sent OP_DROPLOCKS, unless the cluster
+ * is degraded and that rank has not reached STATE_REJOIN.
+ */
+void Locker::_drop_locks(MutationImpl *mut, set<CInode*> *pneed_issue,
+			 bool drop_rdlocks)
+{
+  set<mds_rank_t> peers;
+
+  // The *_finish() helpers and erase() are handed the post-incremented
+  // iterator so that `it` already points at the next element.
+  for (auto it = mut->locks.begin(); it != mut->locks.end(); ) {
+    SimpleLock *lock = it->lock;
+    MDSCacheObject *obj = lock->get_parent();
+
+    if (it->is_xlock()) {
+      if (obj->is_auth()) {
+	bool ni = false;
+	xlock_finish(it++, mut, &ni);
+	if (ni)
+	  pneed_issue->insert(static_cast<CInode*>(obj));
+      } else {
+	// remote xlock: drop our reference and remember the auth rank
+	ceph_assert(lock->get_sm()->can_remote_xlock);
+	peers.insert(obj->authority().first);
+	lock->put_xlock();
+	mut->locks.erase(it++);
+      }
+    } else if (it->is_wrlock() || it->is_remote_wrlock()) {
+      // one entry may carry both a local and a remote wrlock
+      if (it->is_remote_wrlock()) {
+	peers.insert(it->wrlock_target);
+	it->clear_remote_wrlock();
+      }
+      if (it->is_wrlock()) {
+	bool ni = false;
+	wrlock_finish(it++, mut, &ni);
+	if (ni)
+	  pneed_issue->insert(static_cast<CInode*>(obj));
+      } else {
+	mut->locks.erase(it++);
+      }
+    } else if (drop_rdlocks && it->is_rdlock()) {
+      bool ni = false;
+      rdlock_finish(it++, mut, &ni);
+      if (ni)
+	pneed_issue->insert(static_cast<CInode*>(obj));
+    } else {
+      ++it;
+    }
+  }
+
+  if (drop_rdlocks) {
+    if (mut->lock_cache) {
+      put_lock_cache(mut->lock_cache);
+      mut->lock_cache = nullptr;
+    }
+  }
+
+  // tell the peers to drop whatever they hold for this request
+  for (const auto peer : peers) {
+    if (!mds->is_cluster_degraded() ||
+	mds->mdsmap->get_state(peer) >= MDSMap::STATE_REJOIN) {
+      dout(10) << "_drop_locks dropping remote locks on mds." << peer << dendl;
+      auto peerreq = make_message<MMDSPeerRequest>(mut->reqid, mut->attempt,
+						     MMDSPeerRequest::OP_DROPLOCKS);
+      mds->send_message_mds(peerreq, peer);
+    }
+  }
+}
+
+/**
+ * Abort the lock acquisition currently in progress for @p mut
+ * (mut->locking, which must be set).
+ *
+ * If the lock's parent is auth, a lock in LOCK_PREXLOCK is finished via
+ * _finish_xlock(), and a lock in LOCK_LOCK_XLOCK is moved to LOCK_XLOCKDONE
+ * and re-evaluated. If either step reports need_issue, the parent inode is
+ * added to @p pneed_issue.
+ */
+void Locker::cancel_locking(MutationImpl *mut, set<CInode*> *pneed_issue)
+{
+  SimpleLock *pending = mut->locking;
+  ceph_assert(pending);
+  dout(10) << "cancel_locking " << *pending << " on " << *mut << dendl;
+
+  MDSCacheObject *parent = pending->get_parent();
+  if (parent->is_auth()) {
+    bool need_issue = false;
+    switch (pending->get_state()) {
+    case LOCK_PREXLOCK:
+      _finish_xlock(pending, -1, &need_issue);
+      break;
+    case LOCK_LOCK_XLOCK:
+      pending->set_state(LOCK_XLOCKDONE);
+      eval_gather(pending, true, &need_issue);
+      break;
+    default:
+      break;
+    }
+    if (need_issue)
+      pneed_issue->insert(static_cast<CInode *>(parent));
+  }
+  mut->finish_locking(pending);
+}
+
+/**
+ * Drop every lock held by @p mut, including rdlocks, after cancelling any
+ * acquisition still in progress, and reset mut->locking_state to 0.
+ *
+ * @param pneed_issue  if non-null, inodes needing a cap issue are collected
+ *                     there for the caller; if null they are collected
+ *                     locally and caps are issued before returning.
+ */
+void Locker::drop_locks(MutationImpl *mut, set<CInode*> *pneed_issue)
+{
+  // leftover locks
+  set<CInode*> local_issue;
+  const bool issue_here = (pneed_issue == nullptr);
+  set<CInode*> *issue = issue_here ? &local_issue : pneed_issue;
+
+  if (mut->locking)
+    cancel_locking(mut, issue);
+  _drop_locks(mut, issue, true);
+
+  if (issue_here)
+    issue_caps_set(*issue);
+  mut->locking_state = 0;
+}
+
+/**
+ * Drop all locks held by @p mut except rdlocks.
+ *
+ * @param pneed_issue  if non-null, inodes needing a cap issue are collected
+ *                     there for the caller; if null they are collected
+ *                     locally and caps are issued before returning.
+ */
+void Locker::drop_non_rdlocks(MutationImpl *mut, set<CInode*> *pneed_issue)
+{
+  set<CInode*> local_issue;
+  const bool issue_here = (pneed_issue == nullptr);
+  set<CInode*> *issue = issue_here ? &local_issue : pneed_issue;
+
+  _drop_locks(mut, issue, false);
+
+  if (issue_here)
+    issue_caps_set(*issue);
+}
+
+/**
+ * Release the rdlocks held by @p mut, except those on CEPH_LOCK_ISNAP and
+ * CEPH_LOCK_IPOLICY, then issue caps on any inode whose rdlock release
+ * reported need_issue. Non-rdlock entries are left in place.
+ */
+void Locker::drop_rdlocks_for_early_reply(MutationImpl *mut)
+{
+  set<CInode*> need_issue;
+
+  auto it = mut->locks.begin();
+  while (it != mut->locks.end()) {
+    if (it->is_rdlock()) {
+      SimpleLock *held = it->lock;
+      const auto type = held->get_type();
+      // make later mksnap/setlayout (at other mds) wait for this unsafe request
+      if (type != CEPH_LOCK_ISNAP && type != CEPH_LOCK_IPOLICY) {
+	bool ni = false;
+	// pass the old position, `it` moves on before the entry is removed
+	rdlock_finish(it++, mut, &ni);
+	if (ni)
+	  need_issue.insert(static_cast<CInode*>(held->get_parent()));
+	continue;
+      }
+    }
+    ++it;
+  }
+
+  issue_caps_set(need_issue);
+}
+
+/**
+ * Finish, via wrlock_finish(), every lock held by @p mut whose type is not
+ * CEPH_LOCK_IDFT, then issue caps on any inode that reported need_issue.
+ *
+ * NOTE(review): every non-IDFT entry is passed to wrlock_finish(), so the
+ * caller is presumably expected to hold only wrlocks here -- confirm.
+ */
+void Locker::drop_locks_for_fragment_unfreeze(MutationImpl *mut)
+{
+  set<CInode*> need_issue;
+
+  auto it = mut->locks.begin();
+  while (it != mut->locks.end()) {
+    SimpleLock *held = it->lock;
+    if (held->get_type() != CEPH_LOCK_IDFT) {
+      bool ni = false;
+      wrlock_finish(it++, mut, &ni);
+      if (ni)
+	need_issue.insert(static_cast<CInode*>(held->get_parent()));
+    } else {
+      ++it;
+    }
+  }
+  issue_caps_set(need_issue);
+}
+
+/**
+ * Context that tears down an MDLockCache when it runs: it drops the cache's
+ * locks through the Locker, calls cleanup() on it and then deletes it.
+ */
+class C_MDL_DropCache : public LockerContext {
+  MDLockCache *doomed;  // deleted by finish()
+public:
+  C_MDL_DropCache(Locker *owner, MDLockCache *cache)
+    : LockerContext(owner),
+      doomed(cache)
+  {
+  }
+
+  void finish(int r) override {
+    locker->drop_locks(doomed);
+    doomed->cleanup();
+    delete doomed;
+  }
+};
+
+/**
+ * Drop one reference on @p lock_cache.
+ *
+ * When the last reference goes away the cache must already be invalidating
+ * (asserted). Its locks are detached, enable_frozen_inode() is called on
+ * each auth-pinned dirfrag belonging to the cache's dir inode, and a
+ * C_MDL_DropCache is queued to finish the teardown.
+ */
+void Locker::put_lock_cache(MDLockCache* lock_cache)
+{
+  ceph_assert(lock_cache->ref > 0);
+  --lock_cache->ref;
+  if (lock_cache->ref > 0)
+    return;
+
+  ceph_assert(lock_cache->invalidating);
+
+  lock_cache->detach_locks();
+
+  CInode *dir_inode = lock_cache->get_dir_inode();
+  for (auto frag : lock_cache->auth_pinned_dirfrags) {
+    if (frag->get_inode() == dir_inode)
+      frag->enable_frozen_inode();
+  }
+
+  mds->queue_waiter(new C_MDL_DropCache(this, lock_cache));
+}
+
+/**
+ * Map a client request opcode to the directory cap bit used for its lock
+ * cache: CEPH_MDS_OP_CREATE -> CEPH_CAP_DIR_CREATE and
+ * CEPH_MDS_OP_UNLINK -> CEPH_CAP_DIR_UNLINK. Any other opcode trips an
+ * assertion (0 is returned if the assertion does not abort).
+ */
+int Locker::get_cap_bit_for_lock_cache(int op)
+{
+  if (op == CEPH_MDS_OP_CREATE)
+    return CEPH_CAP_DIR_CREATE;
+  if (op == CEPH_MDS_OP_UNLINK)
+    return CEPH_CAP_DIR_UNLINK;
+
+  ceph_assert(0 == "unsupported operation");
+  return 0;
+}
+
+/**
+ * Start (or continue) invalidating @p lock_cache, which must still be on
+ * its capability's lock-cache list (asserted).
+ *
+ * On the first call the cache is flagged invalidating and its dirfrags are
+ * detached. If the cache still has a client cap, the matching
+ * lock-cache-allowed bit is cleared on it; when the client currently has
+ * that bit issued, caps are re-issued and the function returns, leaving the
+ * cache on the list. Otherwise the cache is unlinked from the list and one
+ * reference is dropped.
+ */
+void Locker::invalidate_lock_cache(MDLockCache *lock_cache)
+{
+  ceph_assert(lock_cache->item_cap_lock_cache.is_on_list());
+  if (!lock_cache->invalidating) {
+    lock_cache->invalidating = true;
+    lock_cache->detach_dirfrags();
+  } else {
+    ceph_assert(!lock_cache->client_cap);
+  }
+
+  bool still_issued = false;
+  if (Capability *cap = lock_cache->client_cap) {
+    const int cap_bit = get_cap_bit_for_lock_cache(lock_cache->opcode);
+    cap->clear_lock_cache_allowed(cap_bit);
+    if (cap->issued() & cap_bit) {
+      issue_caps(lock_cache->get_dir_inode(), cap);
+      still_issued = true;
+    }
+  }
+  if (still_issued)
+    return;
+
+  lock_cache->item_cap_lock_cache.remove_myself();
+  put_lock_cache(lock_cache);
+}
+
+/**
+ * Walk the lock caches attached to @p cap and release each one that is
+ * invalidating and whose cap bit is no longer issued on @p cap: it is
+ * unlinked from the cap's list and one reference is dropped.
+ */
+void Locker::eval_lock_caches(Capability *cap)
+{
+  auto it = cap->lock_caches.begin();
+  while (!it.end()) {
+    MDLockCache *cache = *it;
+    // step past the entry before it can be unlinked below
+    ++it;
+
+    if (cache->invalidating) {
+      const int cap_bit = get_cap_bit_for_lock_cache(cache->opcode);
+      if ((cap->issued() & cap_bit) == 0) {
+	cache->item_cap_lock_cache.remove_myself();
+	put_lock_cache(cache);
+      }
+    }
+  }
+}
+
+// Ask lock caches to release auth pins: invalidate every lock cache listed
+// in dir->lock_caches_with_auth_pins until that list is empty.
+// NOTE(review): the loop only ends if invalidate_lock_cache() removes the
+// front entry from this list (presumably via detach_dirfrags()) -- confirm.
+void Locker::invalidate_lock_caches(CDir *dir)
+{
+  dout(10) << "invalidate_lock_caches on " << *dir << dendl;
+  auto &pinned = dir->lock_caches_with_auth_pins;
+  for (;;) {
+    if (pinned.empty())
+      break;
+    auto&& head = pinned.front();
+    invalidate_lock_cache(head->parent);
+  }
+}
+
+// Ask lock caches to release locks: if @p lock is cached, invalidate every
+// lock cache returned by lock->get_active_caches().
+void Locker::invalidate_lock_caches(SimpleLock *lock)
+{
+  dout(10) << "invalidate_lock_caches " << *lock << " on " << *lock->get_parent() << dendl;
+  if (!lock->is_cached())
+    return;
+
+  auto&& active = lock->get_active_caches();
+  for (auto& cache : active) {
+    invalidate_lock_cache(cache);
+  }
+}
+
+/**
+ * Try to create an MDLockCache for the request's client and opcode on the
+ * directory inode @p diri, moving the request's relevant locks into it.
+ *
+ * This is a no-op when the request already has a lock cache, @p diri is not
+ * auth, the request has peers, the client has no cap on @p diri, a cache for
+ * this opcode already exists on that cap, any object or dirfrag involved
+ * cannot be auth-pinned, a dirfrag holds a freezing/frozen inode, or any
+ * relevant lock is unstable.
+ *
+ * @param mdr         the client request whose pins and locks seed the cache
+ * @param diri        directory inode the cache is built on
+ * @param dir_layout  optional layout copied into the cache when non-null
+ */
+void Locker::create_lock_cache(MDRequestRef& mdr, CInode *diri, file_layout_t *dir_layout)
+{
+  if (mdr->lock_cache)
+    return;
+
+  client_t client = mdr->get_client();
+  int opcode = mdr->client_request->get_op();
+  dout(10) << "create_lock_cache for client." << client << "/" << ceph_mds_op_name(opcode)<< " on " << *diri << dendl;
+
+  if (!diri->is_auth()) {
+    dout(10) << " dir inode is not auth, noop" << dendl;
+    return;
+  }
+
+  if (mdr->has_more() && !mdr->more()->peers.empty()) {
+    dout(10) << " there are peers requests for " << *mdr << ", noop" << dendl;
+    return;
+  }
+
+  Capability *cap = diri->get_client_cap(client);
+  if (!cap) {
+    dout(10) << " there is no cap for client." << client << ", noop" << dendl;
+    return;
+  }
+
+  // at most one lock cache per opcode on a given cap
+  for (auto p = cap->lock_caches.begin(); !p.end(); ++p) {
+    if ((*p)->opcode == opcode) {
+      dout(10) << " lock cache already exists for " << ceph_mds_op_name(opcode) << ", noop" << dendl;
+      return;
+    }
+  }
+
+  // collect the ancestor inodes of diri, following projected parent dentries
+  set<MDSCacheObject*> ancestors;
+  for (CInode *in = diri; ; ) {
+    CDentry *pdn = in->get_projected_parent_dn();
+    if (!pdn)
+      break;
+    // ancestors.insert(pdn);
+    in = pdn->get_dir()->get_inode();
+    ancestors.insert(in);
+  }
+
+  // every auth-pinned object we would take over must still be auth-pinnable
+  for (auto& p : mdr->object_states) {
+    if (p.first != diri && !ancestors.count(p.first))
+      continue;
+    auto& stat = p.second;
+    if (stat.auth_pinned) {
+      if (!p.first->can_auth_pin()) {
+	dout(10) << " can't auth_pin(freezing?) lock parent " << *p.first << ", noop" << dendl;
+	return;
+      }
+      // Guard the cast result: the filter above currently admits only
+      // inodes, but a non-inode object would make dynamic_cast return null.
+      if (CInode *in = dynamic_cast<CInode*>(p.first); in && in->is_parent_projected()) {
+	CDir *dir = in->get_projected_parent_dir();
+	if (!dir->can_auth_pin()) {
+	  dout(10) << " can't auth_pin(!auth|freezing?) dirfrag " << *dir << ", noop" << dendl;
+	  return;
+	}
+      }
+    }
+  }
+
+  std::vector<CDir*> dfv;
+  dfv.reserve(diri->get_num_dirfrags());
+
+  diri->get_dirfrags(dfv);
+  for (auto dir : dfv) {
+    if (!dir->is_auth() || !dir->can_auth_pin()) {
+      dout(10) << " can't auth_pin(!auth|freezing?) dirfrag " << *dir << ", noop" << dendl;
+      return;
+    }
+    if (dir->is_any_freezing_or_frozen_inode()) {
+      dout(10) << " there is freezing/frozen inode in " << *dir << ", noop" << dendl;
+      return;
+    }
+  }
+
+  // all locks on diri and its ancestors must be stable
+  for (auto& p : mdr->locks) {
+    MDSCacheObject *obj = p.lock->get_parent();
+    if (obj != diri && !ancestors.count(obj))
+      continue;
+    if (!p.lock->is_stable()) {
+      dout(10) << " unstable " << *p.lock << " on " << *obj << ", noop" << dendl;
+      return;
+    }
+  }
+
+  // all checks passed: build the cache
+  auto lock_cache = new MDLockCache(cap, opcode);
+  if (dir_layout)
+    lock_cache->set_dir_layout(*dir_layout);
+  cap->set_lock_cache_allowed(get_cap_bit_for_lock_cache(opcode));
+
+  for (auto dir : dfv) {
+    // prevent subtree migration
+    lock_cache->auth_pin(dir);
+    // prevent frozen inode
+    dir->disable_frozen_inode();
+  }
+
+  // mirror the request's pins on diri/ancestors and record the dirfrags
+  // that contain them
+  for (auto& p : mdr->object_states) {
+    if (p.first != diri && !ancestors.count(p.first))
+      continue;
+    auto& stat = p.second;
+    if (stat.auth_pinned)
+      lock_cache->auth_pin(p.first);
+    else
+      lock_cache->pin(p.first);
+
+    if (CInode *in = dynamic_cast<CInode*>(p.first)) {
+      CDentry *pdn = in->get_projected_parent_dn();
+      if (pdn)
+	dfv.push_back(pdn->get_dir());
+    } else if (CDentry *dn = dynamic_cast<CDentry*>(p.first)) {
+      dfv.push_back(dn->get_dir());
+    } else {
+      ceph_assert(0 == "unknown type of lock parent");
+    }
+  }
+  lock_cache->attach_dirfrags(std::move(dfv));
+
+  // move the relevant locks from the request into the cache
+  for (auto it = mdr->locks.begin(); it != mdr->locks.end(); ) {
+    MDSCacheObject *obj = it->lock->get_parent();
+    if (obj != diri && !ancestors.count(obj)) {
+      ++it;
+      continue;
+    }
+    unsigned lock_flag = 0;
+    if (it->is_wrlock()) {
+      // skip wrlocks that were added by MDCache::predirty_journal_parent()
+      if (obj == diri)
+	lock_flag = MutationImpl::LockOp::WRLOCK;
+    } else {
+      ceph_assert(it->is_rdlock());
+      lock_flag = MutationImpl::LockOp::RDLOCK;
+    }
+    if (lock_flag) {
+      lock_cache->emplace_lock(it->lock, lock_flag);
+      mdr->locks.erase(it++);
+    } else {
+      ++it;
+    }
+  }
+  lock_cache->attach_locks();
+
+  // the request holds one reference on the new cache
+  lock_cache->ref++;
+  mdr->lock_cache = lock_cache;
+}
+
+/**
+ * Attach an existing lock cache to @p mdr if one is available.
+ *
+ * Searches the lock caches on the requesting client's cap on @p diri for
+ * one whose opcode matches the request; on a match it takes a reference
+ * and stores the cache in mdr->lock_cache.
+ *
+ * @return true if @p mdr has a lock cache on return (already attached or
+ *         newly found), false if the client has no cap on @p diri or no
+ *         matching cache exists.
+ */
+bool Locker::find_and_attach_lock_cache(MDRequestRef& mdr, CInode *diri)
+{
+  if (mdr->lock_cache)
+    return true;
+
+  Capability *cap = diri->get_client_cap(mdr->get_client());
+  if (!cap)
+    return false;
+
+  const int opcode = mdr->client_request->get_op();
+  for (auto it = cap->lock_caches.begin(); !it.end(); ++it) {
+    MDLockCache *candidate = *it;
+    if (candidate->opcode != opcode)
+      continue;
+
+    dout(10) << "found lock cache for " << ceph_mds_op_name(opcode) << " on " << *diri << dendl;
+    mdr->lock_cache = candidate;
+    candidate->ref++;
+    return true;
+  }
+  return false;
+}
+
+// generics
+
+/*
+ * Check whether an unstable lock has finished gathering and, if so, move it
+ * to its next state.
+ *
+ * The lock must not be stable (asserted). The transition happens only when
+ * the lock is not gathering, no held rdlock/wrlock/xlock/lease conflicts
+ * with the next state, the auth side is not flushing, issued caps fit what
+ * the next state allows, and the lock is not in LOCK_SYNC_MIX2 or
+ * LOCK_MIX_SYNC2.
+ *
+ * On a replica the matching ack is sent to the auth mds; on the auth the
+ * replicas are told about the new state. Afterwards the state is set,
+ * waiters are woken, and a stable auth lock is auth-unpinned and
+ * re-evaluated through try_eval().
+ *
+ * first:       when true, caps issued beyond what the next state allows
+ *              set need_issue.
+ * pneed_issue: if non-null, set to true when caps need issuing; otherwise
+ *              caps are issued here.
+ * pfinishers:  if non-null, woken waiters are collected there instead of
+ *              being finished here.
+ */
+void Locker::eval_gather(SimpleLock *lock, bool first, bool *pneed_issue, MDSContext::vec *pfinishers)
+{
+  dout(10) << "eval_gather " << *lock << " on " << *lock->get_parent() << dendl;
+  ceph_assert(!lock->is_stable());
+
+  int next = lock->get_next_state();
+
+  // `in` stays null for dentry locks (CEPH_LOCK_DN)
+  CInode *in = 0;
+  bool caps = lock->get_cap_shift();
+  if (lock->get_type() != CEPH_LOCK_DN)
+    in = static_cast<CInode *>(lock->get_parent());
+
+  bool need_issue = false;
+
+  int loner_issued = 0, other_issued = 0, xlocker_issued = 0;
+  ceph_assert(!caps || in != NULL);
+  if (caps && in->is_head()) {
+    in->get_caps_issued(&loner_issued, &other_issued, &xlocker_issued,
+			lock->get_cap_shift(), lock->get_cap_mask());
+    dout(10) << " next state is " << lock->get_state_name(next) 
+	     << " issued/allows loner " << gcap_string(loner_issued)
+	     << "/" << gcap_string(lock->gcaps_allowed(CAP_LONER, next))
+	     << " xlocker " << gcap_string(xlocker_issued)
+	     << "/" << gcap_string(lock->gcaps_allowed(CAP_XLOCKER, next))
+	     << " other " << gcap_string(other_issued)
+	     << "/" << gcap_string(lock->gcaps_allowed(CAP_ANY, next))
+	     << dendl;
+
+    // on the first pass, flag a cap issue if more is issued than next allows
+    if (first && ((~lock->gcaps_allowed(CAP_ANY, next) & other_issued) ||
+		  (~lock->gcaps_allowed(CAP_LONER, next) & loner_issued) ||
+		  (~lock->gcaps_allowed(CAP_XLOCKER, next) & xlocker_issued)))
+      need_issue = true;
+  }
+
+  // true if x is non-zero and x <= AUTH on the auth, or x < AUTH on a replica
+#define IS_TRUE_AND_LT_AUTH(x, auth) (x && ((auth && x <= AUTH) || (!auth && x < AUTH)))
+  bool auth = lock->get_parent()->is_auth();
+  if (!lock->is_gathering() &&
+      (IS_TRUE_AND_LT_AUTH(lock->get_sm()->states[next].can_rdlock, auth) || !lock->is_rdlocked()) &&
+      (IS_TRUE_AND_LT_AUTH(lock->get_sm()->states[next].can_wrlock, auth) || !lock->is_wrlocked()) &&
+      (IS_TRUE_AND_LT_AUTH(lock->get_sm()->states[next].can_xlock, auth) || !lock->is_xlocked()) &&
+      (IS_TRUE_AND_LT_AUTH(lock->get_sm()->states[next].can_lease, auth) || !lock->is_leased()) &&
+      !(lock->get_parent()->is_auth() && lock->is_flushing()) &&  // i.e. wait for scatter_writebehind!
+      (!caps || ((~lock->gcaps_allowed(CAP_ANY, next) & other_issued) == 0 &&
+		 (~lock->gcaps_allowed(CAP_LONER, next) & loner_issued) == 0 &&
+		 (~lock->gcaps_allowed(CAP_XLOCKER, next) & xlocker_issued) == 0)) &&
+      lock->get_state() != LOCK_SYNC_MIX2 &&  // these states need an explicit trigger from the auth mds
+      lock->get_state() != LOCK_MIX_SYNC2
+      ) {
+    dout(7) << "eval_gather finished gather on " << *lock
+	    << " on " << *lock->get_parent() << dendl;
+
+    // file locks do not change state while the inode is (or needs to be) recovered
+    if (lock->get_sm() == &sm_filelock) {
+      ceph_assert(in);
+      if (in->state_test(CInode::STATE_RECOVERING)) {
+	dout(7) << "eval_gather finished gather, but still recovering" << dendl;
+	return;
+      } else if (in->state_test(CInode::STATE_NEEDSRECOVER)) {
+	dout(7) << "eval_gather finished gather, but need to recover" << dendl;
+	mds->mdcache->queue_file_recover(in);
+	mds->mdcache->do_file_recover();
+	return;
+      }
+    }
+
+    if (!lock->get_parent()->is_auth()) {
+      // replica: tell auth
+      // note: this mds_rank_t `auth` shadows the bool `auth` declared above
+      mds_rank_t auth = lock->get_parent()->authority().first;
+
+      if (lock->get_parent()->is_rejoining() &&
+	  mds->mdsmap->get_state(auth) == MDSMap::STATE_REJOIN) {
+	dout(7) << "eval_gather finished gather, but still rejoining "
+		<< *lock->get_parent() << dendl;
+	return;
+      }
+
+      // only send if the cluster is healthy or the auth has reached rejoin
+      if (!mds->is_cluster_degraded() ||
+	  mds->mdsmap->get_state(auth) >= MDSMap::STATE_REJOIN) {
+	switch (lock->get_state()) {
+	case LOCK_SYNC_LOCK:
+	  mds->send_message_mds(make_message<MLock>(lock, LOCK_AC_LOCKACK, mds->get_nodeid()), auth);
+	  break;
+
+	case LOCK_MIX_SYNC:
+	  {
+	    auto reply = make_message<MLock>(lock, LOCK_AC_SYNCACK, mds->get_nodeid());
+	    lock->encode_locked_state(reply->get_data());
+	    mds->send_message_mds(reply, auth);
+	    next = LOCK_MIX_SYNC2;
+	    (static_cast<ScatterLock *>(lock))->start_flush();
+	  }
+	  break;
+
+	// NOTE(review): the enclosing condition excludes LOCK_MIX_SYNC2 and
+	// LOCK_SYNC_MIX2, so the next two cases appear unreachable -- confirm.
+	case LOCK_MIX_SYNC2:
+	  (static_cast<ScatterLock *>(lock))->finish_flush();
+	  (static_cast<ScatterLock *>(lock))->clear_flushed();
+	  // no break here: execution continues into the LOCK_SYNC_MIX2 case
+
+	case LOCK_SYNC_MIX2:
+	  // do nothing, we already acked
+	  break;
+	  
+	case LOCK_SYNC_MIX:
+	  { 
+	    auto reply = make_message<MLock>(lock, LOCK_AC_MIXACK, mds->get_nodeid());
+	    mds->send_message_mds(reply, auth);
+	    next = LOCK_SYNC_MIX2;
+	  }
+	  break;
+
+	case LOCK_MIX_LOCK:
+	  {
+	    bufferlist data;
+	    lock->encode_locked_state(data);
+	    mds->send_message_mds(make_message<MLock>(lock, LOCK_AC_LOCKACK, mds->get_nodeid(), data), auth);
+	    (static_cast<ScatterLock *>(lock))->start_flush();
+	    // we'll get an AC_LOCKFLUSHED to complete
+	  }
+	  break;
+
+	default:
+	  ceph_abort();
+	}
+      }
+    } else {
+      // auth
+
+      // once the first (local) stage of mix->lock gather complete we can
+      // gather from replicas
+      if (lock->get_state() == LOCK_MIX_LOCK &&
+	  lock->get_parent()->is_replicated()) {
+	dout(10) << " finished (local) gather for mix->lock, now gathering from replicas" << dendl;
+	send_lock_message(lock, LOCK_AC_LOCK);
+	lock->init_gather();
+	lock->set_state(LOCK_MIX_LOCK2);
+	return;
+      }
+
+      // dirty and not yet flushed: start writebehind and return without
+      // changing state
+      if (lock->is_dirty() && !lock->is_flushed()) {
+	scatter_writebehind(static_cast<ScatterLock *>(lock));
+	return;
+      }
+      lock->clear_flushed();
+      
+      switch (lock->get_state()) {
+	// to mixed
+      case LOCK_TSYN_MIX:
+      case LOCK_SYNC_MIX:
+      case LOCK_EXCL_MIX:
+      case LOCK_XSYN_MIX:
+	in->start_scatter(static_cast<ScatterLock *>(lock));
+	if (lock->get_parent()->is_replicated()) {
+	  bufferlist softdata;
+	  lock->encode_locked_state(softdata);
+	  send_lock_message(lock, LOCK_AC_MIX, softdata);
+	}
+	(static_cast<ScatterLock *>(lock))->clear_scatter_wanted();
+	break;
+
+      case LOCK_XLOCK:
+      case LOCK_XLOCKDONE:
+	if (next != LOCK_SYNC)
+	  break;
+	// fall-thru
+
+	// to sync
+      case LOCK_EXCL_SYNC:
+      case LOCK_LOCK_SYNC:
+      case LOCK_MIX_SYNC:
+      case LOCK_XSYN_SYNC:
+	if (lock->get_parent()->is_replicated()) {
+	  bufferlist softdata;
+	  lock->encode_locked_state(softdata);
+	  send_lock_message(lock, LOCK_AC_SYNC, softdata);
+	}
+	break;
+      }
+
+    }
+
+    lock->set_state(next);
+    
+    // a stable auth lock drops the auth pin taken on its parent
+    if (lock->get_parent()->is_auth() &&
+	lock->is_stable())
+      lock->get_parent()->auth_unpin(lock);
+
+    // drop loner before doing waiters
+    if (caps &&
+	in->is_head() &&
+	in->is_auth() &&
+	in->get_wanted_loner() != in->get_loner()) {
+      dout(10) << "  trying to drop loner" << dendl;
+      if (in->try_drop_loner()) {
+	dout(10) << "  dropped loner" << dendl;
+	need_issue = true;
+      }
+    }
+
+    // hand waiters to the caller if it asked for them, else finish them now
+    if (pfinishers)
+      lock->take_waiting(SimpleLock::WAIT_STABLE|SimpleLock::WAIT_WR|SimpleLock::WAIT_RD|SimpleLock::WAIT_XLOCK,
+			 *pfinishers);
+    else
+      lock->finish_waiters(SimpleLock::WAIT_STABLE|SimpleLock::WAIT_WR|SimpleLock::WAIT_RD|SimpleLock::WAIT_XLOCK);
+    
+    if (caps && in->is_head())
+      need_issue = true;
+
+    if (lock->get_parent()->is_auth() &&
+	lock->is_stable())
+      try_eval(lock, &need_issue);
+  }
+
+  // report the pending cap issue to the caller, or issue caps ourselves
+  if (need_issue) {
+    if (pneed_issue)
+      *pneed_issue = true;
+    else if (in->is_head())
+      issue_caps(in);
+  }
+
+}
+
+/**
+ * Evaluate the inode locks selected by @p mask on @p in.
+ *
+ * On an auth head inode the ideal loner is chosen first; if the loner
+ * changed, or the wanted loner could not be set, all locks are evaluated
+ * (mask is forced to -1). After the locks have been evaluated, a loner that
+ * no longer matches the wanted loner is dropped if possible; if another
+ * loner is wanted it is set and the locks are evaluated again. Collected
+ * waiters are then finished and caps are issued when needed.
+ *
+ * @param in             inode whose locks are evaluated
+ * @param mask           bitmask of CEPH_LOCK_I* lock types to evaluate
+ * @param caps_imported  passed through to eval_any(); also seeds need_issue
+ * @return true if a cap issue was needed
+ */
+bool Locker::eval(CInode *in, int mask, bool caps_imported)
+{
+  bool need_issue = caps_imported;
+  MDSContext::vec finishers;
+  
+  dout(10) << "eval " << mask << " " << *in << dendl;
+
+  // choose loner?
+  if (in->is_auth() && in->is_head()) {
+    client_t orig_loner = in->get_loner();
+    if (in->choose_ideal_loner()) {
+      dout(10) << "eval set loner: client." << orig_loner << " -> client." << in->get_loner() << dendl;
+      need_issue = true;
+      mask = -1;
+    } else if (in->get_wanted_loner() != in->get_loner()) {
+      dout(10) << "eval want loner: client." << in->get_wanted_loner() << " but failed to set it" << dendl;
+      mask = -1;
+    }
+  }
+
+ retry:
+  if (mask & CEPH_LOCK_IFILE)
+    eval_any(&in->filelock, &need_issue, &finishers, caps_imported);
+  if (mask & CEPH_LOCK_IAUTH)
+    eval_any(&in->authlock, &need_issue, &finishers, caps_imported);
+  if (mask & CEPH_LOCK_ILINK)
+    eval_any(&in->linklock, &need_issue, &finishers, caps_imported);
+  if (mask & CEPH_LOCK_IXATTR)
+    eval_any(&in->xattrlock, &need_issue, &finishers, caps_imported);
+  if (mask & CEPH_LOCK_INEST)
+    eval_any(&in->nestlock, &need_issue, &finishers, caps_imported);
+  if (mask & CEPH_LOCK_IFLOCK)
+    eval_any(&in->flocklock, &need_issue, &finishers, caps_imported);
+  if (mask & CEPH_LOCK_IPOLICY)
+    eval_any(&in->policylock, &need_issue, &finishers, caps_imported);
+
+  // drop loner?
+  if (in->is_auth() && in->is_head() && in->get_wanted_loner() != in->get_loner()) {
+    if (in->try_drop_loner()) {
+      need_issue = true;
+      if (in->get_wanted_loner() >= 0) {
+	// Log the loner about to be set: at this point the old loner has
+	// been dropped and the new one is not yet installed.
+	dout(10) << "eval end set loner to client." << in->get_wanted_loner() << dendl;
+	bool ok = in->try_set_loner();
+	ceph_assert(ok);
+	// the loner changed, so re-evaluate every lock
+	mask = -1;
+	goto retry;
+      }
+    }
+  }
+
+  finish_contexts(g_ceph_context, finishers);
+
+  if (need_issue && in->is_head())
+    issue_caps(in);
+
+  dout(10) << "eval done" << dendl;
+  return need_issue;
+}
+
+/**
+ * Waiter context that re-runs Locker::try_eval() on a cache object with a
+ * saved lock mask. The object is pinned with PIN_PTRWAITER from
+ * construction until finish() has run.
+ */
+class C_Locker_Eval : public LockerContext {
+  MDSCacheObject *target;
+  int lock_mask;
+public:
+  C_Locker_Eval(Locker *l, MDSCacheObject *obj, int m)
+    : LockerContext(l),
+      target(obj),
+      lock_mask(m)
+  {
+    // We are used as an MDSCacheObject waiter, so should
+    // only be invoked by someone already holding the big lock.
+    ceph_assert(ceph_mutex_is_locked_by_me(locker->mds->mds_lock));
+    target->get(MDSCacheObject::PIN_PTRWAITER);
+  }
+
+  void finish(int r) override {
+    locker->try_eval(target, lock_mask);
+    target->put(MDSCacheObject::PIN_PTRWAITER);
+  }
+};
+
+/**
+ * Evaluate the locks selected by @p mask on cache object @p p, deferring if
+ * the object is not in a state where that is possible.
+ *
+ * If @p p has ambiguous auth, or is auth and frozen, a C_Locker_Eval waiter
+ * is registered and the function returns. Otherwise a CEPH_LOCK_DN mask
+ * (which must be the only bit set) evaluates the dentry lock, and any other
+ * mask is passed to eval() with @p p treated as an inode.
+ */
+void Locker::try_eval(MDSCacheObject *p, int mask)
+{
+  // unstable and ambiguous auth?
+  if (p->is_ambiguous_auth()) {
+    dout(7) << "try_eval ambiguous auth, waiting on " << *p << dendl;
+    p->add_waiter(MDSCacheObject::WAIT_SINGLEAUTH, new C_Locker_Eval(this, p, mask));
+    return;
+  }
+
+  if (p->is_auth() && p->is_frozen()) {
+    dout(7) << "try_eval frozen, waiting on " << *p << dendl;
+    p->add_waiter(MDSCacheObject::WAIT_UNFREEZE, new C_Locker_Eval(this, p, mask));
+    return;
+  }
+
+  if (!(mask & CEPH_LOCK_DN)) {
+    eval(static_cast<CInode *>(p), mask);
+    return;
+  }
+
+  ceph_assert(mask == CEPH_LOCK_DN);
+  bool ignored_issue = false;  // ignore this, no caps on dentries
+  auto *dentry = static_cast<CDentry *>(p);
+  eval_any(&dentry->lock, &ignored_issue);
+}
+
+/**
+ * Evaluate a single lock, deferring or skipping when its parent object is
+ * not in a suitable state.
+ *
+ * Nothing is done on a non-auth parent. If the parent has ambiguous auth or
+ * is frozen, a C_Locker_Eval waiter is registered instead. A scatterlock
+ * with a pending scatter/unscatter request is kicked before the freezing
+ * check; if that leaves it unstable the function returns. Locks other than
+ * DN, ISNAP and IPOLICY then wait while the parent is freezing. Otherwise
+ * the lock is passed to eval().
+ */
+void Locker::try_eval(SimpleLock *lock, bool *pneed_issue)
+{
+  MDSCacheObject *parent = lock->get_parent();
+  const auto lock_type = lock->get_type();
+
+  // unstable and ambiguous auth?
+  if (parent->is_ambiguous_auth()) {
+    dout(7) << "try_eval " << *lock << " ambiguousauth, waiting on " << *parent << dendl;
+    parent->add_waiter(MDSCacheObject::WAIT_SINGLEAUTH, new C_Locker_Eval(this, parent, lock_type));
+    return;
+  }
+
+  if (!parent->is_auth()) {
+    dout(7) << "try_eval " << *lock << " not auth for " << *parent << dendl;
+    return;
+  }
+
+  if (parent->is_frozen()) {
+    dout(7) << "try_eval " << *lock << " frozen, waiting on " << *parent << dendl;
+    parent->add_waiter(MDSCacheObject::WAIT_UNFREEZE, new C_Locker_Eval(this, parent, lock_type));
+    return;
+  }
+
+  /*
+   * We could have a situation like:
+   *
+   * - mds A authpins item on mds B
+   * - mds B starts to freeze tree containing item
+   * - mds A tries wrlock_start on A, sends REQSCATTER to B
+   * - mds B lock is unstable, sets scatter_wanted
+   * - mds B lock stabilizes, calls try_eval.
+   *
+   * We can defer while freezing without causing a deadlock.  Honor
+   * scatter_wanted flag here.  This will never get deferred by the
+   * checks above due to the auth_pin held by the leader.
+   */
+  if (lock->is_scatterlock()) {
+    auto *scatter = static_cast<ScatterLock *>(lock);
+    bool kicked = false;
+    if (scatter->get_scatter_wanted() &&
+	scatter->get_state() != LOCK_MIX) {
+      scatter_mix(scatter, pneed_issue);
+      kicked = true;
+    } else if (scatter->get_unscatter_wanted() &&
+	       scatter->get_state() != LOCK_LOCK) {
+      simple_lock(scatter, pneed_issue);
+      kicked = true;
+    }
+    // the transition is still in flight; nothing more to evaluate now
+    if (kicked && !lock->is_stable())
+      return;
+  }
+
+  const bool waits_for_freeze =
+    lock_type != CEPH_LOCK_DN &&
+    lock_type != CEPH_LOCK_ISNAP &&
+    lock_type != CEPH_LOCK_IPOLICY;
+  if (waits_for_freeze && parent->is_freezing()) {
+    dout(7) << "try_eval " << *lock << " freezing, waiting on " << *parent << dendl;
+    parent->add_waiter(MDSCacheObject::WAIT_UNFREEZE, new C_Locker_Eval(this, parent, lock_type));
+    return;
+  }
+
+  eval(lock, pneed_issue);
+}
+
+/**
+ * Run eval_gather() on each of an inode's filelock, authlock, linklock
+ * and xattrlock that is currently unstable.
+ *
+ * If that reports caps need issuing and the inode is a head inode, the
+ * inode is added to @p issue_set when one is given, otherwise its caps
+ * are issued immediately.  Collected waiters are completed last.
+ */
+void Locker::eval_cap_gather(CInode *in, set<CInode*> *issue_set)
+{
+  MDSContext::vec finishers;
+  bool need_issue = false;
+
+  // kick locks now; the generic lambda keeps each lock's static type
+  auto kick = [&](auto *lk) {
+    if (!lk->is_stable())
+      eval_gather(lk, false, &need_issue, &finishers);
+  };
+  kick(&in->filelock);
+  kick(&in->authlock);
+  kick(&in->linklock);
+  kick(&in->xattrlock);
+
+  if (need_issue && in->is_head()) {
+    if (!issue_set)
+      issue_caps(in);
+    else
+      issue_set->insert(in);
+  }
+
+  finish_contexts(g_ceph_context, finishers);
+}
+
+/**
+ * Run eval_gather() on each of an inode's filelock, nestlock and
+ * dirfragtreelock that is currently unstable, issue caps on a head inode
+ * if that was requested, then complete the collected waiters.
+ */
+void Locker::eval_scatter_gathers(CInode *in)
+{
+  MDSContext::vec finishers;
+  bool need_issue = false;
+
+  dout(10) << "eval_scatter_gathers " << *in << dendl;
+
+  // kick locks now; the generic lambda keeps each lock's static type
+  auto kick = [&](auto *lk) {
+    if (!lk->is_stable())
+      eval_gather(lk, false, &need_issue, &finishers);
+  };
+  kick(&in->filelock);
+  kick(&in->nestlock);
+  kick(&in->dirfragtreelock);
+
+  if (need_issue && in->is_head()) {
+    issue_caps(in);
+  }
+
+  finish_contexts(g_ceph_context, finishers);
+}
+
+/**
+ * Dispatch a lock to the evaluator for its type: file_eval() for IFILE,
+ * scatter_eval() for IDFT and INEST, simple_eval() for everything else.
+ */
+void Locker::eval(SimpleLock *lock, bool *need_issue)
+{
+  const auto type = lock->get_type();
+
+  if (type == CEPH_LOCK_IFILE) {
+    file_eval(static_cast<ScatterLock*>(lock), need_issue);
+  } else if (type == CEPH_LOCK_IDFT || type == CEPH_LOCK_INEST) {
+    scatter_eval(static_cast<ScatterLock*>(lock), need_issue);
+  } else {
+    simple_eval(lock, need_issue);
+  }
+}
+
+
+// ------------------
+// rdlock
+
+/**
+ * Try to move a lock toward a state in which it can be rdlocked.
+ *
+ * Stable lock, auth parent: start a state change (simple_sync(), or
+ * file_xsyn() for an EXCL file lock on a non-directory with a target
+ * loner when not as_anon) and return true.
+ * Stable lock, replica parent: send LOCK_AC_REQRDLOCK to the auth unless
+ * the cluster is degraded and the auth is not clientreplay/active/stopping;
+ * return false.
+ * Unstable lock: nothing is kicked; for a file lock on a recovering inode
+ * the inode's recovery is prioritized; return false.
+ *
+ * @param lock     lock to kick
+ * @param as_anon  caller wants SYNC rather than XSYN on a file lock
+ * @return true only if a local state change was started
+ */
+bool Locker::_rdlock_kick(SimpleLock *lock, bool as_anon)
+{
+  // kick the lock
+  if (lock->is_stable()) {
+    if (lock->get_parent()->is_auth()) {
+      if (lock->get_sm() == &sm_scatterlock) {
+	// not until tempsync is fully implemented
+	//if (lock->get_parent()->is_replicated())
+	//scatter_tempsync((ScatterLock*)lock);
+	//else
+	simple_sync(lock);
+      } else if (lock->get_sm() == &sm_filelock) {
+	CInode *in = static_cast<CInode*>(lock->get_parent());
+	if (lock->get_state() == LOCK_EXCL &&
+	    in->get_target_loner() >= 0 &&
+	    !in->is_dir() && !as_anon)   // as_anon => caller wants SYNC, not XSYN
+	  file_xsyn(lock);
+	else
+	  simple_sync(lock);
+      } else
+	simple_sync(lock);
+      return true;
+    } else {
+      // request rdlock state change from auth
+      mds_rank_t auth = lock->get_parent()->authority().first;
+      if (!mds->is_cluster_degraded() ||
+	  mds->mdsmap->is_clientreplay_or_active_or_stopping(auth)) {
+	dout(10) << "requesting rdlock from auth on "
+		 << *lock << " on " << *lock->get_parent() << dendl;
+	mds->send_message_mds(make_message<MLock>(lock, LOCK_AC_REQRDLOCK, mds->get_nodeid()), auth);
+      }
+      return false;
+    }
+  }
+  // lock is unstable: the only thing we can do is speed up file recovery
+  if (lock->get_type() == CEPH_LOCK_IFILE) {
+    CInode *in = static_cast<CInode *>(lock->get_parent());
+    if (in->state_test(CInode::STATE_RECOVERING)) {
+      mds->mdcache->recovery_queue.prioritize(in);
+    }
+  }
+
+  return false;
+}
+
+/**
+ * Check whether @p client can rdlock @p lock, kicking the lock once via
+ * _rdlock_kick() if it cannot right away.  No rdlock reference is taken
+ * here.
+ *
+ * @return true if the lock is rdlockable, before or after the kick
+ */
+bool Locker::rdlock_try(SimpleLock *lock, client_t client)
+{
+  dout(7) << "rdlock_try on " << *lock << " on " << *lock->get_parent() << dendl;
+
+  // readable already?
+  if (lock->can_rdlock(client)) {
+    return true;
+  }
+
+  // nudge the lock and look again
+  _rdlock_kick(lock, false);
+  return lock->can_rdlock(client) ? true : false;
+}
+
+/**
+ * Acquire an rdlock on @p lock for request @p mut, or queue a retry.
+ *
+ * Loops while _rdlock_kick() reports progress.  On success the rdlock
+ * reference is taken and recorded in the request.  On failure a
+ * C_MDS_RetryRequest waiter is queued on the lock (WAIT_RD when the lock
+ * is stable on an auth parent, WAIT_STABLE otherwise) and the log is
+ * nudged.
+ *
+ * @param lock     lock to rdlock
+ * @param mut      request that will hold the lock
+ * @param as_anon  evaluate as client -1; forced on when the request
+ *                 targets a snapshot (snapid != CEPH_NOSNAP)
+ * @return true if the rdlock was taken, false if the caller must wait
+ */
+bool Locker::rdlock_start(SimpleLock *lock, MDRequestRef& mut, bool as_anon)
+{
+  dout(7) << "rdlock_start  on " << *lock << " on " << *lock->get_parent() << dendl;  
+
+  // client may be allowed to rdlock the same item it has xlocked.
+  //  UNLESS someone passes in as_anon, or we're reading snapped version here.
+  if (mut->snapid != CEPH_NOSNAP)
+    as_anon = true;
+  client_t client = as_anon ? -1 : mut->get_client();
+
+  // inode owning the lock; stays null for dentry locks
+  CInode *in = 0;
+  if (lock->get_type() != CEPH_LOCK_DN)
+    in = static_cast<CInode *>(lock->get_parent());
+
+  /*
+  if (!lock->get_parent()->is_auth() &&
+      lock->fw_rdlock_to_auth()) {
+    mdcache->request_forward(mut, lock->get_parent()->authority().first);
+    return false;
+  }
+  */
+
+  while (1) {
+    // can read?  grab ref.
+    if (lock->can_rdlock(client)) {
+      lock->get_rdlock();
+      mut->emplace_lock(lock, MutationImpl::LockOp::RDLOCK);
+      return true;
+    }
+
+    // hmm, wait a second.
+    if (in && !in->is_head() && in->is_auth() &&
+	lock->get_state() == LOCK_SNAP_SYNC) {
+      // okay, we actually need to kick the head's lock to get ourselves synced up.
+      CInode *head = mdcache->get_inode(in->ino());
+      ceph_assert(head);
+      // prefer the head's file lock; fall back to the same lock type as
+      // ours when the file lock is already SYNC
+      SimpleLock *hlock = head->get_lock(CEPH_LOCK_IFILE);
+      if (hlock->get_state() == LOCK_SYNC)
+	hlock = head->get_lock(lock->get_type());
+
+      if (hlock->get_state() != LOCK_SYNC) {
+	dout(10) << "rdlock_start trying head inode " << *head << dendl;
+	if (!rdlock_start(hlock, mut, true)) // ** as_anon, no rdlock on EXCL **
+	  return false;
+	// oh, check our lock again then
+      }
+    }
+
+    // stop looping once the kick cannot make local progress
+    if (!_rdlock_kick(lock, as_anon))
+      break;
+  }
+
+  // wait!
+  int wait_on;
+  if (lock->get_parent()->is_auth() && lock->is_stable())
+    wait_on = SimpleLock::WAIT_RD;
+  else
+    wait_on = SimpleLock::WAIT_STABLE;  // REQRDLOCK is ignored if lock is unstable, so we need to retry.
+  dout(7) << "rdlock_start waiting on " << *lock << " on " << *lock->get_parent() << dendl;
+  lock->add_waiter(wait_on, new C_MDS_RetryRequest(mdcache, mut));
+  nudge_log(lock);
+  return false;
+}
+
+/**
+ * Flush the MDS log if @p lock belongs to an auth object and is unstable
+ * and locked (as with xlockdone, or a cap flush).
+ */
+void Locker::nudge_log(SimpleLock *lock)
+{
+  dout(10) << "nudge_log " << *lock << " on " << *lock->get_parent() << dendl;
+
+  if (!lock->get_parent()->is_auth()) {
+    return;
+  }
+  if (lock->is_unstable_and_locked()) {
+    mds->mdlog->flush();
+  }
+}
+
+/**
+ * Drop one rdlock reference and, if it was the last, re-evaluate the lock.
+ *
+ * @param it           entry of the rdlock in the mutation's lock set
+ * @param mut          owning mutation; when non-null the entry is erased
+ * @param pneed_issue  passed through to eval_gather()/try_eval()
+ */
+void Locker::rdlock_finish(const MutationImpl::lock_iterator& it, MutationImpl *mut, bool *pneed_issue)
+{
+  ceph_assert(it->is_rdlock());
+  SimpleLock *lock = it->lock;
+
+  // release the reference first, then forget the entry
+  lock->put_rdlock();
+  if (mut) {
+    mut->locks.erase(it);
+  }
+
+  dout(7) << "rdlock_finish on " << *lock << " on " << *lock->get_parent() << dendl;
+
+  // other readers remain: nothing to evaluate
+  if (lock->is_rdlocked()) {
+    return;
+  }
+
+  if (!lock->is_stable()) {
+    eval_gather(lock, false, pneed_issue);
+  } else if (lock->get_parent()->is_auth()) {
+    try_eval(lock, pneed_issue);
+  }
+}
+
+/**
+ * Try to rdlock every lock in @p lov for request @p mdr without blocking.
+ *
+ * On the first lock that cannot be rdlocked, a retry waiter is queued on
+ * that lock, all of the request's locks and local auth pins are dropped,
+ * and false is returned.
+ *
+ * @return true if every lock in the vector was rdlocked
+ */
+bool Locker::rdlock_try_set(MutationImpl::LockOpVec& lov, MDRequestRef& mdr)
+{
+  dout(10) << __func__  << dendl;
+  for (const auto& op : lov) {
+    auto lock = op.lock;
+    ceph_assert(op.is_rdlock());
+
+    const bool unavailable =
+      !mdr->is_rdlocked(lock) && !rdlock_try(lock, mdr->get_client());
+    if (unavailable) {
+      lock->add_waiter(SimpleLock::WAIT_STABLE|SimpleLock::WAIT_RD,
+                       new C_MDS_RetryRequest(mdcache, mdr));
+      // give back everything taken so far; the request gets retried
+      dout(10) << __func__ << " failed" << dendl;
+      drop_locks(mdr.get(), nullptr);
+      mdr->drop_local_auth_pins();
+      return false;
+    }
+
+    lock->get_rdlock();
+    mdr->emplace_lock(lock, MutationImpl::LockOp::RDLOCK);
+    dout(20) << " got rdlock on " << *lock << " " << *lock->get_parent() << dendl;
+  }
+
+  return true;
+}
+
+/**
+ * Rdlock every lock in @p lov for @p mut, stopping at the first one the
+ * mutation's client cannot rdlock.  Locks taken before the failure are
+ * left recorded in @p mut; nothing is kicked and no waiter is queued.
+ *
+ * @return true if every lock in the vector was rdlocked
+ */
+bool Locker::rdlock_try_set(MutationImpl::LockOpVec& lov, MutationRef& mut)
+{
+  dout(10) << __func__  << dendl;
+  for (const auto& op : lov) {
+    auto lock = op.lock;
+    ceph_assert(op.is_rdlock());
+    if (!lock->can_rdlock(mut->get_client())) {
+      return false;
+    }
+    lock->get_rdlock();
+    mut->emplace_lock(lock, MutationImpl::LockOp::RDLOCK);
+  }
+  return true;
+}
+
+// ------------------
+// wrlock
+
+/**
+ * Take a wrlock without checking whether the lock state allows it.
+ * IVERSION/DVERSION locks are delegated to local_wrlock_grab().
+ */
+void Locker::wrlock_force(SimpleLock *lock, MutationRef& mut)
+{
+  switch (lock->get_type()) {
+  case CEPH_LOCK_IVERSION:
+  case CEPH_LOCK_DVERSION:
+    local_wrlock_grab(static_cast<LocalLockC*>(lock), mut);
+    return;
+  default:
+    break;
+  }
+
+  dout(7) << "wrlock_force  on " << *lock
+	  << " on " << *lock->get_parent() << dendl;
+  lock->get_wrlock(true);
+  mut->emplace_lock(lock, MutationImpl::LockOp::WRLOCK);
+}
+
+/**
+ * Try to take a wrlock without queueing a waiter.
+ *
+ * Loops calling simple_lock() on the lock as long as it is stable, its
+ * inode is auth, it is not dirty, and no scatter is wanted or implied by
+ * a subtree/exporting dirfrag.
+ *
+ * @param lock    lock to wrlock (cast to ScatterLock below)
+ * @param mut     mutation that will hold the lock
+ * @param client  client to check against; -1 means use mut's client
+ * @return true if the wrlock was taken and recorded in @p mut
+ */
+bool Locker::wrlock_try(SimpleLock *lock, const MutationRef& mut, client_t client)
+{
+  dout(10) << "wrlock_try " << *lock << " on " << *lock->get_parent() << dendl;
+  if (client == -1)
+    client = mut->get_client();
+
+  while (1) {
+    if (lock->can_wrlock(client)) {
+      lock->get_wrlock();
+      auto it = mut->emplace_lock(lock, MutationImpl::LockOp::WRLOCK);
+      it->flags |= MutationImpl::LockOp::WRLOCK; // may already remote_wrlocked
+      return true;
+    }
+    if (!lock->is_stable())
+      break;
+    CInode *in = static_cast<CInode *>(lock->get_parent());
+    if (!in->is_auth())
+      break;
+    // The caller may already have a log entry open.  To avoid calling
+    // scatter_writebehind or start_scatter, don't change the nest lock
+    // state if it has dirty scatterdata.
+    if (lock->is_dirty())
+      break;
+    // To avoid calling scatter_writebehind or start_scatter, don't
+    // change the nest lock state to MIX.
+    ScatterLock *slock = static_cast<ScatterLock*>(lock);
+    if (slock->get_scatter_wanted() || in->has_subtree_or_exporting_dirfrag())
+      break;
+
+    simple_lock(lock);
+  }
+  return false;
+}
+
+/**
+ * Acquire a wrlock for request @p mut, or queue a retry.
+ *
+ * IVERSION/DVERSION locks are delegated to local_wrlock_start().  On an
+ * auth inode the lock is driven toward MIX (scatter_mix) when a scatter
+ * is wanted, otherwise toward LOCK (simple_lock).  On a replica a
+ * LOCK_AC_REQSCATTER is sent to the auth unless the cluster is degraded
+ * and the auth is not clientreplay/active/stopping.
+ *
+ * @param op   lock operation; a state-pin op is checked against the lock's
+ *             excl client instead of the request's client
+ * @param mut  request that will hold the lock
+ * @return true if the wrlock was taken; false if a WAIT_STABLE retry
+ *         waiter was queued
+ */
+bool Locker::wrlock_start(const MutationImpl::LockOp &op, MDRequestRef& mut)
+{
+  SimpleLock *lock = op.lock;
+  if (lock->get_type() == CEPH_LOCK_IVERSION ||
+      lock->get_type() == CEPH_LOCK_DVERSION)
+    return local_wrlock_start(static_cast<LocalLockC*>(lock), mut);
+
+  dout(10) << "wrlock_start " << *lock << " on " << *lock->get_parent() << dendl;
+
+  CInode *in = static_cast<CInode *>(lock->get_parent());
+  client_t client = op.is_state_pin() ? lock->get_excl_client() : mut->get_client();
+  // computed once up front; not refreshed inside the loop
+  bool want_scatter = lock->get_parent()->is_auth() &&
+		      (in->has_subtree_or_exporting_dirfrag() ||
+		       static_cast<ScatterLock*>(lock)->get_scatter_wanted());
+
+  while (1) {
+    // wrlock?
+    if (lock->can_wrlock(client) &&
+	(!want_scatter || lock->get_state() == LOCK_MIX)) {
+      lock->get_wrlock();
+      auto it = mut->emplace_lock(lock, MutationImpl::LockOp::WRLOCK);
+      it->flags |= MutationImpl::LockOp::WRLOCK; // may already remote_wrlocked
+      return true;
+    }
+
+    if (lock->get_type() == CEPH_LOCK_IFILE &&
+	in->state_test(CInode::STATE_RECOVERING)) {
+      mds->mdcache->recovery_queue.prioritize(in);
+    }
+
+    // a transition is already in flight; wait for it below
+    if (!lock->is_stable())
+      break;
+
+    if (in->is_auth()) {
+      if (want_scatter)
+	scatter_mix(static_cast<ScatterLock*>(lock));
+      else
+	simple_lock(lock);
+    } else {
+      // replica.
+      // auth should be auth_pinned (see acquire_locks wrlock weird mustpin case).
+      mds_rank_t auth = lock->get_parent()->authority().first;
+      if (!mds->is_cluster_degraded() ||
+	  mds->mdsmap->is_clientreplay_or_active_or_stopping(auth)) {
+	dout(10) << "requesting scatter from auth on "
+		 << *lock << " on " << *lock->get_parent() << dendl;
+	mds->send_message_mds(make_message<MLock>(lock, LOCK_AC_REQSCATTER, mds->get_nodeid()), auth);
+      }
+      break;
+    }
+  }
+
+  dout(7) << "wrlock_start waiting on " << *lock << " on " << *lock->get_parent() << dendl;
+  lock->add_waiter(SimpleLock::WAIT_STABLE, new C_MDS_RetryRequest(mdcache, mut));
+  nudge_log(lock);
+    
+  return false;
+}
+
+/**
+ * Drop one wrlock reference and re-evaluate the lock where appropriate.
+ *
+ * IVERSION/DVERSION locks are delegated to local_wrlock_finish().  If the
+ * entry also carries a remote wrlock only its wrlock flag is cleared;
+ * otherwise the entry is erased from the mutation.
+ *
+ * @param it           entry of the wrlock in the mutation's lock set
+ * @param mut          owning mutation
+ * @param pneed_issue  passed through to eval_gather()/try_eval()
+ */
+void Locker::wrlock_finish(const MutationImpl::lock_iterator& it, MutationImpl *mut, bool *pneed_issue)
+{
+  ceph_assert(it->is_wrlock());
+  SimpleLock* lock = it->lock;
+
+  if (lock->get_type() == CEPH_LOCK_IVERSION ||
+      lock->get_type() == CEPH_LOCK_DVERSION)
+    return local_wrlock_finish(it, mut);
+
+  dout(7) << "wrlock_finish on " << *lock << " on " << *lock->get_parent() << dendl;
+  lock->put_wrlock();
+
+  // keep the entry alive while it still represents a remote wrlock
+  if (it->is_remote_wrlock())
+    it->clear_wrlock();
+  else
+    mut->locks.erase(it);
+
+  if (lock->is_wrlocked()) {
+    // Evaluate unstable lock after scatter_writebehind_finish(). Because
+    // eval_gather() does not change lock's state when lock is flushing.
+    if (!lock->is_stable() && lock->is_flushed() &&
+	lock->get_parent()->is_auth())
+      eval_gather(lock, false, pneed_issue);
+  } else {
+    // last wrlock dropped
+    if (!lock->is_stable())
+      eval_gather(lock, false, pneed_issue);
+    else if (lock->get_parent()->is_auth())
+      try_eval(lock, pneed_issue);
+  }
+}
+
+
+// remote wrlock
+
+/**
+ * Ask mds.@p target for a wrlock on @p lock on behalf of request @p mut.
+ *
+ * If the cluster is degraded and the target is not
+ * clientreplay/active/stopping, a retry is queued (only when the request
+ * is not already waiting on a peer) and nothing is sent.  Otherwise an
+ * OP_WRLOCK peer request is sent and the target is added to the request's
+ * waiting_on_peer set.
+ */
+void Locker::remote_wrlock_start(SimpleLock *lock, mds_rank_t target, MDRequestRef& mut)
+{
+  dout(7) << "remote_wrlock_start mds." << target << " on " << *lock << " on " << *lock->get_parent() << dendl;
+
+  // wait for active target
+  const bool target_unavailable =
+    mds->is_cluster_degraded() &&
+    !mds->mdsmap->is_clientreplay_or_active_or_stopping(target);
+  if (target_unavailable) {
+    dout(7) << " mds." << target << " is not active" << dendl;
+    if (mut->more()->waiting_on_peer.empty()) {
+      mds->wait_for_active_peer(target, new C_MDS_RetryRequest(mdcache, mut));
+    }
+    return;
+  }
+
+  // send lock request
+  mut->start_locking(lock, target);
+  mut->more()->peers.insert(target);
+
+  auto req = make_message<MMDSPeerRequest>(mut->reqid, mut->attempt, MMDSPeerRequest::OP_WRLOCK);
+  req->set_lock_type(lock->get_type());
+  lock->get_parent()->set_object_info(req->get_object_info());
+  mds->send_message_mds(req, target);
+
+  // at most one outstanding peer request per target
+  ceph_assert(mut->more()->waiting_on_peer.count(target) == 0);
+  mut->more()->waiting_on_peer.insert(target);
+}
+
+/**
+ * Release a remote wrlock held by @p mut.
+ *
+ * The mutation's entry is erased unless it also carries a local wrlock,
+ * in which case only the remote flag is cleared.  An OP_UNWRLOCK peer
+ * request is sent to the target unless the cluster is degraded and the
+ * target's state is below STATE_REJOIN.
+ */
+void Locker::remote_wrlock_finish(const MutationImpl::lock_iterator& it, MutationImpl *mut)
+{
+  ceph_assert(it->is_remote_wrlock());
+  // read these before the entry is possibly erased
+  SimpleLock *lock = it->lock;
+  mds_rank_t target = it->wrlock_target;
+
+  if (!it->is_wrlock()) {
+    mut->locks.erase(it);
+  } else {
+    it->clear_remote_wrlock();
+  }
+
+  dout(7) << "remote_wrlock_finish releasing remote wrlock on mds." << target
+	  << " " << *lock->get_parent()  << dendl;
+
+  const bool can_notify =
+    !mds->is_cluster_degraded() ||
+    mds->mdsmap->get_state(target) >= MDSMap::STATE_REJOIN;
+  if (!can_notify) {
+    return;
+  }
+
+  auto unlock_req = make_message<MMDSPeerRequest>(mut->reqid, mut->attempt, MMDSPeerRequest::OP_UNWRLOCK);
+  unlock_req->set_lock_type(lock->get_type());
+  lock->get_parent()->set_object_info(unlock_req->get_object_info());
+  mds->send_message_mds(unlock_req, target);
+}
+
+
+// ------------------
+// xlock
+
+/**
+ * Acquire an xlock for request @p mut, or start the steps needed to get it.
+ *
+ * IVERSION/DVERSION locks are delegated to local_xlock_start().
+ * On the auth, the lock is driven through simple_lock()/simple_xlock()
+ * until it can be xlocked; if that cannot complete now, a retry waiter is
+ * queued and the log is nudged.
+ * On a replica, an OP_XLOCK peer request is sent to the auth (after
+ * waiting for single auth and an available auth rank); this path always
+ * returns false.
+ *
+ * @return true only when the xlock was taken locally on the auth
+ */
+bool Locker::xlock_start(SimpleLock *lock, MDRequestRef& mut)
+{
+  if (lock->get_type() == CEPH_LOCK_IVERSION ||
+      lock->get_type() == CEPH_LOCK_DVERSION)
+    return local_xlock_start(static_cast<LocalLockC*>(lock), mut);
+
+  dout(7) << "xlock_start on " << *lock << " on " << *lock->get_parent() << dendl;
+  client_t client = mut->get_client();
+
+  // only set for locks with a non-zero cap shift
+  CInode *in = nullptr;
+  if (lock->get_cap_shift())
+    in = static_cast<CInode *>(lock->get_parent());
+
+  // auth?
+  if (lock->get_parent()->is_auth()) {
+    // auth
+    while (1) {
+      if (mut->locking && // started xlock (not preempt other request)
+	  lock->can_xlock(client) &&
+	  !(lock->get_state() == LOCK_LOCK_XLOCK &&	// client is not xlocker or
+	    in && in->issued_caps_need_gather(lock))) { // xlocker does not hold shared cap
+	lock->set_state(LOCK_XLOCK);
+	lock->get_xlock(mut, client);
+	mut->emplace_lock(lock, MutationImpl::LockOp::XLOCK);
+	mut->finish_locking(lock);
+	return true;
+      }
+      
+      // NOTE(review): 'in' is dereferenced here; this relies on IFILE
+      // locks having a non-zero cap shift so that 'in' was set above.
+      if (lock->get_type() == CEPH_LOCK_IFILE &&
+	  in->state_test(CInode::STATE_RECOVERING)) {
+	mds->mdcache->recovery_queue.prioritize(in);
+      }
+
+      // give up on an unstable lock unless it is our own client's
+      // XLOCKDONE with nobody waiting for it to become stable
+      if (!lock->is_stable() && (lock->get_state() != LOCK_XLOCKDONE ||
+				 lock->get_xlock_by_client() != client ||
+				 lock->is_waiter_for(SimpleLock::WAIT_STABLE)))
+	break;
+
+      if (lock->get_state() == LOCK_LOCK || lock->get_state() == LOCK_XLOCKDONE) {
+	mut->start_locking(lock);
+	simple_xlock(lock);
+      } else {
+	simple_lock(lock);
+      }
+    }
+    
+    lock->add_waiter(SimpleLock::WAIT_WR|SimpleLock::WAIT_STABLE, new C_MDS_RetryRequest(mdcache, mut));
+    nudge_log(lock);
+    return false;
+  } else {
+    // replica
+    ceph_assert(lock->get_sm()->can_remote_xlock);
+    ceph_assert(!mut->peer_request);
+    
+    // wait for single auth
+    if (lock->get_parent()->is_ambiguous_auth()) {
+      lock->get_parent()->add_waiter(MDSCacheObject::WAIT_SINGLEAUTH, 
+				     new C_MDS_RetryRequest(mdcache, mut));
+      return false;
+    }
+    
+    // wait for active auth
+    mds_rank_t auth = lock->get_parent()->authority().first;
+    if (mds->is_cluster_degraded() &&
+	!mds->mdsmap->is_clientreplay_or_active_or_stopping(auth)) {
+      dout(7) << " mds." << auth << " is not active" << dendl;
+      if (mut->more()->waiting_on_peer.empty())
+	mds->wait_for_active_peer(auth, new C_MDS_RetryRequest(mdcache, mut));
+      return false;
+    }
+
+    // send lock request
+    mut->more()->peers.insert(auth);
+    mut->start_locking(lock, auth);
+    auto r = make_message<MMDSPeerRequest>(mut->reqid, mut->attempt, MMDSPeerRequest::OP_XLOCK);
+    r->set_lock_type(lock->get_type());
+    lock->get_parent()->set_object_info(r->get_object_info());
+    mds->send_message_mds(r, auth);
+
+    ceph_assert(mut->more()->waiting_on_peer.count(auth) == 0);
+    mut->more()->waiting_on_peer.insert(auth);
+
+    return false;
+  }
+}
+
+/**
+ * Move a lock out of its xlock state once no xlock remains.
+ *
+ * Fast path: for locks other than DN/ISNAP/IPOLICY with no rdlocks,
+ * wrlocks or leases and not in XLOCKSNAP, if the inode has a target loner
+ * and the former xlocker is that loner (or is negative), the lock goes
+ * straight to LOCK_EXCL, its auth pin is dropped and waiters are woken.
+ * Otherwise the lock is handed to eval_gather().
+ *
+ * @param lock         unstable lock whose xlock was just released
+ * @param xlocker      client that held the xlock
+ * @param pneed_issue  written unconditionally on the fast path, so it
+ *                     must be non-null
+ */
+void Locker::_finish_xlock(SimpleLock *lock, client_t xlocker, bool *pneed_issue)
+{
+  ceph_assert(!lock->is_stable());
+  if (lock->get_type() != CEPH_LOCK_DN &&
+      lock->get_type() != CEPH_LOCK_ISNAP &&
+      lock->get_type() != CEPH_LOCK_IPOLICY &&
+      lock->get_num_rdlocks() == 0 &&
+      lock->get_num_wrlocks() == 0 &&
+      !lock->is_leased() &&
+      lock->get_state() != LOCK_XLOCKSNAP) {
+    CInode *in = static_cast<CInode*>(lock->get_parent());
+    client_t loner = in->get_target_loner();
+    if (loner >= 0 && (xlocker < 0 || xlocker == loner)) {
+      lock->set_state(LOCK_EXCL);
+      lock->get_parent()->auth_unpin(lock);
+      lock->finish_waiters(SimpleLock::WAIT_STABLE|SimpleLock::WAIT_WR|SimpleLock::WAIT_RD);
+      // locks with a cap shift affect client caps
+      if (lock->get_cap_shift())
+	*pneed_issue = true;
+      if (lock->get_parent()->is_auth() &&
+	  lock->is_stable())
+	try_eval(lock, pneed_issue);
+      return;
+    }
+  }
+  // the xlocker may have CEPH_CAP_GSHARED, need to revoke it if next state is LOCK_LOCK
+  eval_gather(lock, lock->get_state() != LOCK_XLOCKSNAP, pneed_issue);
+}
+
+/**
+ * Release an xlock held by @p mut.
+ *
+ * IVERSION/DVERSION locks are delegated to local_xlock_finish().  On a
+ * replica an OP_UNXLOCK peer request is sent to the auth (unless the
+ * cluster is degraded and the auth's state is below STATE_REJOIN) and
+ * local waiters are woken.  On the auth, _finish_xlock() runs once no
+ * xlock remains and nobody is in the middle of taking one.
+ *
+ * @param it           entry of the xlock in the mutation's lock set
+ * @param mut          owning mutation; must be non-null
+ * @param pneed_issue  if non-null, set to true instead of issuing caps
+ *                     directly when a head inode needs caps issued
+ */
+void Locker::xlock_finish(const MutationImpl::lock_iterator& it, MutationImpl *mut, bool *pneed_issue)
+{
+  ceph_assert(it->is_xlock());
+  SimpleLock *lock = it->lock;
+
+  if (lock->get_type() == CEPH_LOCK_IVERSION ||
+      lock->get_type() == CEPH_LOCK_DVERSION)
+    return local_xlock_finish(it, mut);
+
+  dout(10) << "xlock_finish on " << *lock << " " << *lock->get_parent() << dendl;
+
+  // capture the xlocker before the reference is dropped
+  client_t xlocker = lock->get_xlock_by_client();
+
+  // drop ref
+  lock->put_xlock();
+  ceph_assert(mut);
+  mut->locks.erase(it);
+  
+  bool do_issue = false;
+
+  // remote xlock?
+  if (!lock->get_parent()->is_auth()) {
+    ceph_assert(lock->get_sm()->can_remote_xlock);
+
+    // tell auth
+    dout(7) << "xlock_finish releasing remote xlock on " << *lock->get_parent()  << dendl;
+    mds_rank_t auth = lock->get_parent()->authority().first;
+    if (!mds->is_cluster_degraded() ||
+	mds->mdsmap->get_state(auth) >= MDSMap::STATE_REJOIN) {
+      auto peerreq = make_message<MMDSPeerRequest>(mut->reqid, mut->attempt, MMDSPeerRequest::OP_UNXLOCK);
+      peerreq->set_lock_type(lock->get_type());
+      lock->get_parent()->set_object_info(peerreq->get_object_info());
+      mds->send_message_mds(peerreq, auth);
+    }
+    // others waiting?
+    lock->finish_waiters(SimpleLock::WAIT_STABLE |
+			 SimpleLock::WAIT_WR | 
+			 SimpleLock::WAIT_RD, 0); 
+  } else {
+    if (lock->get_num_xlocks() == 0 &&
+        lock->get_state() != LOCK_LOCK_XLOCK) { // no one is taking xlock
+      _finish_xlock(lock, xlocker, &do_issue);
+    }
+  }
+  
+  // only head inodes get caps issued
+  if (do_issue) {
+    CInode *in = static_cast<CInode*>(lock->get_parent());
+    if (in->is_head()) {
+      if (pneed_issue)
+	*pneed_issue = true;
+      else
+	issue_caps(in);
+    }
+  }
+}
+
+/**
+ * Drop an xlock on an object that is being exported: release the
+ * reference, forget the mutation's entry, drop the lock's auth pin if the
+ * lock is unstable, and force the state to LOCK_LOCK.
+ */
+void Locker::xlock_export(const MutationImpl::lock_iterator& it, MutationImpl *mut)
+{
+  ceph_assert(it->is_xlock());
+  SimpleLock *lock = it->lock;
+  dout(10) << "xlock_export on " << *lock << " " << *lock->get_parent() << dendl;
+
+  lock->put_xlock();
+  mut->locks.erase(it);
+
+  MDSCacheObject *parent = lock->get_parent();
+  // we are exporting this (inode)
+  ceph_assert(parent->state_test(CInode::STATE_AMBIGUOUSAUTH));
+
+  if (!lock->is_stable()) {
+    parent->auth_unpin(lock);
+  }
+
+  lock->set_state(LOCK_LOCK);
+}
+
+/**
+ * Counterpart of xlock_export() on the importing side: auth-pin the
+ * lock's parent on behalf of the lock.
+ */
+void Locker::xlock_import(SimpleLock *lock)
+{
+  MDSCacheObject *parent = lock->get_parent();
+  dout(10) << "xlock_import on " << *lock << " " << *parent << dendl;
+  parent->auth_pin(lock);
+}
+
+/**
+ * Downgrade an xlock held by @p mut to an rdlock.
+ *
+ * Does nothing if the mutation's entry for @p lock is already an rdlock.
+ * Otherwise marks the xlock done, takes an rdlock reference, releases the
+ * xlock through xlock_finish() and records the rdlock in @p mut.
+ *
+ * @param lock  lock to downgrade; must be present in mut->locks and its
+ *              parent must be auth
+ * @param mut   mutation currently holding the lock
+ */
+void Locker::xlock_downgrade(SimpleLock *lock, MutationImpl *mut)
+{
+  dout(10) << "xlock_downgrade on " << *lock << " " << *lock->get_parent() << dendl;
+  auto it = mut->locks.find(lock);
+  // Validate the iterator before its first dereference: find() yields
+  // end() when the mutation does not hold this lock.
+  ceph_assert(it != mut->locks.end());
+  if (it->is_rdlock())
+    return; // already downgraded
+
+  ceph_assert(lock->get_parent()->is_auth());
+  ceph_assert(it->is_xlock());
+
+  // take the rdlock reference before the xlock is released
+  lock->set_xlock_done();
+  lock->get_rdlock();
+  xlock_finish(it, mut, nullptr);
+  mut->emplace_lock(lock, MutationImpl::LockOp::RDLOCK);
+}
+
+
+// file i/o -----------------------------------------
+
+/**
+ * Return the file_data_version currently recorded in the inode.
+ */
+version_t Locker::issue_file_data_version(CInode *in)
+{
+  dout(7) << "issue_file_data_version on " << *in << dendl;
+  const auto &inode = in->get_inode();
+  return inode->file_data_version;
+}
+
+/**
+ * Log-completion context that calls Locker::file_update_finish().
+ *
+ * Holds a PIN_PTRWAITER reference on the inode from construction until
+ * file_update_finish() has returned.
+ */
+class C_Locker_FileUpdate_finish : public LockerLogContext {
+  CInode *in;
+  MutationRef mut;
+  unsigned flags;
+  client_t client;
+  ref_t<MClientCaps> ack;
+public:
+  C_Locker_FileUpdate_finish(Locker *l, CInode *i, MutationRef& m, unsigned f,
+                             const ref_t<MClientCaps> &ack, client_t c=-1)
+    : LockerLogContext(l),
+      in(i),
+      mut(m),
+      flags(f),
+      client(c),
+      ack(ack)
+  {
+    in->get(CInode::PIN_PTRWAITER);
+  }
+
+  void finish(int r) override
+  {
+    locker->file_update_finish(in, mut, flags, client, ack);
+    in->put(CInode::PIN_PTRWAITER);
+  }
+};
+
+// Bit flags passed to Locker::file_update_finish().
+enum {
+  UPDATE_SHAREMAX   = 1 << 0,  // consider sharing the inode's max size
+  UPDATE_NEEDSISSUE = 1 << 1,  // consider issuing caps to the client
+  UPDATE_SNAPFLUSH  = 1 << 2,  // check for snap writeback completion
+};
+
+/**
+ * Complete a journaled file/cap update.
+ *
+ * Applies the mutation, sends @p ack to the client if its session is
+ * still open, drops the mutation's locks, does the flag-dependent
+ * follow-up work, issues caps for every inode collected along the way,
+ * and finally cleans up the mutation.
+ *
+ * @param in      inode that was updated
+ * @param mut     mutation to apply and clean up
+ * @param flags   bitmask of UPDATE_SHAREMAX / UPDATE_NEEDSISSUE /
+ *                UPDATE_SNAPFLUSH
+ * @param client  client the update came from
+ * @param ack     reply to send to the client; may be null
+ */
+void Locker::file_update_finish(CInode *in, MutationRef& mut, unsigned flags,
+				client_t client, const ref_t<MClientCaps> &ack)
+{
+  dout(10) << "file_update_finish on " << *in << dendl;
+
+  mut->apply();
+
+  if (ack) {
+    Session *session = mds->get_session(client);
+    if (session && !session->is_closed()) {
+      // "oldest flush tid" > 0 means client uses unique TID for each flush
+      if (ack->get_oldest_flush_tid() > 0)
+        session->add_completed_flush(ack->get_client_tid());
+      mds->send_message_client_counted(ack, session);
+    } else {
+      dout(10) << " no session for client." << client << " " << *ack << dendl;
+    }
+  }
+
+  // inodes that need caps issued, filled in by drop_locks() and
+  // eval_cap_gather() below
+  set<CInode*> need_issue;
+  drop_locks(mut.get(), &need_issue);
+
+  if (in->is_head()) {
+    // skip the direct issue when this inode is already queued in need_issue
+    if ((flags & UPDATE_NEEDSISSUE) && need_issue.count(in) == 0) {
+      Capability *cap = in->get_client_cap(client);
+      if (cap && (cap->wanted() & ~cap->pending()))
+	issue_caps(in, cap);
+    }
+
+    if ((flags & UPDATE_SHAREMAX) && in->is_auth() &&
+	(in->filelock.gcaps_allowed(CAP_LONER) & (CEPH_CAP_GWR|CEPH_CAP_GBUFFER)))
+      share_inode_max_size(in);
+
+  } else if ((flags & UPDATE_SNAPFLUSH) && !in->client_snap_caps.empty()) {
+    dout(10) << " client_snap_caps " << in->client_snap_caps << dendl;
+    // check for snap writeback completion
+    in->client_snap_caps.erase(client);
+    if (in->client_snap_caps.empty()) {
+      // last client done: drop one wrlock on every cinode lock
+      for (int i = 0; i < num_cinode_locks; i++) {
+	SimpleLock *lock = in->get_lock(cinode_lock_info[i].lock);
+	ceph_assert(lock);
+	lock->put_wrlock();
+      }
+      in->item_open_file.remove_myself();
+      in->item_caps.remove_myself();
+      eval_cap_gather(in, &need_issue);
+    }
+  }
+  issue_caps_set(need_issue);
+
+  mds->balancer->hit_inode(in, META_POP_IWR);
+
+  // auth unpin after issuing caps
+  mut->cleanup();
+}
+
+/**
+ * Register (or widen) a client capability on an inode for an open.
+ *
+ * For a newly created inode whose client request is queued for replay,
+ * only try_reconnect_cap() is attempted.  Otherwise the client's cap is
+ * created or its wanted set is augmented for @p mode.  On the auth the
+ * inode's locks are evaluated and the log is flushed if
+ * _need_flush_mdlog() says so; on a replica the auth is told about the
+ * wanted caps.  Cap messages are suppressed for the duration of the call.
+ *
+ * @param in     inode being opened
+ * @param mode   file mode, converted with ceph_caps_for_mode()
+ * @param mdr    request; supplies the session and the new-inode flags
+ * @param realm  snap realm passed to add_client_cap() for a new cap
+ * @return the client's capability, or the result of try_reconnect_cap()
+ *         on the replay path
+ */
+Capability* Locker::issue_new_caps(CInode *in,
+				   int mode,
+				   MDRequestRef& mdr,
+				   SnapRealm *realm)
+{
+  dout(7) << "issue_new_caps for mode " << mode << " on " << *in << dendl;
+  Session *session = mdr->session;
+  bool new_inode = (mdr->alloc_ino || mdr->used_prealloc_ino);
+
+  // if replay or async, try to reconnect cap, and otherwise do nothing.
+  if (new_inode && mdr->client_request->is_queued_for_replay())
+    return mds->mdcache->try_reconnect_cap(in, session);
+
+  // my needs
+  ceph_assert(session->info.inst.name.is_client());
+  client_t my_client = session->get_client();
+  int my_want = ceph_caps_for_mode(mode);
+
+  // register a capability
+  Capability *cap = in->get_client_cap(my_client);
+  if (!cap) {
+    // new cap
+    cap = in->add_client_cap(my_client, session, realm, new_inode);
+    cap->set_wanted(my_want);
+    cap->mark_new();
+  } else {
+    // make sure it wants sufficient caps
+    if (my_want & ~cap->wanted()) {
+      // augment wanted caps for this client
+      cap->set_wanted(cap->wanted() | my_want);
+    }
+  }
+  cap->inc_suppress(); // suppress file cap messages (we'll bundle with the request reply)
+
+  if (in->is_auth()) {
+    // [auth] twiddle mode?
+    eval(in, CEPH_CAP_LOCKS);
+
+    // -1 asks get_allowed_caps() to compute each mask
+    int all_allowed = -1, loner_allowed = -1, xlocker_allowed = -1;
+    int allowed = get_allowed_caps(in, cap, all_allowed, loner_allowed,
+                                   xlocker_allowed);
+
+    // my_want & ~allowed: caps we want but cannot be given right now
+    if (_need_flush_mdlog(in, my_want & ~allowed, true))
+      mds->mdlog->flush();
+
+  } else {
+    // [replica] tell auth about any new caps wanted
+    request_inode_file_caps(in);
+  }
+
+  // issue caps (pot. incl new one)
+  //issue_caps(in);  // note: _eval above may have done this already...
+
+  // re-issue whatever we can
+  //cap->issue(cap->pending());
+
+  cap->dec_suppress();
+
+  return cap;
+}
+
+/**
+ * Call issue_caps() on every inode in @p inset, in set order.
+ */
+void Locker::issue_caps_set(set<CInode*>& inset)
+{
+  for (CInode *in : inset) {
+    issue_caps(in);
+  }
+}
+
+/**
+ * Deferred call to Locker::revoke_stale_cap() for one client's cap on an
+ * inode.  The inode carries a PIN_PTRWAITER reference from construction
+ * until the call has returned.
+ */
+class C_Locker_RevokeStaleCap : public LockerContext {
+  CInode *in;
+  client_t client;
+public:
+  C_Locker_RevokeStaleCap(Locker *l, CInode *i, client_t c)
+    : LockerContext(l),
+      in(i),
+      client(c)
+  {
+    in->get(CInode::PIN_PTRWAITER);
+  }
+
+  void finish(int r) override
+  {
+    locker->revoke_stale_cap(in, client);
+    in->put(CInode::PIN_PTRWAITER);
+  }
+};
+
+/**
+ * Compute the caps that may currently be issued to the client owning
+ * @p cap on inode @p in.
+ *
+ * The three in/out masks act as a cache: any that is -1 on entry is
+ * computed from the inode and written back, so a caller that handles
+ * several caps of the same inode can reuse them.
+ *
+ * @param in               inode the cap belongs to
+ * @param cap              client capability to evaluate
+ * @param all_allowed      in/out: caps allowed for CAP_ANY
+ * @param loner_allowed    in/out: caps allowed for CAP_LONER
+ * @param xlocker_allowed  in/out: caps allowed for CAP_XLOCKER
+ * @return bitmask of caps allowed for this client
+ */
+int Locker::get_allowed_caps(CInode *in, Capability *cap,
+                             int &all_allowed, int &loner_allowed,
+                             int &xlocker_allowed)
+{
+  client_t client = cap->get_client();
+
+  // allowed caps are determined by the lock mode; fill in whichever
+  // masks have not been computed yet
+  if (all_allowed == -1) {
+    all_allowed = in->get_caps_allowed_by_type(CAP_ANY);
+  }
+  if (loner_allowed == -1) {
+    loner_allowed = in->get_caps_allowed_by_type(CAP_LONER);
+  }
+  if (xlocker_allowed == -1) {
+    xlocker_allowed = in->get_caps_allowed_by_type(CAP_XLOCKER);
+  }
+
+  client_t loner = in->get_loner();
+  if (loner < 0) {
+    dout(7) << "get_allowed_caps allowed=" << ccap_string(all_allowed)
+	    << ", xlocker allowed=" << ccap_string(xlocker_allowed)
+	    << " on " << *in << dendl;
+  } else {
+    dout(7) << "get_allowed_caps loner client." << loner
+	    << " allowed=" << ccap_string(loner_allowed)
+	    << ", xlocker allowed=" << ccap_string(xlocker_allowed)
+	    << ", others allowed=" << ccap_string(all_allowed)
+	    << " on " << *in << dendl;
+  }
+
+  // the loner gets the loner mask, everybody else the generic one
+  int allowed = (loner == client) ? loner_allowed : all_allowed;
+
+  // add in any xlocker-only caps (for locks this client is the xlocker for)
+  allowed |= xlocker_allowed & in->get_xlocker_mask(client);
+
+  if (in->is_dir()) {
+    allowed &= ~CEPH_CAP_ANY_DIR_OPS;
+    // dir ops come back only through the cap's lock cache, and only
+    // together with FILE_EXCL
+    if (allowed & CEPH_CAP_FILE_EXCL) {
+      allowed |= cap->get_lock_cache_allowed();
+    }
+  }
+
+  // no file read/write caps for a noinline cap on an inode with inline
+  // data, or a nopoolns cap on an inode whose layout has a pool namespace
+  const auto &inode = in->get_inode();
+  if ((inode->inline_data.version != CEPH_INLINE_NONE &&
+       cap->is_noinline()) ||
+      (!inode->layout.pool_ns.empty() &&
+       cap->is_nopoolns())) {
+    allowed &= ~(CEPH_CAP_FILE_RD | CEPH_CAP_FILE_WR);
+  }
+
+  return allowed;
+}
+
+/**
+ * Reconcile the caps pending for the clients of a head inode with what
+ * get_allowed_caps() currently permits, sending MClientCaps GRANT/REVOKE
+ * messages where something has to change.
+ *
+ * @param in        inode whose client caps are examined; asserted to be a
+ *                  head inode
+ * @param only_cap  if non-null, iteration starts at the cap of this cap's
+ *                  client and stops after the first cap that reaches the
+ *                  bottom of the loop body
+ * @return number of caps that went through the main issue step (the
+ *         stale->resume re-issue and the stale-cap revoke path are not
+ *         counted)
+ */
+int Locker::issue_caps(CInode *in, Capability *only_cap)
+{
+  // number of caps that went through the main issue step below
+  int nissued = 0;
+  // NOTE(review): these start at -1 and are handed to get_allowed_caps() for
+  // every client; presumably that helper fills them in on first use -- confirm
+  // against get_allowed_caps().
+  int all_allowed = -1, loner_allowed = -1, xlocker_allowed = -1;
+
+  ceph_assert(in->is_head());
+
+  // client caps
+  map<client_t, Capability>::iterator it;
+  if (only_cap)
+    it = in->client_caps.find(only_cap->get_client());
+  else
+    it = in->client_caps.begin();
+  for (; it != in->client_caps.end(); ++it) {
+    Capability *cap = &it->second;
+    int allowed = get_allowed_caps(in, cap, all_allowed, loner_allowed,
+                                   xlocker_allowed);
+    int pending = cap->pending();
+    int wanted = cap->wanted();
+
+    dout(20) << " client." << it->first
+	     << " pending " << ccap_string(pending) 
+	     << " allowed " << ccap_string(allowed) 
+	     << " wanted " << ccap_string(wanted)
+	     << dendl;
+
+    // no pending bit lies outside 'allowed', i.e. nothing has to be revoked
+    if (!(pending & ~allowed)) {
+      // skip if suppress or new, and not revocation
+      // NOTE(review): this 'continue' (and the one in the stale branch below)
+      // bypasses the 'if (only_cap) break;' at the bottom of the loop, so with
+      // only_cap set the iteration moves on to the following clients --
+      // confirm that this is intended.
+      if (cap->is_new() || cap->is_suppress() || cap->is_stale()) {
+	dout(20) << "  !revoke and new|suppressed|stale, skipping client." << it->first << dendl;
+	continue;
+      }
+    } else {
+      ceph_assert(!cap->is_new());
+      if (cap->is_stale()) {
+	// stale cap: trim it to the still-allowed subset without sending a
+	// message here, and queue a C_Locker_RevokeStaleCap at the front of
+	// the waiter queue.
+	dout(20) << "  revoke stale cap from client." << it->first << dendl;
+	ceph_assert(!cap->is_valid());
+	cap->issue(allowed & pending, false);
+	mds->queue_waiter_front(new C_Locker_RevokeStaleCap(this, in, it->first));
+	continue;
+      }
+
+      if (!cap->is_valid() && (pending & ~CEPH_CAP_PIN)) {
+	// After stale->resume circle, client thinks it only has CEPH_CAP_PIN.
+	// mds needs to re-issue caps, then do revocation.
+	long seq = cap->issue(pending, true);
+
+	dout(7) << "   sending MClientCaps to client." << it->first
+		<< " seq " << seq << " re-issue " << ccap_string(pending) << dendl;
+
+        if (mds->logger) mds->logger->inc(l_mdss_ceph_cap_op_grant);
+
+	auto m = make_message<MClientCaps>(CEPH_CAP_OP_GRANT, in->ino(),
+					   in->find_snaprealm()->inode->ino(),
+					   cap->get_cap_id(), cap->get_last_seq(),
+					   pending, wanted, 0, cap->get_mseq(),
+					   mds->get_osd_epoch_barrier());
+	in->encode_cap_message(m, cap);
+
+	mds->send_message_client_counted(m, cap->get_session());
+      }
+    }
+
+    // notify clients about deleted inode, to make sure they release caps ASAP.
+    if (in->get_inode()->nlink == 0)
+      wanted |= CEPH_CAP_LINK_SHARED;
+
+    // are there caps that the client _wants_ and can have, but aren't pending?
+    // or do we need to revoke?
+    if ((pending & ~allowed) ||			// need to revoke ~allowed caps.
+	((wanted & allowed) & ~pending) ||	// missing wanted+allowed caps
+	!cap->is_valid()) {			// after stale->resume circle
+      // issue
+      nissued++;
+
+      // include caps that clients generally like, while we're at it.
+      int likes = in->get_caps_liked();      
+      int before = pending;
+      long seq;
+      if (pending & ~allowed)
+	seq = cap->issue((wanted|likes) & allowed & pending, true);  // if revoking, don't issue anything new.
+      else
+	seq = cap->issue((wanted|likes) & allowed, true);
+      int after = cap->pending();
+
+      dout(7) << "   sending MClientCaps to client." << it->first
+	      << " seq " << seq << " new pending " << ccap_string(after)
+	      << " was " << ccap_string(before) << dendl;
+
+      // it is a REVOKE as soon as any previously pending bit is gone
+      int op = (before & ~after) ? CEPH_CAP_OP_REVOKE : CEPH_CAP_OP_GRANT;
+      if (op == CEPH_CAP_OP_REVOKE) {
+        if (mds->logger) mds->logger->inc(l_mdss_ceph_cap_op_revoke);
+	// put the cap on both revoking lists and restart its revoke
+	// timestamp / warning counter
+	revoking_caps.push_back(&cap->item_revoking_caps);
+	revoking_caps_by_client[cap->get_client()].push_back(&cap->item_client_revoking_caps);
+	cap->set_last_revoke_stamp(ceph_clock_now());
+	cap->reset_num_revoke_warnings();
+      } else {
+        if (mds->logger) mds->logger->inc(l_mdss_ceph_cap_op_grant);
+      }
+
+      auto m = make_message<MClientCaps>(op, in->ino(),
+					 in->find_snaprealm()->inode->ino(),
+					 cap->get_cap_id(), cap->get_last_seq(),
+					 after, wanted, 0, cap->get_mseq(),
+					 mds->get_osd_epoch_barrier());
+      in->encode_cap_message(m, cap);
+
+      mds->send_message_client_counted(m, cap->get_session());
+    }
+
+    if (only_cap)
+      break;
+  }
+
+  return nissued;
+}
+
+/**
+ * Send a CEPH_CAP_OP_TRUNC message to every client holding a cap on @p in,
+ * then, for an auth file inode, re-check its max_size.
+ *
+ * @param in  inode whose cap holders are notified
+ */
+void Locker::issue_truncate(CInode *in)
+{
+  dout(7) << "issue_truncate on " << *in << dendl;
+
+  for (auto &entry : in->client_caps) {
+    if (mds->logger)
+      mds->logger->inc(l_mdss_ceph_cap_op_trunc);
+
+    Capability *cap = &entry.second;
+    auto msg = make_message<MClientCaps>(CEPH_CAP_OP_TRUNC,
+                                         in->ino(),
+                                         in->find_snaprealm()->inode->ino(),
+                                         cap->get_cap_id(), cap->get_last_seq(),
+                                         cap->pending(), cap->wanted(), 0,
+                                         cap->get_mseq(),
+                                         mds->get_osd_epoch_barrier());
+    in->encode_cap_message(msg, cap);
+    mds->send_message_client_counted(msg, entry.first);
+  }
+
+  // should we increase max_size?  only relevant for auth file inodes.
+  if (!in->is_auth() || !in->is_file())
+    return;
+  check_inode_max_size(in);
+}
+
+
+/**
+ * Revoke the cap a single client holds on @p in.
+ *
+ * Does nothing if the client has no cap on the inode.  If the cap is
+ * currently revoking any CEPH_CAP_ANY_WR bit the client is evicted instead.
+ * Otherwise the cap is revoked and the inode's file/link/auth/xattr locks
+ * are re-evaluated.
+ *
+ * @param in      inode the cap belongs to
+ * @param client  client whose cap is revoked
+ */
+void Locker::revoke_stale_cap(CInode *in, client_t client)
+{
+  dout(7) << __func__ << " client." << client << " on " << *in << dendl;
+
+  Capability *cap = in->get_client_cap(client);
+  if (cap == nullptr)
+    return;
+
+  const bool revoking_write = (cap->revoking() & CEPH_CAP_ANY_WR) != 0;
+  if (revoking_write) {
+    CachedStackStringStream css;
+    mds->evict_client(client.v, false, g_conf()->mds_session_blocklist_on_timeout, *css, nullptr);
+    return;
+  }
+
+  cap->revoke();
+
+  // an auth inode that still has a client range for this client is flagged
+  // as needing recovery
+  if (in->is_auth() && in->get_inode()->client_ranges.count(cap->get_client()))
+    in->state_set(CInode::STATE_NEEDSRECOVER);
+
+  if (in->state_test(CInode::STATE_EXPORTINGCAPS))
+    return;
+
+  // kick every cap-related lock that is not stable, in the fixed order
+  // file, link, auth, xattr
+  SimpleLock *const cap_locks[] = {
+    &in->filelock, &in->linklock, &in->authlock, &in->xattrlock
+  };
+  for (SimpleLock *lock : cap_locks) {
+    if (!lock->is_stable())
+      eval_gather(lock);
+  }
+
+  if (in->is_auth()) {
+    try_eval(in, CEPH_CAP_LOCKS);
+  } else {
+    request_inode_file_caps(in);
+  }
+}
+
+/**
+ * Revoke the caps of a stale session that are in the middle of being revoked.
+ *
+ * The session's cap generation is bumped first.  The walk over session->caps
+ * stops at the first cap that is not notable, and stops with a false result
+ * at the first cap that is revoking any CEPH_CAP_ANY_WR bit.  Lock
+ * evaluation for the touched inodes is deferred until the walk is over.
+ *
+ * @param session  the stale session
+ * @return false if a cap revoking write caps was encountered, true otherwise
+ */
+bool Locker::revoke_stale_caps(Session *session)
+{
+  dout(10) << "revoke_stale_caps for " << session->info.inst.name << dendl;
+
+  // invalidate all caps
+  session->inc_cap_gen();
+
+  bool all_revoked = true;
+  std::vector<CInode*> deferred;
+
+  auto it = session->caps.begin();
+  while (!it.end()) {
+    Capability *cap = *it;
+    ++it;
+
+    // the rest ones are not being revoked and don't have writeable range
+    // and don't want exclusive caps or want file read/write. They don't
+    // need recover, they don't affect eval_gather()/try_eval()
+    if (!cap->is_notable())
+      break;
+
+    const int revoking = cap->revoking();
+    if (revoking == 0)
+      continue;
+
+    if (revoking & CEPH_CAP_ANY_WR) {
+      all_revoked = false;
+      break;
+    }
+
+    int issued = cap->issued();
+    CInode *in = cap->get_inode();
+    dout(10) << " revoking " << ccap_string(issued) << " on " << *in << dendl;
+    if (cap->revoke() & CEPH_CAP_ANY_DIR_OPS)
+      eval_lock_caches(cap);
+
+    if (in->is_auth() &&
+        in->get_inode()->client_ranges.count(cap->get_client())) {
+      in->state_set(CInode::STATE_NEEDSRECOVER);
+    }
+
+    // eval lock/inode may finish contexts, which may modify other cap's
+    // position in the session->caps, so only remember the inode for now.
+    deferred.push_back(in);
+  }
+
+  for (CInode *in : deferred) {
+    if (in->state_test(CInode::STATE_EXPORTINGCAPS))
+      continue;
+
+    // kick every unstable cap-related lock: file, link, auth, xattr
+    SimpleLock *const cap_locks[] = {
+      &in->filelock, &in->linklock, &in->authlock, &in->xattrlock
+    };
+    for (SimpleLock *lock : cap_locks) {
+      if (!lock->is_stable())
+        eval_gather(lock);
+    }
+
+    if (in->is_auth()) {
+      try_eval(in, CEPH_CAP_LOCKS);
+    } else {
+      request_inode_file_caps(in);
+    }
+  }
+
+  return all_revoked;
+}
+
+/**
+ * Re-evaluate (or re-issue) the caps of a session that is no longer stale.
+ *
+ * For clients with CEPHFS_FEATURE_LAZY_CAP_WANTED the walk stops at the
+ * first cap that is not notable.  Inodes whose caps are being exported are
+ * only flagged STATE_EVALSTALECAPS.
+ *
+ * @param session  the session being resumed
+ */
+void Locker::resume_stale_caps(Session *session)
+{
+  dout(10) << "resume_stale_caps for " << session->info.inst.name << dendl;
+
+  const bool lazy = session->info.has_feature(CEPHFS_FEATURE_LAZY_CAP_WANTED);
+
+  auto it = session->caps.begin();
+  while (!it.end()) {
+    Capability *cap = *it;
+    ++it;  // step before touching the cap, as revoke_stale_caps() does
+
+    if (lazy && !cap->is_notable())
+      break; // see revoke_stale_caps()
+
+    CInode *in = cap->get_inode();
+    ceph_assert(in->is_head());
+    dout(10) << " clearing stale flag on " << *in << dendl;
+
+    if (in->state_test(CInode::STATE_EXPORTINGCAPS)) {
+      // if export succeeds, the cap will be removed. if export fails,
+      // we need to re-issue the cap if it's not stale.
+      in->state_set(CInode::STATE_EVALSTALECAPS);
+      continue;
+    }
+
+    // eval() is only attempted on auth inodes; whenever it is not attempted
+    // or returns false, issue the caps for this cap directly.
+    const bool evaluated = in->is_auth() && eval(in, CEPH_CAP_LOCKS);
+    if (!evaluated)
+      issue_caps(in, cap);
+  }
+}
+
+/**
+ * Remove every client lease recorded on @p session from its dentry.
+ *
+ * @param session  session whose leases are dropped
+ */
+void Locker::remove_stale_leases(Session *session)
+{
+  dout(10) << "remove_stale_leases for " << session->info.inst.name << dendl;
+
+  for (auto it = session->leases.begin(); !it.end(); ) {
+    ClientLease *lease = *it;
+    // step to the next entry before the current lease is removed
+    ++it;
+
+    CDentry *dn = static_cast<CDentry*>(lease->parent);
+    dout(15) << " removing lease on " << *dn << dendl;
+    dn->remove_client_lease(lease, this);
+  }
+}
+
+
+/**
+ * Waiter context that retries Locker::request_inode_file_caps() on an inode.
+ *
+ * The inode is pinned with PIN_PTRWAITER for the lifetime of the waiter; the
+ * pin is dropped in finish().
+ */
+class C_MDL_RequestInodeFileCaps : public LockerContext {
+public:
+  C_MDL_RequestInodeFileCaps(Locker *l, CInode *i)
+    : LockerContext(l),
+      inode(i)
+  {
+    inode->get(CInode::PIN_PTRWAITER);
+  }
+
+  void finish(int r) override {
+    // only retry while this rank is still not auth for the inode
+    const bool still_replica = !inode->is_auth();
+    if (still_replica)
+      locker->request_inode_file_caps(inode);
+    inode->put(CInode::PIN_PTRWAITER);
+  }
+
+private:
+  CInode *inode;
+};
+
+/**
+ * Tell the auth MDS which caps this replica wants on @p in.
+ *
+ * Nothing happens if the wanted set equals in->replica_caps_wanted.  The
+ * request is parked on a waiter while the inode's auth is ambiguous, or
+ * while the cluster is degraded and the auth rank is in STATE_REJOIN.
+ *
+ * @param in  inode to request caps for; asserted not to be auth
+ */
+void Locker::request_inode_file_caps(CInode *in)
+{
+  ceph_assert(!in->is_auth());
+
+  const int wanted =
+    in->get_caps_wanted() & in->get_caps_allowed_ever() & ~CEPH_CAP_PIN;
+  if (wanted == in->replica_caps_wanted)
+    return;
+
+  // wait for single auth
+  if (in->is_ambiguous_auth()) {
+    in->add_waiter(MDSCacheObject::WAIT_SINGLEAUTH,
+                   new C_MDL_RequestInodeFileCaps(this, in));
+    return;
+  }
+
+  mds_rank_t auth = in->authority().first;
+  if (mds->is_cluster_degraded() &&
+      mds->mdsmap->get_state(auth) == MDSMap::STATE_REJOIN) {
+    mds->wait_for_active_peer(auth, new C_MDL_RequestInodeFileCaps(this, in));
+    return;
+  }
+
+  dout(7) << "request_inode_file_caps " << ccap_string(wanted)
+          << " was " << ccap_string(in->replica_caps_wanted)
+          << " on " << *in << " to mds." << auth << dendl;
+
+  in->replica_caps_wanted = wanted;
+
+  // the new value is recorded in any case; the message is only sent when the
+  // cluster is healthy or the auth rank is in a state that accepts it
+  const bool can_send =
+    !mds->is_cluster_degraded() ||
+    mds->mdsmap->is_clientreplay_or_active_or_stopping(auth);
+  if (can_send) {
+    mds->send_message_mds(make_message<MInodeFileCaps>(in->ino(), in->replica_caps_wanted), auth);
+  }
+}
+
+/**
+ * Handle an MInodeFileCaps message: record the caps the sending MDS rank
+ * wants on the inode and re-evaluate the inode's cap locks.
+ *
+ * Before STATE_CLIENTREPLAY the message is either queued for retry after
+ * replay (if that state is wanted) or the daemon aborts.
+ *
+ * @param m  the received message
+ */
+void Locker::handle_inode_file_caps(const cref_t<MInodeFileCaps> &m)
+{
+  // nobody should be talking to us during recovery.
+  if (mds->get_state() < MDSMap::STATE_CLIENTREPLAY) {
+    if (mds->get_want_state() < MDSMap::STATE_CLIENTREPLAY) {
+      ceph_abort_msg("got unexpected message during recovery");
+    }
+    mds->wait_for_replay(new C_MDS_RetryMessage(mds, m));
+    return;
+  }
+
+  // ok
+  const mds_rank_t from = mds_rank_t(m->get_source().num());
+  CInode *in = mdcache->get_inode(m->get_ino());
+
+  ceph_assert(in);
+  ceph_assert(in->is_auth());
+
+  dout(7) << "handle_inode_file_caps replica mds." << from << " wants caps " << ccap_string(m->get_caps()) << " on " << *in << dendl;
+
+  if (mds->logger)
+    mds->logger->inc(l_mdss_handle_inode_file_caps);
+
+  in->set_mds_caps_wanted(from, m->get_caps());
+
+  try_eval(in, CEPH_CAP_LOCKS);
+}
+
+
+/**
+ * Waiter context that retries Locker::check_inode_max_size() with the
+ * max size, size and mtime captured at construction.
+ *
+ * The inode is pinned with PIN_PTRWAITER for the lifetime of the waiter; the
+ * pin is dropped in finish().
+ */
+class C_MDL_CheckMaxSize : public LockerContext {
+public:
+  C_MDL_CheckMaxSize(Locker *l, CInode *i, uint64_t _new_max_size,
+                     uint64_t _newsize, utime_t _mtime)
+    : LockerContext(l),
+      inode(i),
+      wanted_max_size(_new_max_size),
+      wanted_size(_newsize),
+      wanted_mtime(_mtime)
+  {
+    inode->get(CInode::PIN_PTRWAITER);
+  }
+
+  void finish(int r) override {
+    // only retry while this rank is still auth for the inode
+    if (inode->is_auth()) {
+      locker->check_inode_max_size(inode, false, wanted_max_size,
+                                   wanted_size, wanted_mtime);
+    }
+    inode->put(CInode::PIN_PTRWAITER);
+  }
+
+private:
+  CInode *inode;
+  uint64_t wanted_max_size;
+  uint64_t wanted_size;
+  utime_t wanted_mtime;
+};
+
+/**
+ * Compute the max size to offer to writers of a file of the given size.
+ *
+ * Starts from twice (size + 1).  When mds_client_writeable_range_max_inc_objs
+ * is positive, the result is capped at size plus that many objects of the
+ * layout's object_size.  The value is finally rounded up to the layout's
+ * size increment.
+ *
+ * @param pi    inode providing the layout
+ * @param size  current file size
+ * @return the new max size
+ */
+uint64_t Locker::calc_new_max_size(const CInode::inode_const_ptr &pi, uint64_t size)
+{
+  uint64_t candidate = (size + 1) * 2;
+
+  const uint64_t max_inc_objs = g_conf()->mds_client_writeable_range_max_inc_objs;
+  if (max_inc_objs > 0) {
+    const uint64_t max_inc_bytes = max_inc_objs * pi->layout.object_size;
+    candidate = std::min(candidate, size + max_inc_bytes);
+  }
+
+  return round_up_to(candidate, pi->get_layout_size_increment());
+}
+
+/**
+ * Check whether the projected inode's client_ranges need to be rewritten.
+ *
+ * The clients that have issued or wanted CEPH_CAP_ANY_FILE_WR caps are
+ * matched, in iteration order, against the entries of client_ranges.
+ *
+ * @param in    inode to check
+ * @param size  file size used to compute the required max size
+ * @return true if a writer has no matching range, a range's last is below
+ *         the required max size, or ranges are left over once all writers
+ *         are matched
+ */
+bool Locker::check_client_ranges(CInode *in, uint64_t size)
+{
+  const auto& latest = in->get_projected_inode();
+
+  // Layout-less directories like ~mds0/, have zero size
+  const uint64_t ms = latest->has_layout() ? calc_new_max_size(latest, size) : 0;
+
+  auto range_it = latest->client_ranges.begin();
+  for (auto &entry : in->client_caps) {
+    const int caps = entry.second.issued() | entry.second.wanted();
+    if (!(caps & CEPH_CAP_ANY_FILE_WR))
+      continue;
+
+    const bool stale =
+      range_it == latest->client_ranges.end() ||  // writer without a range
+      range_it->first != entry.first ||           // range belongs to someone else
+      ms > range_it->second.range.last;           // range is too small
+    if (stale)
+      return true;
+
+    ++range_it;
+  }
+
+  // any leftover range belongs to a client that is no longer a writer
+  return range_it != latest->client_ranges.end();
+}
+
+/**
+ * Rewrite the projected inode's client_ranges to match the current writers.
+ *
+ * in->client_caps and pi->client_ranges are walked side by side; the merge
+ * compares client ids with '<' and '==', so it relies on both containers
+ * iterating in ascending client order.  Clients that have issued or wanted
+ * CEPH_CAP_ANY_FILE_WR caps end up with a range whose 'last' is at least the
+ * max size computed for @p size, and their cap is marked clientwriteable.
+ * Ranges of all other clients are erased and their caps are cleared of the
+ * clientwriteable mark.
+ *
+ * @param in             inode whose projected client_ranges are rewritten
+ * @param size           file size used to compute the new max size
+ * @param max_increased  optional out-flag; set to true when a range is grown
+ *                       or newly created (it is never reset to false here)
+ * @return true if client_ranges was modified
+ */
+bool Locker::calc_new_client_ranges(CInode *in, uint64_t size, bool *max_increased)
+{
+  const auto& latest = in->get_projected_inode();
+  uint64_t ms;
+  if (latest->has_layout()) {
+    ms = calc_new_max_size(latest, size);
+  } else {
+    // Layout-less directories like ~mds0/, have zero size
+    ms = 0;
+  }
+
+  auto pi = in->_get_projected_inode();
+  bool updated = false;
+
+  // increase ranges as appropriate.
+  // shrink to 0 if no WR|BUFFER caps issued.
+  auto it = pi->client_ranges.begin();
+  for (auto &p : in->client_caps) {
+    if ((p.second.issued() | p.second.wanted()) & CEPH_CAP_ANY_FILE_WR) {
+      // drop ranges of clients that sort before this writer
+      while (it != pi->client_ranges.end() && it->first < p.first) {
+	it = pi->client_ranges.erase(it);
+	updated = true;
+      }
+
+      if (it != pi->client_ranges.end() && it->first == p.first) {
+	// existing range: only ever grow it
+	if (ms > it->second.range.last) {
+	  it->second.range.last = ms;
+	  updated = true;
+	  if (max_increased)
+	    *max_increased = true;
+	}
+      } else {
+	// no range for this writer yet: insert one in front of 'it'
+	it = pi->client_ranges.emplace_hint(it, std::piecewise_construct,
+					    std::forward_as_tuple(p.first),
+					    std::forward_as_tuple());
+	it->second.range.last = ms;
+	it->second.follows = in->first - 1;
+	updated = true;
+	if (max_increased)
+	  *max_increased = true;
+      }
+      p.second.mark_clientwriteable();
+      ++it;
+    } else {
+      p.second.clear_clientwriteable();
+    }
+  }
+  // whatever is left belongs to clients that come after the last writer
+  while (it != pi->client_ranges.end()) {
+    it = pi->client_ranges.erase(it);
+    updated = true;
+  }
+  // the inode-level clientwriteable mark is only touched when ranges changed
+  if (updated) {
+    if (pi->client_ranges.empty())
+      in->clear_clientwriteable();
+    else
+      in->mark_clientwriteable();
+  }
+  return updated;
+}
+
+/**
+ * Update the client writeable ranges (max_size) of a file inode and,
+ * optionally, its size and mtime, journaling the change.
+ *
+ * If the inode is frozen, or the filelock cannot be wrlocked (and
+ * @p force_wrlock is false), a C_MDL_CheckMaxSize waiter carrying the same
+ * arguments is queued and nothing is changed.
+ *
+ * @param in            inode to update; asserted to be auth and a file
+ * @param force_wrlock  skip the filelock can_wrlock() check
+ * @param new_max_size  lower bound for the size fed into the range
+ *                      calculation (max(new_max_size, size) is used)
+ * @param new_size      size reported by the caller; a value of 0 means no
+ *                      size/mtime update is requested
+ * @param new_mtime     mtime that goes with @p new_size
+ * @return true if a journal entry was submitted, false if there was nothing
+ *         to do or the work was deferred to a waiter
+ */
+bool Locker::check_inode_max_size(CInode *in, bool force_wrlock,
+				  uint64_t new_max_size, uint64_t new_size,
+				  utime_t new_mtime)
+{
+  ceph_assert(in->is_auth());
+  ceph_assert(in->is_file());
+
+  const auto& latest = in->get_projected_inode();
+  uint64_t size = latest->size;
+  bool update_size = new_size > 0;
+
+  if (update_size) {
+    // size and mtime only ever move forward relative to the projected inode
+    new_size = size = std::max(size, new_size);
+    new_mtime = std::max(new_mtime, latest->mtime);
+    if (latest->size == new_size && latest->mtime == new_mtime)
+      update_size = false;
+  }
+
+  bool new_ranges = check_client_ranges(in, std::max(new_max_size, size));
+  if (!update_size && !new_ranges) {
+    dout(20) << "check_inode_max_size no-op on " << *in << dendl;
+    return false;
+  }
+
+  dout(10) << "check_inode_max_size new_ranges " << new_ranges
+	   << " update_size " << update_size
+	   << " on " << *in << dendl;
+
+  if (in->is_frozen()) {
+    dout(10) << "check_inode_max_size frozen, waiting on " << *in << dendl;
+    in->add_waiter(CInode::WAIT_UNFREEZE,
+		   new C_MDL_CheckMaxSize(this, in, new_max_size, new_size, new_mtime));
+    return false;
+  } else if (!force_wrlock && !in->filelock.can_wrlock(in->get_loner())) {
+    // lock?
+    // a stable filelock is first moved towards excl (when there is a target
+    // loner) or lock state; if it still cannot be wrlocked, wait for it.
+    if (in->filelock.is_stable()) {
+      if (in->get_target_loner() >= 0)
+	file_excl(&in->filelock);
+      else
+	simple_lock(&in->filelock);
+    }
+    if (!in->filelock.can_wrlock(in->get_loner())) {
+      dout(10) << "check_inode_max_size can't wrlock, waiting on " << *in << dendl;
+      in->filelock.add_waiter(SimpleLock::WAIT_STABLE,
+			      new C_MDL_CheckMaxSize(this, in, new_max_size, new_size, new_mtime));
+      return false;
+    }
+  }
+
+  MutationRef mut(new MutationImpl());
+  mut->ls = mds->mdlog->get_current_segment();
+    
+  auto pi = in->project_inode(mut);
+  pi.inode->version = in->pre_dirty();
+
+  bool max_increased = false;
+  if (new_ranges &&
+      calc_new_client_ranges(in, std::max(new_max_size, size), &max_increased)) {
+    dout(10) << "check_inode_max_size client_ranges "
+	     << in->get_previous_projected_inode()->client_ranges
+	     <<  " -> " << pi.inode->client_ranges << dendl;
+  }
+
+  if (update_size) {
+    dout(10) << "check_inode_max_size size " << pi.inode->size << " -> " << new_size << dendl;
+    pi.inode->size = new_size;
+    pi.inode->rstat.rbytes = new_size;
+    dout(10) << "check_inode_max_size mtime " << pi.inode->mtime << " -> " << new_mtime << dendl;
+    pi.inode->mtime = new_mtime;
+    // ctime and rstat.rctime are only pulled forward, never back
+    if (new_mtime > pi.inode->ctime) {
+      pi.inode->ctime = new_mtime;
+      if (new_mtime > pi.inode->rstat.rctime)
+	pi.inode->rstat.rctime = new_mtime;
+    }
+  }
+
+  // use EOpen if the file is still open; otherwise, use EUpdate.
+  // this is just an optimization to push open files forward into
+  // newer log segments.
+  LogEvent *le;
+  EMetaBlob *metablob;
+  if (in->is_any_caps_wanted() && in->last == CEPH_NOSNAP) {   
+    EOpen *eo = new EOpen(mds->mdlog);
+    eo->add_ino(in->ino());
+    metablob = &eo->metablob;
+    le = eo;
+  } else {
+    EUpdate *eu = new EUpdate(mds->mdlog, "check_inode_max_size");
+    metablob = &eu->metablob;
+    le = eu;
+  }
+  mds->mdlog->start_entry(le);
+
+  mdcache->predirty_journal_parents(mut, metablob, in, 0, PREDIRTY_PRIMARY);
+  // no cow, here!
+  CDentry *parent = in->get_projected_parent_dn();
+  metablob->add_primary_dentry(parent, in, true);
+  mdcache->journal_dirty_inode(mut.get(), metablob, in);
+
+  // the completion (C_Locker_FileUpdate_finish) is created with
+  // UPDATE_SHAREMAX and no client message/ack attached
+  mds->mdlog->submit_entry(le, new C_Locker_FileUpdate_finish(this, in, mut,
+      UPDATE_SHAREMAX, ref_t<MClientCaps>()));
+  wrlock_force(&in->filelock, mut);  // wrlock for duration of journal
+  mut->auth_pin(in);
+
+  // make max_size _increase_ timely
+  if (max_increased)
+    mds->mdlog->flush();
+
+  return true;
+}
+
+
+/**
+ * Share the inode's current max_size with its writers by sending them a
+ * CEPH_CAP_OP_GRANT message carrying their pending caps.
+ *
+ * Only caps that are not suppressed and have CEPH_CAP_FILE_WR or
+ * CEPH_CAP_FILE_BUFFER pending are notified: if a client doesn't have a WR
+ * cap, file_max doesn't matter to it, and it will get the value if/when it
+ * gets the cap later.
+ *
+ * @param in        inode whose max_size is shared
+ * @param only_cap  if non-null, only the cap of this cap's client is
+ *                  considered; no other client is ever notified, even when
+ *                  that cap is suppressed
+ */
+void Locker::share_inode_max_size(CInode *in, Capability *only_cap)
+{
+  dout(10) << "share_inode_max_size on " << *in << dendl;
+  map<client_t, Capability>::iterator it;
+  if (only_cap)
+    it = in->client_caps.find(only_cap->get_client());
+  else
+    it = in->client_caps.begin();
+  for (; it != in->client_caps.end(); ++it) {
+    const client_t client = it->first;
+    Capability *cap = &it->second;
+    // The suppress check is folded into the condition instead of using
+    // 'continue': a 'continue' would jump straight to '++it' and skip the
+    // only_cap break below, letting the loop run on into other clients.
+    if (!cap->is_suppress() &&
+        (cap->pending() & (CEPH_CAP_FILE_WR|CEPH_CAP_FILE_BUFFER))) {
+      dout(10) << "share_inode_max_size with client." << client << dendl;
+      if (mds->logger) mds->logger->inc(l_mdss_ceph_cap_op_grant);
+      cap->inc_last_seq();
+      auto m = make_message<MClientCaps>(CEPH_CAP_OP_GRANT,
+                                         in->ino(),
+                                         in->find_snaprealm()->inode->ino(),
+                                         cap->get_cap_id(),
+                                         cap->get_last_seq(),
+                                         cap->pending(),
+                                         cap->wanted(), 0,
+                                         cap->get_mseq(),
+                                         mds->get_osd_epoch_barrier());
+      in->encode_cap_message(m, cap);
+      mds->send_message_client_counted(m, client);
+    }
+    if (only_cap)
+      break;
+  }
+}
+
+/**
+ * Decide whether the MDS log should be flushed because a client wants caps
+ * whose corresponding lock is locked by pending mutations.
+ *
+ * The file, auth, link and xattr cap groups are checked in that order
+ * against filelock, authlock, linklock and xattrlock respectively.
+ *
+ * @param in              inode the caps refer to
+ * @param wanted          caps wanted by the client
+ * @param lock_state_any  if true a lock counts as soon as it is locked;
+ *                        otherwise it must be both unstable and locked
+ * @return true if any wanted cap group has its lock in that condition
+ */
+bool Locker::_need_flush_mdlog(CInode *in, int wanted, bool lock_state_any)
+{
+  auto blocks = [lock_state_any](SimpleLock *lock) -> bool {
+    if (lock_state_any)
+      return lock->is_locked();
+    return lock->is_unstable_and_locked();
+  };
+
+  const int file_caps =
+    CEPH_CAP_FILE_RD|CEPH_CAP_FILE_WR|CEPH_CAP_FILE_SHARED|CEPH_CAP_FILE_EXCL;
+  const int auth_caps = CEPH_CAP_AUTH_SHARED|CEPH_CAP_AUTH_EXCL;
+  const int link_caps = CEPH_CAP_LINK_SHARED|CEPH_CAP_LINK_EXCL;
+  const int xattr_caps = CEPH_CAP_XATTR_SHARED|CEPH_CAP_XATTR_EXCL;
+
+  if ((wanted & file_caps) && blocks(&in->filelock))
+    return true;
+  if ((wanted & auth_caps) && blocks(&in->authlock))
+    return true;
+  if ((wanted & link_caps) && blocks(&in->linklock))
+    return true;
+  return (wanted & xattr_caps) && blocks(&in->xattrlock);
+}
+
+/**
+ * Update the wanted mask of a cap from a client-supplied value.
+ *
+ * If @p issue_seq equals the cap's last issue seq the wanted mask is
+ * replaced.  On a mismatch, bits are only ever added; if there is nothing
+ * to add the cap is left untouched and the function returns.
+ *
+ * After a change: on a non-auth inode the wanted caps are forwarded via
+ * request_inode_file_caps(); on an auth inode with a non-empty wanted mask,
+ * file recovery may be prioritized and an EOpen may be journaled.
+ *
+ * @param cap        cap to adjust
+ * @param wanted     wanted mask sent by the client
+ * @param issue_seq  issue seq the client's value refers to
+ */
+void Locker::adjust_cap_wanted(Capability *cap, int wanted, int issue_seq)
+{
+  const bool seq_matches = ceph_seq_cmp(issue_seq, cap->get_last_issue()) == 0;
+
+  if (!seq_matches && !(wanted & ~cap->wanted())) {
+    dout(10) << " NOT changing wanted " << ccap_string(cap->wanted())
+             << " -> " << ccap_string(wanted)
+             << " (issue_seq " << issue_seq << " != last_issue "
+             << cap->get_last_issue() << ")" << dendl;
+    return;
+  }
+
+  if (seq_matches) {
+    dout(10) << " wanted " << ccap_string(cap->wanted())
+             << " -> " << ccap_string(wanted) << dendl;
+    cap->set_wanted(wanted);
+  } else {
+    dout(10) << " wanted " << ccap_string(cap->wanted())
+             << " -> " << ccap_string(wanted)
+             << " (added caps even though we had seq mismatch!)" << dendl;
+    cap->set_wanted(wanted | cap->wanted());
+  }
+
+  CInode *cur = cap->get_inode();
+  if (!cur->is_auth()) {
+    request_inode_file_caps(cur);
+    return;
+  }
+
+  if (!cap->wanted())
+    return;
+
+  // a client that wants to read or write a file under recovery moves that
+  // file forward in the recovery queue
+  const bool wants_file_io = cap->wanted() & (CEPH_CAP_FILE_RD | CEPH_CAP_FILE_WR);
+  if (cur->state_test(CInode::STATE_RECOVERING) && wants_file_io) {
+    mds->mdcache->recovery_queue.prioritize(cur);
+  }
+
+  if (!mdcache->open_file_table.should_log_open(cur))
+    return;
+
+  ceph_assert(cur->last == CEPH_NOSNAP);
+  EOpen *le = new EOpen(mds->mdlog);
+  mds->mdlog->start_entry(le);
+  le->add_clean_inode(cur);
+  mds->mdlog->submit_entry(le);
+}
+
+/**
+ * Nudge the head inode's locks for a snapped inode that still has client
+ * snap caps.
+ *
+ * Picks a head-inode lock that is stable and not in LOCK_SYNC (the file
+ * lock first, then the locks listed in cinode_lock_info) and kicks it with
+ * _rdlock_kick().  If no such lock exists, @p in is put back on
+ * need_snapflush_inodes.
+ *
+ * @param in  snapped inode; asserted not to be a head inode
+ */
+void Locker::snapflush_nudge(CInode *in)
+{
+  ceph_assert(in->last != CEPH_NOSNAP);
+  if (in->client_snap_caps.empty())
+    return;
+
+  // head inode gets unpinned when snapflush starts. It might get trimmed
+  // before snapflush finishes.
+  CInode *head = mdcache->get_inode(in->ino());
+  if (head == nullptr)
+    return;
+
+  ceph_assert(head->is_auth());
+  if (head->client_need_snapflush.empty())
+    return;
+
+  auto kickable = [](SimpleLock *lock) {
+    return lock->get_state() != LOCK_SYNC && lock->is_stable();
+  };
+
+  SimpleLock *hlock = head->get_lock(CEPH_LOCK_IFILE);
+  if (!kickable(hlock)) {
+    hlock = nullptr;
+    for (int i = 0; i < num_cinode_locks; ++i) {
+      SimpleLock *candidate = head->get_lock(cinode_lock_info[i].lock);
+      if (kickable(candidate)) {
+        hlock = candidate;
+        break;
+      }
+    }
+  }
+
+  if (hlock != nullptr) {
+    _rdlock_kick(hlock, true);
+    return;
+  }
+
+  // also, requeue, in case of unstable lock
+  need_snapflush_inodes.push_back(&in->item_caps);
+}
+
+/**
+ * Queue a snapped inode on need_snapflush_inodes unless its item_caps is
+ * already on a list, recording the current time in in->last_dirstat_prop.
+ *
+ * @param in  snapped inode; asserted not to be a head inode
+ */
+void Locker::mark_need_snapflush_inode(CInode *in)
+{
+  ceph_assert(in->last != CEPH_NOSNAP);
+
+  if (in->item_caps.is_on_list())
+    return;
+
+  need_snapflush_inodes.push_back(&in->item_caps);
+
+  utime_t queued_at = ceph_clock_now();
+  in->last_dirstat_prop = queued_at;
+  dout(10) << "mark_need_snapflush_inode " << *in << " - added at " << queued_at << dendl;
+}
+
+/**
+ * Tell whether revoking_caps_by_client holds a non-empty entry for @p client.
+ *
+ * @param client  client to look up
+ * @return true if the client has an entry and that entry is not empty
+ */
+bool Locker::is_revoking_any_caps_from(client_t client)
+{
+  const auto entry = revoking_caps_by_client.find(client);
+  return entry != revoking_caps_by_client.end() && !entry->second.empty();
+}
+
+/**
+ * Perform empty ("null") snapflushes on behalf of a client.
+ *
+ * For every entry of head_in->client_need_snapflush whose snapid is below
+ * @p last and which lists @p client, run _do_snap_update() with no dirty
+ * caps on the matching snapped inode and remove the client from that entry.
+ *
+ * @param head_in  head inode carrying client_need_snapflush
+ * @param client   client the flushes are done for
+ * @param last     only snapids strictly below this value are handled
+ */
+void Locker::_do_null_snapflush(CInode *head_in, client_t client, snapid_t last)
+{
+  dout(10) << "_do_null_snapflush client." << client << " on " << *head_in << dendl;
+
+  auto it = head_in->client_need_snapflush.begin();
+  while (it != head_in->client_need_snapflush.end() && it->first < last) {
+    snapid_t snapid = it->first;
+    auto &waiting = it->second;
+    // step past this entry before remove_need_snapflush() below is called
+    // for its snapid
+    ++it;
+
+    if (!waiting.count(client))
+      continue;
+
+    dout(10) << " doing async NULL snapflush on " << snapid << " from client." << client << dendl;
+    CInode *sin = mdcache->pick_inode_snap(head_in, snapid - 1);
+    ceph_assert(sin);
+    ceph_assert(sin->first <= snapid);
+    _do_snap_update(sin, snapid, 0, sin->first - 1, client, ref_t<MClientCaps>(), ref_t<MClientCaps>());
+    head_in->remove_need_snapflush(sin, snapid, client);
+  }
+}
+
+
+/**
+ * Decide whether handling of a client cap message must wait because the
+ * inode is frozen or about to be frozen.
+ *
+ * This policy needs to be AT LEAST as permissive as allowing a client
+ * request to go forward.  Otherwise a client request can release something,
+ * the release gets deferred, but the request gets processed and deadlocks
+ * because the caps can't get revoked.
+ *
+ * No auth_pin implies that there is no unstable lock and @p in is not auth
+ * pinned by a client request.  If the parent dirfrag is auth pinned by a
+ * lock cache, a later request from the lock cache owner may forcibly auth
+ * pin @p in.
+ *
+ * @param in  inode the cap message refers to
+ * @return true if the message should be deferred
+ */
+bool Locker::should_defer_client_cap_frozen(CInode *in)
+{
+  if (in->is_frozen())
+    return true;
+
+  // only a freezing inode without any auth pin can still be deferred
+  if (!in->is_freezing() || in->get_num_auth_pins() != 0)
+    return false;
+
+  CDir *dir = in->get_parent_dir();
+  return !dir || !dir->is_auth_pinned_by_lock_cache();
+}
+
+void Locker::handle_client_caps(const cref_t<MClientCaps> &m)
+{
+  client_t client = m->get_source().num();
+  snapid_t follows = m->get_snap_follows();
+  auto op = m->get_op();
+  auto dirty = m->get_dirty();
+  dout(7) << "handle_client_caps "
+	  << " on " << m->get_ino()
+	  << " tid " << m->get_client_tid() << " follows " << follows
+	  << " op " << ceph_cap_op_name(op)
+	  << " flags 0x" << std::hex << m->flags << std::dec << dendl;
+
+  Session *session = mds->get_session(m);
+  if (!mds->is_clientreplay() && !mds->is_active() && !mds->is_stopping()) {
+    if (!session) {
+      dout(5) << " no session, dropping " << *m << dendl;
+      return;
+    }
+    if (session->is_closed() ||
+	session->is_closing() ||
+	session->is_killing()) {
+      dout(7) << " session closed|closing|killing, dropping " << *m << dendl;
+      return;
+    }
+    if ((mds->is_reconnect() || mds->get_want_state() == MDSMap::STATE_RECONNECT) &&
+	dirty && m->get_client_tid() > 0 &&
+	!session->have_completed_flush(m->get_client_tid())) {
+      mdcache->set_reconnected_dirty_caps(client, m->get_ino(), dirty,
+					  op == CEPH_CAP_OP_FLUSHSNAP);
+    }
+    mds->wait_for_replay(new C_MDS_RetryMessage(mds, m));
+    return;
+  }
+
+  if (mds->logger) mds->logger->inc(l_mdss_handle_client_caps);
+  if (dirty) {
+      if (mds->logger) mds->logger->inc(l_mdss_handle_client_caps_dirty);
+  }
+
+  if (m->get_client_tid() > 0 && session &&
+      session->have_completed_flush(m->get_client_tid())) {
+    dout(7) << "handle_client_caps already flushed tid " << m->get_client_tid()
+	    << " for client." << client << dendl;
+    ref_t<MClientCaps> ack;
+    if (op == CEPH_CAP_OP_FLUSHSNAP) {
+      if (mds->logger) mds->logger->inc(l_mdss_ceph_cap_op_flushsnap_ack);
+      ack = make_message<MClientCaps>(CEPH_CAP_OP_FLUSHSNAP_ACK, m->get_ino(), 0, 0, 0, 0, 0, dirty, 0, mds->get_osd_epoch_barrier());
+    } else {
+      if (mds->logger) mds->logger->inc(l_mdss_ceph_cap_op_flush_ack);
+      ack = make_message<MClientCaps>(CEPH_CAP_OP_FLUSH_ACK, m->get_ino(), 0, m->get_cap_id(), m->get_seq(), m->get_caps(), 0, dirty, 0, mds->get_osd_epoch_barrier());
+    }
+    ack->set_snap_follows(follows);
+    ack->set_client_tid(m->get_client_tid());
+    mds->send_message_client_counted(ack, m->get_connection());
+    if (op == CEPH_CAP_OP_FLUSHSNAP) {
+      return;
+    } else {
+      // fall-thru because the message may release some caps
+      dirty = false;
+      op = CEPH_CAP_OP_UPDATE;
+    }
+  }
+
+  // "oldest flush tid" > 0 means client uses unique TID for each flush
+  if (m->get_oldest_flush_tid() > 0 && session) {
+    if (session->trim_completed_flushes(m->get_oldest_flush_tid())) {
+      mds->mdlog->get_current_segment()->touched_sessions.insert(session->info.inst.name);
+
+      if (session->get_num_trim_flushes_warnings() > 0 &&
+	  session->get_num_completed_flushes() * 2 < g_conf()->mds_max_completed_flushes)
+	session->reset_num_trim_flushes_warnings();
+    } else {
+      if (session->get_num_completed_flushes() >=
+	  (g_conf()->mds_max_completed_flushes << session->get_num_trim_flushes_warnings())) {
+	session->inc_num_trim_flushes_warnings();
+	CachedStackStringStream css;
+	*css << "client." << session->get_client() << " does not advance its oldest_flush_tid ("
+	     << m->get_oldest_flush_tid() << "), "
+	     << session->get_num_completed_flushes()
+	     << " completed flushes recorded in session";
+	mds->clog->warn() << css->strv();
+	dout(20) << __func__ << " " << css->strv() << dendl;
+      }
+    }
+  }
+
+  CInode *head_in = mdcache->get_inode(m->get_ino());
+  if (!head_in) {
+    if (mds->is_clientreplay()) {
+      dout(7) << "handle_client_caps on unknown ino " << m->get_ino()
+	<< ", will try again after replayed client requests" << dendl;
+      mdcache->wait_replay_cap_reconnect(m->get_ino(), new C_MDS_RetryMessage(mds, m));
+      return;
+    }
+
+    /*
+     * "handle_client_caps on unknown ino xxx” is normal after migrating a subtree
+     * Sequence of events that cause this are:
+     *   - client sends caps message to mds.a
+     *   - mds finishes subtree migration, send cap export to client
+     *   - mds trim its cache
+     *   - mds receives cap messages from client
+     */
+    dout(7) << "handle_client_caps on unknown ino " << m->get_ino() << ", dropping" << dendl;
+    return;
+  }
+
+  if (m->osd_epoch_barrier && !mds->objecter->have_map(m->osd_epoch_barrier)) {
+    // Pause RADOS operations until we see the required epoch
+    mds->objecter->set_epoch_barrier(m->osd_epoch_barrier);
+  }
+
+  if (mds->get_osd_epoch_barrier() < m->osd_epoch_barrier) {
+    // Record the barrier so that we will retransmit it to clients
+    mds->set_osd_epoch_barrier(m->osd_epoch_barrier);
+  }
+
+  dout(10) << " head inode " << *head_in << dendl;
+
+  Capability *cap = 0;
+  cap = head_in->get_client_cap(client);
+  if (!cap) {
+    dout(7) << "handle_client_caps no cap for client." << client << " on " << *head_in << dendl;
+    return;
+  }  
+  ceph_assert(cap);
+
+  // freezing|frozen?
+  if (should_defer_client_cap_frozen(head_in)) {
+    dout(7) << "handle_client_caps freezing|frozen on " << *head_in << dendl;
+    head_in->add_waiter(CInode::WAIT_UNFREEZE, new C_MDS_RetryMessage(mds, m));
+    return;
+  }
+  if (ceph_seq_cmp(m->get_mseq(), cap->get_mseq()) < 0) {
+    dout(7) << "handle_client_caps mseq " << m->get_mseq() << " < " << cap->get_mseq()
+	    << ", dropping" << dendl;
+    return;
+  }
+
+  bool need_unpin = false;
+
+  // flushsnap?
+  if (op == CEPH_CAP_OP_FLUSHSNAP) {
+    if (!head_in->is_auth()) {
+      dout(7) << " not auth, ignoring flushsnap on " << *head_in << dendl;
+      goto out;
+    }
+
+    SnapRealm *realm = head_in->find_snaprealm();
+    snapid_t snap = realm->get_snap_following(follows);
+    dout(10) << "  flushsnap follows " << follows << " -> snap " << snap << dendl;
+
+    auto p = head_in->client_need_snapflush.begin();
+    if (p != head_in->client_need_snapflush.end() && p->first < snap) {
+      head_in->auth_pin(this); // prevent subtree frozen
+      need_unpin = true;
+      _do_null_snapflush(head_in, client, snap);
+    }
+
+    CInode *in = head_in;
+    if (snap != CEPH_NOSNAP) {
+      in = mdcache->pick_inode_snap(head_in, snap - 1);
+      if (in != head_in)
+	dout(10) << " snapped inode " << *in << dendl;
+    }
+
+    // we can prepare the ack now, since this FLUSHEDSNAP is independent of any
+    // other cap ops.  (except possibly duplicate FLUSHSNAP requests, but worst
+    // case we get a dup response, so whatever.)
+    ref_t<MClientCaps> ack;
+    if (dirty) {
+      ack = make_message<MClientCaps>(CEPH_CAP_OP_FLUSHSNAP_ACK, in->ino(), 0, 0, 0, 0, 0, dirty, 0, mds->get_osd_epoch_barrier());
+      ack->set_snap_follows(follows);
+      ack->set_client_tid(m->get_client_tid());
+      ack->set_oldest_flush_tid(m->get_oldest_flush_tid());
+    }
+
+    if (in == head_in ||
+	(head_in->client_need_snapflush.count(snap) &&
+	 head_in->client_need_snapflush[snap].count(client))) {
+      dout(7) << " flushsnap snap " << snap
+	      << " client." << client << " on " << *in << dendl;
+
+      // this cap now follows a later snap (i.e. the one initiating this flush, or later)
+      if (in == head_in)
+	cap->client_follows = snap < CEPH_NOSNAP ? snap : realm->get_newest_seq();
+   
+      _do_snap_update(in, snap, dirty, follows, client, m, ack);
+
+      if (in != head_in)
+	head_in->remove_need_snapflush(in, snap, client);
+    } else {
+      dout(7) << " not expecting flushsnap " << snap << " from client." << client << " on " << *in << dendl;
+      if (ack) {
+        if (mds->logger) mds->logger->inc(l_mdss_ceph_cap_op_flushsnap_ack);
+	mds->send_message_client_counted(ack, m->get_connection());
+      }
+    }
+    goto out;
+  }
+
+  if (cap->get_cap_id() != m->get_cap_id()) {
+    dout(7) << " ignoring client capid " << m->get_cap_id() << " != my " << cap->get_cap_id() << dendl;
+  } else {
+    CInode *in = head_in;
+    if (follows > 0) {
+      in = mdcache->pick_inode_snap(head_in, follows);
+      // intermediate snap inodes
+      while (in != head_in) {
+	ceph_assert(in->last != CEPH_NOSNAP);
+	if (in->is_auth() && dirty) {
+	  dout(10) << " updating intermediate snapped inode " << *in << dendl;
+	  _do_cap_update(in, NULL, dirty, follows, m, ref_t<MClientCaps>());
+	}
+	in = mdcache->pick_inode_snap(head_in, in->last);
+      }
+    }
+ 
+    // head inode, and cap
+    ref_t<MClientCaps> ack;
+
+    int caps = m->get_caps();
+    if (caps & ~cap->issued()) {
+      dout(10) << " confirming not issued caps " << ccap_string(caps & ~cap->issued()) << dendl;
+      caps &= cap->issued();
+    }
+    
+    int revoked = cap->confirm_receipt(m->get_seq(), caps);
+    dout(10) << " follows " << follows
+	     << " retains " << ccap_string(m->get_caps())
+	     << " dirty " << ccap_string(dirty)
+	     << " on " << *in << dendl;
+
+    if (revoked & CEPH_CAP_ANY_DIR_OPS)
+      eval_lock_caches(cap);
+
+    // missing/skipped snapflush?
+    //  The client MAY send a snapflush if it is issued WR/EXCL caps, but
+    //  presently only does so when it has actual dirty metadata.  But, we
+    //  set up the need_snapflush stuff based on the issued caps.
+    //  We can infer that the client WONT send a FLUSHSNAP once they have
+    //  released all WR/EXCL caps (the FLUSHSNAP always comes before the cap
+    //  update/release).
+    if (!head_in->client_need_snapflush.empty()) {
+      if (!(cap->issued() & CEPH_CAP_ANY_FILE_WR) &&
+	  !(m->flags & MClientCaps::FLAG_PENDING_CAPSNAP)) {
+	head_in->auth_pin(this); // prevent subtree frozen
+	need_unpin = true;
+	_do_null_snapflush(head_in, client);
+      } else {
+	dout(10) << " revocation in progress, not making any conclusions about null snapflushes" << dendl;
+      }
+    }
+    if (cap->need_snapflush() && !(m->flags & MClientCaps::FLAG_PENDING_CAPSNAP))
+      cap->clear_needsnapflush();
+
+    if (dirty && in->is_auth()) {
+      dout(7) << " flush client." << client << " dirty " << ccap_string(dirty)
+	      << " seq " << m->get_seq() << " on " << *in << dendl;
+      ack = make_message<MClientCaps>(CEPH_CAP_OP_FLUSH_ACK, in->ino(), 0, cap->get_cap_id(), m->get_seq(),
+          m->get_caps(), 0, dirty, 0, mds->get_osd_epoch_barrier());
+      ack->set_client_tid(m->get_client_tid());
+      ack->set_oldest_flush_tid(m->get_oldest_flush_tid());
+    }
+
+    // filter wanted based on what we could ever give out (given auth/replica status)
+    bool need_flush = m->flags & MClientCaps::FLAG_SYNC;
+    int new_wanted = m->get_wanted();
+    if (new_wanted != cap->wanted()) {
+      if (!need_flush && in->is_auth() && (new_wanted & ~cap->pending())) {
+	// exapnding caps.  make sure we aren't waiting for a log flush
+	need_flush = _need_flush_mdlog(head_in, new_wanted & ~cap->pending());
+      }
+
+      adjust_cap_wanted(cap, new_wanted, m->get_issue_seq());
+    }
+
+    if (in->is_auth() &&
+	_do_cap_update(in, cap, dirty, follows, m, ack, &need_flush)) {
+      // updated
+      eval(in, CEPH_CAP_LOCKS);
+
+      if (!need_flush && (cap->wanted() & ~cap->pending()))
+	need_flush = _need_flush_mdlog(in, cap->wanted() & ~cap->pending());
+    } else {
+      // no update, ack now.
+      if (ack) {
+        if (mds->logger) mds->logger->inc(l_mdss_ceph_cap_op_flush_ack);
+	mds->send_message_client_counted(ack, m->get_connection());
+      }
+      
+      bool did_issue = eval(in, CEPH_CAP_LOCKS);
+      if (!did_issue && (cap->wanted() & ~cap->pending()))
+	issue_caps(in, cap);
+
+      if (cap->get_last_seq() == 0 &&
+	  (cap->pending() & (CEPH_CAP_FILE_WR|CEPH_CAP_FILE_BUFFER))) {
+	share_inode_max_size(in, cap);
+      }
+    }
+
+    if (need_flush)
+      mds->mdlog->flush();
+  }
+
+ out:
+  if (need_unpin)
+    head_in->auth_unpin(this);
+}
+
+
+/**
+ * Waiter context that re-runs Locker::process_request_cap_release() for a
+ * release record whose processing had to be postponed.  The retry is made
+ * with a null MDRequestRef and an empty dentry name.
+ */
+class C_Locker_RetryRequestCapRelease : public LockerContext {
+  client_t client;
+  ceph_mds_request_release item;  // private copy of the release record
+
+public:
+  C_Locker_RetryRequestCapRelease(Locker *locker_, client_t client_,
+				  const ceph_mds_request_release& release)
+    : LockerContext(locker_),
+      client(client_),
+      item(release)
+  {}
+
+  void finish(int r) override {
+    MDRequestRef no_request;
+    string empty_dname;
+    locker->process_request_cap_release(no_request, client, item, empty_dname);
+  }
+};
+
+/**
+ * Handle one cap release record that arrived embedded in a client request.
+ *
+ * When a dentry name is supplied, the client's lease on that dentry (if the
+ * dirfrag, dentry and lease all exist) is removed first.  Then the released
+ * caps are confirmed on the client's Capability, its wanted mask is adjusted
+ * and the inode's cap locks are re-evaluated.  Records with an old mseq or a
+ * mismatching cap id are dropped; if should_defer_client_cap_frozen() says
+ * so, a WAIT_UNFREEZE waiter is queued to retry later.
+ *
+ * @param mdr    request the release came with; null on a deferred retry
+ * @param client client that owns the cap
+ * @param item   release record (ino, cap id, caps, wanted, sequence numbers)
+ * @param dname  optional dentry name whose client lease is released too
+ */
+void Locker::process_request_cap_release(MDRequestRef& mdr, client_t client, const ceph_mds_request_release& item,
+					 std::string_view dname)
+{
+  // unpack the record into plain locals
+  inodeno_t ino = (uint64_t)item.ino;
+  uint64_t cap_id = item.cap_id;
+  int caps = item.caps;
+  int wanted = item.wanted;
+  int seq = item.seq;
+  int issue_seq = item.issue_seq;
+  int mseq = item.mseq;
+
+  CInode *in = mdcache->get_inode(ino);
+  if (!in)
+    return;
+
+  // optional dentry lease release
+  CDir *dir = nullptr;
+  if (dname.length())
+    dir = in->get_dirfrag(in->pick_dirfrag(dname));
+  if (dir) {
+    CDentry *dn = dir->lookup(dname);
+    ClientLease *l = dn ? dn->get_client_lease(client) : nullptr;
+    if (!dn) {
+      dout(7) << __func__ << " client." << client << " released lease on dn "
+	      << dir->dirfrag() << "/" << dname << " which dne" << dendl;
+    } else if (!l) {
+      dout(7) << __func__ << " client." << client
+	      << " doesn't have lease on " << *dn << dendl;
+    } else {
+      dout(10) << __func__ << " removing lease on " << *dn << dendl;
+      dn->remove_client_lease(l, this);
+    }
+  }
+
+  Capability *cap = in->get_client_cap(client);
+  if (!cap)
+    return;
+
+  dout(10) << __func__ << " client." << client << " " << ccap_string(caps) << " on " << *in
+	   << (mdr ? "" : " (DEFERRED, no mdr)")
+	   << dendl;
+
+  // stale record checks
+  if (ceph_seq_cmp(mseq, cap->get_mseq()) < 0) {
+    dout(7) << " mseq " << mseq << " < " << cap->get_mseq() << ", dropping" << dendl;
+    return;
+  }
+  if (cap->get_cap_id() != cap_id) {
+    dout(7) << " cap_id " << cap_id << " != " << cap->get_cap_id() << ", dropping" << dendl;
+    return;
+  }
+
+  if (should_defer_client_cap_frozen(in)) {
+    dout(7) << " frozen, deferring" << dendl;
+    auto *retry = new C_Locker_RetryRequestCapRelease(this, client, item);
+    in->add_waiter(CInode::WAIT_UNFREEZE, retry);
+    return;
+  }
+
+  if (mds->logger)
+    mds->logger->inc(l_mdss_process_request_cap_release);
+
+  // never confirm bits the client was not actually issued
+  if (caps & ~cap->issued()) {
+    dout(10) << " confirming not issued caps " << ccap_string(caps & ~cap->issued()) << dendl;
+    caps &= cap->issued();
+  }
+
+  int revoked = cap->confirm_receipt(seq, caps);
+  if (revoked & CEPH_CAP_ANY_DIR_OPS)
+    eval_lock_caches(cap);
+
+  if (!in->client_need_snapflush.empty() &&
+      !(cap->issued() & CEPH_CAP_ANY_FILE_WR)) {
+    _do_null_snapflush(in, client);
+  }
+
+  adjust_cap_wanted(cap, wanted, issue_seq);
+
+  if (!mdr) {
+    // deferred retry: nothing to suppress or remember
+    eval(in, CEPH_CAP_LOCKS);
+    return;
+  }
+
+  cap->inc_suppress();
+  eval(in, CEPH_CAP_LOCKS);
+  cap->dec_suppress();
+
+  // take note; we may need to reissue on this cap later
+  mdr->cap_releases[in->vino()] = cap->get_last_seq();
+}
+
+/**
+ * Waiter context that calls Locker::kick_issue_caps() again.  It takes a
+ * PIN_PTRWAITER reference on the inode at construction and drops it after
+ * the retry has run.
+ */
+class C_Locker_RetryKickIssueCaps : public LockerContext {
+  CInode *in;
+  client_t client;
+  ceph_seq_t seq;
+
+public:
+  C_Locker_RetryKickIssueCaps(Locker *locker_, CInode *inode, client_t who, ceph_seq_t last_seq)
+    : LockerContext(locker_),
+      in(inode),
+      client(who),
+      seq(last_seq)
+  {
+    in->get(CInode::PIN_PTRWAITER);
+  }
+
+  void finish(int r) override {
+    locker->kick_issue_caps(in, client, seq);
+    in->put(CInode::PIN_PTRWAITER);
+  }
+};
+
+/**
+ * Reissue caps to a client if its cap on `in` is still at sequence `seq`.
+ * Does nothing when the client has no cap or the cap's last seq differs.
+ * If the inode is frozen, a waiter is queued to retry after unfreeze.
+ */
+void Locker::kick_issue_caps(CInode *in, client_t client, ceph_seq_t seq)
+{
+  Capability *cap = in->get_client_cap(client);
+  if (!cap)
+    return;
+  if (cap->get_last_seq() != seq)
+    return;
+
+  if (!in->is_frozen()) {
+    dout(10) << "kick_issue_caps released at current seq " << seq
+      << ", reissuing" << dendl;
+    issue_caps(in, cap);
+    return;
+  }
+
+  dout(10) << "kick_issue_caps waiting for unfreeze on " << *in << dendl;
+  auto *retry = new C_Locker_RetryKickIssueCaps(this, in, client, seq);
+  in->add_waiter(CInode::WAIT_UNFREEZE, retry);
+}
+
+/**
+ * Walk the cap releases recorded on `mdr` and call kick_issue_caps() for
+ * every inode that is still in the cache.
+ */
+void Locker::kick_cap_releases(MDRequestRef& mdr)
+{
+  client_t client = mdr->get_client();
+  for (const auto& rel : mdr->cap_releases) {
+    // rel.first is the vinodeno_t, rel.second the seq noted at release time
+    if (CInode *in = mdcache->get_inode(rel.first))
+      kick_issue_caps(in, client, rel.second);
+  }
+}
+
+/**
+ * Record a client's snap flush against inode `in` for snapshot `snap`.
+ *
+ * If snap == CEPH_NOSNAP nothing is journaled; the ack (if any) is sent
+ * straight back on m's connection.  Otherwise an EUpdate ("snap flush") is
+ * started, the dirty fields carried by m are applied either to the matching
+ * old_inode entry (when pick_old_inode(snap) finds one) or to a freshly
+ * projected inode, this client's client_ranges entry is erased or re-pointed
+ * at snap, and the entry is submitted with a C_Locker_FileUpdate_finish
+ * completion flagged UPDATE_SNAPFLUSH that carries the ack.
+ *
+ * m and ack might be NULL, so don't dereference them unless dirty != 0
+ * NOTE(review): the code below also dereferences m whenever ack is non-null
+ * (m->get_connection(), m->get_source()) -- presumably callers only build an
+ * ack when dirty != 0; confirm against the call sites.
+ */
+void Locker::_do_snap_update(CInode *in, snapid_t snap, int dirty, snapid_t follows, client_t client, const cref_t<MClientCaps> &m, const ref_t<MClientCaps> &ack)
+{
+  dout(10) << "_do_snap_update dirty " << ccap_string(dirty)
+	   << " follows " << follows << " snap " << snap
+	   << " on " << *in << dendl;
+
+  if (snap == CEPH_NOSNAP) {
+    // hmm, i guess snap was already deleted?  just ack!
+    dout(10) << " wow, the snap following " << follows
+	     << " was already deleted.  nothing to record, just ack." << dendl;
+    if (ack) {
+      if (ack->get_op() == CEPH_CAP_OP_FLUSHSNAP_ACK) {
+          if (mds->logger) mds->logger->inc(l_mdss_ceph_cap_op_flushsnap_ack);
+      }
+      mds->send_message_client_counted(ack, m->get_connection());
+    }
+    return;  // no journal entry in this case
+  }
+
+  EUpdate *le = new EUpdate(mds->mdlog, "snap flush");
+  mds->mdlog->start_entry(le);
+  MutationRef mut = new MutationImpl();
+  mut->ls = mds->mdlog->get_current_segment();
+
+  // normal metadata updates that we can apply to the head as well.
+
+  // update xattrs?  only if the flush carries a newer xattr blob than the projected inode has
+  CInode::mempool_xattr_map *px = nullptr;  // xattr map to decode into; set below only when xattrs is true
+  bool xattrs = (dirty & CEPH_CAP_XATTR_EXCL) &&
+                m->xattrbl.length() &&
+                m->head.xattr_version > in->get_projected_inode()->xattr_version;
+
+  CInode::mempool_old_inode *oi = nullptr;  // old_inode entry covering snap, if one exists
+  CInode::old_inode_map_ptr _old_inodes;    // modified copy of the old_inode map; installed near the end
+  if (in->is_any_old_inodes()) {
+    auto last = in->pick_old_inode(snap);
+    if (last) {
+      _old_inodes = CInode::allocate_old_inode_map(*in->get_old_inodes());
+      oi = &_old_inodes->at(last);
+      if (snap > oi->first) {
+	(*_old_inodes)[snap - 1] = *oi;;  // split: a copy keyed at snap-1 keeps the pre-flush contents
+	oi->first = snap;                 // the entry being updated now starts at snap
+      }
+    }
+  }
+
+  CInode::mempool_inode *i;  // the inode record that receives the flushed fields
+  if (oi) {
+    dout(10) << " writing into old inode" << dendl;
+    auto pi = in->project_inode(mut);
+    pi.inode->version = in->pre_dirty();  // the inode is still projected and versioned, but fields go to oi
+    i = &oi->inode;
+    if (xattrs)
+      px = &oi->xattrs;
+  } else {
+    auto pi = in->project_inode(mut, xattrs);
+    pi.inode->version = in->pre_dirty();
+    i = pi.inode.get();
+    if (xattrs)
+      px = pi.xattrs.get();
+  }
+
+  _update_cap_fields(in, dirty, m, i);
+
+  // xattr: replace the chosen xattr map with the blob from the message
+  if (xattrs) {
+    dout(7) << " xattrs v" << i->xattr_version << " -> " << m->head.xattr_version
+	    << " len " << m->xattrbl.length() << dendl;
+    i->xattr_version = m->head.xattr_version;
+    auto p = m->xattrbl.cbegin();
+    decode(*px, p);
+  }
+
+  {
+    auto it = i->client_ranges.find(client);  // this client's range entry in the record being updated
+    if (it != i->client_ranges.end()) {
+      if (in->last == snap) {
+        dout(10) << "  removing client_range entirely" << dendl;
+        i->client_ranges.erase(it);
+      } else {
+        dout(10) << "  client_range now follows " << snap << dendl;
+        it->second.follows = snap;
+      }
+    }
+  }
+
+  if (_old_inodes)
+    in->reset_old_inodes(std::move(_old_inodes));  // install the modified copy
+
+  mut->auth_pin(in);
+  mdcache->predirty_journal_parents(mut, &le->metablob, in, 0, PREDIRTY_PRIMARY, 0, follows);
+  mdcache->journal_dirty_inode(mut.get(), &le->metablob, in, follows);
+
+  // "oldest flush tid" > 0 means client uses unique TID for each flush
+  if (ack && ack->get_oldest_flush_tid() > 0)
+    le->metablob.add_client_flush(metareqid_t(m->get_source(), ack->get_client_tid()),
+				  ack->get_oldest_flush_tid());
+
+  mds->mdlog->submit_entry(le, new C_Locker_FileUpdate_finish(this, in, mut, UPDATE_SNAPFLUSH,
+							      ack, client));
+}
+
+/**
+ * Copy the metadata fields a client flushed into an inode record.
+ *
+ * Which fields are considered depends on the dirty cap bits:
+ *  - always (when dirty != 0): ctime (only forward), and change_attr if the
+ *    connection advertises CEPH_FEATURE_FS_CHANGE_ATTR (only forward)
+ *  - FILE_EXCL|FILE_WR: mtime, size (regular files, only growing), inline
+ *    data (regular files, FILE_WR, newer version), atime and time_warp_seq
+ *    (FILE_EXCL only)
+ *  - AUTH_EXCL: uid, gid, mode, and btime if the connection advertises
+ *    CEPH_FEATURE_FS_BTIME
+ *
+ * @param in     inode being updated; used for is_file() and for logging
+ * @param dirty  cap bits the client is flushing; 0 makes this a no-op
+ * @param m      client caps message; must be non-null when dirty != 0
+ * @param pi     inode record to write into
+ */
+void Locker::_update_cap_fields(CInode *in, int dirty, const cref_t<MClientCaps> &m, CInode::mempool_inode *pi)
+{
+  if (dirty == 0)
+    return;
+
+  /* m must be valid if there are dirty caps */
+  ceph_assert(m);
+  uint64_t features = m->get_connection()->get_features();
+
+  if (m->get_ctime() > pi->ctime) {
+    dout(7) << "  ctime " << pi->ctime << " -> " << m->get_ctime()
+	    << " for " << *in << dendl;
+    pi->ctime = m->get_ctime();
+    // rctime never lags behind ctime
+    if (m->get_ctime() > pi->rstat.rctime)
+      pi->rstat.rctime = m->get_ctime();
+  }
+
+  if ((features & CEPH_FEATURE_FS_CHANGE_ATTR) &&
+      m->get_change_attr() > pi->change_attr) {
+    dout(7) << "  change_attr " << pi->change_attr << " -> " << m->get_change_attr()
+	    << " for " << *in << dendl;
+    pi->change_attr = m->get_change_attr();
+  }
+
+  // file
+  if (dirty & (CEPH_CAP_FILE_EXCL|CEPH_CAP_FILE_WR)) {
+    utime_t atime = m->get_atime();
+    utime_t mtime = m->get_mtime();
+    uint64_t size = m->get_size();
+    version_t inline_version = m->inline_version;
+
+    // with WR the mtime may only move forward; with EXCL any change is taken
+    if (((dirty & CEPH_CAP_FILE_WR) && mtime > pi->mtime) ||
+	((dirty & CEPH_CAP_FILE_EXCL) && mtime != pi->mtime)) {
+      dout(7) << "  mtime " << pi->mtime << " -> " << mtime
+	      << " for " << *in << dendl;
+      pi->mtime = mtime;
+      if (mtime > pi->rstat.rctime)
+	pi->rstat.rctime = mtime;
+    }
+    if (in->is_file() &&   // ONLY if regular file
+	size > pi->size) {
+      dout(7) << "  size " << pi->size << " -> " << size
+	      << " for " << *in << dendl;
+      pi->size = size;
+      pi->rstat.rbytes = size;
+    }
+    if (in->is_file() &&
+        (dirty & CEPH_CAP_FILE_WR) &&
+        inline_version > pi->inline_data.version) {
+      pi->inline_data.version = inline_version;
+      if (inline_version != CEPH_INLINE_NONE && m->inline_data.length() > 0)
+	pi->inline_data.set_data(m->inline_data);
+      else
+	pi->inline_data.free_data();
+    }
+    if ((dirty & CEPH_CAP_FILE_EXCL) && atime != pi->atime) {
+      dout(7) << "  atime " << pi->atime << " -> " << atime
+	      << " for " << *in << dendl;
+      pi->atime = atime;
+    }
+    if ((dirty & CEPH_CAP_FILE_EXCL) &&
+	ceph_seq_cmp(pi->time_warp_seq, m->get_time_warp_seq()) < 0) {
+      dout(7) << "  time_warp_seq " << pi->time_warp_seq << " -> " << m->get_time_warp_seq()
+	      << " for " << *in << dendl;
+      pi->time_warp_seq = m->get_time_warp_seq();
+    }
+  }
+  // auth
+  if (dirty & CEPH_CAP_AUTH_EXCL) {
+    if (m->head.uid != pi->uid) {
+      dout(7) << "  uid " << pi->uid
+	      << " -> " << m->head.uid
+	      << " for " << *in << dendl;
+      pi->uid = m->head.uid;
+    }
+    if (m->head.gid != pi->gid) {
+      dout(7) << "  gid " << pi->gid
+	      << " -> " << m->head.gid
+	      << " for " << *in << dendl;
+      pi->gid = m->head.gid;
+    }
+    if (m->head.mode != pi->mode) {
+      // mode is the one field that is meant to be shown in octal
+      dout(7) << "  mode " << oct << pi->mode
+	      << " -> " << m->head.mode << dec
+	      << " for " << *in << dendl;
+      pi->mode = m->head.mode;
+    }
+    if ((features & CEPH_FEATURE_FS_BTIME) && m->get_btime() != pi->btime) {
+      // btime is a timestamp: log it like ctime/mtime/atime, not in octal
+      dout(7) << "  btime " << pi->btime
+	      << " -> " << m->get_btime()
+	      << " for " << *in << dendl;
+      pi->btime = m->get_btime();
+    }
+  }
+}
+
+/*
+ * update inode based on cap flush|flushsnap|wanted.
+ *  adjust max_size, if needed.
+ * if we update, return true; otherwise, false (no update needed).
+ *
+ * in         inode to update; asserted to be auth
+ * cap        the client's capability on in, or NULL (every use below is
+ *            guarded by a null check)
+ * dirty      cap bits being flushed
+ * follows    snapid passed through to the journaling helpers
+ * m          the client caps message; dereferenced unconditionally
+ * ack        reply handed to the journal completion; may be empty
+ * need_flush optional out-flag: set to true when the max size is being
+ *            raised or _need_flush_mdlog(in, dirty) says so; never cleared
+ *
+ * false is returned both when nothing is dirty and max_size is unchanged,
+ * and when the session's check_access(MAY_WRITE) refuses the update.  any
+ * file locks carried in m->flockbl have been recorded by then either way.
+ */
+bool Locker::_do_cap_update(CInode *in, Capability *cap,
+			    int dirty, snapid_t follows,
+			    const cref_t<MClientCaps> &m, const ref_t<MClientCaps> &ack,
+			    bool *need_flush)
+{
+  dout(10) << "_do_cap_update dirty " << ccap_string(dirty)
+	   << " issued " << ccap_string(cap ? cap->issued() : 0)
+	   << " wanted " << ccap_string(cap ? cap->wanted() : 0)
+	   << " on " << *in << dendl;
+  ceph_assert(in->is_auth());
+  client_t client = m->get_source().num();
+  const auto& latest = in->get_projected_inode();
+
+  // increase or zero max_size?
+  uint64_t size = m->get_size();
+  bool change_max = false;
+  uint64_t old_max = latest->get_client_range(client);  // this client's current range per the projected inode
+  uint64_t new_max = old_max;
+  
+  if (in->is_file()) {
+    bool forced_change_max = false;  // true only when the client explicitly asked for a larger max_size
+    dout(20) << "inode is file" << dendl;
+    if (cap && ((cap->issued() | cap->wanted()) & CEPH_CAP_ANY_FILE_WR)) {
+      dout(20) << "client has write caps; m->get_max_size="
+               << m->get_max_size() << "; old_max=" << old_max << dendl;
+      if (m->get_max_size() > new_max) {
+	dout(10) << "client requests file_max " << m->get_max_size()
+		 << " > max " << old_max << dendl;
+	change_max = true;
+	forced_change_max = true;
+	new_max = calc_new_max_size(latest, m->get_max_size());
+      } else {
+	new_max = calc_new_max_size(latest, size);
+
+	if (new_max > old_max)
+	  change_max = true;
+	else
+	  new_max = old_max;  // never shrink on this path
+      }
+    } else {
+      if (old_max) {  // no write caps issued or wanted: drop the client's range
+	change_max = true;
+	new_max = 0;
+      }
+    }
+
+    if (in->last == CEPH_NOSNAP &&
+	change_max &&
+	!in->filelock.can_wrlock(client) &&
+	!in->filelock.can_force_wrlock(client)) {
+      dout(10) << " i want to change file_max, but lock won't allow it (yet)" << dendl;
+      if (in->filelock.is_stable()) {
+	bool need_issue = false;
+	if (cap)
+	  cap->inc_suppress();  // suppression is held on this cap across the lock state change
+	if (in->get_mds_caps_wanted().empty() &&
+	    (in->get_loner() >= 0 || (in->get_wanted_loner() >= 0 && in->try_set_loner()))) {
+	  if (in->filelock.get_state() != LOCK_EXCL)
+	    file_excl(&in->filelock, &need_issue);
+	} else
+	  simple_lock(&in->filelock, &need_issue);
+	if (need_issue)
+	  issue_caps(in);
+	if (cap)
+	  cap->dec_suppress();
+      }
+      if (!in->filelock.can_wrlock(client) &&
+	  !in->filelock.can_force_wrlock(client)) {  // still not writable: retry once the lock is stable
+	C_MDL_CheckMaxSize *cms = new C_MDL_CheckMaxSize(this, in,
+	                                                 forced_change_max ? new_max : 0,
+	                                                 0, utime_t());
+
+	in->filelock.add_waiter(SimpleLock::WAIT_STABLE, cms);
+	change_max = false;  // the max_size change is left to the waiter
+      }
+    }
+  }
+
+  if (m->flockbl.length()) {  // blob layout as decoded here: count + fcntl locks, then count + flock locks
+    int32_t num_locks;
+    auto bli = m->flockbl.cbegin();
+    decode(num_locks, bli);
+    for ( int i=0; i < num_locks; ++i) {
+      ceph_filelock decoded_lock;
+      decode(decoded_lock, bli);
+      in->get_fcntl_lock_state()->held_locks.
+	insert(pair<uint64_t, ceph_filelock>(decoded_lock.start, decoded_lock));
+      ++in->get_fcntl_lock_state()->client_held_lock_counts[(client_t)(decoded_lock.client)];
+    }
+    decode(num_locks, bli);
+    for ( int i=0; i < num_locks; ++i) {
+      ceph_filelock decoded_lock;
+      decode(decoded_lock, bli);
+      in->get_flock_lock_state()->held_locks.
+	insert(pair<uint64_t, ceph_filelock>(decoded_lock.start, decoded_lock));
+      ++in->get_flock_lock_state()->client_held_lock_counts[(client_t)(decoded_lock.client)];
+    }
+  }
+
+  if (!dirty && !change_max)
+    return false;  // nothing to journal
+
+  Session *session = mds->get_session(m);  // NOTE(review): used without a null check -- presumably always set here; confirm
+  if (session->check_access(in, MAY_WRITE,
+			    m->caller_uid, m->caller_gid, NULL, 0, 0) < 0) {
+    dout(10) << "check_access failed, dropping cap update on " << *in << dendl;
+    return false;
+  }
+
+  // do the update.
+  EUpdate *le = new EUpdate(mds->mdlog, "cap update");
+  mds->mdlog->start_entry(le);
+
+  bool xattr = (dirty & CEPH_CAP_XATTR_EXCL) &&
+               m->xattrbl.length() &&
+               m->head.xattr_version > in->get_projected_inode()->xattr_version;
+
+  MutationRef mut(new MutationImpl());
+  mut->ls = mds->mdlog->get_current_segment();
+
+  auto pi = in->project_inode(mut, xattr);
+  pi.inode->version = in->pre_dirty();
+
+  _update_cap_fields(in, dirty, m, pi.inode.get());
+
+  if (change_max) {
+    dout(7) << "  max_size " << old_max << " -> " << new_max
+	    << " for " << *in << dendl;
+    if (new_max) {
+      auto &cr = pi.inode->client_ranges[client];  // creates the entry if the client had none
+      cr.range.first = 0;
+      cr.range.last = new_max;
+      cr.follows = in->first - 1;
+      in->mark_clientwriteable();
+      if (cap)
+	cap->mark_clientwriteable();
+    } else {
+      pi.inode->client_ranges.erase(client);
+      if (pi.inode->client_ranges.empty())
+	in->clear_clientwriteable();
+      if (cap)
+	cap->clear_clientwriteable();
+    }
+  }
+    
+  if (change_max || (dirty & (CEPH_CAP_FILE_EXCL|CEPH_CAP_FILE_WR))) 
+    wrlock_force(&in->filelock, mut);  // wrlock for duration of journal
+
+  // auth
+  if (dirty & CEPH_CAP_AUTH_EXCL)
+    wrlock_force(&in->authlock, mut);
+
+  // xattrs update?
+  if (xattr) {
+    dout(7) << " xattrs v" << pi.inode->xattr_version << " -> " << m->head.xattr_version << dendl;
+    pi.inode->xattr_version = m->head.xattr_version;
+    auto p = m->xattrbl.cbegin();
+    decode_noshare(*pi.xattrs, p);
+    wrlock_force(&in->xattrlock, mut);
+  }
+  
+  mut->auth_pin(in);
+  mdcache->predirty_journal_parents(mut, &le->metablob, in, 0, PREDIRTY_PRIMARY, 0, follows);
+  mdcache->journal_dirty_inode(mut.get(), &le->metablob, in, follows);
+
+  // "oldest flush tid" > 0 means client uses unique TID for each flush
+  if (ack && ack->get_oldest_flush_tid() > 0)
+    le->metablob.add_client_flush(metareqid_t(m->get_source(), ack->get_client_tid()),
+				  ack->get_oldest_flush_tid());
+
+  unsigned update_flags = 0;  // flags handed to the journal completion
+  if (change_max)
+    update_flags |= UPDATE_SHAREMAX;
+  if (cap)
+    update_flags |= UPDATE_NEEDSISSUE;
+  mds->mdlog->submit_entry(le, new C_Locker_FileUpdate_finish(this, in, mut, update_flags,
+							      ack, client));
+  if (need_flush && !*need_flush &&
+      ((change_max && new_max) || // max INCREASE
+       _need_flush_mdlog(in, dirty)))
+    *need_flush = true;
+
+  return true;
+}
+
+/**
+ * Handle an MClientCapRelease message: apply the OSD epoch barrier it
+ * carries, release every cap listed in it, and tell the session how many
+ * caps were released.  The message is requeued via wait_for_replay() unless
+ * the MDS is in clientreplay, active or stopping state.
+ */
+void Locker::handle_client_cap_release(const cref_t<MClientCapRelease> &m)
+{
+  client_t client = m->get_source().num();
+  dout(10) << "handle_client_cap_release " << *m << dendl;
+
+  const bool ready =
+    mds->is_clientreplay() || mds->is_active() || mds->is_stopping();
+  if (!ready) {
+    mds->wait_for_replay(new C_MDS_RetryMessage(mds, m));
+    return;
+  }
+
+  if (mds->logger)
+    mds->logger->inc(l_mdss_handle_client_cap_release);
+
+  // Pause RADOS operations until we see the required epoch
+  if (m->osd_epoch_barrier && !mds->objecter->have_map(m->osd_epoch_barrier))
+    mds->objecter->set_epoch_barrier(m->osd_epoch_barrier);
+
+  // Record the barrier so that we will retransmit it to clients
+  if (mds->get_osd_epoch_barrier() < m->osd_epoch_barrier)
+    mds->set_osd_epoch_barrier(m->osd_epoch_barrier);
+
+  Session *session = mds->get_session(m);
+
+  for (const auto &rel : m->caps) {
+    const inodeno_t ino((uint64_t)rel.ino);
+    _do_cap_release(client, ino, rel.cap_id, rel.migrate_seq, rel.seq);
+  }
+
+  if (!session)
+    return;
+  session->notify_cap_release(m->caps.size());
+}
+
+/**
+ * Waiter context that repeats a Locker::_do_cap_release() call with the
+ * arguments captured at construction time.
+ */
+class C_Locker_RetryCapRelease : public LockerContext {
+  client_t client;
+  inodeno_t ino;
+  uint64_t cap_id;
+  ceph_seq_t migrate_seq;
+  ceph_seq_t issue_seq;
+
+public:
+  C_Locker_RetryCapRelease(Locker *locker_, client_t who, inodeno_t inode_no,
+			   uint64_t id, ceph_seq_t mseq, ceph_seq_t seq)
+    : LockerContext(locker_),
+      client(who),
+      ino(inode_no),
+      cap_id(id),
+      migrate_seq(mseq),
+      issue_seq(seq)
+  {}
+
+  void finish(int r) override {
+    locker->_do_cap_release(client, ino, cap_id, migrate_seq, issue_seq);
+  }
+};
+
+/**
+ * Release one client capability.
+ *
+ * The release is ignored if the inode or the cap is gone, if the cap id does
+ * not match, or if the migrate seq is older than the cap's.  It is deferred
+ * (retried on WAIT_UNFREEZE) when should_defer_client_cap_frozen() says so.
+ * If the issue seq does not match the cap's last issue, only the revoke
+ * history up to `seq` is cleaned and the cap is kept; otherwise the cap is
+ * removed.
+ *
+ * @param client  client releasing the cap
+ * @param ino     inode the cap is on
+ * @param cap_id  id of the cap the client believes it holds
+ * @param mseq    migrate seq from the client
+ * @param seq     issue seq from the client
+ */
+void Locker::_do_cap_release(client_t client, inodeno_t ino, uint64_t cap_id,
+			     ceph_seq_t mseq, ceph_seq_t seq)
+{
+  CInode *in = mdcache->get_inode(ino);
+  if (!in) {
+    dout(7) << "_do_cap_release missing ino " << ino << dendl;
+    return;
+  }
+  Capability *cap = in->get_client_cap(client);
+  if (!cap) {
+    // "client." (with the dot) matches every other client id in these logs
+    dout(7) << "_do_cap_release no cap for client." << client << " on "<< *in << dendl;
+    return;
+  }
+
+  dout(7) << "_do_cap_release for client." << client << " on "<< *in << dendl;
+  if (cap->get_cap_id() != cap_id) {
+    dout(7) << " capid " << cap_id << " != " << cap->get_cap_id() << ", ignore" << dendl;
+    return;
+  }
+  if (ceph_seq_cmp(mseq, cap->get_mseq()) < 0) {
+    dout(7) << " mseq " << mseq << " < " << cap->get_mseq() << ", ignore" << dendl;
+    return;
+  }
+  if (should_defer_client_cap_frozen(in)) {
+    dout(7) << " freezing|frozen, deferring" << dendl;
+    in->add_waiter(CInode::WAIT_UNFREEZE,
+                  new C_Locker_RetryCapRelease(this, client, ino, cap_id, mseq, seq));
+    return;
+  }
+  if (seq != cap->get_last_issue()) {
+    dout(7) << " issue_seq " << seq << " != " << cap->get_last_issue() << dendl;
+    // clean out any old revoke history
+    cap->clean_revoke_from(seq);
+    eval_cap_gather(in);
+    return;
+  }
+  remove_client_cap(in, cap);
+}
+
+/**
+ * Remove a client's capability from an inode.
+ *
+ * Pending snapflush state for the client is flushed with a null snapflush
+ * and every lock cache attached to the cap is detached and invalidated
+ * before the cap is removed.  For a cap that was "notable", the inode is
+ * then followed up: on the auth MDS a leftover client byte range triggers
+ * either STATE_NEEDSRECOVER (kill) or check_inode_max_size(); on a replica
+ * request_inode_file_caps() is called.  Finally the cap locks are
+ * re-evaluated.
+ *
+ * @param in    inode holding the cap
+ * @param cap   capability to remove; not used after in->remove_client_cap()
+ * @param kill  selects STATE_NEEDSRECOVER over check_inode_max_size()
+ */
+void Locker::remove_client_cap(CInode *in, Capability *cap, bool kill)
+{
+  client_t client = cap->get_client();
+
+  // clean out any pending snapflush state
+  if (!in->client_need_snapflush.empty())
+    _do_null_snapflush(in, client);
+
+  // detach and invalidate lock caches until none are left on the cap
+  while (!cap->lock_caches.empty()) {
+    MDLockCache* lc = cap->lock_caches.front();
+    lc->client_cap = nullptr;
+    invalidate_lock_cache(lc);
+  }
+
+  // sample before the cap is removed from the inode
+  const bool was_notable = cap->is_notable();
+  in->remove_client_cap(client);
+  if (!was_notable)
+    return;
+
+  if (!in->is_auth()) {
+    request_inode_file_caps(in);
+  } else if (in->get_projected_inode()->client_ranges.count(client) &&
+	     (in->get_inode()->nlink != 0 || in->is_any_caps())) {  // unless it's unlink + stray
+    // make sure we clear out the client byte range
+    if (kill)
+      in->state_set(CInode::STATE_NEEDSRECOVER);
+    else
+      check_inode_max_size(in);
+  }
+
+  try_eval(in, CEPH_CAP_LOCKS);
+}
+
+
+/**
+ * Tell whether the given revoking list contains a cap whose revoke has been
+ * outstanding for longer than `timeout`.
+ *
+ * Only the first entry of the list is examined (presumably the list is kept
+ * in revoke order -- verify against where entries are added).  An empty list
+ * yields false.
+ */
+bool Locker::any_late_revoking_caps(xlist<Capability*> const &revoking,
+                                    double timeout) const
+{
+  xlist<Capability*>::const_iterator first = revoking.begin();
+  if (first.end())
+    return false;  // No revoking caps at the moment
+
+  const utime_t now = ceph_clock_now();
+  const utime_t age = now - (*first)->get_last_revoke_stamp();
+  return !(age <= timeout);
+}
+
+/**
+ * Collect the clients that have a revoking cap older than `timeout`.
+ * The global revoking list is consulted first so that the per-client scan
+ * is skipped when nothing is late.
+ */
+std::set<client_t> Locker::get_late_revoking_clients(double timeout) const
+{
+  std::set<client_t> late;
+
+  // Fast path: no misbehaving clients, execute in O(1)
+  if (!any_late_revoking_caps(revoking_caps, timeout))
+    return late;
+
+  // Slow path: execute in O(N_clients)
+  for (const auto &entry : revoking_caps_by_client) {
+    if (!any_late_revoking_caps(entry.second, timeout))
+      continue;
+    late.insert(entry.first);
+  }
+  return late;
+}
+
+// Hard-coded instead of surfaced as a config setting because this is
+// really a hack that should go away at some point when we have better
+// inspection tools for getting at detailed cap state (#7316)
+#define MAX_WARN_CAPS 100
+
+/**
+ * Periodic cap housekeeping.
+ *
+ * First, inodes on need_snapflush_inodes whose last_dirstat_prop stamp is
+ * older than a third of mds_freeze_tree_timeout are taken off the list and
+ * nudged with snapflush_nudge().  Then the revoking_caps list is walked from
+ * the front and a cluster-log warning is emitted for caps whose revoke has
+ * been outstanding longer than the session timeout, with the warning
+ * interval doubling per cap and at most MAX_WARN_CAPS caps examined.
+ */
+void Locker::caps_tick()
+{
+  utime_t now = ceph_clock_now();
+
+  if (!need_snapflush_inodes.empty()) {
+    // snap inodes that need a flush are auth pinned; they affect
+    // subtree/dirfrag freeze.
+    utime_t cutoff = now;
+    cutoff -= g_conf()->mds_freeze_tree_timeout / 3;
+
+    // remember the current tail so the scan stops after one pass over the
+    // entries that were queued when we started
+    CInode *last = need_snapflush_inodes.back();
+    while (!need_snapflush_inodes.empty()) {
+      CInode *in = need_snapflush_inodes.front();
+      if (in->last_dirstat_prop >= cutoff)
+	break;
+      in->item_caps.remove_myself();
+      snapflush_nudge(in);
+      if (in == last)
+	break;
+    }
+  }
+
+  dout(20) << __func__ << " " << revoking_caps.size() << " revoking caps" << dendl;
+
+  now = ceph_clock_now();
+  int n = 0;  // number of late caps seen so far
+  for (xlist<Capability*>::iterator p = revoking_caps.begin(); !p.end(); ++p) {
+    Capability *cap = *p;
+
+    utime_t age = now - cap->get_last_revoke_stamp();
+    dout(20) << __func__ << " age = " << age << " client." << cap->get_client() << "." << cap->get_inode()->ino() << dendl;
+    if (age <= mds->mdsmap->get_session_timeout()) {
+      dout(20) << __func__ << " age below timeout " << mds->mdsmap->get_session_timeout() << dendl;
+      break;
+    } else {
+      ++n;
+      if (n > MAX_WARN_CAPS) {
+        // note the leading space: the two literals are streamed back to back
+        dout(1) << __func__ << " more than " << MAX_WARN_CAPS << " caps are late"
+          << " revoking, ignoring subsequent caps" << dendl;
+        break;
+      }
+    }
+    // exponential backoff of warning intervals
+    if (age > mds->mdsmap->get_session_timeout() * (1 << cap->get_num_revoke_warnings())) {
+      cap->inc_num_revoke_warnings();
+      CachedStackStringStream css;
+      *css << "client." << cap->get_client() << " isn't responding to mclientcaps(revoke), ino "
+	   << cap->get_inode()->ino() << " pending " << ccap_string(cap->pending())
+	   << " issued " << ccap_string(cap->issued()) << ", sent " << age << " seconds ago";
+      mds->clog->warn() << css->strv();
+      dout(20) << __func__ << " " << css->strv() << dendl;
+    } else {
+      dout(20) << __func__ << " silencing log message (backoff) for " << "client." << cap->get_client() << "." << cap->get_inode()->ino() << dendl;
+    }
+  }
+}
+
+
+/**
+ * Handle an MClientLease message from a client.
+ *
+ * The dentry named in the message is looked up under the given inode; the
+ * message is ignored if the inode, the dentry or the client's lease on it
+ * cannot be found.  REVOKE_ACK and RELEASE remove the lease when the
+ * sequence number matches; RENEW replies with a fresh duration and bumped
+ * seq if the dentry lock still allows leasing.  Any other action aborts.
+ */
+void Locker::handle_client_lease(const cref_t<MClientLease> &m)
+{
+  dout(10) << "handle_client_lease " << *m << dendl;
+
+  ceph_assert(m->get_source().is_client());
+  client_t client = m->get_source().num();
+
+  CInode *in = mdcache->get_inode(m->get_ino(), m->get_last());
+  if (!in) {
+    dout(7) << "handle_client_lease don't have ino " << m->get_ino() << "." << m->get_last() << dendl;
+    return;
+  }
+
+  // locate the dentry through its dirfrag
+  CDentry *dn = nullptr;
+  if (CDir *dir = in->get_dirfrag(in->pick_dirfrag(m->dname)))
+    dn = dir->lookup(m->dname);
+  if (!dn) {
+    dout(7) << "handle_client_lease don't have dn " << m->get_ino() << " " << m->dname << dendl;
+    return;
+  }
+  dout(10) << " on " << *dn << dendl;
+
+  // replica and lock
+  ClientLease *l = dn->get_client_lease(client);
+  if (!l) {
+    dout(7) << "handle_client_lease didn't have lease for client." << client << " of " << *dn << dendl;
+    return;
+  }
+
+  switch (m->get_action()) {
+  case CEPH_MDS_LEASE_REVOKE_ACK:
+  case CEPH_MDS_LEASE_RELEASE:
+    if (l->seq == m->get_seq()) {
+      dout(7) << "handle_client_lease client." << client
+	      << " on " << *dn << dendl;
+      dn->remove_client_lease(l, this);
+    } else {
+      dout(7) << "handle_client_lease release - seq " << l->seq << " != provided " << m->get_seq() << dendl;
+    }
+    break;
+
+  case CEPH_MDS_LEASE_RENEW:
+    {
+      dout(7) << "handle_client_lease client." << client << " renew on " << *dn
+	      << (!dn->lock.can_lease(client)?", revoking lease":"") << dendl;
+      if (!dn->lock.can_lease(client))
+	break;  // no reply is sent when leasing is not allowed
+
+      int pool = 1;   // fixme.. do something smart!
+      auto reply = make_message<MClientLease>(*m);
+      reply->h.duration_ms = (int)(1000 * mdcache->client_lease_durations[pool]);
+      reply->h.seq = ++l->seq;
+      reply->clear_payload();
+
+      utime_t expiry = ceph_clock_now();
+      expiry += mdcache->client_lease_durations[pool];
+      mdcache->touch_client_lease(l, pool, expiry);
+
+      mds->send_message_client_counted(reply, m->get_connection());
+    }
+    break;
+
+  default:
+    ceph_abort(); // implement me
+    break;
+  }
+}
+
+
+/**
+ * Encode a dentry lease for the requesting client into `bl`.
+ *
+ * A real lease is issued only for a head (CEPH_NOSNAP) request when the
+ * dentry lock allows leasing, the containing directory is not a stray dir,
+ * the directory's filelock cannot itself be leased to the client, and the
+ * client has no pending FILE_SHARED/FILE_EXCL cap on the directory.  In
+ * every other case a null lease (mask 0) is encoded.
+ *
+ * @param dn   dentry to lease
+ * @param in   inode the dentry links to (checked by assertion); null for a
+ *             null dentry
+ * @param mdr  request supplying the client, session and snapid
+ * @param now  current time; the lease duration is added to obtain the expiry
+ * @param bl   buffer the LeaseStat is appended to
+ */
+void Locker::issue_client_lease(CDentry *dn, CInode *in, MDRequestRef &mdr, utime_t now,
+                                bufferlist &bl)
+{
+  client_t client = mdr->get_client();
+  Session *session = mdr->session;
+
+  CInode *diri = dn->get_dir()->get_inode();
+  const bool can_issue =
+    mdr->snapid == CEPH_NOSNAP &&
+    dn->lock.can_lease(client) &&
+    !diri->is_stray() &&  // do not issue dn leases in stray dir!
+    !diri->filelock.can_lease(client) &&
+    !(diri->get_client_cap_pending(client) & (CEPH_CAP_FILE_SHARED | CEPH_CAP_FILE_EXCL));
+
+  if (!can_issue) {
+    // null lease
+    LeaseStat lstat;
+    lstat.mask = 0;
+    lstat.alternate_name = std::string(dn->alternate_name);
+    encode_lease(bl, session->info, lstat);
+    dout(20) << "issue_client_lease no/null lease on " << *dn << dendl;
+    return;
+  }
+
+  // sanity-check the linkage against `in` and pick the lease mask
+  int mask = 0;
+  CDentry::linkage_t *dnl = dn->get_linkage(client, mdr);
+  if (dnl->is_primary()) {
+    ceph_assert(dnl->get_inode() == in);
+    mask = CEPH_LEASE_PRIMARY_LINK;
+  } else if (dnl->is_remote()) {
+    ceph_assert(dnl->get_remote_ino() == in->ino());
+  } else {
+    ceph_assert(!in);
+  }
+
+  // issue a dentry lease
+  ClientLease *l = dn->add_client_lease(client, session);
+  session->touch_lease(l);
+
+  int pool = 1;   // fixme.. do something smart!
+  now += mdcache->client_lease_durations[pool];
+  mdcache->touch_client_lease(l, pool, now);
+
+  LeaseStat lstat;
+  lstat.mask = CEPH_LEASE_VALID | mask;
+  lstat.duration_ms = (uint32_t)(1000 * mdcache->client_lease_durations[pool]);
+  lstat.seq = ++l->seq;
+  lstat.alternate_name = std::string(dn->alternate_name);
+  encode_lease(bl, session->info, lstat);
+  dout(20) << "issue_client_lease seq " << lstat.seq << " dur " << lstat.duration_ms << "ms "
+	   << " on " << *dn << dendl;
+}
+
+
+/*
+ * Send a CEPH_MDS_LEASE_REVOKE message to every client recorded in the
+ * client_lease_map of the dentry that owns lock.
+ *
+ * lock's parent is treated as a CDentry; for each lease visited the lock
+ * type is asserted to be CEPH_LOCK_DN.
+ */
+void Locker::revoke_client_leases(SimpleLock *lock)
+{
+  CDentry *dn = static_cast<CDentry*>(lock->get_parent());
+  for (const auto &p : dn->client_lease_map) {
+    ClientLease *l = p.second;
+
+    ceph_assert(lock->get_type() == CEPH_LOCK_DN);
+
+    int mask = 1 | CEPH_LOCK_DN; // old and new bits
+
+    // i should also revoke the dir ICONTENT lease, if they have it!
+    CInode *diri = dn->get_dir()->get_inode();
+    auto lease = make_message<MClientLease>(CEPH_MDS_LEASE_REVOKE, l->seq, mask, diri->ino(), diri->first, CEPH_NOSNAP, dn->get_name());
+    mds->send_message_client_counted(lease, l->client);
+  }
+}
+
+/*
+ * Encode a LeaseStat into bl using the format the session understands.
+ *
+ * Sessions without CEPHFS_FEATURE_REPLY_ENCODING get the three raw fields
+ * (mask, duration_ms, seq).  Sessions with the feature get a versioned
+ * (v2, compat 1) envelope that additionally carries alternate_name.
+ */
+void Locker::encode_lease(bufferlist& bl, const session_info_t& info,
+                          const LeaseStat& ls)
+{
+  if (!info.has_feature(CEPHFS_FEATURE_REPLY_ENCODING)) {
+    // legacy, unversioned layout
+    encode(ls.mask, bl);
+    encode(ls.duration_ms, bl);
+    encode(ls.seq, bl);
+    return;
+  }
+
+  ENCODE_START(2, 1, bl);
+  encode(ls.mask, bl);
+  encode(ls.duration_ms, bl);
+  encode(ls.seq, bl);
+  encode(ls.alternate_name, bl);
+  ENCODE_FINISH(bl);
+}
+
+// locks ----------------------------------------------------------------
+
+/*
+ * Map a (lock type, cache object info) pair to the corresponding lock
+ * object in our cache.
+ *
+ * Returns a null pointer when the dentry or inode is not in cache, and
+ * aborts on a lock type that is not handled here.
+ */
+SimpleLock *Locker::get_lock(int lock_type, const MDSCacheObjectInfo &info)
+{
+  switch (lock_type) {
+  case CEPH_LOCK_DN:
+    {
+      // be careful; info.dirfrag may have incorrect frag; recalculate based on dname.
+      CDentry *dentry = nullptr;
+      if (CInode *diri = mdcache->get_inode(info.dirfrag.ino)) {
+        frag_t fg = diri->pick_dirfrag(info.dname);
+        if (CDir *dir = diri->get_dirfrag(fg))
+          dentry = dir->lookup(info.dname, info.snapid);
+      }
+      if (dentry)
+        return &dentry->lock;
+      dout(7) << "get_lock don't have dn " << info.dirfrag.ino << " " << info.dname << dendl;
+      return nullptr;
+    }
+
+  case CEPH_LOCK_IAUTH:
+  case CEPH_LOCK_ILINK:
+  case CEPH_LOCK_IDFT:
+  case CEPH_LOCK_IFILE:
+  case CEPH_LOCK_INEST:
+  case CEPH_LOCK_IXATTR:
+  case CEPH_LOCK_ISNAP:
+  case CEPH_LOCK_IFLOCK:
+  case CEPH_LOCK_IPOLICY:
+    {
+      CInode *inode = mdcache->get_inode(info.ino, info.snapid);
+      if (!inode) {
+        dout(7) << "get_lock don't have ino " << info.ino << dendl;
+        return nullptr;
+      }
+      // every type listed above has a matching entry here
+      switch (lock_type) {
+      case CEPH_LOCK_IAUTH:
+        return &inode->authlock;
+      case CEPH_LOCK_ILINK:
+        return &inode->linklock;
+      case CEPH_LOCK_IDFT:
+        return &inode->dirfragtreelock;
+      case CEPH_LOCK_IFILE:
+        return &inode->filelock;
+      case CEPH_LOCK_INEST:
+        return &inode->nestlock;
+      case CEPH_LOCK_IXATTR:
+        return &inode->xattrlock;
+      case CEPH_LOCK_ISNAP:
+        return &inode->snaplock;
+      case CEPH_LOCK_IFLOCK:
+        return &inode->flocklock;
+      case CEPH_LOCK_IPOLICY:
+        return &inode->policylock;
+      }
+    }
+    // an unmatched inner switch would fall into the default case below
+
+  default:
+    dout(7) << "get_lock don't know lock_type " << lock_type << dendl;
+    ceph_abort();
+    break;
+  }
+
+  return nullptr;
+}
+
+/*
+ * Entry point for an incoming MLock message: look up the lock it refers
+ * to and hand it to the handler for that lock type.  Messages for objects
+ * no longer in cache are dropped.
+ */
+void Locker::handle_lock(const cref_t<MLock> &m)
+{
+  // nobody should be talking to us during recovery.
+  ceph_assert(mds->is_rejoin() || mds->is_clientreplay() || mds->is_active() || mds->is_stopping());
+
+  SimpleLock *target = get_lock(m->get_lock_type(), m->get_object_info());
+  if (target == nullptr) {
+    dout(10) << "don't have object " << m->get_object_info() << ", must have trimmed, dropping" << dendl;
+    return;
+  }
+
+  switch (target->get_type()) {
+  case CEPH_LOCK_DN:
+  case CEPH_LOCK_IAUTH:
+  case CEPH_LOCK_ILINK:
+  case CEPH_LOCK_ISNAP:
+  case CEPH_LOCK_IXATTR:
+  case CEPH_LOCK_IFLOCK:
+  case CEPH_LOCK_IPOLICY:
+    handle_simple_lock(target, m);
+    break;
+
+  // IDFT and INEST have no dedicated scatter-lock handler; they share
+  // the file lock handler with IFILE.
+  case CEPH_LOCK_IDFT:
+  case CEPH_LOCK_INEST:
+  case CEPH_LOCK_IFILE:
+    handle_file_lock(static_cast<ScatterLock*>(target), m);
+    break;
+
+  default:
+    dout(7) << "handle_lock got otype " << m->get_lock_type() << dendl;
+    ceph_abort();
+    break;
+  }
+}
+ 
+
+
+
+
+// ==========================================================================
+// simple lock
+
+/** Handle a replica's request that we make lock readable.
+ *
+ * The request is acted on only when we are auth for the parent, the lock
+ * is not already SYNC and the parent is not frozen; otherwise it is
+ * dropped and the replica is expected to retry.
+ *
+ * This function may take a reference to m if it needs one (to retry the
+ * message later), but does not put references. */
+void Locker::handle_reqrdlock(SimpleLock *lock, const cref_t<MLock> &m)
+{
+  MDSCacheObject *parent = lock->get_parent();
+
+  bool actionable = parent->is_auth() &&
+                    lock->get_state() != LOCK_SYNC &&
+                    !parent->is_frozen();
+  if (!actionable) {
+    dout(7) << "handle_reqrdlock dropping rdlock request on " << *lock
+            << " on " << *parent << dendl;
+    // replica should retry
+    return;
+  }
+
+  dout(7) << "handle_reqrdlock got rdlock request on " << *lock
+          << " on " << *parent << dendl;
+  ceph_assert(parent->is_auth()); // replica auth pinned if they're doing this!
+
+  if (!lock->is_stable()) {
+    dout(7) << "handle_reqrdlock delaying request until lock is stable" << dendl;
+    lock->add_waiter(SimpleLock::WAIT_STABLE | MDSCacheObject::WAIT_UNFREEZE,
+                     new C_MDS_RetryMessage(mds, m));
+    return;
+  }
+
+  simple_sync(lock);
+}
+
+/*
+ * Process an MLock message for a simple lock, dispatching on the
+ * message's action.
+ *
+ * LOCK_AC_SYNC and LOCK_AC_LOCK are handled on the replica side;
+ * LOCK_AC_LOCKACK and LOCK_AC_REQRDLOCK on the auth side.  Any other
+ * action is ignored (the switch has no default case).
+ *
+ * While the MDS is in rejoin, messages for objects that are still
+ * rejoining are dropped.
+ */
+void Locker::handle_simple_lock(SimpleLock *lock, const cref_t<MLock> &m)
+{
+  // the rank that sent the message; used for gather bookkeeping below
+  int from = m->get_asker();
+  
+  dout(10) << "handle_simple_lock " << *m
+	   << " on " << *lock << " " << *lock->get_parent() << dendl;
+
+  if (mds->is_rejoin()) {
+    if (lock->get_parent()->is_rejoining()) {
+      dout(7) << "handle_simple_lock still rejoining " << *lock->get_parent()
+	      << ", dropping " << *m << dendl;
+      return;
+    }
+  }
+
+  switch (m->get_action()) {
+    // -- replica --
+  case LOCK_AC_SYNC:
+    // auth says the lock is now SYNC: adopt the state carried in the
+    // message and wake readers.
+    ceph_assert(lock->get_state() == LOCK_LOCK);
+    lock->decode_locked_state(m->get_data());
+    lock->set_state(LOCK_SYNC);
+    lock->finish_waiters(SimpleLock::WAIT_RD|SimpleLock::WAIT_STABLE);
+    break;
+    
+  case LOCK_AC_LOCK:
+    // auth wants the lock: start SYNC->LOCK, revoke any client leases,
+    // and let eval_gather() decide whether the transition can complete.
+    ceph_assert(lock->get_state() == LOCK_SYNC);
+    lock->set_state(LOCK_SYNC_LOCK);
+    if (lock->is_leased())
+      revoke_client_leases(lock);
+    eval_gather(lock, true);
+    if (lock->is_unstable_and_locked()) {
+      if (lock->is_cached())
+	invalidate_lock_caches(lock);
+      // flush the journal while the lock is still unstable and locked
+      mds->mdlog->flush();
+    }
+    break;
+
+
+    // -- auth --
+  case LOCK_AC_LOCKACK:
+    // a replica acknowledged our lock request: drop it from the gather
+    // set and re-evaluate once the last ack has arrived.
+    ceph_assert(lock->get_state() == LOCK_SYNC_LOCK ||
+	   lock->get_state() == LOCK_SYNC_EXCL);
+    ceph_assert(lock->is_gathering(from));
+    lock->remove_gather(from);
+    
+    if (lock->is_gathering()) {
+      dout(7) << "handle_simple_lock " << *lock << " on " << *lock->get_parent() << " from " << from
+	      << ", still gathering " << lock->get_gather_set() << dendl;
+    } else {
+      dout(7) << "handle_simple_lock " << *lock << " on " << *lock->get_parent() << " from " << from
+	      << ", last one" << dendl;
+      eval_gather(lock);
+    }
+    break;
+
+  case LOCK_AC_REQRDLOCK:
+    handle_reqrdlock(lock, m);
+    break;
+
+  }
+}
+
+/* unused, currently.
+
+class C_Locker_SimpleEval : public Context {
+  Locker *locker;
+  SimpleLock *lock;
+public:
+  C_Locker_SimpleEval(Locker *l, SimpleLock *lk) : locker(l), lock(lk) {}
+  void finish(int r) {
+    locker->try_simple_eval(lock);
+  }
+};
+
+void Locker::try_simple_eval(SimpleLock *lock)
+{
+  // unstable and ambiguous auth?
+  if (!lock->is_stable() &&
+      lock->get_parent()->is_ambiguous_auth()) {
+    dout(7) << "simple_eval not stable and ambiguous auth, waiting on " << *lock->get_parent() << dendl;
+    //if (!lock->get_parent()->is_waiter(MDSCacheObject::WAIT_SINGLEAUTH))
+    lock->get_parent()->add_waiter(MDSCacheObject::WAIT_SINGLEAUTH, new C_Locker_SimpleEval(this, lock));
+    return;
+  }
+
+  if (!lock->get_parent()->is_auth()) {
+    dout(7) << "try_simple_eval not auth for " << *lock->get_parent() << dendl;
+    return;
+  }
+
+  if (!lock->get_parent()->can_auth_pin()) {
+    dout(7) << "try_simple_eval can't auth_pin, waiting on " << *lock->get_parent() << dendl;
+    //if (!lock->get_parent()->is_waiter(MDSCacheObject::WAIT_SINGLEAUTH))
+    lock->get_parent()->add_waiter(MDSCacheObject::WAIT_UNFREEZE, new C_Locker_SimpleEval(this, lock));
+    return;
+  }
+
+  if (lock->is_stable())
+    simple_eval(lock);
+}
+*/
+
+
+/*
+ * Evaluate a stable simple lock on an auth object and, if appropriate,
+ * start a transition toward EXCL or SYNC.
+ *
+ * need_issue is forwarded to simple_sync()/simple_excl(); see those
+ * functions for how it is used.
+ *
+ * Preconditions (asserted): the parent is auth and the lock is stable.
+ */
+void Locker::simple_eval(SimpleLock *lock, bool *need_issue)
+{
+  dout(10) << "simple_eval " << *lock << " on " << *lock->get_parent() << dendl;
+
+  ceph_assert(lock->get_parent()->is_auth());
+  ceph_assert(lock->is_stable());
+
+  if (lock->get_parent()->is_freezing_or_frozen()) {
+    // dentry/snap lock in unreadable state can block path traverse
+    // So only DN/ISNAP/IPOLICY locks that are not SYNC, on a parent that
+    // is freezing but not yet frozen, continue past this point.
+    if ((lock->get_type() != CEPH_LOCK_DN &&
+	 lock->get_type() != CEPH_LOCK_ISNAP &&
+	 lock->get_type() != CEPH_LOCK_IPOLICY) ||
+	 lock->get_state() == LOCK_SYNC ||
+	 lock->get_parent()->is_frozen())
+      return;
+  }
+
+  // on a read-only filesystem the only useful target state is SYNC
+  if (mdcache->is_readonly()) {
+    if (lock->get_state() != LOCK_SYNC) {
+      dout(10) << "simple_eval read-only FS, syncing " << *lock << " on " << *lock->get_parent() << dendl;
+      simple_sync(lock, need_issue);
+    }
+    return;
+  }
+
+  // only locks with a cap shift have an inode parent and wanted caps;
+  // for all others 'in' stays null and 'wanted' stays 0.
+  CInode *in = 0;
+  int wanted = 0;
+  if (lock->get_cap_shift()) {
+    in = static_cast<CInode*>(lock->get_parent());
+    in->get_caps_wanted(&wanted, NULL, lock->get_cap_shift());
+  }
+  
+  // -> excl?
+  // (there is a target loner and exclusive caps are wanted)
+  if (lock->get_state() != LOCK_EXCL &&
+      in && in->get_target_loner() >= 0 &&
+      (wanted & CEPH_CAP_GEXCL)) {
+    dout(7) << "simple_eval stable, going to excl " << *lock 
+	    << " on " << *lock->get_parent() << dendl;
+    simple_excl(lock, need_issue);
+  }
+
+  // stable -> sync?
+  // (not wrlocked, and either nobody wants exclusive caps or is waiting
+  //  to write, or we are EXCL without a target loner)
+  else if (lock->get_state() != LOCK_SYNC &&
+	   !lock->is_wrlocked() &&
+	   ((!(wanted & CEPH_CAP_GEXCL) && !lock->is_waiter_for(SimpleLock::WAIT_WR)) ||
+	    (lock->get_state() == LOCK_EXCL && in && in->get_target_loner() < 0))) {
+    dout(7) << "simple_eval stable, syncing " << *lock 
+	    << " on " << *lock->get_parent() << dendl;
+    simple_sync(lock, need_issue);
+  }
+}
+
+
+// mid
+
+/*
+ * Move a stable lock on an auth object toward LOCK_SYNC.
+ *
+ * Returns true if the lock reached LOCK_SYNC within this call, false if
+ * the transition is still pending (something has to be gathered first,
+ * or a scatter writebehind was started).
+ *
+ * need_issue: when non-null, it is set to true instead of calling
+ * issue_caps() directly, leaving cap issue to the caller.
+ *
+ * Preconditions (asserted): the parent is auth and the lock is stable.
+ */
+bool Locker::simple_sync(SimpleLock *lock, bool *need_issue)
+{
+  dout(7) << "simple_sync on " << *lock << " on " << *lock->get_parent() << dendl;
+  ceph_assert(lock->get_parent()->is_auth());
+  ceph_assert(lock->is_stable());
+
+  // 'in' is only set for locks that have a cap shift
+  CInode *in = 0;
+  if (lock->get_cap_shift())
+    in = static_cast<CInode *>(lock->get_parent());
+
+  int old_state = lock->get_state();
+
+  // from TSYN we skip the intermediate state and gather logic entirely
+  if (old_state != LOCK_TSYN) {
+
+    // enter the intermediate *_SYNC state matching where we came from
+    switch (lock->get_state()) {
+    case LOCK_MIX: lock->set_state(LOCK_MIX_SYNC); break;
+    case LOCK_LOCK: lock->set_state(LOCK_LOCK_SYNC); break;
+    case LOCK_XSYN: lock->set_state(LOCK_XSYN_SYNC); break;
+    case LOCK_EXCL: lock->set_state(LOCK_EXCL_SYNC); break;
+    default: ceph_abort();
+    }
+
+    // count everything that must finish before the lock can become SYNC
+    int gather = 0;
+    if (lock->is_wrlocked()) {
+      gather++;
+      if (lock->is_cached())
+	invalidate_lock_caches(lock);
+
+      // After a client request is early replied the mdlog won't be flushed
+      // immediately, but before safe replied the request will hold the write
+      // locks. So if the client sends another request to a different MDS
+      // daemon, which then needs to request read lock from current MDS daemon,
+      // then that daemon maybe stuck at most for 5 seconds. Which will lead
+      // the client stuck at most 5 seconds.
+      //
+      // Let's try to flush the mdlog when the write lock is held, which will
+      // release the write locks after mdlog is successfully flushed.
+      mds->mdlog->flush();
+    }
+    
+    // only the MIX case gathers from replicas here
+    if (lock->get_parent()->is_replicated() && old_state == LOCK_MIX) {
+      send_lock_message(lock, LOCK_AC_SYNC);
+      lock->init_gather();
+      gather++;
+    }
+    
+    if (in && in->is_head()) {
+      if (in->issued_caps_need_gather(lock)) {
+	if (need_issue)
+	  *need_issue = true;
+	else
+	  issue_caps(in);
+	gather++;
+      }
+    }
+    
+    // a file lock on an inode that needs recovery must wait for it
+    bool need_recover = false;
+    if (lock->get_type() == CEPH_LOCK_IFILE) {
+      ceph_assert(in);
+      if (in->state_test(CInode::STATE_NEEDSRECOVER)) {
+        mds->mdcache->queue_file_recover(in);
+	need_recover = true;
+        gather++;
+      }
+    }
+    
+    // nothing to gather but the lock is dirty: write it behind first
+    if (!gather && lock->is_dirty()) {
+      lock->get_parent()->auth_pin(lock);
+      scatter_writebehind(static_cast<ScatterLock*>(lock));
+      return false;
+    }
+
+    // still gathering: pin the parent and leave the lock in the
+    // intermediate state
+    if (gather) {
+      lock->get_parent()->auth_pin(lock);
+      if (need_recover)
+	mds->mdcache->do_file_recover();
+      return false;
+    }
+  }
+
+  // transition complete: send replicas the SYNC state data
+  if (lock->get_parent()->is_replicated()) {    // FIXME
+    bufferlist data;
+    lock->encode_locked_state(data);
+    send_lock_message(lock, LOCK_AC_SYNC, data);
+  }
+  lock->set_state(LOCK_SYNC);
+  lock->finish_waiters(SimpleLock::WAIT_RD|SimpleLock::WAIT_STABLE);
+  if (in && in->is_head()) {
+    if (need_issue)
+      *need_issue = true;
+    else
+      issue_caps(in);
+  }
+  return true;
+}
+
+/*
+ * Move a stable lock on an auth object toward LOCK_EXCL.
+ *
+ * Valid starting states are LOCK, SYNC and XSYN; anything else aborts.
+ * If something must be gathered first (rdlocks, wrlocks, replicas,
+ * issued caps) the parent is auth-pinned and the lock is left in the
+ * intermediate *_EXCL state; otherwise it becomes EXCL immediately.
+ *
+ * need_issue: when non-null, it is set to true instead of calling
+ * issue_caps() directly.
+ */
+void Locker::simple_excl(SimpleLock *lock, bool *need_issue)
+{
+  dout(7) << "simple_excl on " << *lock << " on " << *lock->get_parent() << dendl;
+  ceph_assert(lock->get_parent()->is_auth());
+  ceph_assert(lock->is_stable());
+
+  // 'in' is only set for locks that have a cap shift
+  CInode *in = 0;
+  if (lock->get_cap_shift())
+    in = static_cast<CInode *>(lock->get_parent());
+
+  switch (lock->get_state()) {
+  case LOCK_LOCK: lock->set_state(LOCK_LOCK_EXCL); break;
+  case LOCK_SYNC: lock->set_state(LOCK_SYNC_EXCL); break;
+  case LOCK_XSYN: lock->set_state(LOCK_XSYN_EXCL); break;
+  default: ceph_abort();
+  }
+  
+  // count everything that must finish before the lock can become EXCL
+  int gather = 0;
+  if (lock->is_rdlocked())
+    gather++;
+  if (lock->is_wrlocked())
+    gather++;
+  if (gather && lock->is_cached())
+    invalidate_lock_caches(lock);
+
+  // replicas are only asked to lock when we came from SYNC (i.e. the
+  // intermediate state is neither LOCK_EXCL nor XSYN_EXCL)
+  if (lock->get_parent()->is_replicated() && 
+      lock->get_state() != LOCK_LOCK_EXCL &&
+      lock->get_state() != LOCK_XSYN_EXCL) {
+    send_lock_message(lock, LOCK_AC_LOCK);
+    lock->init_gather();
+    gather++;
+  }
+  
+  if (in && in->is_head()) {
+    if (in->issued_caps_need_gather(lock)) {
+      if (need_issue)
+	*need_issue = true;
+      else
+	issue_caps(in);
+      gather++;
+    }
+  }
+  
+  if (gather) {
+    // transition pending
+    lock->get_parent()->auth_pin(lock);
+  } else {
+    lock->set_state(LOCK_EXCL);
+    lock->finish_waiters(SimpleLock::WAIT_WR|SimpleLock::WAIT_STABLE);
+    // NOTE(review): unlike the gather path above, this does not check
+    // in->is_head() -- confirm that is intentional.
+    if (in) {
+      if (need_issue)
+	*need_issue = true;
+      else
+	issue_caps(in);
+    }
+  }
+}
+
+/*
+ * Move a stable lock on an auth object toward LOCK_LOCK.
+ *
+ * Valid starting states are SYNC, XSYN, EXCL, MIX and TSYN; anything
+ * else aborts.  Client leases are revoked, and the transition waits on
+ * rdlocks, issued caps, file recovery and replicas as needed.  If nothing
+ * needs gathering but the lock is dirty, a scatter writebehind is started
+ * instead of completing immediately.
+ *
+ * need_issue: when non-null, it is set to true instead of calling
+ * issue_caps() directly.
+ */
+void Locker::simple_lock(SimpleLock *lock, bool *need_issue)
+{
+  dout(7) << "simple_lock on " << *lock << " on " << *lock->get_parent() << dendl;
+  ceph_assert(lock->get_parent()->is_auth());
+  ceph_assert(lock->is_stable());
+  ceph_assert(lock->get_state() != LOCK_LOCK);
+  
+  // 'in' is only set for locks that have a cap shift
+  CInode *in = 0;
+  if (lock->get_cap_shift())
+    in = static_cast<CInode *>(lock->get_parent());
+
+  // remembered so we can look up what state replicas were in (below)
+  int old_state = lock->get_state();
+
+  switch (lock->get_state()) {
+  case LOCK_SYNC: lock->set_state(LOCK_SYNC_LOCK); break;
+  case LOCK_XSYN: lock->set_state(LOCK_XSYN_LOCK); break;
+  case LOCK_EXCL: lock->set_state(LOCK_EXCL_LOCK); break;
+  case LOCK_MIX: lock->set_state(LOCK_MIX_LOCK);
+    (static_cast<ScatterLock *>(lock))->clear_unscatter_wanted();
+    break;
+  case LOCK_TSYN: lock->set_state(LOCK_TSYN_LOCK); break;
+  default: ceph_abort();
+  }
+
+  // count everything that must finish before the lock can become LOCK
+  int gather = 0;
+  if (lock->is_leased()) {
+    gather++;
+    revoke_client_leases(lock);
+  }
+  if (lock->is_rdlocked()) {
+    if (lock->is_cached())
+      invalidate_lock_caches(lock);
+    gather++;
+  }
+  if (in && in->is_head()) {
+    if (in->issued_caps_need_gather(lock)) {
+      if (need_issue)
+	*need_issue = true;
+      else
+	issue_caps(in);
+      gather++;
+    }
+  }
+
+  // a file lock on an inode that needs recovery must wait for it
+  bool need_recover = false;
+  if (lock->get_type() == CEPH_LOCK_IFILE) {
+    ceph_assert(in);
+    if(in->state_test(CInode::STATE_NEEDSRECOVER)) {
+      mds->mdcache->queue_file_recover(in);
+      need_recover = true;
+      gather++;
+    }
+  }
+
+  // MIX->LOCK on a replicated object is a two-stage gather: if there is
+  // local work outstanding, do that first and do not message replicas yet.
+  if (lock->get_parent()->is_replicated() &&
+      lock->get_state() == LOCK_MIX_LOCK &&
+      gather) {
+    dout(10) << " doing local stage of mix->lock gather before gathering from replicas" << dendl;
+  } else {
+    // move to second stage of gather now, so we don't send the lock action later.
+    if (lock->get_state() == LOCK_MIX_LOCK)
+      lock->set_state(LOCK_MIX_LOCK2);
+
+    if (lock->get_parent()->is_replicated() &&
+	lock->get_sm()->states[old_state].replica_state != LOCK_LOCK) {  // replica may already be LOCK
+      gather++;
+      send_lock_message(lock, LOCK_AC_LOCK);
+      lock->init_gather();
+    }
+  }
+
+  // nothing to gather but the lock is dirty: write it behind first
+  if (!gather && lock->is_dirty()) {
+    lock->get_parent()->auth_pin(lock);
+    scatter_writebehind(static_cast<ScatterLock*>(lock));
+    return;
+  }
+
+  if (gather) {
+    // transition pending
+    lock->get_parent()->auth_pin(lock);
+    if (need_recover)
+      mds->mdcache->do_file_recover();
+  } else {
+    lock->set_state(LOCK_LOCK);
+    lock->finish_waiters(ScatterLock::WAIT_XLOCK|ScatterLock::WAIT_WR|ScatterLock::WAIT_STABLE);
+  }
+}
+
+
+/*
+ * Start moving a lock on an auth object toward an xlockable state.
+ *
+ * The lock must currently be LOCK or XLOCKDONE (anything else aborts);
+ * it is put into LOCK_XLOCK and, if there is nothing to gather (no
+ * rdlocks, wrlocks or issued caps), advanced straight to LOCK_PREXLOCK.
+ *
+ * Unlike the other simple_* transitions, the lock is not required to be
+ * stable on entry; the parent is auth-pinned only when it is.
+ */
+void Locker::simple_xlock(SimpleLock *lock)
+{
+  dout(7) << "simple_xlock on " << *lock << " on " << *lock->get_parent() << dendl;
+  ceph_assert(lock->get_parent()->is_auth());
+  //assert(lock->is_stable());
+  ceph_assert(lock->get_state() != LOCK_XLOCK);
+  
+  // 'in' is only set for locks that have a cap shift
+  CInode *in = 0;
+  if (lock->get_cap_shift())
+    in = static_cast<CInode *>(lock->get_parent());
+
+  if (lock->is_stable())
+    lock->get_parent()->auth_pin(lock);
+
+  switch (lock->get_state()) {
+  case LOCK_LOCK: 
+  case LOCK_XLOCKDONE: lock->set_state(LOCK_LOCK_XLOCK); break;
+  default: ceph_abort();
+  }
+
+  // count everything that must finish before the xlock can proceed
+  int gather = 0;
+  if (lock->is_rdlocked())
+    gather++;
+  if (lock->is_wrlocked())
+    gather++;
+  if (gather && lock->is_cached())
+    invalidate_lock_caches(lock);
+  
+  if (in && in->is_head()) {
+    if (in->issued_caps_need_gather(lock)) {
+      issue_caps(in);
+      gather++;
+    }
+  }
+
+  if (!gather) {
+    lock->set_state(LOCK_PREXLOCK);
+    //assert("shouldn't be called if we are already xlockable" == 0);
+  }
+}
+
+
+
+
+
+// ==========================================================================
+// scatter lock
+
+/*
+
+Some notes on scatterlocks.
+
+ - The scatter/gather is driven by the inode lock.  The scatter always
+   brings in the latest metadata from the fragments.
+
+ - When in a scattered/MIX state, fragments are only allowed to
+   update/be written to if the accounted stat matches the inode's
+   current version.
+
+ - That means, on gather, we _only_ assimilate diffs for frag metadata
+   that match the current version, because those are the only ones
+   written during this scatter/gather cycle.  (Others didn't permit
+   it.)  We increment the version and journal this to disk.
+
+ - When possible, we also simultaneously update our local frag
+   accounted stats to match.
+
+ - On scatter, the new inode info is broadcast to frags, both local
+   and remote.  If possible (auth and !frozen), the dirfrag auth
+   should update the accounted state (if it isn't already up to date).
+   Note that this may occur on both the local inode auth node and
+   inode replicas, so there are two potential paths. If it is NOT
+   possible, they need to mark_stale to prevent any possible writes.
+
+ - A scatter can be to MIX (potentially writeable) or to SYNC (read
+   only).  Both are opportunities to update the frag accounted stats,
+   even though only the MIX case is affected by a stale dirfrag.
+
+ - Because many scatter/gather cycles can potentially go by without a
+   frag being able to update its accounted stats (due to being frozen
+   by exports/refragments in progress), the frag may have (even very)
+   old stat versions.  That's fine.  When we do want to update it,
+   we can update accounted_* and the version first.
+
+*/
+
+// Log-commit callback used by Locker::scatter_writebehind(): once the
+// journal entry is safe it calls scatter_writebehind_finish() with the
+// lock and mutation captured at construction time.
+class C_Locker_ScatterWB : public LockerLogContext {
+public:
+  C_Locker_ScatterWB(Locker *l, ScatterLock *sl, MutationRef& m)
+    : LockerLogContext(l),
+      lock(sl),
+      mut(m)
+  {}
+
+  void finish(int r) override {
+    locker->scatter_writebehind_finish(lock, mut);
+  }
+
+private:
+  ScatterLock *lock;
+  MutationRef mut;
+};
+
+/*
+ * Journal the dirty scattered state of lock back into its inode.
+ *
+ * Builds a fresh mutation holding a forced wrlock on the lock, projects
+ * a new inode version, lets the inode fold in the scatter-gather update,
+ * and submits an EUpdate ("scatter_writebehind") to the journal.
+ * scatter_writebehind_finish() runs when that entry commits.
+ *
+ * lock's parent is treated as a CInode.
+ */
+void Locker::scatter_writebehind(ScatterLock *lock)
+{
+  CInode *in = static_cast<CInode*>(lock->get_parent());
+  dout(10) << "scatter_writebehind " << in->get_inode()->mtime << " on " << *lock << " on " << *in << dendl;
+
+  // journal
+  MutationRef mut(new MutationImpl());
+  mut->ls = mds->mdlog->get_current_segment();
+
+  // forcefully take a wrlock
+  lock->get_wrlock(true);
+  mut->emplace_lock(lock, MutationImpl::LockOp::WRLOCK);
+
+  in->pre_cow_old_inode();  // avoid cow mayhem
+
+  // project a new inode version to carry the update
+  auto pi = in->project_inode(mut);
+  pi.inode->version = in->pre_dirty();
+
+  in->finish_scatter_gather_update(lock->get_type(), mut);
+  lock->start_flush();
+
+  EUpdate *le = new EUpdate(mds->mdlog, "scatter_writebehind");
+  mds->mdlog->start_entry(le);
+
+  mdcache->predirty_journal_parents(mut, &le->metablob, in, 0, PREDIRTY_PRIMARY);
+  mdcache->journal_dirty_inode(mut.get(), &le->metablob, in);
+  
+  in->finish_scatter_gather_update_accounted(lock->get_type(), &le->metablob);
+
+  // the completion context calls scatter_writebehind_finish(lock, mut)
+  mds->mdlog->submit_entry(le, new C_Locker_ScatterWB(this, lock, mut));
+  mds->mdlog->flush();
+}
+
+/*
+ * Completion half of scatter_writebehind(), called from
+ * C_Locker_ScatterWB::finish(): apply the mutation, finish the flush on
+ * the lock, tell replicas if needed, then release the mutation's locks
+ * and wake WAIT_STABLE waiters if the lock is now stable.
+ */
+void Locker::scatter_writebehind_finish(ScatterLock *lock, MutationRef& mut)
+{
+  CInode *in = static_cast<CInode*>(lock->get_parent());
+  dout(10) << "scatter_writebehind_finish on " << *lock << " on " << *in << dendl;
+
+  mut->apply();
+
+  lock->finish_flush();
+
+  // if replicas may have flushed in a mix->lock state, send another
+  // message so they can finish_flush().
+  if (in->is_replicated()) {
+    // only the MIX_* intermediate states listed here send the message;
+    // any other state does nothing (no default case).
+    switch (lock->get_state()) {
+    case LOCK_MIX_LOCK:
+    case LOCK_MIX_LOCK2:
+    case LOCK_MIX_EXCL:
+    case LOCK_MIX_TSYN:
+      send_lock_message(lock, LOCK_AC_LOCKFLUSHED);
+    }
+  }
+
+  // releases the wrlock that scatter_writebehind() recorded in mut
+  drop_locks(mut.get());
+  mut->cleanup();
+
+  if (lock->is_stable())
+    lock->finish_waiters(ScatterLock::WAIT_STABLE);
+
+  //scatter_eval_gather(lock);
+}
+
+/*
+ * Evaluate a stable scatter lock on an auth object and start a state
+ * change if one is appropriate.
+ *
+ * need_issue is forwarded to the transition functions called here.
+ *
+ * Preconditions (asserted): the parent is auth and the lock is stable.
+ * Nothing is done while the parent is freezing or frozen.
+ */
+void Locker::scatter_eval(ScatterLock *lock, bool *need_issue)
+{
+  dout(10) << "scatter_eval " << *lock << " on " << *lock->get_parent() << dendl;
+
+  ceph_assert(lock->get_parent()->is_auth());
+  ceph_assert(lock->is_stable());
+
+  if (lock->get_parent()->is_freezing_or_frozen()) {
+    dout(20) << "  freezing|frozen" << dendl;
+    return;
+  }
+
+  // on a read-only filesystem the only useful target state is SYNC
+  if (mdcache->is_readonly()) {
+    if (lock->get_state() != LOCK_SYNC) {
+      dout(10) << "scatter_eval read-only FS, syncing " << *lock << " on " << *lock->get_parent() << dendl;
+      simple_sync(lock, need_issue);
+    }
+    return;
+  }
+  
+  // an explicit scatter request takes priority over the defaults below
+  if (!lock->is_rdlocked() &&
+      lock->get_state() != LOCK_MIX &&
+      lock->get_scatter_wanted()) {
+    dout(10) << "scatter_eval scatter_wanted, bump to mix " << *lock
+	     << " on " << *lock->get_parent() << dendl;
+    scatter_mix(lock, need_issue);
+    return;
+  }
+
+  if (lock->get_type() == CEPH_LOCK_INEST) {
+    // in general, we want to keep INEST writable at all times.
+    // (MIX when replicated, LOCK otherwise)
+    if (!lock->is_rdlocked()) {
+      if (lock->get_parent()->is_replicated()) {
+	if (lock->get_state() != LOCK_MIX)
+	  scatter_mix(lock, need_issue);
+      } else {
+	if (lock->get_state() != LOCK_LOCK)
+	  simple_lock(lock, need_issue);
+      }
+    }
+    return;
+  }
+
+  CInode *in = static_cast<CInode*>(lock->get_parent());
+  if (!in->has_subtree_or_exporting_dirfrag() || in->is_base()) {
+    // i _should_ be sync.
+    if (!lock->is_wrlocked() &&
+	lock->get_state() != LOCK_SYNC) {
+      dout(10) << "scatter_eval no wrlocks|xlocks, not subtree root inode, syncing" << dendl;
+      simple_sync(lock, need_issue);
+    }
+  }
+}
+
+
+/*
+ * Flag a scatterlock as dirty (its dir fnode carries unflushed data) and
+ * make sure it is queued on updated_scatterlocks.  The update stamp is
+ * only set when the lock is newly queued.
+ */
+void Locker::mark_updated_scatterlock(ScatterLock *lock)
+{
+  lock->mark_dirty();
+
+  if (!lock->get_updated_item()->is_on_list()) {
+    // first time we see this lock dirty: queue it and remember when
+    updated_scatterlocks.push_back(lock->get_updated_item());
+    utime_t stamp = ceph_clock_now();
+    lock->set_update_stamp(stamp);
+    dout(10) << "mark_updated_scatterlock " << *lock
+             << " - added at " << stamp << dendl;
+    return;
+  }
+
+  dout(10) << "mark_updated_scatterlock " << *lock
+           << " - already on list since " << lock->get_update_stamp() << dendl;
+}
+
+/*
+ * this is called by scatter_tick and LogSegment::try_to_trim() when
+ * trying to flush dirty scattered data (i.e. updated fnode) back to
+ * the inode.
+ *
+ * we need to lock|scatter in order to push fnode changes into the
+ * inode.dirstat.
+ *
+ * lock: the scatterlock to nudge; its parent is treated as a CInode.
+ * c:    optional context, queued as a waiter whenever we have to wait
+ *       (unfreeze, single auth, or lock stable).  When c is null and the
+ *       lock is dirty, the lock is requeued on updated_scatterlocks
+ *       instead.
+ * forcelockchange: only referenced by the commented-out fast path below.
+ */
+void Locker::scatter_nudge(ScatterLock *lock, MDSContext *c, bool forcelockchange)
+{
+  CInode *p = static_cast<CInode *>(lock->get_parent());
+
+  if (p->is_frozen() || p->is_freezing()) {
+    dout(10) << "scatter_nudge waiting for unfreeze on " << *p << dendl;
+    if (c) 
+      p->add_waiter(MDSCacheObject::WAIT_UNFREEZE, c);
+    else if (lock->is_dirty())
+      // just requeue.  not ideal.. starvation prone..
+      updated_scatterlocks.push_back(lock->get_updated_item());
+    return;
+  }
+
+  if (p->is_ambiguous_auth()) {
+    dout(10) << "scatter_nudge waiting for single auth on " << *p << dendl;
+    if (c) 
+      p->add_waiter(MDSCacheObject::WAIT_SINGLEAUTH, c);
+    else if (lock->is_dirty())
+      // just requeue.  not ideal.. starvation prone..
+      updated_scatterlocks.push_back(lock->get_updated_item());
+    return;
+  }
+
+  if (p->is_auth()) {
+    // number of state changes we have kicked off in the loop below
+    int count = 0;
+    while (true) {
+      if (lock->is_stable()) {
+	// can we do it now?
+	//  (only if we're not replicated.. if we are, we really do need
+	//   to nudge the lock state!)
+	/*
+	  actually, even if we're not replicated, we can't stay in MIX, because another mds
+	  could discover and replicate us at any time.  if that happens while we're flushing,
+	  they end up in MIX but their inode has the old scatterstat version.
+
+	if (!forcelockchange && !lock->get_parent()->is_replicated() && lock->can_wrlock(-1)) {
+	  dout(10) << "scatter_nudge auth, propagating " << *lock << " on " << *p << dendl;
+	  scatter_writebehind(lock);
+	  if (c)
+	    lock->add_waiter(SimpleLock::WAIT_STABLE, c);
+	  return;
+	}
+	*/
+
+	if (mdcache->is_readonly()) {
+	  if (lock->get_state() != LOCK_SYNC) {
+	    dout(10) << "scatter_nudge auth, read-only FS, syncing " << *lock << " on " << *p << dendl;
+	    simple_sync(static_cast<ScatterLock*>(lock));
+	  }
+	  break;
+	}
+
+	// adjust lock state
+	// (both groups of lock types pick the same target: MIX if
+	//  replicated and not already MIX, else LOCK if not already LOCK,
+	//  else SYNC)
+	dout(10) << "scatter_nudge auth, scatter/unscattering " << *lock << " on " << *p << dendl;
+	switch (lock->get_type()) {
+	case CEPH_LOCK_IFILE:
+	  if (p->is_replicated() && lock->get_state() != LOCK_MIX)
+	    scatter_mix(static_cast<ScatterLock*>(lock));
+	  else if (lock->get_state() != LOCK_LOCK)
+	    simple_lock(static_cast<ScatterLock*>(lock));
+	  else
+	    simple_sync(static_cast<ScatterLock*>(lock));
+	  break;
+	  
+	case CEPH_LOCK_IDFT:
+	case CEPH_LOCK_INEST:
+	  if (p->is_replicated() && lock->get_state() != LOCK_MIX)
+	    scatter_mix(lock);
+	  else if (lock->get_state() != LOCK_LOCK)
+	    simple_lock(lock);
+	  else
+	    simple_sync(lock);
+	  break;
+	default:
+	  ceph_abort();
+	}
+	++count;
+	if (lock->is_stable() && count == 2) {
+	  dout(10) << "scatter_nudge oh, stable after two cycles." << dendl;
+	  // this should only really happen when called via
+	  // handle_file_lock due to AC_NUDGE, because the rest of the
+	  // time we are replicated or have dirty data and won't get
+	  // called.  bailing here avoids an infinite loop.
+	  ceph_assert(!c); 
+	  break;
+	}
+      } else {
+	// a transition is in flight; come back when it settles
+	dout(10) << "scatter_nudge auth, waiting for stable " << *lock << " on " << *p << dendl;
+	if (c)
+	  lock->add_waiter(SimpleLock::WAIT_STABLE, c);
+	return;
+      }
+    }
+  } else {
+    dout(10) << "scatter_nudge replica, requesting scatter/unscatter of " 
+	     << *lock << " on " << *p << dendl;
+    // request unscatter?
+    // (skipped if the cluster is degraded and the auth rank is not in
+    //  clientreplay/active/stopping)
+    mds_rank_t auth = lock->get_parent()->authority().first;
+    if (!mds->is_cluster_degraded() || mds->mdsmap->is_clientreplay_or_active_or_stopping(auth)) {
+      mds->send_message_mds(make_message<MLock>(lock, LOCK_AC_NUDGE, mds->get_nodeid()), auth);
+    }
+
+    // wait...
+    if (c)
+      lock->add_waiter(SimpleLock::WAIT_STABLE, c);
+
+    // also, requeue, in case we had wrong auth or something
+    if (lock->is_dirty())
+      updated_scatterlocks.push_back(lock->get_updated_item());
+  }
+}
+
+/*
+ * Periodic pass over updated_scatterlocks: drop entries that are no
+ * longer dirty and nudge those that have been dirty for at least
+ * mds_scatter_nudge_interval.  Processing stops at the first entry that
+ * is still too young.  The journal is flushed at the end.
+ */
+void Locker::scatter_tick()
+{
+  dout(10) << "scatter_tick" << dendl;
+
+  // updated
+  utime_t now = ceph_clock_now();
+
+  // scatter_nudge() may requeue; visiting at most the initial number of
+  // entries avoids looping
+  for (int budget = updated_scatterlocks.size();
+       budget > 0 && !updated_scatterlocks.empty();
+       --budget) {
+    ScatterLock *sl = updated_scatterlocks.front();
+
+    if (!sl->is_dirty()) {
+      updated_scatterlocks.pop_front();
+      dout(10) << " removing from updated_scatterlocks "
+               << *sl << " " << *sl->get_parent() << dendl;
+      continue;
+    }
+
+    if (now - sl->get_update_stamp() < g_conf()->mds_scatter_nudge_interval)
+      break;
+
+    updated_scatterlocks.pop_front();
+    scatter_nudge(sl, nullptr);
+  }
+
+  mds->mdlog->flush();
+}
+
+
+/*
+ * Move a stable scatter lock on an auth object toward LOCK_TSYN.
+ *
+ * NOTE: this function currently aborts unconditionally via
+ * ceph_abort_msg() right after the precondition asserts, so everything
+ * below that call describes the intended transition but is not reached.
+ *
+ * need_issue: when non-null, it is set to true instead of calling
+ * issue_caps() directly.
+ */
+void Locker::scatter_tempsync(ScatterLock *lock, bool *need_issue)
+{
+  dout(10) << "scatter_tempsync " << *lock
+	   << " on " << *lock->get_parent() << dendl;
+  ceph_assert(lock->get_parent()->is_auth());
+  ceph_assert(lock->is_stable());
+
+  ceph_abort_msg("not fully implemented, at least not for filelock");
+
+  CInode *in = static_cast<CInode *>(lock->get_parent());
+
+  switch (lock->get_state()) {
+  case LOCK_SYNC: ceph_abort();   // this shouldn't happen
+  case LOCK_LOCK: lock->set_state(LOCK_LOCK_TSYN); break;
+  case LOCK_MIX: lock->set_state(LOCK_MIX_TSYN); break;
+  default: ceph_abort();
+  }
+
+  // count everything that must finish before the lock can become TSYN
+  int gather = 0;
+  if (lock->is_wrlocked()) {
+    if (lock->is_cached())
+      invalidate_lock_caches(lock);
+    gather++;
+  }
+
+  if (lock->get_cap_shift() &&
+      in->is_head() &&
+      in->issued_caps_need_gather(lock)) {
+    if (need_issue)
+      *need_issue = true;
+    else
+      issue_caps(in);
+    gather++;
+  }
+
+  // only the MIX case gathers from replicas
+  if (lock->get_state() == LOCK_MIX_TSYN &&
+      in->is_replicated()) {
+    lock->init_gather();
+    send_lock_message(lock, LOCK_AC_LOCK);
+    gather++;
+  }
+
+  if (gather) {
+    // transition pending
+    in->auth_pin(lock);
+  } else {
+    // do tempsync
+    lock->set_state(LOCK_TSYN);
+    lock->finish_waiters(ScatterLock::WAIT_RD|ScatterLock::WAIT_STABLE);
+    if (lock->get_cap_shift()) {
+      if (need_issue)
+	*need_issue = true;
+      else
+	issue_caps(in);
+    }
+  }
+}
+
+
+
+// ==========================================================================
+// local lock
+
+/*
+ * Take a wrlock on a local lock that is known to be available and record
+ * it in mut.  Asserts that the parent is auth and that the lock can be
+ * wrlocked; there is no waiting path.
+ */
+void Locker::local_wrlock_grab(LocalLockC *lock, MutationRef& mut)
+{
+  dout(7) << "local_wrlock_grab  on " << *lock
+          << " on " << *lock->get_parent() << dendl;
+
+  ceph_assert(lock->get_parent()->is_auth());
+  ceph_assert(lock->can_wrlock());
+
+  lock->get_wrlock(mut->get_client());
+
+  auto held = mut->emplace_lock(lock, MutationImpl::LockOp::WRLOCK);
+  ceph_assert(held->is_wrlock());
+}
+
+/*
+ * Try to take a wrlock on a local lock for request mut.
+ *
+ * Returns true when the wrlock was taken and recorded in mut.  Returns
+ * false when the lock cannot be wrlocked right now; in that case the
+ * request is queued for retry as a WAIT_WR|WAIT_STABLE waiter.
+ */
+bool Locker::local_wrlock_start(LocalLockC *lock, MDRequestRef& mut)
+{
+  dout(7) << "local_wrlock_start  on " << *lock
+          << " on " << *lock->get_parent() << dendl;
+
+  ceph_assert(lock->get_parent()->is_auth());
+
+  if (!lock->can_wrlock()) {
+    lock->add_waiter(SimpleLock::WAIT_WR|SimpleLock::WAIT_STABLE, new C_MDS_RetryRequest(mdcache, mut));
+    return false;
+  }
+
+  lock->get_wrlock(mut->get_client());
+  auto held = mut->emplace_lock(lock, MutationImpl::LockOp::WRLOCK);
+  ceph_assert(held->is_wrlock());
+  return true;
+}
+
+/**
+ * Release a wrlock held on a local lock and drop it from the mutation.
+ *
+ * Waiters (STABLE, WR and RD) are woken only once the last wrlock is gone.
+ *
+ * @param it   entry in mut->locks describing the wrlock (must be a wrlock)
+ * @param mut  mutation that currently records the lock
+ */
+void Locker::local_wrlock_finish(const MutationImpl::lock_iterator& it, MutationImpl *mut)
+{
+  ceph_assert(it->is_wrlock());
+
+  // fetch the lock pointer before the iterator is erased below
+  auto *lock = static_cast<LocalLockC*>(it->lock);
+  auto *parent = lock->get_parent();
+  dout(7) << "local_wrlock_finish  on " << *lock << " on " << *parent << dendl;
+
+  lock->put_wrlock();
+  mut->locks.erase(it);
+
+  if (lock->get_num_wrlocks() != 0)
+    return;  // other wrlocks are still outstanding
+
+  const auto wake_mask =
+    SimpleLock::WAIT_STABLE | SimpleLock::WAIT_WR | SimpleLock::WAIT_RD;
+  lock->finish_waiters(wake_mask);
+}
+
+/**
+ * Try to take an xlock on a local lock on behalf of a request.
+ *
+ * @param lock  local lock to xlock; its parent must be auth (asserted)
+ * @param mut   request that records the xlock, or is retried later
+ * @return true if the xlock was taken; false if the request was queued as
+ *         a WAIT_WR|WAIT_STABLE waiter on the lock instead
+ */
+bool Locker::local_xlock_start(LocalLockC *lock, MDRequestRef& mut)
+{
+  auto *parent = lock->get_parent();
+  dout(7) << "local_xlock_start  on " << *lock << " on " << *parent << dendl;
+
+  ceph_assert(parent->is_auth());
+
+  if (lock->can_xlock_local()) {
+    lock->get_xlock(mut, mut->get_client());
+    mut->emplace_lock(lock, MutationImpl::LockOp::XLOCK);
+    return true;
+  }
+
+  // cannot xlock right now: park a retry context on the lock
+  lock->add_waiter(SimpleLock::WAIT_WR|SimpleLock::WAIT_STABLE, new C_MDS_RetryRequest(mdcache, mut));
+  return false;
+}
+
+/**
+ * Release an xlock held on a local lock and drop it from the mutation.
+ *
+ * STABLE, WR and RD waiters on the lock are always woken afterwards.
+ *
+ * @param it   entry in mut->locks describing the xlock (must be an xlock)
+ * @param mut  mutation that currently records the lock
+ */
+void Locker::local_xlock_finish(const MutationImpl::lock_iterator& it, MutationImpl *mut)
+{
+  ceph_assert(it->is_xlock());
+
+  // fetch the lock pointer before the iterator is erased below
+  auto *lock = static_cast<LocalLockC*>(it->lock);
+  auto *parent = lock->get_parent();
+  dout(7) << "local_xlock_finish  on " << *lock << " on " << *parent << dendl;
+
+  lock->put_xlock();
+  mut->locks.erase(it);
+
+  const auto wake_mask =
+    SimpleLock::WAIT_STABLE | SimpleLock::WAIT_WR | SimpleLock::WAIT_RD;
+  lock->finish_waiters(wake_mask);
+}
+
+
+
+// ==========================================================================
+// file lock
+
+
+/**
+ * Re-evaluate a stable file lock on the auth MDS and, if the caps that
+ * clients want call for it, start a transition to another state.
+ *
+ * Nothing is done while the parent is freezing or frozen.  On a read-only
+ * cache the only action is to move a non-SYNC lock to SYNC.  Otherwise at
+ * most one of the following branches runs, tested in this order:
+ *   - lock is EXCL: drop to MIX or SYNC if EXCL is no longer justified
+ *   - -> EXCL  (file_excl)
+ *   - -> MIX   (scatter_mix)
+ *   - -> SYNC  (simple_sync)
+ *   - inode is flagged STATE_NEEDSRECOVER: queue and start file recovery
+ *
+ * @param lock        file lock to evaluate; its parent is treated as a CInode
+ * @param need_issue  forwarded unchanged to whichever transition helper runs
+ */
+void Locker::file_eval(ScatterLock *lock, bool *need_issue)
+{
+  CInode *in = static_cast<CInode*>(lock->get_parent());
+  // wanted caps, restricted to the file cap group (CEPH_CAP_SFILE), split
+  // into what the loner wants and what everybody else wants
+  int loner_wanted, other_wanted;
+  int wanted = in->get_caps_wanted(&loner_wanted, &other_wanted, CEPH_CAP_SFILE);
+  dout(7) << "file_eval wanted=" << gcap_string(wanted)
+	  << " loner_wanted=" << gcap_string(loner_wanted)
+	  << " other_wanted=" << gcap_string(other_wanted)
+	  << "  filelock=" << *lock << " on " << *lock->get_parent()
+	  << dendl;
+
+  ceph_assert(lock->get_parent()->is_auth());
+  ceph_assert(lock->is_stable());
+
+  // no state changes while the parent is freezing/frozen
+  if (lock->get_parent()->is_freezing_or_frozen())
+    return;
+
+  // read-only cache: SYNC is the only state we move toward
+  if (mdcache->is_readonly()) {
+    if (lock->get_state() != LOCK_SYNC) {
+      dout(10) << "file_eval read-only FS, syncing " << *lock << " on " << *lock->get_parent() << dendl;
+      simple_sync(lock, need_issue);
+    }
+    return;
+  }
+
+  // excl -> *?
+  if (lock->get_state() == LOCK_EXCL) {
+    dout(20) << " is excl" << dendl;
+    int loner_issued, other_issued, xlocker_issued;
+    in->get_caps_issued(&loner_issued, &other_issued, &xlocker_issued, CEPH_CAP_SFILE);
+    dout(7) << "file_eval loner_issued=" << gcap_string(loner_issued)
+            << " other_issued=" << gcap_string(other_issued)
+	    << " xlocker_issued=" << gcap_string(xlocker_issued)
+	    << dendl;
+    // leave EXCL when the loner neither wants nor holds EXCL/WR/BUFFER, when
+    // another client wants EXCL/WR/RD, or when a directory has more than
+    // one non-stale cap
+    if (!((loner_wanted|loner_issued) & (CEPH_CAP_GEXCL|CEPH_CAP_GWR|CEPH_CAP_GBUFFER)) ||
+	(other_wanted & (CEPH_CAP_GEXCL|CEPH_CAP_GWR|CEPH_CAP_GRD)) ||
+	(in->is_dir() && in->multiple_nonstale_caps())) {  // FIXME.. :/
+      dout(20) << " should lose it" << dendl;
+      // we should lose it.
+      //  loner  other   want
+      //  R      R       SYNC
+      //  R      R|W     MIX
+      //  R      W       MIX
+      //  R|W    R       MIX
+      //  R|W    R|W     MIX
+      //  R|W    W       MIX
+      //  W      R       MIX
+      //  W      R|W     MIX
+      //  W      W       MIX
+      // -> any writer means MIX; RD doesn't matter.
+      if (((other_wanted|loner_wanted) & CEPH_CAP_GWR) ||
+	  lock->is_waiter_for(SimpleLock::WAIT_WR))
+	scatter_mix(lock, need_issue);
+      else if (!lock->is_wrlocked())   // let excl wrlocks drain first
+	simple_sync(lock, need_issue);
+      else
+	dout(10) << " waiting for wrlock to drain" << dendl;
+    }    
+  }
+
+  // * -> excl?
+  // needs a target loner and no rdlocks; a directory must not have a subtree
+  // or exporting dirfrag, a non-directory must have EXCL/WR/BUFFER wanted
+  else if (lock->get_state() != LOCK_EXCL &&
+	   !lock->is_rdlocked() &&
+	   //!lock->is_waiter_for(SimpleLock::WAIT_WR) &&
+	   in->get_target_loner() >= 0 &&
+	   (in->is_dir() ?
+	    !in->has_subtree_or_exporting_dirfrag() :
+	    (wanted & (CEPH_CAP_GEXCL|CEPH_CAP_GWR|CEPH_CAP_GBUFFER)))) {
+    dout(7) << "file_eval stable, bump to loner " << *lock
+	    << " on " << *lock->get_parent() << dendl;
+    file_excl(lock, need_issue);
+  }
+
+  // * -> mixed?
+  // either a scatter was explicitly requested, or there is no target loner
+  // and somebody wants WR
+  else if (lock->get_state() != LOCK_MIX &&
+	   !lock->is_rdlocked() &&
+	   //!lock->is_waiter_for(SimpleLock::WAIT_WR) &&
+	   (lock->get_scatter_wanted() ||
+	    (in->get_target_loner() < 0 && (wanted & CEPH_CAP_GWR)))) {
+    dout(7) << "file_eval stable, bump to mixed " << *lock
+	    << " on " << *lock->get_parent() << dendl;
+    scatter_mix(lock, need_issue);
+  }
+  
+  // * -> sync?
+  else if (lock->get_state() != LOCK_SYNC &&
+	   !lock->is_wrlocked() &&   // drain wrlocks first!
+	   !lock->is_waiter_for(SimpleLock::WAIT_WR) &&
+	   !(wanted & CEPH_CAP_GWR) &&
+	   !((lock->get_state() == LOCK_MIX) &&
+	     in->is_dir() && in->has_subtree_or_exporting_dirfrag())  // if we are a delegation point, stay where we are
+	   //((wanted & CEPH_CAP_RD) || 
+	   //in->is_replicated() || 
+	   //lock->is_leased() ||
+	   //(!loner && lock->get_state() == LOCK_EXCL)) &&
+	   ) {
+    dout(7) << "file_eval stable, bump to sync " << *lock 
+	    << " on " << *lock->get_parent() << dendl;
+    simple_sync(lock, need_issue);
+  }
+  // no transition chosen: if the inode still needs recovery, start it now
+  else if (in->state_test(CInode::STATE_NEEDSRECOVER)) {
+    mds->mdcache->queue_file_recover(in);
+    mds->mdcache->do_file_recover();
+  }
+}
+
+
+
+/**
+ * Move a stable scatter lock toward LOCK_MIX on the auth MDS.
+ *
+ * From LOCK_LOCK the change is immediate: scattering is started, replicas
+ * (if any) are sent LOCK_AC_MIX with the encoded lock state, and the lock
+ * becomes LOCK_MIX.
+ *
+ * From SYNC, EXCL, XSYN or TSYN the lock first enters the matching *_MIX
+ * intermediate state; any other starting state aborts.  If something has to
+ * be gathered (rdlocks, replica acks, client leases, issued caps, pending
+ * file recovery) the parent is auth-pinned and the lock is left in the
+ * intermediate state; otherwise it goes to LOCK_MIX right away.
+ *
+ * @param lock        lock to scatter; parent must be an auth CInode and the
+ *                    lock must be stable (both asserted)
+ * @param need_issue  if non-null, set to true wherever caps would otherwise
+ *                    be issued directly via issue_caps()
+ */
+void Locker::scatter_mix(ScatterLock *lock, bool *need_issue)
+{
+  dout(7) << "scatter_mix " << *lock << " on " << *lock->get_parent() << dendl;
+
+  CInode *in = static_cast<CInode*>(lock->get_parent());
+  ceph_assert(in->is_auth());
+  ceph_assert(lock->is_stable());
+
+  if (lock->get_state() == LOCK_LOCK) {
+    // LOCK -> MIX needs no gather
+    in->start_scatter(lock);
+    if (in->is_replicated()) {
+      // data
+      bufferlist softdata;
+      lock->encode_locked_state(softdata);
+
+      // bcast to replicas
+      send_lock_message(lock, LOCK_AC_MIX, softdata);
+    }
+
+    // change lock
+    lock->set_state(LOCK_MIX);
+    lock->clear_scatter_wanted();
+    if (lock->get_cap_shift()) {
+      if (need_issue)
+	*need_issue = true;
+      else
+	issue_caps(in);
+    }
+  } else {
+    // gather?
+    switch (lock->get_state()) {
+    case LOCK_SYNC: lock->set_state(LOCK_SYNC_MIX); break;
+    case LOCK_EXCL: lock->set_state(LOCK_EXCL_MIX); break;
+    case LOCK_XSYN: lock->set_state(LOCK_XSYN_MIX); break;
+    case LOCK_TSYN: lock->set_state(LOCK_TSYN_MIX); break;
+    default: ceph_abort();
+    }
+
+    // count the things we must wait for before the lock can become MIX
+    int gather = 0;
+    if (lock->is_rdlocked()) {
+      if (lock->is_cached())
+	invalidate_lock_caches(lock);
+      gather++;
+    }
+    if (in->is_replicated()) {
+      if (lock->get_state() == LOCK_SYNC_MIX) { // for the rest states, replicas are already LOCK
+	send_lock_message(lock, LOCK_AC_MIX);
+	lock->init_gather();
+	gather++;
+      }
+    }
+    if (lock->is_leased()) {
+      revoke_client_leases(lock);
+      gather++;
+    }
+    if (lock->get_cap_shift() &&
+	in->is_head() &&
+	in->issued_caps_need_gather(lock)) {
+      if (need_issue)
+	*need_issue = true;
+      else
+	issue_caps(in);
+      gather++;
+    }
+    bool need_recover = false;
+    if (in->state_test(CInode::STATE_NEEDSRECOVER)) {
+      mds->mdcache->queue_file_recover(in);
+      need_recover = true;
+      gather++;
+    }
+
+    if (gather) {
+      // stay in the intermediate state; pin the parent while we wait
+      lock->get_parent()->auth_pin(lock);
+      // recovery was only queued above; start it after the auth_pin
+      if (need_recover)
+	mds->mdcache->do_file_recover();
+    } else {
+      // nothing to wait for: complete the transition now
+      in->start_scatter(lock);
+      lock->set_state(LOCK_MIX);
+      lock->clear_scatter_wanted();
+      if (in->is_replicated()) {
+	bufferlist softdata;
+	lock->encode_locked_state(softdata);
+	send_lock_message(lock, LOCK_AC_MIX, softdata);
+      }
+      if (lock->get_cap_shift()) {
+	if (need_issue)
+	  *need_issue = true;
+	else
+	  issue_caps(in);
+      }
+    }
+  }
+}
+
+
+/**
+ * Start moving a stable file lock to LOCK_EXCL on the auth MDS.
+ *
+ * Unless the lock is currently XSYN, the inode must have a loner client and
+ * no MDS caps wanted (asserted).  SYNC, MIX, LOCK and XSYN are the accepted
+ * starting states and each enters its *_EXCL intermediate state; any other
+ * state aborts.
+ *
+ * If something has to be gathered (rdlocks, wrlocks, replica acks, client
+ * leases, issued caps, pending file recovery) the parent is auth-pinned and
+ * the lock stays in the intermediate state.  Otherwise the lock becomes
+ * LOCK_EXCL immediately and caps are issued.
+ *
+ * @param lock        lock to make exclusive; parent is treated as a CInode
+ * @param need_issue  if non-null, set to true wherever caps would otherwise
+ *                    be issued directly via issue_caps()
+ */
+void Locker::file_excl(ScatterLock *lock, bool *need_issue)
+{
+  CInode *in = static_cast<CInode*>(lock->get_parent());
+  dout(7) << "file_excl " << *lock << " on " << *lock->get_parent() << dendl;  
+
+  ceph_assert(in->is_auth());
+  ceph_assert(lock->is_stable());
+
+  ceph_assert((in->get_loner() >= 0 && in->get_mds_caps_wanted().empty()) ||
+	 (lock->get_state() == LOCK_XSYN));  // must do xsyn -> excl -> <anything else>
+  
+  switch (lock->get_state()) {
+  case LOCK_SYNC: lock->set_state(LOCK_SYNC_EXCL); break;
+  case LOCK_MIX: lock->set_state(LOCK_MIX_EXCL); break;
+  case LOCK_LOCK: lock->set_state(LOCK_LOCK_EXCL); break;
+  case LOCK_XSYN: lock->set_state(LOCK_XSYN_EXCL); break;
+  default: ceph_abort();
+  }
+  // count the things we must wait for before the lock can become EXCL
+  int gather = 0;
+  
+  if (lock->is_rdlocked())
+    gather++;
+  if (lock->is_wrlocked())
+    gather++;
+  // lock caches are invalidated only if an rdlock or wrlock is being waited on
+  if (gather && lock->is_cached())
+    invalidate_lock_caches(lock);
+
+  if (in->is_replicated() &&
+      lock->get_state() != LOCK_LOCK_EXCL &&
+      lock->get_state() != LOCK_XSYN_EXCL) {  // if we were lock, replicas are already lock.
+    send_lock_message(lock, LOCK_AC_LOCK);
+    lock->init_gather();
+    gather++;
+  }
+  if (lock->is_leased()) {
+    revoke_client_leases(lock);
+    gather++;
+  }
+  if (in->is_head() &&
+      in->issued_caps_need_gather(lock)) {
+    if (need_issue)
+      *need_issue = true;
+    else
+      issue_caps(in);
+    gather++;
+  }
+  bool need_recover = false;
+  if (in->state_test(CInode::STATE_NEEDSRECOVER)) {
+    mds->mdcache->queue_file_recover(in);
+    need_recover = true;
+    gather++;
+  }
+  
+  if (gather) {
+    // stay in the intermediate state; pin the parent while we wait
+    lock->get_parent()->auth_pin(lock);
+    // recovery was only queued above; start it after the auth_pin
+    if (need_recover)
+      mds->mdcache->do_file_recover();
+  } else {
+    // nothing to wait for: complete the transition now
+    lock->set_state(LOCK_EXCL);
+    if (need_issue)
+      *need_issue = true;
+    else
+      issue_caps(in);
+  }
+}
+
+/**
+ * Start moving a file lock from LOCK_EXCL to LOCK_XSYN on the auth MDS.
+ *
+ * The inode must have a loner client and no MDS caps wanted (asserted), and
+ * the lock must currently be LOCK_EXCL; any other state aborts.  If there
+ * are wrlocks or issued caps to wait for, the parent is auth-pinned and the
+ * lock is left in LOCK_EXCL_XSYN.  Otherwise it becomes LOCK_XSYN at once,
+ * RD/STABLE waiters are woken and caps are issued.
+ *
+ * @param lock        lock to transition; parent is treated as a CInode
+ * @param need_issue  if non-null, set to true wherever caps would otherwise
+ *                    be issued directly via issue_caps()
+ */
+void Locker::file_xsyn(SimpleLock *lock, bool *need_issue)
+{
+  dout(7) << "file_xsyn on " << *lock << " on " << *lock->get_parent() << dendl;
+  CInode *in = static_cast<CInode *>(lock->get_parent());
+  ceph_assert(in->is_auth());
+  ceph_assert(in->get_loner() >= 0 && in->get_mds_caps_wanted().empty());
+
+  // EXCL is the only legal starting state
+  if (lock->get_state() != LOCK_EXCL) {
+    ceph_abort();
+  }
+  lock->set_state(LOCK_EXCL_XSYN);
+
+  bool must_wait = false;
+
+  // outstanding wrlocks have to drain first
+  if (lock->is_wrlocked()) {
+    if (lock->is_cached())
+      invalidate_lock_caches(lock);
+    must_wait = true;
+  }
+
+  // caps that are still issued have to be gathered first
+  if (in->is_head() && in->issued_caps_need_gather(lock)) {
+    if (need_issue)
+      *need_issue = true;
+    else
+      issue_caps(in);
+    must_wait = true;
+  }
+
+  if (must_wait) {
+    // stay in EXCL_XSYN; pin the parent while we wait
+    lock->get_parent()->auth_pin(lock);
+    return;
+  }
+
+  // nothing to wait for: complete the transition now
+  lock->set_state(LOCK_XSYN);
+  lock->finish_waiters(SimpleLock::WAIT_RD|SimpleLock::WAIT_STABLE);
+  if (need_issue)
+    *need_issue = true;
+  else
+    issue_caps(in);
+}
+
+/**
+ * Move a file lock from LOCK_PRE_SCAN to LOCK_SCAN and arrange recovery.
+ *
+ * If issued caps still have to be gathered, caps are re-issued and the
+ * inode is only flagged STATE_NEEDSRECOVER; otherwise the inode is queued
+ * for file recovery right away.
+ *
+ * @param lock  file lock in LOCK_PRE_SCAN (asserted); parent must be an
+ *              auth CInode (asserted)
+ */
+void Locker::file_recover(ScatterLock *lock)
+{
+  CInode *in = static_cast<CInode *>(lock->get_parent());
+  dout(7) << "file_recover " << *lock << " on " << *in << dendl;
+
+  ceph_assert(in->is_auth());
+  //assert(lock->is_stable());
+  ceph_assert(lock->get_state() == LOCK_PRE_SCAN); // only called from MDCache::start_files_to_recover()
+
+  /*
+  if (in->is_replicated()
+      lock->get_sm()->states[oldstate].replica_state != LOCK_LOCK) {
+    send_lock_message(lock, LOCK_AC_LOCK);
+    lock->init_gather();
+    gather++;
+  }
+  */
+
+  // the cap check and re-issue happen while the lock is still PRE_SCAN
+  bool caps_outstanding = false;
+  if (in->is_head() && in->issued_caps_need_gather(lock)) {
+    issue_caps(in);
+    caps_outstanding = true;
+  }
+
+  lock->set_state(LOCK_SCAN);
+
+  if (caps_outstanding) {
+    // recovery is deferred; only the flag is set here
+    in->state_set(CInode::STATE_NEEDSRECOVER);
+    return;
+  }
+  mds->mdcache->queue_file_recover(in);
+}
+
+
+// messenger
+/**
+ * Handle an inter-MDS MLock message addressed to a file (scatter) lock.
+ *
+ * The message action selects what is done:
+ *   - replica side: LOCK_AC_SYNC, LOCK_AC_LOCK, LOCK_AC_LOCKFLUSHED,
+ *     LOCK_AC_MIX
+ *   - auth side (acks): LOCK_AC_LOCKACK, LOCK_AC_SYNCACK, LOCK_AC_MIXACK
+ *   - requests: LOCK_AC_REQSCATTER, LOCK_AC_REQUNSCATTER,
+ *     LOCK_AC_REQRDLOCK, LOCK_AC_NUDGE
+ * Any other action aborts.
+ *
+ * The message is dropped if this MDS is in rejoin and the inode is still
+ * rejoining.
+ *
+ * @param lock  target lock; its parent is treated as a CInode
+ * @param m     lock message; get_asker() identifies the sending MDS
+ */
+void Locker::handle_file_lock(ScatterLock *lock, const cref_t<MLock> &m)
+{
+  CInode *in = static_cast<CInode*>(lock->get_parent());
+  int from = m->get_asker();
+
+  if (mds->is_rejoin()) {
+    if (in->is_rejoining()) {
+      dout(7) << "handle_file_lock still rejoining " << *in
+	      << ", dropping " << *m << dendl;
+      return;
+    }
+  }
+
+  dout(7) << "handle_file_lock a=" << lock->get_lock_action_name(m->get_action())
+	  << " on " << *lock
+	  << " from mds." << from << " " 
+	  << *in << dendl;
+
+  // caps are only (re)issued below for locks with a non-zero cap shift
+  bool caps = lock->get_cap_shift();
+  
+  switch (m->get_action()) {
+    // -- replica --
+  case LOCK_AC_SYNC:
+    ceph_assert(lock->get_state() == LOCK_LOCK ||
+	   lock->get_state() == LOCK_MIX ||
+	   lock->get_state() == LOCK_MIX_SYNC2);
+    
+    if (lock->get_state() == LOCK_MIX) {
+      // MIX: enter MIX_SYNC and let eval_gather() drive the rest; if the
+      // lock is still unstable and locked afterwards, flush the journal
+      lock->set_state(LOCK_MIX_SYNC);
+      eval_gather(lock, true);
+      if (lock->is_unstable_and_locked()) {
+	if (lock->is_cached())
+	  invalidate_lock_caches(lock);
+	mds->mdlog->flush();
+      }
+      break;
+    }
+
+    // LOCK or MIX_SYNC2: finish the transition to SYNC now
+    // (lock is already a ScatterLock*, so these casts are no-ops)
+    (static_cast<ScatterLock *>(lock))->finish_flush();
+    (static_cast<ScatterLock *>(lock))->clear_flushed();
+
+    // ok
+    lock->decode_locked_state(m->get_data());
+    lock->set_state(LOCK_SYNC);
+
+    // a rdlock is held across issue_caps()/finish_waiters()
+    // NOTE(review): presumably to keep the lock in SYNC while they run -- confirm
+    lock->get_rdlock();
+    if (caps)
+      issue_caps(in);
+    lock->finish_waiters(SimpleLock::WAIT_RD|SimpleLock::WAIT_STABLE);
+    lock->put_rdlock();
+    break;
+    
+  case LOCK_AC_LOCK:
+    // only SYNC and MIX may be asked to lock; anything else aborts
+    switch (lock->get_state()) {
+    case LOCK_SYNC: lock->set_state(LOCK_SYNC_LOCK); break;
+    case LOCK_MIX: lock->set_state(LOCK_MIX_LOCK); break;
+    default: ceph_abort();
+    }
+
+    eval_gather(lock, true);
+    if (lock->is_unstable_and_locked()) {
+      if (lock->is_cached())
+	invalidate_lock_caches(lock);
+      mds->mdlog->flush();
+    }
+
+    break;
+
+  case LOCK_AC_LOCKFLUSHED:
+    (static_cast<ScatterLock *>(lock))->finish_flush();
+    (static_cast<ScatterLock *>(lock))->clear_flushed();
+    // wake up scatter_nudge waiters
+    if (lock->is_stable())
+      lock->finish_waiters(SimpleLock::WAIT_STABLE);
+    break;
+    
+  case LOCK_AC_MIX:
+    ceph_assert(lock->get_state() == LOCK_SYNC ||
+           lock->get_state() == LOCK_LOCK ||
+	   lock->get_state() == LOCK_SYNC_MIX2);
+    
+    if (lock->get_state() == LOCK_SYNC) {
+      // MIXED
+      // SYNC: enter SYNC_MIX and let eval_gather() drive the rest
+      lock->set_state(LOCK_SYNC_MIX);
+      eval_gather(lock, true);
+      if (lock->is_unstable_and_locked()) {
+	if (lock->is_cached())
+	  invalidate_lock_caches(lock);
+	mds->mdlog->flush();
+      }
+      break;
+    } 
+
+    // ok
+    // LOCK or SYNC_MIX2: finish the transition to MIX now
+    lock->set_state(LOCK_MIX);
+    lock->decode_locked_state(m->get_data());
+
+    if (caps)
+      issue_caps(in);
+    
+    lock->finish_waiters(SimpleLock::WAIT_WR|SimpleLock::WAIT_STABLE);
+    break;
+
+
+    // -- auth --
+  case LOCK_AC_LOCKACK:
+    ceph_assert(lock->get_state() == LOCK_SYNC_LOCK ||
+           lock->get_state() == LOCK_MIX_LOCK ||
+           lock->get_state() == LOCK_MIX_LOCK2 ||
+           lock->get_state() == LOCK_MIX_EXCL ||
+           lock->get_state() == LOCK_SYNC_EXCL ||
+           lock->get_state() == LOCK_SYNC_MIX ||
+	   lock->get_state() == LOCK_MIX_TSYN);
+    // the sender must be one of the replicas we are still waiting on
+    ceph_assert(lock->is_gathering(from));
+    lock->remove_gather(from);
+    
+    // acks received while leaving MIX carry lock state to decode
+    if (lock->get_state() == LOCK_MIX_LOCK ||
+	lock->get_state() == LOCK_MIX_LOCK2 ||
+	lock->get_state() == LOCK_MIX_EXCL ||
+	lock->get_state() == LOCK_MIX_TSYN) {
+      lock->decode_locked_state(m->get_data());
+      // replica is waiting for AC_LOCKFLUSHED, eval_gather() should not
+      // delay calling scatter_writebehind().
+      lock->clear_flushed();
+    }
+
+    if (lock->is_gathering()) {
+      dout(7) << "handle_file_lock " << *in << " from " << from
+	      << ", still gathering " << lock->get_gather_set() << dendl;
+    } else {
+      dout(7) << "handle_file_lock " << *in << " from " << from
+	      << ", last one" << dendl;
+      eval_gather(lock);
+    }
+    break;
+    
+  case LOCK_AC_SYNCACK:
+    ceph_assert(lock->get_state() == LOCK_MIX_SYNC);
+    ceph_assert(lock->is_gathering(from));
+    lock->remove_gather(from);
+    
+    lock->decode_locked_state(m->get_data());
+
+    if (lock->is_gathering()) {
+      dout(7) << "handle_file_lock " << *in << " from " << from
+	      << ", still gathering " << lock->get_gather_set() << dendl;
+    } else {
+      dout(7) << "handle_file_lock " << *in << " from " << from
+	      << ", last one" << dendl;
+      eval_gather(lock);
+    }
+    break;
+
+  case LOCK_AC_MIXACK:
+    // unlike LOCKACK/SYNCACK, no lock state is decoded from this ack
+    ceph_assert(lock->get_state() == LOCK_SYNC_MIX);
+    ceph_assert(lock->is_gathering(from));
+    lock->remove_gather(from);
+    
+    if (lock->is_gathering()) {
+      dout(7) << "handle_file_lock " << *in << " from " << from
+	      << ", still gathering " << lock->get_gather_set() << dendl;
+    } else {
+      dout(7) << "handle_file_lock " << *in << " from " << from
+	      << ", last one" << dendl;
+      eval_gather(lock);
+    }
+    break;
+
+
+    // requests....
+  case LOCK_AC_REQSCATTER:
+    if (lock->is_stable()) {
+      /* NOTE: we can do this _even_ if !can_auth_pin (i.e. freezing)
+       *  because the replica should be holding an auth_pin if they're
+       *  doing this (and thus, we are freezing, not frozen, and indefinite
+       *  starvation isn't an issue).
+       */
+      dout(7) << "handle_file_lock got scatter request on " << *lock
+	      << " on " << *lock->get_parent() << dendl;
+      if (lock->get_state() != LOCK_MIX)  // i.e., the reqscatter didn't race with an actual mix/scatter
+	scatter_mix(lock);
+    } else {
+      // unstable: just record the wish
+      dout(7) << "handle_file_lock got scatter request, !stable, marking scatter_wanted on " << *lock
+	      << " on " << *lock->get_parent() << dendl;
+      lock->set_scatter_wanted();
+    }
+    break;
+
+  case LOCK_AC_REQUNSCATTER:
+    if (lock->is_stable()) {
+      /* NOTE: we can do this _even_ if !can_auth_pin (i.e. freezing)
+       *  because the replica should be holding an auth_pin if they're
+       *  doing this (and thus, we are freezing, not frozen, and indefinite
+       *  starvation isn't an issue).
+       */
+      dout(7) << "handle_file_lock got unscatter request on " << *lock
+	      << " on " << *lock->get_parent() << dendl;
+      if (lock->get_state() == LOCK_MIX)  // i.e., the requnscatter didn't race with an actual unscatter
+	simple_lock(lock);  // FIXME tempsync?
+    } else {
+      // unstable: despite the "ignoring" wording in the log line, the wish
+      // is recorded via set_unscatter_wanted()
+      dout(7) << "handle_file_lock ignoring unscatter request on " << *lock
+	      << " on " << *lock->get_parent() << dendl;
+      lock->set_unscatter_wanted();
+    }
+    break;
+
+  case LOCK_AC_REQRDLOCK:
+    handle_reqrdlock(lock, m);
+    break;
+
+  case LOCK_AC_NUDGE:
+    // a nudge is only acted on when we are auth and the parent is replicated
+    if (!lock->get_parent()->is_auth()) {
+      dout(7) << "handle_file_lock IGNORING nudge on non-auth " << *lock
+	      << " on " << *lock->get_parent() << dendl;
+    } else if (!lock->get_parent()->is_replicated()) {
+      dout(7) << "handle_file_lock IGNORING nudge on non-replicated " << *lock
+	      << " on " << *lock->get_parent() << dendl;
+    } else {
+      dout(7) << "handle_file_lock trying nudge on " << *lock
+	      << " on " << *lock->get_parent() << dendl;
+      scatter_nudge(lock, 0, true);
+      mds->mdlog->flush();
+    }
+    break;
+
+  default:
+    ceph_abort();
+  }  
+}
diff --git a/src/mds/Locker.h b/src/mds/Locker.h
new file mode 100644
index 000000000..3aff8db0b
--- /dev/null
+++ b/src/mds/Locker.h
@@ -0,0 +1,270 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software 
+ * Foundation.  See file COPYING.
+ * 
+ */
+
+#ifndef CEPH_MDS_LOCKER_H
+#define CEPH_MDS_LOCKER_H
+
+#include "include/types.h"
+
+#include "messages/MClientCaps.h"
+#include "messages/MClientCapRelease.h"
+#include "messages/MClientLease.h"
+#include "messages/MLock.h"
+
+#include "CInode.h"
+#include "SimpleLock.h"
+#include "MDSContext.h"
+#include "Mutation.h"
+#include "messages/MClientReply.h"
+
+struct SnapRealm;
+
+class MDSRank;
+class Session;
+class CDentry;
+class Capability;
+class SimpleLock;
+class ScatterLock;
+class LocalLockC;
+
+/**
+ * Locker declares the MDS entry points for acquiring and releasing locks on
+ * behalf of mutations/requests, for evaluating and transitioning lock
+ * state, for handling inter-MDS lock messages, and for issuing, revoking
+ * and updating client capabilities and leases.
+ *
+ * It is constructed with, and keeps raw pointers to, an MDSRank and an
+ * MDCache.
+ */
+class Locker {
+public:
+  Locker(MDSRank *m, MDCache *c);
+
+  SimpleLock *get_lock(int lock_type, const MDSCacheObjectInfo &info);
+  
+  // message entry points
+  void dispatch(const cref_t<Message> &m);
+  void handle_lock(const cref_t<MLock> &m);
+
+  void tick();
+
+  void nudge_log(SimpleLock *lock);
+
+  // acquire/drop sets of locks for a request or mutation
+  bool acquire_locks(MDRequestRef& mdr,
+		     MutationImpl::LockOpVec& lov,
+		     CInode *auth_pin_freeze=NULL,
+		     bool auth_pin_nonblocking=false);
+
+  bool try_rdlock_snap_layout(CInode *in, MDRequestRef& mdr,
+			      int n=0, bool want_layout=false);
+
+  void notify_freeze_waiter(MDSCacheObject *o);
+  void cancel_locking(MutationImpl *mut, std::set<CInode*> *pneed_issue);
+  void drop_locks(MutationImpl *mut, std::set<CInode*> *pneed_issue=0);
+  void set_xlocks_done(MutationImpl *mut, bool skip_dentry=false);
+  void drop_non_rdlocks(MutationImpl *mut, std::set<CInode*> *pneed_issue=0);
+  void drop_rdlocks_for_early_reply(MutationImpl *mut);
+  void drop_locks_for_fragment_unfreeze(MutationImpl *mut);
+
+  // lock caches
+  int get_cap_bit_for_lock_cache(int op);
+  void create_lock_cache(MDRequestRef& mdr, CInode *diri, file_layout_t *dir_layout=nullptr);
+  bool find_and_attach_lock_cache(MDRequestRef& mdr, CInode *diri);
+  void invalidate_lock_caches(CDir *dir);
+  void invalidate_lock_caches(SimpleLock *lock);
+  void invalidate_lock_cache(MDLockCache *lock_cache);
+  void eval_lock_caches(Capability *cap);
+  void put_lock_cache(MDLockCache* lock_cache);
+
+  // lock state evaluation
+  void eval_gather(SimpleLock *lock, bool first=false, bool *need_issue=0, MDSContext::vec *pfinishers=0);
+  void eval(SimpleLock *lock, bool *need_issue);
+  // unstable locks go through eval_gather(); stable locks are only
+  // evaluated when their parent is auth
+  void eval_any(SimpleLock *lock, bool *need_issue, MDSContext::vec *pfinishers=0, bool first=false) {
+    if (!lock->is_stable())
+      eval_gather(lock, first, need_issue, pfinishers);
+    else if (lock->get_parent()->is_auth())
+      eval(lock, need_issue);
+  }
+
+  void eval_scatter_gathers(CInode *in);
+
+  void eval_cap_gather(CInode *in, std::set<CInode*> *issue_set=0);
+
+  bool eval(CInode *in, int mask, bool caps_imported=false);
+  void try_eval(MDSCacheObject *p, int mask);
+  void try_eval(SimpleLock *lock, bool *pneed_issue);
+
+  // rdlock
+  bool _rdlock_kick(SimpleLock *lock, bool as_anon);
+  bool rdlock_try(SimpleLock *lock, client_t client);
+  bool rdlock_start(SimpleLock *lock, MDRequestRef& mut, bool as_anon=false);
+  void rdlock_finish(const MutationImpl::lock_iterator& it, MutationImpl *mut, bool *pneed_issue);
+  bool rdlock_try_set(MutationImpl::LockOpVec& lov, MDRequestRef& mdr);
+  bool rdlock_try_set(MutationImpl::LockOpVec& lov, MutationRef& mut);
+
+  // wrlock
+  void wrlock_force(SimpleLock *lock, MutationRef& mut);
+  bool wrlock_try(SimpleLock *lock, const MutationRef& mut, client_t client=-1);
+  bool wrlock_start(const MutationImpl::LockOp &op, MDRequestRef& mut);
+  void wrlock_finish(const MutationImpl::lock_iterator& it, MutationImpl *mut, bool *pneed_issue);
+
+  void remote_wrlock_start(SimpleLock *lock, mds_rank_t target, MDRequestRef& mut);
+  void remote_wrlock_finish(const MutationImpl::lock_iterator& it, MutationImpl *mut);
+
+  // xlock
+  bool xlock_start(SimpleLock *lock, MDRequestRef& mut);
+  void _finish_xlock(SimpleLock *lock, client_t xlocker, bool *pneed_issue);
+  void xlock_finish(const MutationImpl::lock_iterator& it, MutationImpl *mut, bool *pneed_issue);
+
+  void xlock_export(const MutationImpl::lock_iterator& it, MutationImpl *mut);
+  void xlock_import(SimpleLock *lock);
+  void xlock_downgrade(SimpleLock *lock, MutationImpl *mut);
+
+  // simple
+  void try_simple_eval(SimpleLock *lock);
+  bool simple_rdlock_try(SimpleLock *lock, MDSContext *con);
+
+  bool simple_sync(SimpleLock *lock, bool *need_issue=0);
+
+   // scatter
+  void scatter_eval(ScatterLock *lock, bool *need_issue);        // public for MDCache::adjust_subtree_auth()
+
+  void scatter_tick();
+  void scatter_nudge(ScatterLock *lock, MDSContext *c, bool forcelockchange=false);
+
+  void mark_updated_scatterlock(ScatterLock *lock);
+
+  void handle_reqrdlock(SimpleLock *lock, const cref_t<MLock> &m);
+
+  // caps
+
+  // when to defer processing client cap release or writeback due to being
+  // frozen.  the condition must be consistent across handle_client_caps and
+  // process_request_cap_release to preserve ordering.
+  bool should_defer_client_cap_frozen(CInode *in);
+
+  void process_request_cap_release(MDRequestRef& mdr, client_t client, const ceph_mds_request_release& r,
+				   std::string_view dname);
+
+  void kick_cap_releases(MDRequestRef& mdr);
+  void kick_issue_caps(CInode *in, client_t client, ceph_seq_t seq);
+
+  void remove_client_cap(CInode *in, Capability *cap, bool kill=false);
+
+  std::set<client_t> get_late_revoking_clients(double timeout) const;
+
+  void snapflush_nudge(CInode *in);
+  void mark_need_snapflush_inode(CInode *in);
+  bool is_revoking_any_caps_from(client_t client);
+
+  // local
+  void local_wrlock_grab(LocalLockC *lock, MutationRef& mut);
+
+  // file
+  void file_eval(ScatterLock *lock, bool *need_issue);
+  void file_recover(ScatterLock *lock);
+
+  void mark_updated_Filelock(ScatterLock *lock);
+
+  // -- file i/o --
+  version_t issue_file_data_version(CInode *in);
+  Capability* issue_new_caps(CInode *in, int mode, MDRequestRef& mdr, SnapRealm *conrealm);
+  int get_allowed_caps(CInode *in, Capability *cap, int &all_allowed,
+                       int &loner_allowed, int &xlocker_allowed);
+  int issue_caps(CInode *in, Capability *only_cap=0);
+  void issue_caps_set(std::set<CInode*>& inset);
+  void issue_truncate(CInode *in);
+  void revoke_stale_cap(CInode *in, client_t client);
+  bool revoke_stale_caps(Session *session);
+  void resume_stale_caps(Session *session);
+  void remove_stale_leases(Session *session);
+
+  void request_inode_file_caps(CInode *in);
+
+  bool check_client_ranges(CInode *in, uint64_t size);
+  bool calc_new_client_ranges(CInode *in, uint64_t size,
+			      bool *max_increased=nullptr);
+  bool check_inode_max_size(CInode *in, bool force_wrlock=false,
+                            uint64_t newmax=0, uint64_t newsize=0,
+			    utime_t mtime=utime_t());
+  void share_inode_max_size(CInode *in, Capability *only_cap=0);
+
+  // -- client leases --
+  void handle_client_lease(const cref_t<MClientLease> &m);
+
+  void issue_client_lease(CDentry *dn, CInode *in, MDRequestRef &mdr, utime_t now, bufferlist &bl);
+  void revoke_client_leases(SimpleLock *lock);
+  static void encode_lease(bufferlist& bl, const session_info_t& info, const LeaseStat& ls);
+
+protected:
+  // send a lock message for this lock, optionally carrying encoded data
+  void send_lock_message(SimpleLock *lock, int msg);
+  void send_lock_message(SimpleLock *lock, int msg, const bufferlist &data);
+
+  // -- locks --
+  void _drop_locks(MutationImpl *mut, std::set<CInode*> *pneed_issue, bool drop_rdlocks);
+
+  void simple_eval(SimpleLock *lock, bool *need_issue);
+  void handle_simple_lock(SimpleLock *lock, const cref_t<MLock> &m);
+
+  void simple_lock(SimpleLock *lock, bool *need_issue=0);
+  void simple_excl(SimpleLock *lock, bool *need_issue=0);
+  void simple_xlock(SimpleLock *lock);
+
+  void handle_scatter_lock(ScatterLock *lock, const cref_t<MLock> &m);
+  bool scatter_scatter_fastpath(ScatterLock *lock);
+  void scatter_scatter(ScatterLock *lock, bool nowait=false);
+  void scatter_tempsync(ScatterLock *lock, bool *need_issue=0);
+
+  void scatter_writebehind(ScatterLock *lock);
+
+  void scatter_writebehind_finish(ScatterLock *lock, MutationRef& mut);
+
+  // client cap message handling
+  bool _need_flush_mdlog(CInode *in, int wanted_caps, bool lock_state_any=false);
+  void adjust_cap_wanted(Capability *cap, int wanted, int issue_seq);
+  void handle_client_caps(const cref_t<MClientCaps> &m);
+  void _update_cap_fields(CInode *in, int dirty, const cref_t<MClientCaps> &m, CInode::mempool_inode *pi);
+  void _do_snap_update(CInode *in, snapid_t snap, int dirty, snapid_t follows, client_t client, const cref_t<MClientCaps> &m, const ref_t<MClientCaps> &ack);
+  void _do_null_snapflush(CInode *head_in, client_t client, snapid_t last=CEPH_NOSNAP);
+  bool _do_cap_update(CInode *in, Capability *cap, int dirty, snapid_t follows, const cref_t<MClientCaps> &m,
+		      const ref_t<MClientCaps> &ack, bool *need_flush=NULL);
+  void handle_client_cap_release(const cref_t<MClientCapRelease> &m);
+  void _do_cap_release(client_t client, inodeno_t ino, uint64_t cap_id, ceph_seq_t mseq, ceph_seq_t seq);
+  void caps_tick();
+
+  // local locks
+  bool local_wrlock_start(LocalLockC *lock, MDRequestRef& mut);
+  void local_wrlock_finish(const MutationImpl::lock_iterator& it, MutationImpl *mut);
+  bool local_xlock_start(LocalLockC *lock, MDRequestRef& mut);
+  void local_xlock_finish(const MutationImpl::lock_iterator& it, MutationImpl *mut);
+
+  // file lock transitions
+  void handle_file_lock(ScatterLock *lock, const cref_t<MLock> &m);
+  void scatter_mix(ScatterLock *lock, bool *need_issue=0);
+  void file_excl(ScatterLock *lock, bool *need_issue=0);
+  void file_xsyn(SimpleLock *lock, bool *need_issue=0);
+
+  void handle_inode_file_caps(const cref_t<MInodeFileCaps> &m);
+
+  void file_update_finish(CInode *in, MutationRef& mut, unsigned flags,
+			  client_t client, const ref_t<MClientCaps> &ack);
+
+  xlist<ScatterLock*> updated_scatterlocks;
+
+  // Maintain a global list to quickly find if any caps are late revoking
+  xlist<Capability*> revoking_caps;
+  // Maintain a per-client list to find clients responsible for late ones quickly
+  std::map<client_t, xlist<Capability*> > revoking_caps_by_client;
+
+  elist<CInode*> need_snapflush_inodes;
+
+private:
+  // context/callback classes that need access to non-public members
+  friend class C_MDL_CheckMaxSize;
+  friend class C_MDL_RequestInodeFileCaps;
+  friend class C_Locker_FileUpdate_finish;
+  friend class C_Locker_RetryCapRelease;
+  friend class C_Locker_Eval;
+  friend class C_Locker_ScatterWB;
+  friend class LockerContext;
+  friend class LockerLogContext;
+
+  bool any_late_revoking_caps(xlist<Capability*> const &revoking, double timeout) const;
+  uint64_t calc_new_max_size(const CInode::inode_const_ptr& pi, uint64_t size);
+
+  // non-owning back-pointers supplied to the constructor
+  // NOTE(review): ownership is not visible here -- confirm
+  MDSRank *mds;
+  MDCache *mdcache;
+  xlist<ScatterLock*> updated_filelocks;
+};
+#endif
diff --git a/src/mds/LogEvent.cc b/src/mds/LogEvent.cc
new file mode 100644
index 000000000..3df8f327c
--- /dev/null
+++ b/src/mds/LogEvent.cc
@@ -0,0 +1,215 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software 
+ * Foundation.  See file COPYING.
+ * 
+ */
+
+#include "common/config.h"
+#include "LogEvent.h"
+
+#include "MDSRank.h"
+
+// events i know of
+#include "events/ESubtreeMap.h"
+#include "events/EExport.h"
+#include "events/EImportStart.h"
+#include "events/EImportFinish.h"
+#include "events/EFragment.h"
+
+#include "events/EResetJournal.h"
+#include "events/ESession.h"
+#include "events/ESessions.h"
+
+#include "events/EUpdate.h"
+#include "events/EPeerUpdate.h"
+#include "events/EOpen.h"
+#include "events/ECommitted.h"
+#include "events/EPurged.h"
+
+#include "events/ETableClient.h"
+#include "events/ETableServer.h"
+
+#include "events/ENoOp.h"
+
+#define dout_context g_ceph_context
+
+
+/*
+ * Decode one journal entry.  The leading word is either a legacy event
+ * type (classic encoding) or the EVENT_NEW_ENCODING marker, in which case
+ * the real type is stored inside a versioned envelope.
+ *
+ * A buffer::error raised while reading the envelope is logged and yields a
+ * null pointer; otherwise the result of the typed overload is returned.
+ */
+std::unique_ptr<LogEvent> LogEvent::decode_event(bufferlist::const_iterator p)
+{
+  using ceph::decode;
+
+  // parse type
+  EventType type;
+  decode(type, p);
+
+  if (EVENT_NEW_ENCODING != type) {
+    // we are using classic encoding: the payload follows the type directly
+    return decode_event(p, type);
+  }
+
+  // versioned encoding: the actual event type lives inside the envelope
+  std::unique_ptr<LogEvent> event;
+  try {
+    DECODE_START(1, p);
+    decode(type, p);
+    event = decode_event(p, type);
+    DECODE_FINISH(p);
+  }
+  catch (const buffer::error &e) {
+    generic_dout(0) << "failed to decode LogEvent (type maybe " << type << ")" << dendl;
+    return nullptr;
+  }
+  return event;
+}
+
+
+/*
+ * Return the symbolic name of this event's type.  Types without a known
+ * name are logged at level 0 and reported as "UNKNOWN".
+ */
+std::string_view LogEvent::get_type_str() const
+{
+  const char *label = nullptr;
+
+  switch (_type) {
+  case EVENT_SUBTREEMAP:      label = "SUBTREEMAP";      break;
+  case EVENT_SUBTREEMAP_TEST: label = "SUBTREEMAP_TEST"; break;
+  case EVENT_EXPORT:          label = "EXPORT";          break;
+  case EVENT_IMPORTSTART:     label = "IMPORTSTART";     break;
+  case EVENT_IMPORTFINISH:    label = "IMPORTFINISH";    break;
+  case EVENT_FRAGMENT:        label = "FRAGMENT";        break;
+  case EVENT_RESETJOURNAL:    label = "RESETJOURNAL";    break;
+  case EVENT_SESSION:         label = "SESSION";         break;
+  case EVENT_SESSIONS_OLD:    label = "SESSIONS_OLD";    break;
+  case EVENT_SESSIONS:        label = "SESSIONS";        break;
+  case EVENT_UPDATE:          label = "UPDATE";          break;
+  case EVENT_PEERUPDATE:      label = "PEERUPDATE";      break;
+  case EVENT_OPEN:            label = "OPEN";            break;
+  case EVENT_COMMITTED:       label = "COMMITTED";       break;
+  case EVENT_PURGED:          label = "PURGED";          break;
+  case EVENT_TABLECLIENT:     label = "TABLECLIENT";     break;
+  case EVENT_TABLESERVER:     label = "TABLESERVER";     break;
+  case EVENT_NOOP:            label = "NOOP";            break;
+
+  default:
+    generic_dout(0) << "get_type_str: unknown type " << _type << dendl;
+    label = "UNKNOWN";
+    break;
+  }
+
+  return label;
+}
+
+// Name -> type lookup table consulted by str_to_type().  Each name is the
+// same string that get_type_str() returns for the corresponding type.
+const std::map<std::string, LogEvent::EventType> LogEvent::types = {
+  {"SUBTREEMAP", EVENT_SUBTREEMAP},
+  {"SUBTREEMAP_TEST", EVENT_SUBTREEMAP_TEST},
+  {"EXPORT", EVENT_EXPORT},
+  {"IMPORTSTART", EVENT_IMPORTSTART},
+  {"IMPORTFINISH", EVENT_IMPORTFINISH},
+  {"FRAGMENT", EVENT_FRAGMENT},
+  {"RESETJOURNAL", EVENT_RESETJOURNAL},
+  {"SESSION", EVENT_SESSION},
+  {"SESSIONS_OLD", EVENT_SESSIONS_OLD},
+  {"SESSIONS", EVENT_SESSIONS},
+  {"UPDATE", EVENT_UPDATE},
+  {"PEERUPDATE", EVENT_PEERUPDATE},
+  {"OPEN", EVENT_OPEN},
+  {"COMMITTED", EVENT_COMMITTED},
+  {"PURGED", EVENT_PURGED},
+  {"TABLECLIENT", EVENT_TABLECLIENT},
+  {"TABLESERVER", EVENT_TABLESERVER},
+  {"NOOP", EVENT_NOOP}
+};
+
+/*
+ * Resolve type string to type enum
+ *
+ * The lookup goes through std::map::at(), so a string that is not a key of
+ * LogEvent::types raises std::out_of_range; no sentinel value is returned
+ * (EventType is unsigned, so -1 is not representable as such).
+ */
+LogEvent::EventType LogEvent::str_to_type(std::string_view str)
+{
+  return LogEvent::types.at(std::string(str));
+}
+
+
+/*
+ * Instantiate the LogEvent subclass that corresponds to `type` and decode
+ * its payload from p.
+ *
+ * Returns nullptr (after logging at level 0) when the type is not
+ * recognised or when the payload decode raises buffer::error.  On success
+ * the iterator must have consumed the whole buffer; this is enforced with
+ * ceph_assert.
+ */
+std::unique_ptr<LogEvent> LogEvent::decode_event(bufferlist::const_iterator& p, LogEvent::EventType type)
+{
+  const auto length = p.get_remaining();
+  generic_dout(15) << "decode_log_event type " << type << ", size " << length << dendl;
+  
+  // create event
+  std::unique_ptr<LogEvent> le;
+  switch (type) {
+  case EVENT_SUBTREEMAP:
+    le = std::make_unique<ESubtreeMap>();
+    break;
+  case EVENT_SUBTREEMAP_TEST: 
+    // same class as EVENT_SUBTREEMAP, but the object keeps the TEST type
+    le = std::make_unique<ESubtreeMap>();
+    le->set_type(type);
+    break;
+  case EVENT_EXPORT:
+    le = std::make_unique<EExport>();
+    break;
+  case EVENT_IMPORTSTART:
+    le = std::make_unique<EImportStart>();
+    break;
+  case EVENT_IMPORTFINISH:
+    le = std::make_unique<EImportFinish>();
+    break;
+  case EVENT_FRAGMENT:
+    le = std::make_unique<EFragment>();
+    break;
+  case EVENT_RESETJOURNAL:
+    le = std::make_unique<EResetJournal>();
+    break;
+  case EVENT_SESSION:
+    le = std::make_unique<ESession>();
+    break;
+  case EVENT_SESSIONS_OLD:
+    {
+      // ESessions handles both encodings; flag the legacy one before decode
+      auto e = std::make_unique<ESessions>();
+      e->mark_old_encoding();
+      le = std::move(e);
+    }
+    break;
+  case EVENT_SESSIONS:
+    le = std::make_unique<ESessions>();
+    break;
+  case EVENT_UPDATE:
+    le = std::make_unique<EUpdate>();
+    break;
+  case EVENT_PEERUPDATE:
+    le = std::make_unique<EPeerUpdate>();
+    break;
+  case EVENT_OPEN:
+    le = std::make_unique<EOpen>();
+    break;
+  case EVENT_COMMITTED:
+    le = std::make_unique<ECommitted>();
+    break;
+  case EVENT_PURGED:
+    le = std::make_unique<EPurged>();
+    break;
+  case EVENT_TABLECLIENT:
+    le = std::make_unique<ETableClient>();
+    break;
+  case EVENT_TABLESERVER:
+    le = std::make_unique<ETableServer>();
+    break;
+  case EVENT_NOOP:
+    le = std::make_unique<ENoOp>();
+    break;
+  default:
+    generic_dout(0) << "uh oh, unknown log event type " << type << " length " << length << dendl;
+    return nullptr;
+  }
+
+  // decode
+  try {
+    le->decode(p);
+  }
+  catch (const buffer::error &e) {
+    generic_dout(0) << "failed to decode LogEvent type " << type << dendl;
+    return nullptr;
+  }
+
+  // a successfully decoded event is expected to use up the entire buffer
+  ceph_assert(p.end());
+  return le;
+}
+
diff --git a/src/mds/LogEvent.h b/src/mds/LogEvent.h
new file mode 100644
index 000000000..4e368c97b
--- /dev/null
+++ b/src/mds/LogEvent.h
@@ -0,0 +1,133 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software 
+ * Foundation.  See file COPYING.
+ * 
+ */
+
+#ifndef CEPH_LOGEVENT_H
+#define CEPH_LOGEVENT_H
+
+// Numeric identifiers for the LogEvent subtypes.
+// LogEvent::encode_with_header() writes EVENT_NEW_ENCODING first to flag a
+// versioned encoding, followed by the real type inside the envelope.
+// NOTE(review): these values appear to be part of the encoded journal
+// format, so existing numbers should presumably never be reused - confirm.
+#define EVENT_NEW_ENCODING 0 // indicates that the encoding is versioned
+#define EVENT_UNUSED       1 // was previously EVENT_STRING
+
+#define EVENT_SUBTREEMAP   2
+#define EVENT_EXPORT       3
+#define EVENT_IMPORTSTART  4
+#define EVENT_IMPORTFINISH 5
+#define EVENT_FRAGMENT     6
+
+#define EVENT_RESETJOURNAL 9
+
+#define EVENT_SESSION      10
+#define EVENT_SESSIONS_OLD 11
+#define EVENT_SESSIONS     12
+
+#define EVENT_UPDATE       20
+#define EVENT_PEERUPDATE   21
+#define EVENT_OPEN         22
+#define EVENT_COMMITTED    23
+#define EVENT_PURGED       24
+
+#define EVENT_TABLECLIENT  42
+#define EVENT_TABLESERVER  43
+
+#define EVENT_SUBTREEMAP_TEST   50
+#define EVENT_NOOP        51
+
+
+#include "include/buffer_fwd.h"
+#include "include/Context.h"
+#include "include/utime.h"
+
+class MDSRank;
+class LogSegment;
+class EMetaBlob;
+
+// generic log event
+//
+// Abstract base class of all journal events.  Subclasses provide
+// encode()/decode()/dump(); the static decode_event() turns an encoded
+// buffer back into the matching subclass.  Instances are neither
+// default-constructible nor copyable.
+class LogEvent {
+public:
+  typedef __u32 EventType;
+  friend class MDLog;
+
+  LogEvent() = delete;
+  // t is one of the EVENT_* identifiers
+  explicit LogEvent(int t) : _type(t) {}
+  LogEvent(const LogEvent&) = delete;
+  LogEvent& operator=(const LogEvent&) = delete;
+  virtual ~LogEvent() {}
+
+  // symbolic name of _type ("UNKNOWN" if there is none)
+  std::string_view get_type_str() const;
+  // inverse of get_type_str(); throws std::out_of_range for unknown names
+  static EventType str_to_type(std::string_view str);
+  EventType get_type() const { return _type; }
+  void set_type(EventType t) { _type = t; }
+
+  uint64_t get_start_off() const { return _start_off; }
+  void set_start_off(uint64_t o) { _start_off = o; }
+
+  utime_t get_stamp() const { return stamp; }
+  void set_stamp(utime_t t) { stamp = t; }
+
+  // encoding
+  virtual void encode(bufferlist& bl, uint64_t features) const = 0;
+  virtual void decode(bufferlist::const_iterator &) = 0;
+  // decode a full entry (type header + payload); null on failure
+  static std::unique_ptr<LogEvent> decode_event(bufferlist::const_iterator);
+  virtual void dump(Formatter *f) const = 0;
+
+  // Encode the EVENT_NEW_ENCODING marker followed by a versioned (v1)
+  // envelope that holds the event type and the subclass payload.
+  void encode_with_header(bufferlist& bl, uint64_t features) {
+    using ceph::encode;
+    encode(EVENT_NEW_ENCODING, bl);
+    ENCODE_START(1, 1, bl)
+    encode(_type, bl);
+    this->encode(bl, features);
+    ENCODE_FINISH(bl);
+  }
+
+  // default textual form: "event(<numeric type>)"
+  virtual void print(ostream& out) const { 
+    out << "event(" << _type << ")";
+  }
+
+  /*** live journal ***/
+  /* update_segment() - adjust any state we need to in the LogSegment 
+   */
+  virtual void update_segment() { }
+
+  /*** recovery ***/
+  /* replay() - replay given event.  this is idempotent.
+   * The base implementation aborts, so it must be overridden by any
+   * event type that is actually replayed.
+   */
+  virtual void replay(MDSRank *m) { ceph_abort(); }
+
+  /**
+   * If the subclass embeds a MetaBlob, return it here so that
+   * tools can examine metablobs while traversing lists of LogEvent.
+   */
+  virtual EMetaBlob *get_metablob() { return NULL; }
+
+protected:
+  LogSegment* get_segment() { return _segment; }
+  LogSegment const* get_segment() const { return _segment; }
+
+  utime_t stamp;
+
+private:
+  // name -> type table backing str_to_type()
+  static const std::map<std::string, LogEvent::EventType> types;
+
+  // build the subclass for an already-parsed type and decode its payload
+  static std::unique_ptr<LogEvent> decode_event(bufferlist::const_iterator&, EventType);
+
+  EventType _type = 0;
+  uint64_t _start_off = 0;
+  // not set anywhere in this class; MDLog is a friend and presumably
+  // assigns it - TODO confirm
+  LogSegment *_segment = nullptr;
+};
+
+// Stream insertion for LogEvent: forwards to the event's virtual print().
+inline ostream& operator<<(ostream& out, const LogEvent &le)
+{
+  le.print(out);
+  return out;
+}
+
+#endif
diff --git a/src/mds/LogSegment.h b/src/mds/LogSegment.h
new file mode 100644
index 000000000..a7f3f3971
--- /dev/null
+++ b/src/mds/LogSegment.h
@@ -0,0 +1,109 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software 
+ * Foundation.  See file COPYING.
+ * 
+ */
+
+#ifndef CEPH_LOGSEGMENT_H
+#define CEPH_LOGSEGMENT_H
+
+#include "include/elist.h"
+#include "include/interval_set.h"
+#include "include/Context.h"
+#include "MDSContext.h"
+#include "mdstypes.h"
+#include "CInode.h"
+#include "CDentry.h"
+#include "CDir.h"
+
+#include "include/unordered_set.h"
+
+using ceph::unordered_set;
+
+class CDir;
+class CInode;
+class CDentry;
+class MDSRank;
+struct MDPeerUpdate;
+
+// Per-segment journal bookkeeping: the lists of cache objects and table
+// state associated with one log segment, plus the contexts waiting for the
+// segment to expire.
+class LogSegment {
+ public:
+  using seq_t = uint64_t;
+
+  // Each elist is bound to the offset of the matching item_* link member
+  // inside CDir / CInode / CDentry.
+  // off defaults to -1; offset and end are uint64_t, so the default is
+  // stored as the converted unsigned value.
+  LogSegment(uint64_t _seq, loff_t off=-1) :
+    seq(_seq), offset(off), end(off),
+    dirty_dirfrags(member_offset(CDir, item_dirty)),
+    new_dirfrags(member_offset(CDir, item_new)),
+    dirty_inodes(member_offset(CInode, item_dirty)),
+    dirty_dentries(member_offset(CDentry, item_dirty)),
+    open_files(member_offset(CInode, item_open_file)),
+    dirty_parent_inodes(member_offset(CInode, item_dirty_parent)),
+    dirty_dirfrag_dir(member_offset(CInode, item_dirty_dirfrag_dir)),
+    dirty_dirfrag_nest(member_offset(CInode, item_dirty_dirfrag_nest)),
+    dirty_dirfrag_dirfragtree(member_offset(CInode, item_dirty_dirfrag_dirfragtree))
+  {}
+
+  void try_to_expire(MDSRank *mds, MDSGatherBuilder &gather_bld, int op_prio);
+  // Remove inos from purging_inodes; once nothing is left to purge,
+  // complete purged_cb (if one was registered) with result 0.
+  // NOTE(review): purged_cb is not reset after complete(); if complete()
+  // frees the context the pointer is left dangling - confirm that no
+  // later call can reach it.
+  void purge_inodes_finish(interval_set<inodeno_t>& inos){
+    purging_inodes.subtract(inos);
+    if (NULL != purged_cb &&
+	purging_inodes.empty())
+      purged_cb->complete(0);
+  }
+  // Register the context completed by purge_inodes_finish(); only one may
+  // be set (asserted).
+  void set_purged_cb(MDSContext* c){
+    ceph_assert(purged_cb == NULL);
+    purged_cb = c;
+  }
+  // Queue c (must be non-null) on expiry_waiters.
+  void wait_for_expiry(MDSContext *c)
+  {
+    ceph_assert(c != NULL);
+    expiry_waiters.push_back(c);
+  }
+
+  const seq_t seq;
+  uint64_t offset, end;
+  int num_events = 0;
+
+  // dirty items
+  elist<CDir*>    dirty_dirfrags, new_dirfrags;
+  elist<CInode*>  dirty_inodes;
+  elist<CDentry*> dirty_dentries;
+
+  elist<CInode*>  open_files;
+  elist<CInode*>  dirty_parent_inodes;
+  elist<CInode*>  dirty_dirfrag_dir;
+  elist<CInode*>  dirty_dirfrag_nest;
+  elist<CInode*>  dirty_dirfrag_dirfragtree;
+
+  set<CInode*> truncating_inodes;
+  // inodes still being purged; drained by purge_inodes_finish()
+  interval_set<inodeno_t> purging_inodes;
+  MDSContext* purged_cb = nullptr;
+
+  map<int, ceph::unordered_set<version_t> > pending_commit_tids;  // mdstable
+  set<metareqid_t> uncommitted_leaders;
+  set<metareqid_t> uncommitted_peers;
+  set<dirfrag_t> uncommitted_fragments;
+
+  // client request ids
+  map<int, ceph_tid_t> last_client_tids;
+
+  // potentially dirty sessions
+  std::set<entity_name_t> touched_sessions;
+
+  // table version
+  version_t inotablev = 0;
+  version_t sessionmapv = 0;
+  map<int,version_t> tablev;
+
+  // contexts registered through wait_for_expiry()
+  MDSContext::vec expiry_waiters;
+};
+
+#endif
diff --git a/src/mds/MDBalancer.cc b/src/mds/MDBalancer.cc
new file mode 100644
index 000000000..5ea313560
--- /dev/null
+++ b/src/mds/MDBalancer.cc
@@ -0,0 +1,1499 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+
+#include "include/compat.h"
+#include "mdstypes.h"
+
+#include "mon/MonClient.h"
+#include "MDBalancer.h"
+#include "MDSRank.h"
+#include "MDSMap.h"
+#include "CInode.h"
+#include "CDir.h"
+#include "MDCache.h"
+#include "Migrator.h"
+#include "Mantle.h"
+
+#include "include/Context.h"
+#include "msg/Messenger.h"
+
+#include <fstream>
+#include <vector>
+#include <map>
+using std::map;
+using std::vector;
+using std::chrono::duration_cast;
+
+#include "common/config.h"
+#include "common/errno.h"
+
+// Balancer logging.  dout(lvl) logs under ceph_subsys_mds_balancer when that
+// subsystem would gather at lvl, and under ceph_subsys_mds otherwise.  Every
+// message is prefixed with "mds.<nodeid>.bal <function> ".
+// dout opens a "do {" block that the redefined dendl closes with
+// "} while (0)", so each dout(...) must be terminated by dendl.  The derr
+// statements in this file end with dendl_impl instead.
+#define dout_context g_ceph_context
+#undef dout_prefix
+#define dout_prefix *_dout << "mds." << mds->get_nodeid() << ".bal " << __func__ << " "
+#undef dout
+#define dout(lvl) \
+  do {\
+    auto subsys = ceph_subsys_mds;\
+    if ((dout_context)->_conf->subsys.should_gather(ceph_subsys_mds_balancer, lvl)) {\
+      subsys = ceph_subsys_mds_balancer;\
+    }\
+    dout_impl(dout_context, ceph::dout::need_dynamic(subsys), lvl) dout_prefix
+#undef dendl
+#define dendl dendl_impl; } while (0)
+
+
+// Balancer load thresholds.
+// NOTE(review): none of these is referenced in the part of the file reviewed
+// here; units presumably match mds_load_t::mds_load() - TODO confirm.
+#define MIN_LOAD    50   //  ??
+#define MIN_REEXPORT 5  // will automatically reexport
+#define MIN_OFFLOAD 10   // point at which i stop trying, close enough
+
+
+/*
+ * Dispatch a message addressed to the balancer.
+ *
+ * MSG_MDS_HEARTBEAT is handed to handle_heartbeat(); any other message type
+ * is logged and aborts.  Returns 0 once the message has been handled.
+ */
+int MDBalancer::proc_message(const cref_t<Message> &m)
+{
+  if (m->get_type() == MSG_MDS_HEARTBEAT) {
+    handle_heartbeat(ref_cast<MHeartbeat>(m));
+    return 0;
+  }
+
+  // nothing else is expected here
+  derr << " balancer unknown message " << m->get_type() << dendl_impl;
+  ceph_abort_msg("balancer unknown message");
+
+  return 0;
+}
+
+/*
+ * Bind the balancer to its rank, messenger and monitor client, and take the
+ * initial values of the fragmentation options from the configuration.
+ */
+MDBalancer::MDBalancer(MDSRank *m, Messenger *msgr, MonClient *monc) :
+    mds(m), messenger(msgr), mon_client(monc)
+{
+  auto&& conf = g_conf();
+  bal_fragment_dirs = conf.get_val<bool>("mds_bal_fragment_dirs");
+  bal_fragment_interval = conf.get_val<int64_t>("mds_bal_fragment_interval");
+}
+
+/*
+ * Refresh the cached fragmentation options whose names appear in `changed`.
+ * mds_map is not used here.
+ */
+void MDBalancer::handle_conf_change(const std::set<std::string>& changed, const MDSMap& mds_map)
+{
+  const auto was_changed = [&changed](const char *key) {
+    return changed.find(key) != changed.end();
+  };
+
+  if (was_changed("mds_bal_fragment_dirs")) {
+    bal_fragment_dirs = g_conf().get_val<bool>("mds_bal_fragment_dirs");
+  }
+  if (was_changed("mds_bal_fragment_interval")) {
+    bal_fragment_interval = g_conf().get_val<int64_t>("mds_bal_fragment_interval");
+  }
+}
+
+/*
+ * Apply export pins.
+ *
+ * Phase 1 walks mdcache->export_pin_queue.  Inodes pinned to a rank that is
+ * >= max_mds are moved to export_pin_delayed_queue.  For the others, each
+ * auth dirfrag is either split (ephemeral distributed pins that are not yet
+ * fragmented deeply enough), has its AUXSUBTREE state cleared or set, or is
+ * exported to the target rank.  An inode leaves the queue only when no
+ * dirfrag asked for a retry.
+ *
+ * Phase 2 walks all auth subtrees and exports those whose effective pin
+ * names another rank below max_mds.
+ */
+void MDBalancer::handle_export_pins(void)
+{
+  const mds_rank_t max_mds = mds->mdsmap->get_max_mds();
+  auto mdcache = mds->mdcache;
+
+  auto &q = mdcache->export_pin_queue;
+  auto it = q.begin();
+  dout(20) << "export_pin_queue size=" << q.size() << dendl;
+  while (it != q.end()) {
+    // advance first: cur may be erased from q further down
+    auto cur = it++;
+    CInode *in = *cur;
+    ceph_assert(in->is_dir());
+
+    mds_rank_t export_pin = in->get_export_pin(false);
+    in->check_pin_policy(export_pin);
+
+    if (export_pin >= max_mds) {
+      // target rank is outside the current max_mds: park it in the delayed queue
+      dout(20) << " delay export_pin=" << export_pin << " on " << *in << dendl;
+      in->state_clear(CInode::STATE_QUEUEDEXPORTPIN);
+      q.erase(cur);
+
+      in->state_set(CInode::STATE_DELAYEDEXPORTPIN);
+      mdcache->export_pin_delayed_queue.insert(in);
+      continue;
+    }
+
+    dout(20) << " executing export_pin=" << export_pin << " on " << *in << dendl;
+    unsigned min_frag_bits = 0;
+    mds_rank_t target = MDS_RANK_NONE;
+    if (export_pin >= 0)
+      target = export_pin;
+    else if (export_pin == MDS_RANK_EPHEMERAL_RAND)
+      target = mdcache->hash_into_rank_bucket(in->ino());
+    else if (export_pin == MDS_RANK_EPHEMERAL_DIST)
+      // the target is chosen per dirfrag inside the loop below
+      min_frag_bits = mdcache->get_ephemeral_dist_frag_bits();
+
+    // stays true only if no dirfrag needs another pass
+    bool remove = true;
+    for (auto&& dir : in->get_dirfrags()) {
+      if (!dir->is_auth())
+	continue;
+
+      if (export_pin == MDS_RANK_EPHEMERAL_DIST) {
+	if (dir->get_frag().bits() < min_frag_bits) {
+	  // not fragmented deeply enough yet: request a fast split and retry
+	  if (!dir->state_test(CDir::STATE_CREATING) &&
+	      !dir->is_frozen() && !dir->is_freezing()) {
+	    queue_split(dir, true);
+	  }
+	  remove = false;
+	  continue;
+	}
+	target = mdcache->hash_into_rank_bucket(in->ino(), dir->get_frag());
+      }
+
+      if (target == MDS_RANK_NONE) {
+	// no pin target: drop a previously created aux subtree
+	if (dir->state_test(CDir::STATE_AUXSUBTREE)) {
+	  if (dir->is_frozen() || dir->is_freezing()) {
+	    // try again later
+	    remove = false;
+	    continue;
+	  }
+	  dout(10) << " clear auxsubtree on " << *dir << dendl;
+	  dir->state_clear(CDir::STATE_AUXSUBTREE);
+	  mds->mdcache->try_subtree_merge(dir);
+	}
+      } else if (target == mds->get_nodeid()) {
+	// pinned to this rank: make sure the dirfrag is an aux subtree here
+        if (dir->state_test(CDir::STATE_AUXSUBTREE)) {
+          ceph_assert(dir->is_subtree_root());
+        } else if (dir->state_test(CDir::STATE_CREATING) ||
+	           dir->is_frozen() || dir->is_freezing()) {
+	  // try again later
+	  remove = false;
+	  continue;
+	} else if (!dir->is_subtree_root()) {
+	  dir->state_set(CDir::STATE_AUXSUBTREE);
+	  mds->mdcache->adjust_subtree_auth(dir, mds->get_nodeid());
+	  dout(10) << " create aux subtree on " << *dir << dendl;
+	} else {
+	  dout(10) << " set auxsubtree bit on " << *dir << dendl;
+	  dir->state_set(CDir::STATE_AUXSUBTREE);
+	}
+      } else {
+        /* Only export a directory if it's non-empty. An empty directory will
+         * be sent back by the importer.
+         */
+        if (dir->get_num_head_items() > 0) {
+	  mds->mdcache->migrator->export_dir(dir, target);
+        }
+	// keep the inode queued whether or not an export was started
+	remove = false;
+      }
+    }
+
+    if (remove) {
+      in->state_clear(CInode::STATE_QUEUEDEXPORTPIN);
+      q.erase(cur);
+    }
+  }
+
+  std::vector<CDir*> authsubs = mdcache->get_auth_subtrees();
+  bool print_auth_subtrees = true;
+
+  // skip the per-subtree log lines when there are many subtrees and
+  // level 25 is not gathered for ceph_subsys_mds
+  if (authsubs.size() > AUTH_TREES_THRESHOLD &&
+      !g_conf()->subsys.should_gather<ceph_subsys_mds, 25>()) {
+    dout(15) << "number of auth trees = " << authsubs.size() << "; not "
+		"printing auth trees" << dendl;
+    print_auth_subtrees = false;
+  }
+
+  for (auto &cd : authsubs) {
+    mds_rank_t export_pin = cd->inode->get_export_pin();
+    cd->inode->check_pin_policy(export_pin);
+
+    // resolve ephemeral pins to a concrete rank
+    if (export_pin == MDS_RANK_EPHEMERAL_DIST) {
+      export_pin = mdcache->hash_into_rank_bucket(cd->ino(), cd->get_frag());
+    } else if (export_pin == MDS_RANK_EPHEMERAL_RAND) {
+      export_pin = mdcache->hash_into_rank_bucket(cd->ino());
+    }
+
+    if (print_auth_subtrees)
+      dout(25) << "auth tree " << *cd << " export_pin=" << export_pin << dendl;
+
+    if (export_pin >= 0 && export_pin != mds->get_nodeid() &&
+	export_pin < mds->mdsmap->get_max_mds()) {
+      mdcache->migrator->export_dir(cd, export_pin);
+    }
+  }
+}
+
+/*
+ * Periodic balancer work: process export pins (when mds_bal_export_pin is
+ * set), record the sampling time, and on rank 0 start a new heartbeat round
+ * once mds_bal_interval seconds have passed since the last one.
+ */
+void MDBalancer::tick()
+{
+  static int num_bal_times = g_conf()->mds_bal_max;
+  auto bal_interval = g_conf().get_val<int64_t>("mds_bal_interval");
+  auto bal_max_until = g_conf().get_val<int64_t>("mds_bal_max_until");
+  time now = clock::now();
+
+  if (g_conf()->mds_bal_export_pin)
+    handle_export_pins();
+
+  // sample?
+  const double since_sample = chrono::duration<double>(now-last_sample).count();
+  if (since_sample > g_conf()->mds_bal_sample_interval) {
+    dout(15) << "tick last_sample now " << now << dendl;
+    last_sample = now;
+  }
+
+  // balance?
+  // Truncating to whole seconds with duration_cast is fine here because the
+  // values it is compared against come from g_conf as integers.
+  // The checks are nested so that they run in the same order, and only as
+  // far, as a single short-circuit condition would.
+  if (mds->get_nodeid() == 0 && mds->is_active() && bal_interval > 0) {
+    if (duration_cast<chrono::seconds>(now - last_heartbeat).count() >= bal_interval) {
+      if (num_bal_times ||
+	  (bal_max_until >= 0 && mds->get_uptime().count() > bal_max_until)) {
+	last_heartbeat = now;
+	send_heartbeat();
+	num_bal_times--;
+      }
+    }
+  }
+
+  mds->mdcache->show_subtrees(10, true);
+}
+
+
+
+
+// Context that calls MDBalancer::send_heartbeat() when it is finished.
+// send_heartbeat() registers one with wait_for_open() while the cache is
+// not yet open.
+class C_Bal_SendHeartbeat : public MDSInternalContext {
+public:
+  explicit C_Bal_SendHeartbeat(MDSRank *mds_)
+    : MDSInternalContext(mds_)
+  {}
+
+  void finish(int f) override
+  {
+    mds->balancer->send_heartbeat();
+  }
+};
+
+
+/*
+ * Collapse this load record into a single number, selected by mds_bal_mode:
+ *   0 - weighted metadata load plus request rate and queue length
+ *   1 - request rate plus queue length
+ *   2 - CPU load average
+ * Any other mode aborts.
+ */
+double mds_load_t::mds_load() const
+{
+  const auto mode = g_conf()->mds_bal_mode;
+
+  if (mode == 0) {
+    return
+      .8 * auth.meta_load() +
+      .2 * all.meta_load() +
+      req_rate +
+      10.0 * queue_len;
+  }
+  if (mode == 1) {
+    return req_rate + 10.0*queue_len;
+  }
+  if (mode == 2) {
+    return cpu_load_avg;
+  }
+
+  ceph_abort();
+  return 0;
+}
+
+/*
+ * Build this rank's current mds_load_t.
+ *
+ * Popularity is summed over the root inode's dirfrags and the queue length
+ * is taken from the messenger.  Request rate, CPU load and traverse hit
+ * rate are computed as deltas against the counters saved by an earlier
+ * call, divided by the elapsed time.
+ */
+mds_load_t MDBalancer::get_load()
+{
+  auto now = clock::now();
+
+  mds_load_t load{DecayRate()}; /* zero DecayRate! */
+
+  if (mds->mdcache->get_root()) {
+    auto&& ls = mds->mdcache->get_root()->get_dirfrags();
+    for (auto &d : ls) {
+      load.auth.add(d->pop_auth_subtree_nested);
+      load.all.add(d->pop_nested);
+    }
+  } else {
+    dout(20) << "no root, no load" << dendl;
+  }
+
+  uint64_t num_requests = mds->get_num_requests();
+  uint64_t num_traverse = mds->logger->get(l_mds_traverse);
+  uint64_t num_traverse_hit = mds->logger->get(l_mds_traverse_hit);
+
+  // stays at 1 if the stat file cannot be opened or has too few fields
+  uint64_t cpu_time = 1;
+  {
+    string stat_path = PROCPREFIX "/proc/self/stat";
+    ifstream stat_file(stat_path);
+    if (stat_file.is_open()) {
+      // split the file into whitespace-separated tokens
+      vector<string> stat_vec(std::istream_iterator<string>{stat_file},
+			      std::istream_iterator<string>());
+      if (stat_vec.size() >= 15) {
+	// utime + stime
+	cpu_time = strtoll(stat_vec[13].c_str(), nullptr, 10) +
+		   strtoll(stat_vec[14].c_str(), nullptr, 10);
+      } else {
+	derr << "input file '" << stat_path << "' not resolvable" << dendl_impl;
+      }
+    } else {
+      derr << "input file '" << stat_path << "' not found" << dendl_impl;
+    }
+  }
+
+  load.queue_len = messenger->get_dispatch_queue_len();
+
+  bool update_last = true;
+  if (last_get_load != clock::zero() &&
+      now > last_get_load) {
+    double el = std::chrono::duration<double>(now-last_get_load).count();
+    if (el >= 1.0) {
+      // at least a second has passed: derive fresh rates from the deltas
+      if (num_requests > last_num_requests)
+	load.req_rate = (num_requests - last_num_requests) / el;
+      if (cpu_time > last_cpu_time)
+	load.cpu_load_avg = (cpu_time - last_cpu_time) / el;
+      if (num_traverse > last_num_traverse && num_traverse_hit > last_num_traverse_hit)
+        load.cache_hit_rate = (double)(num_traverse_hit - last_num_traverse_hit) / (num_traverse - last_num_traverse);
+    } else {
+      // less than a second since the baseline: reuse the rates last
+      // recorded for this rank in mds_load
+      auto p = mds_load.find(mds->get_nodeid());
+      if (p != mds_load.end()) {
+	load.req_rate = p->second.req_rate;
+	load.cpu_load_avg = p->second.cpu_load_avg;
+	load.cache_hit_rate = p->second.cache_hit_rate;
+      }
+      // keep the old baseline unless one of the counters went backwards
+      if (num_requests >= last_num_requests && cpu_time >= last_cpu_time &&
+          num_traverse >= last_num_traverse && num_traverse_hit >= last_num_traverse_hit)
+	update_last = false;
+    }
+  }
+
+  if (update_last) {
+    last_num_requests = num_requests;
+    last_cpu_time = cpu_time;
+    last_get_load = now;
+    last_num_traverse = num_traverse;
+    last_num_traverse_hit = num_traverse_hit;
+  }
+
+  dout(15) << load << dendl;
+  return load;
+}
+
+/*
+ * Read synchronously from RADOS using a timeout. We cannot do daemon-local
+ * fallbacks (i.e. kick off async read when we are processing the map and
+ * check status when we get here) with the way the mds is structured.
+ *
+ * Returns 0 after storing the object's contents in bal_code and its name in
+ * bal_version, -CEPHFS_ETIMEDOUT when the wait expired while r was still 0
+ * (the read is cancelled in that case), or the non-zero value of r otherwise.
+ */
+int MDBalancer::localize_balancer()
+{
+  /* reset everything */
+  bool ack = false;
+  int r = 0;
+  bufferlist lua_src;
+  ceph::mutex lock = ceph::make_mutex("lock");
+  ceph::condition_variable cond;
+
+  /* we assume that balancer is in the metadata pool */
+  object_t oid = object_t(mds->mdsmap->get_balancer());
+  object_locator_t oloc(mds->get_metadata_pool());
+  ceph_tid_t tid = mds->objecter->read(oid, oloc, 0, 0, CEPH_NOSNAP, &lua_src, 0,
+                                       new C_SafeCond(lock, cond, &ack, &r));
+  dout(15) << "launched non-blocking read tid=" << tid
+           << " oid=" << oid << " oloc=" << oloc << dendl;
+
+  /* timeout: if we waste half our time waiting for RADOS, then abort! */
+  // NOTE(review): wait_for is used without a predicate, so any wakeup that
+  // is not a timeout is treated as completion even if ack is still false -
+  // confirm whether a spurious wakeup is possible here.
+  std::cv_status ret_t = [&] {
+    auto bal_interval = g_conf().get_val<int64_t>("mds_bal_interval");
+    std::unique_lock locker{lock};
+    return cond.wait_for(locker, std::chrono::seconds(bal_interval / 2));
+  }();
+  /* success: store the balancer in memory and set the version. */
+  if (!r) {
+    if (ret_t == std::cv_status::timeout) {
+      mds->objecter->op_cancel(tid, -CEPHFS_ECANCELED);
+      return -CEPHFS_ETIMEDOUT;
+    }
+    bal_code.assign(lua_src.to_str());
+    bal_version.assign(oid.name);
+    // explicit "<<": this previously compiled only because the literal was
+    // concatenated with the trailing " " of dout_prefix
+    dout(10) << "bal_code=" << bal_code << dendl;
+  }
+  return r;
+}
+
+/*
+ * Publish this rank's load to every other up MDS.
+ *
+ * Does nothing while the cluster is degraded, and retries through
+ * C_Bal_SendHeartbeat once the cache is open if it is not open yet.  Rank 0
+ * starts a new epoch (and forgets the loads of the previous one) on every
+ * call.
+ */
+void MDBalancer::send_heartbeat()
+{
+  if (mds->is_cluster_degraded()) {
+    dout(10) << "degraded" << dendl;
+    return;
+  }
+
+  if (!mds->mdcache->is_open()) {
+    dout(10) << "not open" << dendl;
+    mds->mdcache->wait_for_open(new C_Bal_SendHeartbeat(mds));
+    return;
+  }
+
+  if (mds->get_nodeid() == 0) {
+    beat_epoch++;
+    mds_load.clear();
+  }
+
+  // my load
+  mds_load_t load = get_load();
+  mds->logger->set(l_mds_load_cent, 100 * load.mds_load());
+  mds->logger->set(l_mds_dispatch_queue_len, load.queue_len);
+
+  // record (or overwrite) my own entry
+  mds_load.insert_or_assign(mds->get_nodeid(), load);
+
+  // import_map -- how much do i import from whom
+  map<mds_rank_t, float> import_map;
+  for (auto& subtree : mds->mdcache->get_auth_subtrees()) {
+    mds_rank_t from = subtree->inode->authority().first;
+    // skip subtrees whose inode is mine, and stray directories
+    if (from != mds->get_nodeid() && !subtree->get_inode()->is_stray()) {
+      import_map[from] += subtree->pop_auth_subtree.meta_load();
+    }
+  }
+  mds_import_map[ mds->get_nodeid() ] = import_map;
+
+
+  dout(3) << " epoch " << beat_epoch << " load " << load << dendl;
+  for (const auto& [rank, imported] : import_map) {
+    dout(5) << "  import_map from " << rank << " -> " << imported << dendl;
+  }
+
+
+  // one heartbeat message per peer
+  set<mds_rank_t> up;
+  mds->get_mds_map()->get_up_mds_set(up);
+  for (const auto& peer : up) {
+    if (peer == mds->get_nodeid())
+      continue;
+    auto hb = make_message<MHeartbeat>(load, beat_epoch);
+    hb->get_import_map() = import_map;
+    mds->send_message_mds(hb, peer);
+  }
+}
+
+/*
+ * Process a heartbeat from another rank.
+ *
+ * Ignored unless this rank is active and the cluster is not degraded; if
+ * the cache is not open yet the message is retried once it is.  A heartbeat
+ * from rank 0 makes this rank adopt its epoch and answer with its own
+ * heartbeat.  The sender's load and import map are recorded, and once loads
+ * from all "in" ranks are present a rebalance is prepared.
+ */
+void MDBalancer::handle_heartbeat(const cref_t<MHeartbeat> &m)
+{
+  mds_rank_t who = mds_rank_t(m->get_source().num());
+  dout(25) << "=== got heartbeat " << m->get_beat() << " from " << m->get_source().num() << " " << m->get_load() << dendl;
+
+  if (!mds->is_active())
+    return;
+
+  if (!mds->mdcache->is_open()) {
+    dout(10) << "opening root on handle_heartbeat" << dendl;
+    mds->mdcache->wait_for_open(new C_MDS_RetryMessage(mds, m));
+    return;
+  }
+
+  if (mds->is_cluster_degraded()) {
+    dout(10) << " degraded, ignoring" << dendl;
+    return;
+  }
+
+  // a peer is already in a newer epoch than the one we know about
+  if (mds->get_nodeid() != 0 && m->get_beat() > beat_epoch) {
+    dout(10) << "receive next epoch " << m->get_beat() << " from mds." << who << " before mds0" << dendl;
+
+    beat_epoch = m->get_beat();
+    // clear the mds load info whose epoch is less than beat_epoch 
+    mds_load.clear();
+  }
+
+  if (who == 0) {
+    dout(20) << " from mds0, new epoch " << m->get_beat() << dendl;
+    if (beat_epoch != m->get_beat()) {
+      beat_epoch = m->get_beat();
+      mds_load.clear();
+    }
+
+    // answer rank 0 with our own load
+    send_heartbeat();
+
+    mds->mdcache->show_subtrees();
+  } else if (mds->get_nodeid() == 0) {
+    // rank 0 only accepts heartbeats that belong to its current epoch
+    if (beat_epoch != m->get_beat()) {
+      dout(10) << " old heartbeat epoch, ignoring" << dendl;
+      return;
+    }
+  }
+
+  {
+    // insert the sender's load, overwriting an existing entry
+    auto em = mds_load.emplace(std::piecewise_construct, std::forward_as_tuple(who), std::forward_as_tuple(m->get_load()));
+    if (!em.second) {
+      em.first->second = m->get_load();
+    }
+  }
+  mds_import_map[who] = m->get_import_map();
+
+  {
+    unsigned cluster_size = mds->get_mds_map()->get_num_in_mds();
+    if (mds_load.size() == cluster_size) {
+      // let's go!
+      //export_empties();  // no!
+
+      /* avoid spamming ceph -w if user does not turn mantle on */
+      if (mds->mdsmap->get_balancer() != "") {
+        int r = mantle_prep_rebalance();
+        if (!r) return;
+	// mantle failed: warn and fall through to the built-in balancer
+	mds->clog->warn() << "using old balancer; mantle failed for "
+                          << "balancer=" << mds->mdsmap->get_balancer()
+                          << " : " << cpp_strerror(r);
+      }
+      prep_rebalance(m->get_beat());
+    }
+  }
+}
+
+/*
+ * Pair an exporter with an importer.
+ *
+ * Moves min(maxex, maxim) of load from `ex` to `im`: the amount is added to
+ * state.exported[ex] and state.imported[im] (and to state.targets[im] when
+ * the exporter is this rank) and subtracted from both maxex and maxim.
+ * Returns the amount moved, or 0.0 if either side has nothing left.
+ */
+double MDBalancer::try_match(balance_state_t& state, mds_rank_t ex, double& maxex,
+                             mds_rank_t im, double& maxim)
+{
+  const bool exhausted = maxex <= 0 || maxim <= 0;
+  if (exhausted)
+    return 0.0;
+
+  const double amount = std::min(maxex, maxim);
+
+  dout(5) << "   - mds." << ex << " exports " << amount << " to mds." << im << dendl;
+
+  // only exports that leave this rank become actual targets
+  if (ex == mds->get_nodeid()) {
+    state.targets[im] += amount;
+  }
+
+  state.exported[ex] += amount;
+  state.imported[im] += amount;
+
+  maxex -= amount;
+  maxim -= amount;
+
+  return amount;
+}
+
+/*
+ * Schedule a split of dir.
+ *
+ * The dirfrag id is recorded in split_pending.  With fast=true the split is
+ * queued as an MDSRank waiter; otherwise a timer event is armed after
+ * bal_fragment_interval, but only if the dirfrag was not already pending.
+ * The callback looks the dirfrag up again by id and drops the request if it
+ * is no longer cached or no longer auth.
+ */
+void MDBalancer::queue_split(const CDir *dir, bool fast)
+{
+  dout(10) << __func__ << " enqueuing " << *dir
+                       << " (fast=" << fast << ")" << dendl;
+
+  const dirfrag_t df = dir->dirfrag();
+
+  // captures the dirfrag id rather than the CDir pointer
+  auto callback = [this, df](int r) {
+    if (split_pending.erase(df) == 0) {
+      // Someone beat me to it.  This can happen in the fast splitting
+      // path, because we spawn two contexts, one with mds->timer and
+      // one with mds->queue_waiter.  The loser can safely just drop
+      // out.
+      return;
+    }
+
+    auto mdcache = mds->mdcache;
+
+    CDir *dir = mdcache->get_dirfrag(df);
+    if (!dir) {
+      dout(10) << "drop split on " << df << " because not in cache" << dendl;
+      return;
+    }
+    if (!dir->is_auth()) {
+      dout(10) << "drop split on " << df << " because non-auth" << dendl;
+      return;
+    }
+
+    // Pass on to MDCache: note that the split might still not
+    // happen if the checks in MDCache::can_fragment fail.
+    dout(10) << __func__ << " splitting " << *dir << dendl;
+    int bits = g_conf()->mds_bal_split_bits;
+    if (dir->inode->is_ephemeral_dist()) {
+      // split far enough to reach the minimum depth for ephemeral
+      // distributed pins in one step
+      unsigned min_frag_bits = mdcache->get_ephemeral_dist_frag_bits();
+      if (df.frag.bits() + bits < min_frag_bits)
+	bits = min_frag_bits - df.frag.bits();
+    }
+    mdcache->split_dir(dir, bits);
+  };
+
+  auto ret = split_pending.insert(df);
+  bool is_new = ret.second;
+
+  if (fast) {
+    // Do the split ASAP: enqueue it in the MDSRank waiters which are
+    // run at the end of dispatching the current request
+    mds->queue_waiter(new MDSInternalContextWrapper(mds, 
+          new LambdaContext(std::move(callback))));
+  } else if (is_new) {
+    // Set a timer to really do the split: we don't do it immediately
+    // so that bursts of ops on a directory have a chance to go through
+    // before we freeze it.
+    mds->timer.add_event_after(bal_fragment_interval,
+                               new LambdaContext(std::move(callback)));
+  }
+}
+
+/**
+ * Schedule a merge attempt for \p dir after bal_fragment_interval.
+ *
+ * Nothing is scheduled if the dirfrag is already in merge_pending.  When
+ * the timer fires, the context walks up the fragment tree for as long as
+ * every sibling dirfrag is cached, auth and should_merge(), and asks
+ * MDCache to merge up to the highest such ancestor fragment.
+ *
+ * \param dir dirfrag to consider for merging
+ */
+void MDBalancer::queue_merge(CDir *dir)
+{
+  const auto df = dir->dirfrag();
+
+  auto do_merge = [this, df](int r) {
+    ceph_assert(df.frag != frag_t());
+
+    // The entry is expected to be present: merge_pending is checked before
+    // a context is started, so only one is in flight per dirfrag, and this
+    // context is the only place that erases it.
+    merge_pending.erase(df);
+
+    auto mdcache = mds->mdcache;
+    CDir *target = mdcache->get_dirfrag(df);
+    if (!target) {
+      dout(10) << "drop merge on " << df << " because not in cache" << dendl;
+      return;
+    }
+    ceph_assert(target->dirfrag() == df);
+
+    if (!target->is_auth()) {
+      dout(10) << "drop merge on " << *target << " because lost auth" << dendl;
+      return;
+    }
+
+    dout(10) << "merging " << *target << dendl;
+
+    CInode *diri = target->get_inode();
+
+    // Never merge above the ephemeral-distribution fragment depth.
+    unsigned floor_bits = 0;
+    if (diri->is_ephemeral_dist())
+      floor_bits = mdcache->get_ephemeral_dist_frag_bits();
+
+    // Climb one level per iteration while the sibling side is mergeable.
+    frag_t merged = target->get_frag();
+    while (merged.bits() > floor_bits) {
+      frag_t sibfg = merged.get_sibling();
+      auto&& [complete, sibs] = diri->get_dirfrags_under(sibfg);
+      if (!complete) {
+        dout(10) << "  not all sibs under " << sibfg << " in cache (have " << sibs << ")" << dendl;
+        break;
+      }
+      bool mergeable = true;
+      for (auto& sib : sibs) {
+        mergeable = sib->is_auth() && sib->should_merge();
+        if (!mergeable)
+          break;
+      }
+      if (!mergeable) {
+        dout(10) << "  not all sibs under " << sibfg << " " << sibs << " should_merge" << dendl;
+        break;
+      }
+      dout(10) << "  all sibs under " << sibfg << " " << sibs << " should merge" << dendl;
+      merged = merged.parent();
+    }
+
+    // Only act if at least one level could be climbed.
+    if (merged != target->get_frag())
+      mdcache->merge_dir(diri, merged);
+  };
+
+  if (merge_pending.count(df) != 0) {
+    dout(20) << " dir already in queue " << *dir << dendl;
+    return;
+  }
+
+  dout(20) << " enqueued dir " << *dir << dendl;
+  merge_pending.insert(df);
+  mds->timer.add_event_after(bal_fragment_interval,
+      new LambdaContext(std::move(do_merge)));
+}
+
+/**
+ * Work out how much load should move between ranks and hand the result to
+ * try_rebalance().
+ *
+ * With mds_thrash_exports set, every up rank is entered in state.targets
+ * with an amount of 0.  Otherwise the per-rank loads in mds_load are
+ * rescaled, target_load is computed as the mean over cluster_size, and
+ * exporters are matched to importers through try_match().  In that second
+ * mode the function returns early, without calling try_rebalance(), when
+ * this rank is not (or not yet long enough) overloaded.
+ *
+ * \param beat only its parity is used here: odd values match the biggest
+ *             exporters first, even values the smallest (presumably the
+ *             heartbeat number -- confirm against the caller)
+ */
+void MDBalancer::prep_rebalance(int beat)
+{
+  balance_state_t state;
+
+  if (g_conf()->mds_thrash_exports) {
+    //we're going to randomly export to all the mds in the cluster
+    set<mds_rank_t> up_mds;
+    mds->get_mds_map()->get_up_mds_set(up_mds);
+    for (const auto &rank : up_mds) {
+      state.targets[rank] = 0.0;
+    }
+  } else {
+    int cluster_size = mds->get_mds_map()->get_num_in_mds();
+    mds_rank_t whoami = mds->get_nodeid();
+    // find_exports() measures its running time against this timestamp
+    rebalance_time = clock::now();
+
+    dout(7) << "cluster loads are" << dendl;
+
+    mds->mdcache->migrator->clear_export_queue();
+
+    // rescale!  turn my mds_load back into meta_load units
+    // load_fac stays 1.0 when this rank has no entry in mds_load or its
+    // mds_load() is not positive (which also avoids dividing by zero).
+    double load_fac = 1.0;
+    map<mds_rank_t, mds_load_t>::iterator m = mds_load.find(whoami);
+    if ((m != mds_load.end()) && (m->second.mds_load() > 0)) {
+      double metald = m->second.auth.meta_load();
+      double mdsld = m->second.mds_load();
+      load_fac = metald / mdsld;
+      dout(7) << " load_fac is " << load_fac
+	      << " <- " << m->second.auth << " " << metald
+	      << " / " << mdsld
+	      << dendl;
+    }
+
+    mds_meta_load.clear();
+
+    // Rebuild mds_meta_load and collect (load, rank) pairs sorted by load.
+    double total_load = 0.0;
+    multimap<double,mds_rank_t> load_map;
+    for (mds_rank_t i=mds_rank_t(0); i < mds_rank_t(cluster_size); i++) {
+      // NOTE: at() throws std::out_of_range if rank i has no entry, so
+      // mds_load must hold every rank in [0, cluster_size) at this point.
+      mds_load_t& load = mds_load.at(i);
+
+      double l = load.mds_load() * load_fac;
+      mds_meta_load[i] = l;
+
+      // only rank 0 logs the per-rank table
+      if (whoami == 0)
+	dout(7) << "  mds." << i
+		<< " " << load
+		<< " = " << load.mds_load()
+		<< " ~ " << l << dendl;
+
+      if (whoami == i) my_load = l;
+      total_load += l;
+
+      load_map.insert(pair<double,mds_rank_t>( l, i ));
+    }
+
+    // target load
+    target_load = total_load / (double)cluster_size;
+    dout(7) << "my load " << my_load
+	    << "   target " << target_load
+	    << "   total " << total_load
+	    << dendl;
+
+    // under or over?
+    // Remember, per rank, the latest epoch in which its load was below
+    // target_load * (1 + mds_bal_min_rebalance).
+    for (const auto& [load, rank] : load_map) {
+      if (load < target_load * (1.0 + g_conf()->mds_bal_min_rebalance)) {
+	dout(7) << " mds." << rank << " is underloaded or barely overloaded." << dendl;
+	mds_last_epoch_under_map[rank] = beat_epoch;
+      }
+    }
+
+    // operator[] inserts a 0 entry if this rank was never recorded as under
+    int last_epoch_under = mds_last_epoch_under_map[whoami];
+    if (last_epoch_under == beat_epoch) {
+      dout(7) << "  i am underloaded or barely overloaded, doing nothing." << dendl;
+      return;
+    }
+    // am i over long enough?
+    // (a last_epoch_under of 0 means "never recorded" and passes this check)
+    if (last_epoch_under && beat_epoch - last_epoch_under < 2) {
+      dout(7) << "  i am overloaded, but only for " << (beat_epoch - last_epoch_under) << " epochs" << dendl;
+      return;
+    }
+
+    dout(7) << "  i am sufficiently overloaded" << dendl;
+
+
+    // first separate exporters and importers
+    // Both multimaps are keyed by load, so begin() is the least loaded
+    // rank and rbegin() the most loaded one.
+    multimap<double,mds_rank_t> importers;
+    multimap<double,mds_rank_t> exporters;
+    set<mds_rank_t>             importer_set;
+    set<mds_rank_t>             exporter_set;
+
+    for (multimap<double,mds_rank_t>::iterator it = load_map.begin();
+	 it != load_map.end();
+	 ++it) {
+      if (it->first < target_load) {
+	dout(15) << "   mds." << it->second << " is importer" << dendl;
+	importers.insert(pair<double,mds_rank_t>(it->first,it->second));
+	importer_set.insert(it->second);
+      } else {
+	// a rank at or above target only exports once it has not been
+	// "under" for at least two epochs (same rule as applied to us above)
+	int mds_last_epoch_under = mds_last_epoch_under_map[it->second];
+	if (!(mds_last_epoch_under && beat_epoch - mds_last_epoch_under < 2)) {
+	  dout(15) << "   mds." << it->second << " is exporter" << dendl;
+	  exporters.insert(pair<double,mds_rank_t>(it->first,it->second));
+	  exporter_set.insert(it->second);
+	}
+      }
+    }
+
+
+    // determine load transfer mapping
+
+    // First pass: pair each exporter with the ranks listed for it in
+    // mds_import_map.
+    if (true) {
+      // analyze import_map; do any matches i can
+
+      dout(15) << "  matching exporters to import sources" << dendl;
+
+      // big -> small exporters
+      for (multimap<double,mds_rank_t>::reverse_iterator ex = exporters.rbegin();
+	   ex != exporters.rend();
+	   ++ex) {
+	double maxex = get_maxex(state, ex->second);
+	if (maxex <= .001) continue;
+
+	// check importers. for now, just in arbitrary order (no intelligent matching).
+	for (map<mds_rank_t, float>::iterator im = mds_import_map[ex->second].begin();
+	     im != mds_import_map[ex->second].end();
+	     ++im) {
+	  double maxim = get_maxim(state, im->first);
+	  if (maxim <= .001) continue;
+	  // maxex and maxim are passed by reference to try_match()
+	  try_match(state, ex->second, maxex, im->first, maxim);
+	  if (maxex <= .001) break;
+	}
+      }
+    }
+
+    // Second pass: walk exporters and importers in lock step; whichever
+    // side has (almost) nothing left after a match is advanced.
+    // old way
+    if (beat % 2 == 1) {
+      dout(15) << "  matching big exporters to big importers" << dendl;
+      // big exporters to big importers
+      multimap<double,mds_rank_t>::reverse_iterator ex = exporters.rbegin();
+      multimap<double,mds_rank_t>::iterator im = importers.begin();
+      while (ex != exporters.rend() &&
+	     im != importers.end()) {
+        double maxex = get_maxex(state, ex->second);
+	double maxim = get_maxim(state, im->second);
+	if (maxex < .001 || maxim < .001) break;
+	try_match(state, ex->second, maxex, im->second, maxim);
+	if (maxex <= .001) ++ex;
+	if (maxim <= .001) ++im;
+      }
+    } else { // new way
+      dout(15) << "  matching small exporters to big importers" << dendl;
+      // small exporters to big importers
+      multimap<double,mds_rank_t>::iterator ex = exporters.begin();
+      multimap<double,mds_rank_t>::iterator im = importers.begin();
+      while (ex != exporters.end() &&
+	     im != importers.end()) {
+        double maxex = get_maxex(state, ex->second);
+	double maxim = get_maxim(state, im->second);
+	if (maxex < .001 || maxim < .001) break;
+	try_match(state, ex->second, maxex, im->second, maxim);
+	if (maxex <= .001) ++ex;
+	if (maxim <= .001) ++im;
+      }
+    }
+  }
+  try_rebalance(state);
+}
+
+/**
+ * Let the Mantle balancer compute the export targets and act on them via
+ * try_rebalance().
+ *
+ * \return 0 on success; a non-zero result of localize_balancer();
+ *         -CEPHFS_EINVAL if the number of targets produced differs from
+ *         get_num_in_mds() (checked before the Mantle return code);
+ *         otherwise a non-zero result of Mantle::balance()
+ */
+int MDBalancer::mantle_prep_rebalance()
+{
+  balance_state_t state;
+
+  /* reload the balancer code if the version in the MDSMap has changed */
+  if (bal_version != mds->mdsmap->get_balancer()) {
+    bal_version.assign("");
+    int err = localize_balancer();
+    if (err)
+      return err;
+
+    /* only spam the cluster log from 1 mds on version changes */
+    if (mds->get_nodeid() == 0)
+      mds->clog->info() << "mantle balancer version changed: " << bal_version;
+  }
+
+  /* prepare for balancing */
+  int cluster_size = mds->get_mds_map()->get_num_in_mds();
+  rebalance_time = clock::now();
+  mds->mdcache->migrator->clear_export_queue();
+
+  /* one metrics table per rank, filled from that rank's load struct */
+  vector < map<string, double> > metrics (cluster_size);
+  for (mds_rank_t rank = mds_rank_t(0); rank < mds_rank_t(cluster_size); rank++) {
+    mds_load_t& load = mds_load.at(rank);
+
+    auto& table = metrics[rank];
+    table["auth.meta_load"] = load.auth.meta_load();
+    table["all.meta_load"] = load.all.meta_load();
+    table["req_rate"] = load.req_rate;
+    table["queue_len"] = load.queue_len;
+    table["cpu_load_avg"] = load.cpu_load_avg;
+  }
+
+  /* execute the balancer */
+  Mantle mantle;
+  int rc = mantle.balance(bal_code, mds->get_nodeid(), metrics, state.targets);
+  dout(7) << " mantle decided that new targets=" << state.targets << dendl;
+
+  /* mantle doesn't know about cluster size, so check target len here */
+  if ((int) state.targets.size() != cluster_size)
+    return -CEPHFS_EINVAL;
+  if (rc)
+    return rc;
+
+  try_rebalance(state);
+  return 0;
+}
+
+
+
+/**
+ * Export subtrees so that each rank in state.targets receives roughly the
+ * amount of load assigned to it.
+ *
+ * Does nothing when mds_thrash_exports is set.  Otherwise candidates are
+ * picked in three passes over the targets:
+ *  1. subtrees whose inode authority is the target itself,
+ *  2. any other whole subtree that fits into the remaining amount,
+ *  3. pieces found by find_exports() inside the remaining subtrees.
+ * While the candidate lists are built, idle subtrees whose inode belongs to
+ * another rank are sent back to that rank.
+ *
+ * \param state balancing decision; only state.targets is read here
+ */
+void MDBalancer::try_rebalance(balance_state_t& state)
+{
+  if (g_conf()->mds_thrash_exports) {
+    dout(5) << "mds_thrash is on; not performing standard rebalance operation!"
+	    << dendl;
+    return;
+  }
+
+  // make a sorted list of my imports
+  // import_pop_map:  popularity -> subtree root (ascending popularity)
+  // import_from_map: authority of the subtree's inode -> (subtree root, popularity)
+  multimap<double, CDir*> import_pop_map;
+  multimap<mds_rank_t, pair<CDir*, double> > import_from_map;
+
+  for (auto& dir : mds->mdcache->get_fullauth_subtrees()) {
+    CInode *diri = dir->get_inode();
+    if (diri->is_mdsdir())
+      continue;
+    if (diri->get_export_pin(false) != MDS_RANK_NONE)
+      continue;
+    if (dir->is_freezing() || dir->is_frozen())
+      continue;  // export pbly already in progress
+
+    mds_rank_t from = diri->authority().first;
+    double pop = dir->pop_auth_subtree.meta_load();
+    // Idle subtree (below mds_bal_idle_threshold) that is not the root and
+    // whose inode is owned by another rank: hand it back to that rank.
+    if (g_conf()->mds_bal_idle_threshold > 0 &&
+	pop < g_conf()->mds_bal_idle_threshold &&
+	diri != mds->mdcache->get_root() &&
+	from != mds->get_nodeid()) {
+      dout(5) << " exporting idle (" << pop << ") import " << *dir
+	      << " back to mds." << from << dendl;
+      mds->mdcache->migrator->export_dir_nicely(dir, from);
+      continue;
+    }
+
+    dout(15) << "  map: i imported " << *dir << " from " << from << dendl;
+    import_pop_map.insert(make_pair(pop, dir));
+    import_from_map.insert(make_pair(from, make_pair(dir, pop)));
+  }
+
+  // do my exports!
+  // export_pop_map: target -> load already assigned to it.  An entry is
+  // created only for targets that pass the two thresholds below; the later
+  // passes use its presence to skip the other targets.
+  map<mds_rank_t, double> export_pop_map;
+
+  for (auto &it : state.targets) {
+    mds_rank_t target = it.first;
+    double amount = it.second;
+
+    if (amount < MIN_OFFLOAD)
+      continue;
+    // also skip amounts that are tiny relative to the target load
+    if (amount * 10 * state.targets.size() < target_load)
+      continue;
+
+    dout(5) << "want to send " << amount << " to mds." << target
+      //<< " .. " << (*it).second << " * " << load_fac
+	    << " -> " << amount
+	    << dendl;//" .. fudge is " << fudge << dendl;
+
+    double& have = export_pop_map[target];
+
+    mds->mdcache->show_subtrees();
+
+    // search imports from target
+    if (import_from_map.count(target)) {
+      dout(7) << " aha, looking through imports from target mds." << target << dendl;
+      for (auto p = import_from_map.equal_range(target);
+	   p.first != p.second; ) {
+	CDir *dir = p.first->second.first;
+	double pop = p.first->second.second;
+	dout(7) << "considering " << *dir << " from " << (*p.first).first << dendl;
+	// step past the current entry now, so that erasing it below does
+	// not invalidate the loop iterator
+	auto plast = p.first++;
+
+	if (dir->inode->is_base())
+	  continue;
+	ceph_assert(dir->inode->authority().first == target);  // cuz that's how i put it in the map, dummy
+
+	if (pop <= amount-have) {
+	  dout(7) << "reexporting " << *dir << " pop " << pop
+		  << " back to mds." << target << dendl;
+	  mds->mdcache->migrator->export_dir_nicely(dir, target);
+	  have += pop;
+	  import_from_map.erase(plast);
+	  // drop the same subtree from import_pop_map so that the later
+	  // passes do not pick it again
+	  for (auto q = import_pop_map.equal_range(pop);
+	       q.first != q.second; ) {
+	    if (q.first->second == dir) {
+	      import_pop_map.erase(q.first);
+	      break;
+	    }
+	    q.first++;
+	  }
+	} else {
+	  dout(7) << "can't reexport " << *dir << ", too big " << pop << dendl;
+	}
+	if (amount-have < MIN_OFFLOAD)
+	  break;
+      }
+    }
+  }
+
+  // any other imports
+  // Pass 2: whole subtrees, least popular first, that fit into what is
+  // still missing for the target and are above MIN_REEXPORT.
+  for (auto &it : state.targets) {
+    mds_rank_t target = it.first;
+    double amount = it.second;
+
+    if (!export_pop_map.count(target))
+      continue;
+    double& have = export_pop_map[target];
+    if (amount-have < MIN_OFFLOAD)
+      continue;
+
+    for (auto p = import_pop_map.begin();
+	 p != import_pop_map.end(); ) {
+      CDir *dir = p->second;
+      if (dir->inode->is_base()) {
+	++p;
+	continue;
+      }
+
+      double pop = p->first;
+      if (pop <= amount-have && pop > MIN_REEXPORT) {
+	dout(5) << "reexporting " << *dir << " pop " << pop
+		<< " to mds." << target << dendl;
+	have += pop;
+	mds->mdcache->migrator->export_dir_nicely(dir, target);
+	// post-increment: p moves on before the erased node goes away
+	import_pop_map.erase(p++);
+      } else {
+	++p;
+      }
+      if (amount-have < MIN_OFFLOAD)
+	break;
+    }
+  }
+
+  // Pass 3: dig into the remaining subtrees, most popular first.
+  // already_exporting is shared across targets so that no dirfrag is
+  // chosen twice.
+  set<CDir*> already_exporting;
+
+  for (auto &it : state.targets) {
+    mds_rank_t target = it.first;
+    double amount = it.second;
+
+    if (!export_pop_map.count(target))
+      continue;
+    double& have = export_pop_map[target];
+    if (amount-have < MIN_OFFLOAD)
+      continue;
+
+    // okay, search for fragments of my workload
+    std::vector<CDir*> exports;
+
+    for (auto p = import_pop_map.rbegin();
+	 p != import_pop_map.rend();
+	 ++p) {
+      CDir *dir = p->second;
+      // find_exports() appends to exports and increases have
+      find_exports(dir, amount, &exports, have, already_exporting);
+      if (amount-have < MIN_OFFLOAD)
+	break;
+    }
+    //fudge = amount - have;
+
+    for (const auto& dir : exports) {
+      dout(5) << "   - exporting " << dir->pop_auth_subtree
+	      << " " << dir->pop_auth_subtree.meta_load()
+	      << " to mds." << target << " " << *dir << dendl;
+      mds->mdcache->migrator->export_dir_nicely(dir, target);
+    }
+  }
+
+  dout(7) << "done" << dendl;
+  mds->mdcache->show_subtrees();
+}
+
+/**
+ * Pick dirfrags beneath \p dir to export until roughly \p amount of load
+ * has been gathered, recursing into subdirectories that are too big to
+ * export as a whole.
+ *
+ * \param dir               auth dirfrag whose subdirectories are examined
+ * \param amount            total load wanted for the current target
+ * \param exports           output: chosen dirfrags are appended here
+ * \param have              in/out: load gathered so far; increased by the
+ *                          popularity of every dirfrag chosen, and forced
+ *                          to \p amount when the time budget is exceeded
+ * \param already_exporting in/out: dirfrags chosen so far; they are
+ *                          skipped, and new choices are added
+ */
+void MDBalancer::find_exports(CDir *dir,
+                              double amount,
+                              std::vector<CDir*>* exports,
+                              double& have,
+                              set<CDir*>& already_exporting)
+{
+  // Time budget: give up once more than 0.1 seconds have passed since
+  // rebalance_time.  Setting have = amount makes the callers' "enough?"
+  // checks succeed so that they stop searching as well.
+  auto now = clock::now();
+  auto duration = std::chrono::duration<double>(now-rebalance_time).count();
+  if (duration > 0.1) {
+    derr << " balancer runs too long"  << dendl_impl;
+    have = amount;
+    return;
+  }
+
+  ceph_assert(dir->is_auth());
+
+  double need = amount - have;
+  if (need < amount * g_conf()->mds_bal_min_start)
+    return;   // good enough!
+
+  // size classes, all relative to what is still needed
+  double needmax = need * g_conf()->mds_bal_need_max;
+  double needmin = need * g_conf()->mds_bal_need_min;
+  double midchunk = need * g_conf()->mds_bal_midchunk;
+  double minchunk = need * g_conf()->mds_bal_minchunk;
+
+  // bigger_*: subdirs more popular than need (split by is_rep());
+  // smaller:  all other non-idle subdirs, keyed by popularity
+  std::vector<CDir*> bigger_rep, bigger_unrep;
+  multimap<double, CDir*> smaller;
+
+  double dir_pop = dir->pop_auth_subtree.meta_load();
+  dout(7) << "in " << dir_pop << " " << *dir << " need " << need << " (" << needmin << " - " << needmax << ")" << dendl;
+
+  double subdir_sum = 0;
+  for (elist<CInode*>::iterator it = dir->pop_lru_subdirs.begin_use_current();
+       !it.end(); ) {
+    CInode *in = *it;
+    // advance first: `in` may be unlinked from this list at the end of
+    // the loop body
+    ++it;
+
+    ceph_assert(in->is_dir());
+    ceph_assert(in->get_parent_dir() == dir);
+
+    auto&& dfls = in->get_nested_dirfrags();
+
+    size_t num_idle_frags = 0;
+    for (const auto& subdir : dfls) {
+      if (already_exporting.count(subdir))
+	continue;
+
+      // we know all ancestor dirfrags up to subtree root are not freezing or frozen.
+      // It's more efficient to use CDir::is_{freezing,frozen}_tree_root()
+      if (subdir->is_frozen_dir() || subdir->is_frozen_tree_root() ||
+	  subdir->is_freezing_dir() || subdir->is_freezing_tree_root())
+	continue;  // can't export this right now!
+
+      // how popular?
+      double pop = subdir->pop_auth_subtree.meta_load();
+      subdir_sum += pop;
+      dout(15) << "   subdir pop " << pop << " " << *subdir << dendl;
+
+      if (pop < minchunk) {
+	num_idle_frags++;
+	continue;
+      }
+
+      // lucky find?
+      // a single dirfrag inside (needmin, needmax) ends the search
+      if (pop > needmin && pop < needmax) {
+	exports->push_back(subdir);
+	already_exporting.insert(subdir);
+	have += pop;
+	return;
+      }
+
+      if (pop > need) {
+	if (subdir->is_rep())
+	  bigger_rep.push_back(subdir);
+	else
+	  bigger_unrep.push_back(subdir);
+      } else
+	smaller.insert(pair<double,CDir*>(pop, subdir));
+    }
+    // every nested dirfrag was below minchunk: take the inode off the
+    // popularity list
+    if (dfls.size() == num_idle_frags)
+      in->item_pop_lru.remove_myself();
+  }
+  dout(15) << "   sum " << subdir_sum << " / " << dir_pop << dendl;
+
+  // grab some sufficiently big small items
+  // `it` is declared outside the loop because the "(much) smaller" loop
+  // further down resumes from wherever this one stops.
+  multimap<double,CDir*>::reverse_iterator it;
+  for (it = smaller.rbegin();
+       it != smaller.rend();
+       ++it) {
+
+    if ((*it).first < midchunk)
+      break;  // try later
+
+    dout(7) << "   taking smaller " << *(*it).second << dendl;
+
+    exports->push_back((*it).second);
+    already_exporting.insert((*it).second);
+    have += (*it).first;
+    if (have > needmin)
+      return;
+  }
+
+  // apparently not enough; drill deeper into the hierarchy (if non-replicated)
+  // (the loop variable shadows the `dir` parameter)
+  for (const auto& dir : bigger_unrep) {
+    dout(15) << "   descending into " << *dir << dendl;
+    find_exports(dir, amount, exports, have, already_exporting);
+    if (have > needmin)
+      return;
+  }
+
+  // ok fine, use smaller bits
+  for (;
+       it != smaller.rend();
+       ++it) {
+    dout(7) << "   taking (much) smaller " << it->first << " " << *(*it).second << dendl;
+
+    exports->push_back((*it).second);
+    already_exporting.insert((*it).second);
+    have += (*it).first;
+    if (have > needmin)
+      return;
+  }
+
+  // ok fine, drill into replicated dirs
+  for (const auto& dir : bigger_rep) {
+    dout(7) << "   descending into replicated " << *dir << dendl;
+    find_exports(dir, amount, exports, have, already_exporting);
+    if (have > needmin)
+      return;
+  }
+}
+
+/**
+ * Count one access of kind \p type on \p in and, if the inode has a
+ * parent dentry, on the dirfrag containing that dentry.
+ *
+ * \param in   inode that was accessed
+ * \param type popularity counter index
+ * \param who  passed through to hit_dir()
+ */
+void MDBalancer::hit_inode(CInode *in, int type, int who)
+{
+  // the inode's own counter
+  in->pop.get(type).hit();
+
+  // no parent dentry, nothing further to account
+  if (!in->get_parent_dn())
+    return;
+
+  hit_dir(in->get_parent_dn()->get_dir(), type, who);
+}
+
+/**
+ * Queue a split and/or a merge of \p dir when its state calls for one.
+ *
+ * \param dir dirfrag to examine
+ * \param hot treat the dirfrag as needing a split even if should_split()
+ *            says otherwise
+ */
+void MDBalancer::maybe_fragment(CDir *dir, bool hot)
+{
+  // Fragmentation has to be enabled, and only auth dirfrags that are
+  // neither root/mdsdir (for now at least) nor a straydir are eligible.
+  if (!bal_fragment_dirs || bal_fragment_interval <= 0 ||
+      !dir->is_auth() ||
+      dir->inode->is_base() ||
+      dir->inode->is_stray()) {
+    return;
+  }
+
+  // split?
+  if (dir->should_split() || hot) {
+    if (split_pending.count(dir->dirfrag()) == 0) {
+      // first request for this dirfrag: go through the timer
+      queue_split(dir, false);
+    } else if (dir->should_split_fast()) {
+      // already pending, but urgent enough to bypass the timer
+      queue_split(dir, true);
+    } else {
+      dout(10) << ": fragment already enqueued to split: "
+               << *dir << dendl;
+    }
+  }
+
+  // merge?  (never for the undivided frag, never twice for one dirfrag)
+  if (dir->get_frag() != frag_t() && dir->should_merge() &&
+      merge_pending.count(dir->dirfrag()) == 0) {
+    queue_merge(dir);
+  }
+}
+
+/**
+ * Record \p amount of activity of kind \p type on \p dir and propagate it
+ * to the nested counters of all ancestor dirfrags.
+ *
+ * Along the way this may queue a split or merge (maybe_fragment()) and may
+ * switch replication of the dirfrag on or off.
+ *
+ * \param dir    dirfrag that was accessed; nothing is done if its inode
+ *               is_stray()
+ * \param type   popularity counter index (compared against META_POP_IRD and
+ *               META_POP_IWR below)
+ * \param who    for META_POP_IRD hits, fed to pop_spread when >= 0
+ * \param amount weight of the hit
+ */
+void MDBalancer::hit_dir(CDir *dir, int type, int who, double amount)
+{
+  if (dir->inode->is_stray())
+    return;
+  // hit me
+  double v = dir->pop_me.get(type).hit(amount);
+
+  // "hot" = read or write popularity of this dirfrag alone exceeds the
+  // configured split threshold for that kind of access
+  const bool hot = (v > g_conf()->mds_bal_split_rd && type == META_POP_IRD) ||
+                   (v > g_conf()->mds_bal_split_wr && type == META_POP_IWR);
+
+  dout(20) << type << " pop is " << v << ", frag " << dir->get_frag()
+           << " size " << dir->get_frag_size() << " " << dir->pop_me << dendl;
+
+  maybe_fragment(dir, hot);
+
+  // replicate?
+  if (type == META_POP_IRD && who >= 0) {
+    dir->pop_spread.hit(who);
+  }
+
+  // Adjustment applied to the read counters when the dirfrag gets
+  // replicated below; stays 0.0 otherwise.
+  double rd_adj = 0.0;
+  // Replication is re-evaluated at most once per dirfrag and sample:
+  // last_popularity_sample is set to last_sample right inside.
+  if (type == META_POP_IRD &&
+      dir->last_popularity_sample < last_sample) {
+    double dir_pop = dir->pop_auth_subtree.get(type).get();    // hmm??
+    dir->last_popularity_sample = last_sample;
+    double pop_sp = dir->pop_spread.get();
+    dir_pop += pop_sp * 10;
+
+    //if (dir->ino() == inodeno_t(0x10000000002))
+    if (pop_sp > 0) {
+      dout(20) << type << " pop " << dir_pop << " spread " << pop_sp
+	      << " " << dir->pop_spread.last[0]
+	      << " " << dir->pop_spread.last[1]
+	      << " " << dir->pop_spread.last[2]
+	      << " " << dir->pop_spread.last[3]
+	      << " in " << *dir << dendl;
+    }
+
+    if (dir->is_auth() && !dir->is_ambiguous_auth()) {
+      if (dir->can_rep() &&
+	  dir_pop >= g_conf()->mds_bal_replicate_threshold) {
+	// replicate
+	// rd_adj is rdp/num_in_mds - rdp (zero or negative for a
+	// non-negative rdp and one or more ranks), halved
+	double rdp = dir->pop_me.get(META_POP_IRD).get();
+	rd_adj = rdp / mds->get_mds_map()->get_num_in_mds() - rdp;
+	rd_adj /= 2.0;  // temper somewhat
+
+	dout(5) << "replicating dir " << *dir << " pop " << dir_pop << " .. rdp " << rdp << " adj " << rd_adj << dendl;
+
+	dir->dir_rep = CDir::REP_ALL;
+	mds->mdcache->send_dir_updates(dir, true);
+
+	// fixme this should adjust the whole pop hierarchy
+	dir->pop_me.get(META_POP_IRD).adjust(rd_adj);
+	dir->pop_auth_subtree.get(META_POP_IRD).adjust(rd_adj);
+      }
+
+      // NOTE(review): ino 1 is presumably the root directory -- confirm
+      if (dir->ino() != 1 &&
+	  dir->is_rep() &&
+	  dir_pop < g_conf()->mds_bal_unreplicate_threshold) {
+	// unreplicate
+	dout(5) << "unreplicating dir " << *dir << " pop " << dir_pop << dendl;
+
+	dir->dir_rep = CDir::REP_NONE;
+	mds->mdcache->send_dir_updates(dir);
+      }
+    }
+  }
+
+  // adjust ancestors
+  bool hit_subtree = dir->is_auth();         // current auth subtree (if any)
+  bool hit_subtree_nested = dir->is_auth();  // all nested auth subtrees
+
+  // Walk from dir up to the top-most dirfrag; `dir` is reassigned to its
+  // parent at the end of every iteration.
+  while (true) {
+    CDir *pdir = dir->inode->get_parent_dir();
+    dir->pop_nested.get(type).hit(amount);
+    if (rd_adj != 0.0)
+      dir->pop_nested.get(META_POP_IRD).adjust(rd_adj);
+
+    if (hit_subtree) {
+      dir->pop_auth_subtree.get(type).hit(amount);
+
+      if (rd_adj != 0.0)
+	dir->pop_auth_subtree.get(META_POP_IRD).adjust(rd_adj);
+
+      if (dir->is_subtree_root())
+	hit_subtree = false;                // end of auth domain, stop hitting auth counters.
+      else if (pdir)
+	// move this directory's inode to the front of the parent's
+	// pop_lru_subdirs list, which find_exports() iterates
+	pdir->pop_lru_subdirs.push_front(&dir->get_inode()->item_pop_lru);
+    }
+
+    if (hit_subtree_nested) {
+      dir->pop_auth_subtree_nested.get(type).hit(amount);
+      if (rd_adj != 0.0)
+	dir->pop_auth_subtree_nested.get(META_POP_IRD).adjust(rd_adj);
+    }
+    if (!pdir) break;
+    dir = pdir;
+  }
+}
+
+
+/*
+ * Subtract the load of an exported subtree from its ancestors.
+ *  *dir itself is left alone (encode_export_dir should have taken care of
+ *  that); only the nested counters of the parents are reduced.
+ *
+ * NOTE: call me _after_ forcing *dir into a subtree root,
+ *       but _before_ doing the encode_export_dirs.
+ */
+void MDBalancer::subtract_export(CDir *dir)
+{
+  // snapshot of the subtree's load, taken before walking upwards
+  dirfrag_load_vec_t subload = dir->pop_auth_subtree;
+
+  for (CDir *ancestor = dir->inode->get_parent_dir();
+       ancestor;
+       ancestor = ancestor->inode->get_parent_dir()) {
+    ancestor->pop_nested.sub(subload);
+    ancestor->pop_auth_subtree_nested.sub(subload);
+  }
+}
+
+
+/*
+ * Add the load of an imported subtree to the nested counters of all its
+ * ancestors.  *dir itself is not modified.
+ */
+void MDBalancer::add_import(CDir *dir)
+{
+  // snapshot of the subtree's load, taken before walking upwards
+  dirfrag_load_vec_t subload = dir->pop_auth_subtree;
+
+  for (CDir *ancestor = dir->inode->get_parent_dir();
+       ancestor;
+       ancestor = ancestor->inode->get_parent_dir()) {
+    ancestor->pop_nested.add(subload);
+    ancestor->pop_auth_subtree_nested.add(subload);
+  }
+}
+
+/**
+ * Add (\p inc true) or remove (\p inc false) the popularity of \p dir
+ * to/from \p pdir and every ancestor of \p pdir.
+ *
+ * \param pdir first dirfrag whose counters are adjusted; must not be null
+ * \param dir  dirfrag whose counters supply the amounts
+ * \param inc  true to add, false to subtract
+ */
+void MDBalancer::adjust_pop_for_rename(CDir *pdir, CDir *dir, bool inc)
+{
+  // The nested-auth counters are touched only when dir is auth.  The plain
+  // auth-subtree counters additionally require that dir is not itself a
+  // subtree root, and are left alone above the first subtree root met on
+  // the way up.
+  const bool nested_auth = dir->is_auth();
+  bool in_auth_subtree = nested_auth && !dir->is_subtree_root();
+
+  CDir *below = dir;  // dirfrag visited just before pdir
+  do {
+    if (!inc) {
+      pdir->pop_nested.sub(dir->pop_nested);
+      if (in_auth_subtree)
+        pdir->pop_auth_subtree.sub(dir->pop_auth_subtree);
+
+      if (nested_auth)
+        pdir->pop_auth_subtree_nested.sub(dir->pop_auth_subtree_nested);
+    } else {
+      pdir->pop_nested.add(dir->pop_nested);
+      if (in_auth_subtree) {
+        pdir->pop_auth_subtree.add(dir->pop_auth_subtree);
+        // put the lower directory's inode at the front of pdir's
+        // pop_lru_subdirs list
+        pdir->pop_lru_subdirs.push_front(&below->get_inode()->item_pop_lru);
+      }
+
+      if (nested_auth)
+        pdir->pop_auth_subtree_nested.add(dir->pop_auth_subtree_nested);
+    }
+
+    if (pdir->is_subtree_root())
+      in_auth_subtree = false;
+
+    below = pdir;
+    pdir = pdir->inode->get_parent_dir();
+  } while (pdir);
+}
+
+/**
+ * React to the failure of rank \p who.  Only a failure of rank 0 has an
+ * effect: the per-rank "last epoch under" bookkeeping is discarded.
+ */
+void MDBalancer::handle_mds_failure(mds_rank_t who)
+{
+  if (who != 0)
+    return;
+
+  mds_last_epoch_under_map.clear();
+}
+
+/**
+ * Dump the balancer's view of the load to \p f as one "loads" object with
+ * the sections "dirfrags", "mds_load", "mds_meta_load" and
+ * "mds_import_map".
+ *
+ * \param f formatter receiving the output
+ * \return always 0
+ */
+int MDBalancer::dump_loads(Formatter *f) const
+{
+  // Work queue of dirfrags still to be dumped, seeded with the root's.
+  std::deque<CDir*> todo;
+  if (!mds->mdcache->get_root()) {
+    dout(10) << "no root" << dendl;
+  } else {
+    mds->mdcache->get_root()->get_dirfrags(todo);
+  }
+
+  f->open_object_section("loads");
+
+  f->open_array_section("dirfrags");
+  for (; !todo.empty(); ) {
+    CDir *cur = todo.front();
+    todo.pop_front();
+
+    f->open_object_section("dir");
+    cur->dump_load(f);
+    f->close_section();
+
+    // Queue the dirfrags of every subdirectory, leaving out those whose
+    // nested load is below .001.
+    for (auto&& entry : *cur) {
+      CInode *in = entry.second->get_linkage()->get_inode();
+      if (in && in->is_dir()) {
+        auto&& frags = in->get_dirfrags();
+        for (const auto& child : frags) {
+          if (!(child->pop_nested.meta_load() < .001))
+            todo.push_back(child);
+        }
+      }
+    }
+  }
+  f->close_section();  // dirfrags array
+
+  f->open_object_section("mds_load");
+  for (const auto& [rank, load] : mds_load) {
+    CachedStackStringStream css;
+    *css << "mds." << rank;
+    f->open_object_section(css->strv());
+
+    f->dump_float("request_rate", load.req_rate);
+    f->dump_float("cache_hit_rate", load.cache_hit_rate);
+    f->dump_float("queue_length", load.queue_len);
+    f->dump_float("cpu_load", load.cpu_load_avg);
+    f->dump_float("mds_load", load.mds_load());
+
+    f->open_object_section("auth_dirfrags");
+    load.auth.dump(f);
+    f->close_section();
+    f->open_object_section("all_dirfrags");
+    load.all.dump(f);
+    f->close_section();
+
+    f->close_section();  // mds.? object
+  }
+  f->close_section(); // mds_load
+
+  f->open_object_section("mds_meta_load");
+  for (auto& [rank, mload] : mds_meta_load) {
+    CachedStackStringStream css;
+    *css << "mds." << rank;
+    f->dump_float(css->strv(), mload);
+  }
+  f->close_section(); // mds_meta_load
+
+  f->open_object_section("mds_import_map");
+  for (auto& [rank, imports] : mds_import_map) {
+    {
+      CachedStackStringStream css;
+      *css << "mds." << rank;
+      f->open_array_section(css->strv());
+    }
+    // one single-field object per source rank
+    for (auto& [rank_from, mload] : imports) {
+      f->open_object_section("from");
+      CachedStackStringStream css;
+      *css << "mds." << rank_from;
+      f->dump_float(css->strv(), mload);
+      f->close_section();
+    }
+    f->close_section(); // mds.? array
+  }
+  f->close_section(); // mds_import_map
+
+  f->close_section(); // loads
+  return 0;
+}
diff --git a/src/mds/MDBalancer.h b/src/mds/MDBalancer.h
new file mode 100644
index 000000000..d9172e565
--- /dev/null
+++ b/src/mds/MDBalancer.h
@@ -0,0 +1,159 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software 
+ * Foundation.  See file COPYING.
+ * 
+ */
+#ifndef CEPH_MDBALANCER_H
+#define CEPH_MDBALANCER_H
+
+#include "include/types.h"
+#include "common/Clock.h"
+#include "common/Cond.h"
+
+#include "msg/Message.h"
+#include "messages/MHeartbeat.h"
+
+#include "MDSMap.h"
+
+class MDSRank;
+class MHeartbeat;
+class CInode;
+class CDir;
+class Messenger;
+class MonClient;
+
+/**
+ * Load balancer of one MDS rank.
+ *
+ * Keeps per-rank load figures, decides how much load to move to other
+ * ranks and which subtrees to export for that, maintains the popularity
+ * counters of dirfrags, and schedules directory fragment splits and merges.
+ */
+class MDBalancer {
+public:
+  using clock = ceph::coarse_mono_clock;
+  using time = ceph::coarse_mono_time;
+  friend class C_Bal_SendHeartbeat;
+
+  MDBalancer(MDSRank *m, Messenger *msgr, MonClient *monc);
+
+  void handle_conf_change(const std::set<std::string>& changed, const MDSMap& mds_map);
+
+  int proc_message(const cref_t<Message> &m);
+
+  /**
+   * Regularly called upkeep function.
+   *
+   * Sends MHeartbeat messages to the mons.
+   */
+  void tick();
+
+  void handle_export_pins(void);
+
+  /// Subtract the load of exported subtree \p ex from the nested counters
+  /// of all its ancestors.
+  void subtract_export(CDir *ex);
+  /// Add the load of imported subtree \p im to the nested counters of all
+  /// its ancestors.
+  void add_import(CDir *im);
+  /// Add (\p inc) or subtract (!\p inc) the counters of \p dir to/from
+  /// \p pdir and all ancestors of \p pdir.
+  void adjust_pop_for_rename(CDir *pdir, CDir *dir, bool inc);
+
+  /// Count an access of kind \p type on \p in and on its parent dirfrag.
+  void hit_inode(CInode *in, int type, int who=-1);
+  /// Count \p amount of activity of kind \p type on \p dir and propagate
+  /// it to the ancestors; may trigger fragmentation or (un)replication.
+  void hit_dir(CDir *dir, int type, int who=-1, double amount=1.0);
+
+  /// Schedule a split of \p dir; \p fast uses the MDSRank waiter queue
+  /// instead of the fragment timer.
+  void queue_split(const CDir *dir, bool fast);
+  /// Schedule a merge attempt for \p dir unless one is already pending.
+  void queue_merge(CDir *dir);
+  /// Whether a split or merge is currently queued for \p df.
+  bool is_fragment_pending(dirfrag_t df) {
+    return split_pending.count(df) || merge_pending.count(df);
+  }
+
+  /**
+   * Based on size and configuration, decide whether to issue a queue_split
+   * or queue_merge for this CDir.
+   *
+   * \param hot whether the directory's temperature is enough to split it
+   */
+  void maybe_fragment(CDir *dir, bool hot);
+
+  /// Notification that rank \p who failed; only rank 0 has an effect (the
+  /// "last epoch under" bookkeeping is cleared).
+  void handle_mds_failure(mds_rank_t who);
+
+  /// Dump dirfrag and per-rank load information to \p f; returns 0.
+  int dump_loads(Formatter *f) const;
+
+private:
+  /// Result of one balancing decision.
+  typedef struct {
+    std::map<mds_rank_t, double> targets;   ///< rank -> amount of load to send to it
+    std::map<mds_rank_t, double> imported;  ///< rank -> amount; read by get_maxim()
+    std::map<mds_rank_t, double> exported;  ///< rank -> amount; read by get_maxex()
+  } balance_state_t;
+
+  //set up the rebalancing targets for export and do one if the
+  //MDSMap is up to date
+  void prep_rebalance(int beat);
+  /// Like prep_rebalance(), but the targets are computed by the Mantle
+  /// balancer; returns 0 on success or a non-zero error code.
+  int mantle_prep_rebalance();
+
+  mds_load_t get_load();
+  int localize_balancer();
+  void send_heartbeat();
+  void handle_heartbeat(const cref_t<MHeartbeat> &m);
+  /**
+   * Pick dirfrags beneath \p dir to export.
+   *
+   * \param amount            total load wanted for the current target
+   * \param exports           output: chosen dirfrags are appended
+   * \param have              in/out: load gathered so far
+   * \param already_exporting in/out: dirfrags chosen so far
+   */
+  void find_exports(CDir *dir,
+                    double amount,
+                    std::vector<CDir*>* exports,
+                    double& have,
+                    set<CDir*>& already_exporting);
+
+  // maxex and maxim are passed by reference; prep_rebalance() tests their
+  // values again after each call.
+  double try_match(balance_state_t &state,
+                   mds_rank_t ex, double& maxex,
+                   mds_rank_t im, double& maxim);
+
+  /// Load rank \p im can still take before reaching target_load.
+  /// (operator[] adds a zero entry to the maps for an unknown rank.)
+  double get_maxim(balance_state_t &state, mds_rank_t im) {
+    return target_load - mds_meta_load[im] - state.imported[im];
+  }
+  /// Load rank \p ex still has above target_load.
+  /// (operator[] adds a zero entry to the maps for an unknown rank.)
+  double get_maxex(balance_state_t &state, mds_rank_t ex) {
+    return mds_meta_load[ex] - target_load - state.exported[ex];
+  }
+
+  /**
+   * Try to rebalance.
+   *
+   * Exports subtrees through the Migrator so that every rank in
+   * state.targets receives about the amount of load assigned to it.
+   * Does nothing when mds_thrash_exports is set.
+   */
+  void try_rebalance(balance_state_t& state);
+
+  bool bal_fragment_dirs;          ///< splits/merges are only queued when true
+  int64_t bal_fragment_interval;   ///< delay handed to the timer for splits/merges; must be > 0
+  static const unsigned int AUTH_TREES_THRESHOLD = 5;
+
+  MDSRank *mds;
+  Messenger *messenger;
+  MonClient *mon_client;
+  int beat_epoch = 0;
+
+  string bal_code;      ///< balancer code handed to Mantle::balance()
+  string bal_version;   ///< compared against MDSMap::get_balancer() to detect changes
+
+  time last_heartbeat = clock::zero();
+  time last_sample = clock::zero();
+  time rebalance_time = clock::zero(); //ensure a consistent view of load for rebalance
+
+  time last_get_load = clock::zero();
+  uint64_t last_num_requests = 0;
+  uint64_t last_cpu_time = 0;
+  uint64_t last_num_traverse = 0;
+  uint64_t last_num_traverse_hit = 0;
+
+  // Dirfrags which are marked to be passed on to MDCache::[split|merge]_dir
+  // just as soon as a delayed context comes back and triggers it.
+  // These sets just prevent us from spawning extra timer contexts for
+  // dirfrags that already have one in flight.
+  set<dirfrag_t> split_pending, merge_pending;
+
+  // per-epoch scatter/gathered info
+  std::map<mds_rank_t, mds_load_t> mds_load;        ///< rank -> reported load
+  std::map<mds_rank_t, double> mds_meta_load;       ///< rank -> load rescaled by prep_rebalance()
+  std::map<mds_rank_t, map<mds_rank_t, float> > mds_import_map;
+  std::map<mds_rank_t, int> mds_last_epoch_under_map; ///< rank -> last beat_epoch it was not overloaded
+
+  // per-epoch state
+  double my_load = 0;       ///< this rank's rescaled load
+  double target_load = 0;   ///< mean rescaled load per rank
+};
+#endif
diff --git a/src/mds/MDCache.cc b/src/mds/MDCache.cc
new file mode 100644
index 000000000..a271032ae
--- /dev/null
+++ b/src/mds/MDCache.cc
@@ -0,0 +1,13370 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software 
+ * Foundation.  See file COPYING.
+ * 
+ */
+
+#include <errno.h>
+#include <ostream>
+#include <string>
+#include <string_view>
+#include <map>
+
+#include "MDCache.h"
+#include "MDSRank.h"
+#include "Server.h"
+#include "Locker.h"
+#include "MDLog.h"
+#include "MDBalancer.h"
+#include "Migrator.h"
+#include "ScrubStack.h"
+
+#include "SnapClient.h"
+
+#include "MDSMap.h"
+
+#include "CInode.h"
+#include "CDir.h"
+
+#include "Mutation.h"
+
+#include "include/ceph_fs.h"
+#include "include/filepath.h"
+#include "include/util.h"
+
+#include "messages/MClientCaps.h"
+
+#include "msg/Message.h"
+#include "msg/Messenger.h"
+
+#include "common/MemoryModel.h"
+#include "common/errno.h"
+#include "common/perf_counters.h"
+#include "common/safe_io.h"
+
+#include "osdc/Journaler.h"
+#include "osdc/Filer.h"
+
+#include "events/ESubtreeMap.h"
+#include "events/EUpdate.h"
+#include "events/EPeerUpdate.h"
+#include "events/EImportFinish.h"
+#include "events/EFragment.h"
+#include "events/ECommitted.h"
+#include "events/EPurged.h"
+#include "events/ESessions.h"
+
+#include "InoTable.h"
+
+#include "common/Timer.h"
+
+#include "perfglue/heap_profiler.h"
+
+
+#include "common/config.h"
+#include "include/ceph_assert.h"
+
+#define dout_context g_ceph_context
+#define dout_subsys ceph_subsys_mds
+#undef dout_prefix
+#define dout_prefix _prefix(_dout, mds)
+// Writes the prefix used by dout in this file: "mds.<nodeid>.cache ".
+static ostream& _prefix(std::ostream *_dout, MDSRank *mds) {
+  std::ostream &out = *_dout;
+  out << "mds." << mds->get_nodeid() << ".cache ";
+  return out;
+}
+
+set<int> SimpleLock::empty_gather_set;
+
+
+/**
+ * Base class for every non-I/O context that needs a reference to an
+ * MDCache instance.  get_mds() returns the cache's MDSRank.
+ */
+class MDCacheContext : public virtual MDSContext {
+public:
+  explicit MDCacheContext(MDCache *mdc_) : mdcache(mdc_) {}
+
+protected:
+  MDCache *mdcache;
+
+  MDSRank *get_mds() override {
+    ceph_assert(mdcache != nullptr);
+    return mdcache->mds;
+  }
+};
+
+
+/**
+ * Only for contexts called back from an I/O completion.
+ *
+ * Note: the members duplicate those of MDCacheContext, because
+ * it's the lesser of two evils compared with introducing
+ * yet another piece of (multiple) inheritance.
+ */
+class MDCacheIOContext : public virtual MDSIOContextBase {
+public:
+  explicit MDCacheIOContext(MDCache *mdc_, bool track=true)
+    : MDSIOContextBase(track), mdcache(mdc_) {}
+
+protected:
+  MDCache *mdcache;
+
+  MDSRank *get_mds() override {
+    ceph_assert(mdcache != nullptr);
+    return mdcache->mds;
+  }
+};
+
+// Context base built on MDSLogContextBase that carries an MDCache pointer;
+// get_mds() returns the cache's MDSRank.
+class MDCacheLogContext : public virtual MDSLogContextBase {
+public:
+  explicit MDCacheLogContext(MDCache *mdc_) : mdcache(mdc_) {}
+
+protected:
+  MDCache *mdcache;
+
+  MDSRank *get_mds() override {
+    ceph_assert(mdcache != nullptr);
+    return mdcache->mds;
+  }
+};
+
+/**
+ * Construct the cache for MDS rank @p m.
+ *
+ * The initializer list builds the members that take the rank (open file
+ * table, filer, stray manager, recovery queue) and the trim counter.  The
+ * body creates the Migrator, copies the cache-related configuration options
+ * into members, and finally starts the upkeep thread.
+ *
+ * @param m            the owning MDSRank
+ * @param purge_queue_ purge queue handed to the stray manager
+ */
+MDCache::MDCache(MDSRank *m, PurgeQueue &purge_queue_) :
+  mds(m),
+  open_file_table(m),
+  filer(m->objecter, m->finisher),
+  stray_manager(m, purge_queue_),
+  recovery_queue(m),
+  trim_counter(g_conf().get_val<double>("mds_cache_trim_decay_rate"))
+{
+  migrator.reset(new Migrator(mds, this));
+
+  // A non-zero mds_dir_max_commit_size is used directly, shifted left by 20
+  // (presumably MiB -> bytes; confirm the option's unit).  Otherwise fall
+  // back to 90% of osd_max_write_size, shifted the same way.
+  max_dir_commit_size = g_conf()->mds_dir_max_commit_size ?
+                        (g_conf()->mds_dir_max_commit_size << 20) :
+                        (0.9 *(g_conf()->osd_max_write_size << 20));
+
+  // Cached copies of options; handle_conf_change() refreshes them.
+  cache_memory_limit = g_conf().get_val<Option::size_t>("mds_cache_memory_limit");
+  cache_reservation = g_conf().get_val<double>("mds_cache_reservation");
+  cache_health_threshold = g_conf().get_val<double>("mds_health_cache_threshold");
+
+  export_ephemeral_distributed_config =  g_conf().get_val<bool>("mds_export_ephemeral_distributed");
+  export_ephemeral_random_config =  g_conf().get_val<bool>("mds_export_ephemeral_random");
+  export_ephemeral_random_max = g_conf().get_val<double>("mds_export_ephemeral_random_max");
+
+  lru.lru_set_midpoint(g_conf().get_val<double>("mds_cache_mid"));
+
+  bottom_lru.lru_set_midpoint(0);
+
+  decayrate.set_halflife(g_conf()->mds_decay_halflife);
+
+  // Started last, once the members above have been set up; the thread runs
+  // MDCache::upkeep_main on this object.
+  upkeeper = std::thread(&MDCache::upkeep_main, this);
+}
+
+MDCache::~MDCache()
+{
+  // Unregister our perf counters from the global collection, if we have any.
+  if (logger)
+    g_ceph_context->get_perfcounters_collection()->remove(logger.get());
+
+  // Wait for the upkeep thread to finish, if it is joinable.
+  if (!upkeeper.joinable())
+    return;
+  upkeeper.join();
+}
+
+/**
+ * React to a runtime configuration change.
+ *
+ * Re-reads each cached option whose name appears in @p changed, calls
+ * maybe_export_pin(true) on every inode in export_ephemeral_pins when either
+ * ephemeral-pin switch changed, and then forwards the change to the migrator
+ * and the balancer.
+ *
+ * @param changed names of the options that changed
+ * @param mdsmap  passed through to the migrator and balancer
+ */
+void MDCache::handle_conf_change(const std::set<std::string>& changed, const MDSMap& mdsmap)
+{
+  const auto has_changed = [&changed](const char *key) {
+    return changed.count(key) > 0;
+  };
+
+  dout(20) << "config changes: " << changed << dendl;
+
+  if (has_changed("mds_cache_memory_limit")) {
+    cache_memory_limit = g_conf().get_val<Option::size_t>("mds_cache_memory_limit");
+  }
+  if (has_changed("mds_cache_reservation")) {
+    cache_reservation = g_conf().get_val<double>("mds_cache_reservation");
+  }
+
+  bool repin_ephemeral = false;
+  if (has_changed("mds_export_ephemeral_distributed")) {
+    export_ephemeral_distributed_config = g_conf().get_val<bool>("mds_export_ephemeral_distributed");
+    dout(10) << "Migrating any ephemeral distributed pinned inodes" << dendl;
+    repin_ephemeral = true;
+  }
+  if (has_changed("mds_export_ephemeral_random")) {
+    export_ephemeral_random_config = g_conf().get_val<bool>("mds_export_ephemeral_random");
+    dout(10) << "Migrating any ephemeral random pinned inodes" << dendl;
+    repin_ephemeral = true;
+  }
+  if (repin_ephemeral) {
+    /* walk a copy to avoid removals during iteration */
+    std::vector<CInode*> pinned(export_ephemeral_pins.begin(), export_ephemeral_pins.end());
+    for (CInode *in : pinned)
+      in->maybe_export_pin(true);
+  }
+
+  if (has_changed("mds_export_ephemeral_random_max")) {
+    export_ephemeral_random_max = g_conf().get_val<double>("mds_export_ephemeral_random_max");
+  }
+  if (has_changed("mds_health_cache_threshold")) {
+    cache_health_threshold = g_conf().get_val<double>("mds_health_cache_threshold");
+  }
+  if (has_changed("mds_cache_mid")) {
+    lru.lru_set_midpoint(g_conf().get_val<double>("mds_cache_mid"));
+  }
+  if (has_changed("mds_cache_trim_decay_rate")) {
+    trim_counter = DecayCounter(g_conf().get_val<double>("mds_cache_trim_decay_rate"));
+  }
+
+  // Let the components that keep their own cached settings update too.
+  migrator->handle_conf_change(changed, mdsmap);
+  mds->balancer->handle_conf_change(changed, mdsmap);
+}
+
+// Publish LRU sizes, cap counts and (when root is set) the root inode's
+// recursive stats to the MDS perf counters.
+void MDCache::log_stat()
+{
+  auto &counters = mds->logger;
+
+  counters->set(l_mds_inodes, lru.lru_get_size());
+  counters->set(l_mds_inodes_pinned, lru.lru_get_num_pinned());
+  counters->set(l_mds_inodes_top, lru.lru_get_top());
+  counters->set(l_mds_inodes_bottom, lru.lru_get_bot());
+  counters->set(l_mds_inodes_pin_tail, lru.lru_get_pintail());
+  counters->set(l_mds_inodes_with_caps, num_inodes_with_caps);
+  counters->set(l_mds_caps, Capability::count());
+
+  if (!root)
+    return;
+
+  counters->set(l_mds_root_rfiles, root->get_inode()->rstat.rfiles);
+  counters->set(l_mds_root_rbytes, root->get_inode()->rstat.rbytes);
+  counters->set(l_mds_root_rsnaps, root->get_inode()->rstat.rsnaps);
+}
+
+
+//
+
+// Sets the upkeep shutdown flag and notifies the upkeep condition variable,
+// then logs a warning if the cache is not empty.  Always returns true.
+bool MDCache::shutdown()
+{
+  {
+    // The flag is changed and the waiter notified under upkeep_mutex.
+    std::scoped_lock lock(upkeep_mutex);
+    upkeep_trim_shutdown = true;
+    upkeep_cvar.notify_one();
+  }
+
+  const bool cache_nonempty = lru.lru_get_size() > 0;
+  if (cache_nonempty) {
+    dout(7) << "WARNING: mdcache shutdown with non-empty cache" << dendl;
+    show_subtrees();
+  }
+  return true;
+}
+
+
+// ====================================================================
+// some inode functions
+
+// Register `in` in the inode map (head inodes) or the snap inode map, and
+// record it in root / myin / strays / base_inodes when it is a system inode.
+void MDCache::add_inode(CInode *in)
+{
+  // add to inode map
+  const bool is_head = (in->last == CEPH_NOSNAP);
+  if (is_head) {
+    auto &slot = inode_map[in->ino()];
+    ceph_assert(!slot); // should be no dup inos!
+    slot = in;
+  } else {
+    auto &slot = snap_inode_map[in->vino()];
+    ceph_assert(!slot); // should be no dup inos!
+    slot = in;
+  }
+
+  // Only inos below MDS_INO_SYSTEM_BASE get the special handling below.
+  if (in->ino() < MDS_INO_SYSTEM_BASE) {
+    if (in->ino() == CEPH_INO_ROOT) {
+      root = in;
+    } else if (in->ino() == MDS_INO_MDSDIR(mds->get_nodeid())) {
+      myin = in;
+    } else if (in->is_stray()) {
+      // Only strays owned by this rank are tracked in strays[].
+      const bool mine = (MDS_INO_STRAY_OWNER(in->ino()) == mds->get_nodeid());
+      if (mine)
+        strays[MDS_INO_STRAY_INDEX(in->ino())] = in;
+    }
+
+    if (in->is_base())
+      base_inodes.insert(in);
+  }
+}
+
+/**
+ * Detach inode @p o from every cache structure touched below and delete it.
+ *
+ * The inode is unlinked from its parent dentry (which must be clean), its
+ * dirty/scatter/client-writeable state is cleared, it is removed from the
+ * export-pin queues and inode maps, and the root/myin/strays/base_inodes
+ * references are dropped.  The inode must have no references left when it
+ * is deleted (asserted).
+ */
+void MDCache::remove_inode(CInode *o) 
+{ 
+  dout(14) << "remove_inode " << *o << dendl;
+
+  if (o->get_parent_dn()) {
+    // FIXME: multiple parents?
+    CDentry *dn = o->get_parent_dn();
+    ceph_assert(!dn->is_dirty());
+    dn->dir->unlink_inode(dn);   // leave dentry ... FIXME?
+  }
+
+  // drop dirty state so nothing still refers to this inode as dirty
+  if (o->is_dirty())
+    o->mark_clean();
+  if (o->is_dirty_parent())
+    o->clear_dirty_parent();
+
+  o->clear_scatter_dirty();
+
+  o->clear_clientwriteable();
+
+  o->item_open_file.remove_myself();
+
+  // the state flags tell us which export-pin queue (if any) holds the inode
+  if (o->state_test(CInode::STATE_QUEUEDEXPORTPIN))
+    export_pin_queue.erase(o);
+
+  if (o->state_test(CInode::STATE_DELAYEDEXPORTPIN))
+    export_pin_delayed_queue.erase(o);
+
+  o->clear_ephemeral_pin(true, true);
+
+  // remove from inode map
+  if (o->last == CEPH_NOSNAP) {
+    inode_map.erase(o->ino());
+  } else {
+    o->item_caps.remove_myself();
+    snap_inode_map.erase(o->vino());
+  }
+
+  // mirror of the bookkeeping done in add_inode() for system inodes
+  if (o->ino() < MDS_INO_SYSTEM_BASE) {
+    if (o == root) root = 0;
+    if (o == myin) myin = 0;
+    if (o->is_stray()) {
+      if (MDS_INO_STRAY_OWNER(o->ino()) == mds->get_nodeid()) {
+	strays[MDS_INO_STRAY_INDEX(o->ino())] = 0;
+      }
+    }
+    if (o->is_base())
+      base_inodes.erase(o);
+  }
+
+  // delete it
+  ceph_assert(o->get_num_ref() == 0);
+  delete o; 
+}
+
+// Default file layout, with the pool set to the map's first data pool.
+file_layout_t MDCache::gen_default_file_layout(const MDSMap &mdsmap)
+{
+  file_layout_t layout = file_layout_t::get_default();
+  layout.pool_id = mdsmap.get_first_data_pool();
+  return layout;
+}
+
+// Default layout for the log: lives in the metadata pool and, when
+// mds_log_segment_size is positive, uses that value for both the object
+// size and the stripe unit.
+file_layout_t MDCache::gen_default_log_layout(const MDSMap &mdsmap)
+{
+  file_layout_t layout = file_layout_t::get_default();
+  layout.pool_id = mdsmap.get_metadata_pool();
+
+  const auto segment_size = g_conf()->mds_log_segment_size;
+  if (segment_size > 0) {
+    layout.object_size = segment_size;
+    layout.stripe_unit = segment_size;
+  }
+  return layout;
+}
+
+// Compute and cache the default file and log layouts from the current MDSMap.
+void MDCache::init_layouts()
+{
+  const auto &current_map = *(mds->mdsmap);
+  default_file_layout = gen_default_file_layout(current_map);
+  default_log_layout = gen_default_log_layout(current_map);
+}
+
+/**
+ * Fill in @p in as a fresh system inode without adding it to the cache.
+ *
+ * @param in   inode object to initialise
+ * @param ino  inode number to assign
+ * @param mode mode bits; 0500 is always OR-ed in
+ *
+ * Directories get the configured default dir hash and count themselves in
+ * rstat.rsubdirs; anything else gets default_file_layout and counts as one
+ * file.  Base inodes additionally get their inode_auth set and an empty
+ * snaprealm opened with seq 1.
+ */
+void MDCache::create_unlinked_system_inode(CInode *in, inodeno_t ino, int mode) const
+{
+  auto _inode = in->_get_inode();
+  _inode->ino = ino;
+  _inode->version = 1;
+  _inode->xattr_version = 1;
+  _inode->mode = 0500 | mode;
+  _inode->size = 0;
+  _inode->ctime = _inode->mtime = _inode->btime = ceph_clock_now();
+  _inode->nlink = 1;
+  _inode->truncate_size = -1ull;
+  _inode->change_attr = 0;
+  _inode->export_pin = MDS_RANK_NONE;
+
+  // FIPS zeroization audit 20191117: this memset is not security related.
+  memset(&_inode->dir_layout, 0, sizeof(_inode->dir_layout));
+  if (_inode->is_dir()) {
+    _inode->dir_layout.dl_dir_hash = g_conf()->mds_default_dir_hash;
+    _inode->rstat.rsubdirs = 1; /* itself */
+    _inode->rstat.rctime = in->get_inode()->ctime;
+  } else {
+    _inode->layout = default_file_layout;
+    ++_inode->rstat.rfiles;
+  }
+  // nothing is pending propagation yet: accounted == actual
+  _inode->accounted_rstat = _inode->rstat;
+
+  if (in->is_base()) {
+    // root is authoritative on this rank; otherwise the rank is derived
+    // from the ino's offset from MDS_INO_MDSDIR_OFFSET
+    if (in->is_root())
+      in->inode_auth = mds_authority_t(mds->get_nodeid(), CDIR_AUTH_UNKNOWN);
+    else
+      in->inode_auth = mds_authority_t(mds_rank_t(in->ino() - MDS_INO_MDSDIR_OFFSET), CDIR_AUTH_UNKNOWN);
+    in->open_snaprealm();  // empty snaprealm
+    ceph_assert(!in->snaprealm->parent); // created its own
+    in->snaprealm->srnode.seq = 1;
+  }
+}
+
+// Allocate a new CInode, initialise it as a system inode with the given
+// ino and mode, add it to the cache and return it.
+CInode *MDCache::create_system_inode(inodeno_t ino, int mode)
+{
+  dout(0) << "creating system inode with ino:" << ino << dendl;
+
+  CInode *created = new CInode(this);
+  create_unlinked_system_inode(created, ino, mode);
+  add_inode(created);
+  return created;
+}
+
+// Create the root directory inode (CEPH_INO_ROOT, mode 0755), owned by the
+// configured root uid/gid and laid out on the first data pool.
+CInode *MDCache::create_root_inode()
+{
+  CInode *rooti = create_system_inode(CEPH_INO_ROOT, S_IFDIR|0755);
+
+  auto pi = rooti->_get_inode();
+  pi->uid = g_conf()->mds_root_ino_uid;
+  pi->gid = g_conf()->mds_root_ino_gid;
+
+  pi->layout = default_file_layout;
+  pi->layout.pool_id = mds->mdsmap->get_first_data_pool();
+
+  return rooti;
+}
+
+/**
+ * Create the root inode with an empty, complete root dirfrag owned by this
+ * rank, and queue the writes that persist them.
+ *
+ * @param gather each store/commit below registers a sub-context on this
+ *               gather via new_sub()
+ */
+void MDCache::create_empty_hierarchy(MDSGather *gather)
+{
+  // create root dir
+  CInode *root = create_root_inode();
+
+  // force empty root dir
+  CDir *rootdir = root->get_or_open_dirfrag(this, frag_t());
+  adjust_subtree_auth(rootdir, mds->get_nodeid());   
+  rootdir->dir_rep = CDir::REP_ALL;   //NONE;
+
+  // the fresh dirfrag and inode must agree on their stats
+  ceph_assert(rootdir->get_fnode()->accounted_fragstat == rootdir->get_fnode()->fragstat);
+  ceph_assert(rootdir->get_fnode()->fragstat == root->get_inode()->dirstat);
+  ceph_assert(rootdir->get_fnode()->accounted_rstat == rootdir->get_fnode()->rstat);
+  /* Do not update the rootdir rstat information of the fragment; the rstat
+   * upkeep magic assumes version 0 is stale/invalid.
+   */
+
+  // mark the dirfrag complete and dirty, then commit it
+  rootdir->mark_complete();
+  rootdir->_get_fnode()->version = rootdir->pre_dirty();
+  rootdir->mark_dirty(mds->mdlog->get_current_segment());
+  rootdir->commit(0, gather->new_sub());
+
+  // store the inode itself and its backtrace
+  root->store(gather->new_sub());
+  root->mark_dirty_parent(mds->mdlog->get_current_segment(), true);
+  root->store_backtrace(gather->new_sub());
+}
+
+/**
+ * Create this rank's mdsdir inode together with its NUM_STRAY stray
+ * directories ("stray0", "stray1", ...), and queue the writes that persist
+ * them.
+ *
+ * @param gather each store/commit below registers a sub-context on this
+ *               gather via new_sub()
+ */
+void MDCache::create_mydir_hierarchy(MDSGather *gather)
+{
+  // create mds dir
+  CInode *my = create_system_inode(MDS_INO_MDSDIR(mds->get_nodeid()), S_IFDIR);
+
+  CDir *mydir = my->get_or_open_dirfrag(this, frag_t());
+  auto mydir_fnode = mydir->_get_fnode();
+
+  adjust_subtree_auth(mydir, mds->get_nodeid());   
+
+  LogSegment *ls = mds->mdlog->get_current_segment();
+
+  // stray dir
+  for (int i = 0; i < NUM_STRAY; ++i) {
+    CInode *stray = create_system_inode(MDS_INO_STRAY(mds->get_nodeid(), i), S_IFDIR);
+    CDir *straydir = stray->get_or_open_dirfrag(this, frag_t());
+    CachedStackStringStream css;
+    *css << "stray" << i;
+    CDentry *sdn = mydir->add_primary_dentry(css->str(), stray, "");
+    sdn->_mark_dirty(mds->mdlog->get_current_segment());
+
+    stray->_get_inode()->dirstat = straydir->get_fnode()->fragstat;
+
+    // account the new stray directory in mydir's stats
+    mydir_fnode->rstat.add(stray->get_inode()->rstat);
+    mydir_fnode->fragstat.nsubdirs++;
+    // save them
+    straydir->mark_complete();
+    straydir->_get_fnode()->version = straydir->pre_dirty();
+    straydir->mark_dirty(ls);
+    straydir->commit(0, gather->new_sub());
+    stray->mark_dirty_parent(ls, true);
+    stray->store_backtrace(gather->new_sub());
+  }
+
+  // everything accumulated above is treated as already accounted
+  mydir_fnode->accounted_fragstat = mydir->get_fnode()->fragstat;
+  mydir_fnode->accounted_rstat = mydir->get_fnode()->rstat;
+
+  // myin was set by add_inode() when the mdsdir inode was created above
+  auto inode = myin->_get_inode();
+  inode->dirstat = mydir->get_fnode()->fragstat;
+  inode->rstat = mydir->get_fnode()->rstat;
+  ++inode->rstat.rsubdirs;
+  inode->accounted_rstat = inode->rstat;
+
+  mydir->mark_complete();
+  mydir_fnode->version = mydir->pre_dirty();
+  mydir->mark_dirty(ls);
+  mydir->commit(0, gather->new_sub());
+
+  myin->store(gather->new_sub());
+}
+
+// Completion that _create_system_file() passes to mdlog->submit_entry();
+// finish() forwards the saved arguments to
+// MDCache::_create_system_file_finish().
+struct C_MDC_CreateSystemFile : public MDCacheLogContext {
+  MutationRef mut;
+  CDentry *dn;
+  version_t dpv;
+  MDSContext *fin;
+
+  C_MDC_CreateSystemFile(MDCache *cache, MutationRef& mutation, CDentry *dentry,
+			 version_t dentry_pv, MDSContext *onfinish)
+    : MDCacheLogContext(cache),
+      mut(mutation),
+      dn(dentry),
+      dpv(dentry_pv),
+      fin(onfinish)
+  {}
+
+  void finish(int r) override {
+    mdcache->_create_system_file_finish(mut, dn, dpv, fin);
+  }
+};
+
+/**
+ * Link system inode @p in into @p dir under @p name and journal the change.
+ *
+ * A null dentry is added and a projected linkage to @p in pushed; the
+ * update is then written as an EUpdate ("create system file").  Completion
+ * continues in _create_system_file_finish(), which eventually completes
+ * @p fin.
+ *
+ * @param dir  directory receiving the new dentry
+ * @param name dentry name
+ * @param in   inode to link (file or directory)
+ * @param fin  context completed by _create_system_file_finish()
+ */
+void MDCache::_create_system_file(CDir *dir, std::string_view name, CInode *in, MDSContext *fin)
+{
+  dout(10) << "_create_system_file " << name << " in " << *dir << dendl;
+  CDentry *dn = dir->add_null_dentry(name);
+
+  dn->push_projected_linkage(in);
+  version_t dpv = dn->pre_dirty();
+  
+  CDir *mdir = 0;
+  auto inode = in->_get_inode();
+  if (in->is_dir()) {
+    inode->rstat.rsubdirs = 1;
+
+    // a directory gets its (empty, complete) first dirfrag right away
+    mdir = in->get_or_open_dirfrag(this, frag_t());
+    mdir->mark_complete();
+    mdir->_get_fnode()->version = mdir->pre_dirty();
+  } else {
+    inode->rstat.rfiles = 1;
+  }
+
+  // NOTE(review): this is a second dn->pre_dirty() call (dpv above was the
+  // first) -- confirm that is intended.
+  inode->version = dn->pre_dirty();
+  
+  SnapRealm *realm = dir->get_inode()->find_snaprealm();
+  dn->first = in->first = realm->get_newest_seq() + 1;
+
+  MutationRef mut(new MutationImpl());
+
+  // force some locks.  hacky.
+  mds->locker->wrlock_force(&dir->inode->filelock, mut);
+  mds->locker->wrlock_force(&dir->inode->nestlock, mut);
+
+  mut->ls = mds->mdlog->get_current_segment();
+  EUpdate *le = new EUpdate(mds->mdlog, "create system file");
+  mds->mdlog->start_entry(le);
+
+  if (!in->is_mdsdir()) {
+    // ordinary case: journal the inode as a primary dentry
+    predirty_journal_parents(mut, &le->metablob, in, dir, PREDIRTY_PRIMARY|PREDIRTY_DIR, 1);
+    le->metablob.add_primary_dentry(dn, in, true);
+  } else {
+    // mdsdir inodes are journaled as a root plus a remote dentry
+    predirty_journal_parents(mut, &le->metablob, in, dir, PREDIRTY_DIR, 1);
+    journal_dirty_inode(mut.get(), &le->metablob, in);
+    dn->push_projected_linkage(in->ino(), in->d_type());
+    le->metablob.add_remote_dentry(dn, true, in->ino(), in->d_type());
+    le->metablob.add_root(true, in);
+  }
+  if (mdir)
+    le->metablob.add_new_dir(mdir); // dirty AND complete AND new
+
+  mds->mdlog->submit_entry(le, new C_MDC_CreateSystemFile(this, mut, dn, dpv, fin));
+  mds->mdlog->flush();
+}
+
+/**
+ * Second half of _create_system_file(), invoked from
+ * C_MDC_CreateSystemFile::finish().
+ *
+ * Makes the projected linkage real, marks the dentry, the inode and (for a
+ * directory) its first dirfrag dirty in the mutation's log segment, applies
+ * the mutation and drops its locks, then completes @p fin with 0.
+ *
+ * @param mut mutation created by _create_system_file()
+ * @param dn  the dentry that was added
+ * @param dpv dentry version obtained from pre_dirty()
+ * @param fin caller's completion
+ */
+void MDCache::_create_system_file_finish(MutationRef& mut, CDentry *dn, version_t dpv, MDSContext *fin)
+{
+  dout(10) << "_create_system_file_finish " << *dn << dendl;
+  
+  dn->pop_projected_linkage();
+  dn->mark_dirty(dpv, mut->ls);
+
+  CInode *in = dn->get_linkage()->get_inode();
+  in->mark_dirty(mut->ls);
+
+  if (in->is_dir()) {
+    // the dirfrag was opened in _create_system_file()
+    CDir *dir = in->get_dirfrag(frag_t());
+    ceph_assert(dir);
+    dir->mark_dirty(mut->ls);
+    dir->mark_new(mut->ls);
+  }
+
+  mut->apply();
+  mds->locker->drop_locks(mut.get());
+  mut->cleanup();
+
+  fin->complete(0);
+
+  //if (dir && MDS_INO_IS_MDSDIR(in->ino()))
+  //migrator->export_dir(dir, (int)in->ino() - MDS_INO_MDSDIR_OFFSET);
+}
+
+
+
+// Completion that re-enters MDCache::open_root() on success and marks the
+// rank damaged on failure.
+struct C_MDS_RetryOpenRoot : public MDSInternalContext {
+  MDCache *cache;
+
+  explicit C_MDS_RetryOpenRoot(MDCache *c)
+    : MDSInternalContext(c->mds), cache(c) {}
+
+  void finish(int r) override {
+    if (r >= 0) {
+      cache->open_root();
+      return;
+    }
+
+    // If we can't open root, something disastrous has happened: mark
+    // this rank damaged for operator intervention.  Note that
+    // it is not okay to call suicide() here because we are in
+    // a Finisher callback.
+    cache->mds->damaged();
+    ceph_abort();  // damaged should never return
+  }
+};
+
+// Obtain the root inode: discover it from the root rank when that is
+// another MDS, otherwise create it locally and fetch it; @p c is completed
+// by whichever operation is started.
+void MDCache::open_root_inode(MDSContext *c)
+{
+  const auto root_rank = mds->mdsmap->get_root();
+  if (mds->get_nodeid() != root_rank) {
+    discover_base_ino(CEPH_INO_ROOT, c, root_rank);
+    return;
+  }
+
+  // initially inaccurate!
+  CInode *rooti = create_system_inode(CEPH_INO_ROOT, S_IFDIR|0755);
+  rooti->fetch(c);
+}
+
+// Create this rank's mdsdir inode in cache and fetch it, completing @p c.
+void MDCache::open_mydir_inode(MDSContext *c)
+{
+  const auto mydir_ino = MDS_INO_MDSDIR(mds->get_nodeid());
+  // initially inaccurate!
+  CInode *mdsdir = create_system_inode(mydir_ino, S_IFDIR|0755);
+  mdsdir->fetch(c);
+}
+
+// Open the mdsdir inode, then its root dirfrag: once the inode fetch
+// succeeds the dirfrag is opened, made a subtree owned by this rank and
+// fetched with @p c.  A negative result from the inode fetch is passed
+// straight to @p c.
+void MDCache::open_mydir_frag(MDSContext *c)
+{
+  open_mydir_inode(
+    new MDSInternalContextWrapper(mds,
+      new LambdaContext([this, c](int r) {
+	if (r < 0) {
+	  c->complete(r);
+	  return;
+	}
+
+	CDir *frag = myin->get_or_open_dirfrag(this, frag_t());
+	ceph_assert(frag);
+	adjust_subtree_auth(frag, mds->get_nodeid());
+	frag->fetch(c);
+      })));
+}
+
+/**
+ * Drive the opening of the root and mdsdir hierarchy.
+ *
+ * Each call performs the next missing step (root inode, root dirfrag,
+ * mdsdir inode) and returns; the C_MDS_RetryOpenRoot completion handed to
+ * that step calls open_root() again.  Once nothing is missing the mdsdir
+ * dirfrag is made a subtree of this rank and populate_mydir() takes over.
+ */
+void MDCache::open_root()
+{
+  dout(10) << "open_root" << dendl;
+
+  // step 1: the root inode
+  if (!root) {
+    open_root_inode(new C_MDS_RetryOpenRoot(this));
+    return;
+  }
+
+  // step 2: the root dirfrag
+  const bool i_hold_root = (mds->get_nodeid() == mds->mdsmap->get_root());
+  if (i_hold_root) {
+    ceph_assert(root->is_auth());
+    CDir *rootdir = root->get_or_open_dirfrag(this, frag_t());
+    ceph_assert(rootdir);
+    if (!rootdir->is_subtree_root())
+      adjust_subtree_auth(rootdir, mds->get_nodeid());
+    if (!rootdir->is_complete()) {
+      rootdir->fetch(new C_MDS_RetryOpenRoot(this));
+      return;
+    }
+  } else {
+    ceph_assert(!root->is_auth());
+    if (!root->get_dirfrag(frag_t())) {
+      open_remote_dirfrag(root, frag_t(), new C_MDS_RetryOpenRoot(this));
+      return;
+    }
+  }
+
+  // step 3: our own mdsdir inode
+  if (!myin) {
+    open_mydir_inode(new C_MDS_RetryOpenRoot(this));
+    return;
+  }
+
+  CDir *mydir = myin->get_or_open_dirfrag(this, frag_t());
+  ceph_assert(mydir);
+  adjust_subtree_auth(mydir, mds->get_nodeid());
+
+  populate_mydir();
+}
+
+/**
+ * Move stray_index on to another stray directory.
+ *
+ * Also maintains stray_fragmenting_index: it is reset to -1 once the stray
+ * directory it names has no dirfrag that is fragmenting or has a fragment
+ * pending, and, when it is -1 and the cache is open, a later stray
+ * directory is checked for dirfrags that should be split or merged.
+ */
+void MDCache::advance_stray() {
+  // check whether the directory has been fragmented
+  if (stray_fragmenting_index >= 0) {
+    auto&& dfs = strays[stray_fragmenting_index]->get_dirfrags();
+    bool any_fragmenting = false;
+    for (const auto& dir : dfs) {
+      if (dir->state_test(CDir::STATE_FRAGMENTING) ||
+	  mds->balancer->is_fragment_pending(dir->dirfrag())) {
+	any_fragmenting = true;
+	break;
+      }
+    }
+    if (!any_fragmenting)
+      stray_fragmenting_index = -1;
+  }
+
+  // Advance, skipping the index that is being fragmented.  Note that i is
+  // added to the already-advanced index, so when the first candidate is
+  // skipped the index moves by more than one slot.
+  for (int i = 1; i < NUM_STRAY; i++){
+    stray_index = (stray_index + i) % NUM_STRAY;
+    if (stray_index != stray_fragmenting_index)
+      break;
+  }
+
+  if (stray_fragmenting_index == -1 && is_open()) {
+    // Fragment later stray dir in advance. We don't choose past
+    // stray dir because in-flight requests may still use it.
+    stray_fragmenting_index = (stray_index + 3) % NUM_STRAY;
+    auto&& dfs = strays[stray_fragmenting_index]->get_dirfrags();
+    bool any_fragmenting = false;
+    for (const auto& dir : dfs) {
+      if (dir->should_split()) {
+	mds->balancer->queue_split(dir, true);
+	any_fragmenting = true;
+      } else if (dir->should_merge()) {
+	mds->balancer->queue_merge(dir);
+	any_fragmenting = true;
+      }
+    }
+    // nothing was queued: no stray dir is considered to be fragmenting
+    if (!any_fragmenting)
+      stray_fragmenting_index = -1;
+  }
+
+  dout(10) << "advance_stray to index " << stray_index
+	   << " fragmenting index " << stray_fragmenting_index << dendl;
+}
+
+/**
+ * Make sure the mdsdir dirfrag and all NUM_STRAY stray directories are
+ * present and loaded, then mark the cache open.
+ *
+ * Whenever something has to be fetched or created the function starts that
+ * operation with a C_MDS_RetryOpenRoot completion and returns; the
+ * completion leads back here through open_root().  Only when every stray
+ * dirfrag has been loaded does it set `open`, wake waiting_for_open and
+ * activate the stray manager.
+ */
+void MDCache::populate_mydir()
+{
+  ceph_assert(myin);
+  CDir *mydir = myin->get_or_open_dirfrag(this, frag_t());
+  ceph_assert(mydir);
+
+  dout(10) << "populate_mydir " << *mydir << dendl;
+
+  if (!mydir->is_complete()) {
+    mydir->fetch(new C_MDS_RetryOpenRoot(this));
+    return;
+  }
+
+  if (mydir->get_version() == 0 && mydir->state_test(CDir::STATE_BADFRAG)) {
+    // A missing dirfrag, we will recreate it.  Before that, we must dirty
+    // it before dirtying any of the strays we create within it.
+    mds->clog->warn() << "fragment " << mydir->dirfrag() << " was unreadable, "
+      "recreating it now";
+    LogSegment *ls = mds->mdlog->get_current_segment();
+    mydir->state_clear(CDir::STATE_BADFRAG);
+    mydir->mark_complete();
+    mydir->_get_fnode()->version = mydir->pre_dirty();
+    mydir->mark_dirty(ls);
+  }
+
+  // open or create stray
+  uint64_t num_strays = 0;
+  for (int i = 0; i < NUM_STRAY; ++i) {
+    CachedStackStringStream css;
+    *css << "stray" << i;
+    CDentry *straydn = mydir->lookup(css->str());
+
+    // allow for older fs's with stray instead of stray0
+    if (straydn == NULL && i == 0)
+      straydn = mydir->lookup("stray");
+
+    // no dentry, or a dentry without an inode: create the stray directory
+    // and come back when that is journaled
+    if (!straydn || !straydn->get_linkage()->get_inode()) {
+      _create_system_file(mydir, css->strv(), create_system_inode(MDS_INO_STRAY(mds->get_nodeid(), i), S_IFDIR),
+			  new C_MDS_RetryOpenRoot(this));
+      return;
+    }
+    ceph_assert(straydn);
+    ceph_assert(strays[i]);
+    // we make multiple passes through this method; make sure we only pin each stray once.
+    if (!strays[i]->state_test(CInode::STATE_STRAYPINNED)) {
+      strays[i]->get(CInode::PIN_STRAY);
+      strays[i]->state_set(CInode::STATE_STRAYPINNED);
+      strays[i]->get_stickydirs();
+    }
+    dout(20) << " stray num " << i << " is " << *strays[i] << dendl;
+
+    // open all frags
+    frag_vec_t leaves;
+    strays[i]->dirfragtree.get_leaves(leaves);
+    for (const auto& leaf : leaves) {
+      CDir *dir = strays[i]->get_dirfrag(leaf);
+      if (!dir) {
+	dir = strays[i]->get_or_open_dirfrag(this, leaf);
+      }
+
+      // DamageTable applies special handling to strays: it will
+      // have damaged() us out if one is damaged.
+      ceph_assert(!dir->state_test(CDir::STATE_BADFRAG));
+
+      // version 0 is taken to mean the dirfrag has not been loaded yet
+      if (dir->get_version() == 0) {
+        dir->fetch(new C_MDS_RetryOpenRoot(this));
+        return;
+      }
+
+      if (dir->get_frag_size() > 0)
+	num_strays += dir->get_frag_size();
+    }
+  }
+
+  // okay!
+  dout(10) << "populate_mydir done" << dendl;
+  ceph_assert(!open);    
+  open = true;
+  mds->queue_waiters(waiting_for_open);
+
+  // hand the total counted over all stray dirfrags to the stray manager
+  stray_manager.set_num_strays(num_strays);
+  stray_manager.activate();
+
+  scan_stray_dir();
+}
+
+// Discover base inode @p ino from the rank encoded in its low bits
+// (ino & (MAX_MDS-1)); @p fin is passed on to discover_base_ino().
+void MDCache::open_foreign_mdsdir(inodeno_t ino, MDSContext *fin)
+{
+  const mds_rank_t owner = mds_rank_t(ino & (MAX_MDS-1));
+  discover_base_ino(ino, fin, owner);
+}
+
+// Return the dirfrag of the current stray directory that the stray dentry
+// name of @p in maps to.  Both the stray inode and that dirfrag are
+// asserted to exist.
+CDir *MDCache::get_stray_dir(CInode *in)
+{
+  string dname;
+  in->name_stray_dentry(dname);
+
+  CInode *current_stray = get_stray();
+  ceph_assert(current_stray);
+
+  const frag_t target = current_stray->pick_dirfrag(dname);
+  CDir *dir = current_stray->get_dirfrag(target);
+  ceph_assert(dir);
+  return dir;
+}
+
+// Resolve @p info to a cached object: an inode when info.ino is set,
+// otherwise the dirfrag info.dirfrag or, when info.dname is non-empty, the
+// dentry of that name in it.  Returns null when the dirfrag is not cached.
+MDSCacheObject *MDCache::get_object(const MDSCacheObjectInfo &info)
+{
+  // inode?
+  if (info.ino) {
+    return get_inode(info.ino, info.snapid);
+  }
+
+  // dir or dentry.
+  CDir *dir = get_dirfrag(info.dirfrag);
+  if (!dir) {
+    return nullptr;
+  }
+  if (info.dname.length() == 0) {
+    return dir;
+  }
+  return dir->lookup(info.dname, info.snapid);
+}
+
+
+// ====================================================================
+// consistent hash ring
+
+/*
+ * Map (ino, fg) to a rank in [0, max_mds).
+ *
+ * The hashing implementation is based on Lamping and Veach's Jump
+ * Consistent Hash: https://arxiv.org/pdf/1406.2294.pdf
+ *
+ * The key is rjhash64(ino), mixed with the hash of the frag value when fg
+ * is non-zero.  The result is asserted to lie within [0, max_mds).
+ */
+mds_rank_t MDCache::hash_into_rank_bucket(inodeno_t ino, frag_t fg)
+{
+  const mds_rank_t max_mds = mds->mdsmap->get_max_mds();
+
+  uint64_t key = rjhash64(ino);
+  if (fg)
+    key = rjhash64(key + rjhash64(fg.value()));
+
+  // Jump loop: `bucket` is the last jump target that was still in range.
+  int64_t bucket = -1;
+  for (int64_t next = 0; next < max_mds; ) {
+    bucket = next;
+    key = key*2862933555777941757ULL + 1;
+    next = (bucket + 1) * (double(1LL << 31) / double((key >> 33) + 1));
+  }
+
+  // verify bounds before returning
+  const auto rank = mds_rank_t(bucket);
+  ceph_assert(rank >= 0 && rank < max_mds);
+  return rank;
+}
+
+
+// ====================================================================
+// subtree management
+
+/*
+ * adjust the dir_auth of a subtree.
+ * merge with parent and/or child subtrees, if it is appropriate.
+ * merge can ONLY happen if both parent and child have unambiguous auth.
+ *
+ * If dir is not yet a subtree root it becomes one: it gets its own bounds
+ * set, takes over those bounds of the enclosing subtree that lie beneath
+ * it, and is itself added as a bound of the enclosing subtree.
+ *
+ * adjust_pop: when a new subtree is split off and dir is auth, subtract
+ * dir's pop_auth_subtree from its ancestors up to the enclosing subtree
+ * root.
+ */
+void MDCache::adjust_subtree_auth(CDir *dir, mds_authority_t auth, bool adjust_pop)
+{
+  dout(7) << "adjust_subtree_auth " << dir->get_dir_auth() << " -> " << auth
+	  << " on " << *dir << dendl;
+
+  show_subtrees();
+
+  CDir *root;
+  if (dir->inode->is_base()) {
+    root = dir;  // bootstrap hack.
+    if (subtrees.count(root) == 0) {
+      subtrees[root];
+      root->get(CDir::PIN_SUBTREE);
+    }
+  } else {
+    root = get_subtree_root(dir);  // subtree root
+  }
+  ceph_assert(root);
+  ceph_assert(subtrees.count(root));
+  dout(7) << " current root is " << *root << dendl;
+
+  if (root == dir) {
+    // i am already a subtree.
+    dir->set_dir_auth(auth);
+  } else {
+    // i am a new subtree.
+    dout(10) << "  new subtree at " << *dir << dendl;
+    ceph_assert(subtrees.count(dir) == 0);
+    subtrees[dir];      // create empty subtree bounds list for me.
+    dir->get(CDir::PIN_SUBTREE);
+
+    // set dir_auth
+    dir->set_dir_auth(auth);
+    
+    // move items nested beneath me, under me.
+    // (the next iterator is saved first because the current element may be
+    // erased inside the loop)
+    set<CDir*>::iterator p = subtrees[root].begin();
+    while (p != subtrees[root].end()) {
+      set<CDir*>::iterator next = p;
+      ++next;
+      if (get_subtree_root((*p)->get_parent_dir()) == dir) {
+	// move under me
+	dout(10) << "  claiming child bound " << **p << dendl;
+	subtrees[dir].insert(*p); 
+	subtrees[root].erase(p);
+      }
+      p = next;
+    }
+    
+    // i am a bound of the parent subtree.
+    subtrees[root].insert(dir); 
+
+    // i am now the subtree root.
+    root = dir;
+
+    // adjust recursive pop counters
+    if (adjust_pop && dir->is_auth()) {
+      CDir *p = dir->get_parent_dir();
+      while (p) {
+	p->pop_auth_subtree.sub(dir->pop_auth_subtree);
+	if (p->is_subtree_root()) break;
+	p = p->inode->get_parent_dir();
+      }
+    }
+  }
+
+  show_subtrees();
+}
+
+
+// Try to merge the subtree rooted at @p dir into its parent subtree, then
+// try the same for each of dir's former bounds.  Inodes collected by the
+// merge attempts are evaluated afterwards, unless the MDS is in a replay
+// state or in resolve.
+void MDCache::try_subtree_merge(CDir *dir)
+{
+  dout(7) << "try_subtree_merge " << *dir << dendl;
+
+  // Work on a copy of the bounds taken before any merge is attempted.
+  const auto previous_bounds = subtrees.at(dir);
+
+  set<CInode*> pending_eval;
+
+  // try merge at my root
+  try_subtree_merge_at(dir, &pending_eval);
+
+  // try merge at my old bounds
+  for (CDir *bound : previous_bounds) {
+    try_subtree_merge_at(bound, &pending_eval);
+  }
+
+  if (mds->is_any_replay() || mds->is_resolve())
+    return;
+
+  for (CInode *in : pending_eval) {
+    eval_subtree_root(in);
+  }
+}
+
+/**
+ * Merge the subtree rooted at @p dir into its parent subtree if allowed.
+ *
+ * Nothing happens when dir's auth is ambiguous (dir_auth.second is not
+ * CDIR_AUTH_UNKNOWN) or when dir is an export bound or an aux subtree.
+ * Otherwise dir must be a subtree root (asserted); it is merged when it has
+ * a distinct parent subtree root whose dir_auth equals its own.
+ *
+ * @param dir        subtree root to try to merge
+ * @param to_eval    if non-null, receives dir's inode when a merge happened
+ *                   and that inode is auth
+ * @param adjust_pop when true and dir is auth, add dir's pop_auth_subtree
+ *                   back to its ancestors up to the enclosing subtree root
+ */
+void MDCache::try_subtree_merge_at(CDir *dir, set<CInode*> *to_eval, bool adjust_pop)
+{
+  dout(10) << "try_subtree_merge_at " << *dir << dendl;
+
+  if (dir->dir_auth.second != CDIR_AUTH_UNKNOWN ||
+      dir->state_test(CDir::STATE_EXPORTBOUND) ||
+      dir->state_test(CDir::STATE_AUXSUBTREE))
+    return;
+
+  auto it = subtrees.find(dir);
+  ceph_assert(it != subtrees.end());
+
+  // merge with parent?
+  CDir *parent = dir;  
+  if (!dir->inode->is_base())
+    parent = get_subtree_root(dir->get_parent_dir());
+  
+  if (parent != dir &&				// we have a parent,
+      parent->dir_auth == dir->dir_auth) {	// auth matches,
+    // merge with parent.
+    dout(10) << "  subtree merge at " << *dir << dendl;
+    dir->set_dir_auth(CDIR_AUTH_DEFAULT);
+    
+    // move our bounds under the parent
+    subtrees[parent].insert(it->second.begin(), it->second.end());
+    
+    // we are no longer a subtree or bound
+    dir->put(CDir::PIN_SUBTREE);
+    subtrees.erase(it);
+    subtrees[parent].erase(dir);
+
+    // adjust popularity?
+    if (adjust_pop && dir->is_auth()) {
+      CDir *cur = dir;
+      CDir *p = dir->get_parent_dir();
+      while (p) {
+	p->pop_auth_subtree.add(dir->pop_auth_subtree);
+	p->pop_lru_subdirs.push_front(&cur->get_inode()->item_pop_lru);
+	if (p->is_subtree_root()) break;
+	cur = p;
+	p = p->inode->get_parent_dir();
+      }
+    }
+
+    if (to_eval && dir->get_inode()->is_auth())
+      to_eval->insert(dir->get_inode());
+
+    show_subtrees(15);
+  }
+}
+
+// Ask the locker to re-evaluate the file and nest locks of @p diri, which
+// must be auth (asserted).
+//  (we should scatter the filelock on subtree bounds)
+void MDCache::eval_subtree_root(CInode *diri)
+{
+  ceph_assert(diri->is_auth());
+
+  const int lock_mask = CEPH_LOCK_IFILE | CEPH_LOCK_INEST;
+  mds->locker->try_eval(diri, lock_mask);
+}
+
+
+/*
+ * Make 'dir' a subtree root with authority 'auth' whose bounds are exactly
+ * the dirfrags in 'bounds'.
+ *
+ * - if dir is not yet a subtree root, a new subtree is carved out of the
+ *   enclosing one and any of the enclosing subtree's bounds that are nested
+ *   beneath dir are claimed by it;
+ * - every requested bound that dir does not have yet is either created
+ *   (keeping the previous authority beneath it) or reached by swallowing the
+ *   subtrees that lie between dir and the bound;
+ * - every bound dir currently has that is not in 'bounds' is swallowed.
+ *
+ * The outcome is checked with verify_subtree_bounds().  Inodes collected by
+ * try_subtree_merge_at() are evaluated at the end, unless the MDS is in any
+ * replay state or in resolve.
+ */
+void MDCache::adjust_bounded_subtree_auth(CDir *dir, const set<CDir*>& bounds, mds_authority_t auth)
+{
+  dout(7) << "adjust_bounded_subtree_auth " << dir->get_dir_auth() << " -> " << auth
+	  << " on " << *dir
+	  << " bounds " << bounds
+	  << dendl;
+
+  show_subtrees();
+
+  // locate the subtree that currently contains dir
+  CDir *root;
+  if (dir->ino() == CEPH_INO_ROOT) {
+    root = dir;  // bootstrap hack.
+    if (subtrees.count(root) == 0) {
+      subtrees[root];
+      root->get(CDir::PIN_SUBTREE);
+    }
+  } else {
+    root = get_subtree_root(dir);  // subtree root
+  }
+  ceph_assert(root);
+  ceph_assert(subtrees.count(root));
+  dout(7) << " current root is " << *root << dendl;
+
+  // remember the authority before we change it; new bounds are set back to it
+  mds_authority_t oldauth = dir->authority();
+
+  if (root == dir) {
+    // i am already a subtree.
+    dir->set_dir_auth(auth);
+  } else {
+    // i am a new subtree.
+    dout(10) << "  new subtree at " << *dir << dendl;
+    ceph_assert(subtrees.count(dir) == 0);
+    subtrees[dir];      // create empty subtree bounds list for me.
+    dir->get(CDir::PIN_SUBTREE);
+    
+    // set dir_auth
+    dir->set_dir_auth(auth);
+    
+    // move items nested beneath me, under me.
+    // ('next' is saved before a possible erase so iteration stays valid)
+    set<CDir*>::iterator p = subtrees[root].begin();
+    while (p != subtrees[root].end()) {
+      set<CDir*>::iterator next = p;
+      ++next;
+      if (get_subtree_root((*p)->get_parent_dir()) == dir) {
+	// move under me
+	dout(10) << "  claiming child bound " << **p << dendl;
+	subtrees[dir].insert(*p); 
+	subtrees[root].erase(p);
+      }
+      p = next;
+    }
+    
+    // i am a bound of the parent subtree.
+    subtrees[root].insert(dir); 
+
+    // i am now the subtree root.
+    root = dir;
+  }
+
+  // inodes whose locks should be re-evaluated once the map is consistent
+  set<CInode*> to_eval;
+
+  // verify/adjust bounds.
+  // - these may be new, or
+  // - beneath existing ambiguous bounds (which will be collapsed),
+  // - but NOT beneath unambiguous bounds.
+  for (const auto& bound : bounds) {
+    // new bound?
+    if (subtrees[dir].count(bound) == 0) {
+      if (get_subtree_root(bound) == dir) {
+	dout(10) << "  new bound " << *bound << ", adjusting auth back to old " << oldauth << dendl;
+	adjust_subtree_auth(bound, oldauth);       // otherwise, adjust at bound.
+      }
+      else {
+	dout(10) << "  want bound " << *bound << dendl;
+	CDir *t = get_subtree_root(bound->get_parent_dir());
+	if (subtrees[t].count(bound) == 0) {
+	  ceph_assert(t != dir);
+	  dout(10) << "  new bound " << *bound << dendl;
+	  adjust_subtree_auth(bound, t->authority());
+	}
+	// make sure it's nested beneath ambiguous subtree(s)
+	// each pass swallows the bound of dir that lies above 'bound', until
+	// the subtree root above 'bound' is dir itself
+	while (1) {
+	  while (subtrees[dir].count(t) == 0)
+	    t = get_subtree_root(t->get_parent_dir());
+	  dout(10) << "  swallowing intervening subtree at " << *t << dendl;
+	  adjust_subtree_auth(t, auth);
+	  try_subtree_merge_at(t, &to_eval);
+	  t = get_subtree_root(bound->get_parent_dir());
+	  if (t == dir) break;
+	}
+      }
+    }
+    else {
+      dout(10) << "  already have bound " << *bound << dendl;
+    }
+  }
+  // merge stray bounds?
+  // iterate over a snapshot because merging modifies subtrees[dir]; repeat
+  // until a pass leaves the bound set unchanged
+  while (!subtrees[dir].empty()) {
+    set<CDir*> copy = subtrees[dir];
+    for (set<CDir*>::iterator p = copy.begin(); p != copy.end(); ++p) {
+      if (bounds.count(*p) == 0) {
+	CDir *stray = *p;
+	dout(10) << "  swallowing extra subtree at " << *stray << dendl;
+	adjust_subtree_auth(stray, auth);
+	try_subtree_merge_at(stray, &to_eval);
+      }
+    }
+    // swallowing subtree may add new subtree bounds
+    if (copy == subtrees[dir])
+      break;
+  }
+
+  // bound should now match.
+  verify_subtree_bounds(dir, bounds);
+
+  show_subtrees();
+
+  // lock evaluation is skipped while replaying or resolving
+  if (!(mds->is_any_replay() || mds->is_resolve())) {
+    for(auto in : to_eval)
+      eval_subtree_root(in);
+  }
+}
+
+
+/*
+ * Translate the dirfrag ids in 'dfs' into the CDir*'s that make up an
+ * equivalent bounding set, adding them to 'bounds'.  Fragmentation is only
+ * adjusted where that is needed to obtain an equivalent set: a cached frag
+ * is split when it spans the requested bounds, and nothing is ever merged.
+ * Inodes that are not in cache are skipped.
+ */
+void MDCache::get_force_dirfrag_bound_set(const vector<dirfrag_t>& dfs, set<CDir*>& bounds)
+{
+  dout(10) << "get_force_dirfrag_bound_set " << dfs << dendl;
+
+  // group the requested frags by directory inode
+  map<inodeno_t, fragset_t> byino;
+  for (auto& df : dfs)
+    byino[df.ino].insert_raw(df.frag);
+  dout(10) << " by ino: " << byino << dendl;
+
+  for (auto& entry : byino) {
+    fragset_t& wanted = entry.second;
+    wanted.simplify();
+
+    CInode *diri = get_inode(entry.first);
+    if (!diri)
+      continue;
+    dout(10) << " checking fragset " << wanted.get() << " on " << *diri << dendl;
+
+    // scratch frag tree in which every requested frag is a leaf
+    fragtree_t tmpdft;
+    for (const auto& fg : wanted)
+      tmpdft.force_to_leaf(g_ceph_context, fg);
+
+    for (const auto& fg : wanted) {
+      frag_vec_t leaves;
+      diri->dirfragtree.get_leaves_under(fg, leaves);
+      if (leaves.empty()) {
+	// our tree has no leaf under fg: look at the frag of ours that
+	// contains it and at how the scratch tree divides that frag
+	frag_t approx_fg = diri->dirfragtree[fg.value()];
+	frag_vec_t approx_leaves;
+	tmpdft.get_leaves_under(approx_fg, approx_leaves);
+	for (const auto& leaf : approx_leaves) {
+	  if (wanted.get().count(leaf) != 0)
+	    continue;
+	  // not bound, so the resolve message is from auth MDS of the dirfrag
+	  force_dir_fragment(diri, leaf);
+	}
+      }
+
+      auto&& [complete, sibs] = diri->get_dirfrags_under(fg);
+      for (const auto& sib : sibs)
+	bounds.insert(sib);
+    }
+  }
+}
+
+void MDCache::adjust_bounded_subtree_auth(CDir *dir, const vector<dirfrag_t>& bound_dfs, const mds_authority_t &auth)
+{
+  // Overload taking the bounds as dirfrag ids: resolve them to cached
+  // CDir*'s (forcing fragmentation where needed), then delegate to the
+  // set<CDir*> overload.
+  dout(7) << "adjust_bounded_subtree_auth " << dir->get_dir_auth() << " -> " << auth
+	  << " on " << *dir << " bound_dfs " << bound_dfs << dendl;
+
+  set<CDir*> bound_dirs;
+  get_force_dirfrag_bound_set(bound_dfs, bound_dirs);
+  adjust_bounded_subtree_auth(dir, bound_dirs, auth);
+}
+
+void MDCache::map_dirfrag_set(const list<dirfrag_t>& dfs, set<CDir*>& result)
+{
+  // Map the dirfrag ids in 'dfs' onto the cached CDir*'s that currently
+  // cover them and add those to 'result'.  Inodes and dirfrags that are
+  // not in cache are silently skipped.
+  dout(10) << "map_dirfrag_set " << dfs << dendl;
+
+  // bucket the requested frags per directory inode
+  map<inodeno_t, fragset_t> ino_fragset;
+  for (const auto &df : dfs)
+    ino_fragset[df.ino].insert_raw(df.frag);
+
+  for (auto& entry : ino_fragset) {
+    fragset_t& fragset = entry.second;
+    fragset.simplify();
+
+    CInode *in = get_inode(entry.first);
+    if (!in)
+      continue;
+
+    // expand each requested frag into the leaves of the inode's frag tree
+    frag_vec_t fgs;
+    for (const auto& fg : fragset)
+      in->dirfragtree.get_leaves_under(fg, fgs);
+
+    dout(15) << "map_dirfrag_set " << fragset << " -> " << fgs
+	     << " on " << *in << dendl;
+
+    for (const auto& fg : fgs) {
+      if (CDir *dir = in->get_dirfrag(fg))
+	result.insert(dir);
+    }
+  }
+}
+
+
+
+CDir *MDCache::get_subtree_root(CDir *dir)
+{
+  // Walk from dir towards the root of the hierarchy and return the first
+  // dirfrag that is a subtree root, i.e. the dir that delegates (or is
+  // about to delegate) auth.  Returns null if the walk runs out of parents.
+  do {
+    if (dir->is_subtree_root())
+      return dir;
+    dir = dir->get_inode()->get_parent_dir();
+  } while (dir);
+  return nullptr;             // none
+}
+
+CDir *MDCache::get_projected_subtree_root(CDir *dir)
+{
+  // Same walk as get_subtree_root(), but following each inode's projected
+  // parent dir instead of its current one.  Returns null if no subtree
+  // root is found.
+  do {
+    if (dir->is_subtree_root())
+      return dir;
+    dir = dir->get_inode()->get_projected_parent_dir();
+  } while (dir);
+  return nullptr;             // none
+}
+
+void MDCache::remove_subtree(CDir *dir)
+{
+  dout(10) << "remove_subtree " << *dir << dendl;
+
+  // drop dir's own entry from the subtree map, then release its pin
+  {
+    auto it = subtrees.find(dir);
+    ceph_assert(it != subtrees.end());
+    subtrees.erase(it);
+  }
+  dir->put(CDir::PIN_SUBTREE);
+
+  CDir *parent = dir->get_parent_dir();
+  if (!parent)
+    return;
+
+  // dir must have been recorded as a bound of the enclosing subtree
+  auto it = subtrees.find(get_subtree_root(parent));
+  ceph_assert(it != subtrees.end());
+  auto count = it->second.erase(dir);
+  ceph_assert(count == 1);
+}
+
+void MDCache::get_subtree_bounds(CDir *dir, set<CDir*>& bounds)
+{
+  // dir must be a known subtree root; 'bounds' is overwritten with a copy
+  // of its bound set.
+  ceph_assert(subtrees.count(dir));
+  const auto it = subtrees.find(dir);
+  bounds = it->second;
+}
+
+/*
+ * Compute the bounds 'dir' has, or would have if it became a subtree root.
+ *
+ * If dir already is a subtree root, 'bounds' is overwritten with a copy of
+ * its bounds.  Otherwise every bound of the enclosing subtree that has dir
+ * among its ancestors is added to 'bounds' (existing contents are kept).
+ */
+void MDCache::get_wouldbe_subtree_bounds(CDir *dir, set<CDir*>& bounds)
+{
+  if (subtrees.count(dir)) {
+    // just copy them, dir is a subtree.
+    get_subtree_bounds(dir, bounds);
+    return;
+  }
+
+  // find them: walk each bound of the enclosing subtree up towards that
+  // subtree's root and see whether the walk passes through dir.
+  CDir *root = get_subtree_root(dir);
+  for (CDir *bound : subtrees[root]) {
+    CDir *t = bound;
+    while (t != root) {
+      t = t->get_parent_dir();
+      ceph_assert(t);
+      if (t == dir) {
+	bounds.insert(bound);
+	// Done with this bound.  (The original 'continue' here only
+	// restarted the inner loop, so the walk carried on up to root.)
+	break;
+      }
+    }
+  }
+}
+
+void MDCache::verify_subtree_bounds(CDir *dir, const set<CDir*>& bounds)
+{
+  // for debugging only: assert that the recorded bounds of subtree 'dir'
+  // equal 'bounds', logging every difference first.
+  ceph_assert(subtrees.count(dir));
+  const set<CDir*>& have = subtrees[dir];
+  if (have != bounds) {
+    dout(0) << "verify_subtree_bounds failed" << dendl;
+    // starts as the expected set; whatever is left was never recorded
+    set<CDir*> unmatched = bounds;
+    for (auto &cd : have) {
+      if (bounds.count(cd) == 0) {
+	dout(0) << "  missing bound " << *cd << dendl;
+      } else {
+	unmatched.erase(cd);
+      }
+    }
+    for (const auto &cd : unmatched) {
+      dout(0) << "    extra bound " << *cd << dendl;
+    }
+  }
+  ceph_assert(bounds == subtrees[dir]);
+}
+
+void MDCache::verify_subtree_bounds(CDir *dir, const list<dirfrag_t>& bounds)
+{
+  // for debugging only.
+  ceph_assert(subtrees.count(dir));
+
+  // every listed dirfrag that is in cache must be recorded as a bound of
+  // dir; dirfrags that are not in cache are ignored.
+  const set<CDir*>& have = subtrees[dir];
+  int failed = 0;
+  for (const auto &fg : bounds) {
+    CDir *bd = get_dirfrag(fg);
+    if (bd && have.count(bd) == 0) {
+      dout(0) << "verify_subtree_bounds failed: extra bound " << *bd << dendl;
+      failed++;
+    }
+  }
+  ceph_assert(failed == 0);
+}
+
+void MDCache::project_subtree_rename(CInode *diri, CDir *olddir, CDir *newdir)
+{
+  dout(10) << "project_subtree_rename " << *diri << " from " << *olddir
+	   << " to " << *newdir << dendl;
+  // queue the (old dir, new dir) pair for this inode, in rename order
+  auto& pending = projected_subtree_renames[diri];
+  pending.emplace_back(olddir, newdir);
+}
+
+/*
+ * Fix up the subtree map after directory inode 'diri' has moved from
+ * 'olddir' to its current parent dir.
+ *
+ * If 'pop' is set, the oldest entry queued for diri in
+ * projected_subtree_renames is checked against (olddir, newdir) and removed.
+ * Then each subtree/nested dirfrag of diri is reconciled with the old and
+ * new enclosing subtrees:
+ *  - same enclosing subtree: nothing to do;
+ *  - the dirfrag is a subtree root: it is moved from the old subtree's
+ *    bounds to the new subtree's bounds and a merge is attempted;
+ *  - otherwise (mid-subtree): bounds of the old subtree that now live under
+ *    the new one are transferred, and if the authorities differ a subtree
+ *    with the old authority is created at the dirfrag.
+ */
+void MDCache::adjust_subtree_after_rename(CInode *diri, CDir *olddir, bool pop)
+{
+  dout(10) << "adjust_subtree_after_rename " << *diri << " from " << *olddir << dendl;
+
+  CDir *newdir = diri->get_parent_dir();
+
+  if (pop) {
+    // consume the projection; it must describe exactly this rename
+    map<CInode*,list<pair<CDir*,CDir*> > >::iterator p = projected_subtree_renames.find(diri);
+    ceph_assert(p != projected_subtree_renames.end());
+    ceph_assert(!p->second.empty());
+    ceph_assert(p->second.front().first == olddir);
+    ceph_assert(p->second.front().second == newdir);
+    p->second.pop_front();
+    if (p->second.empty())
+      projected_subtree_renames.erase(p);
+  }
+
+  // adjust total auth pin of freezing subtree
+  if (olddir != newdir) {
+    auto&& dfls = diri->get_nested_dirfrags();
+    for (const auto& dir : dfls)
+      olddir->adjust_freeze_after_rename(dir);
+  }
+
+  // adjust subtree
+  // N.B. make sure subtree dirfrags are at the front of the list
+  auto dfls = diri->get_subtree_dirfrags();
+  diri->get_nested_dirfrags(dfls);
+  for (const auto& dir : dfls) {
+    dout(10) << "dirfrag " << *dir << dendl;
+    // recomputed on every iteration: earlier iterations may have changed
+    // the subtree map
+    CDir *oldparent = get_subtree_root(olddir);
+    dout(10) << " old parent " << *oldparent << dendl;
+    CDir *newparent = get_subtree_root(newdir);
+    dout(10) << " new parent " << *newparent << dendl;
+
+    auto& oldbounds = subtrees[oldparent];
+    auto& newbounds = subtrees[newparent];
+
+    if (olddir != newdir)
+      mds->balancer->adjust_pop_for_rename(olddir, dir, false);
+
+    if (oldparent == newparent) {
+      dout(10) << "parent unchanged for " << *dir << " at " << *oldparent << dendl;
+    } else if (dir->is_subtree_root()) {
+      // children are fine.  change parent.
+      dout(10) << "moving " << *dir << " from " << *oldparent << " to " << *newparent << dendl;
+      {
+        auto n = oldbounds.erase(dir);
+        ceph_assert(n == 1);
+      }
+      newbounds.insert(dir);
+      // caller is responsible for 'eval diri'
+      try_subtree_merge_at(dir, NULL, false);
+    } else {
+      // mid-subtree.
+
+      // see if any old bounds move to the new parent.
+      // (collected first so oldbounds is not modified while iterating it)
+      std::vector<CDir*> tomove;
+      for (const auto& bound : oldbounds) {
+	CDir *broot = get_subtree_root(bound->get_parent_dir());
+	if (broot != oldparent) {
+	  ceph_assert(broot == newparent);
+	  tomove.push_back(bound);
+	}
+      }
+      for (const auto& bound : tomove) {
+	dout(10) << "moving bound " << *bound << " from " << *oldparent << " to " << *newparent << dendl;
+	oldbounds.erase(bound);
+	newbounds.insert(bound);
+      }	   
+
+      // did auth change?
+      if (oldparent->authority() != newparent->authority()) {
+	adjust_subtree_auth(dir, oldparent->authority(), false);
+	// caller is responsible for 'eval diri'
+	try_subtree_merge_at(dir, NULL, false);
+      }
+    }
+
+    if (olddir != newdir)
+      mds->balancer->adjust_pop_for_rename(newdir, dir, true);
+  }
+
+  show_subtrees();
+}
+
+// ===================================
+// journal and snap/cow helpers
+
+
+/*
+ * Return the first inode in snap_inode_map that sorts after (ino, follows)
+ * and still has the same ino as 'in'; if there is none, return 'in' itself.
+ * 'in' must be a head inode (last == CEPH_NOSNAP).
+ */
+CInode *MDCache::pick_inode_snap(CInode *in, snapid_t follows)
+{
+  dout(10) << "pick_inode_snap follows " << follows << " on " << *in << dendl;
+  ceph_assert(in->last == CEPH_NOSNAP);
+
+  const auto it = snap_inode_map.upper_bound(vinodeno_t(in->ino(), follows));
+  if (it == snap_inode_map.end() || it->second->ino() != in->ino())
+    return in;
+
+  CInode *snapped = it->second;
+  dout(10) << "pick_inode_snap found " << *snapped << dendl;
+  return snapped;
+}
+
+
+/*
+ * Clone 'in' into a new CInode covering [in->first, last], advance
+ * in->first to last+1, add the clone to the cache and return it.  The
+ * clone is built from in's previous projected inode and xattrs.
+ *
+ * If 'in' is itself a snapped inode (in->last != CEPH_NOSNAP), the pending
+ * snapflush state is split between the clone and 'in'.  Otherwise, clients
+ * holding (or owing a flush for) write caps are recorded on the clone as
+ * needing a snapflush and the clone's locks are put in LOCK_SNAP_SYNC.
+ *
+ * note: i'm currently cheating wrt dirty and inode.version on cow
+ * items.  instead of doing a full dir predirty, i just take the
+ * original item's version, and set the dirty flag (via
+ * mutation::add_cow_{inode,dentry}() and mutation::apply()).  that
+ * means a special case in the dir commit clean sweep assertions.
+ * bah.
+ */
+CInode *MDCache::cow_inode(CInode *in, snapid_t last)
+{
+  ceph_assert(last >= in->first);
+
+  // build the clone from the state before the current projection
+  CInode *oldin = new CInode(this, true, in->first, last);
+  auto _inode = CInode::allocate_inode(*in->get_previous_projected_inode());
+  _inode->trim_client_ranges(last);
+  oldin->reset_inode(std::move(_inode));
+  auto _xattrs = in->get_previous_projected_xattrs();
+  oldin->reset_xattrs(std::move(_xattrs));
+
+  oldin->symlink = in->symlink;
+
+  if (in->first < in->oldest_snap)
+    in->oldest_snap = in->first;
+
+  // 'in' now only covers the snaps after the clone
+  in->first = last+1;
+
+  dout(10) << "cow_inode " << *in << " to " << *oldin << dendl;
+  add_inode(oldin);
+
+  if (in->last != CEPH_NOSNAP) {
+    // 'in' is a snapped inode: divide the need-snapflush bookkeeping kept
+    // on the head inode between oldin and in.
+    CInode *head_in = get_inode(in->ino());
+    ceph_assert(head_in);
+    auto ret = head_in->split_need_snapflush(oldin, in);
+    if (ret.first) {
+      // oldin takes a copy of the snap caps and a wrlock on every lock
+      oldin->client_snap_caps = in->client_snap_caps;
+      if (!oldin->client_snap_caps.empty()) {
+	for (int i = 0; i < num_cinode_locks; i++) {
+	  SimpleLock *lock = oldin->get_lock(cinode_lock_info[i].lock);
+	  ceph_assert(lock);
+	  if (lock->get_state() != LOCK_SNAP_SYNC) {
+	    ceph_assert(lock->is_stable());
+	    lock->set_state(LOCK_SNAP_SYNC);  // gathering
+	    oldin->auth_pin(lock);
+	  }
+	  lock->get_wrlock(true);
+	}
+      }
+    }
+    if (!ret.second) {
+      // 'in' gives up its snap caps: drop them and release the wrlocks
+      // that were held on their behalf
+      auto client_snap_caps = std::move(in->client_snap_caps);
+      in->client_snap_caps.clear();
+      in->item_open_file.remove_myself();
+      in->item_caps.remove_myself();
+
+      if (!client_snap_caps.empty()) {
+	MDSContext::vec finished;
+	for (int i = 0; i < num_cinode_locks; i++) {
+	  SimpleLock *lock = in->get_lock(cinode_lock_info[i].lock);
+	  ceph_assert(lock);
+	  ceph_assert(lock->get_state() == LOCK_SNAP_SYNC); // gathering
+	  lock->put_wrlock();
+	  if (!lock->get_num_wrlocks()) {
+	    // last wrlock gone: back to SYNC and wake the waiters
+	    lock->set_state(LOCK_SYNC);
+	    lock->take_waiting(SimpleLock::WAIT_STABLE|SimpleLock::WAIT_RD, finished);
+	    in->auth_unpin(lock);
+	  }
+	}
+	mds->queue_waiters(finished);
+      }
+    }
+    return oldin;
+  }
+
+  // 'in' is the head inode
+  if (!in->client_caps.empty()) {
+    const set<snapid_t>& snaps = in->find_snaprealm()->get_snaps();
+    // clone caps?
+    for (auto &p : in->client_caps) {
+      client_t client = p.first;
+      Capability *cap = &p.second;
+      // a cap that still owes a snapflush is treated as holding all write caps
+      int issued = cap->need_snapflush() ? CEPH_CAP_ANY_WR : cap->issued();
+      if ((issued & CEPH_CAP_ANY_WR) &&
+	  cap->client_follows < last) {
+	dout(10) << " client." << client << " cap " << ccap_string(issued) << dendl;
+	oldin->client_snap_caps.insert(client);
+	cap->client_follows = last;
+
+	// we need snapflushes for any intervening snaps
+	dout(10) << "  snaps " << snaps << dendl;
+	for (auto q = snaps.lower_bound(oldin->first);
+	     q != snaps.end() && *q <= last;
+	     ++q) {
+	  in->add_need_snapflush(oldin, *q, client);
+	}
+      } else {
+	dout(10) << " ignoring client." << client << " cap follows " << cap->client_follows << dendl;
+      }
+    }
+
+    if (!oldin->client_snap_caps.empty()) {
+      // hold a wrlock on each of oldin's locks while snapflushes are pending
+      for (int i = 0; i < num_cinode_locks; i++) {
+	SimpleLock *lock = oldin->get_lock(cinode_lock_info[i].lock);
+	ceph_assert(lock);
+	if (lock->get_state() != LOCK_SNAP_SYNC) {
+	  ceph_assert(lock->is_stable());
+	  lock->set_state(LOCK_SNAP_SYNC);  // gathering
+	  oldin->auth_pin(lock);
+	}
+	lock->get_wrlock(true);
+      }
+    }
+  }
+  return oldin;
+}
+
+/*
+ * Copy-on-write dentry 'dn' (and its primary inode, if any) for snapshots
+ * up to 'follows', recording the preserved old versions in 'metablob' and
+ * 'mut'.
+ *
+ * mut         mutation that will own the cow'd dentries/inodes
+ * metablob    journal blob the old dentries are added to
+ * dn          dentry to cow; must be auth.  A null pointer is a no-op.
+ * follows     newest snapid the old version must cover; CEPH_NOSNAP means
+ *             "use the global snaprealm's newest seq"
+ * pcow_inode  if non-null, receives the cloned inode when one is created
+ *             by cow_inode()
+ * dnl         linkage to use; defaults to dn's projected linkage.  Must not
+ *             be null linkage.
+ *
+ * Multiversion inodes (and frozen ambiguous-auth inodes) are handled with
+ * CInode::cow_old_inode(); everything else gets a separate old dentry (and
+ * inode clone) covering [oldfirst, follows].
+ */
+void MDCache::journal_cow_dentry(MutationImpl *mut, EMetaBlob *metablob,
+                                 CDentry *dn, snapid_t follows,
+				 CInode **pcow_inode, CDentry::linkage_t *dnl)
+{
+  if (!dn) {
+    dout(10) << "journal_cow_dentry got null CDentry, returning" << dendl;
+    return;
+  }
+  dout(10) << "journal_cow_dentry follows " << follows << " on " << *dn << dendl;
+  ceph_assert(dn->is_auth());
+
+  // nothing to cow on a null dentry, fix caller
+  if (!dnl)
+    dnl = dn->get_projected_linkage();
+  ceph_assert(!dnl->is_null());
+
+  // only a primary linkage gives us an inode to cow
+  CInode *in = dnl->is_primary() ? dnl->get_inode() : NULL;
+  bool cow_head = false;
+  if (in && in->state_test(CInode::STATE_AMBIGUOUSAUTH)) {
+    ceph_assert(in->is_frozen_inode());
+    cow_head = true;
+  }
+  if (in && (in->is_multiversion() || cow_head)) {
+    // multiversion inode.
+    SnapRealm *realm = NULL;
+
+    if (in->get_projected_parent_dn() != dn) {
+      // dn is not the inode's projected parent dentry
+      ceph_assert(follows == CEPH_NOSNAP);
+      realm = dn->dir->inode->find_snaprealm();
+      snapid_t dir_follows = get_global_snaprealm()->get_newest_seq();
+      ceph_assert(dir_follows >= realm->get_newest_seq());
+
+      if (dir_follows+1 > dn->first) {
+	snapid_t oldfirst = dn->first;
+	dn->first = dir_follows+1;
+	if (realm->has_snaps_in_range(oldfirst, dir_follows)) {
+	  // preserve the old range as a remote dentry pointing at the inode
+	  CDir *dir = dn->dir;
+	  CDentry *olddn = dir->add_remote_dentry(dn->get_name(), in->ino(), in->d_type(), dn->alternate_name, oldfirst, dir_follows);
+	  dout(10) << " olddn " << *olddn << dendl;
+	  ceph_assert(dir->is_projected());
+	  olddn->set_projected_version(dir->get_projected_version());
+	  metablob->add_remote_dentry(olddn, true);
+	  mut->add_cow_dentry(olddn);
+	  // FIXME: adjust link count here?  hmm.
+
+	  if (dir_follows+1 > in->first)
+	    in->cow_old_inode(dir_follows, cow_head);
+	}
+      }
+
+      follows = dir_follows;
+      if (in->snaprealm) {
+	realm = in->snaprealm;
+	ceph_assert(follows >= realm->get_newest_seq());
+      }
+    } else {
+      realm = in->find_snaprealm();
+      if (follows == CEPH_NOSNAP) {
+	follows = get_global_snaprealm()->get_newest_seq();
+	ceph_assert(follows >= realm->get_newest_seq());
+      }
+    }
+
+    // already cloned?
+    if (follows < in->first) {
+      dout(10) << "journal_cow_dentry follows " << follows << " < first on " << *in << dendl;
+      return;
+    }
+
+    // no snapshot in [first, follows]: nothing to preserve, just advance first
+    if (!realm->has_snaps_in_range(in->first, follows)) {
+      dout(10) << "journal_cow_dentry no snapshot follows " << follows << " on " << *in << dendl;
+      in->first = follows + 1;
+      return;
+    }
+
+    in->cow_old_inode(follows, cow_head);
+
+  } else {
+    // not multiversion: the old version gets its own dentry (and inode)
+    SnapRealm *realm = dn->dir->inode->find_snaprealm();
+    if (follows == CEPH_NOSNAP) {
+      follows = get_global_snaprealm()->get_newest_seq();
+      ceph_assert(follows >= realm->get_newest_seq());
+    }
+
+    // already cloned?
+    if (follows < dn->first) {
+      dout(10) << "journal_cow_dentry follows " << follows << " < first on " << *dn << dendl;
+      return;
+    }
+
+    // update dn.first before adding old dentry to cdir's map
+    snapid_t oldfirst = dn->first;
+    dn->first = follows+1;
+
+    // no snapshot in [oldfirst, follows]: nothing to preserve
+    if (!realm->has_snaps_in_range(oldfirst, follows)) {
+      dout(10) << "journal_cow_dentry no snapshot follows " << follows << " on " << *dn << dendl;
+      if (in)
+	in->first = follows+1;
+      return;
+    }
+    
+    dout(10) << "    dn " << *dn << dendl;
+    CDir *dir = dn->get_dir();
+    ceph_assert(dir->is_projected());
+
+    if (in) {
+      // primary link: clone the inode and link the clone under an old dentry
+      CInode *oldin = cow_inode(in, follows);
+      ceph_assert(in->is_projected());
+      mut->add_cow_inode(oldin);
+      if (pcow_inode)
+	*pcow_inode = oldin;
+      CDentry *olddn = dir->add_primary_dentry(dn->get_name(), oldin, dn->alternate_name, oldfirst, follows);
+      dout(10) << " olddn " << *olddn << dendl;
+      // clients still owe a snapflush for the clone
+      bool need_snapflush = !oldin->client_snap_caps.empty();
+      if (need_snapflush) {
+	mut->ls->open_files.push_back(&oldin->item_open_file);
+	mds->locker->mark_need_snapflush_inode(oldin);
+      }
+      olddn->set_projected_version(dir->get_projected_version());
+      metablob->add_primary_dentry(olddn, 0, true, false, false, need_snapflush);
+      mut->add_cow_dentry(olddn);
+    } else {
+      // remote link: only the dentry needs to be preserved
+      ceph_assert(dnl->is_remote());
+      CDentry *olddn = dir->add_remote_dentry(dn->get_name(), dnl->get_remote_ino(), dnl->get_remote_d_type(), dn->alternate_name, oldfirst, follows);
+      dout(10) << " olddn " << *olddn << dendl;
+
+      olddn->set_projected_version(dir->get_projected_version());
+      metablob->add_remote_dentry(olddn, true);
+      mut->add_cow_dentry(olddn);
+    }
+  }
+}
+
+void MDCache::journal_dirty_inode(MutationImpl *mut, EMetaBlob *metablob, CInode *in, snapid_t follows)
+{
+  // Journal a dirty inode into 'metablob', cow'ing its projected parent
+  // dentry first when that dentry is not null.
+
+  // base inodes are journaled as roots
+  if (in->is_base()) {
+    metablob->add_root(true, in);
+    return;
+  }
+
+  // for a snapped inode with no explicit 'follows', use the snap just
+  // before its first
+  if (follows == CEPH_NOSNAP && in->last != CEPH_NOSNAP)
+    follows = in->first - 1;
+
+  CDentry *dn = in->get_projected_parent_dn();
+  const bool dn_is_null = dn->get_projected_linkage()->is_null();
+  if (!dn_is_null)  // no need to cow a null dentry
+    journal_cow_dentry(mut, metablob, dn, follows);
+
+  if (!in->get_projected_inode()->is_backtrace_updated()) {
+    metablob->add_primary_dentry(dn, in, true);
+    return;
+  }
+
+  // backtrace changed: also tell the journal whether the pool changed
+  const bool dirty_pool = in->get_projected_inode()->layout.pool_id !=
+			  in->get_previous_projected_inode()->layout.pool_id;
+  metablob->add_primary_dentry(dn, in, true, true, dirty_pool);
+}
+
+
+
+// nested ---------------------------------------------------------------
+
+/*
+ * Project the rstat change of inode 'cur' into its parent dirfrag
+ * 'parent'.
+ *
+ * mut         mutation; used to tell whether cur is projected by it
+ * cur         inode whose rstat is being propagated
+ * parent      dirfrag receiving the delta
+ * first       lower snapid of the update; raised to cur->first if smaller
+ * linkunlink  0: apply rstat - accounted_rstat; <0: subtract accounted
+ *             rstat (unlink); >0: add rstat (link)
+ * prealm      snap realm to consult; looked up from parent's inode if null
+ *
+ * The head/current version is projected over [max(first, floor), cur->last]
+ * where floor is the parent dentry's first.  When mds_snap_rstat is set,
+ * each entry of cur->dirty_old_rstats is projected too.  dirty_old_rstats
+ * is cleared on the way out, except when a snapped inode returns early
+ * because no snap falls in its range.
+ */
+void MDCache::project_rstat_inode_to_frag(const MutationRef& mut,
+					  CInode *cur, CDir *parent, snapid_t first,
+					  int linkunlink, SnapRealm *prealm)
+{
+  CDentry *parentdn = cur->get_projected_parent_dn();
+
+  if (cur->first > first)
+    first = cur->first;
+
+  dout(10) << "projected_rstat_inode_to_frag first " << first << " linkunlink " << linkunlink
+	   << " " << *cur << dendl;
+  dout(20) << "    frag head is [" << parent->first << ",head] " << dendl;
+  dout(20) << " inode update is [" << first << "," << cur->last << "]" << dendl;
+
+  /*
+   * FIXME.  this incompletely propagates rstats to _old_ parents
+   * (i.e. shortly after a directory rename).  but we need full
+   * blown hard link backpointers to make this work properly...
+   */
+  snapid_t floor = parentdn->first;
+  dout(20) << " floor of " << floor << " from parent dn " << *parentdn << dendl;
+
+  if (!prealm)
+    prealm = parent->inode->find_snaprealm();
+  // Bind by const reference (as cow_inode() does) instead of copying the
+  // whole snap set on every call; the set is only read below.
+  const set<snapid_t>& snaps = prealm->get_snaps();
+
+  if (cur->last != CEPH_NOSNAP) {
+    // snapped inode: nothing to do unless some snap falls within its range
+    ceph_assert(cur->dirty_old_rstats.empty());
+    set<snapid_t>::const_iterator q = snaps.lower_bound(std::max(first, floor));
+    if (q == snaps.end() || *q > cur->last)
+      return;
+  }
+
+  if (cur->last >= floor) {
+    bool update = true;
+    if (cur->state_test(CInode::STATE_AMBIGUOUSAUTH) && cur->is_auth()) {
+      // rename src inode is not projected in the peer rename prep case. so we should
+      // avoid updating the inode.
+      ceph_assert(linkunlink < 0);
+      ceph_assert(cur->is_frozen_inode());
+      update = false;
+    }
+    // hacky
+    const CInode::mempool_inode *pi;
+    if (update && mut->is_projected(cur)) {
+      pi = cur->_get_projected_inode();
+    } else {
+      pi = cur->get_projected_inode().get();
+      if (update) {
+	// new inode
+	ceph_assert(pi->rstat == pi->accounted_rstat);
+	update = false;
+      }
+    }
+    _project_rstat_inode_to_frag(pi, std::max(first, floor), cur->last, parent,
+				 linkunlink, update);
+  }
+
+  if (g_conf()->mds_snap_rstat) {
+    // propagate the dirty rstats of old (snapped) versions of the inode
+    for (const auto &p : cur->dirty_old_rstats) {
+      const auto &old = cur->get_old_inodes()->at(p);
+      snapid_t ofirst = std::max(old.first, floor);
+      auto it = snaps.lower_bound(ofirst);
+      if (it == snaps.end() || *it > p)
+	continue;
+      if (p >= floor)
+	_project_rstat_inode_to_frag(&old.inode, ofirst, p, parent, 0, false);
+    }
+  }
+  cur->dirty_old_rstats.clear();
+}
+
+
+/*
+ * Apply the rstat delta of one inode version to the rstats that dirfrag
+ * 'parent' keeps for the snapid interval [ofirst, last].
+ *
+ * inode         inode version supplying rstat / accounted_rstat
+ * ofirst, last  snapid interval the delta applies to
+ * parent        dirfrag being updated (projected fnode and dirty_old_rstat)
+ * linkunlink    0: delta = rstat - accounted_rstat; <0: delta =
+ *               -accounted_rstat; >0: delta = +rstat
+ * update_inode  if set, inode->accounted_rstat is set to inode->rstat at
+ *               the end (this casts away the const on 'inode')
+ *
+ * The interval is processed from 'last' downwards, one segment per loop
+ * iteration; segments in parent->dirty_old_rstat are split as needed so
+ * that each update covers exactly one segment.
+ */
+void MDCache::_project_rstat_inode_to_frag(const CInode::mempool_inode* inode, snapid_t ofirst, snapid_t last,
+					  CDir *parent, int linkunlink, bool update_inode)
+{
+  dout(10) << "_project_rstat_inode_to_frag [" << ofirst << "," << last << "]" << dendl;
+  dout(20) << "  inode           rstat " << inode->rstat << dendl;
+  dout(20) << "  inode accounted_rstat " << inode->accounted_rstat << dendl;
+  nest_info_t delta;
+  if (linkunlink == 0) {
+    delta.add(inode->rstat);
+    delta.sub(inode->accounted_rstat);
+  } else if (linkunlink < 0) {
+    delta.sub(inode->accounted_rstat);
+  } else {
+    delta.add(inode->rstat);
+  }
+  dout(20) << "                  delta " << delta << dendl;
+
+
+  while (last >= ofirst) {
+    /*
+     * pick fnode version to update.  at each iteration, we want to
+     * pick a segment ending in 'last' to update.  split as necessary
+     * to make that work.  then, adjust first up so that we only
+     * update one segment at a time.  then loop to cover the whole
+     * [ofirst,last] interval.
+     */    
+    nest_info_t *prstat;
+    snapid_t first;
+    auto pf = parent->_get_projected_fnode();
+    if (last == CEPH_NOSNAP) {
+      // case 1: the segment ends at head; update the projected fnode
+      if (g_conf()->mds_snap_rstat)
+	first = std::max(ofirst, parent->first);
+      else
+	first = parent->first;
+      prstat = &pf->rstat;
+      dout(20) << " projecting to head [" << first << "," << last << "] " << *prstat << dendl;
+
+      if (first > parent->first &&
+	  !(pf->rstat == pf->accounted_rstat)) {
+	dout(10) << "  target snapped and not fully accounted, cow to dirty_old_rstat ["
+		 << parent->first << "," << (first-1) << "] "
+		 << " " << *prstat << "/" << pf->accounted_rstat
+		 << dendl;
+	parent->dirty_old_rstat[first-1].first = parent->first;
+	parent->dirty_old_rstat[first-1].rstat = pf->rstat;
+	parent->dirty_old_rstat[first-1].accounted_rstat = pf->accounted_rstat;
+      }
+      parent->first = first;
+    } else if (!g_conf()->mds_snap_rstat) {
+      // drop snapshots' rstats
+      break;
+    } else if (last >= parent->first) {
+      // case 2: a snapped segment overlapping the head range; split a new
+      // dirty_old_rstat entry [parent->first, last] off the projected fnode
+      first = parent->first;
+      parent->dirty_old_rstat[last].first = first;
+      parent->dirty_old_rstat[last].rstat = pf->rstat;
+      parent->dirty_old_rstat[last].accounted_rstat = pf->accounted_rstat;
+      prstat = &parent->dirty_old_rstat[last].rstat;
+      dout(10) << " projecting to newly split dirty_old_fnode [" << first << "," << last << "] "
+	       << " " << *prstat << "/" << pf->accounted_rstat << dendl;
+    } else {
+      // case 3: the segment lies entirely before parent->first
+      // be careful, dirty_old_rstat is a _sparse_ map.
+      // sorry, this is ugly.
+      first = ofirst;
+
+      // find any intersection with last
+      // (entries are keyed by their last snapid; 'first' is stored in the value)
+      auto it = parent->dirty_old_rstat.lower_bound(last);
+      if (it == parent->dirty_old_rstat.end()) {
+	dout(20) << "  no dirty_old_rstat with last >= last " << last << dendl;
+	if (!parent->dirty_old_rstat.empty() && parent->dirty_old_rstat.rbegin()->first >= first) {
+	  dout(20) << "  last dirty_old_rstat ends at " << parent->dirty_old_rstat.rbegin()->first << dendl;
+	  first = parent->dirty_old_rstat.rbegin()->first+1;
+	}
+      } else {
+	// *it last is >= last
+	if (it->second.first <= last) {
+	  // *it intersects [first,last]
+	  if (it->second.first < first) {
+	    dout(10) << " splitting off left bit [" << it->second.first << "," << first-1 << "]" << dendl;
+	    parent->dirty_old_rstat[first-1] = it->second;
+	    it->second.first = first;
+	  }
+	  if (it->second.first > first)
+	    first = it->second.first;
+	  if (last < it->first) {
+	    dout(10) << " splitting off right bit [" << last+1 << "," << it->first << "]" << dendl;
+	    parent->dirty_old_rstat[last] = it->second;
+	    it->second.first = last+1;
+	  }
+	} else {
+	  // *it is to the _right_ of [first,last]
+	  it = parent->dirty_old_rstat.lower_bound(first);
+	  // new *it last is >= first
+	  if (it->second.first <= last &&  // new *it isn't also to the right, and
+	      it->first >= first) {        // it intersects our first bit,
+	    dout(10) << " staying to the right of [" << it->second.first << "," << it->first << "]..." << dendl;
+	    first = it->first+1;
+	  }
+	  dout(10) << " projecting to new dirty_old_rstat [" << first << "," << last << "]" << dendl;
+	}
+      }
+      dout(20) << " projecting to dirty_old_rstat [" << first << "," << last << "]" << dendl;
+      parent->dirty_old_rstat[last].first = first;
+      prstat = &parent->dirty_old_rstat[last].rstat;
+    }
+    
+    // apply
+    dout(20) << "  project to [" << first << "," << last << "] " << *prstat << dendl;
+    ceph_assert(last >= first);
+    prstat->add(delta);
+    dout(20) << "      result [" << first << "," << last << "] " << *prstat << " " << *parent << dendl;
+
+    // continue with whatever remains below the segment just updated
+    last = first-1;
+  }
+
+  if (update_inode) {
+    // the delta has been handed to the frag; mark the inode's rstat accounted
+    auto _inode = const_cast<CInode::mempool_inode*>(inode);
+    _inode->accounted_rstat = _inode->rstat;
+  }
+}
+
+/*
+ * Apply a dirfrag's rstat delta (rstat - accounted_rstat) to inode 'pin'
+ * for the snapid interval [ofirst, last].
+ *
+ * rstat, accounted_rstat  the frag's current and already-accounted rstats
+ * ofirst, last            snapid interval the delta applies to
+ * pin                     inode to update
+ * cow_head                passed through to CInode::cow_old_inode()
+ *
+ * The interval is processed from 'last' downwards.  The segment ending at
+ * pin->last goes to the projected inode; older segments go to a private
+ * copy of pin's old_inodes map, which is split as needed and installed
+ * with reset_old_inodes() at the end.  Old versions that were touched are
+ * recorded in pin->dirty_old_rstats.
+ */
+void MDCache::project_rstat_frag_to_inode(const nest_info_t& rstat,
+					  const nest_info_t& accounted_rstat,
+					  snapid_t ofirst, snapid_t last, 
+					  CInode *pin, bool cow_head)
+{
+  dout(10) << "project_rstat_frag_to_inode [" << ofirst << "," << last << "]" << dendl;
+  dout(20) << "  frag           rstat " << rstat << dendl;
+  dout(20) << "  frag accounted_rstat " << accounted_rstat << dendl;
+  nest_info_t delta = rstat;
+  delta.sub(accounted_rstat);
+  dout(20) << "                 delta " << delta << dendl;
+
+  // working copy of pin's old_inodes; allocated only if an old version is touched
+  CInode::old_inode_map_ptr _old_inodes;
+  while (last >= ofirst) {
+    CInode::mempool_inode *pi;
+    snapid_t first;
+    if (last == pin->last) {
+      // the segment ends where the inode does: update the projected inode
+      pi = pin->_get_projected_inode();
+      first = std::max(ofirst, pin->first);
+      if (first > pin->first) {
+	// preserve [pin->first, first-1] as an old inode before updating
+	auto& old = pin->cow_old_inode(first-1, cow_head);
+	dout(20) << "   cloned old_inode rstat is " << old.inode.rstat << dendl;
+      }
+    } else {
+      if (!_old_inodes) {
+	_old_inodes = CInode::allocate_old_inode_map();
+	if (pin->is_any_old_inodes())
+	  *_old_inodes = *pin->get_old_inodes();
+      }
+      if (last >= pin->first) {
+	first = pin->first;
+	pin->cow_old_inode(last, cow_head);
+      } else {
+	// our life is easier here because old_inodes is not sparse
+	// (although it may not begin at snapid 1)
+	auto it = _old_inodes->lower_bound(last);
+	if (it == _old_inodes->end()) {
+	  dout(10) << " no old_inode <= " << last << ", done." << dendl;
+	  break;
+	}
+	first = it->second.first;
+	if (first > last) {
+	  dout(10) << " oldest old_inode is [" << first << "," << it->first << "], done." << dendl;
+	  //assert(p == pin->old_inodes.begin());
+	  break;
+	}
+	if (it->first > last) {
+	  dout(10) << " splitting right old_inode [" << first << "," << it->first << "] to ["
+		   << (last+1) << "," << it->first << "]" << dendl;
+	  (*_old_inodes)[last] = it->second;
+	  it->second.first = last+1;
+	  pin->dirty_old_rstats.insert(it->first);
+	}
+      }
+      if (first < ofirst) {
+	dout(10) << " splitting left old_inode [" << first << "," << last << "] to ["
+		 << first << "," << ofirst-1 << "]" << dendl;
+	(*_old_inodes)[ofirst-1] = (*_old_inodes)[last];
+	pin->dirty_old_rstats.insert(ofirst-1);
+	(*_old_inodes)[last].first = first = ofirst;
+      }
+      pi = &(*_old_inodes)[last].inode;
+      pin->dirty_old_rstats.insert(last);
+    }
+    dout(20) << " projecting to [" << first << "," << last << "] " << pi->rstat << dendl;
+    pi->rstat.add(delta);
+    dout(20) << "        result [" << first << "," << last << "] " << pi->rstat << dendl;
+
+    // continue with whatever remains below the segment just updated
+    last = first-1;
+  }
+  if (_old_inodes)
+    pin->reset_old_inodes(std::move(_old_inodes));
+}
+
+/*
+ * Send the projected rstat and quota of 'in' to the clients holding caps on
+ * it (MClientQuota), and ask every replica MDS to gather caps
+ * (MGatherCaps).
+ *
+ * in            inode whose quota state is broadcast; must be auth and not
+ *               frozen, otherwise nothing is done
+ * exclude_ct    when >= 0, caps of every *other* client are updated
+ *               unconditionally, while this client's cap still goes
+ *               through the thresholds below
+ * quota_change  when set, the broadcast happens even if the quota is not
+ *               enabled
+ *
+ * Nothing is done unless the MDS is active or stopping.  A cap is only
+ * updated when the values changed since the last message sent on it and
+ * the usage is at/near a limit or has moved noticeably towards it.
+ */
+void MDCache::broadcast_quota_to_client(CInode *in, client_t exclude_ct, bool quota_change)
+{
+  if (!(mds->is_active() || mds->is_stopping()))
+    return;
+
+  if (!in->is_auth() || in->is_frozen())
+    return;
+
+  const auto& pi = in->get_projected_inode();
+  if (!pi->quota.is_enable() && !quota_change)
+    return;
+
+  // create snaprealm for quota inode (quota was set before mimic)
+  if (!in->get_projected_srnode())
+    mds->server->create_quota_realm(in);
+
+  for (auto &p : in->client_caps) {
+    Capability *cap = &p.second;
+    if (cap->is_noquota())
+      continue;
+
+    // a valid exclude_ct forces an update for every other client
+    if (exclude_ct >= 0 && exclude_ct != p.first)
+      goto update;
+
+    // nothing changed since the last message on this cap
+    if (cap->last_rbytes == pi->rstat.rbytes &&
+        cap->last_rsize == pi->rstat.rsize())
+      continue;
+
+    if (pi->quota.max_files > 0) {
+      // at or over the file limit
+      if (pi->rstat.rsize() >= pi->quota.max_files)
+        goto update;
+
+      // rsize moved by more than 1/16 of the distance between the last
+      // reported value and the limit
+      if ((abs(cap->last_rsize - pi->quota.max_files) >> 4) <
+          abs(cap->last_rsize - pi->rstat.rsize()))
+        goto update;
+    }
+
+    if (pi->quota.max_bytes > 0) {
+      // within the last 1/8 of the byte limit
+      if (pi->rstat.rbytes > pi->quota.max_bytes - (pi->quota.max_bytes >> 3))
+        goto update;
+
+      // rbytes moved by more than 1/16 of the distance between the last
+      // reported value and the limit
+      if ((abs(cap->last_rbytes - pi->quota.max_bytes) >> 4) <
+          abs(cap->last_rbytes - pi->rstat.rbytes))
+        goto update;
+    }
+
+    continue;
+
+update:
+    // remember what this client was told, then tell it
+    cap->last_rsize = pi->rstat.rsize();
+    cap->last_rbytes = pi->rstat.rbytes;
+
+    auto msg = make_message<MClientQuota>();
+    msg->ino = in->ino();
+    msg->rstat = pi->rstat;
+    msg->quota = pi->quota;
+    mds->send_message_client_counted(msg, cap->get_session());
+  }
+  // one MGatherCaps per MDS holding a replica of the inode
+  for (const auto &it : in->get_replicas()) {
+    auto msg = make_message<MGatherCaps>();
+    msg->ino = in->ino();
+    mds->send_message_mds(msg, it.first);
+  }
+}
+
+/*
+ * NOTE: we _have_ to delay the scatter if we are called during a
+ * rejoin, because we can't twiddle locks between when the
+ * rejoin_(weak|strong) is received and when we send the rejoin_ack.
+ * normally, this isn't a problem: a recover mds doesn't twiddle locks
+ * (no requests), and a survivor acks immediately.  _except_ that
+ * during rejoin_(weak|strong) processing, we may complete a lock
+ * gather, and do a scatter_writebehind.. and we _can't_ twiddle the
+ * scatterlock state in that case or the lock states will get out of
+ * sync between the auth and replica.
+ *
+ * the simple solution is to never do the scatter here.  instead, put
+ * the scatterlock on a list if it isn't already wrlockable.  this is
+ * probably the best plan anyway, since we avoid too many
+ * scatters/locks under normal usage.
+ */
+/*
+ * some notes on dirlock/nestlock scatterlock semantics:
+ *
+ * the fragstat (dirlock) will never be updated without
+ * dirlock+nestlock wrlock held by the caller.
+ *
+ * the rstat (nestlock) _may_ get updated without a wrlock when nested
+ * data is pushed up the tree.  this could be changed with some
+ * restructuring here, but in its current form we ensure that the
+ * fragstat+rstat _always_ reflect an accurate summation over the dir
+ * frag, which is nice.  and, we only need to track frags that need to
+ * be nudged (and not inodes with pending rstat changes that need to
+ * be pushed into the frag).  a consequence of this is that the
+ * accounted_rstat on scatterlock sync may not match our current
+ * rstat.  this is normal and expected.
+ */
+/*
+ * Walk from 'in' up through its ancestor dirfrags and directory inodes,
+ * projecting fragstat/rstat changes as far as locks and auth allow, and
+ * journal the result into 'blob'.
+ *
+ *  mut        - mutation that collects auth pins, wrlocks and projections
+ *  blob       - metablob that receives the dir context, the last dirfrag
+ *               reached and every directory inode dirtied on the way up
+ *  in         - inode the change originates from; nothing is propagated
+ *               when it is a base inode
+ *  parent     - dirfrag to start from; when null the projected parent
+ *               dentry's dir is used (requires PREDIRTY_PRIMARY)
+ *  flags      - PREDIRTY_PRIMARY / PREDIRTY_DIR / PREDIRTY_SHALLOW bits
+ *               (SHALLOW is only logged in this function)
+ *  linkunlink - delta applied to nsubdirs/nfiles of the first dirfrag
+ *  cfollows   - snapid the change follows; CEPH_NOSNAP means "use the
+ *               newest seq of the parent's snaprealm"
+ *
+ * The log entry must already be open (asserted).
+ */
+void MDCache::predirty_journal_parents(MutationRef mut, EMetaBlob *blob,
+				       CInode *in, CDir *parent,
+				       int flags, int linkunlink,
+				       snapid_t cfollows)
+{
+  bool primary_dn = flags & PREDIRTY_PRIMARY;
+  bool do_parent_mtime = flags & PREDIRTY_DIR;
+  bool shallow = flags & PREDIRTY_SHALLOW;
+
+  ceph_assert(mds->mdlog->entry_is_open());
+
+  // make sure stamp is set
+  if (mut->get_mds_stamp() == utime_t())
+    mut->set_mds_stamp(ceph_clock_now());
+
+  if (in->is_base())
+    return;
+
+  dout(10) << "predirty_journal_parents"
+	   << (do_parent_mtime ? " do_parent_mtime":"")
+	   << " linkunlink=" <<  linkunlink
+	   << (primary_dn ? " primary_dn":" remote_dn")
+	   << (shallow ? " SHALLOW":"")
+	   << " follows " << cfollows
+	   << " " << *in << dendl;
+
+  if (!parent) {
+    ceph_assert(primary_dn);
+    parent = in->get_projected_parent_dn()->get_dir();
+  }
+
+  if (flags == 0 && linkunlink == 0) {
+    dout(10) << " no flags/linkunlink, just adding dir context to blob(s)" << dendl;
+    blob->add_dir_context(parent);
+    return;
+  }
+
+  // build list of inodes to wrlock, dirty, and update
+  list<CInode*> lsi;
+  CInode *cur = in;
+  CDentry *parentdn = nullptr;
+  bool first = true;  // true only while handling the immediate parent
+  while (parent) {
+    //assert(cur->is_auth() || !primary_dn);  // this breaks the rename auth twiddle hack
+    ceph_assert(parent->is_auth());
+    
+    // opportunistically adjust parent dirfrag
+    CInode *pin = parent->get_inode();
+
+    // inode -> dirfrag
+    mut->auth_pin(parent);
+
+    auto pf = parent->project_fnode(mut);
+    pf->version = parent->pre_dirty();
+
+    if (do_parent_mtime || linkunlink) {
+      ceph_assert(mut->is_wrlocked(&pin->filelock));
+      ceph_assert(mut->is_wrlocked(&pin->nestlock));
+      ceph_assert(cfollows == CEPH_NOSNAP);
+      
+      // update stale fragstat/rstat?
+      parent->resync_accounted_fragstat();
+      parent->resync_accounted_rstat();
+
+      if (do_parent_mtime) {
+	pf->fragstat.mtime = mut->get_op_stamp();
+	pf->fragstat.change_attr++;
+	// log the dirfrag itself (not its address), like the other messages here
+	dout(10) << "predirty_journal_parents bumping fragstat change_attr to " << pf->fragstat.change_attr << " on " << *parent << dendl;
+	if (pf->fragstat.mtime > pf->rstat.rctime) {
+	  dout(10) << "predirty_journal_parents updating mtime on " << *parent << dendl;
+	  pf->rstat.rctime = pf->fragstat.mtime;
+	} else {
+	  dout(10) << "predirty_journal_parents updating mtime UNDERWATER on " << *parent << dendl;
+	}
+      }
+      if (linkunlink) {
+	dout(10) << "predirty_journal_parents updating size on " << *parent << dendl;
+	if (in->is_dir()) {
+	  pf->fragstat.nsubdirs += linkunlink;
+	  //pf->rstat.rsubdirs += linkunlink;
+	} else {
+ 	  pf->fragstat.nfiles += linkunlink;
+ 	  //pf->rstat.rfiles += linkunlink;
+	}
+      }
+    }
+
+    // rstat
+    if (!primary_dn) {
+      // don't update parent this pass
+    } else if (!linkunlink && !(pin->nestlock.can_wrlock(-1) &&
+				pin->versionlock.can_wrlock())) {
+      dout(20) << " unwritable parent nestlock " << pin->nestlock
+	<< ", marking dirty rstat on " << *cur << dendl;
+      cur->mark_dirty_rstat();
+    } else {
+      // if we don't hold a wrlock reference on this nestlock, take one,
+      // because we are about to write into the dirfrag fnode and that needs
+      // to commit before the lock can cycle.
+      if (linkunlink) {
+	ceph_assert(pin->nestlock.get_num_wrlocks() || mut->is_peer());
+      }
+
+      if (!mut->is_wrlocked(&pin->nestlock)) {
+	dout(10) << " taking wrlock on " << pin->nestlock << " on " << *pin << dendl;
+	mds->locker->wrlock_force(&pin->nestlock, mut);
+      }
+
+      // now we can project the inode rstat diff into the dirfrag
+      SnapRealm *prealm = pin->find_snaprealm();
+
+      snapid_t follows = cfollows;
+      if (follows == CEPH_NOSNAP)
+	follows = prealm->get_newest_seq();
+
+      // named so it does not shadow the 'first' loop flag above
+      snapid_t first_snap = follows+1;
+
+      // first, if the frag is stale, bring it back in sync.
+      parent->resync_accounted_rstat();
+
+      // now push inode rstats into frag
+      project_rstat_inode_to_frag(mut, cur, parent, first_snap, linkunlink, prealm);
+      cur->clear_dirty_rstat();
+    }
+
+    bool stop = false;
+    if (!pin->is_auth() || (!mut->is_auth_pinned(pin) && !pin->can_auth_pin())) {
+      dout(10) << "predirty_journal_parents !auth or ambig or can't authpin on " << *pin << dendl;
+      stop = true;
+    }
+
+    // delay propagating until later?
+    // (never for the immediate parent; only for levels further up)
+    if (!stop && !first &&
+	g_conf()->mds_dirstat_min_interval > 0) {
+      double since_last_prop = mut->get_mds_stamp() - pin->last_dirstat_prop;
+      if (since_last_prop < g_conf()->mds_dirstat_min_interval) {
+	dout(10) << "predirty_journal_parents last prop " << since_last_prop
+		 << " < " << g_conf()->mds_dirstat_min_interval
+		 << ", stopping" << dendl;
+	stop = true;
+      } else {
+	dout(10) << "predirty_journal_parents last prop " << since_last_prop << " ago, continuing" << dendl;
+      }
+    }
+
+    // can cast only because i'm passing nowait=true in the sole user
+    if (!stop &&
+	!mut->is_wrlocked(&pin->nestlock) &&
+	(!pin->versionlock.can_wrlock() ||                   // make sure we can take versionlock, too
+	 !mds->locker->wrlock_try(&pin->nestlock, mut)
+	 )) {  // ** do not initiate.. see above comment **
+      dout(10) << "predirty_journal_parents can't wrlock one of " << pin->versionlock << " or " << pin->nestlock
+	       << " on " << *pin << dendl;
+      stop = true;
+    }
+    if (stop) {
+      // leave the frag -> inode step for later: mark the scatterlock(s)
+      // updated and queue the inode on the log segment's dirty lists
+      dout(10) << "predirty_journal_parents stop.  marking nestlock on " << *pin << dendl;
+      mds->locker->mark_updated_scatterlock(&pin->nestlock);
+      mut->ls->dirty_dirfrag_nest.push_back(&pin->item_dirty_dirfrag_nest);
+      mut->add_updated_lock(&pin->nestlock);
+      if (do_parent_mtime || linkunlink) {
+	mds->locker->mark_updated_scatterlock(&pin->filelock);
+	mut->ls->dirty_dirfrag_dir.push_back(&pin->item_dirty_dirfrag_dir);
+	mut->add_updated_lock(&pin->filelock);
+      }
+      break;
+    }
+    if (!mut->is_wrlocked(&pin->versionlock))
+      mds->locker->local_wrlock_grab(&pin->versionlock, mut);
+
+    ceph_assert(mut->is_wrlocked(&pin->nestlock) || mut->is_peer());
+    
+    pin->last_dirstat_prop = mut->get_mds_stamp();
+
+    // dirfrag -> diri
+    mut->auth_pin(pin);
+    lsi.push_front(pin);
+
+    pin->pre_cow_old_inode();  // avoid cow mayhem!
+
+    auto pi = pin->project_inode(mut);
+    pi.inode->version = pin->pre_dirty();
+
+    // dirstat
+    if (do_parent_mtime || linkunlink) {
+      dout(20) << "predirty_journal_parents add_delta " << pf->fragstat << dendl;
+      dout(20) << "predirty_journal_parents         - " << pf->accounted_fragstat << dendl;
+      bool touched_mtime = false, touched_chattr = false;
+      pi.inode->dirstat.add_delta(pf->fragstat, pf->accounted_fragstat, &touched_mtime, &touched_chattr);
+      pf->accounted_fragstat = pf->fragstat;
+      if (touched_mtime)
+	pi.inode->mtime = pi.inode->ctime = pi.inode->dirstat.mtime;
+      if (touched_chattr)
+	pi.inode->change_attr++;
+      dout(20) << "predirty_journal_parents     gives " << pi.inode->dirstat << " on " << *pin << dendl;
+
+      if (parent->get_frag() == frag_t()) { // i.e., we are the only frag
+	if (pi.inode->dirstat.size() < 0)
+	  ceph_assert(!"negative dirstat size" == g_conf()->mds_verify_scatter);
+	if (pi.inode->dirstat.size() != pf->fragstat.size()) {
+	  mds->clog->error() << "unmatched fragstat size on single dirfrag "
+	     << parent->dirfrag() << ", inode has " << pi.inode->dirstat
+	     << ", dirfrag has " << pf->fragstat;
+	  
+	  // trust the dirfrag for now
+	  pi.inode->dirstat = pf->fragstat;
+
+	  ceph_assert(!"unmatched fragstat size" == g_conf()->mds_verify_scatter);
+	}
+      }
+    }
+
+    // rstat
+    dout(10) << "predirty_journal_parents frag->inode on " << *parent << dendl;
+
+    // first, if the frag is stale, bring it back in sync.
+    parent->resync_accounted_rstat();
+
+    if (g_conf()->mds_snap_rstat) {
+      for (auto &p : parent->dirty_old_rstat) {
+	project_rstat_frag_to_inode(p.second.rstat, p.second.accounted_rstat, p.second.first,
+				    p.first, pin, true);
+      }
+    }
+    parent->dirty_old_rstat.clear();
+    project_rstat_frag_to_inode(pf->rstat, pf->accounted_rstat, parent->first, CEPH_NOSNAP, pin, true);//false);
+
+    pf->accounted_rstat = pf->rstat;
+
+    if (parent->get_frag() == frag_t()) { // i.e., we are the only frag
+      if (pi.inode->rstat.rbytes != pf->rstat.rbytes) {
+	mds->clog->error() << "unmatched rstat rbytes on single dirfrag "
+	  << parent->dirfrag() << ", inode has " << pi.inode->rstat
+	  << ", dirfrag has " << pf->rstat;
+
+	// trust the dirfrag for now
+	pi.inode->rstat = pf->rstat;
+
+	ceph_assert(!"unmatched rstat rbytes" == g_conf()->mds_verify_scatter);
+      }
+    }
+
+    parent->check_rstats();
+    broadcast_quota_to_client(pin);
+    if (pin->is_base())
+      break;
+    // next parent!
+    // from here on only rstat is propagated, always via the primary dentry
+    cur = pin;
+    parentdn = pin->get_projected_parent_dn();
+    ceph_assert(parentdn);
+    parent = parentdn->get_dir();
+    linkunlink = 0;
+    do_parent_mtime = false;
+    primary_dn = true;
+    first = false;
+  }
+
+  // now, stick it in the blob
+  ceph_assert(parent);
+  ceph_assert(parent->is_auth());
+  blob->add_dir_context(parent);
+  blob->add_dir(parent, true);
+  for (const auto& in : lsi) {
+    journal_dirty_inode(mut.get(), blob, in);
+  }
+ 
+}
+
+
+
+
+
+// ===================================
+// peer requests
+
+
+/*
+ * some handlers for leader requests with peers.  we need to make
+ * sure leader journal commits before we forget we leadered them and
+ * remove them from the uncommitted_leaders map (used during recovery
+ * to commit|abort peers).
+ */
+// Log completion for an ECommitted entry: hands the request id back to
+// MDCache::_logged_leader_commit().
+struct C_MDC_CommittedLeader : public MDCacheLogContext {
+  metareqid_t reqid;
+
+  C_MDC_CommittedLeader(MDCache *cache, metareqid_t id)
+    : MDCacheLogContext(cache),
+      reqid(id)
+  {}
+
+  // the result code is not looked at
+  void finish(int r) override
+  {
+    mdcache->_logged_leader_commit(reqid);
+  }
+};
+
+// Flag the uncommitted leader entry for 'reqid' as committing and submit
+// an ECommitted event; _logged_leader_commit() runs once it is logged.
+void MDCache::log_leader_commit(metareqid_t reqid)
+{
+  dout(10) << "log_leader_commit " << reqid << dendl;
+
+  auto& leader = uncommitted_leaders[reqid];
+  leader.committing = true;
+
+  auto *commit_event = new ECommitted(reqid);
+  auto *on_logged = new C_MDC_CommittedLeader(this, reqid);
+  mds->mdlog->start_submit_entry(commit_event, on_logged);
+}
+
+// The ECommitted for 'reqid' has been logged: drop the request from its
+// log segment, wake anything waiting on it and forget the entry.
+void MDCache::_logged_leader_commit(metareqid_t reqid)
+{
+  dout(10) << "_logged_leader_commit " << reqid << dendl;
+  ceph_assert(uncommitted_leaders.count(reqid));
+
+  auto& leader = uncommitted_leaders[reqid];
+  leader.ls->uncommitted_leaders.erase(reqid);
+  mds->queue_waiters(leader.waiters);
+
+  uncommitted_leaders.erase(reqid);
+}
+
+// while active...
+
+// Peer 'from' has committed request 'r'.  Once no peers remain (and the
+// entry is not marked recovering) the leader commit is journaled.
+void MDCache::committed_leader_peer(metareqid_t r, mds_rank_t from)
+{
+  dout(10) << "committed_leader_peer mds." << from << " on " << r << dendl;
+  ceph_assert(uncommitted_leaders.count(r));
+
+  auto& leader = uncommitted_leaders[r];
+  leader.peers.erase(from);
+
+  const bool all_peers_done = !leader.recovering && leader.peers.empty();
+  if (all_peers_done)
+    log_leader_commit(r);
+}
+
+// Mark the uncommitted leader entry for 'reqid' as safe.  If the request
+// was listed in pending_leaders it is removed, and when that list
+// becomes empty the delayed resolves are processed.
+void MDCache::logged_leader_update(metareqid_t reqid)
+{
+  dout(10) << "logged_leader_update " << reqid << dendl;
+  ceph_assert(uncommitted_leaders.count(reqid));
+  uncommitted_leaders[reqid].safe = true;
+
+  auto pending = pending_leaders.find(reqid);
+  if (pending == pending_leaders.end())
+    return;
+
+  pending_leaders.erase(pending);
+  if (pending_leaders.empty())
+    process_delayed_resolve();
+}
+
+/*
+ * A leader may crash after it has received every peer's commit ack but
+ * before it journaled the final commit.  A peer may crash after it
+ * journaled its commit but before it sent the ack to the leader.
+ *
+ * So when resolve finishes, clear the recovering flag on every
+ * uncommitted leader and commit those that have no uncommitted peer
+ * left and are not already committing.
+ */
+void MDCache::finish_committed_leaders()
+{
+  for (auto& [reqid, leader] : uncommitted_leaders) {
+    leader.recovering = false;
+
+    if (leader.committing || !leader.peers.empty())
+      continue;
+
+    dout(10) << "finish_committed_leaders " << reqid << dendl;
+    log_leader_commit(reqid);
+  }
+}
+
+/*
+ * at the end of resolve we must journal a commit|abort for every peer
+ * update before moving on.
+ *
+ * that way the leader can safely journal ECommitted for the ops it
+ * leads once it reaches up:active (all other recovering nodes have to
+ * complete resolve before that happens).
+ *
+ * this context calls MDCache::_logged_peer_commit() with the stored
+ * leader rank and request id when it is finished.
+ */
+struct C_MDC_PeerCommit : public MDCacheLogContext {
+  mds_rank_t from;
+  metareqid_t reqid;
+
+  C_MDC_PeerCommit(MDCache *c, int f, metareqid_t r)
+    : MDCacheLogContext(c),
+      from(f),
+      reqid(r)
+  {}
+
+  // the result code is not looked at
+  void finish(int r) override
+  {
+    mdcache->_logged_peer_commit(from, reqid);
+  }
+};
+
+// Send an OP_COMMITTED peer request for 'reqid' to mds 'from'.
+void MDCache::_logged_peer_commit(mds_rank_t from, metareqid_t reqid)
+{
+  dout(10) << "_logged_peer_commit from mds." << from << " " << reqid << dendl;
+
+  // tell the leader that we committed
+  auto committed = make_message<MMDSPeerRequest>(reqid, 0, MMDSPeerRequest::OP_COMMITTED);
+  mds->send_message_mds(committed, from);
+}
+
+
+
+
+
+
+// ====================================================================
+// import map, recovery
+
+// Move bound 'df' from subtree 'oldparent' to subtree 'newparent' in the
+// given subtree map.  A parent that is not a key of the map is ignored;
+// only the first occurrence of 'df' is removed from the old bounds.
+void MDCache::_move_subtree_map_bound(dirfrag_t df, dirfrag_t oldparent, dirfrag_t newparent,
+				      map<dirfrag_t,vector<dirfrag_t> >& subtrees)
+{
+  if (auto from = subtrees.find(oldparent); from != subtrees.end()) {
+    auto& old_bounds = from->second;
+    dout(10) << " removing " << df << " from " << oldparent << " bounds " << old_bounds << dendl;
+    for (auto pos = old_bounds.begin(); pos != old_bounds.end(); ++pos) {
+      if (*pos != df)
+	continue;
+      old_bounds.erase(pos);
+      break;
+    }
+  }
+
+  if (auto to = subtrees.find(newparent); to != subtrees.end()) {
+    auto& new_bounds = to->second;
+    dout(10) << " adding " << df << " to " << newparent << " bounds " << new_bounds << dendl;
+    new_bounds.push_back(df);
+  }
+}
+
+/*
+ * Build an ESubtreeMap log event describing the subtrees this mds is
+ * auth for.
+ *
+ * Steps, in order:
+ *  1. start the entry in the mdlog and seed the dir set with the root
+ *     dirfrag of myin (if myin exists)
+ *  2. record every subtree whose dir_auth.first is this mds, plus its
+ *     bounds, flagging ambiguous imports
+ *  3. adjust the recorded map for projected (not yet applied) renames
+ *  4. simplify: a non-ambiguous subtree swallows any bound that is
+ *     itself a recorded, non-ambiguous subtree
+ *  5. add every collected dirfrag (with context up to the root) to the
+ *     metablob and stamp the journaler's expire position
+ *
+ * Returns the entry, which is allocated here with new.
+ */
+ESubtreeMap *MDCache::create_subtree_map() 
+{
+  dout(10) << "create_subtree_map " << num_subtrees() << " subtrees, " 
+	   << num_subtrees_fullauth() << " fullauth"
+	   << dendl;
+
+  show_subtrees();
+
+  ESubtreeMap *le = new ESubtreeMap();
+  mds->mdlog->_start_entry(le);
+  
+  // dirfrags that must end up in the metablob, keyed by dirfrag id
+  map<dirfrag_t, CDir*> dirs_to_add;
+
+  if (myin) {
+    CDir* mydir = myin->get_dirfrag(frag_t());
+    dirs_to_add[mydir->dirfrag()] = mydir;
+  }
+
+  // include all auth subtrees, and their bounds.
+  // and a spanning tree to tie it to the root.
+  for (auto& [dir, bounds] : subtrees) {
+    // journal subtree as "ours" if we are
+    //   me, -2
+    //   me, me
+    //   me, !me (may be importing and ambiguous!)
+
+    // so not
+    //   !me, *
+    if (dir->get_dir_auth().first != mds->get_nodeid())
+      continue;
+
+    if (migrator->is_ambiguous_import(dir->dirfrag()) ||
+	my_ambiguous_imports.count(dir->dirfrag())) {
+      dout(15) << " ambig subtree " << *dir << dendl;
+      le->ambiguous_subtrees.insert(dir->dirfrag());
+    } else {
+      dout(15) << " auth subtree " << *dir << dendl;
+    }
+
+    dirs_to_add[dir->dirfrag()] = dir;
+    // make sure the subtree has an (empty) bound list before filling it
+    le->subtrees[dir->dirfrag()].clear();
+
+    // bounds
+    // (individual bounds are only logged when there are at most 3)
+    size_t nbounds = bounds.size();
+    if (nbounds > 3) {
+      dout(15) << "  subtree has " << nbounds << " bounds" << dendl;
+    }
+    for (auto& bound : bounds) {
+      if (nbounds <= 3) {
+        dout(15) << "  subtree bound " << *bound << dendl;
+      }
+      dirs_to_add[bound->dirfrag()] = bound;
+      le->subtrees[dir->dirfrag()].push_back(bound->dirfrag());
+    }
+  }
+
+  // apply projected renames
+  for (const auto& [diri, renames] : projected_subtree_renames) {
+    for (const auto& [olddir, newdir] : renames) {
+      dout(15) << " adjusting for projected rename of " << *diri << " to " << *newdir << dendl;
+
+      auto&& dfls = diri->get_dirfrags();
+      for (const auto& dir : dfls) {
+	dout(15) << "dirfrag " << dir->dirfrag() << " " << *dir << dendl;
+	CDir *oldparent = get_projected_subtree_root(olddir);
+	dout(15) << " old parent " << oldparent->dirfrag() << " " << *oldparent << dendl;
+	CDir *newparent = get_projected_subtree_root(newdir);
+	dout(15) << " new parent " << newparent->dirfrag() << " " << *newparent << dendl;
+
+	if (oldparent == newparent) {
+	  dout(15) << "parent unchanged for " << dir->dirfrag() << " at "
+		   << oldparent->dirfrag() << dendl;
+	  continue;
+	}
+
+	if (dir->is_subtree_root()) {
+	  // the renamed dirfrag becomes a bound of a subtree we journal,
+	  // and authority differs across the move: include the dirfrag
+	  if (le->subtrees.count(newparent->dirfrag()) &&
+	      oldparent->get_dir_auth() != newparent->get_dir_auth())
+	    dirs_to_add[dir->dirfrag()] = dir;
+	  // children are fine.  change parent.
+	  _move_subtree_map_bound(dir->dirfrag(), oldparent->dirfrag(), newparent->dirfrag(),
+				  le->subtrees);
+	} else {
+	  // mid-subtree.
+
+	  if (oldparent->get_dir_auth() != newparent->get_dir_auth()) {
+	    dout(10) << " creating subtree for " << dir->dirfrag() << dendl;
+	    // if oldparent is auth, subtree is mine; include it.
+	    if (le->subtrees.count(oldparent->dirfrag())) {
+	      dirs_to_add[dir->dirfrag()] = dir;
+	      le->subtrees[dir->dirfrag()].clear();
+	    }
+	    // if newparent is auth, subtree is a new bound
+	    if (le->subtrees.count(newparent->dirfrag())) {
+	      dirs_to_add[dir->dirfrag()] = dir;
+	      le->subtrees[newparent->dirfrag()].push_back(dir->dirfrag());  // newparent is auth; new bound
+	    }
+	    // bounds moved below now go to the new subtree rooted at dir
+	    newparent = dir;
+	  }
+	  
+	  // see if any old bounds move to the new parent.
+	  for (auto& bound : subtrees.at(oldparent)) {
+	    if (dir->contains(bound->get_parent_dir()))
+	      _move_subtree_map_bound(bound->dirfrag(), oldparent->dirfrag(), newparent->dirfrag(),
+				      le->subtrees);
+	  }
+	}
+      }
+    }
+  }
+
+  // simplify the journaled map.  our in memory map may have more
+  // subtrees than needed due to migrations that are just getting
+  // started or just completing.  but on replay, the "live" map will
+  // be simple and we can do a straight comparison.
+  for (auto& [frag, bfrags] : le->subtrees) {
+    if (le->ambiguous_subtrees.count(frag))
+      continue;
+    // index loop: bfrags grows and shrinks while we walk it
+    unsigned i = 0;
+    while (i < bfrags.size()) {
+      dirfrag_t b = bfrags[i];
+      if (le->subtrees.count(b) &&
+	  le->ambiguous_subtrees.count(b) == 0) {
+	auto& bb = le->subtrees.at(b);
+	dout(10) << "simplify: " << frag << " swallowing " << b << " with bounds " << bb << dendl;
+	// inherit the swallowed subtree's bounds (appended, so they are
+	// examined later by this same loop)
+	for (auto& r : bb) {
+	  bfrags.push_back(r);
+        }
+	dirs_to_add.erase(b);
+	le->subtrees.erase(b);
+	// i is not advanced: the next bound slides into slot i
+	bfrags.erase(bfrags.begin() + i);
+      } else {
+	++i;
+      }
+    }
+  }
+
+  for (auto &p : dirs_to_add) {
+    CDir *dir = p.second;
+    le->metablob.add_dir_context(dir, EMetaBlob::TO_ROOT);
+    le->metablob.add_dir(dir, false);
+  }
+
+  dout(15) << " subtrees " << le->subtrees << dendl;
+  dout(15) << " ambiguous_subtrees " << le->ambiguous_subtrees << dendl;
+
+  //le->metablob.print(cout);
+  le->expire_pos = mds->mdlog->journaler->get_expire_pos();
+  return le;
+}
+
+// Dump the resolve gather sets into 'f' as a "resolve_status" object with
+// the fields "resolve_gather" and "resolve_ack_gather".
+void MDCache::dump_resolve_status(Formatter *f) const
+{
+  f->open_object_section("resolve_status");
+  f->dump_stream("resolve_gather") << resolve_gather;
+  // report the ack set under its own name (not resolve_gather a second time)
+  f->dump_stream("resolve_ack_gather") << resolve_ack_gather;
+  f->close_section();
+}
+
+// Begin the resolve stage: keep the completion context, reset the root
+// dirfrag's authority if another rank holds the root, and initialise the
+// gather set and the snapclient commit list.
+void MDCache::resolve_start(MDSContext *resolve_done_)
+{
+  dout(10) << "resolve_start" << dendl;
+  ceph_assert(!resolve_done);
+  resolve_done.reset(resolve_done_);
+
+  const bool root_is_mine = (mds->mdsmap->get_root() == mds->get_nodeid());
+  if (!root_is_mine) {
+    // if we don't have the root dir, adjust it to UNKNOWN.  during
+    // resolve we want mds0 to explicitly claim the portion of it that
+    // it owns, so that anything beyond its bounds gets left as
+    // unknown.
+    if (CDir *rootdir = root->get_dirfrag(frag_t()))
+      adjust_subtree_auth(rootdir, CDIR_AUTH_UNKNOWN);
+  }
+
+  resolve_gather = recovery_set;
+  resolve_snapclient_commits = mds->snapclient->get_journaled_tids();
+}
+
+// Send peer resolves, then subtree resolves unless something is still
+// outstanding: a snapclient sync (when no resolve_done context is set),
+// resolve acks, or rollbacks.
+void MDCache::send_resolves()
+{
+  send_peer_resolves();
+
+  if (!resolve_done) {
+    // I'm survivor: refresh snap cache
+    auto *on_synced = new MDSInternalContextWrapper(
+	mds,
+	new LambdaContext([this](int r) {
+	  maybe_finish_peer_resolve();
+	}));
+    mds->snapclient->sync(on_synced);
+    dout(10) << "send_resolves waiting for snapclient cache to sync" << dendl;
+    return;
+  }
+
+  const bool acks_outstanding = !resolve_ack_gather.empty();
+  if (acks_outstanding) {
+    dout(10) << "send_resolves still waiting for resolve ack from ("
+	     << resolve_ack_gather << ")" << dendl;
+    return;
+  }
+
+  const bool rollbacks_outstanding = !resolve_need_rollback.empty();
+  if (rollbacks_outstanding) {
+    dout(10) << "send_resolves still waiting for rollback to commit on ("
+	     << resolve_need_rollback << ")" << dendl;
+    return;
+  }
+
+  send_subtree_resolves();
+}
+
+/*
+ * Build one MMDSResolve per leader rank listing our uncommitted peer
+ * requests, send them, and add each rank to resolve_ack_gather.
+ *
+ * While in the resolve state the requests come from uncommitted_peers.
+ * Otherwise they come from active peer requests that were prepared or
+ * are committing, and whose leader is resolving or has an ambiguous
+ * peer update with us.
+ */
+void MDCache::send_peer_resolves()
+{
+  dout(10) << "send_peer_resolves" << dendl;
+
+  map<mds_rank_t, ref_t<MMDSResolve>> resolves;
+
+  if (mds->is_resolve()) {
+    for (const auto& [reqid, peer] : uncommitted_peers) {
+      auto &msg = resolves[peer.leader];
+      if (!msg)
+	msg = make_message<MMDSResolve>();
+      msg->add_peer_request(reqid, false);
+    }
+  } else {
+    set<mds_rank_t> resolve_set;
+    mds->mdsmap->get_mds_set(resolve_set, MDSMap::STATE_RESOLVE);
+
+    for (auto& [reqid, mdr] : active_requests) {
+      if (!mdr->is_peer())
+	continue;
+      if (!mdr->peer_did_prepare() && !mdr->committing)
+	continue;
+
+      mds_rank_t leader = mdr->peer_to_mds;
+      if (!resolve_set.count(leader) && !is_ambiguous_peer_update(reqid, leader))
+	continue;
+
+      dout(10) << " including uncommitted " << *mdr << dendl;
+      auto &msg = resolves[leader];
+      if (!msg)
+	msg = make_message<MMDSResolve>();
+
+      const bool resend_caps = !mdr->committing &&
+	mdr->has_more() && mdr->more()->is_inode_exporter;
+      if (!resend_caps) {
+	msg->add_peer_request(reqid, mdr->committing);
+	continue;
+      }
+
+      // re-send cap exports
+      CInode *in = mdr->more()->rename_inode;
+      map<client_t, Capability::Export> cap_map;
+      in->export_client_caps(cap_map);
+      bufferlist bl;
+      MMDSResolve::peer_inode_cap inode_caps(in->ino(), cap_map);
+      encode(inode_caps, bl);
+      msg->add_peer_request(reqid, bl);
+    }
+  }
+
+  for (auto &[rank, msg] : resolves) {
+    dout(10) << "sending peer resolve to mds." << rank << dendl;
+    mds->send_message_mds(msg, rank);
+    resolve_ack_gather.insert(rank);
+  }
+}
+
+/*
+ * Tell the other ranks of the recovery set which subtrees we claim and
+ * which of our imports are ambiguous.
+ *
+ * If the migrator still has imports or exports in flight, nothing is
+ * sent and resolves_pending is set instead.  A message is prepared for
+ * every other rank in recovery_set when we are in resolve ourselves, or
+ * otherwise only for ranks the mdsmap reports as resolving.
+ */
+void MDCache::send_subtree_resolves()
+{
+  dout(10) << "send_subtree_resolves" << dendl;
+
+  if (migrator->is_exporting() || migrator->is_importing()) {
+    dout(7) << "send_subtree_resolves waiting, imports/exports still in progress" << dendl;
+    migrator->show_importing();
+    migrator->show_exporting();
+    resolves_pending = true;
+    return;  // not now
+  }
+
+  // one (initially empty) message per recipient
+  map<mds_rank_t, ref_t<MMDSResolve>> resolves;
+  for (const auto& rank : recovery_set) {
+    if (rank == mds->get_nodeid())
+      continue;
+    if (mds->is_resolve() || mds->mdsmap->is_resolve(rank))
+      resolves[rank] = make_message<MMDSResolve>();
+  }
+
+  map<dirfrag_t, vector<dirfrag_t> > my_subtrees;
+  map<dirfrag_t, vector<dirfrag_t> > my_ambig_imports;
+
+  // known
+  for (auto& [dir, dir_bounds] : subtrees) {
+    // only our subtrees
+    if (dir->authority().first != mds->get_nodeid())
+      continue;
+
+    if (mds->is_resolve() && my_ambiguous_imports.count(dir->dirfrag()))
+      continue;  // we'll add it below
+
+    vector<dirfrag_t> dfls;
+    if (migrator->is_ambiguous_import(dir->dirfrag())) {
+      // ambiguous (mid-import)
+      set<CDir*> bounds;
+      get_subtree_bounds(dir, bounds);
+      for (CDir *bound : bounds)
+	dfls.push_back(bound->dirfrag());
+
+      my_ambig_imports[dir->dirfrag()] = dfls;
+      dout(10) << " ambig " << dir->dirfrag() << " " << dfls << dendl;
+    } else {
+      // not ambiguous.
+      for (auto &recipient : resolves)
+	recipient.second->add_subtree(dir->dirfrag());
+
+      // bounds too
+      for (CDir *bound : dir_bounds)
+	dfls.push_back(bound->dirfrag());
+
+      my_subtrees[dir->dirfrag()] = dfls;
+      dout(10) << " claim " << dir->dirfrag() << " " << dfls << dendl;
+    }
+  }
+
+  // ambiguous
+  for (const auto& [df, bounds] : my_ambiguous_imports) {
+    my_ambig_imports[df] = bounds;
+    dout(10) << " ambig " << df << " " << bounds << dendl;
+  }
+
+  // simplify the claimed subtree.
+  // a claimed subtree swallows any bound that is itself a claimed subtree
+  for (auto& [root, bounds] : my_subtrees) {
+    unsigned idx = 0;
+    while (idx < bounds.size()) {
+      dirfrag_t b = bounds[idx];
+      if (!my_subtrees.count(b)) {
+	++idx;
+	continue;
+      }
+      vector<dirfrag_t>& inner = my_subtrees[b];
+      dout(10) << " simplify: " << root << " swallowing " << b << " with bounds " << inner << dendl;
+      bounds.insert(bounds.end(), inner.begin(), inner.end());
+      my_subtrees.erase(b);
+      // idx stays put: the next bound slides into this slot
+      bounds.erase(bounds.begin() + idx);
+    }
+  }
+
+  // send
+  for (auto &[rank, m] : resolves) {
+    if (mds->is_resolve())
+      m->add_table_commits(TABLE_SNAP, resolve_snapclient_commits);
+    else
+      m->add_table_commits(TABLE_SNAP, mds->snapclient->get_journaled_tids());
+
+    m->subtrees = my_subtrees;
+    m->ambiguous_imports = my_ambig_imports;
+    dout(10) << "sending subtee resolve to mds." << rank << dendl;
+    mds->send_message_mds(m, rank);
+  }
+  resolves_pending = false;
+}
+
+// Once no resolve acks and no rollbacks are outstanding: send the subtree
+// resolves (if the snap cache is synced or a resolve_done context is
+// set) and process the delayed resolves.
+void MDCache::maybe_finish_peer_resolve() {
+  if (!resolve_ack_gather.empty() || !resolve_need_rollback.empty())
+    return;
+
+  // snap cache get synced or I'm in resolve state
+  const bool ready = mds->snapclient->is_synced() || resolve_done;
+  if (ready)
+    send_subtree_resolves();
+
+  process_delayed_resolve();
+}
+
+void MDCache::handle_mds_failure(mds_rank_t who)
+{
+  dout(7) << "handle_mds_failure mds." << who << dendl;
+  
+  dout(1) << "handle_mds_failure mds." << who << " : recovery peers are " << recovery_set << dendl;
+
+  resolve_gather.insert(who);
+  discard_delayed_resolve(who);
+  ambiguous_peer_updates.erase(who);
+
+  rejoin_gather.insert(who);
+  rejoin_sent.erase(who);        // i need to send another
+  rejoin_ack_sent.erase(who);    // i need to send another
+  rejoin_ack_gather.erase(who);  // i'll need/get another.
+
+  dout(10) << " resolve_gather " << resolve_gather << dendl;
+  dout(10) << " resolve_ack_gather " << resolve_ack_gather << dendl;
+  dout(10) << " rejoin_sent " << rejoin_sent << dendl;
+  dout(10) << " rejoin_gather " << rejoin_gather << dendl;
+  dout(10) << " rejoin_ack_gather " << rejoin_ack_gather << dendl;
+
+ 
+  // tell the migrator too.
+  migrator->handle_mds_failure_or_stop(who);
+
+  // tell the balancer too.
+  mds->balancer->handle_mds_failure(who);
+
+  // clean up any requests peer to/from this node
+  list<MDRequestRef> finish;
+  for (ceph::unordered_map<metareqid_t, MDRequestRef>::iterator p = active_requests.begin();
+       p != active_requests.end();
+       ++p) {
+    MDRequestRef& mdr = p->second;
+    // peer to the failed node?
+    if (mdr->peer_to_mds == who) {
+      if (mdr->peer_did_prepare()) {
+	dout(10) << " peer request " << *mdr << " uncommitted, will resolve shortly" << dendl;
+	if (is_ambiguous_peer_update(p->first, mdr->peer_to_mds))
+	  remove_ambiguous_peer_update(p->first, mdr->peer_to_mds);
+
+	if (!mdr->more()->waiting_on_peer.empty()) {
+	  ceph_assert(mdr->more()->srcdn_auth_mds == mds->get_nodeid());
+	  // will rollback, no need to wait
+	  mdr->reset_peer_request();
+	  mdr->more()->waiting_on_peer.clear();
+	}
+      } else if (!mdr->committing) {
+	dout(10) << " peer request " << *mdr << " has no prepare, finishing up" << dendl;
+	if (mdr->peer_request || mdr->peer_rolling_back())
+	  mdr->aborted = true;
+	else
+	  finish.push_back(mdr);
+      }
+    }
+
+    if (mdr->is_peer() && mdr->peer_did_prepare()) {
+      if (mdr->more()->waiting_on_peer.count(who)) {
+	ceph_assert(mdr->more()->srcdn_auth_mds == mds->get_nodeid());
+	dout(10) << " peer request " << *mdr << " no longer need rename notity ack from mds."
+		 << who << dendl;
+	mdr->more()->waiting_on_peer.erase(who);
+	if (mdr->more()->waiting_on_peer.empty() && mdr->peer_request)
+	  mds->queue_waiter(new C_MDS_RetryRequest(this, mdr));
+      }
+
+      if (mdr->more()->srcdn_auth_mds == who &&
+	  mds->mdsmap->is_clientreplay_or_active_or_stopping(mdr->peer_to_mds)) {
+	// rename srcdn's auth mds failed, resolve even I'm a survivor.
+	dout(10) << " peer request " << *mdr << " uncommitted, will resolve shortly" << dendl;
+	add_ambiguous_peer_update(p->first, mdr->peer_to_mds);
+      }
+    } else if (mdr->peer_request) {
+      const cref_t<MMDSPeerRequest> &peer_req = mdr->peer_request;
+      // FIXME: Peer rename request can arrive after we notice mds failure.
+      // 	This can cause mds to crash (does not affect integrity of FS).
+      if (peer_req->get_op() == MMDSPeerRequest::OP_RENAMEPREP &&
+	  peer_req->srcdn_auth == who)
+	peer_req->mark_interrupted();
+    }
+    
+    // failed node is peer?
+    if (mdr->is_leader() && !mdr->committing) {
+      if (mdr->more()->srcdn_auth_mds == who) {
+	dout(10) << " leader request " << *mdr << " waiting for rename srcdn's auth mds."
+		 << who << " to recover" << dendl;
+	ceph_assert(mdr->more()->witnessed.count(who) == 0);
+	if (mdr->more()->is_ambiguous_auth)
+	  mdr->clear_ambiguous_auth();
+	// rename srcdn's auth mds failed, all witnesses will rollback
+	mdr->more()->witnessed.clear();
+	pending_leaders.erase(p->first);
+      }
+
+      if (mdr->more()->witnessed.count(who)) {
+	mds_rank_t srcdn_auth = mdr->more()->srcdn_auth_mds;
+	if (srcdn_auth >= 0 && mdr->more()->waiting_on_peer.count(srcdn_auth)) {
+	  dout(10) << " leader request " << *mdr << " waiting for rename srcdn's auth mds."
+		   << mdr->more()->srcdn_auth_mds << " to reply" << dendl;
+	  // waiting for the peer (rename srcdn's auth mds), delay sending resolve ack
+	  // until either the request is committing or the peer also fails.
+	  ceph_assert(mdr->more()->waiting_on_peer.size() == 1);
+	  pending_leaders.insert(p->first);
+	} else {
+	  dout(10) << " leader request " << *mdr << " no longer witnessed by peer mds."
+		   << who << " to recover" << dendl;
+	  if (srcdn_auth >= 0)
+	    ceph_assert(mdr->more()->witnessed.count(srcdn_auth) == 0);
+
+	  // discard this peer's prepare (if any)
+	  mdr->more()->witnessed.erase(who);
+	}
+      }
+      
+      if (mdr->more()->waiting_on_peer.count(who)) {
+	dout(10) << " leader request " << *mdr << " waiting for peer mds." << who
+		 << " to recover" << dendl;
+	// retry request when peer recovers
+	mdr->more()->waiting_on_peer.erase(who);
+	if (mdr->more()->waiting_on_peer.empty())
+	  mds->wait_for_active_peer(who, new C_MDS_RetryRequest(this, mdr));
+      }
+
+      if (mdr->locking && mdr->locking_target_mds == who)
+	mdr->finish_locking(mdr->locking);
+    }
+  }
+
+  for (map<metareqid_t, uleader>::iterator p = uncommitted_leaders.begin();
+       p != uncommitted_leaders.end();
+       ++p) {
+    // The failed MDS may have already committed the peer update
+    if (p->second.peers.count(who)) {
+      p->second.recovering = true;
+      p->second.peers.erase(who);
+    }
+  }
+
+  while (!finish.empty()) {
+    dout(10) << "cleaning up peer request " << *finish.front() << dendl;
+    request_finish(finish.front());
+    finish.pop_front();
+  }
+
+  kick_find_ino_peers(who);
+  kick_open_ino_peers(who);
+
+  for (map<dirfrag_t,fragment_info_t>::iterator p = fragments.begin();
+       p != fragments.end(); ) {
+    dirfrag_t df = p->first;
+    fragment_info_t& info = p->second;
+
+    if (info.is_fragmenting()) {
+      if (info.notify_ack_waiting.erase(who) &&
+	  info.notify_ack_waiting.empty()) {
+	fragment_drop_locks(info);
+	fragment_maybe_finish(p++);
+      } else {
+	++p;
+      }
+      continue;
+    }
+
+    ++p;
+    dout(10) << "cancelling fragment " << df << " bit " << info.bits << dendl;
+    std::vector<CDir*> dirs;
+    info.dirs.swap(dirs);
+    fragments.erase(df);
+    fragment_unmark_unfreeze_dirs(dirs);
+  }
+
+  // MDCache::shutdown_export_strays() always exports strays to mds.0
+  if (who == mds_rank_t(0))
+    shutdown_exporting_strays.clear();
+
+  show_subtrees();  
+}
+
+/*
+ * handle_mds_recovery - called on another node's transition 
+ * from resolve -> active.
+ */
+void MDCache::handle_mds_recovery(mds_rank_t who)
+{
+  dout(7) << "handle_mds_recovery mds." << who << dendl;
+
+  // exclude all discover waiters (WAIT_DIR / WAIT_DENTRY are masked out). kick_discovers() will do the job
+  static const uint64_t i_mask = CInode::WAIT_ANY_MASK & ~CInode::WAIT_DIR;
+  static const uint64_t d_mask = CDir::WAIT_ANY_MASK & ~CDir::WAIT_DENTRY;
+
+  MDSContext::vec waiters;  // filled by the walk below, queued in one batch at the end
+
+  // wake up any waiters in their subtrees
+  for (map<CDir*,set<CDir*> >::iterator p = subtrees.begin();
+       p != subtrees.end();
+       ++p) {
+    CDir *dir = p->first;
+    // only subtrees whose first authority is 'who' and whose second authority is not this rank
+    if (dir->authority().first != who ||
+	dir->authority().second == mds->get_nodeid())
+      continue;
+    ceph_assert(!dir->is_auth());
+   
+    // wake any waiters: breadth-first walk over the dirfrags of this subtree
+    std::queue<CDir*> q;
+    q.push(dir);
+
+    while (!q.empty()) {
+      CDir *d = q.front();
+      q.pop();
+      d->take_waiting(d_mask, waiters);
+
+      // inode waiters too
+      for (auto &p : d->items) {  // note: this 'p' shadows the outer subtree iterator
+	CDentry *dn = p.second;
+	CDentry::linkage_t *dnl = dn->get_linkage();
+	if (dnl->is_primary()) {
+	  dnl->get_inode()->take_waiting(i_mask, waiters);
+	  
+	  // recurse into the inode's dirfrags, but do not cross into another subtree root
+	  auto&& ls = dnl->get_inode()->get_dirfrags();
+	  for (const auto& subdir : ls) {
+	    if (!subdir->is_subtree_root())
+	      q.push(subdir);
+	  }
+	}
+      }
+    }
+  }
+
+  kick_open_ino_peers(who);
+  kick_find_ino_peers(who);
+
+  // queue them up.
+  mds->queue_waiters(waiters);
+}
+// Replace recovery_set with a copy of 's' (logged at debug level 7).
+void MDCache::set_recovery_set(set<mds_rank_t>& s) 
+{
+  dout(7) << "set_recovery_set " << s << dendl;
+  recovery_set = s;
+}
+
+
+/*
+ * during resolve state, we share resolves to determine who
+ * is authoritative for which trees.  we expect to get a resolve
+ * from _everyone_ in the recovery_set (the mds cluster at the time of
+ * the first failure).
+ *
+ * This function puts the passed message before returning
+ */
+void MDCache::handle_resolve(const cref_t<MMDSResolve> &m)
+{
+  dout(7) << "handle_resolve from " << m->get_source() << dendl;
+  mds_rank_t from = mds_rank_t(m->get_source().num());
+  // below resolve state: retry the message later only if we are on our way to resolve
+  if (mds->get_state() < MDSMap::STATE_RESOLVE) {
+    if (mds->get_want_state() == CEPH_MDS_STATE_RESOLVE) {
+      mds->wait_for_resolve(new C_MDS_RetryMessage(mds, m));
+      return;
+    }
+    // wait until we reach the resolve stage!
+    return;
+  }
+
+  discard_delayed_resolve(from);  // drop any earlier resolve from this rank still parked in delayed_resolve
+
+  // ambiguous peer requests?
+  if (!m->peer_requests.empty()) {
+    if (mds->is_clientreplay() || mds->is_active() || mds->is_stopping()) {  // same test as 'survivor' below
+      for (auto p = m->peer_requests.begin(); p != m->peer_requests.end(); ++p) {
+	if (uncommitted_leaders.count(p->first) && !uncommitted_leaders[p->first].safe) {
+	  ceph_assert(!p->second.committing);
+	  pending_leaders.insert(p->first);
+	}
+      }
+
+      if (!pending_leaders.empty()) {
+	dout(10) << " still have pending updates, delay processing peer resolve" << dendl;
+	delayed_resolve[from] = m;
+	return;
+      }
+    }
+    // build the ack: COMMIT for requests found in uncommitted_leaders, ABORT for the rest
+    auto ack = make_message<MMDSResolveAck>();
+    for (const auto &p : m->peer_requests) {
+      if (uncommitted_leaders.count(p.first)) {  //mds->sessionmap.have_completed_request(p.first)) {
+	// COMMIT
+	if (p.second.committing) {
+	  // already committing, waiting for the OP_COMMITTED peer reply
+	  dout(10) << " already committing peer request " << p << " noop "<< dendl;
+	} else {
+	  dout(10) << " ambiguous peer request " << p << " will COMMIT" << dendl;
+	  ack->add_commit(p.first);
+	}
+	uncommitted_leaders[p.first].peers.insert(from);   // wait for peer OP_COMMITTED before we log ECommitted
+
+	if (p.second.inode_caps.length() > 0) {
+	  // peer wants to export caps (rename)
+	  ceph_assert(mds->is_resolve());
+          MMDSResolve::peer_inode_cap inode_caps;
+	  auto q = p.second.inode_caps.cbegin();
+          decode(inode_caps, q);
+	  inodeno_t ino = inode_caps.ino;
+	  map<client_t,Capability::Export> cap_exports = inode_caps.cap_exports;
+	  ceph_assert(get_inode(ino));
+
+	  for (map<client_t,Capability::Export>::iterator q = cap_exports.begin();
+	      q != cap_exports.end();
+	      ++q) {  // this 'q' shadows the decode iterator declared above
+	    Capability::Import& im = rejoin_imported_caps[from][ino][q->first];
+	    im.cap_id = ++last_cap_id; // assign a new cap ID
+	    im.issue_seq = 1;
+	    im.mseq = q->second.mseq;
+
+	    Session *session = mds->sessionmap.get_session(entity_name_t::CLIENT(q->first.v));
+	    if (session)
+	      rejoin_client_map.emplace(q->first, session->info.inst);
+	  }
+
+	  // will process these caps in rejoin stage
+	  rejoin_peer_exports[ino].first = from;
+	  rejoin_peer_exports[ino].second.swap(cap_exports);
+
+	  // send information of imported caps back to peer
+	  encode(rejoin_imported_caps[from][ino], ack->commit[p.first]);
+	}
+      } else {
+	// ABORT
+	dout(10) << " ambiguous peer request " << p << " will ABORT" << dendl;
+	ceph_assert(!p.second.committing);
+	ack->add_abort(p.first);
+      }
+    }
+    mds->send_message(ack, m->get_connection());
+    return;
+  }
+  // from here on: subtree resolve (the message carried no peer requests)
+  if (!resolve_ack_gather.empty() || !resolve_need_rollback.empty()) {
+    dout(10) << "delay processing subtree resolve" << dendl;
+    delayed_resolve[from] = m;
+    return;
+  }
+
+  bool survivor = false;
+  // am i a surviving ambiguous importer?
+  if (mds->is_clientreplay() || mds->is_active() || mds->is_stopping()) {
+    survivor = true;
+    // check for any import success/failure (from this node)
+    map<dirfrag_t, vector<dirfrag_t> >::iterator p = my_ambiguous_imports.begin();
+    while (p != my_ambiguous_imports.end()) {
+      map<dirfrag_t, vector<dirfrag_t> >::iterator next = p;
+      ++next;  // remember the successor: 'p' may be erased below
+      CDir *dir = get_dirfrag(p->first);
+      ceph_assert(dir);
+      dout(10) << "checking ambiguous import " << *dir << dendl;
+      if (migrator->is_importing(dir->dirfrag()) &&
+	  migrator->get_import_peer(dir->dirfrag()) == from) {
+	ceph_assert(migrator->get_import_state(dir->dirfrag()) == Migrator::IMPORT_ACKING);
+	
+	// check if sender claims the subtree
+	bool claimed_by_sender = false;
+	for (const auto &q : m->subtrees) {
+	  // an ambiguous import won't race with a refragmentation; it's appropriate to force here.
+	  CDir *base = get_force_dirfrag(q.first, false);
+	  if (!base || !base->contains(dir)) 
+	    continue;  // base not dir or an ancestor of dir, clearly doesn't claim dir.
+
+	  bool inside = true;
+	  set<CDir*> bounds;
+	  get_force_dirfrag_bound_set(q.second, bounds);
+	  for (set<CDir*>::iterator p = bounds.begin(); p != bounds.end(); ++p) {  // shadows the outer 'p' inside this loop only
+	    CDir *bound = *p;
+	    if (bound->contains(dir)) {
+	      inside = false;  // nope, bound is dir or parent of dir, not inside.
+	      break;
+	    }
+	  }
+	  if (inside)
+	    claimed_by_sender = true;
+	}
+
+	my_ambiguous_imports.erase(p);  // no longer ambiguous.
+	if (claimed_by_sender) {
+	  dout(7) << "ambiguous import failed on " << *dir << dendl;
+	  migrator->import_reverse(dir);
+	} else {
+	  dout(7) << "ambiguous import succeeded on " << *dir << dendl;
+	  migrator->import_finish(dir, true);
+	}
+      }
+      p = next;
+    }
+  }    
+
+  // update my dir_auth values
+  //   need to do this on recovering nodes _and_ bystanders (to resolve ambiguous
+  //   migrations between other nodes)
+  for (const auto& p : m->subtrees) {
+    dout(10) << "peer claims " << p.first << " bounds " << p.second << dendl;
+    CDir *dir = get_force_dirfrag(p.first, !survivor);
+    if (!dir)
+      continue;
+    adjust_bounded_subtree_auth(dir, p.second, from);
+    try_subtree_merge(dir);
+  }
+
+  show_subtrees();
+
+  // note ambiguous imports too
+  for (const auto& p : m->ambiguous_imports) {
+    dout(10) << "noting ambiguous import on " << p.first << " bounds " << p.second << dendl;
+    other_ambiguous_imports[from][p.first] = p.second;
+  }
+
+  // learn other mds' pending snaptable commits. later when resolve finishes, we will reload
+  // snaptable cache from snapserver. By this way, snaptable cache get synced among all mds
+  for (const auto& p : m->table_clients) {
+    dout(10) << " noting " << get_mdstable_name(p.type)
+	     << " pending_commits " << p.pending_commits << dendl;
+    MDSTableClient *client = mds->get_table_client(p.type);
+    for (const auto& q : p.pending_commits)
+      client->notify_commit(q);
+  }
+  
+  // did i get them all?
+  resolve_gather.erase(from);
+  
+  maybe_resolve_finish();
+}
+// Re-run handle_resolve() on every message that was parked in delayed_resolve.
+void MDCache::process_delayed_resolve()
+{
+  dout(10) << "process_delayed_resolve" << dendl;
+  map<mds_rank_t, cref_t<MMDSResolve>> tmp;
+  tmp.swap(delayed_resolve);  // empty the member first: handle_resolve() may store into delayed_resolve again
+  for (auto &p : tmp) {
+    handle_resolve(p.second);
+  }
+}
+// Forget any delayed resolve message stored for rank 'who' (no-op if there is none).
+void MDCache::discard_delayed_resolve(mds_rank_t who)
+{
+  delayed_resolve.erase(who);
+}
+// Finish the resolve phase once resolve_gather is empty; no resolve acks or rollbacks may be outstanding.
+void MDCache::maybe_resolve_finish()
+{
+  ceph_assert(resolve_ack_gather.empty());
+  ceph_assert(resolve_need_rollback.empty());
+
+  if (!resolve_gather.empty()) {
+    dout(10) << "maybe_resolve_finish still waiting for resolves ("
+	     << resolve_gather << ")" << dendl;
+    return;
+  }
+
+  dout(10) << "maybe_resolve_finish got all resolves+resolve_acks, done." << dendl;
+  disambiguate_my_imports();
+  finish_committed_leaders();
+
+  if (resolve_done) {  // a completion context is pending: we must be in resolve state (asserted below)
+    ceph_assert(mds->is_resolve());
+    trim_unlinked_inodes();
+    recalc_auth_bits(false);
+    resolve_done.release()->complete(0);  // release() detaches the context from resolve_done before completing it
+  } else {
+    // I am survivor.
+    maybe_send_pending_rejoins();
+  }
+}
+// Apply the commit/abort verdicts carried by a resolve ack to our peer updates for the sending rank.
+void MDCache::handle_resolve_ack(const cref_t<MMDSResolveAck> &ack)
+{
+  dout(10) << "handle_resolve_ack " << *ack << " from " << ack->get_source() << dendl;
+  mds_rank_t from = mds_rank_t(ack->get_source().num());
+  // ignore acks we are not gathering, or from a rank whose mdsmap state is below resolve
+  if (!resolve_ack_gather.count(from) ||
+      mds->mdsmap->get_state(from) < MDSMap::STATE_RESOLVE) {
+    return;
+  }
+
+  if (ambiguous_peer_updates.count(from)) {
+    ceph_assert(mds->mdsmap->is_clientreplay_or_active_or_stopping(from));
+    ceph_assert(mds->is_clientreplay() || mds->is_active() || mds->is_stopping());
+  }
+
+  for (const auto &p : ack->commit) {
+    dout(10) << " commit on peer " << p.first << dendl;
+    
+    if (ambiguous_peer_updates.count(from)) {
+      remove_ambiguous_peer_update(p.first, from);
+      continue;
+    }
+
+    if (mds->is_resolve()) {
+      // replay
+      MDPeerUpdate *su = get_uncommitted_peer(p.first, from);
+      ceph_assert(su);
+
+      // log commit
+      mds->mdlog->start_submit_entry(new EPeerUpdate(mds->mdlog, "unknown", p.first, from,
+						      EPeerUpdate::OP_COMMIT, su->origop),
+				     new C_MDC_PeerCommit(this, from, p.first));
+      mds->mdlog->flush();
+
+      finish_uncommitted_peer(p.first);
+    } else {
+      MDRequestRef mdr = request_get(p.first);
+      // information about leader imported caps
+      if (p.second.length() > 0)
+	mdr->more()->inode_import.share(p.second);
+
+      ceph_assert(mdr->peer_request == 0);  // shouldn't be doing anything!
+      request_finish(mdr);
+    }
+  }
+
+  for (const auto &metareq : ack->abort) {
+    dout(10) << " abort on peer " << metareq << dendl;
+
+    if (mds->is_resolve()) {
+      MDPeerUpdate *su = get_uncommitted_peer(metareq, from);
+      ceph_assert(su);
+
+      // perform rollback (and journal a rollback entry)
+      // note: this will hold up the resolve a bit, until the rollback entries journal.
+      MDRequestRef null_ref;
+      switch (su->origop) {
+      case EPeerUpdate::LINK:
+	mds->server->do_link_rollback(su->rollback, from, null_ref);
+	break;
+      case EPeerUpdate::RENAME:
+	mds->server->do_rename_rollback(su->rollback, from, null_ref);
+	break;
+      case EPeerUpdate::RMDIR:
+	mds->server->do_rmdir_rollback(su->rollback, from, null_ref);
+	break;
+      default:
+	ceph_abort();
+      }
+    } else {
+      MDRequestRef mdr = request_get(metareq);
+      mdr->aborted = true;
+      if (mdr->peer_request) {
+	if (mdr->peer_did_prepare()) // journaling peer prepare ?
+	  add_rollback(metareq, from);
+      } else {
+	request_finish(mdr);
+      }
+    }
+  }
+  // while ambiguous updates remain for 'from', it stays in resolve_ack_gather
+  if (!ambiguous_peer_updates.count(from)) {
+    resolve_ack_gather.erase(from);
+    maybe_finish_peer_resolve();
+  }
+}
+// Register an uncommitted peer update for 'reqid' (asserts none exists yet) and count what 'su' references.
+void MDCache::add_uncommitted_peer(metareqid_t reqid, LogSegment *ls, mds_rank_t leader, MDPeerUpdate *su)
+{
+  auto const &ret = uncommitted_peers.emplace(std::piecewise_construct,
+                                               std::forward_as_tuple(reqid),
+                                               std::forward_as_tuple());
+  ceph_assert(ret.second);  // emplace must have inserted a new entry
+  ls->uncommitted_peers.insert(reqid);
+  upeer &u = ret.first->second;
+  u.leader = leader;
+  u.ls = ls;
+  u.su = su;
+  if (su == nullptr) {  // no update record: nothing to count
+    return;
+  }
+  for(set<CInode*>::iterator p = su->olddirs.begin(); p != su->olddirs.end(); ++p)
+    uncommitted_peer_rename_olddir[*p]++;  // per-inode count, decremented in finish_uncommitted_peer()
+  for(set<CInode*>::iterator p = su->unlinked.begin(); p != su->unlinked.end(); ++p)
+    uncommitted_peer_unlink[*p]++;  // per-inode count, decremented in finish_uncommitted_peer()
+}
+// Remove the uncommitted peer record for 'reqid' (asserting it exists if assert_exist) and clean up after its update.
+void MDCache::finish_uncommitted_peer(metareqid_t reqid, bool assert_exist)
+{
+  auto it = uncommitted_peers.find(reqid);
+  if (it == uncommitted_peers.end()) {
+    ceph_assert(!assert_exist);
+    return;
+  }
+  upeer &u = it->second;
+  MDPeerUpdate* su = u.su;  // saved now: the map entry (and 'u') is erased below
+
+  if (!u.waiters.empty()) {
+    mds->queue_waiters(u.waiters);
+  }
+  u.ls->uncommitted_peers.erase(reqid);
+  uncommitted_peers.erase(it);
+
+  if (su == nullptr) {
+    return;
+  }
+  // discard the non-auth subtree we renamed out of
+  for(set<CInode*>::iterator p = su->olddirs.begin(); p != su->olddirs.end(); ++p) {
+    CInode *diri = *p;
+    map<CInode*, int>::iterator it = uncommitted_peer_rename_olddir.find(diri);  // shadows the outer 'it'
+    ceph_assert(it != uncommitted_peer_rename_olddir.end());
+    it->second--;
+    if (it->second == 0) {
+      uncommitted_peer_rename_olddir.erase(it);
+      auto&& ls = diri->get_dirfrags();
+      for (const auto& dir : ls) {
+	CDir *root = get_subtree_root(dir);
+	if (root->get_dir_auth() == CDIR_AUTH_UNDEF) {
+	  try_trim_non_auth_subtree(root);
+	  if (dir != root)
+	    break;
+	}
+      }
+    } else
+      ceph_assert(it->second > 0);
+  }
+  // remove the inodes that were unlinked by peer update
+  for(set<CInode*>::iterator p = su->unlinked.begin(); p != su->unlinked.end(); ++p) {
+    CInode *in = *p;
+    map<CInode*, int>::iterator it = uncommitted_peer_unlink.find(in);  // shadows the outer 'it'
+    ceph_assert(it != uncommitted_peer_unlink.end());
+    it->second--;
+    if (it->second == 0) {
+      uncommitted_peer_unlink.erase(it);
+      if (!in->get_projected_parent_dn())
+	mds->mdcache->remove_inode_recursive(in);
+    } else
+      ceph_assert(it->second > 0);
+  }
+  delete su;  // the update record is freed here
+}
+// Return the MDPeerUpdate recorded for 'reqid' if its recorded leader equals 'leader'; otherwise nullptr.
+MDPeerUpdate* MDCache::get_uncommitted_peer(metareqid_t reqid, mds_rank_t leader)
+{
+
+  MDPeerUpdate* su = nullptr;
+  auto it = uncommitted_peers.find(reqid);
+  if (it != uncommitted_peers.end() &&
+      it->second.leader == leader) {
+    su = it->second.su;  // may itself be nullptr (add_uncommitted_peer accepts a null 'su')
+  }
+  return su;
+}
+// Complete the rollback of 'reqid' (must be in resolve_need_rollback), then re-check whether peer resolve is done.
+void MDCache::finish_rollback(metareqid_t reqid, MDRequestRef& mdr) {
+  auto p = resolve_need_rollback.find(reqid);
+  ceph_assert(p != resolve_need_rollback.end());
+  if (mds->is_resolve()) {
+    finish_uncommitted_peer(reqid, false);  // a missing record is tolerated here
+  } else if (mdr) {
+    finish_uncommitted_peer(mdr->reqid, mdr->more()->peer_update_journaled);  // must exist only if it was journaled
+  }
+  resolve_need_rollback.erase(p);
+  maybe_finish_peer_resolve();
+}
+// Settle the ambiguous imports reported by other ranks (other_ambiguous_imports), then clear that map.
+void MDCache::disambiguate_other_imports()
+{
+  dout(10) << "disambiguate_other_imports" << dendl;
+  // 'recovering' (we are not clientreplay/active/stopping) is passed as the second argument of get_force_dirfrag()
+  bool recovering = !(mds->is_clientreplay() || mds->is_active() || mds->is_stopping());
+  // other nodes' ambiguous imports
+  for (map<mds_rank_t, map<dirfrag_t, vector<dirfrag_t> > >::iterator p = other_ambiguous_imports.begin();
+       p != other_ambiguous_imports.end();
+       ++p) {
+    mds_rank_t who = p->first;
+    dout(10) << "ambiguous imports for mds." << who << dendl;
+
+    for (map<dirfrag_t, vector<dirfrag_t> >::iterator q = p->second.begin();
+	 q != p->second.end();
+	 ++q) {
+      dout(10) << " ambiguous import " << q->first << " bounds " << q->second << dendl;
+      // an ambiguous import will not race with a refragmentation; it's appropriate to force here.
+      CDir *dir = get_force_dirfrag(q->first, recovering);
+      if (!dir) continue;
+
+      if (dir->is_ambiguous_auth() ||	// works for me_ambig or if i am a surviving bystander
+	  dir->authority() == CDIR_AUTH_UNDEF) { // resolving
+	dout(10) << "  mds." << who << " did import " << *dir << dendl;
+	adjust_bounded_subtree_auth(dir, q->second, who);
+	try_subtree_merge(dir);
+      } else {
+	dout(10) << "  mds." << who << " did not import " << *dir << dendl;
+      }
+    }
+  }
+  other_ambiguous_imports.clear();
+}
+// In resolve state: settle every entry of my_ambiguous_imports. Otherwise only assert that the map is empty.
+void MDCache::disambiguate_my_imports()
+{
+  dout(10) << "disambiguate_my_imports" << dendl;
+
+  if (!mds->is_resolve()) {
+    ceph_assert(my_ambiguous_imports.empty());
+    return;
+  }
+
+  disambiguate_other_imports();
+
+  // my ambiguous imports
+  mds_authority_t me_ambig(mds->get_nodeid(), mds->get_nodeid());
+  while (!my_ambiguous_imports.empty()) {  // terminates: both branches below erase the entry for q->first
+    map<dirfrag_t, vector<dirfrag_t> >::iterator q = my_ambiguous_imports.begin();
+
+    CDir *dir = get_dirfrag(q->first);
+    ceph_assert(dir);
+    
+    if (dir->authority() != me_ambig) {
+      dout(10) << "ambiguous import auth known, must not be me " << *dir << dendl;
+      cancel_ambiguous_import(dir);
+
+      mds->mdlog->start_submit_entry(new EImportFinish(dir, false));
+
+      // subtree may have been swallowed by another node claiming dir
+      // as their own.
+      CDir *root = get_subtree_root(dir);
+      if (root != dir)
+	dout(10) << "  subtree root is " << *root << dendl;
+      ceph_assert(root->dir_auth.first != mds->get_nodeid());  // not us!
+      try_trim_non_auth_subtree(root);
+    } else {
+      dout(10) << "ambiguous import auth unclaimed, must be me " << *dir << dendl;
+      finish_ambiguous_import(q->first);
+      mds->mdlog->start_submit_entry(new EImportFinish(dir, true));
+    }
+  }
+  ceph_assert(my_ambiguous_imports.empty());
+  mds->mdlog->flush();
+
+  // verify all my subtrees are unambiguous!
+  for (map<CDir*,set<CDir*> >::iterator p = subtrees.begin();
+       p != subtrees.end();
+       ++p) {
+    CDir *dir = p->first;
+    if (dir->is_ambiguous_dir_auth()) {  // log the offending dir before the assert below fires
+      dout(0) << "disambiguate_imports uh oh, dir_auth is still ambiguous for " << *dir << dendl;
+    }
+    ceph_assert(!dir->is_ambiguous_dir_auth());
+  }
+
+  show_subtrees();
+}
+
+// Record an ambiguous import of 'base' with the given bounds; 'base' must not be recorded already.
+void MDCache::add_ambiguous_import(dirfrag_t base, const vector<dirfrag_t>& bounds) 
+{
+  ceph_assert(my_ambiguous_imports.count(base) == 0);
+  my_ambiguous_imports[base] = bounds;
+}
+
+// CDir overload: converts the bounds to dirfrag_t values and replaces any entry already recorded for 'base'.
+void MDCache::add_ambiguous_import(CDir *base, const set<CDir*>& bounds)
+{
+  // make a list
+  vector<dirfrag_t> binos;
+  for (set<CDir*>::iterator p = bounds.begin();
+       p != bounds.end();
+       ++p) 
+    binos.push_back((*p)->dirfrag());
+  
+  // note: this can get called twice if the exporter fails during recovery
+  if (my_ambiguous_imports.count(base->dirfrag()))
+    my_ambiguous_imports.erase(base->dirfrag());  // so the assert in the dirfrag_t overload holds
+
+  add_ambiguous_import(base->dirfrag(), binos);
+}
+// Remove the ambiguous import recorded for 'dir' (asserts that one exists), logging its bounds first.
+void MDCache::cancel_ambiguous_import(CDir *dir)
+{
+  dirfrag_t df = dir->dirfrag();
+  ceph_assert(my_ambiguous_imports.count(df));
+  dout(10) << "cancel_ambiguous_import " << df
+	   << " bounds " << my_ambiguous_imports[df]
+	   << " " << *dir
+	   << dendl;
+  my_ambiguous_imports.erase(df);
+}
+// Claim the ambiguous import 'df' for this rank: drop its record and set the bounded subtree's auth to us.
+void MDCache::finish_ambiguous_import(dirfrag_t df)
+{
+  ceph_assert(my_ambiguous_imports.count(df));
+  vector<dirfrag_t> bounds;
+  bounds.swap(my_ambiguous_imports[df]);  // take the bounds out before the entry is erased
+  my_ambiguous_imports.erase(df);
+  
+  dout(10) << "finish_ambiguous_import " << df
+	   << " bounds " << bounds
+	   << dendl;
+  CDir *dir = get_dirfrag(df);
+  ceph_assert(dir);
+  
+  // adjust dir_auth, import maps
+  adjust_bounded_subtree_auth(dir, bounds, mds->get_nodeid());
+  try_subtree_merge(dir);
+}
+// Remove 'in' together with all of its dirfrags, their dentries, and every primary-linked inode beneath them.
+void MDCache::remove_inode_recursive(CInode *in)
+{
+  dout(10) << "remove_inode_recursive " << *in << dendl;
+  auto&& ls = in->get_dirfrags();
+  for (const auto& subdir : ls) {
+    dout(10) << " removing dirfrag " << *subdir << dendl;
+    auto it = subdir->items.begin();
+    while (it != subdir->items.end()) {
+      CDentry *dn = it->second;
+      ++it;  // advance before 'dn' is removed below (presumably that drops it from subdir->items)
+      CDentry::linkage_t *dnl = dn->get_linkage();
+      if (dnl->is_primary()) {
+	CInode *tin = dnl->get_inode();
+	subdir->unlink_inode(dn, false);
+	remove_inode_recursive(tin);
+      }
+      subdir->remove_dentry(dn);
+    }
+    
+    if (subdir->is_subtree_root()) 
+      remove_subtree(subdir);
+    in->close_dirfrag(subdir->dirfrag().frag);
+  }
+  remove_inode(in);
+}
+// Trim every dentry beneath non-auth inode 'in' into 'expiremap'; returns true if the walk had to stop early.
+bool MDCache::expire_recursive(CInode *in, expiremap &expiremap)
+{
+  ceph_assert(!in->is_auth());
+
+  dout(10) << __func__ << ":" << *in << dendl;
+
+  // Recurse into any dirfrags beneath this inode
+  auto&& ls = in->get_dirfrags();
+  for (const auto& subdir : ls) {
+    if (!in->is_mdsdir() && subdir->is_subtree_root()) {
+      dout(10) << __func__ << ": stray still has subtree " << *in << dendl;
+      return true;
+    }
+
+    for (auto it = subdir->items.begin(); it != subdir->items.end();) {
+      CDentry *dn = it->second;
+      it++;  // advance before 'dn' may be trimmed below
+      CDentry::linkage_t *dnl = dn->get_linkage();
+      if (dnl->is_primary()) {
+	CInode *tin = dnl->get_inode();
+
+        /* Remote strays with linkage (i.e. hardlinks) should not be
+         * expired, because they may be the target of
+         * a rename() as the owning MDS shuts down */
+        if (!tin->is_stray() && tin->get_inode()->nlink) {
+          dout(10) << __func__ << ": stray still has linkage " << *tin << dendl;
+          return true;
+        }
+
+	const bool abort = expire_recursive(tin, expiremap);
+        if (abort) {
+          return true;
+        }
+      }
+      if (dn->lru_is_expireable()) {
+        trim_dentry(dn, expiremap);
+      } else {
+        dout(10) << __func__ << ": stray dn is not expireable " << *dn << dendl;
+        return true;
+      }
+    }
+  }
+
+  return false;  // everything beneath 'in' was handed to trim_dentry()
+}
+// Remove every non-base inode in inode_map that has no parent dentry, along with everything beneath it.
+void MDCache::trim_unlinked_inodes()
+{
+  dout(7) << "trim_unlinked_inodes" << dendl;
+  int count = 0;
+  vector<CInode*> q;  // candidates are collected first and removed in a second pass
+  for (auto &p : inode_map) {
+    CInode *in = p.second;
+    if (in->get_parent_dn() == NULL && !in->is_base()) {
+      dout(7) << " will trim from " << *in << dendl;
+      q.push_back(in);
+    }
+
+    if (!(++count % mds->heartbeat_reset_grace()))  // every heartbeat_reset_grace() iterations
+      mds->heartbeat_reset();
+  }
+  for (auto& in : q) {
+    remove_inode_recursive(in);
+
+    if (!(++count % mds->heartbeat_reset_grace()))  // 'count' carries on from the first loop
+      mds->heartbeat_reset();
+  }
+}
+
+/** recalc_auth_bits()
+ * once subtree auth is disambiguated, we need to adjust all the 
+ * auth and dirty bits in our cache before moving on.
+ */
+void MDCache::recalc_auth_bits(bool replay)
+{
+  dout(7) << "recalc_auth_bits " << (replay ? "(replay)" : "") <<  dendl;
+
+  if (root) {
+    root->inode_auth.first = mds->mdsmap->get_root();
+    bool auth = mds->get_nodeid() == root->inode_auth.first;
+    if (auth) {
+      root->state_set(CInode::STATE_AUTH);
+    } else {
+      root->state_clear(CInode::STATE_AUTH);
+      if (!replay)
+	root->state_set(CInode::STATE_REJOINING);
+    }
+  }
+
+  set<CInode*> subtree_inodes;  // inodes of the subtree roots whose dir_auth.first is this rank
+  for (map<CDir*,set<CDir*> >::iterator p = subtrees.begin();
+       p != subtrees.end();
+       ++p) {
+    if (p->first->dir_auth.first == mds->get_nodeid())
+      subtree_inodes.insert(p->first->inode);
+  }
+
+  for (map<CDir*,set<CDir*> >::iterator p = subtrees.begin();
+       p != subtrees.end();
+       ++p) {
+    if (p->first->inode->is_mdsdir()) {
+      CInode *in = p->first->inode;
+      bool auth = in->ino() == MDS_INO_MDSDIR(mds->get_nodeid());
+      if (auth) {
+	in->state_set(CInode::STATE_AUTH);
+      } else {
+	in->state_clear(CInode::STATE_AUTH);
+	if (!replay)
+	  in->state_set(CInode::STATE_REJOINING);
+      }
+    }
+
+    std::queue<CDir*> dfq;  // dirfrag queue
+    dfq.push(p->first);
+
+    bool auth = p->first->authority().first == mds->get_nodeid();  // applied to everything queued from this subtree
+    dout(10) << " subtree auth=" << auth << " for " << *p->first << dendl;
+
+    while (!dfq.empty()) {
+      CDir *dir = dfq.front();
+      dfq.pop();
+
+      // dir
+      if (auth) {
+	dir->state_set(CDir::STATE_AUTH);
+      } else {
+	dir->state_clear(CDir::STATE_AUTH);
+	if (!replay) {
+	  // close empty non-auth dirfrag
+	  if (!dir->is_subtree_root() && dir->get_num_any() == 0) {
+	    dir->inode->close_dirfrag(dir->get_frag());
+	    continue;  // dirfrag was closed; nothing more to do for it
+	  }
+	  dir->state_set(CDir::STATE_REJOINING);
+	  dir->state_clear(CDir::STATE_COMPLETE);
+	  if (dir->is_dirty())
+	    dir->mark_clean();
+	}
+      }
+
+      // dentries in this dir
+      for (auto &p : dir->items) {  // note: this 'p' shadows the outer subtree iterator
+	// dn
+	CDentry *dn = p.second;
+	CDentry::linkage_t *dnl = dn->get_linkage();
+	if (auth) {
+	  dn->state_set(CDentry::STATE_AUTH);
+	} else {
+	  dn->state_clear(CDentry::STATE_AUTH);
+	  if (!replay) {
+	    dn->state_set(CDentry::STATE_REJOINING);
+	    if (dn->is_dirty())
+	      dn->mark_clean();
+	  }
+	}
+
+	if (dnl->is_primary()) {
+	  // inode
+	  CInode *in = dnl->get_inode();
+	  if (auth) {
+	    in->state_set(CInode::STATE_AUTH);
+	  } else {
+	    in->state_clear(CInode::STATE_AUTH);
+	    if (!replay) {
+	      in->state_set(CInode::STATE_REJOINING);
+	      if (in->is_dirty())
+		in->mark_clean();
+	      if (in->is_dirty_parent())
+		in->clear_dirty_parent();
+	      // avoid touching scatterlocks for our subtree roots!
+	      if (subtree_inodes.count(in) == 0)
+		in->clear_scatter_dirty();
+	    }
+	  }
+	  // recurse into nested dirfrags (queued, so they inherit this subtree's 'auth')
+	  if (in->is_dir()) {
+	    auto&& dfv = in->get_nested_dirfrags();
+            for (const auto& dir : dfv) {  // shadows the outer 'dir'
+              dfq.push(dir);
+            }
+          }
+	}
+      }
+    }
+  }
+  
+  show_subtrees();
+  show_cache();
+}
+
+
+
+// ===========================================================================
+// REJOIN
+
+/*
+ * notes on scatterlock recovery:
+ *
+ * - recovering inode replica sends scatterlock data for any subtree
+ *   roots (the only ones that are possibly dirty).
+ *
+ * - surviving auth incorporates any provided scatterlock data.  any
+ *   pending gathers are then finished, as with the other lock types.
+ *
+ * that takes care of surviving auth + (recovering replica)*.
+ *
+ * - surviving replica sends strong_inode, which includes current
+ *   scatterlock state, AND any dirty scatterlock data.  this
+ *   provides the recovering auth with everything it might need.
+ * 
+ * - recovering auth must pick initial scatterlock state based on
+ *   (weak|strong) rejoins.
+ *   - always assimilate scatterlock data (it can't hurt)
+ *   - any surviving replica in SCATTER state -> SCATTER.  otherwise, SYNC.
+ *   - include base inode in ack for all inodes that saw scatterlock content
+ *
+ * also, for scatter gather,
+ *
+ * - auth increments {frag,r}stat.version on completion of any gather.
+ *
+ * - auth incorporates changes in a gather _only_ if the version
+ *   matches.
+ *
+ * - replica discards changes any time the scatterlock syncs, and
+ *   after recovery.
+ */
+// Dump rejoin progress (both gather sets and the opening-inode count) as a "rejoin_status" section of 'f'.
+void MDCache::dump_rejoin_status(Formatter *f) const
+{
+  f->open_object_section("rejoin_status");
+  f->dump_stream("rejoin_gather") << rejoin_gather;
+  f->dump_stream("rejoin_ack_gather") << rejoin_ack_gather;
+  f->dump_unsigned("num_opening_inodes", cap_imports_num_opening);
+  f->close_section();
+}
+// Begin rejoin: store the completion context (none may be pending), seed rejoin_gather, process imported caps.
+void MDCache::rejoin_start(MDSContext *rejoin_done_)
+{
+  dout(10) << "rejoin_start" << dendl;
+  ceph_assert(!rejoin_done);
+  rejoin_done.reset(rejoin_done_);
+
+  rejoin_gather = recovery_set;
+  // need finish opening cap inodes before sending cache rejoins
+  rejoin_gather.insert(mds->get_nodeid());  // rejoin_send_rejoins() delays while our own rank is in this set
+  process_imported_caps();
+}
+
+/*
+ * rejoin phase!
+ *
+ * this initiates rejoin.  it should be called before we get any
+ * rejoin or rejoin_ack messages (or else mdsmap distribution is broken).
+ *
+ * we start out by sending rejoins to everyone in the recovery set.
+ *
+ * if we are rejoining, send for all regions in our cache.
+ * if we are active|stopping, send only to nodes that are rejoining.
+ */
+void MDCache::rejoin_send_rejoins()
+{
+  dout(10) << "rejoin_send_rejoins with recovery_set " << recovery_set << dendl;
+
+  if (rejoin_gather.count(mds->get_nodeid())) {
+    dout(7) << "rejoin_send_rejoins still processing imported caps, delaying" << dendl;
+    rejoins_pending = true;
+    return;
+  }
+  if (!resolve_gather.empty()) {
+    dout(7) << "rejoin_send_rejoins still waiting for resolves ("
+	    << resolve_gather << ")" << dendl;
+    rejoins_pending = true;
+    return;
+  }
+
+  ceph_assert(!migrator->is_importing());
+  ceph_assert(!migrator->is_exporting());
+
+  if (!mds->is_rejoin()) {
+    disambiguate_other_imports();
+  }
+
+  map<mds_rank_t, ref_t<MMDSCacheRejoin>> rejoins;
+
+
+  // if i am rejoining, send a rejoin to everyone.
+  // otherwise, just send to others who are rejoining.
+  for (const auto& rank : recovery_set) {
+    if (rank == mds->get_nodeid())  continue;  // nothing to myself!
+    if (rejoin_sent.count(rank)) continue;     // already sent a rejoin to this node!
+    if (mds->is_rejoin())
+      rejoins[rank] = make_message<MMDSCacheRejoin>(MMDSCacheRejoin::OP_WEAK);
+    else if (mds->mdsmap->is_rejoin(rank))
+      rejoins[rank] = make_message<MMDSCacheRejoin>(MMDSCacheRejoin::OP_STRONG);
+  }
+
+  if (mds->is_rejoin()) {
+    map<client_t, pair<Session*, set<mds_rank_t> > > client_exports;
+    for (auto& p : cap_exports) {
+      mds_rank_t target = p.second.first;
+      if (rejoins.count(target) == 0)
+	continue;
+      for (auto q = p.second.second.begin(); q != p.second.second.end(); ) {
+	Session *session = nullptr;
+	auto it = client_exports.find(q->first);
+	if (it != client_exports.end()) {
+	  session = it->second.first;
+	  if (session)
+	    it->second.second.insert(target);
+	} else {
+	  session = mds->sessionmap.get_session(entity_name_t::CLIENT(q->first.v));
+	  auto& r = client_exports[q->first];
+	  r.first = session;
+	  if (session)
+	    r.second.insert(target);
+	}
+	if (session) {
+	  ++q;
+	} else {
+	  // remove reconnect with no session
+	  p.second.second.erase(q++);
+	}
+      }
+      rejoins[target]->cap_exports[p.first] = p.second.second;
+    }
+    for (auto& p : client_exports) {
+      Session *session = p.second.first;
+      for (auto& q : p.second.second) {
+	auto rejoin =  rejoins[q];
+	rejoin->client_map[p.first] = session->info.inst;
+	rejoin->client_metadata_map[p.first] = session->info.client_metadata;
+      }
+    }
+  }
+  
+  
+  // check all subtrees
+  for (map<CDir*, set<CDir*> >::iterator p = subtrees.begin();
+       p != subtrees.end();
+       ++p) {
+    CDir *dir = p->first;
+    ceph_assert(dir->is_subtree_root());
+    if (dir->is_ambiguous_dir_auth()) {
+      // exporter is recovering, importer is survivor.
+      ceph_assert(rejoins.count(dir->authority().first));
+      ceph_assert(!rejoins.count(dir->authority().second));
+      continue;
+    }
+
+    // my subtree?
+    if (dir->is_auth())
+      continue;  // skip my own regions!
+
+    mds_rank_t auth = dir->get_dir_auth().first;
+    ceph_assert(auth >= 0);
+    if (rejoins.count(auth) == 0)
+      continue;   // don't care about this node's subtrees
+
+    rejoin_walk(dir, rejoins[auth]);
+  }
+  
+  // rejoin root inodes, too
+  for (auto &p : rejoins) {
+    if (mds->is_rejoin()) {
+      // weak
+      if (p.first == 0 && root) {
+	p.second->add_weak_inode(root->vino());
+	if (root->is_dirty_scattered()) {
+	  dout(10) << " sending scatterlock state on root " << *root << dendl;
+	  p.second->add_scatterlock_state(root);
+	}
+      }
+      if (CInode *in = get_inode(MDS_INO_MDSDIR(p.first))) { 
+	if (in)
+	  p.second->add_weak_inode(in->vino());
+      }
+    } else {
+      // strong
+      if (p.first == 0 && root) {
+	p.second->add_strong_inode(root->vino(),
+				    root->get_replica_nonce(),
+				    root->get_caps_wanted(),
+				    root->filelock.get_state(),
+				    root->nestlock.get_state(),
+				    root->dirfragtreelock.get_state());
+	root->state_set(CInode::STATE_REJOINING);
+	if (root->is_dirty_scattered()) {
+	  dout(10) << " sending scatterlock state on root " << *root << dendl;
+	  p.second->add_scatterlock_state(root);
+	}
+      }
+
+      if (CInode *in = get_inode(MDS_INO_MDSDIR(p.first))) {
+	p.second->add_strong_inode(in->vino(),
+				    in->get_replica_nonce(),
+				    in->get_caps_wanted(),
+				    in->filelock.get_state(),
+				    in->nestlock.get_state(),
+				    in->dirfragtreelock.get_state());
+	in->state_set(CInode::STATE_REJOINING);
+      }
+    }
+  }  
+
+  if (!mds->is_rejoin()) {
+    // i am survivor.  send strong rejoin.
+    // note request remote_auth_pins, xlocks
+    for (ceph::unordered_map<metareqid_t, MDRequestRef>::iterator p = active_requests.begin();
+	 p != active_requests.end();
+	 ++p) {
+      MDRequestRef& mdr = p->second;
+      if (mdr->is_peer())
+	continue;
+      // auth pins
+      for (const auto& q : mdr->object_states) {
+	if (q.second.remote_auth_pinned == MDS_RANK_NONE)
+	  continue;
+	if (!q.first->is_auth()) {
+	  mds_rank_t target = q.second.remote_auth_pinned;
+	  ceph_assert(target == q.first->authority().first);
+	  if (rejoins.count(target) == 0) continue;
+	  const auto& rejoin = rejoins[target];
+	  
+	  dout(15) << " " << *mdr << " authpin on " << *q.first << dendl;
+	  MDSCacheObjectInfo i;
+	  q.first->set_object_info(i);
+	  if (i.ino)
+	    rejoin->add_inode_authpin(vinodeno_t(i.ino, i.snapid), mdr->reqid, mdr->attempt);
+	  else
+	    rejoin->add_dentry_authpin(i.dirfrag, i.dname, i.snapid, mdr->reqid, mdr->attempt);
+
+	  if (mdr->has_more() && mdr->more()->is_remote_frozen_authpin &&
+	      mdr->more()->rename_inode == q.first)
+	    rejoin->add_inode_frozen_authpin(vinodeno_t(i.ino, i.snapid),
+					     mdr->reqid, mdr->attempt);
+	}
+      }
+      // xlocks
+      for (const auto& q : mdr->locks) {
+	auto lock = q.lock;
+	auto obj = lock->get_parent();
+	if (q.is_xlock() && !obj->is_auth()) {
+	  mds_rank_t who = obj->authority().first;
+	  if (rejoins.count(who) == 0) continue;
+	  const auto& rejoin = rejoins[who];
+	  
+	  dout(15) << " " << *mdr << " xlock on " << *lock << " " << *obj << dendl;
+	  MDSCacheObjectInfo i;
+	  obj->set_object_info(i);
+	  if (i.ino)
+	    rejoin->add_inode_xlock(vinodeno_t(i.ino, i.snapid), lock->get_type(),
+				    mdr->reqid, mdr->attempt);
+	  else
+	    rejoin->add_dentry_xlock(i.dirfrag, i.dname, i.snapid,
+				     mdr->reqid, mdr->attempt);
+	} else if (q.is_remote_wrlock()) {
+	  mds_rank_t who = q.wrlock_target;
+	  if (rejoins.count(who) == 0) continue;
+	  const auto& rejoin = rejoins[who];
+
+	  dout(15) << " " << *mdr << " wrlock on " << *lock << " " << *obj << dendl;
+	  MDSCacheObjectInfo i;
+	  obj->set_object_info(i);
+	  ceph_assert(i.ino);
+	  rejoin->add_inode_wrlock(vinodeno_t(i.ino, i.snapid), lock->get_type(),
+				   mdr->reqid, mdr->attempt);
+	}
+      }
+    }
+  }
+
+  // send the messages
+  for (auto &p : rejoins) {
+    ceph_assert(rejoin_sent.count(p.first) == 0);
+    ceph_assert(rejoin_ack_gather.count(p.first) == 0);
+    rejoin_sent.insert(p.first);
+    rejoin_ack_gather.insert(p.first);
+    mds->send_message_mds(p.second, p.first);
+  }
+  rejoin_ack_gather.insert(mds->get_nodeid());   // we need to complete rejoin_gather_finish, too
+  rejoins_pending = false;
+
+  // nothing?
+  if (mds->is_rejoin() && rejoin_gather.empty()) {
+    dout(10) << "nothing to rejoin" << dendl;
+    rejoin_gather_finish();
+  }
+}
+
+
+/**
+ * rejoin_walk - describe one dirfrag (and everything nested below it)
+ *               in a rejoin message
+ *
+ * @param dir subtree root, or a nested dirfrag when recursing
+ * @param rejoin rejoin message being filled in
+ *
+ * when we are in rejoin (weak):
+ *  weak dirfrag
+ *  weak primary dentries (every dentry must be a primary, non-snap
+ *  link to a directory inode)
+ *
+ * when we are not in rejoin (strong):
+ *  strong dirfrag
+ *  strong dentries
+ *  strong inodes
+ *  expireable snap dentries are trimmed instead of being declared
+ *
+ * dirty scatterlock state is attached for inodes in both modes.
+ */
+void MDCache::rejoin_walk(CDir *dir, const ref_t<MMDSCacheRejoin> &rejoin)
+{
+  dout(10) << "rejoin_walk " << *dir << dendl;
+
+  // dirfrags discovered below this one; walked after this dir is done
+  std::vector<CDir*> pending;
+
+  if (mds->is_rejoin()) {
+    // ---- weak declarations ----
+    rejoin->add_weak_dirfrag(dir->dirfrag());
+    for (auto &entry : dir->items) {
+      CDentry *dentry = entry.second;
+      ceph_assert(dentry->last == CEPH_NOSNAP);
+      CDentry::linkage_t *link = dentry->get_linkage();
+      dout(15) << " add_weak_primary_dentry " << *dentry << dendl;
+      ceph_assert(link->is_primary());
+      CInode *inode = link->get_inode();
+      ceph_assert(link->get_inode()->is_dir());
+      rejoin->add_weak_primary_dentry(dir->ino(), dentry->get_name(),
+				      dentry->first, dentry->last, inode->ino());
+      {
+	auto&& below = inode->get_nested_dirfrags();
+	pending.insert(std::end(pending), std::begin(below), std::end(below));
+      }
+      if (inode->is_dirty_scattered()) {
+	dout(10) << " sending scatterlock state on " << *inode << dendl;
+	rejoin->add_scatterlock_state(inode);
+      }
+    }
+  } else {
+    // ---- strong declarations ----
+    dout(15) << " add_strong_dirfrag " << *dir << dendl;
+    rejoin->add_strong_dirfrag(dir->dirfrag(), dir->get_replica_nonce(), dir->get_dir_rep());
+    dir->state_set(CDir::STATE_REJOINING);
+
+    auto cursor = dir->items.begin();
+    while (cursor != dir->items.end()) {
+      CDentry *dentry = cursor->second;
+      ++cursor;  // step first: the dentry may be removed from the dir below
+      dentry->state_set(CDentry::STATE_REJOINING);
+      CDentry::linkage_t *link = dentry->get_linkage();
+      CInode *primary = nullptr;
+      if (link->is_primary())
+	primary = link->get_inode();
+
+      // snap dentries get trimmed: their auth mds may already have
+      // pruned them (snap deleted)
+      if (dentry->last != CEPH_NOSNAP) {
+	if (primary && !primary->remote_parents.empty()) {
+	  // drop every stale remote snap dentry that points at the inode
+	  auto rp = primary->remote_parents.begin();
+	  while (rp != primary->remote_parents.end()) {
+	    CDentry *stale = *rp;
+	    ++rp;
+	    ceph_assert(stale->last != CEPH_NOSNAP);
+	    stale->unlink_remote(stale->get_linkage());
+	  }
+	}
+	if (dentry->lru_is_expireable()) {
+	  if (!link->is_null())
+	    dir->unlink_inode(dentry, false);
+	  if (primary)
+	    remove_inode(primary);
+	  dir->remove_dentry(dentry);
+	  continue;
+	}
+	// Inventing null/remote dentry shouldn't cause problem
+	ceph_assert(!link->is_primary());
+      }
+
+      dout(15) << " add_strong_dentry " << *dentry << dendl;
+      rejoin->add_strong_dentry(dir->dirfrag(), dentry->get_name(), dentry->get_alternate_name(),
+				dentry->first, dentry->last,
+				link->is_primary() ? link->get_inode()->ino():inodeno_t(0),
+				link->is_remote() ? link->get_remote_ino():inodeno_t(0),
+				link->is_remote() ? link->get_remote_d_type():0,
+				dentry->get_replica_nonce(),
+				dentry->lock.get_state());
+      dentry->state_set(CDentry::STATE_REJOINING);
+
+      if (!link->is_primary())
+	continue;
+
+      CInode *inode = link->get_inode();
+      dout(15) << " add_strong_inode " << *inode << dendl;
+      rejoin->add_strong_inode(inode->vino(),
+			       inode->get_replica_nonce(),
+			       inode->get_caps_wanted(),
+			       inode->filelock.get_state(),
+			       inode->nestlock.get_state(),
+			       inode->dirfragtreelock.get_state());
+      inode->state_set(CInode::STATE_REJOINING);
+      {
+	auto&& below = inode->get_nested_dirfrags();
+	pending.insert(std::end(pending), std::begin(below), std::end(below));
+      }
+      if (inode->is_dirty_scattered()) {
+	dout(10) << " sending scatterlock state on " << *inode << dendl;
+	rejoin->add_scatterlock_state(inode);
+      }
+    }
+  }
+
+  // now descend into the dirfrags collected above
+  for (const auto& sub : pending) {
+    rejoin_walk(sub, rejoin);
+  }
+}
+
+
+/*
+ * i got a rejoin.
+ *  - reply with the lockstate
+ *
+ * if i am active|stopping, 
+ *  - remove source from replica list for everything not referenced here.
+ *
+ * this entry point only logs the message and routes it by op:
+ *  OP_WEAK   -> handle_cache_rejoin_weak
+ *  OP_STRONG -> handle_cache_rejoin_strong
+ *  OP_ACK    -> handle_cache_rejoin_ack
+ * any other op aborts.
+ */
+void MDCache::handle_cache_rejoin(const cref_t<MMDSCacheRejoin> &m)
+{
+  dout(7) << "handle_cache_rejoin " << *m << " from " << m->get_source() 
+	  << " (" << m->get_payload().length() << " bytes)"
+	  << dendl;
+
+  const auto op = m->op;
+  if (op == MMDSCacheRejoin::OP_WEAK) {
+    handle_cache_rejoin_weak(m);
+  } else if (op == MMDSCacheRejoin::OP_STRONG) {
+    handle_cache_rejoin_strong(m);
+  } else if (op == MMDSCacheRejoin::OP_ACK) {
+    handle_cache_rejoin_ack(m);
+  } else {
+    ceph_abort();
+  }
+}
+
+
+/*
+ * handle_cache_rejoin_weak
+ *
+ * the sender 
+ *  - is recovering from their journal.
+ *  - may have incorrect (out of date) inode contents
+ *  - will include weak dirfrag if sender is dirfrag auth and parent inode auth is recipient
+ *
+ * if the sender didn't trim_non_auth(), they
+ *  - may have incorrect (out of date) dentry/inode linkage
+ *  - may have deleted/purged inodes
+ * and i may have to go to disk to get accurate inode contents.  yuck.
+ *
+ * if i am clientreplay|active|stopping i am a survivor: cap exports are
+ * claimed right away, and an OP_ACK is built and sent back before
+ * returning.  otherwise i must be in rejoin: the cap exports are only
+ * recorded in cap_imports and no ack is built here.
+ *
+ * in both cases the sender is added as a replica of every dirfrag,
+ * dentry and inode it mentions, and is removed from rejoin_gather.
+ */
+void MDCache::handle_cache_rejoin_weak(const cref_t<MMDSCacheRejoin> &weak)
+{
+  mds_rank_t from = mds_rank_t(weak->get_source().num());  // rank of the sender
+
+  // possible response(s)
+  ref_t<MMDSCacheRejoin> ack;      // if survivor
+  set<vinodeno_t> acked_inodes;  // if survivor
+  set<SimpleLock *> gather_locks;  // if survivor
+  bool survivor = false;  // am i a survivor?
+
+  if (mds->is_clientreplay() || mds->is_active() || mds->is_stopping()) {
+    survivor = true;
+    dout(10) << "i am a surivivor, and will ack immediately" << dendl;
+    ack = make_message<MMDSCacheRejoin>(MMDSCacheRejoin::OP_ACK);
+
+    map<inodeno_t,map<client_t,Capability::Import> > imported_caps;  // ino -> client -> import info, encoded into the ack below
+
+    // check cap exports
+    for (auto p = weak->cap_exports.begin(); p != weak->cap_exports.end(); ++p) {
+      CInode *in = get_inode(p->first);
+      ceph_assert(!in || in->is_auth());  // NOTE(review): a null 'in' passes this assert but is dereferenced by the dout below -- confirm it cannot be null here
+      for (auto q = p->second.begin(); q != p->second.end(); ++q) {
+	dout(10) << " claiming cap import " << p->first << " client." << q->first << " on " << *in << dendl;
+	Capability *cap = rejoin_import_cap(in, q->first, q->second, from);
+	Capability::Import& im = imported_caps[p->first][q->first];  // entry is created even when no cap came back
+	if (cap) {
+	  im.cap_id = cap->get_cap_id();
+	  im.issue_seq = cap->get_last_seq();
+	  im.mseq = cap->get_mseq();
+	} else {
+	  // all are zero
+	}
+      }
+      mds->locker->eval(in, CEPH_CAP_LOCKS, true);
+    }
+
+    encode(imported_caps, ack->imported_caps);
+  } else {
+    ceph_assert(mds->is_rejoin());
+
+    // we may have already received a strong rejoin from the sender.
+    rejoin_scour_survivor_replicas(from, NULL, acked_inodes, gather_locks);  // no ack: nothing is treated as acked
+    ceph_assert(gather_locks.empty());
+
+    // check cap exports.
+    rejoin_client_map.insert(weak->client_map.begin(), weak->client_map.end());
+    rejoin_client_metadata_map.insert(weak->client_metadata_map.begin(),
+				      weak->client_metadata_map.end());
+
+    for (auto p = weak->cap_exports.begin(); p != weak->cap_exports.end(); ++p) {
+      CInode *in = get_inode(p->first);
+      ceph_assert(!in || in->is_auth());
+      // note
+      for (auto q = p->second.begin(); q != p->second.end(); ++q) {
+	dout(10) << " claiming cap import " << p->first << " client." << q->first << dendl;
+	cap_imports[p->first][q->first][from] = q->second;  // keyed by ino, client, then sending rank
+      }
+    }
+  }
+
+  // assimilate any potentially dirty scatterlock state
+  for (const auto &p : weak->inode_scatterlocks) {
+    CInode *in = get_inode(p.first);
+    ceph_assert(in);
+    in->decode_lock_state(CEPH_LOCK_IFILE, p.second.file);
+    in->decode_lock_state(CEPH_LOCK_INEST, p.second.nest);
+    in->decode_lock_state(CEPH_LOCK_IDFT, p.second.dft);
+    if (!survivor)
+      rejoin_potential_updated_scatterlocks.insert(in);
+  }
+
+  // recovering peer may send incorrect dirfrags here.  we need to
+  // infer which dirfrag they meant.  the ack will include a
+  // strong_dirfrag that will set them straight on the fragmentation.
+  
+  // walk weak map
+  set<CDir*> dirs_to_share;  // dirfrags the sender has been made a replica of in this call
+  for (const auto &p : weak->weak_dirfrags) {
+    CInode *diri = get_inode(p.ino);
+    if (!diri)
+      dout(0) << " missing dir ino " << p.ino << dendl;
+    ceph_assert(diri);
+
+    frag_vec_t leaves;
+    if (diri->dirfragtree.is_leaf(p.frag)) {
+      leaves.push_back(p.frag);
+    } else {
+      diri->dirfragtree.get_leaves_under(p.frag, leaves);
+      if (leaves.empty())
+	leaves.push_back(diri->dirfragtree[p.frag.value()]);  // no leaves under p.frag: fall back to what the tree maps its value to
+    }
+    for (const auto& leaf : leaves) {
+      CDir *dir = diri->get_dirfrag(leaf);
+      if (!dir) {
+	dout(0) << " missing dir for " << p.frag << " (which maps to " << leaf << ") on " << *diri << dendl;
+	continue;
+      }
+      ceph_assert(dir);  // always holds: the null case continued above
+      if (dirs_to_share.count(dir)) {
+	dout(10) << " already have " << p.frag << " -> " << leaf << " " << *dir << dendl;
+      } else {
+	dirs_to_share.insert(dir);
+	unsigned nonce = dir->add_replica(from);
+	dout(10) << " have " << p.frag << " -> " << leaf << " " << *dir << dendl;
+	if (ack) {
+	  ack->add_strong_dirfrag(dir->dirfrag(), nonce, dir->dir_rep);
+	  ack->add_dirfrag_base(dir);
+	}
+      }
+    }
+  }
+
+  for (const auto &p : weak->weak) {
+    CInode *diri = get_inode(p.first);
+    if (!diri)
+      dout(0) << " missing dir ino " << p.first << dendl;
+    ceph_assert(diri);
+
+    // weak dentries
+    CDir *dir = 0;  // dirfrag used for the previous dentry, reused while the frag matches
+    for (const auto &q : p.second) {
+      // locate proper dirfrag.
+      //  optimize for common case (one dirfrag) to avoid dirs_to_share set check
+      frag_t fg = diri->pick_dirfrag(q.first.name);
+      if (!dir || dir->get_frag() != fg) {
+	dir = diri->get_dirfrag(fg);
+	if (!dir)
+	  dout(0) << " missing dir frag " << fg << " on " << *diri << dendl;
+	ceph_assert(dir);
+	ceph_assert(dirs_to_share.count(dir));  // the dirfrag must have been declared in weak_dirfrags
+      }
+
+      // and dentry
+      CDentry *dn = dir->lookup(q.first.name, q.first.snapid);
+      ceph_assert(dn);
+      CDentry::linkage_t *dnl = dn->get_linkage();
+      ceph_assert(dnl->is_primary());
+      
+      if (survivor && dn->is_replica(from)) 
+	dentry_remove_replica(dn, from, gather_locks);  // drop the old replica record before re-adding
+      unsigned dnonce = dn->add_replica(from);
+      dout(10) << " have " << *dn << dendl;
+      if (ack) 
+	ack->add_strong_dentry(dir->dirfrag(), dn->get_name(), dn->get_alternate_name(),
+                               dn->first, dn->last,
+			       dnl->get_inode()->ino(), inodeno_t(0), 0, 
+			       dnonce, dn->lock.get_replica_state());
+
+      // inode
+      CInode *in = dnl->get_inode();
+      ceph_assert(in);
+
+      if (survivor && in->is_replica(from)) 
+	inode_remove_replica(in, from, true, gather_locks);
+      unsigned inonce = in->add_replica(from);
+      dout(10) << " have " << *in << dendl;
+
+      // scatter the dirlock, just in case?
+      if (!survivor && in->is_dir() && in->has_subtree_root_dirfrag())
+	in->filelock.set_state(LOCK_MIX);
+
+      if (ack) {
+	acked_inodes.insert(in->vino());
+	ack->add_inode_base(in, mds->mdsmap->get_up_features());
+	bufferlist bl;
+	in->_encode_locks_state_for_rejoin(bl, from);
+	ack->add_inode_locks(in, inonce, bl);
+      }
+    }
+  }
+  
+  // weak base inodes?  (root, stray, etc.)
+  for (set<vinodeno_t>::iterator p = weak->weak_inodes.begin();
+       p != weak->weak_inodes.end();
+       ++p) {
+    CInode *in = get_inode(*p);
+    ceph_assert(in);   // hmm fixme wrt stray?
+    if (survivor && in->is_replica(from)) 
+      inode_remove_replica(in, from, true, gather_locks);
+    unsigned inonce = in->add_replica(from);
+    dout(10) << " have base " << *in << dendl;
+    
+    if (ack) {
+      acked_inodes.insert(in->vino());
+      ack->add_inode_base(in, mds->mdsmap->get_up_features());
+      bufferlist bl;
+      in->_encode_locks_state_for_rejoin(bl, from);
+      ack->add_inode_locks(in, inonce, bl);
+    }
+  }
+
+  ceph_assert(rejoin_gather.count(from));
+  rejoin_gather.erase(from);  // this sender's rejoin is now accounted for
+  if (survivor) {
+    // survivor.  do everything now.
+    for (const auto &p : weak->inode_scatterlocks) {
+      CInode *in = get_inode(p.first);
+      ceph_assert(in);
+      dout(10) << " including base inode (due to potential scatterlock update) " << *in << dendl;
+      acked_inodes.insert(in->vino());
+      ack->add_inode_base(in, mds->mdsmap->get_up_features());
+    }
+
+    rejoin_scour_survivor_replicas(from, ack, acked_inodes, gather_locks);  // drop replica records the ack does not mention
+    mds->send_message(ack, weak->get_connection());
+
+    for (set<SimpleLock*>::iterator p = gather_locks.begin(); p != gather_locks.end(); ++p) {
+      if (!(*p)->is_stable())
+	mds->locker->eval_gather(*p);
+    }
+  } else {
+    // done?
+    if (rejoin_gather.empty() && rejoin_ack_gather.count(mds->get_nodeid())) {  // all rejoins in, and our own rank is in the ack gather
+      rejoin_gather_finish();
+    } else {
+      dout(7) << "still need rejoin from (" << rejoin_gather << ")" << dendl;
+    }
+  }
+}
+
+/*
+ * rejoin_scour_survivor_replicas - remove source from replica list on unmentioned objects
+ *
+ * all validated replicas are acked with a strong nonce, etc.  if that isn't in the
+ * ack, the replica dne, and we can remove it from our replica maps.
+ *
+ * @param from rank whose replica records are scoured
+ * @param ack ack built for 'from', or null.  with a null ack nothing counts
+ *            as acked, so every replica record for 'from' that is visited
+ *            gets removed
+ * @param acked_inodes inodes the ack mentions (only consulted when ack is set)
+ * @param gather_locks handed through to inode_remove_replica() and
+ *                     dentry_remove_replica()
+ *
+ * every inode in inode_map and snap_inode_map is visited.  inodes are only
+ * touched when we are auth for them; dirfrags and their dentries only when
+ * we are auth for the dirfrag.
+ */
+void MDCache::rejoin_scour_survivor_replicas(mds_rank_t from, const cref_t<MMDSCacheRejoin> &ack,
+					     set<vinodeno_t>& acked_inodes,
+					     set<SimpleLock *>& gather_locks)
+{
+  dout(10) << "rejoin_scour_survivor_replicas from mds." << from << dendl;
+
+  // 'ack' is captured by reference: the lambda is only invoked inside this
+  // call, so copying the message reference would be a needless ref-count bump.
+  auto scour_func = [this, from, &ack, &acked_inodes, &gather_locks] (CInode *in) {
+    // inode?
+    if (in->is_auth() &&
+	in->is_replica(from) &&
+	(!ack || acked_inodes.count(in->vino()) == 0)) {
+      inode_remove_replica(in, from, false, gather_locks);
+      dout(10) << " rem " << *in << dendl;
+    }
+
+    if (!in->is_dir())
+      return;
+    
+    const auto&& dfs = in->get_dirfrags();
+    for (const auto& dir : dfs) {
+      if (!dir->is_auth())
+	continue;
+      
+      if (dir->is_replica(from) &&
+	  (!ack || ack->strong_dirfrags.count(dir->dirfrag()) == 0)) {
+	dir->remove_replica(from);
+	dout(10) << " rem " << *dir << dendl;
+      } 
+      
+      // find the ack's dentry map for this dirfrag once, instead of
+      // repeating the lookup for every dentry below.  (decltype is
+      // unevaluated, so this is fine with a null ack.)
+      decltype(ack->strong_dentries.find(dir->dirfrag())) acked_dns{};
+      bool have_acked_dns = false;
+      if (ack) {
+	acked_dns = ack->strong_dentries.find(dir->dirfrag());
+	have_acked_dns = (acked_dns != ack->strong_dentries.end());
+      }
+
+      // dentries
+      for (auto &p : dir->items) {
+	CDentry *dn = p.second;
+	
+	if (!dn->is_replica(from))
+	  continue;
+	if (have_acked_dns &&
+	    acked_dns->second.count(string_snap_t(dn->get_name(), dn->last)) > 0)
+	  continue;   // the ack mentions this dentry; keep the replica record
+	dentry_remove_replica(dn, from, gather_locks);
+	dout(10) << " rem " << *dn << dendl;
+      }
+    }
+  };
+
+  for (auto &p : inode_map)
+    scour_func(p.second);
+  for (auto &p : snap_inode_map)
+    scour_func(p.second);
+}
+
+
+/**
+ * Create a placeholder inode for an (ino, last) pair during rejoin.
+ *
+ * The new inode is flagged STATE_REJOINUNDEF, added to the cache and
+ * recorded in rejoin_undef_inodes.
+ *
+ * @param ino inode number assigned to the placeholder
+ * @param last snapid passed to the CInode constructor
+ * @return the newly created inode
+ */
+CInode *MDCache::rejoin_invent_inode(inodeno_t ino, snapid_t last)
+{
+  CInode *invented = new CInode(this, true, 2, last);
+  invented->_get_inode()->ino = ino;
+  invented->state_set(CInode::STATE_REJOINUNDEF);
+
+  add_inode(invented);
+  rejoin_undef_inodes.insert(invented);
+
+  dout(10) << " invented " << *invented << dendl;
+  return invented;
+}
+
+/**
+ * Create (or open) a placeholder dirfrag during rejoin.
+ *
+ * The owning inode is invented first if it is not cached.  An inode that
+ * is not yet a directory must itself be an undefined placeholder; it is
+ * turned into a directory using the configured default dir hash.  The
+ * dirfrag is flagged STATE_REJOINUNDEF and recorded in
+ * rejoin_undef_dirfrags.
+ *
+ * @param df dirfrag to invent
+ * @return the dirfrag
+ */
+CDir *MDCache::rejoin_invent_dirfrag(dirfrag_t df)
+{
+  CInode *diri = get_inode(df.ino);
+  if (diri == nullptr)
+    diri = rejoin_invent_inode(df.ino, CEPH_NOSNAP);
+
+  if (!diri->is_dir()) {
+    // only a placeholder inode may be converted into a directory here
+    ceph_assert(diri->state_test(CInode::STATE_REJOINUNDEF));
+    diri->_get_inode()->mode = S_IFDIR;
+    diri->_get_inode()->dir_layout.dl_dir_hash = g_conf()->mds_default_dir_hash;
+  }
+
+  CDir *invented = diri->get_or_open_dirfrag(this, df.frag);
+  invented->state_set(CDir::STATE_REJOINUNDEF);
+  rejoin_undef_dirfrags.insert(invented);
+
+  dout(10) << " invented " << *invented << dendl;
+  return invented;
+}
+
+/*
+ * handle_cache_rejoin_strong
+ *
+ * process an OP_STRONG rejoin.  we must be in rejoin ourselves; if we
+ * merely want to be in rejoin the message is retried later, and any
+ * other state aborts.
+ *
+ * the sender is registered as a replica (with the nonces it reports) of
+ * every dirfrag, dentry and inode it lists.  objects we do not have are
+ * invented as REJOINUNDEF placeholders.  auth pins, xlocks and wrlocks
+ * the sender reports are re-established on peer requests, and dirty
+ * scatterlock state is assimilated.  finally the sender is removed from
+ * rejoin_gather, which may complete the gather.
+ */
+void MDCache::handle_cache_rejoin_strong(const cref_t<MMDSCacheRejoin> &strong)
+{
+  mds_rank_t from = mds_rank_t(strong->get_source().num());
+
+  // only a recovering node will get a strong rejoin.
+  if (!mds->is_rejoin()) {
+    if (mds->get_want_state() == MDSMap::STATE_REJOIN) {
+      mds->wait_for_rejoin(new C_MDS_RetryMessage(mds, strong));
+      return;
+    }
+    ceph_abort_msg("got unexpected rejoin message during recovery");
+  }
+
+  // assimilate any potentially dirty scatterlock state
+  for (const auto &p : strong->inode_scatterlocks) {
+    CInode *in = get_inode(p.first);
+    ceph_assert(in);
+    in->decode_lock_state(CEPH_LOCK_IFILE, p.second.file);
+    in->decode_lock_state(CEPH_LOCK_INEST, p.second.nest);
+    in->decode_lock_state(CEPH_LOCK_IDFT, p.second.dft);
+    rejoin_potential_updated_scatterlocks.insert(in);
+  }
+
+  rejoin_unlinked_inodes[from].clear();
+
+  // surviving peer may send incorrect dirfrag here (maybe they didn't
+  // get the fragment notify, or maybe we rolled back?).  we need to
+  // infer the right frag and get them with the program.  somehow.
+  // we don't normally send ACK.. so we'll need to bundle this with
+  // MISSING or something.
+
+  // strong dirfrags/dentries.
+  //  also process auth_pins, xlocks.
+  for (const auto &p : strong->strong_dirfrags) {
+    auto& dirfrag = p.first;
+    CInode *diri = get_inode(dirfrag.ino);
+    if (!diri)
+      diri = rejoin_invent_inode(dirfrag.ino, CEPH_NOSNAP);
+    CDir *dir = diri->get_dirfrag(dirfrag.frag);
+    bool refragged = false;
+    if (dir) {
+      dout(10) << " have " << *dir << dendl;
+    } else {
+      // an undefined inode gets a single root frag; otherwise the frag is
+      // only invented if it is a leaf of our dirfragtree.
+      if (diri->state_test(CInode::STATE_REJOINUNDEF))
+	dir = rejoin_invent_dirfrag(dirfrag_t(diri->ino(), frag_t()));
+      else if (diri->dirfragtree.is_leaf(dirfrag.frag))
+	dir = rejoin_invent_dirfrag(dirfrag);
+    }
+    if (dir) {
+      dir->add_replica(from, p.second.nonce);
+      dir->dir_rep = p.second.dir_rep;
+    } else {
+      // the sender's frag is not one of ours: register the sender on
+      // every frag of ours that it maps to.
+      dout(10) << " frag " << dirfrag << " doesn't match dirfragtree " << *diri << dendl;
+      frag_vec_t leaves;
+      diri->dirfragtree.get_leaves_under(dirfrag.frag, leaves);
+      if (leaves.empty())
+	leaves.push_back(diri->dirfragtree[dirfrag.frag.value()]);
+      dout(10) << " maps to frag(s) " << leaves << dendl;
+      for (const auto& leaf : leaves) {
+	CDir *leaf_dir = diri->get_dirfrag(leaf);
+	if (!leaf_dir)
+	  leaf_dir = rejoin_invent_dirfrag(dirfrag_t(diri->ino(), leaf));
+	else
+	  dout(10) << " have(approx) " << *leaf_dir << dendl;
+	leaf_dir->add_replica(from, p.second.nonce);
+	leaf_dir->dir_rep = p.second.dir_rep;
+      }
+      refragged = true;
+    }
+    
+    const auto it = strong->strong_dentries.find(dirfrag);
+    if (it != strong->strong_dentries.end()) {
+      const auto& dmap = it->second;
+
+      // the per-dirfrag authpin/xlock tables do not depend on the dentry,
+      // so look them up once here rather than once per dentry.
+      const auto pinned_it = strong->authpinned_dentries.find(dirfrag);
+      const auto xlocked_it = strong->xlocked_dentries.find(dirfrag);
+
+      for (const auto &q : dmap) {
+        const string_snap_t& ss = q.first;
+        const MMDSCacheRejoin::dn_strong& d = q.second;
+        CDentry *dn;
+        if (!refragged)
+	  dn = dir->lookup(ss.name, ss.snapid);
+        else {
+	  // after refragmentation each dentry name selects its own frag
+	  frag_t fg = diri->pick_dirfrag(ss.name);
+	  dir = diri->get_dirfrag(fg);
+	  ceph_assert(dir);
+	  dn = dir->lookup(ss.name, ss.snapid);
+        }
+        if (!dn) {
+	  if (d.is_remote()) {
+	    dn = dir->add_remote_dentry(ss.name, d.remote_ino, d.remote_d_type, mempool::mds_co::string(d.alternate_name), d.first, ss.snapid);
+	  } else if (d.is_null()) {
+	    dn = dir->add_null_dentry(ss.name, d.first, ss.snapid);
+	  } else {
+	    CInode *in = get_inode(d.ino, ss.snapid);
+	    if (!in) in = rejoin_invent_inode(d.ino, ss.snapid);
+	    dn = dir->add_primary_dentry(ss.name, in, mempool::mds_co::string(d.alternate_name), d.first, ss.snapid);
+	  }
+	  dout(10) << " invented " << *dn << dendl;
+        }
+        CDentry::linkage_t *dnl = dn->get_linkage();
+
+        // dn auth_pin?
+        if (pinned_it != strong->authpinned_dentries.end()) {
+          const auto peer_reqid_it = pinned_it->second.find(ss);
+          if (peer_reqid_it != pinned_it->second.end()) {
+            for (const auto &r : peer_reqid_it->second) {
+	      dout(10) << " dn authpin by " << r << " on " << *dn << dendl;
+
+	      // get/create peer mdrequest
+	      MDRequestRef mdr;
+	      if (have_request(r.reqid))
+	        mdr = request_get(r.reqid);
+	      else
+	        mdr = request_start_peer(r.reqid, r.attempt, strong);
+	      mdr->auth_pin(dn);
+            }
+          }
+	}
+
+        // dn xlock?
+        if (xlocked_it != strong->xlocked_dentries.end()) {
+          const auto ss_req_it = xlocked_it->second.find(ss);
+          if (ss_req_it != xlocked_it->second.end()) {
+	    const MMDSCacheRejoin::peer_reqid& r = ss_req_it->second;
+	    dout(10) << " dn xlock by " << r << " on " << *dn << dendl;
+	    MDRequestRef mdr = request_get(r.reqid);  // should have this from auth_pin above.
+	    ceph_assert(mdr->is_auth_pinned(dn));
+	    if (!mdr->is_xlocked(&dn->versionlock)) {
+	      ceph_assert(dn->versionlock.can_xlock_local());
+	      dn->versionlock.get_xlock(mdr, mdr->get_client());
+	      mdr->emplace_lock(&dn->versionlock, MutationImpl::LockOp::XLOCK);
+	    }
+	    if (dn->lock.is_stable())
+	      dn->auth_pin(&dn->lock);
+	    dn->lock.set_state(LOCK_XLOCK);
+	    dn->lock.get_xlock(mdr, mdr->get_client());
+	    mdr->emplace_lock(&dn->lock, MutationImpl::LockOp::XLOCK);
+          }
+        }
+
+        dn->add_replica(from, d.nonce);
+        dout(10) << " have " << *dn << dendl;
+
+        // compare our linkage with what the sender believes
+        if (dnl->is_primary()) {
+	  if (d.is_primary()) {
+	    if (vinodeno_t(d.ino, ss.snapid) != dnl->get_inode()->vino()) {
+	      // the survivor missed MDentryUnlink+MDentryLink messages ?
+	      ceph_assert(strong->strong_inodes.count(dnl->get_inode()->vino()) == 0);
+	      CInode *in = get_inode(d.ino, ss.snapid);
+	      ceph_assert(in);
+	      ceph_assert(in->get_parent_dn());
+	      rejoin_unlinked_inodes[from].insert(in);
+	      dout(7) << " sender has primary dentry but wrong inode" << dendl;
+	    }
+	  } else {
+	    // the survivor missed MDentryLink message ?
+	    ceph_assert(strong->strong_inodes.count(dnl->get_inode()->vino()) == 0);
+	    dout(7) << " sender doesn't have primay dentry" << dendl;
+	  }
+        } else {
+	  if (d.is_primary()) {
+	    // the survivor missed MDentryUnlink message ?
+	    CInode *in = get_inode(d.ino, ss.snapid);
+	    ceph_assert(in);
+	    ceph_assert(in->get_parent_dn());
+	    rejoin_unlinked_inodes[from].insert(in);
+	    dout(7) << " sender has primary dentry but we don't" << dendl;
+	  }
+        }
+      }
+    }
+  }
+
+  for (const auto &p : strong->strong_inodes) {
+    CInode *in = get_inode(p.first);
+    ceph_assert(in);
+    in->add_replica(from, p.second.nonce);
+    dout(10) << " have " << *in << dendl;
+
+    const MMDSCacheRejoin::inode_strong& is = p.second;
+
+    // caps_wanted
+    if (is.caps_wanted) {
+      in->set_mds_caps_wanted(from, is.caps_wanted);
+      dout(15) << " inode caps_wanted " << ccap_string(is.caps_wanted)
+	       << " on " << *in << dendl;
+    }
+
+    // scatterlocks?
+    //  infer state from replica state:
+    //   * go to MIX if they might have wrlocks
+    //   * go to LOCK if they are LOCK (just bc identify_files_to_recover might start twiddling filelock)
+    in->filelock.infer_state_from_strong_rejoin(is.filelock, !in->is_dir());  // maybe also go to LOCK
+    in->nestlock.infer_state_from_strong_rejoin(is.nestlock, false);
+    in->dirfragtreelock.infer_state_from_strong_rejoin(is.dftlock, false);
+
+    // auth pin?
+    const auto authpinned_inodes_it = strong->authpinned_inodes.find(in->vino());
+    if (authpinned_inodes_it != strong->authpinned_inodes.end()) {
+      for (const auto& r : authpinned_inodes_it->second) {
+	dout(10) << " inode authpin by " << r << " on " << *in << dendl;
+
+	// get/create peer mdrequest
+	MDRequestRef mdr;
+	if (have_request(r.reqid))
+	  mdr = request_get(r.reqid);
+	else
+	  mdr = request_start_peer(r.reqid, r.attempt, strong);
+	if (strong->frozen_authpin_inodes.count(in->vino())) {
+	  ceph_assert(!in->get_num_auth_pins());
+	  mdr->freeze_auth_pin(in);
+	} else {
+	  ceph_assert(!in->is_frozen_auth_pin());
+	}
+	mdr->auth_pin(in);
+      }
+    }
+    // xlock(s)?
+    const auto xlocked_inodes_it = strong->xlocked_inodes.find(in->vino());
+    if (xlocked_inodes_it != strong->xlocked_inodes.end()) {
+      for (const auto &q : xlocked_inodes_it->second) {
+	SimpleLock *lock = in->get_lock(q.first);
+	dout(10) << " inode xlock by " << q.second << " on " << *lock << " on " << *in << dendl;
+	MDRequestRef mdr = request_get(q.second.reqid);  // should have this from auth_pin above.
+	ceph_assert(mdr->is_auth_pinned(in));
+	if (!mdr->is_xlocked(&in->versionlock)) {
+	  ceph_assert(in->versionlock.can_xlock_local());
+	  in->versionlock.get_xlock(mdr, mdr->get_client());
+	  mdr->emplace_lock(&in->versionlock, MutationImpl::LockOp::XLOCK);
+	}
+	if (lock->is_stable())
+	  in->auth_pin(lock);
+	lock->set_state(LOCK_XLOCK);
+	if (lock == &in->filelock)
+	  in->loner_cap = -1;
+	lock->get_xlock(mdr, mdr->get_client());
+	mdr->emplace_lock(lock, MutationImpl::LockOp::XLOCK);
+      }
+    }
+  }
+  // wrlock(s)?
+  for (const auto &p : strong->wrlocked_inodes) {
+    CInode *in = get_inode(p.first);
+    // fail cleanly, like the other inode loops here, instead of
+    // dereferencing a null pointer below.
+    ceph_assert(in);
+    for (const auto &q : p.second) {
+      SimpleLock *lock = in->get_lock(q.first);
+      for (const auto &r : q.second) {
+	dout(10) << " inode wrlock by " << r << " on " << *lock << " on " << *in << dendl;
+	MDRequestRef mdr = request_get(r.reqid);  // should have this from auth_pin above.
+	if (in->is_auth())
+	  ceph_assert(mdr->is_auth_pinned(in));
+	lock->set_state(LOCK_MIX);
+	if (lock == &in->filelock)
+	  in->loner_cap = -1;
+	lock->get_wrlock(true);
+	mdr->emplace_lock(lock, MutationImpl::LockOp::WRLOCK);
+      }
+    }
+  }
+
+  // done?
+  ceph_assert(rejoin_gather.count(from));
+  rejoin_gather.erase(from);
+  if (rejoin_gather.empty() && rejoin_ack_gather.count(mds->get_nodeid())) {
+    rejoin_gather_finish();
+  } else {
+    dout(7) << "still need rejoin from (" << rejoin_gather << ")" << dendl;
+  }
+}
+
+/**
+ * handle_cache_rejoin_ack() -- process a cache rejoin ack from another MDS
+ *
+ * Applies the sender's view of dirfrags, dentries and inodes to our
+ * replicas: missing dirfrags/inodes are created as barebones placeholders,
+ * dentry linkage that disagrees with the ack is repaired, full dirfrag and
+ * inode bases and inode lock state are decoded, and cap export messages are
+ * sent to clients for the caps the sender reports as imported.  Finally the
+ * sender is removed from rejoin_ack_gather and, when nothing is left to
+ * gather, open_snaprealms() is kicked off (or, on a survivor, the rejoin
+ * waiters are queued).
+ *
+ * @param ack  the rejoin ack; its source is taken as the sending rank
+ */
+void MDCache::handle_cache_rejoin_ack(const cref_t<MMDSCacheRejoin> &ack)
+{
+  dout(7) << "handle_cache_rejoin_ack from " << ack->get_source() << dendl;
+  mds_rank_t from = mds_rank_t(ack->get_source().num());
+
+  ceph_assert(mds->get_state() >= MDSMap::STATE_REJOIN);
+  // we are a survivor if we are past the rejoin state ourselves
+  bool survivor = !mds->is_rejoin();
+
+  // for sending cache expire message
+  set<CInode*> isolated_inodes;
+  set<CInode*> refragged_inodes;
+  list<pair<CInode*,int> > updated_realms;
+
+  // dirs
+  for (const auto &p : ack->strong_dirfrags) {
+    // we may have had incorrect dir fragmentation; refragment based
+    // on what the auth tells us.
+    CDir *dir = get_dirfrag(p.first);
+    if (!dir) {
+      dir = get_force_dirfrag(p.first, false);
+      if (dir)
+	refragged_inodes.insert(dir->get_inode());
+    }
+    if (!dir) {
+      CInode *diri = get_inode(p.first.ino);
+      if (!diri) {
+	// barebones inode; the full inode loop below will clean up.
+	diri = new CInode(this, false);
+	auto _inode = diri->_get_inode();
+	_inode->ino = p.first.ino;
+	_inode->mode = S_IFDIR;
+	_inode->dir_layout.dl_dir_hash = g_conf()->mds_default_dir_hash;
+
+	add_inode(diri);
+	if (MDS_INO_MDSDIR(from) == p.first.ino) {
+	  diri->inode_auth = mds_authority_t(from, CDIR_AUTH_UNKNOWN);
+	  dout(10) << " add inode " << *diri << dendl;
+	} else {
+	  diri->inode_auth = CDIR_AUTH_DEFAULT;
+	  // remembered until a dentry below links it; asserted empty later
+	  isolated_inodes.insert(diri);
+	  dout(10) << " unconnected dirfrag " << p.first << dendl;
+	}
+      }
+      // barebones dirfrag; the full dirfrag loop below will clean up.
+      dir = diri->add_dirfrag(new CDir(diri, p.first.frag, this, false));
+      if (MDS_INO_MDSDIR(from) == p.first.ino ||
+	  (dir->authority() != CDIR_AUTH_UNDEF &&
+	   dir->authority().first != from))
+	adjust_subtree_auth(dir, from);
+      dout(10) << " add dirfrag " << *dir << dendl;
+    }
+
+    dir->set_replica_nonce(p.second.nonce);
+    dir->state_clear(CDir::STATE_REJOINING);
+    dout(10) << " got " << *dir << dendl;
+
+    // dentries
+    auto it = ack->strong_dentries.find(p.first);
+    if (it != ack->strong_dentries.end()) {
+      for (const auto &q : it->second) {
+        CDentry *dn = dir->lookup(q.first.name, q.first.snapid);
+        if(!dn)
+	  dn = dir->add_null_dentry(q.first.name, q.second.first, q.first.snapid);
+
+        CDentry::linkage_t *dnl = dn->get_linkage();
+
+        ceph_assert(dn->last == q.first.snapid);
+        if (dn->first != q.second.first) {
+	  dout(10) << " adjust dn.first " << dn->first << " -> " << q.second.first << " on " << *dn << dendl;
+	  dn->first = q.second.first;
+        }
+
+        // may have bad linkage if we missed dentry link/unlink messages
+        if (dnl->is_primary()) {
+	  CInode *in = dnl->get_inode();
+	  if (!q.second.is_primary() ||
+	      vinodeno_t(q.second.ino, q.first.snapid) != in->vino()) {
+	    dout(10) << " had bad linkage for " << *dn << ", unlinking " << *in << dendl;
+	    dir->unlink_inode(dn);
+	  }
+        } else if (dnl->is_remote()) {
+	  if (!q.second.is_remote() ||
+	      q.second.remote_ino != dnl->get_remote_ino() ||
+	      q.second.remote_d_type != dnl->get_remote_d_type()) {
+	    dout(10) << " had bad linkage for " << *dn <<  dendl;
+	    dir->unlink_inode(dn);
+	  }
+        } else {
+	  if (!q.second.is_null())
+	    dout(10) << " had bad linkage for " << *dn <<  dendl;
+        }
+
+	// hmm, did we have the proper linkage here?
+	if (dnl->is_null() && !q.second.is_null()) {
+	  if (q.second.is_remote()) {
+	    dn->dir->link_remote_inode(dn, q.second.remote_ino, q.second.remote_d_type);
+	  } else {
+	    CInode *in = get_inode(q.second.ino, q.first.snapid);
+	    if (!in) {
+	      // barebones inode; assume it's dir, the full inode loop below will clean up.
+	      in = new CInode(this, false, q.second.first, q.first.snapid);
+	      auto _inode = in->_get_inode();
+	      _inode->ino = q.second.ino;
+	      _inode->mode = S_IFDIR;
+	      _inode->dir_layout.dl_dir_hash = g_conf()->mds_default_dir_hash;
+	      add_inode(in);
+	      dout(10) << " add inode " << *in << dendl;
+	    } else if (in->get_parent_dn()) {
+	      dout(10) << " had bad linkage for " << *(in->get_parent_dn())
+		       << ", unlinking " << *in << dendl;
+	      in->get_parent_dir()->unlink_inode(in->get_parent_dn());
+	    }
+	    dn->dir->link_primary_inode(dn, in);
+	    // now connected to the hierarchy
+	    isolated_inodes.erase(in);
+	  }
+	}
+
+        dn->set_replica_nonce(q.second.nonce);
+        dn->lock.set_state_rejoin(q.second.lock, rejoin_waiters, survivor);
+        dn->state_clear(CDentry::STATE_REJOINING);
+        dout(10) << " got " << *dn << dendl;
+      }
+    }
+  }
+
+  // close non-auth dirfrags of refragmented inodes that the ack did not mention
+  for (const auto& in : refragged_inodes) {
+    auto&& ls = in->get_nested_dirfrags();
+    for (const auto& dir : ls) {
+      if (dir->is_auth() || ack->strong_dirfrags.count(dir->dirfrag()))
+	continue;
+      ceph_assert(dir->get_num_any() == 0);
+      in->close_dirfrag(dir->get_frag());
+    }
+  }
+
+  // full dirfrags
+  for (const auto &p : ack->dirfrag_bases) {
+    CDir *dir = get_dirfrag(p.first);
+    ceph_assert(dir);
+    auto q = p.second.cbegin();
+    dir->_decode_base(q);
+    dout(10) << " got dir replica " << *dir << dendl;
+  }
+
+  // full inodes
+  auto p = ack->inode_base.cbegin();
+  while (!p.end()) {
+    inodeno_t ino;
+    snapid_t last;
+    bufferlist basebl;
+    decode(ino, p);
+    decode(last, p);
+    decode(basebl, p);
+    CInode *in = get_inode(ino, last);
+    ceph_assert(in);
+    auto q = basebl.cbegin();
+    // remember the snaprealm seq so we can tell whether the base changed it
+    snapid_t sseq = 0;
+    if (in->snaprealm)
+      sseq = in->snaprealm->srnode.seq;
+    in->_decode_base(q);
+    if (in->snaprealm && in->snaprealm->srnode.seq != sseq) {
+      int snap_op = sseq > 0 ? CEPH_SNAP_OP_UPDATE : CEPH_SNAP_OP_SPLIT;
+      updated_realms.push_back(pair<CInode*,int>(in, snap_op));
+    }
+    dout(10) << " got inode base " << *in << dendl;
+  }
+
+  // inodes
+  p = ack->inode_locks.cbegin();
+  //dout(10) << "inode_locks len " << ack->inode_locks.length() << " is " << ack->inode_locks << dendl;
+  while (!p.end()) {
+    inodeno_t ino;
+    snapid_t last;
+    __u32 nonce;
+    bufferlist lockbl;
+    decode(ino, p);
+    decode(last, p);
+    decode(nonce, p);
+    decode(lockbl, p);
+    
+    CInode *in = get_inode(ino, last);
+    ceph_assert(in);
+    in->set_replica_nonce(nonce);
+    auto q = lockbl.cbegin();
+    in->_decode_locks_rejoin(q, rejoin_waiters, rejoin_eval_locks, survivor);
+    in->state_clear(CInode::STATE_REJOINING);
+    dout(10) << " got inode locks " << *in << dendl;
+  }
+
+  // FIXME: This can happen if entire subtree, together with the inode subtree root
+  // belongs to, were trimmed between sending cache rejoin and receiving rejoin ack.
+  ceph_assert(isolated_inodes.empty());
+
+  map<inodeno_t,map<client_t,Capability::Import> > peer_imported;
+  auto bp = ack->imported_caps.cbegin();
+  decode(peer_imported, bp);
+
+  // p: ino -> (client -> Import); every entry must match one of our cap_exports
+  for (map<inodeno_t,map<client_t,Capability::Import> >::iterator p = peer_imported.begin();
+       p != peer_imported.end();
+       ++p) {
+    auto& ex = cap_exports.at(p->first);
+    ceph_assert(ex.first == from);
+    for (map<client_t,Capability::Import>::iterator q = p->second.begin();
+	 q != p->second.end();
+	 ++q) {
+      auto r = ex.second.find(q->first);
+      ceph_assert(r != ex.second.end());
+
+      dout(10) << " exporting caps for client." << q->first << " ino " << p->first << dendl;
+      Session *session = mds->sessionmap.get_session(entity_name_t::CLIENT(q->first.v));
+      if (!session) {
+	// q->first is the client; p->first is the inode number
+	dout(10) << " no session for client." << q->first << dendl;
+	ex.second.erase(r);
+	continue;
+      }
+
+      // mark client caps stale.
+      auto m = make_message<MClientCaps>(CEPH_CAP_OP_EXPORT, p->first, 0,
+				       r->second.capinfo.cap_id, 0,
+                                       mds->get_osd_epoch_barrier());
+      m->set_cap_peer(q->second.cap_id, q->second.issue_seq, q->second.mseq,
+		      (q->second.cap_id > 0 ? from : -1), 0);
+      mds->send_message_client_counted(m, session);
+
+      ex.second.erase(r);
+    }
+    ceph_assert(ex.second.empty());
+  }
+
+  for (auto p : updated_realms) {
+    CInode *in = p.first;
+    bool notify_clients;
+    if (mds->is_rejoin()) {
+      // still rejoining: pin the inode and defer to open_snaprealms()
+      if (!rejoin_pending_snaprealms.count(in)) {
+	in->get(CInode::PIN_OPENINGSNAPPARENTS);
+	rejoin_pending_snaprealms.insert(in);
+      }
+      notify_clients = false;
+    } else {
+      // notify clients if I'm survivor
+      notify_clients = true;
+    }
+    do_realm_invalidate_and_update_notify(in, p.second, notify_clients);
+  }
+
+  // done?
+  ceph_assert(rejoin_ack_gather.count(from));
+  rejoin_ack_gather.erase(from);
+  if (!survivor) {
+    if (rejoin_gather.empty()) {
+      // eval unstable scatter locks after all wrlocks are rejoined.
+      while (!rejoin_eval_locks.empty()) {
+	SimpleLock *lock = rejoin_eval_locks.front();
+	rejoin_eval_locks.pop_front();
+	if (!lock->is_stable())
+	  mds->locker->eval_gather(lock);
+      }
+    }
+
+    if (rejoin_gather.empty() &&     // make sure we've gotten our FULL inodes, too.
+	rejoin_ack_gather.empty()) {
+      // finally, kickstart past snap parent opens
+      open_snaprealms();
+    } else {
+      dout(7) << "still need rejoin from (" << rejoin_gather << ")"
+	      << ", rejoin_ack from (" << rejoin_ack_gather << ")" << dendl;
+    }
+  } else {
+    // survivor.
+    mds->queue_waiters(rejoin_waiters);
+  }
+}
+
+/**
+ * rejoin_trim_undef_inodes() -- remove REJOINUNDEF flagged inodes
+ *
+ * Drains rejoin_undef_inodes.  For each inode the replica maps of the inode,
+ * its dirfrags and their dentries are cleared, the dentries are removed and
+ * the dirfrags closed; then either the parent dentry is removed or, when the
+ * inode has no parent dentry, the inode itself is removed from the cache.
+ *
+ * FIXME: wait, can this actually happen?  a survivor should generate cache trim
+ * messages that clean these guys up...
+ */
+void MDCache::rejoin_trim_undef_inodes()
+{
+  dout(10) << "rejoin_trim_undef_inodes" << dendl;
+
+  while (!rejoin_undef_inodes.empty()) {
+    set<CInode*>::iterator p = rejoin_undef_inodes.begin();
+    CInode *in = *p;
+    rejoin_undef_inodes.erase(p);
+
+    in->clear_replica_map();
+    
+    // close out dirfrags
+    if (in->is_dir()) {
+      // dfls is our own snapshot of the dirfrag list, so closing dirfrags
+      // while walking it is fine.
+      const auto&& dfls = in->get_dirfrags();
+      for (const auto& dir : dfls) {
+	dir->clear_replica_map();
+
+	// Step the iterator past the entry before removing it: remove_dentry()
+	// presumably erases dn from dir->items (TODO confirm against CDir),
+	// which would invalidate an iterator still pointing at it.
+	for (auto it = dir->items.begin(); it != dir->items.end(); ) {
+	  CDentry *dn = it->second;
+	  ++it;
+	  dn->clear_replica_map();
+
+	  dout(10) << " trimming " << *dn << dendl;
+	  dir->remove_dentry(dn);
+	}
+
+	dout(10) << " trimming " << *dir << dendl;
+	in->close_dirfrag(dir->dirfrag().frag);
+      }
+    }
+    
+    CDentry *dn = in->get_parent_dn();
+    if (dn) {
+      dn->clear_replica_map();
+      dout(10) << " trimming " << *dn << dendl;
+      dn->dir->remove_dentry(dn);
+    } else {
+      dout(10) << " trimming " << *in << dendl;
+      remove_inode(in);
+    }
+  }
+
+  ceph_assert(rejoin_undef_inodes.empty());
+}
+
+/*
+ * Called while in rejoin, with our own rank still in rejoin_ack_gather.
+ * Runs the post-gather steps in order; if either of the first two helpers
+ * returns true we stop here (process_imported_caps() returns true after
+ * starting asynchronous work whose completion re-enters this path).
+ */
+void MDCache::rejoin_gather_finish() 
+{
+  dout(10) << "rejoin_gather_finish" << dendl;
+  ceph_assert(mds->is_rejoin());
+  ceph_assert(rejoin_ack_gather.count(mds->get_nodeid()));
+
+  // short-circuit keeps the original order: caps are only processed once
+  // open_undef_inodes_dirfrags() reports nothing pending.
+  if (open_undef_inodes_dirfrags() || process_imported_caps())
+    return;
+
+  choose_lock_states_and_reconnect_caps();
+  identify_files_to_recover();
+  rejoin_send_acks();
+
+  // signal completion of fetches, rejoin_gather_finish, etc.
+  rejoin_ack_gather.erase(mds->get_nodeid());
+
+  // if our acks already arrived as well, finally open snaprealms
+  if (!rejoin_ack_gather.empty())
+    return;
+  open_snaprealms();
+}
+
+// Completion context: hands the result of an inode open back to
+// MDCache::rejoin_open_ino_finish() together with the inode number.
+class C_MDC_RejoinOpenInoFinish: public MDCacheContext {
+public:
+  C_MDC_RejoinOpenInoFinish(MDCache *cache, inodeno_t target)
+    : MDCacheContext(cache), ino(target) {}
+
+  void finish(int r) override {
+    mdcache->rejoin_open_ino_finish(ino, r);
+  }
+
+private:
+  inodeno_t ino;  // inode whose open this context completes
+};
+
+/*
+ * Completion of an open_ino() started by process_imported_caps().
+ *
+ *  ret < 0           -> remember the inode in cap_imports_missing
+ *  ret == our rank   -> the inode must now be in cache
+ *  ret == other rank -> export the pending cap imports for this inode to
+ *                       that rank and drop them from cap_imports
+ *
+ * When the last outstanding open finishes, the rejoin state machine is
+ * resumed via rejoin_gather_finish() or process_imported_caps().
+ */
+void MDCache::rejoin_open_ino_finish(inodeno_t ino, int ret)
+{
+  // label the log line with the real function name (as
+  // rejoin_prefetch_ino_finish does) rather than a stale one
+  dout(10) << __func__ << " ino " << ino << " ret " << ret << dendl;
+
+  if (ret < 0) {
+    cap_imports_missing.insert(ino);
+  } else if (ret == mds->get_nodeid()) {
+    ceph_assert(get_inode(ino));
+  } else {
+    auto p = cap_imports.find(ino);
+    ceph_assert(p != cap_imports.end());
+    for (auto q = p->second.begin(); q != p->second.end(); ++q) {
+      // each client is expected to have exactly one entry, keyed MDS_RANK_NONE
+      ceph_assert(q->second.count(MDS_RANK_NONE));
+      ceph_assert(q->second.size() == 1);
+      rejoin_export_caps(p->first, q->first, q->second[MDS_RANK_NONE], ret);
+    }
+    cap_imports.erase(p);
+  }
+
+  ceph_assert(cap_imports_num_opening > 0);
+  cap_imports_num_opening--;
+
+  if (cap_imports_num_opening == 0) {
+    if (rejoin_gather.empty() && rejoin_ack_gather.count(mds->get_nodeid()))
+      rejoin_gather_finish();
+    else if (rejoin_gather.count(mds->get_nodeid()))
+      process_imported_caps();
+  }
+}
+
+// Log completion context: carries the session map filled in when the
+// force-open was prepared and passes it to
+// MDCache::rejoin_open_sessions_finish() once the journal entry is done.
+class C_MDC_RejoinSessionsOpened : public MDCacheLogContext {
+public:
+  // populated by the creator before the context is submitted
+  map<client_t,pair<Session*,uint64_t> > session_map;
+
+  C_MDC_RejoinSessionsOpened(MDCache *cache) : MDCacheLogContext(cache) {}
+
+  void finish(int r) override {
+    ceph_assert(r == 0);
+    mdcache->rejoin_open_sessions_finish(session_map);
+  }
+};
+
+/*
+ * The forced-open sessions are journaled: finish opening them, take over
+ * the session map (the caller's map receives our previous contents via
+ * swap), and resume rejoin_gather_finish() if nothing else is pending.
+ */
+void MDCache::rejoin_open_sessions_finish(map<client_t,pair<Session*,uint64_t> >& session_map)
+{
+  dout(10) << "rejoin_open_sessions_finish" << dendl;
+  mds->server->finish_force_open_sessions(session_map);
+  rejoin_session_map.swap(session_map);
+
+  if (!rejoin_gather.empty())
+    return;
+  if (!rejoin_ack_gather.count(mds->get_nodeid()))
+    return;
+  rejoin_gather_finish();
+}
+
+/*
+ * Result of a prefetch for `ino`.  Only acts when the inode still has
+ * pending cap imports: a negative result marks it missing, a result naming
+ * another rank exports the pending caps there, and a result naming our own
+ * rank leaves the entry untouched.
+ */
+void MDCache::rejoin_prefetch_ino_finish(inodeno_t ino, int ret)
+{
+  auto imp = cap_imports.find(ino);
+  if (imp == cap_imports.end())
+    return;
+
+  dout(10) << __func__ << " ino " << ino << " ret " << ret << dendl;
+  if (ret < 0) {
+    cap_imports_missing.insert(ino);
+    return;
+  }
+  if (ret == mds->get_nodeid())
+    return;
+
+  // another rank holds the inode: hand the caps over to it
+  for (auto& by_client : imp->second) {
+    ceph_assert(by_client.second.count(MDS_RANK_NONE));
+    ceph_assert(by_client.second.size() == 1);
+    rejoin_export_caps(imp->first, by_client.first, by_client.second[MDS_RANK_NONE], ret);
+  }
+  cap_imports.erase(imp);
+}
+
+/**
+ * process_imported_caps() -- open the inodes named by pending cap imports
+ * and, once they are in cache, reconnect/import the client caps.
+ *
+ * The function is re-entered from completion callbacks (the open file table
+ * prefetch lambda below, rejoin_open_ino_finish(), and, through
+ * rejoin_gather_finish(), rejoin_open_sessions_finish()).
+ *
+ * @return true if asynchronous work was started (prefetch, inode opens, or
+ *         the session-open journal entry) and processing will resume from a
+ *         callback; false if this pass ran to completion.
+ */
+bool MDCache::process_imported_caps()
+{
+  dout(10) << "process_imported_caps" << dendl;
+
+  // wait for the open file table prefetch first; the callback re-enters us
+  if (!open_file_table.is_prefetched() &&
+      open_file_table.prefetch_inodes()) {
+    open_file_table.wait_for_prefetch(
+	new MDSInternalContextWrapper(mds,
+	  new LambdaContext([this](int r) {
+	    ceph_assert(rejoin_gather.count(mds->get_nodeid()));
+	    process_imported_caps();
+	    })
+	  )
+	);
+    return true;
+  }
+
+  // kick off an open_ino() for every import whose inode is not in cache and
+  // has not already been recorded as missing
+  for (auto& p : cap_imports) {
+    CInode *in = get_inode(p.first);
+    if (in) {
+      ceph_assert(in->is_auth());
+      cap_imports_missing.erase(p.first);
+      continue;
+    }
+    if (cap_imports_missing.count(p.first) > 0)
+      continue;
+
+    // look for a reconnect record whose path is a single component relative
+    // to pathbase; it is passed to open_ino() as an ancestor entry
+    uint64_t parent_ino = 0;
+    std::string_view d_name;
+    for (auto& q : p.second) {
+      for (auto& r : q.second) {
+	auto &icr = r.second;
+	if (icr.capinfo.pathbase &&
+	    icr.path.length() > 0 &&
+	    icr.path.find('/') == string::npos) {
+	  parent_ino = icr.capinfo.pathbase;
+	  d_name = icr.path;
+	  break;
+	}
+      }
+      if (parent_ino)
+	break;
+    }
+
+    dout(10) << "  opening missing ino " << p.first << dendl;
+    cap_imports_num_opening++;
+    auto fin = new C_MDC_RejoinOpenInoFinish(this, p.first);
+    if (parent_ino) {
+      vector<inode_backpointer_t> ancestors;
+      ancestors.push_back(inode_backpointer_t(parent_ino, string{d_name}, 0));
+      open_ino(p.first, (int64_t)-1, fin, false, false, &ancestors);
+    } else {
+      open_ino(p.first, (int64_t)-1, fin, false);
+    }
+    // reset the heartbeat periodically while queueing many opens
+    if (!(cap_imports_num_opening % mds->heartbeat_reset_grace()))
+      mds->heartbeat_reset();
+  }
+
+  // rejoin_open_ino_finish() resumes us when the last open completes
+  if (cap_imports_num_opening > 0)
+    return true;
+
+  // called by rejoin_gather_finish() ?
+  if (rejoin_gather.count(mds->get_nodeid()) == 0) {
+    // journal forced-open sessions for reconnecting clients once; the
+    // completion fills rejoin_session_map and calls rejoin_gather_finish()
+    if (!rejoin_client_map.empty() &&
+	rejoin_session_map.empty()) {
+      C_MDC_RejoinSessionsOpened *finish = new C_MDC_RejoinSessionsOpened(this);
+      version_t pv = mds->server->prepare_force_open_sessions(rejoin_client_map,
+							      rejoin_client_metadata_map,
+							      finish->session_map);
+      ESessions *le = new ESessions(pv, std::move(rejoin_client_map),
+				    std::move(rejoin_client_metadata_map));
+      mds->mdlog->start_submit_entry(le, finish);
+      mds->mdlog->flush();
+      // the maps were moved from above; clear() leaves them in a known empty state
+      rejoin_client_map.clear();
+      rejoin_client_metadata_map.clear();
+      return true;
+    }
+
+    // process caps that were exported by peer rename
+    for (map<inodeno_t,pair<mds_rank_t,map<client_t,Capability::Export> > >::iterator p = rejoin_peer_exports.begin();
+	 p != rejoin_peer_exports.end();
+	 ++p) {
+      CInode *in = get_inode(p->first);
+      ceph_assert(in);
+      for (map<client_t,Capability::Export>::iterator q = p->second.second.begin();
+	   q != p->second.second.end();
+	   ++q) {
+	// skip clients for which no session was opened
+	auto r = rejoin_session_map.find(q->first);
+	if (r == rejoin_session_map.end())
+	  continue;
+
+	Session *session = r->second.first;
+	Capability *cap = in->get_client_cap(q->first);
+	if (!cap) {
+	  cap = in->add_client_cap(q->first, session);
+	  // add empty item to reconnected_caps
+	  (void)reconnected_caps[p->first][q->first];
+	}
+	cap->merge(q->second, true);
+
+	Capability::Import& im = rejoin_imported_caps[p->second.first][p->first][q->first];
+	ceph_assert(cap->get_last_seq() == im.issue_seq);
+	ceph_assert(cap->get_mseq() == im.mseq);
+	cap->set_cap_id(im.cap_id);
+	// send cap import because we assigned a new cap ID
+	do_cap_import(session, in, cap, q->second.cap_id, q->second.seq, q->second.mseq - 1,
+		      p->second.first, CEPH_CAP_FLAG_AUTH);
+      }
+    }
+    rejoin_peer_exports.clear();
+    rejoin_imported_caps.clear();
+
+    // process cap imports
+    //  ino -> client -> frommds -> capex
+    for (auto p = cap_imports.begin(); p != cap_imports.end(); ) {
+      CInode *in = get_inode(p->first);
+      if (!in) {
+	dout(10) << " still missing ino " << p->first
+	         << ", will try again after replayed client requests" << dendl;
+	++p;
+	continue;
+      }
+      ceph_assert(in->is_auth());
+      for (auto q = p->second.begin(); q != p->second.end(); ++q) {
+	Session *session;
+	{
+	  auto r = rejoin_session_map.find(q->first);
+	  session = (r != rejoin_session_map.end() ? r->second.first : nullptr);
+	}
+
+	for (auto r = q->second.begin(); r != q->second.end(); ++r) {
+	  if (!session) {
+	    // no session: for caps coming from a real rank (r->first >= 0)
+	    // still create a zeroed Import entry
+	    if (r->first >= 0)
+	      (void)rejoin_imported_caps[r->first][p->first][q->first]; // all are zero
+	    continue;
+	  }
+
+	  Capability *cap = in->reconnect_cap(q->first, r->second, session);
+	  add_reconnected_cap(q->first, in->ino(), r->second);
+	  if (r->first >= 0) {
+	    if (cap->get_last_seq() == 0) // don't increase mseq if cap already exists
+	      cap->inc_mseq();
+	    do_cap_import(session, in, cap, r->second.capinfo.cap_id, 0, 0, r->first, 0);
+
+	    // record what we assigned so it can be reported back to r->first
+	    Capability::Import& im = rejoin_imported_caps[r->first][p->first][q->first];
+	    im.cap_id = cap->get_cap_id();
+	    im.issue_seq = cap->get_last_seq();
+	    im.mseq = cap->get_mseq();
+	  }
+	}
+      }
+      cap_imports.erase(p++);  // remove and move on
+    }
+  } else {
+    // our own rank is still in rejoin_gather: trim non-auth metadata, drop
+    // ourselves from the gather set and send any pending rejoins
+    trim_non_auth();
+
+    ceph_assert(rejoin_gather.count(mds->get_nodeid()));
+    rejoin_gather.erase(mds->get_nodeid());
+    ceph_assert(!rejoin_ack_gather.count(mds->get_nodeid()));
+    maybe_send_pending_rejoins();
+  }
+  return false;
+}
+
+/*
+ * Re-register pending snapflushes from `client` for the snapped inodes of
+ * head_in that lie after snap_follows.  For every snapped inode returned by
+ * pick_inode_snap(), each realm snapshot it covers is added via
+ * add_need_snapflush(); inodes that gained at least one entry get the client
+ * recorded in client_snap_caps and, if they had none before, their cinode
+ * locks set to LOCK_SNAP_SYNC and wrlocked.
+ */
+void MDCache::rebuild_need_snapflush(CInode *head_in, SnapRealm *realm,
+				     client_t client, snapid_t snap_follows)
+{
+  dout(10) << "rebuild_need_snapflush " << snap_follows << " on " << *head_in << dendl;
+
+  // nothing to do when the realm has no snaps in the affected range
+  if (!realm->has_snaps_in_range(snap_follows + 1, head_in->first - 1))
+    return;
+
+  const set<snapid_t>& snaps = realm->get_snaps();
+  snapid_t follows = snap_follows;
+
+  // walk snapped inodes until pick_inode_snap() hands back the head itself
+  for (CInode *snapin = pick_inode_snap(head_in, follows);
+       snapin != head_in;
+       snapin = pick_inode_snap(head_in, follows)) {
+    bool registered = false;
+    auto sp = snaps.lower_bound(std::max<snapid_t>(snapin->first, (follows + 1)));
+    while (sp != snaps.end() && *sp <= snapin->last) {
+      head_in->add_need_snapflush(snapin, *sp, client);
+      registered = true;
+      ++sp;
+    }
+    follows = snapin->last;
+    if (!registered)
+      continue;
+
+    dout(10) << " need snapflush from client." << client << " on " << *snapin << dendl;
+
+    // first client on this snapped inode: take the locks
+    if (snapin->client_snap_caps.empty()) {
+      for (int idx = 0; idx < num_cinode_locks; ++idx) {
+	SimpleLock *lock = snapin->get_lock(cinode_lock_info[idx].lock);
+	ceph_assert(lock);
+	snapin->auth_pin(lock);
+	lock->set_state(LOCK_SNAP_SYNC);
+	lock->get_wrlock(true);
+      }
+    }
+    snapin->client_snap_caps.insert(client);
+    mds->locker->mark_need_snapflush_inode(snapin);
+  }
+}
+
+/*
+ * For every head inode in the cache: re-mark dirty rstat where needed,
+ * choose lock states from the dirty caps clients reconnected with, and
+ * queue inodes that have a snaprealm for open_snaprealms().
+ */
+void MDCache::choose_lock_states_and_reconnect_caps()
+{
+  dout(10) << "choose_lock_states_and_reconnect_caps" << dendl;
+
+  int count = 0;
+  for (const auto& entry : inode_map) {
+    CInode *in = entry.second;
+    // only head inodes are considered
+    if (in->last != CEPH_NOSNAP)
+      continue;
+
+    if (in->is_auth() && !in->is_base() && in->get_inode()->is_dirty_rstat())
+      in->mark_dirty_rstat();
+
+    // union of the dirty caps of all clients that reconnected on this inode
+    int dirty_caps = 0;
+    auto rc = reconnected_caps.find(in->ino());
+    if (rc != reconnected_caps.end()) {
+      for (const auto &by_client : rc->second)
+	dirty_caps |= by_client.second.dirty_caps;
+    }
+    in->choose_lock_states(dirty_caps);
+    dout(15) << " chose lock states on " << *in << dendl;
+
+    if (in->snaprealm && !rejoin_pending_snaprealms.count(in)) {
+      in->get(CInode::PIN_OPENINGSNAPPARENTS);
+      rejoin_pending_snaprealms.insert(in);
+    }
+
+    // reset the heartbeat periodically during a long walk
+    ++count;
+    if (count % mds->heartbeat_reset_grace() == 0)
+      mds->heartbeat_reset();
+  }
+}
+
+/*
+ * Add `ino` to the split message queued for `client` in `splits`, creating
+ * the message (with the realm's snap trace and open child realms) when the
+ * client has none yet.
+ */
+void MDCache::prepare_realm_split(SnapRealm *realm, client_t client, inodeno_t ino,
+				  map<client_t,ref_t<MClientSnap>>& splits)
+{
+  auto existing = splits.find(client);
+  if (existing != splits.end()) {
+    // reuse the queued message, forcing its op to SPLIT
+    ref_t<MClientSnap> snap = existing->second;
+    snap->head.op = CEPH_SNAP_OP_SPLIT;
+    snap->split_inos.push_back(ino);
+    return;
+  }
+
+  ref_t<MClientSnap> snap = make_message<MClientSnap>(CEPH_SNAP_OP_SPLIT);
+  splits.emplace(std::piecewise_construct, std::forward_as_tuple(client), std::forward_as_tuple(snap));
+  snap->head.split = realm->inode->ino();
+  snap->bl = realm->get_snap_trace();
+  for (const auto& child : realm->open_children)
+    snap->split_realms.push_back(child->inode->ino());
+
+  snap->split_inos.push_back(ino);
+}
+
+/*
+ * For each client holding caps in `realm` that has no message queued in
+ * `splits` yet, queue a SPLIT message against parent_realm that lists the
+ * realm's inodes-with-caps and open child realms.  Clients that already
+ * have a queued message are left untouched.
+ */
+void MDCache::prepare_realm_merge(SnapRealm *realm, SnapRealm *parent_realm,
+				  map<client_t,ref_t<MClientSnap>>& splits)
+{
+  ceph_assert(parent_realm);
+
+  // collect the inode and child-realm lists once; every message shares them
+  vector<inodeno_t> split_inos;
+  for (auto ci = realm->inodes_with_caps.begin(); !ci.end(); ++ci)
+    split_inos.push_back((*ci)->ino());
+
+  vector<inodeno_t> split_realms;
+  for (SnapRealm *child : realm->open_children)
+    split_realms.push_back(child->inode->ino());
+
+  for (const auto& cc : realm->client_caps) {
+    ceph_assert(!cc.second->empty());
+    auto [slot, inserted] = splits.emplace(std::piecewise_construct, std::forward_as_tuple(cc.first), std::forward_as_tuple());
+    if (!inserted)
+      continue;
+
+    auto update = make_message<MClientSnap>(CEPH_SNAP_OP_SPLIT);
+    update->head.split = parent_realm->inode->ino();
+    update->split_inos = split_inos;
+    update->split_realms = split_realms;
+    update->bl = parent_realm->get_snap_trace();
+    slot->second = std::move(update);
+  }
+}
+
+/*
+ * Send each queued snap message to its client if the client has a session,
+ * then empty `splits`.  Messages for clients without a session are dropped.
+ */
+void MDCache::send_snaps(map<client_t,ref_t<MClientSnap>>& splits)
+{
+  dout(10) << "send_snaps" << dendl;
+
+  for (auto &entry : splits) {
+    Session *session = mds->sessionmap.get_session(entity_name_t::CLIENT(entry.first.v));
+    if (!session) {
+      dout(10) << " no session for client." << entry.first << dendl;
+      continue;
+    }
+    dout(10) << " client." << entry.first
+	     << " split " << entry.second->head.split
+	     << " inos " << entry.second->split_inos
+	     << dendl;
+    mds->send_message_client_counted(entry.second, session);
+  }
+  splits.clear();
+}
+
+
+/*
+ * prune the open_files list of every log segment: head inodes are
+ * always unlisted, snapped inodes only once no client snap caps remain
+ * on them.
+ */
+void MDCache::clean_open_file_lists()
+{
+  dout(10) << "clean_open_file_lists" << dendl;
+
+  for (auto& seg : mds->mdlog->segments) {
+    LogSegment *ls = seg.second;
+
+    auto q = ls->open_files.begin(member_offset(CInode, item_open_file));
+    while (!q.end()) {
+      CInode *in = *q;
+      // advance first: the current item may be unlinked from the list below
+      ++q;
+      const bool is_head = (in->last == CEPH_NOSNAP);
+      if (is_head) {
+	dout(10) << " unlisting unwanted/capless inode " << *in << dendl;
+	in->item_open_file.remove_myself();
+      } else if (in->client_snap_caps.empty()) {
+	dout(10) << " unlisting flushed snap inode " << *in << dendl;
+	in->item_open_file.remove_myself();
+      }
+    }
+  }
+}
+
+/*
+ * Dump, as an "openfiles" array, every inode on a log segment's open_files
+ * list that is still of interest: head inodes with any caps wanted, and
+ * snapped inodes that still have client snap caps.
+ */
+void MDCache::dump_openfiles(Formatter *f)
+{
+  f->open_array_section("openfiles");
+  for (auto& seg : mds->mdlog->segments) {
+    LogSegment *ls = seg.second;
+
+    auto q = ls->open_files.begin(member_offset(CInode, item_open_file));
+    while (!q.end()) {
+      CInode *in = *q;
+      ++q;
+
+      const bool is_head = (in->last == CEPH_NOSNAP);
+      const bool interesting =
+	is_head ? in->is_any_caps_wanted() : !in->client_snap_caps.empty();
+      if (!interesting)
+	continue;
+
+      f->open_object_section("file");
+      in->dump(f, CInode::DUMP_PATH | CInode::DUMP_INODE_STORE_BASE | CInode::DUMP_CAPS);
+      f->close_section();
+    }
+  }
+  f->close_section();
+}
+
+/*
+ * Reconnect a client's cap on `in` from the reconnect record `icr`.  When
+ * the cap comes from another rank (frommds >= 0) a cap import is also sent
+ * to the client.
+ *
+ * Returns the reconnected cap, or nullptr if the client has no session.
+ */
+Capability* MDCache::rejoin_import_cap(CInode *in, client_t client, const cap_reconnect_t& icr, mds_rank_t frommds)
+{
+  dout(10) << "rejoin_import_cap for client." << client << " from mds." << frommds
+	   << " on " << *in << dendl;
+
+  Session *session = mds->sessionmap.get_session(entity_name_t::CLIENT(client.v));
+  if (session == nullptr) {
+    dout(10) << " no session for client." << client << dendl;
+    return nullptr;
+  }
+
+  Capability *cap = in->reconnect_cap(client, icr, session);
+  if (frommds < 0)
+    return cap;
+
+  // don't increase mseq if cap already exists
+  if (cap->get_last_seq() == 0)
+    cap->inc_mseq();
+  do_cap_import(session, in, cap, icr.capinfo.cap_id, 0, 0, frommds, 0);
+  return cap;
+}
+
+/*
+ * Give up on the cap imports that are still pending: send each client with
+ * a session a cap EXPORT message for the inode, wake all cap reconnect
+ * waiters, clear both tables and log a cluster warning listing the inodes.
+ */
+void MDCache::export_remaining_imported_caps()
+{
+  dout(10) << "export_remaining_imported_caps" << dendl;
+
+  CachedStackStringStream css;
+
+  int count = 0;
+  for (const auto& imp : cap_imports) {
+    *css << " ino " << imp.first << "\n";
+    for (const auto& by_client : imp.second) {
+      Session *session = mds->sessionmap.get_session(entity_name_t::CLIENT(by_client.first.v));
+      if (!session)
+	continue;
+
+      // mark client caps stale.
+      auto stale = make_message<MClientCaps>(CEPH_CAP_OP_EXPORT, imp.first,
+					     0, 0, 0,
+					     mds->get_osd_epoch_barrier());
+      stale->set_cap_peer(0, 0, 0, -1, 0);
+      mds->send_message_client_counted(stale, by_client.first);
+    }
+
+    ++count;
+    if (count % mds->heartbeat_reset_grace() == 0)
+      mds->heartbeat_reset();
+  }
+
+  for (auto& waiting : cap_reconnect_waiters)
+    mds->queue_waiters(waiting.second);
+
+  cap_imports.clear();
+  cap_reconnect_waiters.clear();
+
+  if (css->strv().length()) {
+    mds->clog->warn() << "failed to reconnect caps for missing inodes:"
+                      << css->strv();
+  }
+}
+
+/*
+ * If the session's client has a replayed cap reconnect record for `in`,
+ * reconnect the cap, consume the record, re-evaluate or choose lock states
+ * and wake anything waiting for the reconnect on this inode.
+ *
+ * Returns the reconnected cap, or nullptr when there was no record.
+ */
+Capability* MDCache::try_reconnect_cap(CInode *in, Session *session)
+{
+  client_t client = session->info.get_client();
+  const cap_reconnect_t *rc = get_replay_cap_reconnect(in->ino(), client);
+  if (!rc)
+    return nullptr;
+
+  Capability *cap = in->reconnect_cap(client, *rc, session);
+  dout(10) << "try_reconnect_cap client." << client
+	   << " reconnect wanted " << ccap_string(rc->capinfo.wanted)
+	   << " issue " << ccap_string(rc->capinfo.issued)
+	   << " on " << *in << dendl;
+  remove_replay_cap_reconnect(in->ino(), client);
+
+  if (in->is_replicated()) {
+    mds->locker->try_eval(in, CEPH_CAP_LOCKS);
+  } else {
+    // not replicated: pick lock states from this client's dirty caps
+    int dirty_caps = 0;
+    auto by_ino = reconnected_caps.find(in->ino());
+    if (by_ino != reconnected_caps.end()) {
+      auto by_client = by_ino->second.find(client);
+      if (by_client != by_ino->second.end())
+	dirty_caps = by_client->second.dirty_caps;
+    }
+    in->choose_lock_states(dirty_caps);
+    dout(15) << " chose lock states on " << *in << dendl;
+  }
+
+  auto waiting = cap_reconnect_waiters.find(in->ino());
+  if (waiting != cap_reconnect_waiters.end()) {
+    mds->queue_waiters(waiting->second);
+    cap_reconnect_waiters.erase(waiting);
+  }
+  return cap;
+}
+
+
+
+// -------
+// cap imports and delayed snap parent opens
+
+/*
+ * Send the client a CEPH_CAP_OP_IMPORT message for `cap` on `in`.  The
+ * cap's issue bookkeeping is refreshed first; the message carries the
+ * inode's snaprealm trace and the peer cap details (p_cap_id, p_seq,
+ * p_mseq, peer, p_flags).
+ */
+void MDCache::do_cap_import(Session *session, CInode *in, Capability *cap,
+			    uint64_t p_cap_id, ceph_seq_t p_seq, ceph_seq_t p_mseq,
+			    int peer, int p_flags)
+{
+  SnapRealm *realm = in->find_snaprealm();
+  dout(10) << "do_cap_import " << session->info.inst.name << " mseq " << cap->get_mseq() << " on " << *in << dendl;
+
+  // reconnected cap
+  if (cap->get_last_seq() == 0)
+    cap->inc_last_seq();
+  cap->set_last_issue();
+  cap->set_last_issue_stamp(ceph_clock_now());
+  cap->clear_new();
+
+  auto import_msg = make_message<MClientCaps>(CEPH_CAP_OP_IMPORT,
+					      in->ino(), realm->inode->ino(), cap->get_cap_id(),
+					      cap->get_last_seq(), cap->pending(), cap->wanted(),
+					      0, cap->get_mseq(), mds->get_osd_epoch_barrier());
+  in->encode_cap_message(import_msg, cap);
+  import_msg->snapbl = realm->get_snap_trace();
+  import_msg->set_cap_peer(p_cap_id, p_seq, p_mseq, peer, p_flags);
+  mds->send_message_client_counted(import_msg, session);
+}
+
+/*
+ * Nothing is actually imported here: the function only logs and asserts
+ * that no delayed cap imports are outstanding.
+ */
+void MDCache::do_delayed_cap_imports()
+{
+  dout(10) << "do_delayed_cap_imports" << dendl;
+  ceph_assert(delayed_imported_caps.empty());
+}
+
+// Completion context that (re)runs MDCache::open_snaprealms(); the result
+// code passed to finish() is ignored.
+struct C_MDC_OpenSnapRealms : public MDCacheContext {
+  explicit C_MDC_OpenSnapRealms(MDCache *cache)
+    : MDCacheContext(cache) {}
+
+  void finish(int r) override {
+    mdcache->open_snaprealms();
+  }
+};
+
+/*
+ * Final rejoin step.  For every inode queued in rejoin_pending_snaprealms:
+ * finish the client snaprealm reconnects recorded for it, rebuild snapflush
+ * state for the reconnected caps in its realm, queue realm splits for caps
+ * that reconnected against a different realm, and send the resulting snap
+ * messages.  Afterwards clients are notified of the global snaprealm update,
+ * any snaprealm reconnects left unmatched are logged, and rejoin_done is
+ * completed.
+ */
+void MDCache::open_snaprealms()
+{
+  dout(10) << "open_snaprealms" << dendl;
+  
+  auto it = rejoin_pending_snaprealms.begin();
+  while (it != rejoin_pending_snaprealms.end()) {
+    CInode *in = *it;
+    SnapRealm *realm = in->snaprealm;
+    ceph_assert(realm);
+
+    map<client_t,ref_t<MClientSnap>> splits;
+    // finish off client snaprealm reconnects?
+    auto q = reconnected_snaprealms.find(in->ino());
+    if (q != reconnected_snaprealms.end()) {
+      for (const auto& r : q->second)
+	finish_snaprealm_reconnect(r.first, realm, r.second, splits);
+      reconnected_snaprealms.erase(q);
+    }
+
+    for (auto p = realm->inodes_with_caps.begin(); !p.end(); ++p) {
+      CInode *child = *p;
+      auto q = reconnected_caps.find(child->ino());
+      ceph_assert(q != reconnected_caps.end());
+      for (auto r = q->second.begin(); r != q->second.end(); ++r) {
+	Capability *cap = child->get_client_cap(r->first);
+	if (!cap)
+	  continue;
+	if (r->second.snap_follows > 0) {
+	  if (r->second.snap_follows < child->first - 1) {
+	    rebuild_need_snapflush(child, realm, r->first, r->second.snap_follows);
+	  } else if (r->second.snapflush) {
+	    // When processing a cap flush message that is re-sent, it's possible
+	    // that the sender has already released all WR caps. So we should
+	    // force MDCache::cow_inode() to setup CInode::client_need_snapflush.
+	    cap->mark_needsnapflush();
+	  }
+	}
+	// make sure client's cap is in the correct snaprealm.
+	if (r->second.realm_ino != in->ino()) {
+	  prepare_realm_split(realm, r->first, child->ino(), splits);
+	}
+      }
+    }
+
+    // step the iterator while erasing, then drop the pin taken when the
+    // inode was queued
+    rejoin_pending_snaprealms.erase(it++);
+    in->put(CInode::PIN_OPENINGSNAPPARENTS);
+
+    send_snaps(splits);
+  }
+
+  notify_global_snaprealm_update(CEPH_SNAP_OP_UPDATE);
+
+  if (!reconnected_snaprealms.empty()) {
+    dout(5) << "open_snaprealms has unconnected snaprealm:" << dendl;
+    for (auto& p : reconnected_snaprealms) {
+      CachedStackStringStream css;
+      *css << " " << p.first << " {";
+      bool first = true;
+      for (auto& q : p.second) {
+        if (!first)
+          *css << ", ";
+        // clear the flag so later entries are separated by ", "
+        first = false;
+        *css << "client." << q.first << "/" << q.second;
+      }
+      *css << "}";
+      dout(5) << css->strv() << dendl;
+    }
+  }
+  ceph_assert(rejoin_waiters.empty());
+  ceph_assert(rejoin_pending_snaprealms.empty());
+  dout(10) << "open_snaprealms - all open" << dendl;
+  do_delayed_cap_imports();
+
+  ceph_assert(rejoin_done);
+  rejoin_done.release()->complete(0);
+  reconnected_caps.clear();
+}
+
+/*
+ * Fetch every dirfrag needed to resolve the inodes and dirfrags recorded in
+ * rejoin_undef_inodes / rejoin_undef_dirfrags.
+ *
+ * Returns false when there is nothing to fetch; otherwise starts the fetches
+ * and returns true.  When all fetches finish, rejoin_gather_finish() is
+ * called if rejoin_gather is empty and this rank is still present in
+ * rejoin_ack_gather.
+ */
+bool MDCache::open_undef_inodes_dirfrags()
+{
+  dout(10) << "open_undef_inodes_dirfrags "
+	   << rejoin_undef_inodes.size() << " inodes "
+	   << rejoin_undef_dirfrags.size() << " dirfrags" << dendl;
+
+  // undefined dirfrags, plus the parent dirfrag of each undefined inode
+  set<CDir*> to_fetch = rejoin_undef_dirfrags;
+  for (CInode *undef_in : rejoin_undef_inodes) {
+    ceph_assert(!undef_in->is_base());
+    ceph_assert(undef_in->get_parent_dir());
+    to_fetch.insert(undef_in->get_parent_dir());
+  }
+
+  if (to_fetch.empty())
+    return false;
+
+  auto on_all_fetched = new LambdaContext([this](int r) {
+      if (rejoin_gather.empty() && rejoin_ack_gather.count(mds->get_nodeid()))
+        rejoin_gather_finish();
+    });
+  MDSGatherBuilder gather(g_ceph_context,
+                          new MDSInternalContextWrapper(mds, on_all_fetched));
+
+  for (CDir *frag : to_fetch) {
+    CInode *owner = frag->get_inode();
+    // skip dirfrags whose own inode is still undefined
+    if (owner->state_test(CInode::STATE_REJOINUNDEF))
+      continue;
+    if (frag->state_test(CDir::STATE_REJOINUNDEF))
+      ceph_assert(owner->dirfragtree.is_leaf(frag->get_frag()));
+    frag->fetch(gather.new_sub());
+  }
+  ceph_assert(gather.has_subs());
+  gather.activate();
+  return true;
+}
+
+/*
+ * Remove 'in' from rejoin_undef_inodes.  For a directory that has dirfrags
+ * open while frag_t() is not a leaf of its dirfragtree, replace the frag_t()
+ * dirfrag in rejoin_undef_dirfrags with the dirfrags present after
+ * force_dirfrags().
+ */
+void MDCache::opened_undef_inode(CInode *in) {
+  dout(10) << "opened_undef_inode " << *in << dendl;
+  rejoin_undef_inodes.erase(in);
+  if (!in->is_dir())
+    return;
+
+  // FIXME: re-hash dentries if necessary
+  ceph_assert(in->get_inode()->dir_layout.dl_dir_hash == g_conf()->mds_default_dir_hash);
+  if (!in->get_num_dirfrags() || in->dirfragtree.is_leaf(frag_t()))
+    return;
+
+  CDir *whole = in->get_dirfrag(frag_t());
+  ceph_assert(whole);
+  rejoin_undef_dirfrags.erase(whole);
+  in->force_dirfrags();
+  auto&& frags = in->get_dirfrags();
+  for (const auto& frag : frags)
+    rejoin_undef_dirfrags.insert(frag);
+}
+
+/*
+ * Compare the snap seq a client reported on reconnect with the realm's
+ * newest seq.  If the client is behind, queue a CEPH_SNAP_OP_UPDATE message
+ * for it in 'updates' (carrying the realm's snap trace and the inos of the
+ * realm's open children); otherwise only log that it is up to date.
+ */
+void MDCache::finish_snaprealm_reconnect(client_t client, SnapRealm *realm, snapid_t seq,
+					 map<client_t,ref_t<MClientSnap>>& updates)
+{
+  if (!(seq < realm->get_newest_seq())) {
+    dout(10) << "finish_snaprealm_reconnect client." << client << " up to date"
+             << " on " << *realm << dendl;
+    return;
+  }
+
+  dout(10) << "finish_snaprealm_reconnect client." << client << " has old seq " << seq << " < "
+           << realm->get_newest_seq() << " on " << *realm << dendl;
+  auto update = make_message<MClientSnap>(CEPH_SNAP_OP_UPDATE);
+  update->bl = realm->get_snap_trace();
+  for (const auto& open_child : realm->open_children) {
+    update->split_realms.push_back(open_child->inode->ino());
+  }
+  updates.emplace(std::piecewise_construct,
+                  std::forward_as_tuple(client),
+                  std::forward_as_tuple(update));
+}
+
+
+
+/*
+ * Build and send an MMDSCacheRejoin OP_ACK to every rank in recovery_set
+ * that is not already listed in rejoin_ack_sent.
+ *
+ * Steps, in order:
+ *  1. For each inode in rejoin_unlinked_inodes that the peer still
+ *     replicates, add the peer as a replica of its parent dentry, dirfrag
+ *     and ancestor inodes, stopping at the first one already replicated.
+ *  2. Create one ack message per not-yet-acked rank.
+ *  3. Walk every auth subtree and add strong dirfrags, strong dentries,
+ *     inode bases and inode lock state for each replica that has an ack.
+ *  4. Add the base inodes (root if auth, and myin).
+ *  5. Add inode bases for rejoin_potential_updated_scatterlocks.
+ *  6. Encode rejoin_imported_caps for each rank and send the acks.
+ *
+ * NOTE(review): the value stored per replica (r.second) is pre-incremented
+ * each time it is added to an ack; presumably this is the replica nonce --
+ * confirm against get_replicas().
+ */
+void MDCache::rejoin_send_acks()
+{
+  dout(7) << "rejoin_send_acks" << dendl;
+
+  // replicate stray
+  for (map<mds_rank_t, set<CInode*> >::iterator p = rejoin_unlinked_inodes.begin();
+       p != rejoin_unlinked_inodes.end();
+       ++p) {
+    for (set<CInode*>::iterator q = p->second.begin();
+	 q != p->second.end();
+	 ++q) {
+      CInode *in = *q;
+      dout(7) << " unlinked inode " << *in << dendl;
+      // inode expired
+      if (!in->is_replica(p->first))
+	continue;
+      // climb dentry -> dirfrag -> inode, adding the peer as a replica,
+      // until something already replicated (or a base inode) is reached
+      while (1) {
+	CDentry *dn = in->get_parent_dn();
+	if (dn->is_replica(p->first))
+	  break;
+	dn->add_replica(p->first);
+	CDir *dir = dn->get_dir();
+	if (dir->is_replica(p->first))
+	  break;
+	dir->add_replica(p->first);
+	in = dir->get_inode();
+	if (in->is_replica(p->first))
+	  break;
+	in->add_replica(p->first);
+	if (in->is_base())
+	  break;
+      }
+    }
+  }
+  rejoin_unlinked_inodes.clear();
+  
+  // send acks to everyone in the recovery set
+  map<mds_rank_t,ref_t<MMDSCacheRejoin>> acks;
+  for (set<mds_rank_t>::iterator p = recovery_set.begin();
+       p != recovery_set.end();
+       ++p) {
+    // ranks acked by an earlier call get no second ack
+    if (rejoin_ack_sent.count(*p))
+      continue;
+    acks[*p] = make_message<MMDSCacheRejoin>(MMDSCacheRejoin::OP_ACK);
+  }
+
+  rejoin_ack_sent = recovery_set;
+  
+  // walk subtrees
+  for (map<CDir*,set<CDir*> >::iterator p = subtrees.begin(); 
+       p != subtrees.end();
+       ++p) {
+    CDir *dir = p->first;
+    if (!dir->is_auth())
+      continue;
+    dout(10) << "subtree " << *dir << dendl;
+    
+    // auth items in this subtree
+    std::queue<CDir*> dq;
+    dq.push(dir);
+
+    // breadth-first walk over the dirfrags queued in dq
+    while (!dq.empty()) {
+      CDir *dir = dq.front();
+      dq.pop();
+      
+      // dir
+      for (auto &r : dir->get_replicas()) {
+	auto it = acks.find(r.first);
+	if (it == acks.end())
+	  continue;
+	it->second->add_strong_dirfrag(dir->dirfrag(), ++r.second, dir->dir_rep);
+	it->second->add_dirfrag_base(dir);
+      }
+	   
+      for (auto &p : dir->items) {
+	CDentry *dn = p.second;
+	CDentry::linkage_t *dnl = dn->get_linkage();
+
+	// inode
+	CInode *in = NULL;
+	if (dnl->is_primary())
+	  in = dnl->get_inode();
+
+	// dentry
+	for (auto &r : dn->get_replicas()) {
+	  auto it = acks.find(r.first);
+	  if (it == acks.end())
+	    continue;
+	  it->second->add_strong_dentry(dir->dirfrag(), dn->get_name(), dn->get_alternate_name(),
+                                           dn->first, dn->last,
+					   dnl->is_primary() ? dnl->get_inode()->ino():inodeno_t(0),
+					   dnl->is_remote() ? dnl->get_remote_ino():inodeno_t(0),
+					   dnl->is_remote() ? dnl->get_remote_d_type():0,
+					   ++r.second,
+					   dn->lock.get_replica_state());
+	  // peer missed MDentrylink message ?
+	  if (in && !in->is_replica(r.first))
+	    in->add_replica(r.first);
+	}
+	
+	// only primary-linked dentries carry an inode to describe
+	if (!in)
+	  continue;
+
+	for (auto &r : in->get_replicas()) {
+	  auto it = acks.find(r.first);
+	  if (it == acks.end())
+	    continue;
+	  it->second->add_inode_base(in, mds->mdsmap->get_up_features());
+	  bufferlist bl;
+	  in->_encode_locks_state_for_rejoin(bl, r.first);
+	  it->second->add_inode_locks(in, ++r.second, bl);
+	}
+	
+	// subdirs in this subtree?
+	{
+          auto&& dirs = in->get_nested_dirfrags();
+          for (const auto& dir : dirs) {
+            dq.push(dir);
+          }
+        }
+      }
+    }
+  }
+
+  // base inodes too
+  if (root && root->is_auth()) 
+    for (auto &r : root->get_replicas()) {
+      auto it = acks.find(r.first);
+      if (it == acks.end())
+	continue;
+      it->second->add_inode_base(root, mds->mdsmap->get_up_features());
+      bufferlist bl;
+      root->_encode_locks_state_for_rejoin(bl, r.first);
+      it->second->add_inode_locks(root, ++r.second, bl);
+    }
+  if (myin)
+    for (auto &r : myin->get_replicas()) {
+      auto it = acks.find(r.first);
+      if (it == acks.end())
+	continue;
+      it->second->add_inode_base(myin, mds->mdsmap->get_up_features());
+      bufferlist bl;
+      myin->_encode_locks_state_for_rejoin(bl, r.first);
+      it->second->add_inode_locks(myin, ++r.second, bl);
+    }
+
+  // include inode base for any inodes whose scatterlocks may have updated
+  for (set<CInode*>::iterator p = rejoin_potential_updated_scatterlocks.begin();
+       p != rejoin_potential_updated_scatterlocks.end();
+       ++p) {
+    CInode *in = *p;
+    for (const auto &r : in->get_replicas()) {
+      auto it = acks.find(r.first);
+      if (it == acks.end())
+	continue;
+      it->second->add_inode_base(in, mds->mdsmap->get_up_features());
+    }
+  }
+
+  // send acks
+  for (auto p = acks.begin(); p != acks.end(); ++p) {
+    // operator[] yields an empty entry for ranks with no imported caps
+    encode(rejoin_imported_caps[p->first], p->second->imported_caps);
+    mds->send_message_mds(p->second, p->first);
+  }
+
+  rejoin_imported_caps.clear();
+}
+
+// Waiter context that, when completed, re-evaluates the inode's cap locks and
+// issues caps if eval() reports false.  The inode is pinned with
+// PIN_PTRWAITER from construction until finish() has run.
+class C_MDC_ReIssueCaps : public MDCacheContext {
+  CInode *in;
+public:
+  C_MDC_ReIssueCaps(MDCache *cache, CInode *inode)
+    : MDCacheContext(cache), in(inode)
+  {
+    in->get(CInode::PIN_PTRWAITER);
+  }
+
+  void finish(int /*r*/) override
+  {
+    auto& locker = mdcache->mds->locker;
+    const bool evaluated = locker->eval(in, CEPH_CAP_LOCKS);
+    if (!evaluated) {
+      locker->issue_caps(in);
+    }
+    in->put(CInode::PIN_PTRWAITER);
+  }
+};
+
+/*
+ * Re-evaluate cap locks and re-issue caps for every head inode in inode_map
+ * that has any caps.  Frozen inodes are deferred via a WAIT_UNFREEZE waiter.
+ * The heartbeat is reset each time the running work count crosses a multiple
+ * of mds->heartbeat_reset_grace().
+ */
+void MDCache::reissue_all_caps()
+{
+  dout(10) << "reissue_all_caps" << dendl;
+
+  int done = 0;
+  for (auto &entry : inode_map) {
+    CInode *in = entry.second;
+    int weight = 1;  // each inode counts once, plus whatever issue_caps() reports
+
+    if (in->is_head() && in->is_any_caps()) {
+      // called by MDSRank::active_start(). There shouldn't be any frozen subtree.
+      if (in->is_frozen_inode()) {
+        in->add_waiter(CInode::WAIT_UNFREEZE, new C_MDC_ReIssueCaps(this, in));
+        continue;
+      }
+      const bool evaluated = mds->locker->eval(in, CEPH_CAP_LOCKS);
+      if (!evaluated) {
+        weight += mds->locker->issue_caps(in);
+      }
+    }
+
+    if ((done % mds->heartbeat_reset_grace()) + weight >= mds->heartbeat_reset_grace()) {
+      mds->heartbeat_reset();
+    }
+    done += weight;
+  }
+}
+
+
+// ===============================================================================
+
+// Completion context that forwards to MDCache::_queued_file_recover_cow()
+// with the inode and mutation captured at construction.
+struct C_MDC_QueuedCow : public MDCacheContext {
+  CInode *in;
+  MutationRef mut;
+
+  C_MDC_QueuedCow(MDCache *cache, CInode *inode, MutationRef& mutation)
+    : MDCacheContext(cache), in(inode), mut(mutation)
+  {}
+
+  void finish(int /*r*/) override
+  {
+    mdcache->_queued_file_recover_cow(in, mut);
+  }
+};
+
+
+/*
+ * Queue an auth inode on recovery_queue.
+ *
+ * The large commented-out section below is disabled code that would have
+ * journaled copy-on-write inodes for the snaps covered by the inode before
+ * queueing; it is not compiled, and only the final enqueue is live.
+ */
+void MDCache::queue_file_recover(CInode *in)
+{
+  dout(10) << "queue_file_recover " << *in << dendl;
+  ceph_assert(in->is_auth());
+
+  // cow?
+  /*
+  SnapRealm *realm = in->find_snaprealm();
+  set<snapid_t> s = realm->get_snaps();
+  while (!s.empty() && *s.begin() < in->first)
+    s.erase(s.begin());
+  while (!s.empty() && *s.rbegin() > in->last)
+    s.erase(*s.rbegin());
+  dout(10) << " snaps in [" << in->first << "," << in->last << "] are " << s << dendl;
+  if (s.size() > 1) {
+    auto pi = in->project_inode(mut);
+    pi.inode.version = in->pre_dirty();
+
+    auto mut(std::make_shared<MutationImpl>());
+    mut->ls = mds->mdlog->get_current_segment();
+    EUpdate *le = new EUpdate(mds->mdlog, "queue_file_recover cow");
+    mds->mdlog->start_entry(le);
+    predirty_journal_parents(mut, &le->metablob, in, 0, PREDIRTY_PRIMARY);
+
+    s.erase(*s.begin());
+    while (!s.empty()) {
+      snapid_t snapid = *s.begin();
+      CInode *cow_inode = 0;
+      journal_cow_inode(mut, &le->metablob, in, snapid-1, &cow_inode);
+      ceph_assert(cow_inode);
+      recovery_queue.enqueue(cow_inode);
+      s.erase(*s.begin());
+    }
+    
+    in->parent->first = in->first;
+    le->metablob.add_primary_dentry(in->parent, in, true);
+    mds->mdlog->submit_entry(le, new C_MDC_QueuedCow(this, in, mut));
+    mds->mdlog->flush();
+  }
+  */
+
+  // the only live action: queue the inode itself
+  recovery_queue.enqueue(in);
+}
+
+// Apply the mutation, drop the locks it holds and clean it up.
+// 'in' is not used by this function.
+void MDCache::_queued_file_recover_cow(CInode *in, MutationRef& mut)
+{
+  MutationImpl *m = mut.get();
+  mut->apply();
+  mds->locker->drop_locks(m);
+  mut->cleanup();
+}
+
+
+/*
+ * Called after recovery to sort auth head regular files into two queues:
+ *  - rejoin_recover_q: files where some client has a writeable range but no
+ *    cap on the inode; their filelock is moved to LOCK_PRE_SCAN.
+ *  - rejoin_check_q:   every other auth head regular file.
+ * That is, it finds files previously opened for write (max_size > size).
+ */
+void MDCache::identify_files_to_recover()
+{
+  dout(10) << "identify_files_to_recover" << dendl;
+  int count = 0;
+
+  // Clear the recover and check queues in case the monitor sends rejoin mdsmap twice.
+  rejoin_recover_q.clear();
+  rejoin_check_q.clear();
+
+  for (auto &entry : inode_map) {
+    CInode *in = entry.second;
+
+    // Only auth, head, regular files need file size recovery
+    if (!in->is_auth() || in->last != CEPH_NOSNAP || !in->is_file())
+      continue;
+
+    bool needs_recovery = false;
+    const auto& client_ranges = in->get_projected_inode()->client_ranges;
+    if (!client_ranges.empty()) {
+      in->mark_clientwriteable();
+      for (auto& cr : client_ranges) {
+        Capability *cap = in->get_client_cap(cr.first);
+        if (!cap) {
+          dout(10) << " client." << cr.first << " has range " << cr.second << " but no cap on " << *in << dendl;
+          needs_recovery = true;
+          break;
+        }
+        cap->mark_clientwriteable();
+      }
+    }
+
+    if (!needs_recovery) {
+      rejoin_check_q.push_back(in);
+    } else {
+      if (in->filelock.is_stable())
+        in->auth_pin(&in->filelock);
+      else
+        ceph_assert(in->filelock.get_state() == LOCK_XLOCKSNAP);
+      in->filelock.set_state(LOCK_PRE_SCAN);
+      rejoin_recover_q.push_back(in);
+    }
+
+    ++count;
+    if (!(count % mds->heartbeat_reset_grace()))
+      mds->heartbeat_reset();
+  }
+}
+
+/*
+ * Process the two queues filled by identify_files_to_recover(): check max
+ * size on every inode in rejoin_check_q (issuing caps first when its
+ * filelock is in LOCK_XLOCKSNAP), then start file recovery for every inode
+ * in rejoin_recover_q.  The heartbeat is reset periodically.
+ */
+void MDCache::start_files_to_recover()
+{
+  int handled = 0;
+
+  for (CInode *checked : rejoin_check_q) {
+    if (checked->filelock.get_state() == LOCK_XLOCKSNAP) {
+      mds->locker->issue_caps(checked);
+    }
+    mds->locker->check_inode_max_size(checked);
+    ++handled;
+    if (!(handled % mds->heartbeat_reset_grace()))
+      mds->heartbeat_reset();
+  }
+  rejoin_check_q.clear();
+
+  for (CInode *recovering : rejoin_recover_q) {
+    mds->locker->file_recover(&recovering->filelock);
+    ++handled;
+    if (!(handled % mds->heartbeat_reset_grace()))
+      mds->heartbeat_reset();
+  }
+
+  if (rejoin_recover_q.empty())
+    return;
+  rejoin_recover_q.clear();
+  do_file_recover();
+}
+
+// Advance the recovery queue.
+void MDCache::do_file_recover()
+{
+  recovery_queue.advance();
+}
+
+// ===============================================================================
+
+
+// ----------------------------
+// truncate
+
+// Context that calls MDCache::_truncate_inode() for the stored inode and
+// log segment once completed.
+class C_MDC_RetryTruncate : public MDCacheContext {
+  CInode *in;
+  LogSegment *ls;
+public:
+  C_MDC_RetryTruncate(MDCache *cache, CInode *inode, LogSegment *segment)
+    : MDCacheContext(cache), in(inode), ls(segment)
+  {}
+
+  void finish(int /*r*/) override
+  {
+    mdcache->_truncate_inode(in, ls);
+  }
+};
+
+/*
+ * Begin truncating 'in' on behalf of log segment 'ls': record the inode in
+ * ls->truncating_inodes, take PIN_TRUNCATING and an auth pin, then either
+ * truncate immediately or, if clients still need to snapflush and
+ * FILE_BUFFER caps are issued, defer via the filelock's xlock snap sync.
+ */
+void MDCache::truncate_inode(CInode *in, LogSegment *ls)
+{
+  const auto& projected = in->get_projected_inode();
+  dout(10) << "truncate_inode "
+           << projected->truncate_from << " -> " << projected->truncate_size
+           << " on " << *in
+           << dendl;
+
+  ls->truncating_inodes.insert(in);
+  in->get(CInode::PIN_TRUNCATING);
+  in->auth_pin(this);
+
+  const bool wait_for_snapflush =
+    !in->client_need_snapflush.empty() &&
+    (in->get_caps_issued() & CEPH_CAP_FILE_BUFFER);
+  if (!wait_for_snapflush) {
+    _truncate_inode(in, ls);
+    return;
+  }
+
+  // retry the truncate once the snap sync completes
+  ceph_assert(in->filelock.is_xlocked());
+  in->filelock.set_xlock_snap_sync(new C_MDC_RetryTruncate(this, in, ls));
+  mds->locker->issue_caps(in);
+}
+
+// I/O completion for a file truncate: accepts success or ENOENT, then hands
+// the inode and log segment to MDCache::truncate_inode_finish().
+struct C_IO_MDC_TruncateFinish : public MDCacheIOContext {
+  CInode *in;
+  LogSegment *ls;
+
+  C_IO_MDC_TruncateFinish(MDCache *cache, CInode *inode, LogSegment *segment)
+    : MDCacheIOContext(cache, false), in(inode), ls(segment)
+  {}
+
+  void finish(int r) override
+  {
+    ceph_assert(r == 0 || r == -CEPHFS_ENOENT);
+    mdcache->truncate_inode_finish(in, ls);
+  }
+
+  void print(ostream& out) const override
+  {
+    out << "file_truncate(" << in->ino() << ")";
+  }
+};
+
+/*
+ * Issue the actual object truncate for 'in' through the filer.
+ *
+ * The inode must be mid-truncate (is_truncating()), with truncate_size
+ * strictly below truncate_from and both below 2^63.  The snap context is
+ * taken from the inode's snaprealm when one is found; otherwise an empty
+ * context is used, which is only accepted for head inodes.  On completion
+ * C_IO_MDC_TruncateFinish is run via mds->finisher.
+ */
+void MDCache::_truncate_inode(CInode *in, LogSegment *ls)
+{
+  const auto& pi = in->get_inode();
+  dout(10) << "_truncate_inode "
+	   << pi->truncate_from << " -> " << pi->truncate_size
+	   << " on " << *in << dendl;
+
+  ceph_assert(pi->is_truncating());
+  ceph_assert(pi->truncate_size < (1ULL << 63));
+  ceph_assert(pi->truncate_from < (1ULL << 63));
+  ceph_assert(pi->truncate_size < pi->truncate_from);
+
+
+  SnapRealm *realm = in->find_snaprealm();
+  SnapContext nullsnap;
+  const SnapContext *snapc;
+  if (realm) {
+    dout(10) << " realm " << *realm << dendl;
+    snapc = &realm->get_snap_context();
+  } else {
+    dout(10) << " NO realm, using null context" << dendl;
+    snapc = &nullsnap;
+    ceph_assert(in->last == CEPH_NOSNAP);
+  }
+  // log the snap context itself, not the address of the pointer's target
+  dout(10) << "_truncate_inode  snapc " << *snapc << " on " << *in << dendl;
+  // the filer takes a non-const layout pointer, so work on a copy
+  auto layout = pi->layout;
+  filer.truncate(in->ino(), &layout, *snapc,
+		 pi->truncate_size, pi->truncate_from-pi->truncate_size,
+		 pi->truncate_seq, ceph::real_time::min(), 0,
+		 new C_OnFinisher(new C_IO_MDC_TruncateFinish(this, in, ls),
+				  mds->finisher));
+}
+
+// Log completion context that forwards to MDCache::truncate_inode_logged()
+// with the inode and mutation captured at construction.
+struct C_MDC_TruncateLogged : public MDCacheLogContext {
+  CInode *in;
+  MutationRef mut;
+
+  C_MDC_TruncateLogged(MDCache *cache, CInode *inode, MutationRef& mutation)
+    : MDCacheLogContext(cache), in(inode), mut(mutation)
+  {}
+
+  void finish(int /*r*/) override
+  {
+    mdcache->truncate_inode_logged(in, mut);
+  }
+};
+
+/*
+ * Called once the object truncate has completed.  Removes the inode from the
+ * log segment's truncating set, projects an inode update that clears
+ * truncate_from and decrements truncate_pending, and journals it as a
+ * "truncate finish" EUpdate.  The log is flushed right away when someone is
+ * waiting on WAIT_TRUNC or FILE_RD/FILE_WR caps are wanted.
+ */
+void MDCache::truncate_inode_finish(CInode *in, LogSegment *ls)
+{
+  dout(10) << "truncate_inode_finish " << *in << dendl;
+
+  auto pos = ls->truncating_inodes.find(in);
+  ceph_assert(pos != ls->truncating_inodes.end());
+  ls->truncating_inodes.erase(pos);
+
+  MutationRef mut(new MutationImpl());
+  mut->ls = mds->mdlog->get_current_segment();
+
+  // update
+  auto projected = in->project_inode(mut);
+  projected.inode->version = in->pre_dirty();
+  projected.inode->truncate_from = 0;
+  projected.inode->truncate_pending--;
+
+  EUpdate *update = new EUpdate(mds->mdlog, "truncate finish");
+  mds->mdlog->start_entry(update);
+
+  predirty_journal_parents(mut, &update->metablob, in, 0, PREDIRTY_PRIMARY);
+  journal_dirty_inode(mut.get(), &update->metablob, in);
+  update->metablob.add_truncate_finish(in->ino(), ls->seq);
+  mds->mdlog->submit_entry(update, new C_MDC_TruncateLogged(this, in, mut));
+
+  // flush immediately if there are readers/writers waiting
+  const bool someone_waiting =
+    in->is_waiter_for(CInode::WAIT_TRUNC) ||
+    (in->get_caps_wanted() & (CEPH_CAP_FILE_RD|CEPH_CAP_FILE_WR));
+  if (someone_waiting) {
+    mds->mdlog->flush();
+  }
+}
+
+/*
+ * Runs after the truncate-finish journal entry is logged: applies and cleans
+ * up the mutation, releases the PIN_TRUNCATING pin and the auth pin, and
+ * wakes everything waiting on WAIT_TRUNC.
+ */
+void MDCache::truncate_inode_logged(CInode *in, MutationRef& mut)
+{
+  dout(10) << "truncate_inode_logged " << *in << dendl;
+
+  mut->apply();
+  mds->locker->drop_locks(mut.get());
+  mut->cleanup();
+
+  in->put(CInode::PIN_TRUNCATING);
+  in->auth_unpin(this);
+
+  MDSContext::vec trunc_waiters;
+  in->take_waiting(CInode::WAIT_TRUNC, trunc_waiters);
+  mds->queue_waiters(trunc_waiters);
+}
+
+
+// Record 'in' as truncating in log segment 'ls' and take PIN_TRUNCATING on it.
+void MDCache::add_recovered_truncate(CInode *in, LogSegment *ls)
+{
+  dout(20) << "add_recovered_truncate " << *in << " in log segment "
+           << ls->seq << "/" << ls->offset << dendl;
+
+  ls->truncating_inodes.insert(in);
+  in->get(CInode::PIN_TRUNCATING);
+}
+
+// Undo add_recovered_truncate(): the inode must be present in the segment's
+// truncating set; it is removed and its PIN_TRUNCATING pin released.
+void MDCache::remove_recovered_truncate(CInode *in, LogSegment *ls)
+{
+  dout(20) << "remove_recovered_truncate " << *in << " in log segment "
+           << ls->seq << "/" << ls->offset << dendl;
+
+  // if we have the logseg the truncate started in, it must be in our list.
+  auto pos = ls->truncating_inodes.find(in);
+  ceph_assert(pos != ls->truncating_inodes.end());
+  ls->truncating_inodes.erase(pos);
+
+  in->put(CInode::PIN_TRUNCATING);
+}
+
+/*
+ * Restart every truncate recorded in the log segments' truncating_inodes.
+ * Each inode is auth-pinned; when clients still need to snapflush and
+ * FILE_BUFFER caps are issued, the filelock is put into LOCK_XLOCKDONE and
+ * the truncate is retried after the snap sync, otherwise it starts at once.
+ */
+void MDCache::start_recovered_truncates()
+{
+  dout(10) << "start_recovered_truncates" << dendl;
+
+  for (auto& seg : mds->mdlog->segments) {
+    LogSegment *ls = seg.second;
+    for (CInode *in : ls->truncating_inodes) {
+      in->auth_pin(this);
+
+      const bool wait_for_snapflush =
+        !in->client_need_snapflush.empty() &&
+        (in->get_caps_issued() & CEPH_CAP_FILE_BUFFER);
+      if (!wait_for_snapflush) {
+        _truncate_inode(in, ls);
+        continue;
+      }
+
+      ceph_assert(in->filelock.is_stable());
+      in->filelock.set_state(LOCK_XLOCKDONE);
+      in->auth_pin(&in->filelock);
+      in->filelock.set_xlock_snap_sync(new C_MDC_RetryTruncate(this, in, ls));
+      // start_files_to_recover will revoke caps
+    }
+  }
+}
+
+
+// Log completion for an inode purge: when an inotable version was recorded,
+// applies the release of the purged ids to the inotable and checks the
+// resulting version, then tells the log segment the purge has finished.
+class C_MDS_purge_completed_finish : public MDCacheLogContext {
+  interval_set<inodeno_t> inos;
+  LogSegment *ls; 
+  version_t inotablev;
+public:
+  C_MDS_purge_completed_finish(MDCache *cache,
+                               const interval_set<inodeno_t>& purged,
+                               LogSegment *segment,
+                               version_t table_version)
+    : MDCacheLogContext(cache),
+      inos(purged),
+      ls(segment),
+      inotablev(table_version)
+  {}
+
+  void finish(int r) override
+  {
+    assert(r == 0);
+    if (inotablev) {
+      get_mds()->inotable->apply_release_ids(inos);
+      assert(get_mds()->inotable->get_version() == inotablev);
+    }
+    ls->purge_inodes_finish(inos);
+  }
+};
+
+// Kick off purge_inodes() for every log segment whose purging_inodes set is
+// non-empty.
+void MDCache::start_purge_inodes(){
+  dout(10) << "start_purge_inodes" << dendl;
+  for (auto& seg : mds->mdlog->segments) {
+    LogSegment *ls = seg.second;
+    if (!ls->purging_inodes.size())
+      continue;
+    purge_inodes(ls->purging_inodes, ls);
+  }
+}
+
+/*
+ * Purge the data objects of every inode number in 'inos' and, once all
+ * purges complete, journal an EPurged event that releases those ids in the
+ * inotable.
+ *
+ * One filer.purge_range() call is issued per inode number using the default
+ * file layout and an empty snap context.  The gather's finisher callback
+ * runs via mds->finisher and requires that the purge either succeeded or
+ * found nothing to remove (ENOENT).
+ *
+ * The checks use ceph_assert, like the truncate path in this file, so they
+ * stay active in NDEBUG builds: releasing inode numbers after a failed purge
+ * must not go unnoticed.
+ */
+void MDCache::purge_inodes(const interval_set<inodeno_t>& inos, LogSegment *ls)
+{
+  dout(10) << __func__ << " purging inos " << inos << " logseg " << ls->seq << dendl;
+  // FIXME: handle non-default data pool and namespace
+
+  // inos is captured by value: the callback outlives this call
+  auto cb = new LambdaContext([this, inos, ls](int r){
+      ceph_assert(r == 0 || r == -CEPHFS_ENOENT);
+      mds->inotable->project_release_ids(inos);
+      version_t piv = mds->inotable->get_projected_version();
+      ceph_assert(piv != 0);
+      mds->mdlog->start_submit_entry(new EPurged(inos, ls->seq, piv),
+				     new C_MDS_purge_completed_finish(this, inos, ls, piv));
+      mds->mdlog->flush();
+    });
+  
+  C_GatherBuilder gather(g_ceph_context,
+			  new C_OnFinisher(new MDSIOContextWrapper(mds, cb), mds->finisher));
+  SnapContext nullsnapc;
+  for (const auto& [start, len] : inos) {
+    for (auto i = start; i < start + len ; i += 1) {
+      filer.purge_range(i, &default_file_layout, nullsnapc, 0, 1,
+			ceph::real_clock::now(), 0, gather.new_sub());
+    }
+  }
+  gather.activate();
+}
+
+// ================================================================================
+// cache trimming
+
+/*
+ * Expire dentries from bottom_lru and then from lru.
+ *
+ * bottom_lru is drained first.  The main lru is then trimmed while the cache
+ * is too full, 'count' has not been used up, or the MDS is in
+ * standby-replay.  Both phases stop as soon as the trim counter plus the
+ * number trimmed here reaches mds_cache_trim_threshold.  Dentries that
+ * trim_dentry() refuses to expire are put back in the middle of their LRU.
+ *
+ * @param count     number of dentries to try to expire from the main lru
+ * @param expiremap per-rank expire messages, filled in by trim_dentry()
+ * @return (throttled, number of dentries trimmed)
+ */
+std::pair<bool, uint64_t> MDCache::trim_lru(uint64_t count, expiremap& expiremap)
+{
+  bool is_standby_replay = mds->is_standby_replay();
+  std::vector<CDentry *> unexpirables;
+  uint64_t trimmed = 0;
+
+  auto trim_threshold = g_conf().get_val<Option::size_t>("mds_cache_trim_threshold");
+
+  dout(7) << "trim_lru trimming " << count
+          << " items from LRU"
+          << " size=" << lru.lru_get_size()
+          << " mid=" << lru.lru_get_top()
+          << " pintail=" << lru.lru_get_pintail()
+          << " pinned=" << lru.lru_get_num_pinned()
+          << dendl;
+
+  const uint64_t trim_counter_start = trim_counter.get();
+  bool throttled = false;
+
+  // phase 1: bottom_lru
+  for (;;) {
+    if (trim_counter_start+trimmed >= trim_threshold) {
+      throttled = true;
+      break;
+    }
+    CDentry *victim = static_cast<CDentry*>(bottom_lru.lru_expire());
+    if (!victim)
+      break;
+    if (trim_dentry(victim, expiremap))
+      unexpirables.push_back(victim);
+    else
+      ++trimmed;
+  }
+
+  for (CDentry *kept : unexpirables)
+    bottom_lru.lru_insert_mid(kept);
+  unexpirables.clear();
+
+  // phase 2: trim dentries from the LRU until count is reached
+  // if mds is in standby_replay and skip trimming the inodes
+  while (!throttled && (cache_toofull() || count > 0 || is_standby_replay)) {
+    if (trim_counter_start+trimmed >= trim_threshold) {
+      throttled = true;
+      break;
+    }
+    CDentry *victim = static_cast<CDentry*>(lru.lru_expire());
+    if (!victim)
+      break;
+
+    if (is_standby_replay && victim->get_linkage()->inode) {
+      // we move the inodes that need to be trimmed to the end of the lru queue.
+      // refer to MDCache::standby_trim_segment
+      lru.lru_insert_bot(victim);
+      break;
+    }
+
+    if (trim_dentry(victim, expiremap)) {
+      unexpirables.push_back(victim);
+      continue;
+    }
+
+    ++trimmed;
+    if (count > 0)
+      --count;
+  }
+  trim_counter.hit(trimmed);
+
+  for (CDentry *kept : unexpirables)
+    lru.lru_insert_mid(kept);
+  unexpirables.clear();
+
+  dout(7) << "trim_lru trimmed " << trimmed << " items" << dendl;
+  return std::pair<bool, uint64_t>(throttled, trimmed);
+}
+
+/*
+ * Trim the cache.
+ *
+ * note: only called while MDS is active or stopping... NOT during recovery.
+ * however, we may expire a replica whose authority is recovering.
+ *
+ * Steps: advance delayed stray evaluation, trim the LRUs via trim_lru(),
+ * drop trimmable subtrees, trim the root (when stopping), try to expire the
+ * mdsdirs of other stopping ranks, trim other ranks' base inodes (when
+ * stopping), and finally send the accumulated expire messages.
+ *
+ * @param count is number of dentries to try to expire
+ * @return (throttled, trimmed) as produced by trim_lru(); 'trimmed' is
+ *         additionally incremented for each dirfrag/inode trimmed here
+ */
+std::pair<bool, uint64_t> MDCache::trim(uint64_t count)
+{
+  uint64_t used = cache_size();
+  uint64_t limit = cache_memory_limit;
+  expiremap expiremap;
+
+  dout(7) << "trim bytes_used=" << bytes2str(used)
+          << " limit=" << bytes2str(limit)
+          << " reservation=" << cache_reservation
+          << "% count=" << count << dendl;
+
+  // process delayed eval_stray()
+  stray_manager.advance_delayed();
+
+  auto result = trim_lru(count, expiremap);
+  // alias into 'result' so the increments below are reflected in the return value
+  auto& trimmed = result.second;
+
+  // trim non-auth, non-bound subtrees
+  for (auto p = subtrees.begin(); p != subtrees.end();) {
+    CDir *dir = p->first;
+    // advance first: the current entry may be removed below
+    ++p;
+    CInode *diri = dir->get_inode();
+    if (dir->is_auth()) {
+      if (diri->is_auth() && !diri->is_base()) {
+        /* this situation should correspond to an export pin */
+        if (dir->get_num_head_items() == 0 && dir->get_num_ref() == 1) {
+          /* pinned empty subtree, try to drop */
+          if (dir->state_test(CDir::STATE_AUXSUBTREE)) {
+            dout(20) << "trimming empty pinned subtree " << *dir << dendl;
+            dir->state_clear(CDir::STATE_AUXSUBTREE);
+            remove_subtree(dir);
+            diri->close_dirfrag(dir->dirfrag().frag);
+          }
+        }
+      } else if (!diri->is_auth() && !diri->is_base() && dir->get_num_head_items() == 0) {
+        if (dir->state_test(CDir::STATE_EXPORTING) ||
+           !(mds->is_active() || mds->is_stopping()) ||
+           dir->is_freezing() || dir->is_frozen())
+          continue;
+
+        migrator->export_empty_import(dir);
+        ++trimmed;
+      }
+    } else if (!diri->is_auth() && dir->get_num_ref() <= 1) {
+      // only subtree pin
+      if (diri->get_num_ref() > diri->get_num_subtree_roots()) {
+        continue;
+      }
+
+      // don't trim subtree root if its auth MDS is recovering.
+      // This simplifies the cache rejoin code.
+      if (dir->is_subtree_root() && rejoin_ack_gather.count(dir->get_dir_auth().first))
+        continue;
+      trim_dirfrag(dir, 0, expiremap);
+      ++trimmed;
+    }
+  }
+
+  // trim root?
+  if (mds->is_stopping() && root) {
+    auto&& ls = root->get_dirfrags();
+    for (const auto& dir : ls) {
+      if (dir->get_num_ref() == 1) { // subtree pin
+	trim_dirfrag(dir, 0, expiremap);
+        ++trimmed;
+      }
+    }
+    if (root->get_num_ref() == 0) {
+      trim_inode(0, root, 0, expiremap);
+      ++trimmed;
+    }
+  }
+
+  // try to expire the mdsdir of every other rank that is stopping
+  std::set<mds_rank_t> stopping;
+  mds->mdsmap->get_mds_set(stopping, MDSMap::STATE_STOPPING);
+  stopping.erase(mds->get_nodeid());
+  for (auto rank : stopping) {
+    CInode* mdsdir_in = get_inode(MDS_INO_MDSDIR(rank));
+    if (!mdsdir_in)
+      continue;
+
+    // make sure an expire message exists for this rank
+    auto em = expiremap.emplace(std::piecewise_construct, std::forward_as_tuple(rank), std::forward_as_tuple());
+    if (em.second) {
+      em.first->second = make_message<MCacheExpire>(mds->get_nodeid());
+    }
+
+    // log the stopping rank, not the address of our own MDSRank object
+    dout(20) << __func__ << ": try expiring " << *mdsdir_in << " for stopping mds." << rank <<  dendl;
+
+    const bool aborted = expire_recursive(mdsdir_in, expiremap);
+    if (!aborted) {
+      dout(20) << __func__ << ": successfully expired mdsdir" << dendl;
+      auto&& ls = mdsdir_in->get_dirfrags();
+      for (auto dir : ls) {
+	if (dir->get_num_ref() == 1) {  // subtree pin
+	  trim_dirfrag(dir, dir, expiremap);
+          ++trimmed;
+        }
+      }
+      if (mdsdir_in->get_num_ref() == 0) {
+	trim_inode(NULL, mdsdir_in, NULL, expiremap);
+        ++trimmed;
+      }
+    } else {
+      dout(20) << __func__ << ": some unexpirable contents in mdsdir" << dendl;
+    }
+  }
+
+  // Other rank's base inodes (when I'm stopping)
+  if (mds->is_stopping()) {
+    for (set<CInode*>::iterator p = base_inodes.begin();
+         p != base_inodes.end();) {
+      CInode *base_in = *p;
+      // advance first: trim_inode() may remove the current entry
+      ++p;
+      if (MDS_INO_IS_MDSDIR(base_in->ino()) &&
+	  MDS_INO_MDSDIR_OWNER(base_in->ino()) != mds->get_nodeid()) {
+        dout(20) << __func__ << ": maybe trimming base: " << *base_in << dendl;
+        if (base_in->get_num_ref() == 0) {
+          trim_inode(NULL, base_in, NULL, expiremap);
+          ++trimmed;
+        }
+      }
+    }
+  }
+
+  // send any expire messages
+  send_expire_messages(expiremap);
+
+  return result;
+}
+
+/*
+ * Send every queued cache-expire message to its target rank, then clear the
+ * map.  While the cluster is degraded, messages are dropped for ranks that
+ * have not reached rejoin yet, and for ranks in rejoin to which we have not
+ * sent a rejoin (per rejoin_sent).
+ */
+void MDCache::send_expire_messages(expiremap& expiremap)
+{
+  for (const auto &entry : expiremap) {
+    if (mds->is_cluster_degraded()) {
+      if (mds->mdsmap->get_state(entry.first) < MDSMap::STATE_REJOIN)
+        continue;
+      if (mds->mdsmap->get_state(entry.first) == MDSMap::STATE_REJOIN &&
+          rejoin_sent.count(entry.first) == 0)
+        continue;
+    }
+    dout(7) << "sending cache_expire to " << entry.first << dendl;
+    mds->send_message_mds(entry.second, entry.first);
+  }
+  expiremap.clear();
+}
+
+
+/*
+ * Try to expire a single dentry (and its primary inode, if any) from cache.
+ *
+ * For a non-auth dentry, an expire record is added to the MCacheExpire
+ * message of each distinct authority rank other than ourselves, creating
+ * the message in 'expiremap' when needed.
+ *
+ * @param dn        dentry to trim
+ * @param expiremap per-rank expire messages being accumulated by the caller
+ * @return true if the dentry was NOT removed (unreadable replica dentry, or
+ *         trim_inode() returned true); false once it has been removed
+ */
+bool MDCache::trim_dentry(CDentry *dn, expiremap& expiremap)
+{
+  dout(12) << "trim_dentry " << *dn << dendl;
+  
+  CDentry::linkage_t *dnl = dn->get_linkage();
+
+  CDir *dir = dn->get_dir();
+  ceph_assert(dir);
+  
+  CDir *con = get_subtree_root(dir);
+  if (con)
+    dout(12) << " in container " << *con << dendl;
+  else {
+    dout(12) << " no container; under a not-yet-linked dir" << dendl;
+    // con is only dereferenced below for non-auth dentries
+    ceph_assert(dn->is_auth());
+  }
+
+  // If replica dentry is not readable, it's likely we will receive
+  // MDentryLink/MDentryUnlink message soon (It's possible we first
+  // receive a MDentryUnlink message, then MDentryLink message)
+  // MDentryLink message only replicates an inode, so we should
+  // avoid trimming the inode's parent dentry. This is because that
+  // unconnected replicas are problematic for subtree migration.
+  if (!dn->is_auth() && !dn->lock.can_read(-1) &&
+      !dn->get_dir()->get_inode()->is_stray())
+    return true;
+
+  // adjust the dir state
+  // NOTE: we can safely remove a clean, null dentry without affecting
+  //       directory completeness.
+  // (check this _before_ we unlink the inode, below!)
+  bool clear_complete = false;
+  if (!(dnl->is_null() && dn->is_clean()))
+    clear_complete = true;
+
+  // unlink the dentry
+  if (dnl->is_remote()) {
+    // just unlink.
+    dir->unlink_inode(dn, false);
+  } else if (dnl->is_primary()) {
+    // expire the inode, too.
+    CInode *in = dnl->get_inode();
+    ceph_assert(in);
+    if (trim_inode(dn, in, con, expiremap))
+      return true; // purging stray instead of trimming
+  } else {
+    ceph_assert(dnl->is_null());
+  }
+
+  if (!dn->is_auth()) {
+    // notify dentry authority.
+    mds_authority_t auth = dn->authority();
+    
+    // p == 0 handles auth.first, p == 1 handles auth.second
+    for (int p=0; p<2; p++) {
+      mds_rank_t a = auth.first;
+      if (p) a = auth.second;
+      // stop on an unset rank, or when the second rank repeats the first
+      if (a < 0 || (p == 1 && auth.second == auth.first)) break;
+      if (mds->get_nodeid() == auth.second &&
+	  con->is_importing()) break;                // don't send any expire while importing.
+      if (a == mds->get_nodeid()) continue;          // on export, ignore myself.
+      
+      dout(12) << "  sending expire to mds." << a << " on " << *dn << dendl;
+      ceph_assert(a != mds->get_nodeid());
+      auto em = expiremap.emplace(std::piecewise_construct, std::forward_as_tuple(a), std::forward_as_tuple());
+      if (em.second)
+	em.first->second = make_message<MCacheExpire>(mds->get_nodeid());
+      em.first->second->add_dentry(con->dirfrag(), dir->dirfrag(), dn->get_name(), dn->last, dn->get_replica_nonce());
+    }
+  }
+
+  // remove dentry
+  if (dn->last == CEPH_NOSNAP && dir->is_auth())
+    dir->add_to_bloom(dn);
+  dir->remove_dentry(dn);
+
+  if (clear_complete)
+    dir->state_clear(CDir::STATE_COMPLETE);
+  
+  if (mds->logger) mds->logger->inc(l_mds_inodes_expired);
+  return false;
+}
+
+
+/**
+ * Close one unreferenced dirfrag and, if we only hold a replica of it,
+ * queue a cache-expire notice for each distinct authority rank.
+ *
+ * @param dir        dirfrag to close; asserted to hold no references
+ * @param con        container dirfrag the expire is filed under; replaced by
+ *                   dir itself when dir is a (non-auth) subtree root
+ * @param expiremap  per-rank MCacheExpire messages being accumulated
+ */
+void MDCache::trim_dirfrag(CDir *dir, CDir *con, expiremap& expiremap)
+{
+  dout(15) << "trim_dirfrag " << *dir << dendl;
+
+  if (dir->is_subtree_root()) {
+    // only a non-auth root, or an unreplicated base dirfrag, may be dropped here
+    ceph_assert(!dir->is_auth() ||
+                (!dir->is_replicated() && dir->inode->is_base()));
+    remove_subtree(dir);  // remove from subtree map
+  }
+  ceph_assert(dir->get_num_ref() == 0);
+
+  CInode *diri = dir->get_inode();
+
+  if (!dir->is_auth()) {
+    mds_authority_t auth = dir->authority();
+
+    // was this an auth delegation?  (if so, the dirfrag is its own container)
+    if (dir->is_subtree_root()) {
+      dout(12) << " subtree root, container is " << *dir << dendl;
+      con = dir;
+    }
+    dirfrag_t condf = con->dirfrag();
+
+    // pass 0 addresses auth.first, pass 1 addresses auth.second
+    for (int pass = 0; pass < 2; ++pass) {
+      mds_rank_t target = (pass == 0) ? auth.first : auth.second;
+      if (target < 0) {
+        break;
+      }
+      if (pass == 1 && auth.second == auth.first) {
+        break;
+      }
+      if (mds->get_nodeid() == auth.second && con->is_importing()) {
+        break;     // don't send any expire while importing.
+      }
+      if (target == mds->get_nodeid()) {
+        continue;  // on export, ignore myself.
+      }
+
+      dout(12) << "  sending expire to mds." << target << " on   " << *dir << dendl;
+      ceph_assert(target != mds->get_nodeid());
+      auto [slot, fresh] = expiremap.emplace(std::piecewise_construct,
+                                             std::forward_as_tuple(target),
+                                             std::forward_as_tuple());
+      if (fresh) {
+        // first expire for this rank: create its message
+        slot->second = make_message<MCacheExpire>(mds->get_nodeid());
+      }
+      slot->second->add_dir(condf, dir->dirfrag(), dir->replica_nonce);
+    }
+  }
+
+  diri->close_dirfrag(dir->dirfrag().frag);
+}
+
+/**
+ * Try trimming an inode from the cache
+ *
+ * Closes every dirfrag of a directory inode first, then either evaluates the
+ * inode as a stray (when we are auth) or queues a cache-expire notice to its
+ * authority rank(s) (when we are a replica), and finally unlinks and removes
+ * the inode.
+ *
+ * @param dn         dentry linking the inode; may be null, in which case no
+ *                   stray evaluation or unlink is performed
+ * @param in         inode to trim; asserted to hold no references
+ * @param con        container dirfrag used to file expires; may be null
+ * @param expiremap  per-rank MCacheExpire messages being accumulated
+ * @return true if the inode is still in cache, else false if it was trimmed
+ */
+bool MDCache::trim_inode(CDentry *dn, CInode *in, CDir *con, expiremap& expiremap)
+{
+  dout(15) << "trim_inode " << *in << dendl;
+  ceph_assert(in->get_num_ref() == 0);
+
+  if (in->is_dir()) {
+    // If replica inode's dirfragtreelock is not readable, it's likely
+    // some dirfrags of the inode are being fragmented and we will receive
+    // MMDSFragmentNotify soon. MMDSFragmentNotify only replicates the new
+    // dirfrags, so we should avoid trimming these dirfrags' parent inode.
+    // This is because unconnected replicas are problematic for
+    // subtree migration.
+    //
+    if (!in->is_auth() && !mds->locker->rdlock_try(&in->dirfragtreelock, -1)) {
+      return true;
+    }
+
+    // DIR
+    auto&& dfls = in->get_dirfrags();
+    for (const auto& dir : dfls) {
+      ceph_assert(!dir->is_subtree_root());
+      trim_dirfrag(dir, con ? con:dir, expiremap);  // if no container (e.g. root dirfrag), use the dirfrag itself
+    }
+  }
+  
+  // INODE
+  if (in->is_auth()) {
+    // eval stray after closing dirfrags
+    if (dn && !dn->state_test(CDentry::STATE_PURGING)) {
+      maybe_eval_stray(in);
+      // stray evaluation may have started a purge or pinned the dentry;
+      // in either case the inode must stay in cache
+      if (dn->state_test(CDentry::STATE_PURGING) || dn->get_num_ref() > 0)
+	return true;
+    }
+  } else {
+    mds_authority_t auth = in->authority();
+    
+    dirfrag_t df;
+    if (con)
+      df = con->dirfrag();
+    else
+      df = dirfrag_t(0,frag_t());   // must be a root or stray inode.
+
+    // p == 0 addresses auth.first, p == 1 addresses auth.second
+    for (int p=0; p<2; p++) {
+      mds_rank_t a = auth.first;
+      if (p) a = auth.second;
+      if (a < 0 || (p == 1 && auth.second == auth.first)) break;
+      if (con && mds->get_nodeid() == auth.second &&
+	  con->is_importing()) break;                // don't send any expire while importing.
+      if (a == mds->get_nodeid()) continue;          // on export, ignore myself.
+
+      dout(12) << "  sending expire to mds." << a << " on " << *in << dendl;
+      ceph_assert(a != mds->get_nodeid());
+      // find or create the expire message destined for rank a
+      auto em = expiremap.emplace(std::piecewise_construct, std::forward_as_tuple(a), std::forward_as_tuple());
+      if (em.second)
+	em.first->second = make_message<MCacheExpire>(mds->get_nodeid()); /* new */
+      em.first->second->add_inode(df, in->vino(), in->get_replica_nonce());
+    }
+  }
+
+  /*
+  if (in->is_auth()) {
+    if (in->hack_accessed)
+      mds->logger->inc("outt");
+    else {
+      mds->logger->inc("outut");
+      mds->logger->fset("oututl", ceph_clock_now() - in->hack_load_stamp);
+    }
+  }
+  */
+    
+  // unlink
+  if (dn)
+    dn->get_dir()->unlink_inode(dn, false);
+  remove_inode(in);
+  return false;
+}
+
+
+/**
+ * trim_non_auth - remove any non-auth items from our cache
+ *
+ * this reduces the amount of non-auth metadata in our cache, reducing the 
+ * load incurred by the rejoin phase.
+ *
+ * the only non-auth items that remain are those that are needed to 
+ * attach our own subtrees to the root.  
+ *
+ * when we are done, all dentries will be in the top bit of the lru.
+ *
+ * why we have to do this:
+ *  we may not have accurate linkage for non-auth items.  which means we will 
+ *  not know which subtree it falls into, and can not be sure to declare it to
+ *  the correct authority.  
+ */
+void MDCache::trim_non_auth()
+{
+  dout(7) << "trim_non_auth" << dendl;
+  
+  // temporarily pin all subtree roots
+  for (map<CDir*, set<CDir*> >::iterator p = subtrees.begin();
+       p != subtrees.end();
+       ++p) 
+    p->first->get(CDir::PIN_SUBTREETEMP);
+
+  // auth dentries expired from the lrus below; re-inserted once the scan is done
+  list<CDentry*> auth_list;
+  
+  // trim non-auth items from the lru
+  for (;;) {
+    // drain bottom_lru first, then the main lru; stop when neither yields a dentry
+    CDentry *dn = NULL;
+    if (bottom_lru.lru_get_size() > 0)
+      dn = static_cast<CDentry*>(bottom_lru.lru_expire());
+    if (!dn && lru.lru_get_size() > 0)
+      dn = static_cast<CDentry*>(lru.lru_expire());
+    if (!dn)
+	break;
+
+    CDentry::linkage_t *dnl = dn->get_linkage();
+
+    if (dn->is_auth()) {
+      // add back into lru (at the top)
+      auth_list.push_back(dn);
+
+      // drop the remote link to an inode we are not auth for
+      if (dnl->is_remote() && dnl->get_inode() && !dnl->get_inode()->is_auth())
+	dn->unlink_remote(dnl);
+    } else {
+      // non-auth.  expire.
+      CDir *dir = dn->get_dir();
+      ceph_assert(dir);
+
+      // unlink the dentry
+      dout(10) << " removing " << *dn << dendl;
+      if (dnl->is_remote()) {
+	dir->unlink_inode(dn, false);
+      } 
+      else if (dnl->is_primary()) {
+	CInode *in = dnl->get_inode();
+	dout(10) << " removing " << *in << dendl;
+	// close all of the inode's dirfrags before removing the inode itself
+	auto&& ls = in->get_dirfrags();
+	for (const auto& subdir : ls) {
+	  ceph_assert(!subdir->is_subtree_root());
+	  in->close_dirfrag(subdir->dirfrag().frag);
+	}
+	dir->unlink_inode(dn, false);
+	remove_inode(in);
+      } 
+      else {
+	ceph_assert(dnl->is_null());
+      }
+
+      ceph_assert(!dir->has_bloom());
+      dir->remove_dentry(dn);
+      // adjust the dir state
+      dir->state_clear(CDir::STATE_COMPLETE);  // dir incomplete!
+      // close empty non-auth dirfrag
+      if (!dir->is_subtree_root() && dir->get_num_any() == 0)
+	dir->inode->close_dirfrag(dir->get_frag());
+    }
+  }
+
+  // put the surviving auth dentries back on the lru they were flagged for
+  for (const auto& dn : auth_list) {
+      if (dn->state_test(CDentry::STATE_BOTTOMLRU))
+	bottom_lru.lru_insert_mid(dn);
+      else
+	lru.lru_insert_top(dn);
+  }
+
+  // move everything in the pintail to the top bit of the lru.
+  lru.lru_touch_entire_pintail();
+
+  // unpin all subtrees
+  for (map<CDir*, set<CDir*> >::iterator p = subtrees.begin();
+       p != subtrees.end();
+       ++p) 
+    p->first->put(CDir::PIN_SUBTREETEMP);
+
+  // with both lrus empty, also drop non-auth inodes that have no parent
+  // dentry, together with their dirfrags
+  if (lru.lru_get_size() == 0 &&
+      bottom_lru.lru_get_size() == 0) {
+    // root, stray, etc.?
+    auto p = inode_map.begin();
+    while (p != inode_map.end()) {
+      CInode *in = p->second;
+      ++p;  // advance before remove_inode() below can touch the map entry
+      if (!in->is_auth()) {
+	auto&& ls = in->get_dirfrags();
+	for (const auto& dir : ls) {
+	  dout(10) << " removing " << *dir << dendl;
+	  ceph_assert(dir->get_num_ref() == 1);  // SUBTREE
+	  remove_subtree(dir);
+	  in->close_dirfrag(dir->dirfrag().frag);
+	}
+	dout(10) << " removing " << *in << dendl;
+	ceph_assert(!in->get_parent_dn());
+	ceph_assert(in->get_num_ref() == 0);
+	remove_inode(in);
+      }
+    }
+  }
+
+  show_subtrees();
+}
+
+/**
+ * Recursively trim the subtree rooted at directory to remove all
+ * CInodes/CDentrys/CDirs that aren't links to remote MDSes, or ancestors
+ * of those links. This is used to clear invalid data out of the cache.
+ * Note that it doesn't clear the passed-in directory, since that's not
+ * always safe.
+ *
+ * @param dir  root dirfrag of the region to trim; its AUTH state bit is
+ *             cleared before returning
+ * @return true if dir must be kept (it cannot be trimmed, or it still
+ *         contains items), false if the caller may close it
+ */
+bool MDCache::trim_non_auth_subtree(CDir *dir)
+{
+  dout(10) << "trim_non_auth_subtree(" << dir << ") " << *dir << dendl;
+
+  bool keep_dir = !can_trim_non_auth_dirfrag(dir);
+
+  // j always runs one step ahead of i so that removing i's dentry is safe
+  auto j = dir->begin();
+  auto i = j;
+  while (j != dir->end()) {
+    i = j++;
+    CDentry *dn = i->second;
+    dout(10) << "trim_non_auth_subtree(" << dir << ") Checking dentry " << dn << dendl;
+    CDentry::linkage_t *dnl = dn->get_linkage();
+    if (dnl->is_primary()) { // check for subdirectories, etc
+      CInode *in = dnl->get_inode();
+      bool keep_inode = false;
+      if (in->is_dir()) {
+        auto&& subdirs = in->get_dirfrags();
+        for (const auto& subdir : subdirs) {
+          if (subdir->is_subtree_root()) {
+            // a nested subtree root anchors this inode in the cache
+            keep_inode = true;
+            dout(10) << "trim_non_auth_subtree(" << dir << ") keeping " << *subdir << dendl;
+          } else {
+            if (trim_non_auth_subtree(subdir))
+              keep_inode = true;
+            else {
+              in->close_dirfrag(subdir->get_frag());
+              dir->state_clear(CDir::STATE_COMPLETE);  // now incomplete!
+            }
+          }
+        }
+
+      }
+      if (!keep_inode) { // remove it!
+        dout(20) << "trim_non_auth_subtree(" << dir << ") removing inode " << in << " with dentry" << dn << dendl;
+        dir->unlink_inode(dn, false);
+        remove_inode(in);
+	ceph_assert(!dir->has_bloom());
+        dir->remove_dentry(dn);
+      } else {
+        dout(20) << "trim_non_auth_subtree(" << dir << ") keeping inode " << in << " with dentry " << dn <<dendl;
+	// kept items stay cached, but no longer as auth
+	dn->state_clear(CDentry::STATE_AUTH);
+	in->state_clear(CInode::STATE_AUTH);
+      }
+    } else if (keep_dir && dnl->is_null()) { // keep null dentry for peer rollback
+      dout(20) << "trim_non_auth_subtree(" << dir << ") keeping dentry " << dn <<dendl;
+    } else { // just remove it
+      dout(20) << "trim_non_auth_subtree(" << dir << ") removing dentry " << dn << dendl;
+      if (dnl->is_remote())
+        dir->unlink_inode(dn, false);
+      dir->remove_dentry(dn);
+    }
+  }
+  dir->state_clear(CDir::STATE_AUTH);
+  /**
+   * We've now checked all our children and deleted those that need it.
+   * Now return to caller, and tell them if *we're* a keeper.
+   */
+  return keep_dir || dir->get_num_any();
+}
+
+/*
+ * during replay, when we determine a subtree is no longer ours, we
+ * try to trim it from our cache.  because subtrees must be connected
+ * to the root, the fact that we can trim this tree may mean that our
+ * children or parents can also be trimmed.
+ *
+ * dir is the root dirfrag of the subtree to try trimming.
+ */
+void MDCache::try_trim_non_auth_subtree(CDir *dir)
+{
+  dout(10) << "try_trim_nonauth_subtree " << *dir << dendl;
+
+  // can we now trim child subtrees?
+  set<CDir*> bounds;
+  get_subtree_bounds(dir, bounds);
+  for (set<CDir*>::iterator p = bounds.begin(); p != bounds.end(); ++p) {
+    CDir *bd = *p;
+    if (bd->get_dir_auth().first != mds->get_nodeid() &&  // we are not auth
+	bd->get_num_any() == 0 && // and empty
+	can_trim_non_auth_dirfrag(bd)) {
+      CInode *bi = bd->get_inode();
+      dout(10) << " closing empty non-auth child subtree " << *bd << dendl;
+      remove_subtree(bd);
+      bd->mark_clean();
+      bi->close_dirfrag(bd->get_frag());
+    }
+  }
+
+  if (trim_non_auth_subtree(dir)) {
+    // keep
+    try_subtree_merge(dir);
+  } else {
+    // can we trim this subtree (and possibly our ancestors) too?
+    while (true) {
+      CInode *diri = dir->get_inode();
+      if (diri->is_base()) {
+	// reached a base inode: only a non-root base that we are not auth
+	// for is removed; either way the upward walk ends here
+	if (!diri->is_root() && diri->authority().first != mds->get_nodeid()) {
+	  dout(10) << " closing empty non-auth subtree " << *dir << dendl;
+	  remove_subtree(dir);
+	  dir->mark_clean();
+	  diri->close_dirfrag(dir->get_frag());
+
+	  dout(10) << " removing " << *diri << dendl;
+	  ceph_assert(!diri->get_parent_dn());
+	  ceph_assert(diri->get_num_ref() == 0);
+	  remove_inode(diri);
+	}
+	break;
+      }
+
+      CDir *psub = get_subtree_root(diri->get_parent_dir());
+      dout(10) << " parent subtree is " << *psub << dendl;
+      if (psub->get_dir_auth().first == mds->get_nodeid())
+	break;  // we are auth, keep.
+
+      dout(10) << " closing empty non-auth subtree " << *dir << dendl;
+      remove_subtree(dir);
+      dir->mark_clean();
+      diri->close_dirfrag(dir->get_frag());
+
+      // the parent subtree is not ours either: trim it, and continue
+      // upward only if nothing in it had to be kept
+      dout(10) << " parent subtree also non-auth: " << *psub << dendl;
+      if (trim_non_auth_subtree(psub))
+	break;
+      dir = psub;
+    }
+  }
+
+  show_subtrees();
+}
+
+/**
+ * Detach every cache object tracked by a log segment: each dirty list is
+ * drained by marking its items clean, and items that end up unreferenced
+ * have their dentry moved to the bottom of the LRU.
+ *
+ * @param ls  log segment whose tracking lists are emptied
+ */
+void MDCache::standby_trim_segment(LogSegment *ls)
+{
+  // Push an inode's parent dentry to the LRU bottom once neither the inode
+  // nor that dentry is referenced and the inode is not on the open-file list.
+  auto demote_inode = [this](CInode *inode) {
+    if (inode->get_num_ref() != 0) {
+      return;
+    }
+    if (inode->item_open_file.is_on_list()) {
+      return;
+    }
+    if (inode->parent == NULL) {
+      return;
+    }
+    if (inode->parent->get_num_ref() != 0) {
+      return;
+    }
+    touch_dentry_bottom(inode->parent);
+  };
+
+  // Push an unreferenced dentry to the LRU bottom unless its linked inode
+  // is on the open-file list.
+  auto demote_dentry = [this](CDentry *dentry) {
+    if (dentry->get_num_ref() > 0) {
+      return;
+    }
+    auto linked = dentry->get_linkage()->inode;
+    const bool open_file = linked && linked->item_open_file.is_on_list();
+    if (!open_file) {
+      touch_dentry_bottom(dentry);
+    }
+  };
+
+  ls->new_dirfrags.clear_list();
+  ls->open_files.clear_list();
+
+  // Each loop below relies on the per-item call (mark_clean, remove_dirty,
+  // ...) taking the item off the segment's list, as the original loops did.
+  while (!ls->dirty_dirfrags.empty()) {
+    CDir *frag = ls->dirty_dirfrags.front();
+    frag->mark_clean();
+    if (frag->inode) {
+      demote_inode(frag->inode);
+    }
+  }
+
+  while (!ls->dirty_inodes.empty()) {
+    CInode *inode = ls->dirty_inodes.front();
+    inode->mark_clean();
+    demote_inode(inode);
+  }
+
+  while (!ls->dirty_dentries.empty()) {
+    CDentry *dentry = ls->dirty_dentries.front();
+    dentry->mark_clean();
+    demote_dentry(dentry);
+  }
+
+  while (!ls->dirty_parent_inodes.empty()) {
+    CInode *inode = ls->dirty_parent_inodes.front();
+    inode->clear_dirty_parent();
+    demote_inode(inode);
+  }
+
+  while (!ls->dirty_dirfrag_dir.empty()) {
+    CInode *inode = ls->dirty_dirfrag_dir.front();
+    inode->filelock.remove_dirty();
+    demote_inode(inode);
+  }
+
+  while (!ls->dirty_dirfrag_nest.empty()) {
+    CInode *inode = ls->dirty_dirfrag_nest.front();
+    inode->nestlock.remove_dirty();
+    demote_inode(inode);
+  }
+
+  while (!ls->dirty_dirfrag_dirfragtree.empty()) {
+    CInode *inode = ls->dirty_dirfrag_dirfragtree.front();
+    inode->dirfragtreelock.remove_dirty();
+    demote_inode(inode);
+  }
+
+  // truncating inodes are erased explicitly, then their pin is dropped
+  while (!ls->truncating_inodes.empty()) {
+    auto first = ls->truncating_inodes.begin();
+    CInode *inode = *first;
+    ls->truncating_inodes.erase(first);
+    inode->put(CInode::PIN_TRUNCATING);
+    demote_inode(inode);
+  }
+}
+
+/**
+ * Process a cache-expire message from a peer rank, dropping that rank from
+ * the replica sets of the inodes, dirfrags and dentries it lists.
+ *
+ * The message is grouped into realms keyed by a container dirfrag.  A realm
+ * whose container we are not auth for, or which is at certain stages of an
+ * export, is not applied now but merged into delayed_expire for later.
+ * Each listed item carries a nonce; the expire is honoured only when that
+ * nonce matches the replica nonce currently recorded for the sender.
+ *
+ * @param m  the expire message; m->get_from() names the sending rank
+ */
+void MDCache::handle_cache_expire(const cref_t<MCacheExpire> &m)
+{
+  mds_rank_t from = mds_rank_t(m->get_from());
+  
+  dout(7) << "cache_expire from mds." << from << dendl;
+
+  // ignore expires that arrive before this rank has reached the rejoin state
+  if (mds->get_state() < MDSMap::STATE_REJOIN) {
+    return;
+  }
+
+  // locks whose replica set changed; evaluated once at the end
+  set<SimpleLock *> gather_locks;
+  // loop over realms
+  for (const auto &p : m->realms) {
+    // check container?
+    if (p.first.ino > 0) {
+      CInode *expired_inode = get_inode(p.first.ino);
+      ceph_assert(expired_inode);  // we had better have this.
+      CDir *parent_dir = expired_inode->get_approx_dirfrag(p.first.frag);
+      ceph_assert(parent_dir);
+
+      // -1 means "not an auth dirfrag that is being exported"
+      int export_state = -1;
+      if (parent_dir->is_auth() && parent_dir->is_exporting()) {
+	export_state = migrator->get_export_state(parent_dir);
+	ceph_assert(export_state >= 0);
+      }
+
+      // delay when we are not auth for the container, or when the export has
+      // progressed to a stage listed below for this sender
+      if (!parent_dir->is_auth() ||
+	  (export_state != -1 &&
+	   ((export_state == Migrator::EXPORT_WARNING &&
+	     migrator->export_has_warned(parent_dir,from)) ||
+	    export_state == Migrator::EXPORT_EXPORTING ||
+	    export_state == Migrator::EXPORT_LOGGINGFINISH ||
+	    (export_state == Migrator::EXPORT_NOTIFYING &&
+	     !migrator->export_has_notified(parent_dir,from))))) {
+
+	// not auth.
+	dout(7) << "delaying nonauth|warned expires for " << *parent_dir << dendl;
+	ceph_assert(parent_dir->is_frozen_tree_root());
+	
+	// make a message container
+
+        auto em = delayed_expire[parent_dir].emplace(std::piecewise_construct, std::forward_as_tuple(from), std::forward_as_tuple());
+        if (em.second)
+	  em.first->second = make_message<MCacheExpire>(from); /* new */
+
+	// merge these expires into it
+	em.first->second->add_realm(p.first, p.second);
+	continue;
+      }
+      ceph_assert(export_state <= Migrator::EXPORT_PREPPING ||
+             (export_state == Migrator::EXPORT_WARNING &&
+              !migrator->export_has_warned(parent_dir, from)));
+
+      dout(7) << "expires for " << *parent_dir << dendl;
+    } else {
+      dout(7) << "containerless expires (root, stray inodes)" << dendl;
+    }
+
+    // INODES
+    for (const auto &q : p.second.inodes) {
+      CInode *in = get_inode(q.first);
+      unsigned nonce = q.second;
+      
+      if (!in) {
+	// log before the assert below fires, so the failure is identifiable
+	dout(0) << " inode expire on " << q.first << " from " << from 
+		<< ", don't have it" << dendl;
+	ceph_assert(in);
+      }        
+      ceph_assert(in->is_auth());
+      dout(20) << __func__ << ": expiring inode " << *in << dendl;
+      
+      // check nonce
+      if (nonce == in->get_replica_nonce(from)) {
+	// remove from our cached_by
+	dout(7) << " inode expire on " << *in << " from mds." << from 
+		<< " cached_by was " << in->get_replicas() << dendl;
+	inode_remove_replica(in, from, false, gather_locks);
+      } 
+      else {
+	// this is an old nonce, ignore expire.
+	dout(7) << " inode expire on " << *in << " from mds." << from
+		<< " with old nonce " << nonce
+		<< " (current " << in->get_replica_nonce(from) << "), dropping" 
+		<< dendl;
+      }
+    }
+    
+    // DIRS
+    for (const auto &q : p.second.dirs) {
+      CDir *dir = get_dirfrag(q.first);
+      unsigned nonce = q.second;
+      
+      if (!dir) {
+	// exact dirfrag is missing: two cases are tolerated, anything else asserts
+	CInode *diri = get_inode(q.first.ino);
+	if (diri) {
+	  // case 1: still rejoining and the sender does not replicate the
+	  // inode; drop the sender from every nested dirfrag it replicates
+	  if (mds->is_rejoin() &&
+	      rejoin_ack_gather.count(mds->get_nodeid()) && // haven't sent rejoin ack yet
+	      !diri->is_replica(from)) {
+	    auto&& ls = diri->get_nested_dirfrags();
+	    dout(7) << " dir expire on dirfrag " << q.first << " from mds." << from
+		    << " while rejoining, inode isn't replicated" << dendl;
+	    for (const auto& d : ls) {
+	      dir = d;
+	      if (dir->is_replica(from)) {
+		dout(7) << " dir expire on " << *dir << " from mds." << from << dendl;
+		dir->remove_replica(from);
+	      }
+	    }
+	    continue;
+	  }
+	  // case 2: we hold a differently-fragmented dirfrag covering this
+	  // frag; the expire refers to a stale fragmentation and is dropped
+	  CDir *other = diri->get_approx_dirfrag(q.first.frag);
+	  if (other) {
+	    dout(7) << " dir expire on dirfrag " << q.first << " from mds." << from
+		    << " have " << *other << ", mismatched frags, dropping" << dendl;
+	    continue;
+	  }
+	}
+	dout(0) << " dir expire on " << q.first << " from " << from
+		<< ", don't have it" << dendl;
+	ceph_assert(dir);
+      }
+      dout(20) << __func__ << ": expiring dirfrag " << *dir << dendl;
+
+      ceph_assert(dir->is_auth());
+
+      // check nonce
+      if (nonce == dir->get_replica_nonce(from)) {
+	// remove from our cached_by
+	dout(7) << " dir expire on " << *dir << " from mds." << from
+		<< " replicas was " << dir->get_replicas() << dendl;
+	dir->remove_replica(from);
+      } 
+      else {
+	// this is an old nonce, ignore expire.
+	dout(7) << " dir expire on " << *dir << " from mds." << from 
+		<< " with old nonce " << nonce << " (current " << dir->get_replica_nonce(from)
+		<< "), dropping" << dendl;
+      }
+    }
+    
+    // DENTRIES
+    for (const auto &pd : p.second.dentries) {
+      dout(10) << " dn expires in dir " << pd.first << dendl;
+      CInode *diri = get_inode(pd.first.ino);
+      ceph_assert(diri);
+      CDir *dir = diri->get_dirfrag(pd.first.frag);
+      
+      if (!dir) {
+	dout(0) << " dn expires on " << pd.first << " from " << from
+		<< ", must have refragmented" << dendl;
+      } else {
+	ceph_assert(dir->is_auth());
+      }
+      
+      // NOTE: this loop variable shadows the realm iterator `p` of the outer loop
+      for (const auto &p : pd.second) {
+	unsigned nonce = p.second;
+	CDentry *dn;
+	
+	if (dir) {
+	  dn = dir->lookup(p.first.first, p.first.second);
+	} else {
+	  // which dirfrag for this dentry?
+	  // NOTE(review): this local shadows the outer `dir`, which is null on
+	  // this path, so the "missing dentry" log below omits the dirfrag.
+	  CDir *dir = diri->get_dirfrag(diri->pick_dirfrag(p.first.first));
+	  ceph_assert(dir); 
+	  ceph_assert(dir->is_auth());
+	  dn = dir->lookup(p.first.first, p.first.second);
+	}
+
+	if (!dn) { 
+	  if (dir)
+	    dout(0) << "  missing dentry for " << p.first.first << " snap " << p.first.second << " in " << *dir << dendl;
+	  else
+	    dout(0) << "  missing dentry for " << p.first.first << " snap " << p.first.second << dendl;
+	}
+	ceph_assert(dn);
+	
+	// same nonce rule as for inodes and dirfrags above
+	if (nonce == dn->get_replica_nonce(from)) {
+	  dout(7) << "  dentry_expire on " << *dn << " from mds." << from << dendl;
+	  dentry_remove_replica(dn, from, gather_locks);
+	} 
+	else {
+	  dout(7) << "  dentry_expire on " << *dn << " from mds." << from
+		  << " with old nonce " << nonce << " (current " << dn->get_replica_nonce(from)
+		  << "), dropping" << dendl;
+	}
+      }
+    }
+  }
+
+  // re-evaluate every lock that lost a replica and is not in a stable state
+  for (set<SimpleLock*>::iterator p = gather_locks.begin(); p != gather_locks.end(); ++p) {
+    if (!(*p)->is_stable())
+      mds->locker->eval_gather(*p);
+  }
+}
+
+/**
+ * Replay every expire message that was deferred for a dirfrag, then forget
+ * the deferred set.
+ *
+ * @param dir  container dirfrag whose delayed expires are applied
+ */
+void MDCache::process_delayed_expire(CDir *dir)
+{
+  dout(7) << "process_delayed_expire on " << *dir << dendl;
+
+  auto& pending = delayed_expire[dir];
+  for (auto it = pending.begin(), last = pending.end(); it != last; ++it) {
+    handle_cache_expire(it->second);
+  }
+
+  delayed_expire.erase(dir);
+}
+
+/**
+ * Drop, without applying them, any expire messages deferred for a dirfrag.
+ *
+ * @param dir  container dirfrag whose delayed expires are discarded
+ */
+void MDCache::discard_delayed_expire(CDir *dir)
+{
+  dout(7) << "discard_delayed_expire on " << *dir << dendl;
+
+  if (auto entry = delayed_expire.find(dir); entry != delayed_expire.end()) {
+    delayed_expire.erase(entry);
+  }
+}
+
+/**
+ * Forget that a rank replicates an inode: drop it from the inode's replica
+ * set, zero its wanted caps, and drop it from each of the inode's locks.
+ *
+ * @param in            inode losing the replica
+ * @param from          rank being removed
+ * @param rejoin        forwarded to the dirfragtree/file/nest locks' removal
+ * @param gather_locks  receives every lock whose remove_replica() returned true
+ */
+void MDCache::inode_remove_replica(CInode *in, mds_rank_t from, bool rejoin,
+				   set<SimpleLock *>& gather_locks)
+{
+  in->remove_replica(from);
+  in->set_mds_caps_wanted(from, 0);
+
+  // Record a lock for later evaluation when its removal reported a change.
+  // note: this code calls _eval more often than it needs to!
+  auto remember_if = [&gather_locks](SimpleLock *lock, bool changed) {
+    if (changed) {
+      gather_locks.insert(lock);
+    }
+  };
+
+  // fix lock
+  remember_if(&in->authlock, in->authlock.remove_replica(from));
+  remember_if(&in->linklock, in->linklock.remove_replica(from));
+  remember_if(&in->snaplock, in->snaplock.remove_replica(from));
+  remember_if(&in->xattrlock, in->xattrlock.remove_replica(from));
+  remember_if(&in->flocklock, in->flocklock.remove_replica(from));
+  remember_if(&in->policylock, in->policylock.remove_replica(from));
+
+  // If 'rejoin' is true and the scatter lock is in LOCK_MIX_* state.
+  // Don't remove the recovering mds from lock's gathering list because
+  // it may hold rejoined wrlocks.
+  remember_if(&in->dirfragtreelock, in->dirfragtreelock.remove_replica(from, rejoin));
+  remember_if(&in->filelock, in->filelock.remove_replica(from, rejoin));
+  remember_if(&in->nestlock, in->nestlock.remove_replica(from, rejoin));
+}
+
+/**
+ * Forget that a rank replicates a dentry, and re-evaluate the linked inode
+ * as a stray when the projected linkage is primary.
+ *
+ * @param dn            dentry losing the replica
+ * @param from          rank being removed
+ * @param gather_locks  receives the dentry lock if its removal returned true
+ */
+void MDCache::dentry_remove_replica(CDentry *dn, mds_rank_t from, set<SimpleLock *>& gather_locks)
+{
+  dn->remove_replica(from);
+
+  // fix lock
+  if (dn->lock.remove_replica(from)) {
+    gather_locks.insert(&dn->lock);
+  }
+
+  // Replicated strays might now be eligible for purge
+  if (CDentry::linkage_t *projected = dn->get_projected_linkage();
+      projected->is_primary()) {
+    maybe_eval_stray(projected->get_inode());
+  }
+}
+
+/**
+ * Expire client leases whose ttl has passed.  Each lease pool is scanned
+ * from the front and the scan stops at the first lease still in the future.
+ */
+void MDCache::trim_client_leases()
+{
+  utime_t now = ceph_clock_now();
+
+  dout(10) << "trim_client_leases" << dendl;
+
+  // 1-based pool number, used only in the log line below
+  std::size_t pool = 0;
+  for (const auto& leases : client_leases) {
+    ++pool;
+    if (leases.empty()) {
+      continue;
+    }
+
+    auto before = leases.size();
+    for (;;) {
+      if (leases.empty()) {
+        break;
+      }
+      ClientLease *lease = leases.front();
+      if (lease->ttl > now) {
+        break;
+      }
+      CDentry *dn = static_cast<CDentry*>(lease->parent);
+      dout(10) << " expiring client." << lease->client << " lease of " << *dn << dendl;
+      dn->remove_client_lease(lease, mds->locker);
+    }
+    auto after = leases.size();
+
+    dout(10) << "trim_client_leases pool " << pool << " trimmed "
+             << (before-after) << " leases, " << after << " left" << dendl;
+  }
+}
+
+/**
+ * Sample process memory, log it together with inode/cap counts, and publish
+ * rss and heap to the memory perf counters.
+ */
+void MDCache::check_memory_usage()
+{
+  static MemoryModel model(g_ceph_context);
+  static MemoryModel::snap latest;
+  model.sample(&latest);
+  // initialised once, from the sample taken on the first call
+  static MemoryModel::snap first_sample = latest;
+
+  // check client caps
+  ceph_assert(CInode::count() == inode_map.size() + snap_inode_map.size() + num_shadow_inodes);
+  const double caps_per_inode = CInode::count()
+    ? static_cast<double>(Capability::count()) / static_cast<double>(CInode::count())
+    : 0.0;
+
+  dout(2) << "Memory usage: "
+          << " total " << latest.get_total()
+          << ", rss " << latest.get_rss()
+          << ", heap " << latest.get_heap()
+          << ", baseline " << first_sample.get_heap()
+          << ", " << num_inodes_with_caps << " / " << CInode::count() << " inodes have caps"
+          << ", " << Capability::count() << " caps, " << caps_per_inode << " caps per inode"
+          << dendl;
+
+  mds->update_mlogger();
+  mds->mlogger->set(l_mdm_rss, latest.get_rss());
+  mds->mlogger->set(l_mdm_heap, latest.get_heap());
+}
+
+
+
+// =========================================================================================
+// shutdown
+
+// Timer callback that runs MDCache::shutdown_check() on the owning cache.
+class C_MDC_ShutdownCheck : public MDCacheContext {
+public:
+  explicit C_MDC_ShutdownCheck(MDCache *cache)
+    : MDCacheContext(cache)
+  {}
+
+  // The completion code is ignored.
+  void finish(int) override
+  {
+    mdcache->shutdown_check();
+  }
+};
+
+/**
+ * Periodic shutdown diagnostic: dump the cache with debug_mds temporarily
+ * raised to 10, re-arm the check timer, and log LRU, journal and objecter
+ * status.
+ */
+void MDCache::shutdown_check()
+{
+  dout(0) << "shutdown_check at " << ceph_clock_now() << dendl;
+
+  // cache: remember the current debug_mds level so it can be restored
+  char saved_level[32] = {};
+  char *saved_ptr = saved_level;
+  g_conf().get_val("debug_mds", &saved_ptr, sizeof(saved_level));
+
+  g_conf().set_val("debug_mds", "10");
+  g_conf().apply_changes(nullptr);
+  show_cache();
+
+  g_conf().set_val("debug_mds", saved_level);
+  g_conf().apply_changes(nullptr);
+
+  // schedule the next check
+  mds->timer.add_event_after(g_conf()->mds_shutdown_check, new C_MDC_ShutdownCheck(this));
+
+  // this
+  dout(0) << "lru size now " << lru.lru_get_size() << "/" << bottom_lru.lru_get_size() << dendl;
+  dout(0) << "log len " << mds->mdlog->get_num_events() << dendl;
+
+  if (mds->objecter->is_active()) {
+    dout(0) << "objecter still active" << dendl;
+    mds->objecter->dump_active();
+  }
+}
+
+
+/**
+ * Begin shutdown.  When mds_shutdown_check is non-zero, arm the periodic
+ * shutdown diagnostic timer.
+ */
+void MDCache::shutdown_start()
+{
+  dout(5) << "shutdown_start" << dendl;
+
+  if (!g_conf()->mds_shutdown_check) {
+    return;  // periodic diagnostic disabled
+  }
+
+  mds->timer.add_event_after(g_conf()->mds_shutdown_check, new C_MDC_ShutdownCheck(this));
+}
+
+
+
+/**
+ * Run one pass of the shutdown sequence.
+ *
+ * Each call pushes the shutdown forward as far as it can (exporting strays
+ * and auth subtrees, closing sessions, trimming the journal and the cache)
+ * and returns as soon as it reaches a stage that has to wait.
+ *
+ * @return true once the cache is fully shut down (or the mds is already
+ *         stopped); false if another pass is needed
+ */
+bool MDCache::shutdown_pass()
+{
+  dout(7) << "shutdown_pass" << dendl;
+
+  if (mds->is_stopped()) {
+    dout(7) << " already shut down" << dendl;
+    show_cache();
+    show_subtrees();
+    return true;
+  }
+
+  // empty stray dir
+  bool strays_all_exported = shutdown_export_strays();
+
+  // trim cache
+  trim(UINT64_MAX);
+  dout(5) << "lru size now " << lru.lru_get_size() << "/" << bottom_lru.lru_get_size() << dendl;
+
+  // Export all subtrees to another active (usually rank 0) if not rank 0
+  int num_auth_subtree = 0;
+  if (!subtrees.empty() && mds->get_nodeid() != 0) {
+    dout(7) << "looking for subtrees to export" << dendl;
+    std::vector<CDir*> ls;
+    for (auto& [dir, bounds] : subtrees) {
+      dout(10) << "  examining " << *dir << " bounds " << bounds << dendl;
+      if (dir->get_inode()->is_mdsdir() || !dir->is_auth())
+	continue;
+      // counted even if it cannot be exported right now, so that this pass
+      // still reports "not done" below
+      num_auth_subtree++;
+      if (dir->is_frozen() ||
+          dir->is_freezing() ||
+          dir->is_ambiguous_dir_auth() ||
+          dir->state_test(CDir::STATE_EXPORTING) ||
+          dir->get_inode()->is_ephemerally_pinned()) {
+        continue;
+      }
+      ls.push_back(dir);
+    }
+
+    migrator->clear_export_queue();
+    // stopping mds does not call MDBalancer::tick()
+    mds->balancer->handle_export_pins();
+    for (const auto& dir : ls) {
+      // send the subtree to its inode's authority, falling back to rank 0
+      // when that rank is not active
+      mds_rank_t dest = dir->get_inode()->authority().first;
+      if (dest > 0 && !mds->mdsmap->is_active(dest))
+	dest = 0;
+      dout(7) << "sending " << *dir << " back to mds." << dest << dendl;
+      migrator->export_dir_nicely(dir, dest);
+    }
+  }
+
+  if (!strays_all_exported) {
+    dout(7) << "waiting for strays to migrate" << dendl;
+    return false;
+  }
+
+  if (num_auth_subtree > 0) {
+    ceph_assert(mds->get_nodeid() > 0);
+    dout(7) << "still have " << num_auth_subtree << " auth subtrees" << dendl;
+    show_subtrees();
+    return false;
+  }
+
+  // close out any sessions (and open files!) before we try to trim the log, etc.
+  if (mds->sessionmap.have_unclosed_sessions()) {
+    if (!mds->server->terminating_sessions)
+      mds->server->terminate_sessions();
+    return false;
+  }
+
+  // Fully trim the log so that all objects in cache are clean and may be
+  // trimmed by a future MDCache::trim. Note that MDSRank::tick does not
+  // trim the log such that the cache eventually becomes clean.
+  if (mds->mdlog->get_num_segments() > 0) {
+    auto ls = mds->mdlog->get_current_segment();
+    if (ls->num_events > 1 || !ls->dirty_dirfrags.empty()) {
+      // Current segment contains events other than subtreemap or
+      // there are dirty dirfrags (see CDir::log_mark_dirty())
+      mds->mdlog->start_new_segment();
+      mds->mdlog->flush();
+    }
+  }
+  mds->mdlog->trim_all();
+  if (mds->mdlog->get_num_segments() > 1) {
+    dout(7) << "still >1 segments, waiting for log to trim" << dendl;
+    return false;
+  }
+
+  // drop our reference to our stray dir inode
+  for (int i = 0; i < NUM_STRAY; ++i) {
+    if (strays[i] &&
+	strays[i]->state_test(CInode::STATE_STRAYPINNED)) {
+      strays[i]->state_clear(CInode::STATE_STRAYPINNED);
+      strays[i]->put(CInode::PIN_STRAY);
+      strays[i]->put_stickydirs();
+    }
+  }
+
+  // mydir is only of interest below while it is still a subtree root
+  CDir *mydir = myin ? myin->get_dirfrag(frag_t()) : NULL;
+  if (mydir && !mydir->is_subtree_root())
+    mydir = NULL;
+
+  // subtrees map not empty yet?
+  if (subtrees.size() > (mydir ? 1 : 0)) {
+    dout(7) << "still have " << num_subtrees() << " subtrees" << dendl;
+    show_subtrees();
+    migrator->show_importing();
+    migrator->show_exporting();
+    if (!migrator->is_importing() && !migrator->is_exporting())
+      show_cache();
+    return false;
+  }
+  ceph_assert(!migrator->is_exporting());
+  ceph_assert(!migrator->is_importing());
+
+  // replicas may dirty scatter locks
+  if (myin && myin->is_replicated()) {
+    dout(7) << "still have replicated objects" << dendl;
+    return false;
+  }
+
+  if ((myin && myin->get_num_auth_pins()) ||
+      (mydir && (mydir->get_auth_pins() || mydir->get_dir_auth_pins()))) {
+    dout(7) << "still have auth pinned objects" << dendl;
+    return false;
+  }
+
+  // (only do this once!)
+  if (!mds->mdlog->is_capped()) {
+    dout(7) << "capping the mdlog" << dendl;
+    mds->mdlog->cap();
+  }
+  
+  if (!mds->mdlog->empty())
+    mds->mdlog->trim(0);
+
+  if (!mds->mdlog->empty()) {
+    dout(7) << "waiting for log to flush.. " << mds->mdlog->get_num_events() 
+	    << " in " << mds->mdlog->get_num_segments() << " segments" << dendl;
+    return false;
+  }
+  
+  // did_shutdown_log_cap makes the header write below happen on one pass only
+  if (!did_shutdown_log_cap) {
+    // flush journal header
+    dout(7) << "writing header for (now-empty) journal" << dendl;
+    ceph_assert(mds->mdlog->empty());
+    mds->mdlog->write_head(0);  
+    // NOTE: filer active checker below will block us until this completes.
+    did_shutdown_log_cap = true;
+    return false;
+  }
+
+  // filer active?
+  if (mds->objecter->is_active()) {
+    dout(7) << "objecter still active" << dendl;
+    mds->objecter->dump_active();
+    return false;
+  }
+
+  // trim what we can from the cache
+  if (lru.lru_get_size() > 0 || bottom_lru.lru_get_size() > 0) {
+    dout(7) << "there's still stuff in the cache: " << lru.lru_get_size() << "/" << bottom_lru.lru_get_size()  << dendl;
+    show_cache();
+    //dump();
+    return false;
+  }
+
+  // make mydir subtree go away
+  if (mydir) {
+    if (mydir->get_num_ref() > 1) { // subtree pin
+      dout(7) << "there's still reference to mydir " << *mydir << dendl;
+      show_cache();
+      return false;
+    }
+
+    remove_subtree(mydir);
+    myin->close_dirfrag(mydir->get_frag());
+  }
+  ceph_assert(subtrees.empty());
+
+  if (myin) {
+    remove_inode(myin);
+    // NOTE(review): presumably remove_inode() resets the myin member when
+    // handed that inode; the assert depends on it -- confirm.
+    ceph_assert(!myin);
+  }
+
+  if (global_snaprealm) {
+    remove_inode(global_snaprealm->inode);
+    global_snaprealm = nullptr;
+  }
+  
+  // done!
+  dout(5) << "shutdown done." << dendl;
+  return true;
+}
+
+/**
+ * shutdown_export_strays -- hand this rank's stray dentries to rank 0
+ *
+ * Scans the stray directories, resuming from the (dirfrag, dentry name)
+ * cursor kept in shutdown_export_next, and asks stray_manager to migrate
+ * every non-null dentry that is not being purged.  Inode numbers that have
+ * been handed off (or are being purged) are recorded in
+ * shutdown_exporting_strays.
+ *
+ * @return true if we are rank 0, or if a full scan from the first stray
+ *         dirfrag found nothing left and nothing is in flight; false if
+ *         there is still work outstanding.
+ */
+bool MDCache::shutdown_export_strays()
+{
+  // upper bound on entries tracked in shutdown_exporting_strays per pass
+  static const unsigned MAX_EXPORTING = 100;
+
+  if (mds->get_nodeid() == 0)
+    return true;
+
+  // don't scan for more while at least 2/3 of MAX_EXPORTING are in flight
+  if (shutdown_exporting_strays.size() * 3 >= MAX_EXPORTING * 2)
+    return false;
+
+  dout(10) << "shutdown_export_strays " << shutdown_export_next.first
+	   << " '" << shutdown_export_next.second << "'" << dendl;
+
+  bool mds0_active = mds->mdsmap->is_active(mds_rank_t(0));
+  bool all_exported = false;
+
+again:
+  // work on a copy of the cursor; it is written back at 'done'
+  auto next = shutdown_export_next;
+
+  for (int i = 0; i < NUM_STRAY; ++i) {
+    CInode *strayi = strays[i];
+    if (!strayi ||
+	!strayi->state_test(CInode::STATE_STRAYPINNED))
+      continue;
+    // skip stray dirs that sort before the cursor
+    if (strayi->ino() < next.first.ino)
+      continue;
+
+    deque<CDir*> dfls;
+    strayi->get_dirfrags(dfls);
+
+    while (!dfls.empty()) {
+      CDir *dir = dfls.front();
+      dfls.pop_front();
+
+      // skip dirfrags before the cursor; on moving past it, restart the
+      // dentry name part of the cursor
+      if (dir->dirfrag() < next.first)
+	continue;
+      if (next.first < dir->dirfrag()) {
+	next.first = dir->dirfrag();
+	next.second.clear();
+      }
+
+      if (!dir->is_complete()) {
+	// fetch the dirfrag first; only re-enter this function from the
+	// fetch completion when nothing else is in flight
+	MDSContext *fin = nullptr;
+	if (shutdown_exporting_strays.empty()) {
+	  fin = new MDSInternalContextWrapper(mds,
+		  new LambdaContext([this](int r) {
+		    shutdown_export_strays();
+		  })
+		);
+	}
+	dir->fetch(fin);
+	goto done;
+      }
+
+      // position the iterator at the cursor's dentry name (or the start)
+      CDir::dentry_key_map::iterator it;
+      if (next.second.empty()) {
+	it = dir->begin();
+      } else {
+	auto hash = ceph_frag_value(strayi->hash_dentry_name(next.second));
+	it = dir->lower_bound(dentry_key_t(0, next.second, hash));
+      }
+
+      for (; it != dir->end(); ++it) {
+	CDentry *dn = it->second;
+	CDentry::linkage_t *dnl = dn->get_projected_linkage();
+	if (dnl->is_null())
+	  continue;
+
+	// rank 0 is not active: stop at the first dentry that would need
+	// migrating and remember it in the cursor
+	if (!mds0_active && !dn->state_test(CDentry::STATE_PURGING)) {
+	  next.second = it->first.name;
+	  goto done;
+	}
+
+	auto ret = shutdown_exporting_strays.insert(dnl->get_inode()->ino());
+	if (!ret.second) {
+	  dout(10) << "already exporting/purging " << *dn << dendl;
+	  continue;
+	}
+
+	// Don't try to migrate anything that is actually
+	// being purged right now
+	if (!dn->state_test(CDentry::STATE_PURGING))
+	  stray_manager.migrate_stray(dn, mds_rank_t(0));  // send to root!
+
+	if (shutdown_exporting_strays.size() >= MAX_EXPORTING) {
+	  // batch is full: advance the cursor to the next dentry, the next
+	  // dirfrag, or the next stray inode, then stop
+	  ++it;
+	  if (it != dir->end()) {
+	    next.second = it->first.name;
+	  } else {
+	    if (dfls.empty())
+	      next.first.ino.val++;
+	    else
+	      next.first = dfls.front()->dirfrag();
+	    next.second.clear();
+	  }
+	  goto done;
+	}
+      }
+    }
+  }
+
+  // the scan reached the end without filling a batch
+  if (shutdown_exporting_strays.empty()) {
+    dirfrag_t first_df(MDS_INO_STRAY(mds->get_nodeid(), 0), 0);
+    // if this pass did not start from the very first stray dirfrag,
+    // rewind the cursor and scan once more before declaring success
+    if (first_df < shutdown_export_next.first ||
+	!shutdown_export_next.second.empty()) {
+      shutdown_export_next.first = first_df;
+      shutdown_export_next.second.clear();
+      goto again;
+    }
+    all_exported = true;
+  }
+
+done:
+  shutdown_export_next = next;
+  return all_exported;
+}
+
+// ========= messaging ==============
+
+/**
+ * Hand an incoming cache message to the handler for its message type.
+ *
+ * A type with no handler here is logged via derr and aborts the process.
+ */
+void MDCache::dispatch(const cref_t<Message> &m)
+{
+  const auto msg_type = m->get_type();
+
+  switch (msg_type) {
+  // -- resolve --
+  case MSG_MDS_RESOLVE:
+    handle_resolve(ref_cast<MMDSResolve>(m));
+    return;
+  case MSG_MDS_RESOLVEACK:
+    handle_resolve_ack(ref_cast<MMDSResolveAck>(m));
+    return;
+
+  // -- rejoin --
+  case MSG_MDS_CACHEREJOIN:
+    handle_cache_rejoin(ref_cast<MMDSCacheRejoin>(m));
+    return;
+
+  // -- discover --
+  case MSG_MDS_DISCOVER:
+    handle_discover(ref_cast<MDiscover>(m));
+    return;
+  case MSG_MDS_DISCOVERREPLY:
+    handle_discover_reply(ref_cast<MDiscoverReply>(m));
+    return;
+
+  // -- replica maintenance --
+  case MSG_MDS_DIRUPDATE:
+    handle_dir_update(ref_cast<MDirUpdate>(m));
+    return;
+  case MSG_MDS_CACHEEXPIRE:
+    handle_cache_expire(ref_cast<MCacheExpire>(m));
+    return;
+
+  // -- dentry link/unlink --
+  case MSG_MDS_DENTRYLINK:
+    handle_dentry_link(ref_cast<MDentryLink>(m));
+    return;
+  case MSG_MDS_DENTRYUNLINK:
+    handle_dentry_unlink(ref_cast<MDentryUnlink>(m));
+    return;
+  case MSG_MDS_DENTRYUNLINK_ACK:
+    handle_dentry_unlink_ack(ref_cast<MDentryUnlinkAck>(m));
+    return;
+
+  // -- fragmentation --
+  case MSG_MDS_FRAGMENTNOTIFY:
+    handle_fragment_notify(ref_cast<MMDSFragmentNotify>(m));
+    return;
+  case MSG_MDS_FRAGMENTNOTIFYACK:
+    handle_fragment_notify_ack(ref_cast<MMDSFragmentNotifyAck>(m));
+    return;
+
+  // -- find ino --
+  case MSG_MDS_FINDINO:
+    handle_find_ino(ref_cast<MMDSFindIno>(m));
+    return;
+  case MSG_MDS_FINDINOREPLY:
+    handle_find_ino_reply(ref_cast<MMDSFindInoReply>(m));
+    return;
+
+  // -- open ino --
+  case MSG_MDS_OPENINO:
+    handle_open_ino(ref_cast<MMDSOpenIno>(m));
+    return;
+  case MSG_MDS_OPENINOREPLY:
+    handle_open_ino_reply(ref_cast<MMDSOpenInoReply>(m));
+    return;
+
+  // -- snapshots --
+  case MSG_MDS_SNAPUPDATE:
+    handle_snap_update(ref_cast<MMDSSnapUpdate>(m));
+    return;
+
+  default:
+    break;
+  }
+
+  // nothing above claimed the message
+  derr << "cache unknown message " << msg_type << dendl;
+  ceph_abort_msg("cache unknown message");
+}
+
+/**
+ * path_traverse -- walk a filepath through the cache, one component at a time
+ *
+ * @param mdr request on whose behalf we traverse; may be a null ref only
+ *            when MDS_TRAVERSE_DISCOVER is set (asserted below)
+ * @param cf factory used to build a waiter context whenever we must block
+ * @param path path to walk, starting from inode path.get_ino()
+ * @param flags MDS_TRAVERSE_* bits, decoded into the locals below
+ * @param pdnvec if non-null, cleared and then filled with the dentries
+ *               traversed
+ * @param pin if non-null, set to the last inode reached
+ *
+ * @return 0 on success; 1 if the traversal could not complete now (a
+ *         waiter built by cf was queued, or locks could not be acquired);
+ *         2 if the request was forwarded to another rank; a negative
+ *         CEPHFS_* error code on failure.
+ */
+int MDCache::path_traverse(MDRequestRef& mdr, MDSContextFactory& cf,
+                           const filepath& path, int flags,
+                           vector<CDentry*> *pdnvec, CInode **pin)
+{
+  bool discover = (flags & MDS_TRAVERSE_DISCOVER);
+  bool forward = !discover;
+  bool path_locked = (flags & MDS_TRAVERSE_PATH_LOCKED);
+  bool want_dentry = (flags & MDS_TRAVERSE_WANT_DENTRY);
+  bool want_auth = (flags & MDS_TRAVERSE_WANT_AUTH);
+  bool rdlock_snap = (flags & (MDS_TRAVERSE_RDLOCK_SNAP | MDS_TRAVERSE_RDLOCK_SNAP2));
+  bool rdlock_path = (flags & MDS_TRAVERSE_RDLOCK_PATH);
+  bool xlock_dentry = (flags & MDS_TRAVERSE_XLOCK_DENTRY);
+  bool rdlock_authlock = (flags & MDS_TRAVERSE_RDLOCK_AUTHLOCK);
+
+  if (forward)
+    ceph_assert(mdr);  // forward requires a request
+
+  snapid_t snapid = CEPH_NOSNAP;
+  if (mdr)
+    mdr->snapid = snapid;
+
+  client_t client = mdr ? mdr->get_client() : -1;
+
+  if (mds->logger) mds->logger->inc(l_mds_traverse);
+
+  dout(7) << "traverse: opening base ino " << path.get_ino() << " snap " << snapid << dendl;
+  CInode *cur = get_inode(path.get_ino());
+  if (!cur) {
+    // base inode is not in cache: mdsdir and stray inodes can be opened
+    // asynchronously, anything else is stale
+    if (MDS_INO_IS_MDSDIR(path.get_ino())) {
+      open_foreign_mdsdir(path.get_ino(), cf.build());
+      return 1;
+    }
+    if (MDS_INO_IS_STRAY(path.get_ino())) {
+      mds_rank_t rank = MDS_INO_STRAY_OWNER(path.get_ino());
+      unsigned idx = MDS_INO_STRAY_INDEX(path.get_ino());
+      // NOTE: this local 'path' shadows the function parameter; it names
+      // the stray dir by its dentry name under the owner's mdsdir
+      filepath path(strays[idx]->get_parent_dn()->get_name(),
+		    MDS_INO_MDSDIR(rank));
+      MDRequestRef null_ref;
+      return path_traverse(null_ref, cf, path, MDS_TRAVERSE_DISCOVER, nullptr);
+    }
+    return -CEPHFS_ESTALE;
+  }
+  if (cur->state_test(CInode::STATE_PURGING))
+    return -CEPHFS_ESTALE;
+
+  if (flags & MDS_TRAVERSE_CHECK_LOCKCACHE)
+    mds->locker->find_and_attach_lock_cache(mdr, cur);
+
+  if (mdr && mdr->lock_cache) {
+    if (flags & MDS_TRAVERSE_WANT_DIRLAYOUT)
+      mdr->dir_layout = mdr->lock_cache->get_dir_layout();
+  } else if (rdlock_snap) {
+    // n selects which of the two snap lock states (SNAP_LOCKED or
+    // SNAP2_LOCKED) this traversal is responsible for
+    int n = (flags & MDS_TRAVERSE_RDLOCK_SNAP2) ? 1 : 0;
+    if ((n == 0 && !(mdr->locking_state & MutationImpl::SNAP_LOCKED)) ||
+	(n == 1 && !(mdr->locking_state & MutationImpl::SNAP2_LOCKED))) {
+      bool want_layout = (flags & MDS_TRAVERSE_WANT_DIRLAYOUT);
+      if (!mds->locker->try_rdlock_snap_layout(cur, mdr, n, want_layout))
+	return 1;
+    }
+  }
+
+  // start trace
+  if (pdnvec)
+    pdnvec->clear();
+  if (pin)
+    *pin = cur;
+
+  MutationImpl::LockOpVec lov;
+
+  // depth is only advanced by the branches that consume a path component
+  for (unsigned depth = 0; depth < path.depth(); ) {
+    dout(12) << "traverse: path seg depth " << depth << " '" << path[depth]
+	     << "' snapid " << snapid << dendl;
+    
+    if (!cur->is_dir()) {
+      dout(7) << "traverse: " << *cur << " not a dir " << dendl;
+      return -CEPHFS_ENOTDIR;
+    }
+
+    // walk into snapdir?
+    // (an empty path component selects the snapdir)
+    if (path[depth].length() == 0) {
+      dout(10) << "traverse: snapdir" << dendl;
+      if (!mdr || depth > 0) // snapdir must be the first component
+	return -CEPHFS_EINVAL;
+      snapid = CEPH_SNAPDIR;
+      mdr->snapid = snapid;
+      depth++;
+      continue;
+    }
+    // walk thru snapdir?
+    // (the component after the snapdir is a snapshot name to resolve)
+    if (snapid == CEPH_SNAPDIR) {
+      if (!mdr)
+	return -CEPHFS_EINVAL;
+      SnapRealm *realm = cur->find_snaprealm();
+      snapid = realm->resolve_snapname(path[depth], cur->ino());
+      dout(10) << "traverse: snap " << path[depth] << " -> " << snapid << dendl;
+      if (!snapid) {
+	if (pdnvec)
+	  pdnvec->clear();   // do not confuse likes of rdlock_path_pin_ref();
+	return -CEPHFS_ENOENT;
+      }
+      mdr->snapid = snapid;
+      depth++;
+      continue;
+    }
+
+    // open dir
+    frag_t fg = cur->pick_dirfrag(path[depth]);
+    CDir *curdir = cur->get_dirfrag(fg);
+    if (!curdir) {
+      if (cur->is_auth()) {
+        // parent dir frozen_dir?
+        if (cur->is_frozen()) {
+          dout(7) << "traverse: " << *cur << " is frozen, waiting" << dendl;
+          cur->add_waiter(CDir::WAIT_UNFREEZE, cf.build());
+          return 1;
+        }
+        curdir = cur->get_or_open_dirfrag(this, fg);
+      } else {
+        // discover?
+	dout(10) << "traverse: need dirfrag " << fg << ", doing discover from " << *cur << dendl;
+	discover_path(cur, snapid, path.postfixpath(depth), cf.build(),
+		      path_locked);
+	if (mds->logger) mds->logger->inc(l_mds_traverse_discover);
+        return 1;
+      }
+    }
+    ceph_assert(curdir);
+
+#ifdef MDS_VERIFY_FRAGSTAT
+    if (curdir->is_complete())
+      curdir->verify_fragstat();
+#endif
+
+    // frozen?
+    /*
+    if (curdir->is_frozen()) {
+    // doh!
+      // FIXME: traverse is allowed?
+      dout(7) << "traverse: " << *curdir << " is frozen, waiting" << dendl;
+      curdir->add_waiter(CDir::WAIT_UNFREEZE, _get_waiter(mdr, req, fin));
+      if (onfinish) delete onfinish;
+      return 1;
+    }
+    */
+
+    // the caller wants the final dentry on its auth rank: settle the
+    // dirfrag's authority before looking the dentry up
+    if (want_auth && want_dentry && depth == path.depth() - 1) {
+      if (curdir->is_ambiguous_auth()) {
+	dout(10) << "waiting for single auth on " << *curdir << dendl;
+	curdir->add_waiter(CInode::WAIT_SINGLEAUTH, cf.build());
+	return 1;
+      }
+      if (!curdir->is_auth()) {
+	dout(10) << "fw to auth for " << *curdir << dendl;
+	request_forward(mdr, curdir->authority().first);
+	return 2;
+      }
+    }
+
+    // Before doing dirfrag->dn lookup, compare with DamageTable's
+    // record of which dentries were unreadable
+    if (mds->damage_table.is_dentry_damaged(curdir, path[depth], snapid)) {
+      dout(4) << "traverse: stopped lookup at damaged dentry "
+              << *curdir << "/" << path[depth] << " snap=" << snapid << dendl;
+      return -CEPHFS_EIO;
+    }
+
+    // dentry
+    CDentry *dn = curdir->lookup(path[depth], snapid);
+    if (dn) {
+      if (dn->state_test(CDentry::STATE_PURGING))
+	return -CEPHFS_ENOENT;
+
+      if (rdlock_path) {
+	lov.clear();
+	// final component with xlock requested: xlock the dentry and
+	// wrlock the parent's filelock/nestlock (skipped at depth 0 when
+	// the request has a lock cache); otherwise just rdlock the dentry
+	if (xlock_dentry && depth == path.depth() - 1) {
+	  if (depth > 0 || !mdr->lock_cache) {
+	    lov.add_wrlock(&cur->filelock);
+	    lov.add_wrlock(&cur->nestlock);
+	    if (rdlock_authlock)
+	      lov.add_rdlock(&cur->authlock);
+	  }
+	  lov.add_xlock(&dn->lock);
+	} else {
+	  // force client to flush async dir operation if necessary
+	  if (cur->filelock.is_cached())
+	    lov.add_wrlock(&cur->filelock);
+	  lov.add_rdlock(&dn->lock);
+	}
+	if (!mds->locker->acquire_locks(mdr, lov)) {
+	  dout(10) << "traverse: failed to rdlock " << dn->lock << " " << *dn << dendl;
+	  return 1;
+	}
+      } else if (!path_locked &&
+		 !dn->lock.can_read(client) &&
+		 !(dn->lock.is_xlocked() && dn->lock.get_xlock_by() == mdr)) {
+	// not taking locks ourselves: wait until the dentry is readable,
+	// unless this very request holds the xlock
+	dout(10) << "traverse: non-readable dentry at " << *dn << dendl;
+	dn->lock.add_waiter(SimpleLock::WAIT_RD, cf.build());
+	if (mds->logger)
+	  mds->logger->inc(l_mds_traverse_lock);
+	if (dn->is_auth() && dn->lock.is_unstable_and_locked())
+	  mds->mdlog->flush();
+	return 1;
+      }
+
+      if (pdnvec)
+	pdnvec->push_back(dn);
+
+      CDentry::linkage_t *dnl = dn->get_projected_linkage();
+      // can we conclude CEPHFS_ENOENT?
+      if (dnl->is_null()) {
+	dout(10) << "traverse: null+readable dentry at " << *dn << dendl;
+	if (depth == path.depth() - 1) {
+	  // a null final dentry is a success if the caller asked for it
+	  if (want_dentry)
+	    break;
+	} else {
+	  if (pdnvec)
+	    pdnvec->clear();   // do not confuse likes of rdlock_path_pin_ref();
+	}
+	return -CEPHFS_ENOENT;
+      }
+
+      // do we have inode?
+      CInode *in = dnl->get_inode();
+      if (!in) {
+        ceph_assert(dnl->is_remote());
+        // do i have it?
+        in = get_inode(dnl->get_remote_ino());
+        if (in) {
+	  dout(7) << "linking in remote in " << *in << dendl;
+	  dn->link_remote(dnl, in);
+	} else {
+          dout(7) << "remote link to " << dnl->get_remote_ino() << ", which i don't have" << dendl;
+	  ceph_assert(mdr);  // we shouldn't hit non-primary dentries doing a non-mdr traversal!
+          if (mds->damage_table.is_remote_damaged(dnl->get_remote_ino())) {
+            dout(4) << "traverse: remote dentry points to damaged ino "
+                    << *dn << dendl;
+            return -CEPHFS_EIO;
+          }
+          open_remote_dentry(dn, true, cf.build(),
+			     (path_locked && depth == path.depth() - 1));
+	  if (mds->logger) mds->logger->inc(l_mds_traverse_remote_ino);
+          return 1;
+        }
+      }
+
+      cur = in;
+
+      // rdlock the snaplock of each inode we step into, except the target
+      // of the final dentry when the caller wants the dentry itself
+      if (rdlock_snap && !(want_dentry && depth == path.depth() - 1)) {
+	lov.clear();
+	lov.add_rdlock(&cur->snaplock);
+	if (!mds->locker->acquire_locks(mdr, lov)) {
+	  dout(10) << "traverse: failed to rdlock " << cur->snaplock << " " << *cur << dendl;
+	  return 1;
+	}
+      }
+
+      // add to trace, continue.
+      touch_inode(cur);
+      if (pin)
+	*pin = cur;
+      depth++;
+      continue;
+    }
+
+    ceph_assert(!dn);
+
+    // MISS.  dentry doesn't exist.
+    dout(12) << "traverse: miss on dentry " << path[depth] << " in " << *curdir << dendl;
+
+    if (curdir->is_auth()) {
+      // dentry is mine.
+      // a complete dirfrag, or (for the head) a bloom filter miss, proves
+      // the name does not exist
+      if (curdir->is_complete() ||
+	  (snapid == CEPH_NOSNAP &&
+	   curdir->has_bloom() &&
+	   !curdir->is_in_bloom(path[depth]))) {
+        // file not found
+	if (pdnvec) {
+	  // instantiate a null dn?
+	  if (depth < path.depth() - 1) {
+	    dout(20) << " didn't traverse full path; not returning pdnvec" << dendl;
+	  } else if (snapid < CEPH_MAXSNAP) {
+	    dout(20) << " not adding null for snapid " << snapid << dendl;
+	  } else if (curdir->is_frozen()) {
+	    dout(7) << "traverse: " << *curdir << " is frozen, waiting" << dendl;
+	    curdir->add_waiter(CDir::WAIT_UNFREEZE, cf.build());
+	    return 1;
+	  } else {
+	    // create a null dentry
+	    dn = curdir->add_null_dentry(path[depth]);
+	    dout(20) << " added null " << *dn << dendl;
+
+	    // same locking as for an existing dentry above; we are already
+	    // known to be at the final component here
+	    if (rdlock_path) {
+	      lov.clear();
+	      if (xlock_dentry) {
+		if (depth > 0 || !mdr->lock_cache) {
+		  lov.add_wrlock(&cur->filelock);
+		  lov.add_wrlock(&cur->nestlock);
+		  if (rdlock_authlock)
+		    lov.add_rdlock(&cur->authlock);
+		}
+		lov.add_xlock(&dn->lock);
+	      } else {
+		// force client to flush async dir operation if necessary
+		if (cur->filelock.is_cached())
+		  lov.add_wrlock(&cur->filelock);
+		lov.add_rdlock(&dn->lock);
+	      }
+	      if (!mds->locker->acquire_locks(mdr, lov)) {
+		dout(10) << "traverse: failed to rdlock " << dn->lock << " " << *dn << dendl;
+		return 1;
+	      }
+	    }
+	  }
+	  if (dn) {
+	    pdnvec->push_back(dn);
+	    if (want_dentry)
+	      break;
+	  } else {
+	    pdnvec->clear();   // do not confuse likes of rdlock_path_pin_ref();
+	  }
+	}
+        return -CEPHFS_ENOENT;
+      } else {
+
+        // Check DamageTable for missing fragments before trying to fetch
+        // this
+        if (mds->damage_table.is_dirfrag_damaged(curdir)) {
+          dout(4) << "traverse: damaged dirfrag " << *curdir
+                  << ", blocking fetch" << dendl;
+          return -CEPHFS_EIO;
+        }
+
+	// directory isn't complete; reload
+        dout(7) << "traverse: incomplete dir contents for " << *cur << ", fetching" << dendl;
+        touch_inode(cur);
+        curdir->fetch(cf.build(), path[depth]);
+	if (mds->logger) mds->logger->inc(l_mds_traverse_dir_fetch);
+        return 1;
+      }
+    } else {
+      // dirfrag/dentry is not mine.
+      mds_authority_t dauth = curdir->authority();
+
+      // a client request already forwarded more times than our depth is
+      // switched to discover mode instead of being forwarded again
+      if (forward &&
+	  mdr && mdr->client_request &&
+	  (int)depth < mdr->client_request->get_num_fwd()){
+	dout(7) << "traverse: snap " << snapid << " and depth " << depth
+		<< " < fwd " << mdr->client_request->get_num_fwd()
+		<< ", discovering instead of forwarding" << dendl;
+	discover = true;
+      }
+
+      if ((discover)) {
+	dout(7) << "traverse: discover from " << path[depth] << " from " << *curdir << dendl;
+	discover_path(curdir, snapid, path.postfixpath(depth), cf.build(),
+		      path_locked);
+	if (mds->logger) mds->logger->inc(l_mds_traverse_discover);
+        return 1;
+      } 
+      if (forward) {
+        // forward
+        dout(7) << "traverse: not auth for " << path << " in " << *curdir << dendl;
+	
+	if (curdir->is_ambiguous_auth()) {
+	  // wait
+	  dout(7) << "traverse: waiting for single auth in " << *curdir << dendl;
+	  curdir->add_waiter(CDir::WAIT_SINGLEAUTH, cf.build());
+	  return 1;
+	} 
+
+	dout(7) << "traverse: forwarding, not auth for " << *curdir << dendl;
+
+        request_forward(mdr, dauth.first);
+
+	if (mds->logger) mds->logger->inc(l_mds_traverse_forward);
+	return 2;
+      }
+    }
+
+    ceph_abort();  // i shouldn't get here
+  }
+
+  // the caller wants the final inode (not the dentry) on its auth rank
+  if (want_auth && !want_dentry) {
+    if (cur->is_ambiguous_auth()) {
+      dout(10) << "waiting for single auth on " << *cur << dendl;
+      cur->add_waiter(CInode::WAIT_SINGLEAUTH, cf.build());
+      return 1;
+    }
+    if (!cur->is_auth()) {
+      dout(10) << "fw to auth for " << *cur << dendl;
+      request_forward(mdr, cur->authority().first);
+      return 2;
+    }
+  }
+  
+  // success.
+  if (mds->logger) mds->logger->inc(l_mds_traverse_hit);
+  dout(10) << "path_traverse finish on snapid " << snapid << dendl;
+  if (mdr) 
+    ceph_assert(mdr->snapid == snapid);
+
+  // record on the request which lock sets this traversal has taken
+  if (flags & MDS_TRAVERSE_RDLOCK_SNAP)
+    mdr->locking_state |= MutationImpl::SNAP_LOCKED;
+  else if (flags & MDS_TRAVERSE_RDLOCK_SNAP2)
+    mdr->locking_state |= MutationImpl::SNAP2_LOCKED;
+
+  if (rdlock_path)
+    mdr->locking_state |= MutationImpl::PATH_LOCKED;
+
+  return 0;
+}
+
+/**
+ * cache_traverse -- resolve a filepath using only what is already cached
+ *
+ * The walk starts at fp.get_ino() if that is set, at our own mdsdir inode
+ * if the first component is "~mdsdir" or "~mds<rank>", and at the root
+ * inode otherwise.  Nothing is fetched, discovered or locked.
+ *
+ * @param fp path to resolve
+ * @return the inode at the end of the path, or nullptr if the starting
+ *         inode, a dirfrag, a dentry or a linked inode along the way is
+ *         not in cache
+ */
+CInode *MDCache::cache_traverse(const filepath& fp)
+{
+  dout(10) << "cache_traverse " << fp << dendl;
+
+  CInode *in;
+  unsigned depth = 0;
+  char mdsdir_name[16];
+  // bounded write: never overrun the buffer whatever the rank value is
+  snprintf(mdsdir_name, sizeof(mdsdir_name), "~mds%d", mds->get_nodeid());
+
+  if (fp.get_ino()) {
+    in = get_inode(fp.get_ino());
+  } else if (fp.depth() > 0 && (fp[0] == "~mdsdir" || fp[0] == mdsdir_name)) {
+    // the first component only selected the starting inode; skip it
+    in = myin;
+    depth = 1;
+  } else {
+    in = root;
+  }
+  if (!in)
+    return nullptr;
+
+  for (; depth < fp.depth(); depth++) {
+    std::string_view dname = fp[depth];
+    frag_t fg = in->pick_dirfrag(dname);
+    dout(20) << " " << depth << " " << dname << " frag " << fg << " from " << *in << dendl;
+    CDir *curdir = in->get_dirfrag(fg);
+    if (!curdir)
+      return nullptr;
+    CDentry *dn = curdir->lookup(dname, CEPH_NOSNAP);
+    if (!dn)
+      return nullptr;
+    in = dn->get_linkage()->get_inode();
+    if (!in)
+      return nullptr;
+  }
+  dout(10) << " got " << *in << dendl;
+  return in;
+}
+
+
+/**
+ * open_remote_dirfrag -- open up a remote dirfrag
+ *
+ * The inode must be a directory we are not auth for, and the requested
+ * fragment must not be open yet (all three are asserted).
+ *
+ * @param diri base inode
+ * @param approxfg approximate fragment.
+ * @param fin completion callback, handed to discover_dir_frag()
+ */
+void MDCache::open_remote_dirfrag(CInode *diri, frag_t approxfg, MDSContext *fin)
+{
+  dout(10) << "open_remote_dir on " << *diri << dendl;
+
+  // preconditions
+  ceph_assert(diri->is_dir());
+  ceph_assert(!diri->is_auth());
+  ceph_assert(diri->get_dirfrag(approxfg) == 0);
+
+  // ask for the dirfrag via discover
+  discover_dir_frag(diri, approxfg, fin);
+}
+
+
+/**
+ * get_dentry_inode - get or open the inode a dentry points at
+ *
+ * @param dn the dentry (its linkage must not be null)
+ * @param mdr current request
+ * @param projected use the projected linkage instead of the current one
+ *
+ * Returns the inode for a primary link.  For a remote link, returns the
+ * inode if it is in cache (linking it to the dentry); otherwise starts
+ * opening it with a C_MDS_RetryRequest for mdr as completion and returns
+ * null.
+ */
+CInode *MDCache::get_dentry_inode(CDentry *dn, MDRequestRef& mdr, bool projected)
+{
+  CDentry::linkage_t *dnl =
+    projected ? dn->get_projected_linkage() : dn->get_linkage();
+
+  ceph_assert(!dnl->is_null());
+
+  // primary link: the inode hangs directly off the linkage
+  if (dnl->is_primary())
+    return dnl->inode;
+
+  // remote link: the target may or may not be in cache
+  ceph_assert(dnl->is_remote());
+  CInode *target = get_inode(dnl->get_remote_ino());
+  if (!target) {
+    dout(10) << "get_dentry_inode on remote dn, opening inode for " << *dn << dendl;
+    open_remote_dentry(dn, projected, new C_MDS_RetryRequest(this, mdr));
+    return nullptr;
+  }
+
+  dout(7) << "get_dentry_inode linking in remote in " << *target << dendl;
+  dn->link_remote(dnl, target);
+  return target;
+}
+
+// Completion for the open_ino() issued by open_remote_dentry().  Holds a
+// PIN_PTRWAITER reference on the dentry from construction until finish()
+// has handed the result to _open_remote_dentry_finish().
+struct C_MDC_OpenRemoteDentry : public MDCacheContext {
+  CDentry *dn;            // dentry whose remote inode is being opened
+  inodeno_t ino;          // remote ino recorded when the open started
+  MDSContext *onfinish;   // caller's completion
+  bool want_xlocked;
+
+  C_MDC_OpenRemoteDentry(MDCache *cache, CDentry *dentry, inodeno_t remote_ino,
+			 MDSContext *fin, bool xlocked)
+    : MDCacheContext(cache),
+      dn(dentry),
+      ino(remote_ino),
+      onfinish(fin),
+      want_xlocked(xlocked)
+  {
+    dn->get(MDSCacheObject::PIN_PTRWAITER);
+  }
+
+  void finish(int r) override
+  {
+    mdcache->_open_remote_dentry_finish(dn, ino, onfinish, want_xlocked, r);
+    dn->put(MDSCacheObject::PIN_PTRWAITER);
+  }
+};
+
+/**
+ * open_remote_dentry -- open the inode a remote dentry refers to
+ *
+ * @param dn dentry with a remote linkage
+ * @param projected read the projected linkage rather than the current one
+ * @param fin completed via _open_remote_dentry_finish() when open_ino ends
+ * @param want_xlocked passed through to open_ino()
+ */
+void MDCache::open_remote_dentry(CDentry *dn, bool projected, MDSContext *fin, bool want_xlocked)
+{
+  dout(10) << "open_remote_dentry " << *dn << dendl;
+
+  CDentry::linkage_t *dnl;
+  if (projected)
+    dnl = dn->get_projected_linkage();
+  else
+    dnl = dn->get_linkage();
+
+  inodeno_t ino = dnl->get_remote_ino();
+
+  // directories are looked up in the metadata pool; for anything else the
+  // pool is passed as -1
+  int64_t pool = -1;
+  if (dnl->get_remote_d_type() == DT_DIR)
+    pool = mds->get_metadata_pool();
+
+  MDSContext *opened = new C_MDC_OpenRemoteDentry(this, dn, ino, fin, want_xlocked);
+  open_ino(ino, pool, opened, true, want_xlocked); // backtrace
+}
+
+/**
+ * Completion of open_remote_dentry(): on failure, flag the dentry as having
+ * a bad remote ino and report it to the damage table, then complete fin.
+ *
+ * An error is only passed on to fin if the dentry still has a remote
+ * (projected) linkage to the same ino; otherwise fin is completed with 0.
+ */
+void MDCache::_open_remote_dentry_finish(CDentry *dn, inodeno_t ino, MDSContext *fin,
+					 bool want_xlocked, int r)
+{
+  // success: nothing to record
+  if (r >= 0) {
+    fin->complete(0);
+    return;
+  }
+
+  // the dentry no longer points at this ino: the error is not relevant
+  CDentry::linkage_t *linkage = dn->get_projected_linkage();
+  if (!linkage->is_remote() || linkage->get_remote_ino() != ino) {
+    fin->complete(0);
+    return;
+  }
+
+  dout(0) << "open_remote_dentry_finish bad remote dentry " << *dn << dendl;
+  dn->state_set(CDentry::STATE_BADREMOTEINO);
+
+  // build "<dir path>/<dentry name>" for the damage report
+  std::string path;
+  if (CDir *parent = dn->get_dir()) {
+    parent->get_inode()->make_path_string(path);
+    path += "/";
+    path += dn->get_name();
+  }
+
+  if (mds->damage_table.notify_remote_damaged(ino, path)) {
+    mds->damaged();
+    ceph_abort();  // unreachable, damaged() respawns us
+  }
+
+  fin->complete(r);
+}
+
+
+/**
+ * make_trace -- append the chain of parent dentries leading to an inode
+ *
+ * Dentries are appended in top-down order (nearest the base inode first).
+ * Nothing is appended when 'in' is itself a base inode.
+ */
+void MDCache::make_trace(vector<CDentry*>& trace, CInode *in)
+{
+  // climb from 'in' to the base inode, collecting dentries bottom-up
+  vector<CDentry*> upward;
+  for (CInode *cur = in; !cur->is_base(); ) {
+    CInode *parent = cur->get_parent_inode();
+    ceph_assert(parent);
+    upward.push_back(cur->get_parent_dn());
+    cur = parent;
+  }
+
+  // emit them in the opposite order, i.e. top-down
+  for (auto rit = upward.rbegin(); rit != upward.rend(); ++rit) {
+    CDentry *dn = *rit;
+    dout(15) << "make_trace adding " << *dn << dendl;
+    trace.push_back(dn);
+  }
+}
+
+
+// -------------------------------------------------------------------------------
+// Open inode by inode number
+
+// I/O completion for a backtrace fetch started while opening an inode by
+// number.  The fetched data lands in 'bl' and is passed, together with the
+// result code, to MDCache::_open_ino_backtrace_fetched().
+class C_IO_MDC_OpenInoBacktraceFetched : public MDCacheIOContext {
+  inodeno_t ino;  // inode whose backtrace is being fetched
+  public:
+  bufferlist bl;  // filled in by the fetch
+  C_IO_MDC_OpenInoBacktraceFetched(MDCache *c, inodeno_t i) :
+    MDCacheIOContext(c), ino(i) {}
+  void finish(int r) override {
+    mdcache->_open_ino_backtrace_fetched(ino, bl, r);
+  }
+  void print(ostream& out) const override {
+    // the opening parenthesis was missing, leaving a dangling ")"
+    out << "openino_backtrace_fetch(" << ino << ")";
+  }
+};
+
+// Retry context used while walking an inode's ancestors.  When it fires it
+// either re-runs handle_open_ino() for the peer request it carries, or
+// resumes the local open via _open_ino_traverse_dir().
+struct C_MDC_OpenInoTraverseDir : public MDCacheContext {
+  inodeno_t ino;
+  cref_t<MMDSOpenIno> msg;  // peer request being served, if any
+  bool parent;              // set by callers when the wait concerned ancestor 0
+  public:
+  C_MDC_OpenInoTraverseDir(MDCache *c, inodeno_t i, const cref_t<MMDSOpenIno> &m,  bool p)
+    : MDCacheContext(c),
+      ino(i),
+      msg(m),
+      parent(p)
+  {}
+
+  void finish(int r) override
+  {
+    // errors on anything other than the immediate parent become EAGAIN
+    const int result = (r < 0 && !parent) ? -CEPHFS_EAGAIN : r;
+
+    if (!msg) {
+      auto& info = mdcache->opening_inodes.at(ino);
+      mdcache->_open_ino_traverse_dir(ino, info, result);
+    } else {
+      mdcache->handle_open_ino(msg, result);
+    }
+  }
+};
+
+// Completion for the open_ino() of an inode's parent directory; forwards
+// the result to MDCache::_open_ino_parent_opened() for the child ino.
+struct C_MDC_OpenInoParentOpened : public MDCacheContext {
+  inodeno_t ino;  // the child inode being opened
+  public:
+  C_MDC_OpenInoParentOpened(MDCache *cache, inodeno_t child)
+    : MDCacheContext(cache),
+      ino(child)
+  {}
+
+  void finish(int r) override
+  {
+    mdcache->_open_ino_parent_opened(ino, r);
+  }
+};
+
+/**
+ * _open_ino_backtrace_fetched -- handle the result of a backtrace fetch
+ *
+ * @param ino inode being opened (must have an entry in opening_inodes)
+ * @param bl encoded backtrace, valid when err == 0
+ * @param err result of the fetch
+ *
+ * If the inode has meanwhile appeared in cache the open finishes at once.
+ * Otherwise the backtrace is decoded; the fetch is retried from the pool
+ * named in the backtrace, or from the metadata pool on ENOENT, when that
+ * differs from the pool just tried.  A usable backtrace is stored in the
+ * open state and the ancestor traversal starts.
+ */
+void MDCache::_open_ino_backtrace_fetched(inodeno_t ino, bufferlist& bl, int err)
+{
+  dout(10) << "_open_ino_backtrace_fetched ino " << ino << " errno " << err << dendl;
+
+  open_ino_info_t& info = opening_inodes.at(ino);
+
+  CInode *in = get_inode(ino);
+  if (in) {
+    dout(10) << " found cached " << *in << dendl;
+    open_ino_finish(ino, info, in->authority().first);
+    return;
+  }
+
+  inode_backtrace_t backtrace;
+  if (err == 0) {
+    try {
+      decode(backtrace, bl);
+    } catch (const buffer::error &decode_exc) {
+      // print the ino the same way as every other message in this function
+      // (the previous text had a garbled "x0" prefix)
+      derr << "corrupt backtrace on ino " << ino
+           << ": " << decode_exc.what() << dendl;
+      open_ino_finish(ino, info, -CEPHFS_EIO);
+      return;
+    }
+    // the object we read points at a different pool: read that one instead
+    if (backtrace.pool != info.pool && backtrace.pool != -1) {
+      dout(10) << " old object in pool " << info.pool
+	       << ", retrying pool " << backtrace.pool << dendl;
+      info.pool = backtrace.pool;
+      C_IO_MDC_OpenInoBacktraceFetched *fin =
+	new C_IO_MDC_OpenInoBacktraceFetched(this, ino);
+      fetch_backtrace(ino, info.pool, fin->bl,
+		      new C_OnFinisher(fin, mds->finisher));
+      return;
+    }
+  } else if (err == -CEPHFS_ENOENT) {
+    // no object in the pool we tried: fall back to the metadata pool once
+    int64_t meta_pool = mds->get_metadata_pool();
+    if (info.pool != meta_pool) {
+      dout(10) << " no object in pool " << info.pool
+	       << ", retrying pool " << meta_pool << dendl;
+      info.pool = meta_pool;
+      C_IO_MDC_OpenInoBacktraceFetched *fin =
+	new C_IO_MDC_OpenInoBacktraceFetched(this, ino);
+      fetch_backtrace(ino, info.pool, fin->bl,
+		      new C_OnFinisher(fin, mds->finisher));
+      return;
+    }
+    err = 0; // backtrace.ancestors.empty() is checked below
+  }
+
+  if (err == 0) {
+    if (backtrace.ancestors.empty()) {
+      dout(10) << " got empty backtrace " << dendl;
+      err = -CEPHFS_ESTALE;
+    } else if (!info.ancestors.empty()) {
+      // a backtrace naming the same parent as last time gets us nowhere
+      if (info.ancestors[0] == backtrace.ancestors[0]) {
+	dout(10) << " got same parents " << info.ancestors[0] << " 2 times" << dendl;
+	err = -CEPHFS_EINVAL;
+      } else {
+	info.last_err = 0;
+      }
+    }
+  }
+  if (err) {
+    dout(0) << " failed to open ino " << ino << " err " << err << "/" << info.last_err << dendl;
+    // an error recorded earlier takes precedence over this one
+    if (info.last_err)
+      err = info.last_err;
+    open_ino_finish(ino, info, err);
+    return;
+  }
+
+  dout(10) << " got backtrace " << backtrace << dendl;
+  info.ancestors = backtrace.ancestors;
+
+  _open_ino_traverse_dir(ino, info, 0);
+}
+
+/**
+ * _open_ino_parent_opened -- continue opening 'ino' once its parent has
+ * been opened
+ *
+ * @param ino inode being opened (must have an entry in opening_inodes)
+ * @param ret result of opening the parent: a rank number when >= 0,
+ *            otherwise an error that is passed on to do_open_ino()
+ */
+void MDCache::_open_ino_parent_opened(inodeno_t ino, int ret)
+{
+  dout(10) << "_open_ino_parent_opened ino " << ino << " ret " << ret << dendl;
+
+  open_ino_info_t& info = opening_inodes.at(ino);
+
+  // the inode may have shown up in cache in the meantime
+  CInode *cached = get_inode(ino);
+  if (cached) {
+    dout(10) << " found cached " << *cached << dendl;
+    open_ino_finish(ino, info, cached->authority().first);
+    return;
+  }
+
+  // the parent is ours: walk the ancestors locally
+  if (ret == mds->get_nodeid()) {
+    _open_ino_traverse_dir(ino, info, 0);
+    return;
+  }
+
+  // the parent lives on another rank: make that rank the hint and allow
+  // it to be asked (again)
+  if (ret >= 0) {
+    const mds_rank_t parent_rank = mds_rank_t(ret);
+    info.check_peers = true;
+    info.auth_hint = parent_rank;
+    info.checked.erase(parent_rank);
+  }
+  do_open_ino(ino, info, ret);
+}
+
+/**
+ * _open_ino_traverse_dir -- try to reach 'ino' through its known ancestors
+ *
+ * @param ino inode being opened
+ * @param info open state for ino
+ * @param ret non-zero skips the traversal and goes straight to
+ *            do_open_ino() with that value
+ */
+void MDCache::_open_ino_traverse_dir(inodeno_t ino, open_ino_info_t& info, int ret)
+{
+  dout(10) << __func__ << ": ino " << ino << " ret " << ret << dendl;
+
+  CInode *cached = get_inode(ino);
+  if (cached) {
+    dout(10) << " found cached " << *cached << dendl;
+    open_ino_finish(ino, info, cached->authority().first);
+    return;
+  }
+
+  if (ret != 0) {
+    do_open_ino(ino, info, ret);
+    return;
+  }
+
+  mds_rank_t hint = info.auth_hint;
+  const int r = open_ino_traverse_dir(ino, NULL, info.ancestors,
+				      info.discover, info.want_xlocked, &hint);
+  // positive: the traversal is waiting on something and will call back
+  if (r > 0)
+    return;
+
+  // only remember hints that point at some other rank
+  if (hint != mds->get_nodeid())
+    info.auth_hint = hint;
+  do_open_ino(ino, info, r);
+}
+
+/**
+ * _open_ino_fetch_dir -- fetch a dirfrag and resume the ancestor traversal
+ * for 'ino' when the fetch completes
+ *
+ * @param m and parent are passed through to the C_MDC_OpenInoTraverseDir
+ *          retry context
+ */
+void MDCache::_open_ino_fetch_dir(inodeno_t ino, const cref_t<MMDSOpenIno> &m, CDir *dir, bool parent)
+{
+  // a REJOINUNDEF dirfrag must be a leaf of its inode's fragtree
+  if (dir->state_test(CDir::STATE_REJOINUNDEF)) {
+    ceph_assert(dir->get_inode()->dirfragtree.is_leaf(dir->get_frag()));
+  }
+
+  auto *retry = new C_MDC_OpenInoTraverseDir(this, ino, m, parent);
+  dir->fetch(retry);
+
+  if (mds->logger) {
+    mds->logger->inc(l_mds_openino_dir_fetch);
+  }
+}
+
+/**
+ * open_ino_traverse_dir -- look for an inode by walking its ancestors
+ *
+ * Goes through 'ancestors' starting at index 0 (the entry naming ino's
+ * own parent directory), skipping entries whose directory inode is not in
+ * cache, and examines the first one that is.
+ *
+ * @param ino inode being looked for
+ * @param m peer request being served; passed to every retry context
+ * @param ancestors backpointers (dirino + dname), nearest parent first
+ * @param discover whether remote mdsdirs/dirfrags/dentries may be
+ *                 opened or discovered
+ * @param want_xlocked passed to discover_path() for the immediate parent
+ * @param hint if non-null and the examined ancestor is index 0, set to
+ *             the authority of its dirfrag (or of the inode if the
+ *             dirfrag is not open)
+ *
+ * @return 1 if a retry context was queued and the caller should wait;
+ *         otherwise 0 or a negative CEPHFS_* error (only set for
+ *         failures on the immediate parent).
+ */
+int MDCache::open_ino_traverse_dir(inodeno_t ino, const cref_t<MMDSOpenIno> &m,
+				   const vector<inode_backpointer_t>& ancestors,
+				   bool discover, bool want_xlocked, mds_rank_t *hint)
+{
+  dout(10) << "open_ino_traverse_dir ino " << ino << " " << ancestors << dendl;
+  int err = 0;
+  for (unsigned i = 0; i < ancestors.size(); i++) {
+    const auto& ancestor = ancestors.at(i);
+    CInode *diri = get_inode(ancestor.dirino);
+
+    if (!diri) {
+      // not cached: open it if it is a foreign mdsdir and we may
+      // discover, otherwise try the next ancestor up
+      if (discover && MDS_INO_IS_MDSDIR(ancestor.dirino)) {
+	open_foreign_mdsdir(ancestor.dirino, new C_MDC_OpenInoTraverseDir(this, ino, m, i == 0));
+	return 1;
+      }
+      continue;
+    }
+
+    if (diri->state_test(CInode::STATE_REJOINUNDEF)) {
+      // climb while both the dirfrag and its inode are REJOINUNDEF, then
+      // fetch the dirfrag we stopped at
+      CDir *dir = diri->get_parent_dir();
+      while (dir->state_test(CDir::STATE_REJOINUNDEF) &&
+	     dir->get_inode()->state_test(CInode::STATE_REJOINUNDEF))
+	dir = dir->get_inode()->get_parent_dir();
+      _open_ino_fetch_dir(ino, m, dir, i == 0);
+      return 1;
+    }
+
+    if (!diri->is_dir()) {
+      dout(10) << " " << *diri << " is not dir" << dendl;
+      if (i == 0)
+	err = -CEPHFS_ENOTDIR;
+      break;
+    }
+
+    const string& name = ancestor.dname;
+    frag_t fg = diri->pick_dirfrag(name);
+    CDir *dir = diri->get_dirfrag(fg);
+    if (!dir) {
+      if (diri->is_auth()) {
+	if (diri->is_frozen()) {
+	  dout(10) << " " << *diri << " is frozen, waiting " << dendl;
+	  diri->add_waiter(CDir::WAIT_UNFREEZE, new C_MDC_OpenInoTraverseDir(this, ino, m, i == 0));
+	  return 1;
+	}
+	dir = diri->get_or_open_dirfrag(this, fg);
+      } else if (discover) {
+	open_remote_dirfrag(diri, fg, new C_MDC_OpenInoTraverseDir(this, ino, m, i == 0));
+	return 1;
+      }
+    }
+    if (dir) {
+      // the inode this ancestor's dentry should lead to (for logging)
+      inodeno_t next_ino = i > 0 ? ancestors.at(i-1).dirino : ino;
+      CDentry *dn = dir->lookup(name);
+      CDentry::linkage_t *dnl = dn ? dn->get_linkage() : NULL;
+      if (dir->is_auth()) {
+	if (dnl && dnl->is_primary() &&
+	    dnl->get_inode()->state_test(CInode::STATE_REJOINUNDEF)) {
+	  dout(10) << " fetching undef " << *dnl->get_inode() << dendl;
+	  _open_ino_fetch_dir(ino, m, dir, i == 0);
+	  return 1;
+	}
+
+	// no dentry in cache, but the dirfrag is incomplete and the bloom
+	// filter (if any) does not rule the name out: fetch before deciding
+	if (!dnl && !dir->is_complete() &&
+	    (!dir->has_bloom() || dir->is_in_bloom(name))) {
+	  dout(10) << " fetching incomplete " << *dir << dendl;
+	  _open_ino_fetch_dir(ino, m, dir, i == 0);
+	  return 1;
+	}
+
+	dout(10) << " no ino " << next_ino << " in " << *dir << dendl;
+	if (i == 0)
+	  err = -CEPHFS_ENOENT;
+      } else if (discover) {
+	if (!dnl) {
+	  filepath path(name, 0);
+	  discover_path(dir, CEPH_NOSNAP, path, new C_MDC_OpenInoTraverseDir(this, ino, m, i == 0),
+			(i == 0 && want_xlocked));
+	  return 1;
+	}
+	if (dnl->is_null() && !dn->lock.can_read(-1)) {
+	  dout(10) << " null " << *dn << " is not readable, waiting" << dendl;
+	  dn->lock.add_waiter(SimpleLock::WAIT_RD, new C_MDC_OpenInoTraverseDir(this, ino, m, i == 0));
+	  return 1;
+	}
+	dout(10) << " no ino " << next_ino << " in " << *dir << dendl;
+	if (i == 0)
+	  err = -CEPHFS_ENOENT;
+      }
+    }
+    if (hint && i == 0)
+      *hint = dir ? dir->authority().first : diri->authority().first;
+    // only the first cached ancestor is ever examined
+    break;
+  }
+  return err;
+}
+
+// Complete the in-flight open_ino operation for `ino`: drop its entry from
+// opening_inodes and finish every waiter recorded in `info` with `ret`.
+void MDCache::open_ino_finish(inodeno_t ino, open_ino_info_t& info, int ret)
+{
+  dout(10) << "open_ino_finish ino " << ino << " ret " << ret << dendl;
+
+  // Move the waiters out of `info` before the erase below: callers in this
+  // file pass a reference to the opening_inodes entry itself, so `info` must
+  // not be touched once that entry has been erased.
+  MDSContext::vec waiters;
+  waiters.swap(info.waiters);
+  opening_inodes.erase(ino);
+  finish_contexts(g_ceph_context, waiters, ret);
+}
+
+// Advance the open_ino state machine for `ino` by one step.
+//
+// `err` is the outcome of the previous step.  A negative value other than
+// -CEPHFS_EAGAIN resets the search so that peers are asked again and the
+// backtrace is fetched again.  Then, depending on the flags in `info`, this
+// either asks a peer (do_open_ino_peer), fetches the backtrace
+// (fetch_backtrace), or opens the parent directory inode named by
+// info.ancestors[0].
+void MDCache::do_open_ino(inodeno_t ino, open_ino_info_t& info, int err)
+{
+  if (err < 0 && err != -CEPHFS_EAGAIN) {
+    // hard failure: forget what has been tried and start over
+    info.checked.clear();
+    info.checking = MDS_RANK_NONE;
+    info.check_peers = true;
+    info.fetch_backtrace = true;
+    if (info.discover) {
+      info.discover = false;
+      info.ancestors.clear();
+    }
+    // ENOENT and ENOTDIR are not remembered in last_err
+    if (err != -CEPHFS_ENOENT && err != -CEPHFS_ENOTDIR)
+      info.last_err = err;
+  }
+
+  if (info.check_peers || info.discover) {
+    if (info.discover) {
+      // got backtrace from peer, but failed to find inode. re-check peers
+      info.discover = false;
+      info.ancestors.clear();
+      info.checked.clear();
+    }
+    info.check_peers = false;
+    info.checking = MDS_RANK_NONE;
+    do_open_ino_peer(ino, info);
+  } else if (info.fetch_backtrace) {
+    // next failure goes back to checking peers
+    info.check_peers = true;
+    info.fetch_backtrace = false;
+    info.checking = mds->get_nodeid();
+    info.checked.clear();
+    // the fetched xattr lands in fin->bl; fin itself is run via mds->finisher
+    C_IO_MDC_OpenInoBacktraceFetched *fin =
+      new C_IO_MDC_OpenInoBacktraceFetched(this, ino);
+    fetch_backtrace(ino, info.pool, fin->bl,
+		    new C_OnFinisher(fin, mds->finisher));
+  } else {
+    // we already hold ancestors: open the immediate parent directory inode
+    ceph_assert(!info.ancestors.empty());
+    info.checking = mds->get_nodeid();
+    open_ino(info.ancestors[0].dirino, mds->get_metadata_pool(),
+	     new C_MDC_OpenInoParentOpened(this, ino), info.want_replica);
+  }
+}
+
+// Choose the next MDS rank to ask about `ino` and send it an MMDSOpenIno.
+//
+// If no rank can be asked right now this either waits (some rank in the
+// mdsmap has not been checked yet) or, when every other rank has been
+// checked, hands control back to do_open_ino().
+void MDCache::do_open_ino_peer(inodeno_t ino, open_ino_info_t& info)
+{
+  set<mds_rank_t> all, active;
+  mds->mdsmap->get_mds_set(all);
+  // while we are in rejoin ourselves, the candidate set is computed with
+  // STATE_REJOIN as the bound, otherwise with STATE_CLIENTREPLAY
+  if (mds->get_state() == MDSMap::STATE_REJOIN)
+    mds->mdsmap->get_mds_set_lower_bound(active, MDSMap::STATE_REJOIN);
+  else
+    mds->mdsmap->get_mds_set_lower_bound(active, MDSMap::STATE_CLIENTREPLAY);
+
+  dout(10) << "do_open_ino_peer " << ino << " active " << active
+	   << " all " << all << " checked " << info.checked << dendl;
+
+  mds_rank_t whoami = mds->get_nodeid();
+  mds_rank_t peer = MDS_RANK_NONE;
+  if (info.auth_hint >= 0 && info.auth_hint != whoami) {
+    // a hint is consumed only once its rank is in `active`; until then no
+    // other peer is tried in this pass and the hint is kept
+    if (active.count(info.auth_hint)) {
+      peer = info.auth_hint;
+      info.auth_hint = MDS_RANK_NONE;
+    }
+  } else {
+    // first active rank, other than ourselves, that has not been checked
+    for (set<mds_rank_t>::iterator p = active.begin(); p != active.end(); ++p)
+      if (*p != whoami && info.checked.count(*p) == 0) {
+	peer = *p;
+	break;
+      }
+  }
+  if (peer < 0) {
+    all.erase(whoami);
+    if (all != info.checked) {
+      // nothing is sent here; kick_open_ino_peers() re-runs this function
+      dout(10) << " waiting for more peers to be active" << dendl;
+    } else {
+      dout(10) << " all MDS peers have been checked " << dendl;
+      do_open_ino(ino, info, 0);
+    }
+  } else {
+    info.checking = peer;
+    vector<inode_backpointer_t> *pa = NULL;
+    // got backtrace from peer or backtrace just fetched
+    if (info.discover || !info.fetch_backtrace)
+      pa = &info.ancestors;
+    mds->send_message_mds(make_message<MMDSOpenIno>(info.tid, ino, pa), peer);
+    if (mds->logger)
+      mds->logger->inc(l_mds_openino_peer_discover);
+  }
+}
+
+// Handle an MMDSOpenIno query from another MDS and send back an
+// MMDSOpenInoReply.
+//
+// `err` is forwarded into the reply when the inode is not in our cache and
+// `err` is negative.  No reply is sent when the message is ignored (see the
+// state check below) or when the ancestor traversal returns a positive value.
+void MDCache::handle_open_ino(const cref_t<MMDSOpenIno> &m, int err)
+{
+  // ignore the query unless we are at least in rejoin, or about to enter it
+  if (mds->get_state() < MDSMap::STATE_REJOIN &&
+      mds->get_want_state() != CEPH_MDS_STATE_REJOIN) {
+    return;
+  }
+
+  dout(10) << "handle_open_ino " << *m << " err " << err << dendl;
+
+  auto from = mds_rank_t(m->get_source().num());
+  inodeno_t ino = m->ino;
+  ref_t<MMDSOpenInoReply> reply;
+  CInode *in = get_inode(ino);
+  if (in) {
+    dout(10) << " have " << *in << dendl;
+    reply = make_message<MMDSOpenInoReply>(m->get_tid(), ino, mds_rank_t(0));
+    if (in->is_auth()) {
+      touch_inode(in);
+      // walk up through the parent dentries, recording one backpointer per
+      // level, until an inode without a parent dentry is reached
+      while (1) {
+	CDentry *pdn = in->get_parent_dn();
+	if (!pdn)
+	  break;
+	CInode *diri = pdn->get_dir()->get_inode();
+	reply->ancestors.push_back(inode_backpointer_t(diri->ino(), pdn->get_name(),
+						       in->get_version()));
+	in = diri;
+      }
+    } else {
+      // not auth: point the requester at the rank we believe is auth
+      reply->hint = in->authority().first;
+    }
+  } else if (err < 0) {
+    reply = make_message<MMDSOpenInoReply>(m->get_tid(), ino, MDS_RANK_NONE, err);
+  } else {
+    mds_rank_t hint = MDS_RANK_NONE;
+    int ret = open_ino_traverse_dir(ino, m, m->ancestors, false, false, &hint);
+    // a positive return means the traversal is not finished yet; no reply
+    // is sent from here in that case
+    if (ret > 0)
+      return;
+    reply = make_message<MMDSOpenInoReply>(m->get_tid(), ino, hint, ret);
+  }
+  mds->send_message_mds(reply, from);
+}
+
+// Handle a peer's MMDSOpenInoReply for an open_ino operation.
+//
+// The reply is ignored unless `ino` is still being opened and the sender is
+// the rank recorded in info.checking.
+void MDCache::handle_open_ino_reply(const cref_t<MMDSOpenInoReply> &m)
+{
+  dout(10) << "handle_open_ino_reply " << *m << dendl;
+
+  inodeno_t ino = m->ino;
+  mds_rank_t from = mds_rank_t(m->get_source().num());
+  auto it = opening_inodes.find(ino);
+  if (it != opening_inodes.end() && it->second.checking == from) {
+    open_ino_info_t& info = it->second;
+    info.checking = MDS_RANK_NONE;
+    info.checked.insert(from);
+
+    CInode *in = get_inode(ino);
+    if (in) {
+      // the inode is in our cache by now; finish with its auth rank
+      dout(10) << " found cached " << *in << dendl;
+      open_ino_finish(ino, info, in->authority().first);
+    } else if (!m->ancestors.empty()) {
+      dout(10) << " found ino " << ino << " on mds." << from << dendl;
+      if (!info.want_replica) {
+	// caller only wanted to know which rank has it
+	open_ino_finish(ino, info, from);
+	return;
+      }
+
+      // a replica is wanted: traverse the ancestors the peer returned
+      info.ancestors = m->ancestors;
+      info.auth_hint = from;
+      info.checking = mds->get_nodeid();
+      info.discover = true;
+      _open_ino_traverse_dir(ino, info, 0);
+    } else if (m->error) {
+      dout(10) << " error " << m->error << " from mds." << from << dendl;
+      do_open_ino(ino, info, m->error);
+    } else {
+      // peer does not have it; follow its hint (if any) on the next attempt,
+      // un-marking that rank as checked so it can be asked
+      if (m->hint >= 0 && m->hint != mds->get_nodeid()) {
+	info.auth_hint = m->hint;
+	info.checked.erase(m->hint);
+      }
+      do_open_ino_peer(ino, info);
+    }
+  }
+}
+
+// Re-drive every open_ino operation that was either waiting on a reply from
+// mds.`who` or not waiting on any rank at all.
+void MDCache::kick_open_ino_peers(mds_rank_t who)
+{
+  dout(10) << "kick_open_ino_peers mds." << who << dendl;
+
+  for (auto& entry : opening_inodes) {
+    const inodeno_t ino = entry.first;
+    open_ino_info_t& info = entry.second;
+
+    // operations that are busy with some other rank are left alone
+    if (info.checking != who && info.checking != MDS_RANK_NONE)
+      continue;
+
+    if (info.checking == who) {
+      dout(10) << "  kicking ino " << ino << " who was checking mds." << who << dendl;
+      info.checking = MDS_RANK_NONE;
+    } else {
+      dout(10) << "  kicking ino " << ino << " who was waiting" << dendl;
+    }
+    do_open_ino_peer(ino, info);
+  }
+}
+
+// Start opening inode `ino`, or join an operation already in progress for it.
+//
+// `fin` is queued as a waiter in either case.  When an operation for `ino`
+// already exists, the want_replica / want_xlocked requests are merged into
+// it.  Otherwise a new entry is created in opening_inodes and the search
+// starts either from `ancestors_hint` (if given) or via do_open_ino().
+//
+// `pool` is the pool to fetch the backtrace from; a negative value selects
+// default_file_layout.pool_id.  The contents of `*ancestors_hint` are moved
+// out of the caller's vector.
+void MDCache::open_ino(inodeno_t ino, int64_t pool, MDSContext* fin,
+		       bool want_replica, bool want_xlocked,
+		       vector<inode_backpointer_t> *ancestors_hint,
+		       mds_rank_t auth_hint)
+{
+  dout(10) << "open_ino " << ino << " pool " << pool << " want_replica "
+	   << want_replica << dendl;
+
+  auto it = opening_inodes.find(ino);
+  if (it != opening_inodes.end()) {
+    open_ino_info_t& info = it->second;
+    if (want_replica) {
+      info.want_replica = true;
+      if (want_xlocked && !info.want_xlocked) {
+	// upgrading to want_xlocked: if the parent dirfrag is already cached
+	// and we are not auth for it, issue a discover for the dentry with
+	// the path_locked flag set (no completion is attached)
+	if (!info.ancestors.empty()) {
+	  CInode *diri = get_inode(info.ancestors[0].dirino);
+	  if (diri) {
+	    frag_t fg = diri->pick_dirfrag(info.ancestors[0].dname);
+	    CDir *dir = diri->get_dirfrag(fg);
+	    if (dir && !dir->is_auth()) {
+	      filepath path(info.ancestors[0].dname, 0);
+	      discover_path(dir, CEPH_NOSNAP, path, NULL, true);
+	    }
+	  }
+	}
+	info.want_xlocked = true;
+      }
+    }
+    info.waiters.push_back(fin);
+  } else {
+    // operator[] creates the new entry
+    open_ino_info_t& info = opening_inodes[ino];
+    info.want_replica = want_replica;
+    info.want_xlocked = want_xlocked;
+    info.tid = ++open_ino_last_tid;
+    info.pool = pool >= 0 ? pool : default_file_layout.pool_id;
+    info.waiters.push_back(fin);
+    if (auth_hint != MDS_RANK_NONE)
+      info.auth_hint = auth_hint;
+    if (ancestors_hint) {
+      // caller already knows the ancestry: skip the backtrace fetch
+      info.ancestors = std::move(*ancestors_hint);
+      info.fetch_backtrace = false;
+      info.checking = mds->get_nodeid();
+      _open_ino_traverse_dir(ino, info, 0);
+    } else {
+      do_open_ino(ino, info, 0);
+    }
+  }
+}
+
+/* ---------------------------- */
+
+/*
+ * Search for a given inode on MDS peers, optionally starting with the rank
+ * given in `hint`.
+ *
+ * `c` is completed with -CEPHFS_ESTALE right away if the inode is cached
+ * here and is being purged; otherwise the inode must not be in the cache.
+ * `path_locked` is stored with the request and used later when the path
+ * returned by a peer is traversed.
+ *
+ * TODO
+ *  - recover from mds node failure, recovery
+ *  - traverse path
+ */
+void MDCache::find_ino_peers(inodeno_t ino, MDSContext *c,
+			     mds_rank_t hint, bool path_locked)
+{
+  dout(5) << "find_ino_peers " << ino << " hint " << hint << dendl;
+
+  CInode *in = get_inode(ino);
+  const bool purging = in && in->state_test(CInode::STATE_PURGING);
+  if (purging) {
+    c->complete(-CEPHFS_ESTALE);
+    return;
+  }
+  ceph_assert(!in);
+
+  // register a new request under a fresh tid and start asking peers
+  const ceph_tid_t tid = ++find_ino_peer_last_tid;
+  find_ino_peer_info_t& req = find_ino_peer[tid];
+  req.tid = tid;
+  req.ino = ino;
+  req.hint = hint;
+  req.path_locked = path_locked;
+  req.fin = c;
+  _do_find_ino_peer(req);
+}
+
+// Send the find-ino query in `fip` to the next candidate rank.
+//
+// A pending hint is used first (and cleared); otherwise the first rank in
+// the active set that is not ourselves and has not been checked is used.
+// If there is no candidate, the request either keeps waiting or, once every
+// other rank in the mdsmap has been checked, is completed with
+// -CEPHFS_ESTALE and removed from find_ino_peer.
+void MDCache::_do_find_ino_peer(find_ino_peer_info_t& fip)
+{
+  set<mds_rank_t> all, active;
+  mds->mdsmap->get_mds_set(all);
+  mds->mdsmap->get_mds_set_lower_bound(active, MDSMap::STATE_CLIENTREPLAY);
+
+  dout(10) << "_do_find_ino_peer " << fip.tid << " " << fip.ino
+	   << " active " << active << " all " << all
+	   << " checked " << fip.checked
+	   << dendl;
+
+  mds_rank_t target = MDS_RANK_NONE;
+  if (fip.hint >= 0) {
+    target = fip.hint;
+    fip.hint = MDS_RANK_NONE;
+  } else {
+    for (const auto& rank : active) {
+      if (rank == mds->get_nodeid() || fip.checked.count(rank) != 0)
+	continue;
+      target = rank;
+      break;
+    }
+  }
+
+  if (target != MDS_RANK_NONE) {
+    fip.checking = target;
+    mds->send_message_mds(make_message<MMDSFindIno>(fip.tid, fip.ino), target);
+    return;
+  }
+
+  // nobody left to ask right now
+  all.erase(mds->get_nodeid());
+  if (all != fip.checked) {
+    dout(10) << "_do_find_ino_peer waiting for more peers to be active" << dendl;
+  } else {
+    dout(10) << "_do_find_ino_peer failed on " << fip.ino << dendl;
+    fip.fin->complete(-CEPHFS_ESTALE);
+    find_ino_peer.erase(fip.tid);
+  }
+}
+
+// Answer an MMDSFindIno query from another MDS.
+//
+// The reply carries the inode's path if the inode is in our cache and an
+// empty path otherwise.  Queries received before we reach rejoin are
+// dropped without a reply.
+void MDCache::handle_find_ino(const cref_t<MMDSFindIno> &m)
+{
+  if (mds->get_state() < MDSMap::STATE_REJOIN) {
+    return;
+  }
+
+  dout(10) << "handle_find_ino " << *m << dendl;
+  auto r = make_message<MMDSFindInoReply>(m->tid);
+  CInode *in = get_inode(m->ino);
+  if (in) {
+    in->make_path(r->path);
+    dout(10) << " have " << r->path << " " << *in << dendl;
+
+    /*
+     * If the CInode was just created by using openc in current
+     * auth MDS, but the client just sends a getattr request to another
+     * replica MDS. Then here it will make a path of '#INODE-NUMBER'
+     * only because the CInode hasn't been linked yet, and the replica
+     * MDS will keep retrying until the auth MDS flushes the mdlog and
+     * the C_MDS_openc_finish and link_primary_inode are called at most
+     * 5 seconds later.
+     */
+    if (!in->get_parent_dn() && in->is_auth()) {
+      mds->mdlog->flush();
+    }
+  }
+  mds->send_message_mds(r, mds_rank_t(m->get_source().num()));
+}
+
+
+// Handle a peer's MMDSFindInoReply.
+//
+// Replies for unknown tids are only logged.  If the inode has meanwhile
+// shown up in our cache the request is finished by queueing its completion.
+// Otherwise the sender is marked as checked and, if it returned a path, the
+// path is traversed (with discover); on failure, or if no path was returned,
+// the next peer is tried.
+void MDCache::handle_find_ino_reply(const cref_t<MMDSFindInoReply> &m)
+{
+  auto p = find_ino_peer.find(m->tid);
+  if (p != find_ino_peer.end()) {
+    dout(10) << "handle_find_ino_reply " << *m << dendl;
+    find_ino_peer_info_t& fip = p->second;
+
+    // success?
+    if (get_inode(fip.ino)) {
+      dout(10) << "handle_find_ino_reply successfully found " << fip.ino << dendl;
+      mds->queue_waiter(fip.fin);
+      find_ino_peer.erase(p);
+      return;
+    }
+
+    mds_rank_t from = mds_rank_t(m->get_source().num());
+    if (fip.checking == from)
+      fip.checking = MDS_RANK_NONE;
+    fip.checked.insert(from);
+
+    if (!m->path.empty()) {
+      // we got a path!
+      vector<CDentry*> trace;
+      // the retry factory is built from this same message
+      CF_MDS_RetryMessageFactory cf(mds, m);
+      MDRequestRef null_ref;
+      int flags = MDS_TRAVERSE_DISCOVER;
+      if (fip.path_locked)
+	flags |= MDS_TRAVERSE_PATH_LOCKED;
+      int r = path_traverse(null_ref, cf, m->path, flags, &trace);
+      // positive: traversal is still in progress, nothing more to do here
+      if (r > 0)
+	return; 
+      // traversal ended without the inode appearing: start over with all
+      // peers considered unchecked
+      dout(0) << "handle_find_ino_reply failed with " << r << " on " << m->path 
+	      << ", retrying" << dendl;
+      fip.checked.clear();
+      _do_find_ino_peer(fip);
+    } else {
+      // nope, continue.
+      _do_find_ino_peer(fip);
+    }      
+  } else {
+    dout(10) << "handle_find_ino_reply tid " << m->tid << " dne" << dendl;
+  }  
+}
+
+// Re-drive every find_ino_peers request that was either waiting on a reply
+// from mds.`who` or not waiting on any rank at all.
+void MDCache::kick_find_ino_peers(mds_rank_t who)
+{
+  // find_ino_peers requests we should move on from
+  //
+  // _do_find_ino_peer() erases the request from find_ino_peer once every
+  // peer has been checked, so the iterator is stepped past the current entry
+  // *before* the call; incrementing it afterwards would use an iterator to
+  // an erased element.
+  for (auto p = find_ino_peer.begin(); p != find_ino_peer.end(); ) {
+    find_ino_peer_info_t& fip = (p++)->second;
+    if (fip.checking == who) {
+      dout(10) << "kicking find_ino_peer " << fip.tid << " who was checking mds." << who << dendl;
+      fip.checking = MDS_RANK_NONE;
+      _do_find_ino_peer(fip);
+    } else if (fip.checking == MDS_RANK_NONE) {
+      dout(10) << "kicking find_ino_peer " << fip.tid << " who was waiting" << dendl;
+      _do_find_ino_peer(fip);
+    }
+  }
+}
+
+/* ---------------------------- */
+
+// Count the active requests that were issued by a client and are not peer
+// requests.
+int MDCache::get_num_client_requests()
+{
+  int count = 0;
+  for (auto& entry : active_requests) {
+    MDRequestRef& mdr = entry.second;
+    if (!mdr->reqid.name.is_client())
+      continue;
+    if (mdr->is_peer())
+      continue;
+    ++count;
+  }
+  return count;
+}
+
+// Register a new client request and return its MDRequestRef.
+//
+// Returns an empty ref when a request with the same reqid is already active:
+// if that one is a peer request, `req` is queued to be retried when it
+// finishes; otherwise `req` is dropped as a duplicate.
+MDRequestRef MDCache::request_start(const cref_t<MClientRequest>& req)
+{
+  // did we win a forward race against a peer?
+  auto existing = active_requests.find(req->get_reqid());
+  if (existing != active_requests.end()) {
+    MDRequestRef& mdr = existing->second;
+    ceph_assert(mdr);
+    if (!mdr->is_peer()) {
+      dout(10) << "request_start already processing " << *mdr << ", dropping new msg" << dendl;
+    } else {
+      dout(10) << "request_start already had " << *mdr << ", waiting for finish" << dendl;
+      mdr->more()->waiting_for_finish.push_back(new C_MDS_RetryMessage(mds, req));
+    }
+    return MDRequestRef();
+  }
+
+  // register new client request
+  MDRequestImpl::Params params;
+  params.client_req = req;
+  params.reqid = req->get_reqid();
+  params.attempt = req->get_num_fwd();
+  // timestamps recorded on the message as it travelled through the messenger
+  params.initiated = req->get_recv_stamp();
+  params.throttled = req->get_throttle_stamp();
+  params.all_read = req->get_recv_complete_stamp();
+  params.dispatched = req->get_dispatch_stamp();
+
+  MDRequestRef mdr =
+      mds->op_tracker.create_request<MDRequestImpl,MDRequestImpl::Params*>(&params);
+  active_requests[params.reqid] = mdr;
+  mdr->set_op_stamp(req->get_stamp());
+  dout(7) << "request_start " << *mdr << dendl;
+  return mdr;
+}
+
+// Register a peer request with id `ri` that was triggered by message `m`
+// from another MDS, and return its MDRequestRef.  No request with the same
+// id may be active already.
+MDRequestRef MDCache::request_start_peer(metareqid_t ri, __u32 attempt, const cref_t<Message> &m)
+{
+  // rank of the MDS that sent the triggering message
+  int by = m->get_source().num();
+  MDRequestImpl::Params params;
+  params.reqid = ri;
+  params.attempt = attempt;
+  params.triggering_peer_req = m;
+  params.peer_to = by;
+  params.initiated = m->get_recv_stamp();
+  params.throttled = m->get_throttle_stamp();
+  params.all_read = m->get_recv_complete_stamp();
+  params.dispatched = m->get_dispatch_stamp();
+  MDRequestRef mdr =
+      mds->op_tracker.create_request<MDRequestImpl,MDRequestImpl::Params*>(&params);
+  ceph_assert(active_requests.count(mdr->reqid) == 0);
+  active_requests[mdr->reqid] = mdr;
+  dout(7) << "request_start_peer " << *mdr << " by mds." << by << dendl;
+  return mdr;
+}
+
+// Register an MDS-internal request for operation `op` and return its
+// MDRequestRef.  The request id is built from this MDS's own entity name and
+// a freshly issued tid; all of its timestamps are set to the current time.
+MDRequestRef MDCache::request_start_internal(int op)
+{
+  utime_t now = ceph_clock_now();
+  MDRequestImpl::Params params;
+  params.reqid.name = entity_name_t::MDS(mds->get_nodeid());
+  params.reqid.tid = mds->issue_tid();
+  params.initiated = now;
+  params.throttled = now;
+  params.all_read = now;
+  params.dispatched = now;
+  params.internal_op = op;
+  MDRequestRef mdr =
+      mds->op_tracker.create_request<MDRequestImpl,MDRequestImpl::Params*>(&params);
+
+  ceph_assert(active_requests.count(mdr->reqid) == 0);
+  active_requests[mdr->reqid] = mdr;
+  dout(7) << "request_start_internal " << *mdr << " op " << op << dendl;
+  return mdr;
+}
+
+// Return the active request registered under `rid`.  The request must
+// exist; this asserts otherwise.
+MDRequestRef MDCache::request_get(metareqid_t rid)
+{
+  auto p = active_requests.find(rid);
+  ceph_assert(p != active_requests.end());
+
+  MDRequestRef& mdr = p->second;
+  dout(7) << "request_get " << rid << " " << *mdr << dendl;
+  return mdr;
+}
+
+// Finish request `mdr`.
+//
+// If a peer_commit context is pending, it is detached and completed with 0
+// (commit) or -1 (the request was aborted, i.e. roll back), and this
+// function returns; that context is expected to call request_finish again.
+// Otherwise the per-operation counter for internal ops is bumped and the
+// request is cleaned up.
+void MDCache::request_finish(MDRequestRef& mdr)
+{
+  dout(7) << "request_finish " << *mdr << dendl;
+  mdr->mark_event("finishing request");
+
+  // peer finisher?
+  if (mdr->has_more() && mdr->more()->peer_commit) {
+    Context *fin = mdr->more()->peer_commit;
+    // detach it first so the re-entrant call takes the normal path below
+    mdr->more()->peer_commit = 0;
+    int ret;
+    if (mdr->aborted) {
+      mdr->aborted = false;
+      ret = -1;
+      mdr->more()->peer_rolling_back = true;
+    } else {
+      ret = 0;
+      mdr->committing = true;
+    }
+    fin->complete(ret);   // this must re-call request_finish.
+    return; 
+  }
+
+  // account for completed internal operations; other ops have no counter here
+  switch(mdr->internal_op) {
+    case CEPH_MDS_OP_FRAGMENTDIR:
+      logger->inc(l_mdss_ireq_fragmentdir);
+      break;
+    case CEPH_MDS_OP_EXPORTDIR:
+      logger->inc(l_mdss_ireq_exportdir);
+      break;
+    case CEPH_MDS_OP_ENQUEUE_SCRUB:
+      logger->inc(l_mdss_ireq_enqueue_scrub);
+      break;
+    case CEPH_MDS_OP_FLUSH:
+      logger->inc(l_mdss_ireq_flush);
+      break;
+    case CEPH_MDS_OP_REPAIR_FRAGSTATS:
+      logger->inc(l_mdss_ireq_fragstats);
+      break;
+    case CEPH_MDS_OP_REPAIR_INODESTATS:
+      logger->inc(l_mdss_ireq_inodestats);
+      break;
+  }
+
+  request_cleanup(mdr);
+}
+
+
+// Forward request `mdr` to mds.`who` and clean it up locally.
+//
+// Client requests are forwarded (as a batch if `mdr` is a batch head);
+// internal operations are cancelled by completing internal_op_finish with
+// -CEPHFS_EXDEV; anything else is dropped.  `port` is not used in this
+// function.
+void MDCache::request_forward(MDRequestRef& mdr, mds_rank_t who, int port)
+{
+  CachedStackStringStream css;
+  *css << "forwarding request to mds." << who;
+  mdr->mark_event(css->strv());
+  if (mdr->client_request && mdr->client_request->get_source().is_client()) {
+    dout(7) << "request_forward " << *mdr << " to mds." << who << " req "
+            << *mdr->client_request << dendl;
+    if (mdr->is_batch_head()) {
+      mdr->release_batch_op()->forward(who);
+    } else {
+      mds->forward_message_mds(mdr->release_client_request(), who);
+    }
+    if (mds->logger) mds->logger->inc(l_mds_forward);
+  } else if (mdr->internal_op >= 0) {
+    dout(10) << "request_forward on internal op; cancelling" << dendl;
+    // NOTE(review): internal_op_finish is dereferenced without a null check;
+    // confirm every internal op that can reach here sets it.
+    mdr->internal_op_finish->complete(-CEPHFS_EXDEV);
+  } else {
+    // NOTE(review): client_request is dereferenced here although the first
+    // condition above also fails when it is null; presumably only requests
+    // that carry a client_request reach this branch - confirm.
+    dout(7) << "request_forward drop " << *mdr << " req " << *mdr->client_request
+            << " was from mds" << dendl;
+  }
+  request_cleanup(mdr);
+}
+
+
+// Route request `mdr` to its handler: client and peer requests go to the
+// Server, internal operations are dispatched on internal_op.  An internal_op
+// without a case below aborts the process.
+void MDCache::dispatch_request(MDRequestRef& mdr)
+{
+  if (mdr->client_request) {
+    mds->server->dispatch_client_request(mdr);
+  } else if (mdr->peer_request) {
+    mds->server->dispatch_peer_request(mdr);
+  } else {
+    switch (mdr->internal_op) {
+    case CEPH_MDS_OP_FRAGMENTDIR:
+      dispatch_fragment_dir(mdr);
+      break;
+    case CEPH_MDS_OP_EXPORTDIR:
+      migrator->dispatch_export_dir(mdr, 0);
+      break;
+    case CEPH_MDS_OP_ENQUEUE_SCRUB:
+      enqueue_scrub_work(mdr);
+      break;
+    case CEPH_MDS_OP_FLUSH:
+      flush_dentry_work(mdr);
+      break;
+    case CEPH_MDS_OP_REPAIR_FRAGSTATS:
+      repair_dirfrag_stats_work(mdr);
+      break;
+    case CEPH_MDS_OP_REPAIR_INODESTATS:
+      repair_inode_stats_work(mdr);
+      break;
+    case CEPH_MDS_OP_RDLOCK_FRAGSSTATS:
+      rdlock_dirfrags_stats_work(mdr);
+      break;
+    default:
+      ceph_abort();
+    }
+  }
+}
+
+
+// Tell every peer MDS involved in `mdr` that the request is finished and
+// forget the locks those peers hold on our behalf.
+//
+// Each peer gets an OP_FINISH message (marked as abort if the request was
+// killed before committing; carrying the imported inode state when the peer
+// is the rename source auth).  Foreign xlocks and remote wrlocks are then
+// stripped from mdr->locks, and the peer set is cleared so that a second
+// call does not notify the peers again.  Does nothing if `mdr` has no
+// "more" state.
+void MDCache::request_drop_foreign_locks(MDRequestRef& mdr)
+{
+  if (!mdr->has_more())
+    return;
+
+  // clean up peers
+  //  (will implicitly drop remote dn pins)
+  for (set<mds_rank_t>::iterator p = mdr->more()->peers.begin();
+       p != mdr->more()->peers.end();
+       ++p) {
+    auto r = make_message<MMDSPeerRequest>(mdr->reqid, mdr->attempt,
+					    MMDSPeerRequest::OP_FINISH);
+
+    if (mdr->killed && !mdr->committing) {
+      r->mark_abort();
+    } else if (mdr->more()->srcdn_auth_mds == *p &&
+	       mdr->more()->inode_import.length() > 0) {
+      // information about rename imported caps
+      r->inode_export = std::move(mdr->more()->inode_import);
+    }
+
+    mds->send_message_mds(r, *p);
+  }
+
+  /* strip foreign xlocks out of lock lists, since the OP_FINISH drops them
+   * implicitly. Note that we don't call the finishers -- there shouldn't
+   * be any on a remote lock and the request finish wakes up all
+   * the waiters anyway! */
+
+  for (auto it = mdr->locks.begin(); it != mdr->locks.end(); ) {
+    SimpleLock *lock = it->lock;
+    if (it->is_xlock() && !lock->get_parent()->is_auth()) {
+      // log the parent object itself (not its address), as the
+      // remote_wrlock branch below does
+      dout(10) << "request_drop_foreign_locks forgetting lock " << *lock
+	       << " on " << *lock->get_parent() << dendl;
+      lock->put_xlock();
+      mdr->locks.erase(it++);
+    } else if (it->is_remote_wrlock()) {
+      dout(10) << "request_drop_foreign_locks forgetting remote_wrlock " << *lock
+	       << " on mds." << it->wrlock_target << " on " << *lock->get_parent() << dendl;
+      if (it->is_wrlock()) {
+	// a local wrlock is held as well: keep the entry, drop only the
+	// remote part
+	it->clear_remote_wrlock();
+	++it;
+      } else {
+	mdr->locks.erase(it++);
+      }
+    } else {
+      ++it;
+    }
+  }
+
+  mdr->more()->peers.clear(); /* we no longer have requests out to them, and
+                                * leaving them in can cause double-notifies as
+                                * this function can get called more than once */
+}
+
+// Release the peers' locks for `mdr`, then have the Locker drop the
+// request's non-rdlocks.
+void MDCache::request_drop_non_rdlocks(MDRequestRef& mdr)
+{
+  request_drop_foreign_locks(mdr);
+  mds->locker->drop_non_rdlocks(mdr.get());
+}
+
+// Release the peers' locks for `mdr`, then have the Locker drop the
+// request's locks.
+void MDCache::request_drop_locks(MDRequestRef& mdr)
+{
+  request_drop_foreign_locks(mdr);
+  mds->locker->drop_locks(mdr.get());
+}
+
+// Tear down request `mdr`: wake contexts waiting for it to finish, drop its
+// locks, auth pins, sticky dirs and cache pins, detach it from its session
+// and remove it from active_requests.
+void MDCache::request_cleanup(MDRequestRef& mdr)
+{
+  dout(15) << "request_cleanup " << *mdr << dendl;
+
+  if (mdr->has_more()) {
+    if (mdr->more()->is_ambiguous_auth)
+      mdr->clear_ambiguous_auth();
+    // e.g. client requests parked by request_start() behind a peer request
+    if (!mdr->more()->waiting_for_finish.empty())
+      mds->queue_waiters(mdr->more()->waiting_for_finish);
+  }
+
+  request_drop_locks(mdr);
+
+  // drop (local) auth pins
+  mdr->drop_local_auth_pins();
+
+  // drop stickydirs
+  mdr->put_stickydirs();
+
+  mds->locker->kick_cap_releases(mdr);
+
+  // drop cache pins
+  mdr->drop_pins();
+
+  // remove from session
+  mdr->item_session_request.remove_myself();
+
+  // remove from map
+  active_requests.erase(mdr->reqid);
+
+  if (mds->logger)
+    log_stat();
+
+  mdr->mark_event("cleaned up request");
+}
+
+// Kill request `mdr`.
+//
+// A request that already involves peers (witnessed or still waiting on a
+// peer) is not torn down here: it is only detached from its session, and
+// marked aborted if it has not acquired all of its locks yet.  Otherwise the
+// request is marked killed and either cleaned up immediately or, if it is
+// already committing, just removed from the session's request list.
+void MDCache::request_kill(MDRequestRef& mdr)
+{
+  // rollback peer requests is tricky. just let the request proceed.
+  if (mdr->has_more() &&
+      (!mdr->more()->witnessed.empty() || !mdr->more()->waiting_on_peer.empty())) {
+    if (!(mdr->locking_state & MutationImpl::ALL_LOCKED)) {
+      ceph_assert(mdr->more()->witnessed.empty());
+      // request_finish() turns this flag into a rollback of the peer commit
+      mdr->aborted = true;
+      dout(10) << "request_kill " << *mdr << " -- waiting for peer reply, delaying" << dendl;
+    } else {
+      dout(10) << "request_kill " << *mdr << " -- already started peer prep, no-op" << dendl;
+    }
+
+    ceph_assert(mdr->used_prealloc_ino == 0);
+    ceph_assert(mdr->prealloc_inos.empty());
+
+    // detach from the session; the request itself stays active
+    mdr->session = NULL;
+    mdr->item_session_request.remove_myself();
+    return;
+  }
+
+  mdr->killed = true;
+  mdr->mark_event("killing request");
+
+  if (mdr->committing) {
+    dout(10) << "request_kill " << *mdr << " -- already committing, remove it from sesssion requests" << dendl;
+    mdr->item_session_request.remove_myself();
+  } else {
+    dout(10) << "request_kill " << *mdr << dendl;
+    request_cleanup(mdr);
+  }
+}
+
+// -------------------------------------------------------------------------------
+// SNAPREALMS
+
+// Create the unlinked system inode CEPH_INO_GLOBAL_SNAPREALM (a directory,
+// mode 0755), add it to the cache and remember its snaprealm in
+// global_snaprealm.
+void MDCache::create_global_snaprealm()
+{
+  CInode *in = new CInode(this); // dummy inode
+  create_unlinked_system_inode(in, CEPH_INO_GLOBAL_SNAPREALM, S_IFDIR|0755);
+  add_inode(in);
+  global_snaprealm = in->snaprealm;
+}
+
+// Invalidate the cached snaps of `in`'s snaprealm and of every realm below
+// it (following open_children), and optionally notify clients.
+//
+// When `notify_clients` is set, one MClientSnap is built per client that
+// has caps in any visited realm and the batch is sent with send_snaps().
+// The lists of split inodes and split child realms are filled in only when
+// `snapop` is CEPH_SNAP_OP_SPLIT; the message itself is always constructed
+// with CEPH_SNAP_OP_SPLIT.
+void MDCache::do_realm_invalidate_and_update_notify(CInode *in, int snapop, bool notify_clients)
+{
+  dout(10) << "do_realm_invalidate_and_update_notify " << *in->snaprealm << " " << *in << dendl;
+
+  vector<inodeno_t> split_inos;
+  vector<inodeno_t> split_realms;
+
+  if (notify_clients) {
+    if (snapop == CEPH_SNAP_OP_SPLIT) {
+      // notify clients of update|split
+      for (auto p = in->snaprealm->inodes_with_caps.begin(); !p.end(); ++p)
+	split_inos.push_back((*p)->ino());
+
+      for (auto& r : in->snaprealm->open_children)
+	split_realms.push_back(r->inode->ino());
+    }
+  }
+
+  map<client_t, ref_t<MClientSnap>> updates;
+  // breadth-first walk over the realm and its open children
+  list<SnapRealm*> q;
+  q.push_back(in->snaprealm);
+  while (!q.empty()) {
+    SnapRealm *realm = q.front();
+    q.pop_front();
+
+    dout(10) << " realm " << *realm << " on " << *realm->inode << dendl;
+    realm->invalidate_cached_snaps();
+
+    if (notify_clients) {
+      for (const auto& p : realm->client_caps) {
+        const auto& client = p.first;
+        const auto& caps = p.second;
+	ceph_assert(!caps->empty());
+
+        // emplace only inserts for a client seen for the first time, so each
+        // client gets exactly one message no matter how many realms it
+        // appears in
+        auto em = updates.emplace(std::piecewise_construct, std::forward_as_tuple(client), std::forward_as_tuple());
+        if (em.second) {
+          auto update = make_message<MClientSnap>(CEPH_SNAP_OP_SPLIT);
+	  update->head.split = in->ino();
+	  update->split_inos = split_inos;
+	  update->split_realms = split_realms;
+	  update->bl = in->snaprealm->get_snap_trace();
+	  em.first->second = std::move(update);
+	}
+      }
+    }
+
+    // notify for active children, too.
+    dout(10) << " " << realm << " open_children are " << realm->open_children << dendl;
+    for (auto& r : realm->open_children)
+      q.push_back(r);
+  }
+
+  if (notify_clients)
+    send_snaps(updates);
+}
+
+// Send the encoded snap state of `in` (which must be auth here) to other
+// MDS ranks in an MMDSSnapUpdate.
+//
+// With a non-zero `stid` the update goes to every rank returned by
+// get_mds_set_lower_bound(STATE_RESOLVE) except ourselves, and clients are
+// then notified about the global snaprealm; with `stid` == 0 it goes only to
+// the ranks holding a replica of `in`.
+void MDCache::send_snap_update(CInode *in, version_t stid, int snap_op)
+{
+  dout(10) << __func__ << " " << *in << " stid " << stid << dendl;
+  ceph_assert(in->is_auth());
+
+  const bool have_stid = stid > 0;
+
+  set<mds_rank_t> targets;
+  if (!have_stid) {
+    in->list_replicas(targets);
+  } else {
+    mds->mdsmap->get_mds_set_lower_bound(targets, MDSMap::STATE_RESOLVE);
+    targets.erase(mds->get_nodeid());
+  }
+
+  if (!targets.empty()) {
+    // encode once, attach a copy to each message
+    bufferlist snap_blob;
+    in->encode_snap(snap_blob);
+
+    for (auto rank : targets) {
+      auto update = make_message<MMDSSnapUpdate>(in->ino(), stid, snap_op);
+      update->snap_blob = snap_blob;
+      mds->send_message_mds(update, rank);
+    }
+  }
+
+  if (have_stid)
+    notify_global_snaprealm_update(snap_op);
+}
+
+// Handle an MMDSSnapUpdate from another MDS.
+//
+// A non-zero tid is reported to the snap client as committed.  If the inode
+// named in the message is cached here (it must be a replica), its snap state
+// is decoded from the message and its snaprealm invalidated; clients are
+// notified only when `notify_clients` below is true, otherwise the inode is
+// remembered in rejoin_pending_snaprealms.
+void MDCache::handle_snap_update(const cref_t<MMDSSnapUpdate> &m)
+{
+  mds_rank_t from = mds_rank_t(m->get_source().num());
+  dout(10) << __func__ << " " << *m << " from mds." << from << dendl;
+
+  // ignore the message unless we are at least in resolve, or about to be
+  if (mds->get_state() < MDSMap::STATE_RESOLVE &&
+      mds->get_want_state() != CEPH_MDS_STATE_RESOLVE) {
+    return;
+  }
+
+  // null rejoin_done means open_snaprealms() has already been called
+  bool notify_clients = mds->get_state() > MDSMap::STATE_REJOIN ||
+			(mds->is_rejoin() && !rejoin_done);
+
+  if (m->get_tid() > 0) {
+    mds->snapclient->notify_commit(m->get_tid());
+    if (notify_clients)
+      notify_global_snaprealm_update(m->get_snap_op());
+  }
+
+  CInode *in = get_inode(m->get_ino());
+  if (in) {
+    ceph_assert(!in->is_auth());
+    // during rejoin, inodes that are still rejoining are skipped
+    if (mds->get_state() > MDSMap::STATE_REJOIN ||
+	(mds->is_rejoin() && !in->is_rejoining())) {
+      auto p = m->snap_blob.cbegin();
+      in->decode_snap(p);
+
+      if (!notify_clients) {
+	// pin the inode and queue it once in rejoin_pending_snaprealms
+	if (!rejoin_pending_snaprealms.count(in)) {
+	  in->get(CInode::PIN_OPENINGSNAPPARENTS);
+	  rejoin_pending_snaprealms.insert(in);
+	}
+      }
+      do_realm_invalidate_and_update_notify(in, m->get_snap_op(), notify_clients);
+    }
+  }
+}
+
+// Send every open or stale client session an MClientSnap describing the
+// global snaprealm.  Any `snap_op` other than CEPH_SNAP_OP_DESTROY is sent
+// as CEPH_SNAP_OP_UPDATE.
+void MDCache::notify_global_snaprealm_update(int snap_op)
+{
+  snap_op = (snap_op == CEPH_SNAP_OP_DESTROY) ? CEPH_SNAP_OP_DESTROY
+					      : CEPH_SNAP_OP_UPDATE;
+
+  set<Session*> sessions;
+  mds->sessionmap.get_client_session_set(sessions);
+
+  for (Session* session : sessions) {
+    if (session->is_open() || session->is_stale()) {
+      auto msg = make_message<MClientSnap>(snap_op);
+      msg->head.split = global_snaprealm->inode->ino();
+      msg->bl = global_snaprealm->get_snap_trace();
+      mds->send_message_client_counted(msg, session);
+    }
+  }
+}
+
+// -------------------------------------------------------------------------------
+// STRAYS
+
+// Context that resumes MDCache::scan_stray_dir() at dirfrag `next`.  The
+// completion code passed to finish() is ignored.
+struct C_MDC_RetryScanStray : public MDCacheContext {
+  dirfrag_t next;  // dirfrag to resume the scan from
+  C_MDC_RetryScanStray(MDCache *c,  dirfrag_t n) : MDCacheContext(c), next(n) { }
+  void finish(int r) override {
+    mdcache->scan_stray_dir(next);
+  }
+};
+
+// Walk the stray directories starting at dirfrag `next`, marking every
+// dentry as STATE_STRAY and evaluating each primary-linked inode as a stray
+// (inodes with nlink == 0 are first marked STATE_ORPHAN).
+//
+// The scan stops early when a dirfrag cannot be auth-pinned or is not
+// complete; in that case a C_MDC_RetryScanStray is registered so the scan
+// resumes from that dirfrag later.
+void MDCache::scan_stray_dir(dirfrag_t next)
+{
+  dout(10) << "scan_stray_dir " << next << dendl;
+
+  // map the requested frag onto the stray inode's current dirfragtree
+  if (next.ino)
+    next.frag = strays[MDS_INO_STRAY_INDEX(next.ino)]->dirfragtree[next.frag.value()];
+
+  for (int i = 0; i < NUM_STRAY; ++i) {
+    // skip stray inodes ordered before the resume point
+    if (strays[i]->ino() < next.ino)
+      continue;
+
+    std::vector<CDir*> ls;
+    strays[i]->get_dirfrags(ls);
+
+    for (const auto& dir : ls) {
+      // NOTE(review): this frag filter is applied to every stray inode at or
+      // after next.ino, not only to the one `next` refers to - confirm that
+      // is intended.
+      if (dir->get_frag() < next.frag)
+	continue;
+
+      if (!dir->can_auth_pin()) {
+	dir->add_waiter(CDir::WAIT_UNFREEZE, new C_MDC_RetryScanStray(this, dir->dirfrag()));
+	return;
+      }
+
+      if (!dir->is_complete()) {
+	dir->fetch(new C_MDC_RetryScanStray(this, dir->dirfrag()));
+	return;
+      }
+
+      for (auto &p : dir->items) {
+	CDentry *dn = p.second;
+	dn->state_set(CDentry::STATE_STRAY);
+	CDentry::linkage_t *dnl = dn->get_projected_linkage();
+	if (dnl->is_primary()) {
+	  CInode *in = dnl->get_inode();
+	  if (in->get_inode()->nlink == 0)
+	    in->state_set(CInode::STATE_ORPHAN);
+	  maybe_eval_stray(in);
+	}
+      }
+    }
+  }
+}
+
+// Issue a getxattr for the "parent" attribute of the object named by
+// CInode::get_object_name(ino, frag_t(), "") in `pool`.  The value is read
+// into `bl` and `fin` is handed to the objecter as the completion.
+void MDCache::fetch_backtrace(inodeno_t ino, int64_t pool, bufferlist& bl, Context *fin)
+{
+  object_t oid = CInode::get_object_name(ino, frag_t(), "");
+  mds->objecter->getxattr(oid, object_locator_t(pool), "parent", CEPH_NOSNAP, &bl, 0, fin);
+  if (mds->logger)
+    mds->logger->inc(l_mds_openino_backtrace_fetch);
+}
+
+
+
+
+
+// ========================================================================================
+// DISCOVER
+/*
+
+  - for all discovers (except base_inos, e.g. root, stray), waiters are attached
+  to the parent metadata object in the cache (pinning it).
+
+  - all discovers are tracked by tid, so that we can ignore potentially dup replies.
+
+*/
+
+// Build an MDiscover from the fields of `d`, tag it with d.tid and send it
+// to rank d.mds.
+void MDCache::_send_discover(discover_info_t& d)
+{
+  auto dis = make_message<MDiscover>(d.ino, d.frag, d.snap, d.want_path,
+				     d.want_base_dir, d.path_locked);
+  dis->set_tid(d.tid);
+  mds->send_message_mds(dis, d.mds);
+}
+
+// Ask mds.`from` for base inode `want_ino` and queue `onfinish` to run when
+// it arrives.  A discover is sent only if nobody is already waiting for
+// that inode from that rank.
+void MDCache::discover_base_ino(inodeno_t want_ino,
+				MDSContext *onfinish,
+				mds_rank_t from)
+{
+  dout(7) << "discover_base_ino " << want_ino << " from mds." << from << dendl;
+
+  const bool already_requested = waiting_for_base_ino[from].count(want_ino) != 0;
+  if (!already_requested) {
+    discover_info_t& dis = _create_discover(from);
+    dis.ino = want_ino;
+    _send_discover(dis);
+  }
+
+  waiting_for_base_ino[from][want_ino].push_back(onfinish);
+}
+
+
+// Ask mds.`from` (default: the auth of `base`) for the dirfrag of `base`
+// that covers `approx_fg`.
+//
+// A discover is sent unless someone is already waiting for that frag and
+// the caller supplied a completion; a caller without a completion always
+// triggers a send.  `onfinish`, if given, is registered as a dir waiter on
+// `base`.
+void MDCache::discover_dir_frag(CInode *base,
+				frag_t approx_fg,
+				MDSContext *onfinish,
+				mds_rank_t from)
+{
+  if (from < 0)
+    from = base->authority().first;
+
+  dirfrag_t df(base->ino(), approx_fg);
+  dout(7) << "discover_dir_frag " << df
+	  << " from mds." << from << dendl;
+
+  const bool already_waiting = base->is_waiting_for_dir(approx_fg);
+  if (!already_waiting || !onfinish) {
+    discover_info_t& dis = _create_discover(from);
+    dis.pin_base(base);
+    dis.ino = base->ino();
+    dis.frag = approx_fg;
+    dis.want_base_dir = true;
+    _send_discover(dis);
+  }
+
+  if (!onfinish)
+    return;
+  base->add_dir_waiter(approx_fg, onfinish);
+}
+
+// Retry context for discover_path(CInode*, ...).  It is parked on the base
+// inode while that inode's authority is ambiguous and, when finished,
+// re-issues the same discover without a completion of its own.
+struct C_MDC_RetryDiscoverPath : public MDCacheContext {
+  CInode *base;
+  snapid_t snapid;
+  filepath path;
+  mds_rank_t from;
+  bool path_locked;
+  C_MDC_RetryDiscoverPath(MDCache *c, CInode *b, snapid_t s, filepath &p, mds_rank_t f,
+			  bool pl = false) :
+    MDCacheContext(c), base(b), snapid(s), path(p), from(f), path_locked(pl) {}
+  void finish(int r) override {
+    // discover_path() takes path_locked *before* from.  Passing only `from`
+    // as the fifth argument would convert the rank to a bool, drop the
+    // intended target rank and lose the caller's path_locked flag, so both
+    // are passed explicitly.
+    mdcache->discover_path(base, snapid, path, 0, path_locked, from);
+  }
+};
+
+// Request replication of `want_path` underneath inode `base` at snapshot
+// `snap`.
+//
+//  base         inode the path is relative to
+//  snap         snapshot id forwarded in the discover
+//  want_path    path to discover; its first component selects the dirfrag
+//  onfinish     optional completion, registered as a dir waiter on base
+//  path_locked  forwarded in the discover; with a one-component path it
+//               forces a request to be sent even if one is already awaited
+//  from         rank to ask; a negative value means base's authority
+//
+// If base's authority is ambiguous nothing is sent: the request (or a retry
+// context when no completion was given) waits for WAIT_SINGLEAUTH.  If the
+// target rank is this MDS, the WAIT_DIR waiters on base are queued instead.
+void MDCache::discover_path(CInode *base,
+			    snapid_t snap,
+			    filepath want_path,
+			    MDSContext *onfinish,
+			    bool path_locked,
+			    mds_rank_t from)
+{
+  if (from < 0)
+    from = base->authority().first;
+
+  dout(7) << "discover_path " << base->ino() << " " << want_path << " snap " << snap << " from mds." << from
+	  << (path_locked ? " path_locked":"")
+	  << dendl;
+
+  if (base->is_ambiguous_auth()) {
+    dout(10) << " waiting for single auth on " << *base << dendl;
+    if (!onfinish)
+      onfinish = new C_MDC_RetryDiscoverPath(this, base, snap, want_path, from,
+					     path_locked);
+    base->add_waiter(CInode::WAIT_SINGLEAUTH, onfinish);
+    return;
+  } else if (from == mds->get_nodeid()) {
+    // we are the target ourselves: wake whoever waits for a dir on base
+    MDSContext::vec finished;
+    base->take_waiting(CInode::WAIT_DIR, finished);
+    mds->queue_waiters(finished);
+    return;
+  }
+
+  frag_t fg = base->pick_dirfrag(want_path[0]);
+  if ((path_locked && want_path.depth() == 1) ||
+      !base->is_waiting_for_dir(fg) || !onfinish) {
+    discover_info_t& d = _create_discover(from);
+    d.ino = base->ino();
+    d.pin_base(base);
+    d.frag = fg;
+    d.snap = snap;
+    d.want_path = want_path;
+    d.want_base_dir = true;
+    d.path_locked = path_locked;
+    _send_discover(d);
+  }
+
+  // register + wait
+  if (onfinish)
+    base->add_dir_waiter(fg, onfinish);
+}
+
+// Retry context for discover_path(CDir*, ...): remembers the dirfrag,
+// snapshot and path, and re-issues the discover with no completion when
+// finished.
+struct C_MDC_RetryDiscoverPath2 : public MDCacheContext {
+  CDir *base;
+  snapid_t snapid;
+  filepath path;
+
+  C_MDC_RetryDiscoverPath2(MDCache *cache, CDir *dir, snapid_t snap, filepath &want)
+    : MDCacheContext(cache),
+      base(dir),
+      snapid(snap),
+      path(want)
+  {}
+
+  void finish(int r) override {
+    mdcache->discover_path(base, snapid, path, nullptr);
+  }
+};
+
+// Request replication of `want_path` underneath dirfrag `base` at snapshot
+// `snap`, always asking the dirfrag's authority.
+//
+// With ambiguous authority the request (or a retry context when no
+// completion was given) waits for WAIT_SINGLEAUTH.  If the authority is
+// this MDS, the sub-waiters of the dirfrag are queued instead.  Otherwise a
+// discover is sent unless a completion was supplied and someone already
+// waits for the first dentry; a non-null `onfinish` becomes a dentry waiter.
+void MDCache::discover_path(CDir *base,
+			    snapid_t snap,
+			    filepath want_path,
+			    MDSContext *onfinish,
+			    bool path_locked)
+{
+  mds_rank_t from = base->authority().first;
+
+  dout(7) << "discover_path " << base->dirfrag() << " " << want_path << " snap " << snap << " from mds." << from
+	  << (path_locked ? " path_locked":"")
+	  << dendl;
+
+  if (base->is_ambiguous_auth()) {
+    dout(7) << " waiting for single auth on " << *base << dendl;
+    if (!onfinish) {
+      onfinish = new C_MDC_RetryDiscoverPath2(this, base, snap, want_path);
+    }
+    base->add_waiter(CDir::WAIT_SINGLEAUTH, onfinish);
+    return;
+  }
+
+  if (from == mds->get_nodeid()) {
+    MDSContext::vec woken;
+    base->take_sub_waiting(woken);
+    mds->queue_waiters(woken);
+    return;
+  }
+
+  const bool force_send = path_locked && want_path.depth() == 1;
+  if (force_send ||
+      !base->is_waiting_for_dentry(want_path[0].c_str(), snap) ||
+      !onfinish) {
+    discover_info_t& req = _create_discover(from);
+    req.ino = base->ino();
+    req.pin_base(base->inode);
+    req.frag = base->get_frag();
+    req.snap = snap;
+    req.want_path = want_path;
+    req.want_base_dir = false;
+    req.path_locked = path_locked;
+    _send_discover(req);
+  }
+
+  // park the completion on the first path component
+  if (onfinish) {
+    base->add_dentry_waiter(want_path[0], snap, onfinish);
+  }
+}
+
+// Re-send every outstanding discover whose target rank is `who`.
+void MDCache::kick_discovers(mds_rank_t who)
+{
+  for (auto& entry : discovers) {
+    discover_info_t& info = entry.second;
+    if (info.mds == who) {
+      _send_discover(info);
+    }
+  }
+}
+
+
+void MDCache::handle_discover(const cref_t<MDiscover> &dis) 
+{ /*
+   * Serve an MDiscover sent by another MDS rank (never ourselves; asserted).
+   *
+   * Starting from the base inode named in the request, walk the wanted
+   * path one component per loop iteration and append replica encodings
+   * (dir, dentry, inode) for the requester to reply->trace.  The walk
+   * stops early when it meets something it cannot or should not expose
+   * (non-dir, non-auth dirfrag, frozen object, incomplete dir, purging or
+   * xlocked dentry); error flags and dir_auth_hint on the reply tell the
+   * requester what happened.
+   *
+   * No reply is sent when the request is dropped (our state and wanted
+   * state are both below REJOIN) or when it is re-queued through a
+   * C_MDS_RetryMessage on some waiter; each of those paths returns before
+   * the final send.
+   */
+  mds_rank_t whoami = mds->get_nodeid();
+  mds_rank_t from = mds_rank_t(dis->get_source().num());
+
+  ceph_assert(from != whoami);
+
+  if (mds->get_state() <= MDSMap::STATE_REJOIN) {
+    if (mds->get_state() < MDSMap::STATE_REJOIN &&
+	mds->get_want_state() < CEPH_MDS_STATE_REJOIN) {
+      return;  // silently drop; nothing is queued for retry
+    }
+
+    // proceed if requester is in the REJOIN stage, the request is from parallel_fetch().
+    // delay processing request from survivor because we may not yet choose lock states.
+    if (!mds->mdsmap->is_rejoin(from)) {
+      dout(0) << "discover_reply not yet active(|still rejoining), delaying" << dendl;  // NOTE(review): text says "discover_reply" but this is the discover handler
+      mds->wait_for_replay(new C_MDS_RetryMessage(mds, dis));
+      return;
+    }
+  }
+
+
+  CInode *cur = 0;  // inode the walk is currently positioned at
+  auto reply = make_message<MDiscoverReply>(*dis);
+
+  snapid_t snapid = dis->get_snapid();
+
+  // get started.
+  if (MDS_INO_IS_BASE(dis->get_base_ino()) &&
+      !dis->wants_base_dir() && dis->get_want().depth() == 0) {
+    // wants root
+    dout(7) << "handle_discover from mds." << from
+	    << " wants base + " << dis->get_want().get_path()
+	    << " snap " << snapid
+	    << dendl;
+
+    cur = get_inode(dis->get_base_ino());
+    ceph_assert(cur);
+
+    // add root
+    reply->starts_with = MDiscoverReply::INODE;
+    encode_replica_inode(cur, from, reply->trace, mds->mdsmap->get_up_features());
+    dout(10) << "added base " << *cur << dendl;
+  }
+  else {
+    // there's a base inode
+    cur = get_inode(dis->get_base_ino(), snapid);
+    if (!cur && snapid != CEPH_NOSNAP) {
+      // fall back to the head inode, but only accept it if it is multiversion
+      cur = get_inode(dis->get_base_ino());
+      if (cur && !cur->is_multiversion())
+	cur = NULL;  // nope!
+    }
+    
+    if (!cur) {
+      dout(7) << "handle_discover mds." << from 
+	      << " don't have base ino " << dis->get_base_ino() << "." << snapid
+	      << dendl;
+      if (!dis->wants_base_dir() && dis->get_want().depth() > 0)
+	reply->set_error_dentry(dis->get_dentry(0));
+      reply->set_flag_error_dir();
+    } else if (dis->wants_base_dir()) {
+      dout(7) << "handle_discover mds." << from 
+	      << " wants basedir+" << dis->get_want().get_path() 
+	      << " has " << *cur 
+	      << dendl;
+    } else {
+      dout(7) << "handle_discover mds." << from 
+	      << " wants " << dis->get_want().get_path()
+	      << " has " << *cur
+	      << dendl;
+    }
+  }
+
+  ceph_assert(reply);
+  
+  // add content
+  // do some fidgeting to include a dir if they asked for the base dir, or just root.
+  // (with an empty wanted path the body runs until one of its own breaks/returns)
+  for (unsigned i = 0; 
+       cur && (i < dis->get_want().depth() || dis->get_want().depth() == 0); 
+       i++) {
+
+    // -- figure out the dir
+
+    // is *cur even a dir at all?
+    if (!cur->is_dir()) {
+      dout(7) << *cur << " not a dir" << dendl;
+      reply->set_flag_error_dir();
+      break;
+    }
+
+    // pick frag
+    frag_t fg;
+    if (dis->get_want().depth()) {
+      // dentry specifies
+      fg = cur->pick_dirfrag(dis->get_dentry(i));
+    } else {
+      // requester explicitly specified the frag
+      ceph_assert(dis->wants_base_dir() || MDS_INO_IS_BASE(dis->get_base_ino()));
+      fg = dis->get_base_dir_frag();
+      if (!cur->dirfragtree.is_leaf(fg))
+	fg = cur->dirfragtree[fg.value()];  // requested frag is not a leaf here; map it through our fragtree
+    }
+    CDir *curdir = cur->get_dirfrag(fg);
+
+    // not ours to answer: either the open dirfrag is non-auth, or it is not
+    // open and we are not auth for the inode
+    if ((!curdir && !cur->is_auth()) ||
+	(curdir && !curdir->is_auth())) {
+
+	/* before:
+	 * ONLY set flag if empty!!
+	 * otherwise requester will wake up waiter(s) _and_ continue with discover,
+	 * resulting in duplicate discovers in flight,
+	 * which can wreak havoc when discovering rename srcdn (which may move)
+	 */
+
+      if (reply->is_empty()) {
+	// only hint if empty.
+	//  someday this could be better, but right now the waiter logic isn't smart enough.
+	
+	// hint
+	if (curdir) {
+	  dout(7) << " not dirfrag auth, setting dir_auth_hint for " << *curdir << dendl;
+	  reply->set_dir_auth_hint(curdir->authority().first);
+	} else {
+	  dout(7) << " dirfrag not open, not inode auth, setting dir_auth_hint for " 
+		  << *cur << dendl;
+	  reply->set_dir_auth_hint(cur->authority().first);
+	}
+	
+	// note error dentry, if any
+	//  NOTE: important, as it allows requester to issue an equivalent discover
+	//        to whomever we hint at.
+	if (dis->get_want().depth() > i)
+	  reply->set_error_dentry(dis->get_dentry(i));
+      }
+
+      break;
+    }
+
+    if (!curdir) { // open dir?
+      if (cur->is_frozen()) {
+	if (!reply->is_empty()) {
+	  dout(7) << *cur << " is frozen, non-empty reply, stopping" << dendl;
+	  break;
+	}
+	dout(7) << *cur << " is frozen, empty reply, waiting" << dendl;
+	cur->add_waiter(CInode::WAIT_UNFREEZE, new C_MDS_RetryMessage(mds, dis));
+	return;
+      }
+      curdir = cur->get_or_open_dirfrag(this, fg);
+    } else if (curdir->is_frozen_tree() ||
+	       (curdir->is_frozen_dir() && fragment_are_all_frozen(curdir))) {
+      if (!reply->is_empty()) {
+	dout(7) << *curdir << " is frozen, non-empty reply, stopping" << dendl;
+	break;
+      }
+      if (dis->wants_base_dir() && dis->get_base_dir_frag() != curdir->get_frag()) {
+	dout(7) << *curdir << " is frozen, dirfrag mismatch, stopping" << dendl;
+	reply->set_flag_error_dir();
+	break;
+      }
+      dout(7) << *curdir << " is frozen, empty reply, waiting" << dendl;
+      curdir->add_waiter(CDir::WAIT_UNFREEZE, new C_MDS_RetryMessage(mds, dis));
+      return;
+    }
+    
+    // add dir
+    if (curdir->get_version() == 0) {
+      // fetch newly opened dir
+    } else if (reply->is_empty() && !dis->wants_base_dir()) {
+      dout(7) << "handle_discover not adding unwanted base dir " << *curdir << dendl;
+      // make sure the base frag is correct, though, in case there was a refragment since the
+      // original request was sent.
+      reply->set_base_dir_frag(curdir->get_frag());
+    } else {
+      ceph_assert(!curdir->is_ambiguous_auth()); // would be frozen.
+      if (!reply->trace.length())
+	reply->starts_with = MDiscoverReply::DIR;
+      encode_replica_dir(curdir, from, reply->trace);
+      dout(7) << "handle_discover added dir " << *curdir << dendl;
+    }
+
+    // lookup
+    CDentry *dn = 0;
+    if (curdir->get_version() == 0) {
+      // fetch newly opened dir
+      ceph_assert(!curdir->has_bloom());
+    } else if (dis->get_want().depth() > 0) {
+      // lookup dentry
+      dn = curdir->lookup(dis->get_dentry(i), snapid);
+    } else 
+      break; // done!
+          
+    // incomplete dir?
+    if (!dn) {
+      // fetch unless the dir is complete, or (head only) its bloom filter
+      // says the name is not present
+      if (!curdir->is_complete() &&
+	  !(snapid == CEPH_NOSNAP &&
+	    curdir->has_bloom() &&
+	    !curdir->is_in_bloom(dis->get_dentry(i)))) {
+	// readdir
+	dout(7) << "incomplete dir contents for " << *curdir << ", fetching" << dendl;
+	if (reply->is_empty()) {
+	  // fetch and wait
+	  curdir->fetch(new C_MDS_RetryMessage(mds, dis),
+			dis->wants_base_dir() && curdir->get_version() == 0);
+	  return;
+	} else {
+	  // initiate fetch, but send what we have so far
+	  curdir->fetch(0);
+	  break;
+	}
+      }
+
+      if (snapid != CEPH_NOSNAP && !reply->is_empty()) {
+	dout(7) << "dentry " << dis->get_dentry(i) << " snap " << snapid
+		<< " dne, non-empty reply, stopping" << dendl;
+	break;
+      }
+
+      // send null dentry
+      dout(7) << "dentry " << dis->get_dentry(i) << " dne, returning null in "
+	      << *curdir << dendl;
+      if (snapid == CEPH_NOSNAP)
+	dn = curdir->add_null_dentry(dis->get_dentry(i));
+      else
+	dn = curdir->add_null_dentry(dis->get_dentry(i), snapid, snapid);
+    }
+    ceph_assert(dn);
+
+    // don't add replica to purging dentry/inode
+    if (dn->state_test(CDentry::STATE_PURGING)) {
+      if (reply->is_empty())
+	reply->set_flag_error_dn(dis->get_dentry(i));
+      break;
+    }
+
+    CDentry::linkage_t *dnl = dn->get_linkage();
+
+    // xlocked dentry?
+    //  ...always block on non-tail items (they are unrelated)
+    //  ...allow xlocked tail discovery _only_ if explicitly requested
+    if (dn->lock.is_xlocked()) {
+      // is this the last (tail) item in the discover traversal?
+      if (dis->is_path_locked()) {  // NOTE(review): only the flag is tested here; tailitem is computed further down
+	dout(7) << "handle_discover allowing discovery of xlocked " << *dn << dendl;
+      } else if (reply->is_empty()) {
+	dout(7) << "handle_discover blocking on xlocked " << *dn << dendl;
+	dn->lock.add_waiter(SimpleLock::WAIT_RD, new C_MDS_RetryMessage(mds, dis));
+	return;
+      } else {
+	dout(7) << "handle_discover non-empty reply, xlocked tail " << *dn << dendl;
+	break;
+      }
+    }
+
+    // frozen inode?
+    bool tailitem = (dis->get_want().depth() == 0) || (i == dis->get_want().depth() - 1);
+    if (dnl->is_primary() && dnl->get_inode()->is_frozen_inode()) {
+      if (tailitem && dis->is_path_locked()) {
+	dout(7) << "handle_discover allowing discovery of frozen tail " << *dnl->get_inode() << dendl;
+      } else if (reply->is_empty()) {
+	dout(7) << *dnl->get_inode() << " is frozen, empty reply, waiting" << dendl;
+	dnl->get_inode()->add_waiter(CDir::WAIT_UNFREEZE, new C_MDS_RetryMessage(mds, dis));  // NOTE(review): CDir:: constant used on an inode waiter (the frozen-inode case above uses CInode::WAIT_UNFREEZE) -- confirm they are the same value
+	return;
+      } else {
+	dout(7) << *dnl->get_inode() << " is frozen, non-empty reply, stopping" << dendl;
+	break;
+      }
+    }
+
+    // add dentry
+    if (!reply->trace.length())
+      reply->starts_with = MDiscoverReply::DENTRY;
+    encode_replica_dentry(dn, from, reply->trace);
+    dout(7) << "handle_discover added dentry " << *dn << dendl;
+    
+    if (!dnl->is_primary()) break;  // stop on null or remote link.
+    
+    // add inode
+    CInode *next = dnl->get_inode();
+    ceph_assert(next->is_auth());
+    
+    encode_replica_inode(next, from, reply->trace, mds->mdsmap->get_up_features());
+    dout(7) << "handle_discover added inode " << *next << dendl;
+    
+    // descend, keep going.
+    cur = next;
+    continue;
+  }
+
+  // how did we do?
+  ceph_assert(!reply->is_empty());
+  dout(7) << "handle_discover sending result back to asker mds." << from << dendl;
+  mds->send_message(reply, dis->get_connection());
+}
+
+void MDCache::handle_discover_reply(const cref_t<MDiscoverReply> &m)
+{ /*
+   * Apply an MDiscoverReply.  The replica trace (an optional base inode
+   * followed by repeating dir / dentry / inode records) is decoded into
+   * the cache, with woken waiters collected in `finished`.  Afterwards the
+   * error flags and dir_auth_hint decide whether waiters are woken, failed
+   * with -CEPHFS_ENOENT (those collected in `error`), or a fresh discover
+   * is issued.
+   */
+  /*
+  if (mds->get_state() < MDSMap::STATE_ACTIVE) {
+    dout(0) << "discover_reply NOT ACTIVE YET" << dendl;
+    return;
+  }
+  */
+  dout(7) << "discover_reply " << *m << dendl;
+  if (m->is_flag_error_dir()) 
+    dout(7) << " flag error, dir" << dendl;
+  if (m->is_flag_error_dn()) 
+    dout(7) << " flag error, dentry = " << m->get_error_dentry() << dendl;
+
+  MDSContext::vec finished, error;
+  mds_rank_t from = mds_rank_t(m->get_source().num());
+
+  // starting point
+  CInode *cur = get_inode(m->get_base_ino());
+  auto p = m->trace.cbegin();
+
+  int next = m->starts_with;  // type of the next record expected in the trace
+
+  // decrement discover counters
+  if (m->get_tid()) {
+    map<ceph_tid_t,discover_info_t>::iterator p = discovers.find(m->get_tid());  // NOTE: shadows the trace iterator `p` within this block
+    if (p != discovers.end()) {
+      dout(10) << " found tid " << m->get_tid() << dendl;
+      discovers.erase(p);
+    } else {
+      dout(10) << " tid " << m->get_tid() << " not found, must be dup reply" << dendl;
+    }
+  }
+
+  // discover may start with an inode
+  if (!p.end() && next == MDiscoverReply::INODE) {
+    decode_replica_inode(cur, p, NULL, finished);
+    dout(7) << "discover_reply got base inode " << *cur << dendl;
+    ceph_assert(cur->is_base());
+    
+    next = MDiscoverReply::DIR;
+    
+    // take waiters?
+    if (cur->is_base() &&
+	waiting_for_base_ino[from].count(cur->ino())) {
+      finished.swap(waiting_for_base_ino[from][cur->ino()]);  // NOTE(review): swap replaces whatever decode_replica_inode put in `finished`; the old contents are erased with the map entry below
+      waiting_for_base_ino[from].erase(cur->ino());
+    }
+  }
+  ceph_assert(cur);
+  
+  // loop over discover results.
+  // indexes follow each ([[dir] dentry] inode) 
+  // can start, end with any type.
+  while (!p.end()) {
+    // dir
+    frag_t fg;
+    CDir *curdir = nullptr;
+    if (next == MDiscoverReply::DIR) {
+      decode_replica_dir(curdir, p, cur, mds_rank_t(m->get_source().num()), finished);
+      // the sender answered with a different frag than the one we asked for:
+      // wake those waiting on the requested frag too
+      if (cur->ino() == m->get_base_ino() && curdir->get_frag() != m->get_base_dir_frag()) {
+	ceph_assert(m->get_wanted_base_dir());
+	cur->take_dir_waiting(m->get_base_dir_frag(), finished);
+      }
+    } else {
+      // note: this can only happen our first way around this loop.
+      if (p.end() && m->is_flag_error_dn()) {  // NOTE(review): the loop condition just found !p.end() and nothing on this path advances p, so this arm looks unreachable
+	fg = cur->pick_dirfrag(m->get_error_dentry());
+	curdir = cur->get_dirfrag(fg);
+      } else
+	curdir = cur->get_dirfrag(m->get_base_dir_frag());
+    }
+
+    if (p.end())
+      break;
+    
+    // dentry
+    CDentry *dn = nullptr;
+    decode_replica_dentry(dn, p, curdir, finished);
+    
+    if (p.end())
+      break;
+
+    // inode
+    decode_replica_inode(cur, p, dn, finished);
+
+    next = MDiscoverReply::DIR;
+  }
+
+  // dir error?
+  // or dir_auth hint?
+  if (m->is_flag_error_dir() && !cur->is_dir()) {
+    // not a dir.
+    cur->take_waiting(CInode::WAIT_DIR, error);
+  } else if (m->is_flag_error_dir() || m->get_dir_auth_hint() != CDIR_AUTH_UNKNOWN) {
+    mds_rank_t who = m->get_dir_auth_hint();
+    if (who == mds->get_nodeid()) who = -1;  // a hint pointing at ourselves is discarded
+    if (who >= 0)
+      dout(7) << " dir_auth_hint is " << m->get_dir_auth_hint() << dendl;
+
+
+    if (m->get_wanted_base_dir()) {
+      frag_t fg = m->get_base_dir_frag();
+      CDir *dir = cur->get_dirfrag(fg);
+
+      if (cur->is_waiting_for_dir(fg)) {
+	if (cur->is_auth())
+	  cur->take_waiting(CInode::WAIT_DIR, finished);
+	else if (dir || !cur->dirfragtree.is_leaf(fg))
+	  cur->take_dir_waiting(fg, finished);
+	else
+	  discover_dir_frag(cur, fg, 0, who);  // retry against the hinted rank (or base's authority when who < 0)
+      } else
+	dout(7) << " doing nothing, nobody is waiting for dir" << dendl;
+    }
+
+    // try again?
+    if (m->get_error_dentry().length()) {
+      frag_t fg = cur->pick_dirfrag(m->get_error_dentry());
+      CDir *dir = cur->get_dirfrag(fg);
+      // wanted a dentry
+      if (dir && dir->is_waiting_for_dentry(m->get_error_dentry(), m->get_wanted_snapid())) {
+	if (dir->is_auth() || dir->lookup(m->get_error_dentry())) {
+	  dir->take_dentry_waiting(m->get_error_dentry(), m->get_wanted_snapid(),
+				   m->get_wanted_snapid(), finished);
+	} else {
+	  filepath relpath(m->get_error_dentry(), 0);
+	  discover_path(dir, m->get_wanted_snapid(), relpath, 0, m->is_path_locked());
+	}
+      } else
+	dout(7) << " doing nothing, have dir but nobody is waiting on dentry "
+		<< m->get_error_dentry() << dendl;  // NOTE: also reached when the dirfrag is not in cache (dir == NULL)
+    }
+  } else if (m->is_flag_error_dn()) {
+    frag_t fg = cur->pick_dirfrag(m->get_error_dentry());
+    CDir *dir = cur->get_dirfrag(fg);
+    if (dir) {
+      if (dir->is_auth()) {
+	dir->take_sub_waiting(finished);
+      } else {
+	dir->take_dentry_waiting(m->get_error_dentry(), m->get_wanted_snapid(),
+				 m->get_wanted_snapid(), error);
+      }
+    }
+  }
+
+  // waiters
+  finish_contexts(g_ceph_context, error, -CEPHFS_ENOENT);  // finish errors directly
+  mds->queue_waiters(finished);
+}
+
+
+
+// ----------------------------
+// REPLICAS
+
+
+// Append a replica record for `dir` to `bl` (struct version 1): the
+// dirfrag id, the nonce returned by registering `to` as a replica, then
+// the dir's base state.
+void MDCache::encode_replica_dir(CDir *dir, mds_rank_t to, bufferlist& bl)
+{
+  ENCODE_START(1, 1, bl);
+
+  dirfrag_t frag_id = dir->dirfrag();
+  encode(frag_id, bl);
+
+  // add_replica() both records `to` and hands back the nonce to ship
+  __u32 replica_nonce = dir->add_replica(to);
+  encode(replica_nonce, bl);
+
+  dir->_encode_base(bl);
+
+  ENCODE_FINISH(bl);
+}
+
+// Append a replica record for dentry `dn` to `bl` (struct version 2,
+// compat 1).  Field order: name, last, nonce, first, remote ino, remote
+// d_type, lock state, need_recover flag, alternate_name.
+void MDCache::encode_replica_dentry(CDentry *dn, mds_rank_t to, bufferlist& bl)
+{
+  ENCODE_START(2, 1, bl);
+
+  // identity of the dentry
+  encode(dn->get_name(), bl);
+  encode(dn->last, bl);
+
+  // registering the replica yields its nonce
+  __u32 replica_nonce = dn->add_replica(to);
+  encode(replica_nonce, bl);
+
+  encode(dn->first, bl);
+  encode(dn->linkage.remote_ino, bl);
+  encode(dn->linkage.remote_d_type, bl);
+  dn->lock.encode_state_for_replica(bl);
+
+  // set while this MDS has not yet reached the ACTIVE state
+  const bool need_recover = (mds->get_state() < MDSMap::STATE_ACTIVE);
+  encode(need_recover, bl);
+
+  encode(dn->alternate_name, bl);
+
+  ENCODE_FINISH(bl);
+}
+
+// Append a replica record for inode `in` to `bl` (struct version 2,
+// compat 1).  The inode must be auth here (asserted).  Field order: ino,
+// last, nonce, base state (encoded for `features`), lock state, state word.
+void MDCache::encode_replica_inode(CInode *in, mds_rank_t to, bufferlist& bl,
+			      uint64_t features)
+{
+  ceph_assert(in->is_auth());
+
+  ENCODE_START(2, 1, bl);
+
+  // ino and last go first so the decoder can look the inode up
+  encode(in->ino(), bl);
+  encode(in->last, bl);
+
+  __u32 replica_nonce = in->add_replica(to);
+  encode(replica_nonce, bl);
+
+  in->_encode_base(bl, features);
+
+  // the flag is set while this MDS has not yet reached the ACTIVE state
+  const bool not_yet_active = (mds->get_state() < MDSMap::STATE_ACTIVE);
+  in->_encode_locks_state_for_replica(bl, not_yet_active);
+
+  __u32 state_word = in->state;
+  encode(state_word, bl);
+
+  ENCODE_FINISH(bl);
+}
+
+// Decode one replica dirfrag record from `p` underneath inode `diri`.
+// `dir` is set to the existing dirfrag, or to a freshly added replica.
+// For a new dirfrag the fragtree is forced to a leaf first if needed,
+// subtree auth is adjusted when the sender is a delegation boundary, and
+// waiters for the frag are moved into `finished`.
+void MDCache::decode_replica_dir(CDir *&dir, bufferlist::const_iterator& p, CInode *diri, mds_rank_t from,
+			       MDSContext::vec& finished)
+{
+  DECODE_START(1, p);
+  dirfrag_t df;
+  decode(df, p);
+
+  ceph_assert(diri->ino() == df.ino);
+
+  dir = diri->get_dirfrag(df.frag);
+  const bool had_replica = (dir != nullptr);
+
+  if (!had_replica) {
+    // force frag to leaf in the diri tree
+    if (!diri->dirfragtree.is_leaf(df.frag)) {
+      dout(7) << __func__ << " forcing frag " << df.frag << " to leaf in the fragtree "
+	      << diri->dirfragtree << dendl;
+      diri->dirfragtree.force_to_leaf(g_ceph_context, df.frag);
+    }
+    // create the replica object before reading its state
+    dir = diri->add_dirfrag( new CDir(diri, df.frag, this, false) );
+  }
+
+  // nonce + base state are read the same way for old and new replicas
+  __u32 replica_nonce;
+  decode(replica_nonce, p);
+  dir->set_replica_nonce(replica_nonce);
+  dir->_decode_base(p);
+
+  if (had_replica) {
+    dout(7) << __func__ << " had " << *dir << " nonce " << dir->replica_nonce << dendl;
+  } else {
+    // is this a dir_auth delegation boundary?
+    const bool boundary = from != diri->authority().first ||
+			  diri->is_ambiguous_auth() ||
+			  diri->is_base();
+    if (boundary) {
+      adjust_subtree_auth(dir, from);
+    }
+
+    dout(7) << __func__ << " added " << *dir << " nonce " << dir->replica_nonce << dendl;
+    // get waiters
+    diri->take_dir_waiting(df.frag, finished);
+  }
+  DECODE_FINISH(p);
+}
+
+// Decode one replica dentry record from `p` inside dirfrag `dir`.  `dn` is
+// set to the matching existing dentry or to a newly added null dentry.
+// Only a new dentry takes the decoded alternate_name, remote link and
+// need_recover mark; for an existing one the alternate_name must already
+// match (asserted).  Waiters on the dentry are moved into `finished`.
+// NOTE(review): the encoder writes struct version 2 while DECODE_START is
+// given 1 here, although struct_v >= 2 is handled below -- confirm intent.
+void MDCache::decode_replica_dentry(CDentry *&dn, bufferlist::const_iterator& p, CDir *dir, MDSContext::vec& finished)
+{
+  DECODE_START(1, p);
+  string name;
+  snapid_t last;
+  decode(name, p);
+  decode(last, p);
+
+  // have it?
+  dn = dir->lookup(name, last);
+  const bool is_new = (dn == nullptr);
+  if (is_new) {
+    dn = dir->add_null_dentry(name, 1 /* this will get updated below */, last);
+    dout(7) << __func__ << " added " << *dn << dendl;
+  } else {
+    dout(7) << __func__ << " had " << *dn << dendl;
+  }
+
+  __u32 replica_nonce;
+  decode(replica_nonce, p);
+  dn->set_replica_nonce(replica_nonce);
+  decode(dn->first, p);
+
+  inodeno_t rino;
+  unsigned char rdtype;
+  decode(rino, p);
+  decode(rdtype, p);
+  dn->lock.decode_state(p, is_new);
+
+  bool need_recover;
+  decode(need_recover, p);
+
+  // only present from struct version 2 on; stays empty otherwise
+  mempool::mds_co::string alternate_name;
+  if (struct_v >= 2) {
+    decode(alternate_name, p);
+  }
+
+  if (!is_new) {
+    ceph_assert(dn->alternate_name == alternate_name);
+  } else {
+    dn->set_alternate_name(std::move(alternate_name));
+    if (rino) {
+      dir->link_remote_inode(dn, rino, rdtype);
+    }
+    if (need_recover) {
+      dn->lock.mark_need_recover();
+    }
+  }
+
+  dir->take_dentry_waiting(name, dn->first, dn->last, finished);
+  DECODE_FINISH(p);
+}
+
+// Decode one replica inode record from `p`.  `in` is set to the cached
+// inode for (ino, last) or to a newly created replica, which is added to
+// the cache and, when `dn` is given, linked as dn's primary inode (dn must
+// then be null-linked; asserted).  From struct version 2 on, a replicated
+// state word follows and may mark the inode as random-ephemeral pinned.
+void MDCache::decode_replica_inode(CInode *&in, bufferlist::const_iterator& p, CDentry *dn, MDSContext::vec& finished)
+{
+  DECODE_START(2, p);
+  inodeno_t ino;
+  snapid_t last;
+  __u32 replica_nonce;
+  decode(ino, p);
+  decode(last, p);
+  decode(replica_nonce, p);
+
+  in = get_inode(ino, last);
+  const bool is_new = (in == nullptr);
+  if (is_new) {
+    in = new CInode(this, false, 2, last);
+  }
+
+  // same decode sequence whether or not we already had the inode
+  in->set_replica_nonce(replica_nonce);
+  in->_decode_base(p);
+  in->_decode_locks_state_for_replica(p, is_new);
+
+  if (is_new) {
+    add_inode(in);
+    // root and mdsdir inodes get their auth rank assigned directly
+    if (in->ino() == CEPH_INO_ROOT) {
+      in->inode_auth.first = 0;
+    } else if (in->is_mdsdir()) {
+      in->inode_auth.first = in->ino() - MDS_INO_MDSDIR_OFFSET;
+    }
+    dout(10) << __func__ << " added " << *in << dendl;
+    if (dn) {
+      ceph_assert(dn->get_linkage()->is_null());
+      dn->dir->link_primary_inode(dn, in);
+    }
+  } else {
+    dout(10) << __func__ << " had " << *in << dendl;
+  }
+
+  // only logged, not corrected
+  if (dn &&
+      (!dn->get_linkage()->is_primary() || dn->get_linkage()->get_inode() != in)) {
+    dout(10) << __func__ << " different linkage in dentry " << *dn << dendl;
+  }
+
+  if (struct_v >= 2) {
+    __u32 replicated_state;
+    decode(replicated_state, p);
+    replicated_state &= CInode::MASK_STATE_REPLICATED;
+    if (replicated_state & CInode::STATE_RANDEPHEMERALPIN) {
+      dout(10) << "replica inode is random ephemeral pinned" << dendl;
+      in->set_ephemeral_pin(false, true);
+    }
+  }
+
+  DECODE_FINISH(p);
+}
+
+ 
+// Append to `bl` the replica chain leading to stray dentry `straydn` for
+// rank `who` (struct version 2, compat 1): my inode, the dirfrag and
+// dentry holding the stray directory's inode, that inode, the stray
+// dirfrag, the stray dentry and -- if its projected linkage is not null --
+// the projected inode.  `straydn` must hold auth pins (asserted).
+void MDCache::encode_replica_stray(CDentry *straydn, mds_rank_t who, bufferlist& bl)
+{
+  ceph_assert(straydn->get_num_auth_pins());
+  ENCODE_START(2, 1, bl);
+
+  const uint64_t features = mds->mdsmap->get_up_features();
+  CDir *straydir = straydn->get_dir();
+  CInode *straydir_in = straydir->inode;
+  CDentry *straydir_dn = straydir_in->get_parent_dn();
+
+  encode_replica_inode(get_myin(), who, bl, features);
+  encode_replica_dir(straydir_dn->get_dir(), who, bl);
+  encode_replica_dentry(straydir_dn, who, bl);
+  encode_replica_inode(straydir_in, who, bl, features);
+  encode_replica_dir(straydir, who, bl);
+  encode_replica_dentry(straydn, who, bl);
+
+  auto *projected = straydn->get_projected_linkage();
+  if (!projected->is_null()) {
+    encode_replica_inode(projected->get_inode(), who, bl, features);
+  }
+
+  ENCODE_FINISH(bl);
+}
+   
+// Decode the replica chain written by encode_replica_stray() from `bl`.
+// `straydn` receives the stray dentry; when `in` is non-null and the struct
+// version is at least 2, the trailing inode is decoded into *in.  Any
+// waiters collected while decoding are queued.
+void MDCache::decode_replica_stray(CDentry *&straydn, CInode **in, const bufferlist &bl, mds_rank_t from)
+{
+  MDSContext::vec woken;
+  auto p = bl.cbegin();
+
+  DECODE_START(2, p);
+
+  // mds inode -> its dirfrag -> stray dir dentry -> stray dir inode ->
+  // stray dirfrag -> stray dentry
+  CInode *mdsin = nullptr;
+  CDir *mdsdir = nullptr;
+  CDentry *straydirdn = nullptr;
+  CInode *strayin = nullptr;
+  CDir *straydir = nullptr;
+
+  decode_replica_inode(mdsin, p, NULL, woken);
+  decode_replica_dir(mdsdir, p, mdsin, from, woken);
+  decode_replica_dentry(straydirdn, p, mdsdir, woken);
+  decode_replica_inode(strayin, p, straydirdn, woken);
+  decode_replica_dir(straydir, p, strayin, from, woken);
+  decode_replica_dentry(straydn, p, straydir, woken);
+
+  if (struct_v >= 2 && in) {
+    decode_replica_inode(*in, p, straydn, woken);
+  }
+
+  if (!woken.empty()) {
+    mds->queue_waiters(woken);
+  }
+  DECODE_FINISH(p);
+}
+
+
+// Send an MDirUpdate describing `dir`'s replication settings (dir_rep and
+// dir_rep_by) to other ranks.  With `bcast` the recipients are the active
+// ranks that do NOT hold a replica; otherwise they are the replica
+// holders.  This rank itself is always skipped.  Always returns 0.
+int MDCache::send_dir_updates(CDir *dir, bool bcast)
+{
+  // this is an FYI, re: replication
+
+  set<mds_rank_t> replica_ranks;
+  for (const auto &r : dir->get_replicas()) {
+    replica_ranks.insert(r.first);
+  }
+
+  set<mds_rank_t> who;
+  if (bcast) {
+    set<mds_rank_t> active_ranks;
+    mds->get_mds_map()->get_active_mds_set(active_ranks);
+    std::set_difference(active_ranks.begin(), active_ranks.end(),
+                        replica_ranks.begin(), replica_ranks.end(),
+                        std::inserter(who, who.end()));
+  } else {
+    who.swap(replica_ranks);
+  }
+
+  dout(7) << "sending dir_update on " << *dir << " bcast " << bcast << " to " << who << dendl;
+
+  filepath path;
+  dir->inode->make_path(path);
+
+  std::set<int32_t> dir_rep_set(dir->dir_rep_by.begin(), dir->dir_rep_by.end());
+
+  const mds_rank_t whoami = mds->get_nodeid();
+  for (const mds_rank_t target : who) {
+    if (target == whoami) {
+      continue;
+    }
+    dout(7) << "sending dir_update on " << *dir << " to " << target << dendl;
+
+    auto update = make_message<MDirUpdate>(whoami, dir->dirfrag(), dir->dir_rep,
+					   dir_rep_set, path, bcast);
+    mds->send_message_mds(update, target);
+  }
+
+  return 0;
+}
+
+// Handle an MDirUpdate.  If the dirfrag is cached and this message has not
+// already gone through a discover attempt, copy dir_rep / dir_rep_by from
+// the message.  If the dirfrag is not cached and the message asks for it,
+// try (once) to discover it by traversing the path carried in the message.
+void MDCache::handle_dir_update(const cref_t<MDirUpdate> &m)
+{
+  dirfrag_t df = m->get_dirfrag();
+  CDir *dir = get_dirfrag(df);
+
+  if (dir) {
+    if (!m->has_tried_discover()) {
+      // Update if it already exists. Otherwise it got updated by discover reply.
+      dout(5) << "dir_update on " << *dir << dendl;
+      dir->dir_rep = m->get_dir_rep();
+      dir->dir_rep_by.clear();
+      for (const auto &rank : m->get_dir_rep_by()) {
+	dir->dir_rep_by.insert(rank);
+      }
+    }
+    return;
+  }
+
+  dout(5) << "dir_update on " << df << ", don't have it" << dendl;
+
+  // discover it?
+  if (!m->should_discover()) {
+    return;
+  }
+
+  // only try once!
+  // this is key to avoid a fragtree update race, among other things.
+  m->inc_tried_discover();
+
+  vector<CDentry*> trace;
+  CInode *in = nullptr;
+  filepath path = m->get_path();
+  dout(5) << "trying discover on dir_update for " << path << dendl;
+  CF_MDS_RetryMessageFactory cf(mds, m);
+  MDRequestRef null_ref;
+  const int r = path_traverse(null_ref, cf, path, MDS_TRAVERSE_DISCOVER, &trace, &in);
+  if (r != 0) {
+    // positive and negative results both end handling here
+    return;
+  }
+
+  if (in->ino() == df.ino &&
+      in->get_approx_dirfrag(df.frag) == NULL) {
+    open_remote_dirfrag(in, df.frag, new C_MDS_RetryMessage(mds, m));
+  }
+}
+
+
+
+
+
+// LINK
+
+// Append the remote link described by `dnl` to `bl` (struct version 1):
+// the remote inode number followed by the remote d_type as one byte.
+void MDCache::encode_remote_dentry_link(CDentry::linkage_t *dnl, bufferlist& bl)
+{
+  ENCODE_START(1, 1, bl);
+
+  inodeno_t remote_ino = dnl->get_remote_ino();
+  __u8 remote_type = dnl->get_remote_d_type();
+
+  encode(remote_ino, bl);
+  encode(remote_type, bl);
+
+  ENCODE_FINISH(bl);
+}
+
+// Decode a remote link (inode number + d_type) from `p` and attach it to
+// dentry `dn` through `dir`.
+void MDCache::decode_remote_dentry_link(CDir *dir, CDentry *dn, bufferlist::const_iterator& p)
+{
+  DECODE_START(1, p);
+
+  inodeno_t remote_ino;
+  decode(remote_ino, p);
+  __u8 remote_type;
+  decode(remote_type, p);
+
+  dout(10) << __func__ << "  remote " << remote_ino << " " << remote_type << dendl;
+  dir->link_remote_inode(dn, remote_ino, remote_type);
+
+  DECODE_FINISH(p);
+}
+
+// Tell the replicas of `dn` about its new linkage with an MDentryLink.
+// Skipped are ranks already recorded as witnesses of `mdr`, ranks whose
+// state is below REJOIN, and ranks in REJOIN that are still in
+// rejoin_gather.  A primary link carries a replica inode, a remote link
+// carries ino + d_type; a null linkage aborts.
+void MDCache::send_dentry_link(CDentry *dn, MDRequestRef& mdr)
+{
+  dout(7) << __func__ << " " << *dn << dendl;
+
+  CDir *subtree = get_subtree_root(dn->get_dir());
+  for (const auto &replica : dn->get_replicas()) {
+    const auto peer = replica.first;
+
+    // don't tell (rename) witnesses; they already know
+    if (mdr.get() && mdr->more()->witnessed.count(peer)) {
+      continue;
+    }
+
+    const auto peer_state = mds->mdsmap->get_state(peer);
+    const bool too_early = peer_state < MDSMap::STATE_REJOIN;
+    if (too_early ||
+	(peer_state == MDSMap::STATE_REJOIN && rejoin_gather.count(peer))) {
+      continue;
+    }
+
+    CDentry::linkage_t *dnl = dn->get_linkage();
+    auto link_msg = make_message<MDentryLink>(subtree->dirfrag(),
+					      dn->get_dir()->dirfrag(),
+					      dn->get_name(),
+					      dnl->is_primary());
+    if (dnl->is_primary()) {
+      dout(10) << __func__ << "  primary " << *dnl->get_inode() << dendl;
+      encode_replica_inode(dnl->get_inode(), peer, link_msg->bl,
+			   mds->mdsmap->get_up_features());
+    } else if (dnl->is_remote()) {
+      encode_remote_dentry_link(dnl, link_msg->bl);
+    } else {
+      ceph_abort();   // aie, bad caller!
+    }
+    mds->send_message_mds(link_msg, peer);
+  }
+}
+
+// Handle an MDentryLink for a dentry we replicate.  The dentry must be
+// cached, non-auth and currently null-linked (asserted); a missing dirfrag
+// or dentry aborts.  A primary link decodes a replica inode and links it,
+// a remote link decodes ino + d_type.  Collected waiters are queued.
+void MDCache::handle_dentry_link(const cref_t<MDentryLink> &m)
+{
+  CDir *dir = get_dirfrag(m->get_dirfrag());
+  CDentry *dn = dir ? dir->lookup(m->get_dn()) : nullptr;
+
+  if (!dir) {
+    dout(7) << __func__ << " don't have dirfrag " << m->get_dirfrag() << dendl;
+  } else if (!dn) {
+    dout(7) << __func__ << " don't have dentry " << *dir << " dn " << m->get_dn() << dendl;
+  } else {
+    dout(7) << __func__ << " on " << *dn << dendl;
+    CDentry::linkage_t *dnl = dn->get_linkage();
+
+    ceph_assert(!dn->is_auth());
+    ceph_assert(dnl->is_null());
+  }
+
+  // nothing to link into
+  if (!dn) {
+    ceph_abort();
+  }
+
+  auto p = m->bl.cbegin();
+  MDSContext::vec woken;
+  if (m->get_is_primary()) {
+    // primary link.
+    CInode *in = nullptr;
+    decode_replica_inode(in, p, dn, woken);
+  } else {
+    // remote link, easy enough.
+    decode_remote_dentry_link(dir, dn, p);
+  }
+
+  if (!woken.empty()) {
+    mds->queue_waiters(woken);
+  }
+}
+
+
+// UNLINK
+
+// Share an unlink of `dn` with the ranks replicating it (and, when a stray
+// dentry is given, with the ranks replicating that too).
+//
+// With `straydn`, each message carries the encoded stray chain and the
+// stray inode's snap blob.  With `unlinking` (straydn must be null;
+// asserted), every message sent bumps dn->replica_unlinking_ref and takes
+// a PIN_WAITUNLINKSTATE reference; if any were sent, `mdr` is retried once
+// the dentry's WAIT_UNLINK_STATE waiters are woken.  Witnesses of `mdr`
+// and ranks below REJOIN (or in REJOIN and still in rejoin_gather) are
+// skipped.
+void MDCache::send_dentry_unlink(CDentry *dn, CDentry *straydn,
+                                 MDRequestRef& mdr, bool unlinking)
+{
+  dout(10) << __func__ << " " << *dn << dendl;
+
+  // share unlink news with replicas
+  set<mds_rank_t> targets;
+  dn->list_replicas(targets);
+
+  bufferlist snapbl;
+  if (straydn) {
+    straydn->list_replicas(targets);
+    CInode *strayin = straydn->get_linkage()->get_inode();
+    strayin->encode_snap_blob(snapbl);
+  }
+
+  if (unlinking) {
+    ceph_assert(!straydn);
+    dn->replica_unlinking_ref = 0;
+  }
+
+  for (const mds_rank_t peer : targets) {
+    // don't tell (rmdir) witnesses; they already know
+    if (mdr.get() && mdr->more()->witnessed.count(peer)) {
+      continue;
+    }
+
+    const auto peer_state = mds->mdsmap->get_state(peer);
+    if (peer_state < MDSMap::STATE_REJOIN ||
+	(peer_state == MDSMap::STATE_REJOIN && rejoin_gather.count(peer))) {
+      continue;
+    }
+
+    auto unlink_msg = make_message<MDentryUnlink>(dn->get_dir()->dirfrag(),
+						  dn->get_name(), unlinking);
+    if (straydn) {
+      encode_replica_stray(straydn, peer, unlink_msg->straybl);
+      unlink_msg->snapbl = snapbl;
+    }
+    mds->send_message_mds(unlink_msg, peer);
+
+    if (unlinking) {
+      // one outstanding message per notified replica
+      dn->replica_unlinking_ref++;
+      dn->get(CDentry::PIN_WAITUNLINKSTATE);
+    }
+  }
+
+  if (unlinking && dn->replica_unlinking_ref) {
+    dn->add_waiter(CDentry::WAIT_UNLINK_STATE, new C_MDS_RetryRequest(this, mdr));
+  }
+}
+
+/**
+ * Handle an MDentryUnlink message.
+ *
+ * If the message carries a stray blob, the stray dentry/inode replica is
+ * decoded first.  For an "unlinking" notification (m->is_unlinking()) the
+ * dentry, if cached, is only flagged STATE_UNLINKING and an MDentryUnlinkAck
+ * is sent back on the message's connection; the ack is sent even when the
+ * dirfrag or dentry is not cached.  Otherwise the cached dentry is unlinked,
+ * and a primary inode is relinked under the decoded stray dentry.
+ *
+ * @param m  the unlink notification
+ */
+void MDCache::handle_dentry_unlink(const cref_t<MDentryUnlink> &m)
+{
+  // straydn
+  CDentry *straydn = nullptr;
+  CInode *strayin = nullptr;
+
+  if (m->straybl.length())
+    decode_replica_stray(straydn, &strayin, m->straybl, mds_rank_t(m->get_source().num()));
+
+  // NOTE(review): these locals are declared without initializers, presumably
+  // so that the `goto ack` jumps below do not bypass an initialization.
+  boost::intrusive_ptr<MDentryUnlinkAck> ack;
+  CDentry::linkage_t *dnl;
+  CDentry *dn;
+  CInode *in;
+  bool hadrealm;
+
+  CDir *dir = get_dirfrag(m->get_dirfrag());
+  if (!dir) {
+    dout(7) << __func__ << " don't have dirfrag " << m->get_dirfrag() << dendl;
+    // an unlinking notification is acked even if we have nothing cached
+    if (m->is_unlinking())
+      goto ack;
+  } else {
+    dn = dir->lookup(m->get_dn());
+    if (!dn) {
+      dout(7) << __func__ << " don't have dentry " << *dir << " dn " << m->get_dn() << dendl;
+      if (m->is_unlinking())
+        goto ack;
+    } else {
+      dout(7) << __func__ << " on " << *dn << dendl;
+
+      // first phase: just remember that an unlink is in progress and ack
+      if (m->is_unlinking()) {
+        dn->state_set(CDentry::STATE_UNLINKING);
+        goto ack;
+      }
+
+      dnl = dn->get_linkage();
+
+      // open inode?
+      if (dnl->is_primary()) {
+	// move the inode from dn to the stray dentry
+	in = dnl->get_inode();
+	dn->dir->unlink_inode(dn);
+	ceph_assert(straydn);
+	straydn->dir->link_primary_inode(straydn, in);
+
+	// in->first is lazily updated on replica; drag it forward so
+	// that we always keep it in sync with the dn
+	ceph_assert(straydn->first >= in->first);
+	in->first = straydn->first;
+
+	// update subtree map?
+	if (in->is_dir()) {
+	  adjust_subtree_after_rename(in, dir, false);
+	}
+
+	if (m->snapbl.length()) {
+	  // remember whether a snaprealm existed before decoding the blob;
+	  // a newly created realm triggers a SPLIT notification
+	  hadrealm = (in->snaprealm ? true : false);
+	  in->decode_snap_blob(m->snapbl);
+	  ceph_assert(in->snaprealm);
+	  if (!hadrealm)
+	    do_realm_invalidate_and_update_notify(in, CEPH_SNAP_OP_SPLIT, false);
+	}
+
+	// send caps to auth (if we're not already)
+	if (in->is_any_caps() &&
+	    !in->state_test(CInode::STATE_EXPORTINGCAPS))
+	  migrator->export_caps(in);
+
+	// the stray dentry is in use now; skip the trim below
+	straydn = NULL;
+      } else {
+	// remote link: no stray dentry is expected
+	ceph_assert(!straydn);
+	ceph_assert(dnl->is_remote());
+	dn->dir->unlink_inode(dn);
+      }
+      ceph_assert(dnl->is_null());
+      dn->state_clear(CDentry::STATE_UNLINKING);
+    }
+  }
+
+  // race with trim_dentry()
+  // (a stray dentry that was decoded but not used above is trimmed again)
+  if (straydn) {
+    ceph_assert(straydn->get_num_ref() == 0);
+    ceph_assert(straydn->get_linkage()->is_null());
+    expiremap ex;
+    trim_dentry(straydn, ex);
+    send_expire_messages(ex);
+  }
+  return;
+
+ack:
+  // reply to the sender of an unlinking notification
+  ack = make_message<MDentryUnlinkAck>(m->get_dirfrag(), m->get_dn());
+  mds->send_message(ack, m->get_connection());
+}
+
+/**
+ * Handle an MDentryUnlinkAck message.
+ *
+ * Decrements the dentry's replica_unlinking_ref; once it reaches zero the
+ * WAIT_UNLINK_STATE waiters are queued.  The PIN_WAITUNLINKSTATE reference
+ * is dropped for each ack.  Acks for a dirfrag or dentry that is not cached
+ * are only logged.
+ *
+ * @param m  the acknowledgement
+ */
+void MDCache::handle_dentry_unlink_ack(const cref_t<MDentryUnlinkAck> &m)
+{
+  CDir *dir = get_dirfrag(m->get_dirfrag());
+  if (!dir) {
+    dout(7) << __func__ << " don't have dirfrag " << m->get_dirfrag() << dendl;
+    return;
+  }
+
+  CDentry *dn = dir->lookup(m->get_dn());
+  if (!dn) {
+    dout(7) << __func__ << " don't have dentry " << *dir << " dn " << m->get_dn() << dendl;
+    return;
+  }
+
+  dout(7) << __func__ << " on " << *dn << " ref "
+          << dn->replica_unlinking_ref << " -> "
+          << dn->replica_unlinking_ref - 1 << dendl;
+
+  dn->replica_unlinking_ref--;
+  if (dn->replica_unlinking_ref == 0) {
+    // last outstanding ack: wake up whoever waits for the unlink state
+    MDSContext::vec finished;
+    dn->take_waiting(CDentry::WAIT_UNLINK_STATE, finished);
+    mds->queue_waiters(finished);
+  }
+  dn->put(CDentry::PIN_WAITUNLINKSTATE);
+}
+
+
+
+
+
+// ===================================================================
+
+
+
+// ===================================================================
+// FRAGMENT
+
+
+/**
+ * adjust_dir_fragments -- adjust fragmentation for a directory
+ *
+ * Convenience overload: collects the dirfrags that diri has under basefrag
+ * and forwards them, together with all other arguments, to the overload
+ * that takes an explicit list of source fragments.
+ *
+ * @param diri directory inode
+ * @param basefrag base fragment
+ * @param bits bit adjustment.  positive for split, negative for merge.
+ * @param resultfrags forwarded unchanged
+ * @param waiters forwarded unchanged
+ * @param replay forwarded unchanged
+ */
+void MDCache::adjust_dir_fragments(CInode *diri, frag_t basefrag, int bits,
+                                   std::vector<CDir*>* resultfrags,
+                                   MDSContext::vec& waiters,
+                                   bool replay)
+{
+  dout(10) << "adjust_dir_fragments " << basefrag << " " << bits
+           << " on " << *diri << dendl;
+
+  auto&& under = diri->get_dirfrags_under(basefrag);
+  const auto& srcfrags = under.second;
+
+  adjust_dir_fragments(diri, srcfrags, basefrag, bits, resultfrags, waiters, replay);
+}
+
+/**
+ * Make sure dirfrag fg of diri is open, refragmenting cached dirfrags if
+ * necessary.
+ *
+ * If the fragment is already open it is returned as is.  Otherwise the
+ * function first walks up from the fragtree branch/leaf containing fg and
+ * splits the first open ancestor it finds; failing that, it merges whatever
+ * open fragments exist under fg.
+ *
+ * @param diri    directory inode
+ * @param fg      fragment that should exist
+ * @param replay  forwarded to adjust_dir_fragments(); when false, collected
+ *                waiters are queued before returning
+ * @return the dirfrag for fg, or null if no suitable fragments were open
+ */
+CDir *MDCache::force_dir_fragment(CInode *diri, frag_t fg, bool replay)
+{
+  CDir *dir = diri->get_dirfrag(fg);
+  if (dir)
+    return dir;
+
+  dout(10) << "force_dir_fragment " << fg << " on " << *diri << dendl;
+
+  std::vector<CDir*> src, result;
+  MDSContext::vec waiters;
+
+  // split a parent?
+  frag_t parent = diri->dirfragtree.get_branch_or_leaf(fg);
+  while (1) {
+    CDir *pdir = diri->get_dirfrag(parent);
+    if (pdir) {
+      // split the open ancestor far enough that fg becomes one of its pieces
+      int split = fg.bits() - parent.bits();
+      dout(10) << " splitting parent by " << split << " " << *pdir << dendl;
+      src.push_back(pdir);
+      adjust_dir_fragments(diri, src, parent, split, &result, waiters, replay);
+      dir = diri->get_dirfrag(fg);
+      if (dir) {
+	dout(10) << "force_dir_fragment result " << *dir << dendl;
+	break;
+      }
+    }
+    // reached the root fragment without success
+    if (parent == frag_t())
+      break;
+    frag_t last = parent;
+    parent = parent.parent();
+    dout(10) << " " << last << " parent is " << parent << dendl;
+  }
+
+  if (!dir) {
+    // hoover up things under fg?
+    {
+      auto&& p = diri->get_dirfrags_under(fg);
+      src.insert(std::end(src), std::cbegin(p.second), std::cend(p.second));
+    }
+    if (src.empty()) {
+      dout(10) << "force_dir_fragment no frags under " << fg << dendl;
+    } else {
+      dout(10) << " will combine frags under " << fg << ": " << src << dendl;
+      // bits == 0 takes the merge path of adjust_dir_fragments()
+      adjust_dir_fragments(diri, src, fg, 0, &result, waiters, replay);
+      dir = result.front();
+      dout(10) << "force_dir_fragment result " << *dir << dendl;
+    }
+  }
+  if (!replay)
+    mds->queue_waiters(waiters);
+  return dir;
+}
+
+/**
+ * Refragment a directory: update the inode's fragtree and split or merge
+ * the given cached dirfrags, keeping the subtree map consistent.
+ *
+ * The fragtree is adjusted first (basefrag is forced to a leaf if needed,
+ * then split when bits > 0).  If srcfrags is empty nothing else happens.
+ * With bits > 0 the single source dirfrag is split and closed; otherwise
+ * the source dirfrags are merged into one newly allocated CDir for basefrag.
+ *
+ * @param diri         directory inode
+ * @param srcfrags     cached dirfrags to split (exactly one) or merge
+ * @param basefrag     base fragment
+ * @param bits         positive for split; zero or negative takes the merge path
+ * @param resultfrags  receives the resulting dirfrag(s)
+ * @param waiters      passed to CDir::split()/CDir::merge()
+ * @param replay       passed to CDir::split()/CDir::merge()
+ */
+void MDCache::adjust_dir_fragments(CInode *diri,
+				   const std::vector<CDir*>& srcfrags,
+				   frag_t basefrag, int bits,
+				   std::vector<CDir*>* resultfrags,
+				   MDSContext::vec& waiters,
+				   bool replay)
+{
+  dout(10) << "adjust_dir_fragments " << basefrag << " bits " << bits
+	   << " srcfrags " << srcfrags
+	   << " on " << *diri << dendl;
+
+  // adjust fragtree
+  // yuck.  we may have discovered the inode while it was being fragmented.
+  if (!diri->dirfragtree.is_leaf(basefrag))
+    diri->dirfragtree.force_to_leaf(g_ceph_context, basefrag);
+
+  if (bits > 0)
+    diri->dirfragtree.split(basefrag, bits);
+  dout(10) << " new fragtree is " << diri->dirfragtree << dendl;
+
+  // only the fragtree needed updating
+  if (srcfrags.empty())
+    return;
+
+  // split
+  // find the subtree that contains this directory's parent, if any
+  CDir *parent_dir = diri->get_parent_dir();
+  CDir *parent_subtree = 0;
+  if (parent_dir)
+    parent_subtree = get_subtree_root(parent_dir);
+
+  ceph_assert(srcfrags.size() >= 1);
+  if (bits > 0) {
+    // SPLIT
+    ceph_assert(srcfrags.size() == 1);
+    CDir *dir = srcfrags.front();
+
+    dir->split(bits, resultfrags, waiters, replay);
+
+    // did i change the subtree map?
+    if (dir->is_subtree_root()) {
+      // new frags are now separate subtrees
+      for (const auto& dir : *resultfrags) {
+	subtrees[dir].clear();   // new frag is now its own subtree
+      }
+      
+      // was i a bound?
+      if (parent_subtree) {
+	ceph_assert(subtrees[parent_subtree].count(dir));
+	subtrees[parent_subtree].erase(dir);
+	for (const auto& dir : *resultfrags) {
+	  ceph_assert(dir->is_subtree_root());
+	  subtrees[parent_subtree].insert(dir);
+	}
+      }
+      
+      // adjust my bounds.
+      // (each old bound is re-attached to whichever subtree root now
+      // contains its parent dirfrag)
+      set<CDir*> bounds;
+      bounds.swap(subtrees[dir]);
+      subtrees.erase(dir);
+      for (set<CDir*>::iterator p = bounds.begin();
+	   p != bounds.end();
+	   ++p) {
+	CDir *frag = get_subtree_root((*p)->get_parent_dir());
+	subtrees[frag].insert(*p);
+      }
+
+      show_subtrees(10);
+    }
+    
+    // the original dirfrag has been replaced by its pieces
+    diri->close_dirfrag(dir->get_frag());
+    
+  } else {
+    // MERGE
+
+    // are my constituent bits subtrees?  if so, i will be too.
+    // (it's all or none, actually.)
+    bool any_subtree = false, any_non_subtree = false;
+    for (const auto& dir : srcfrags) {
+      if (dir->is_subtree_root())
+	any_subtree = true;
+      else
+	any_non_subtree = true;
+    }
+    ceph_assert(!any_subtree || !any_non_subtree);
+
+    set<CDir*> new_bounds;
+    if (any_subtree)  {
+      for (const auto& dir : srcfrags) {
+	// this simplifies the code that finds subtrees underneath the dirfrag
+	if (!dir->is_subtree_root()) {
+	  dir->state_set(CDir::STATE_AUXSUBTREE);
+	  adjust_subtree_auth(dir, mds->get_nodeid());
+	}
+      }
+
+      for (const auto& dir : srcfrags) {
+	ceph_assert(dir->is_subtree_root());
+	dout(10) << " taking srcfrag subtree bounds from " << *dir << dendl;
+	// move every bound of this source subtree into new_bounds, then
+	// drop the source subtree's entry
+	map<CDir*, set<CDir*> >::iterator q = subtrees.find(dir);
+	set<CDir*>::iterator r = q->second.begin();
+	while (r != subtrees[dir].end()) {
+	  new_bounds.insert(*r);
+	  subtrees[dir].erase(r++);
+	}
+	subtrees.erase(q);
+
+	// remove myself as my parent's bound
+	if (parent_subtree)
+	  subtrees[parent_subtree].erase(dir);
+      }
+    }
+    
+    // merge
+    // the merged dirfrag takes its auth flag from the first source fragment
+    CDir *f = new CDir(diri, basefrag, this, srcfrags.front()->is_auth());
+    f->merge(srcfrags, waiters, replay);
+
+    if (any_subtree) {
+      ceph_assert(f->is_subtree_root());
+      subtrees[f].swap(new_bounds);
+      if (parent_subtree)
+	subtrees[parent_subtree].insert(f);
+      
+      show_subtrees(10);
+    }
+
+    resultfrags->push_back(f);
+  }
+}
+
+
+/**
+ * Completion context that calls MDCache::fragment_frozen() for a request,
+ * passing the completion code through.
+ */
+class C_MDC_FragmentFrozen : public MDSInternalContext {
+  MDCache *mdcache;
+  MDRequestRef mdr;
+
+public:
+  C_MDC_FragmentFrozen(MDCache *cache, MDRequestRef& req)
+    : MDSInternalContext(cache->mds),
+      mdcache(cache),
+      mdr(req)
+  {}
+
+  void finish(int rc) override {
+    mdcache->fragment_frozen(mdr, rc);
+  }
+};
+
+/**
+ * Check whether the given dirfrags of diri may be split or merged right now.
+ *
+ * Refuses when the filesystem is read-only, the cluster is degraded, the
+ * directory lives in a stray directory, or it is an mdsdir / the .ceph
+ * directory.  Each dirfrag must additionally be auth, not bad, not being
+ * scrubbed, not already fragmenting and neither frozen nor freezing.
+ *
+ * @param diri  directory inode
+ * @param dirs  dirfrags that would take part in the operation
+ * @return true if fragmenting may proceed
+ */
+bool MDCache::can_fragment(CInode *diri, const std::vector<CDir*>& dirs)
+{
+  // global and per-inode vetoes
+  if (is_readonly()) {
+    dout(7) << "can_fragment: read-only FS, no fragmenting for now" << dendl;
+    return false;
+  }
+  if (mds->is_cluster_degraded()) {
+    dout(7) << "can_fragment: cluster degraded, no fragmenting for now" << dendl;
+    return false;
+  }
+
+  CDir *pdir = diri->get_parent_dir();
+  if (pdir && pdir->get_inode()->is_stray()) {
+    dout(7) << "can_fragment: i won't merge|split anything in stray" << dendl;
+    return false;
+  }
+  if (diri->is_mdsdir() || diri->ino() == CEPH_INO_CEPH) {
+    dout(7) << "can_fragment: i won't fragment mdsdir or .ceph" << dendl;
+    return false;
+  }
+
+  // per-dirfrag vetoes
+  for (CDir *frag : dirs) {
+    if (frag->scrub_is_in_progress()) {
+      dout(7) << "can_fragment: scrub in progress " << *frag << dendl;
+      return false;
+    }
+    if (frag->state_test(CDir::STATE_FRAGMENTING)) {
+      dout(7) << "can_fragment: already fragmenting " << *frag << dendl;
+      return false;
+    }
+    if (!frag->is_auth()) {
+      dout(7) << "can_fragment: not auth on " << *frag << dendl;
+      return false;
+    }
+    if (frag->is_bad()) {
+      dout(7) << "can_fragment: bad dirfrag " << *frag << dendl;
+      return false;
+    }
+    if (frag->is_frozen() || frag->is_freezing()) {
+      dout(7) << "can_fragment: can't merge, freezing|frozen.  wait for other exports to finish first." << dendl;
+      return false;
+    }
+  }
+
+  return true;
+}
+
+/**
+ * Start splitting an auth dirfrag by the given number of bits.
+ *
+ * The request is dropped (with a log message) when can_fragment() refuses
+ * or when the resulting fragment would have more than 24 bits.  Otherwise
+ * an internal FRAGMENTDIR request is created, recorded in `fragments`, the
+ * dirfrag is put into freezing state and the first mark+complete pass runs.
+ *
+ * @param dir   dirfrag to split; must be auth
+ * @param bits  number of bits to split by
+ */
+void MDCache::split_dir(CDir *dir, int bits)
+{
+  dout(7) << __func__ << " " << *dir << " bits " << bits << dendl;
+  ceph_assert(dir->is_auth());
+
+  std::vector<CDir*> dirs{dir};
+  if (!can_fragment(dir->inode, dirs)) {
+    dout(7) << __func__ << " cannot fragment right now, dropping" << dendl;
+    return;
+  }
+
+  if (dir->frag.bits() + bits > 24) {
+    dout(7) << __func__ << " frag bits > 24, dropping" << dendl;
+    return;
+  }
+
+  const dirfrag_t base = dir->dirfrag();
+  MDRequestRef mdr = request_start_internal(CEPH_MDS_OP_FRAGMENTDIR);
+  mdr->more()->fragment_base = base;
+
+  // register the in-flight operation
+  ceph_assert(fragments.count(base) == 0);
+  fragment_info_t& op = fragments[base];
+  op.mdr = mdr;
+  op.dirs.push_back(dir);
+  op.bits = bits;
+  op.last_cum_auth_pins_change = ceph_clock_now();
+
+  fragment_freeze_dirs(dirs);
+  // initial mark+complete pass
+  fragment_mark_and_complete(mdr);
+}
+
+/**
+ * Start merging all dirfrags of diri that lie under frag into one.
+ *
+ * Nothing happens when not all fragments under frag are cached, when frag
+ * is already a leaf of the fragtree, or when can_fragment() refuses.
+ * Otherwise an internal FRAGMENTDIR request is created, recorded in
+ * `fragments` with a negative bit count, the dirfrags are put into freezing
+ * state and the first mark+complete pass runs.
+ *
+ * @param diri  directory inode
+ * @param frag  fragment to merge into
+ */
+void MDCache::merge_dir(CInode *diri, frag_t frag)
+{
+  dout(7) << "merge_dir to " << frag << " on " << *diri << dendl;
+
+  auto&& [all, dirs] = diri->get_dirfrags_under(frag);
+  if (!all) {
+    dout(7) << "don't have all frags under " << frag << " for " << *diri << dendl;
+    return;
+  }
+
+  if (diri->dirfragtree.is_leaf(frag)) {
+    dout(10) << " " << frag << " already a leaf for " << *diri << dendl;
+    return;
+  }
+
+  if (!can_fragment(diri, dirs))
+    return;
+
+  // the depth difference between an existing fragment and the target
+  const int merge_bits = dirs.front()->get_frag().bits() - frag.bits();
+  dout(10) << " we are merging by " << merge_bits << " bits" << dendl;
+
+  const dirfrag_t base(diri->ino(), frag);
+  MDRequestRef mdr = request_start_internal(CEPH_MDS_OP_FRAGMENTDIR);
+  mdr->more()->fragment_base = base;
+
+  // register the in-flight operation; merges carry a negative bit count
+  ceph_assert(fragments.count(base) == 0);
+  fragment_info_t& op = fragments[base];
+  op.mdr = mdr;
+  op.dirs = dirs;
+  op.bits = -merge_bits;
+  op.last_cum_auth_pins_change = ceph_clock_now();
+
+  fragment_freeze_dirs(dirs);
+  // initial mark+complete pass
+  fragment_mark_and_complete(mdr);
+}
+
+/**
+ * Auth-pin the given dirfrags, mark them STATE_FRAGMENTING and start
+ * freezing them.
+ *
+ * If the set mixes subtree roots with non-roots, the non-roots are turned
+ * into auxiliary subtrees owned by this rank so that all of them are
+ * subtree roots.
+ *
+ * @param dirs  dirfrags taking part in a fragment operation
+ */
+void MDCache::fragment_freeze_dirs(const std::vector<CDir*>& dirs)
+{
+  bool saw_subtree = false;
+  bool saw_non_subtree = false;
+
+  for (CDir *frag : dirs) {
+    frag->auth_pin(frag);  // until we mark and complete them
+    frag->state_set(CDir::STATE_FRAGMENTING);
+    frag->freeze_dir();
+    ceph_assert(frag->is_freezing_dir());
+
+    if (frag->is_subtree_root())
+      saw_subtree = true;
+    else
+      saw_non_subtree = true;
+  }
+
+  // homogeneous set: nothing to fix up
+  if (!saw_subtree || !saw_non_subtree)
+    return;
+
+  // either all dirfrags are subtree roots or all are not.
+  for (CDir *frag : dirs) {
+    if (!frag->is_subtree_root()) {
+      frag->state_set(CDir::STATE_AUXSUBTREE);
+      adjust_subtree_auth(frag, mds->get_nodeid());
+    } else {
+      ceph_assert(frag->state_test(CDir::STATE_AUXSUBTREE));
+    }
+  }
+}
+
+/**
+ * Completion context that re-runs MDCache::fragment_mark_and_complete()
+ * for a request; the completion code is ignored.
+ */
+class C_MDC_FragmentMarking : public MDCacheContext {
+  MDRequestRef mdr;
+
+public:
+  C_MDC_FragmentMarking(MDCache *cache, MDRequestRef& req)
+    : MDCacheContext(cache), mdr(req)
+  {}
+
+  void finish(int r) override {
+    mdcache->fragment_mark_and_complete(mdr);
+  }
+};
+
+/**
+ * Mark+complete pass of a fragment operation.
+ *
+ * For every source dirfrag of the operation: fetch it if incomplete, wait
+ * for or commit a new root fragment, and otherwise pin all of its dentries
+ * (PIN_FRAGMENTING / STATE_FRAGMENTING) and drop the dirfrag's auth pin.
+ * If anything had to be waited for, the pass is re-run from a
+ * C_MDC_FragmentMarking context.  Once all dirfrags are marked, the function
+ * waits for them to become frozen and then calls fragment_frozen().
+ *
+ * @param mdr  the internal fragment request; finished immediately when the
+ *             operation is no longer registered in `fragments`
+ */
+void MDCache::fragment_mark_and_complete(MDRequestRef& mdr)
+{
+  dirfrag_t basedirfrag = mdr->more()->fragment_base;
+  map<dirfrag_t,fragment_info_t>::iterator it = fragments.find(basedirfrag);
+  // the entry is gone or belongs to a different request
+  if (it == fragments.end() || it->second.mdr != mdr) {
+    dout(7) << "fragment_mark_and_complete " << basedirfrag << " must have aborted" << dendl;
+    request_finish(mdr);
+    return;
+  }
+
+  fragment_info_t& info = it->second;
+  CInode *diri = info.dirs.front()->get_inode();
+  dout(10) << "fragment_mark_and_complete " << info.dirs << " on " << *diri << dendl;
+
+  MDSGatherBuilder gather(g_ceph_context);
+  
+  for (const auto& dir : info.dirs) {
+    bool ready = true;
+    if (!dir->is_complete()) {
+      dout(15) << " fetching incomplete " << *dir << dendl;
+      dir->fetch(gather.new_sub(), true);  // ignore authpinnability
+      ready = false;
+    } else if (dir->get_frag() == frag_t()) {
+      // The COMPLETE flag gets lost if we fragment a new dirfrag, then rollback
+      // the operation. To avoid CDir::fetch() complaining about missing object,
+      // we commit new dirfrag first.
+      if (dir->state_test(CDir::STATE_CREATING)) {
+	dout(15) << " waiting until new dir gets journaled " << *dir << dendl;
+	dir->add_waiter(CDir::WAIT_CREATED, gather.new_sub());
+	ready = false;
+      } else if (dir->is_new()) {
+	dout(15) << " committing new " << *dir << dendl;
+	ceph_assert(dir->is_dirty());
+	dir->commit(0, gather.new_sub(), true);
+	ready = false;
+      }
+    }
+    if (!ready)
+      continue;
+
+    if (!dir->state_test(CDir::STATE_DNPINNEDFRAG)) {
+      dout(15) << " marking " << *dir << dendl;
+      // pin every dentry, then release the auth pin taken in
+      // fragment_freeze_dirs()
+      for (auto &p : dir->items) {
+	CDentry *dn = p.second;
+	dn->get(CDentry::PIN_FRAGMENTING);
+	ceph_assert(!dn->state_test(CDentry::STATE_FRAGMENTING));
+	dn->state_set(CDentry::STATE_FRAGMENTING);
+      }
+      dir->state_set(CDir::STATE_DNPINNEDFRAG);
+      dir->auth_unpin(dir);
+    } else {
+      dout(15) << " already marked " << *dir << dendl;
+    }
+  }
+  // something is still being fetched/committed: run this pass again later
+  if (gather.has_subs()) {
+    gather.set_finisher(new C_MDC_FragmentMarking(this, mdr));
+    gather.activate();
+    return;
+  }
+
+  // everything is marked; now wait for any dirfrag that is not frozen yet
+  for (const auto& dir : info.dirs) {
+    if (!dir->is_frozen_dir()) {
+      ceph_assert(dir->is_freezing_dir());
+      dir->add_waiter(CDir::WAIT_FROZEN, gather.new_sub());
+    }
+  }
+  if (gather.has_subs()) {
+    gather.set_finisher(new C_MDC_FragmentFrozen(this, mdr));
+    gather.activate();
+    // flush log so that request auth_pins are retired
+    mds->mdlog->flush();
+    return;
+  }
+
+  // all dirfrags are already frozen
+  fragment_frozen(mdr, 0);
+}
+
+/**
+ * Undo the marking done for a fragment operation on the given dirfrags.
+ *
+ * Clears STATE_FRAGMENTING on each dirfrag.  Dirfrags whose dentries were
+ * already pinned (STATE_DNPINNEDFRAG) get those dentry pins and flags
+ * removed; the others still hold their auth pin, which is dropped instead.
+ * Finally each dirfrag is unfrozen.
+ *
+ * @param dirs  dirfrags of a cancelled fragment operation
+ */
+void MDCache::fragment_unmark_unfreeze_dirs(const std::vector<CDir*>& dirs)
+{
+  dout(10) << "fragment_unmark_unfreeze_dirs " << dirs << dendl;
+
+  for (CDir *frag : dirs) {
+    dout(10) << " frag " << *frag << dendl;
+
+    ceph_assert(frag->state_test(CDir::STATE_FRAGMENTING));
+    frag->state_clear(CDir::STATE_FRAGMENTING);
+
+    if (!frag->state_test(CDir::STATE_DNPINNEDFRAG)) {
+      // dentries were never pinned, so the dirfrag auth pin is still held
+      frag->auth_unpin(frag);
+    } else {
+      frag->state_clear(CDir::STATE_DNPINNEDFRAG);
+
+      for (auto &item : frag->items) {
+        CDentry *dn = item.second;
+        ceph_assert(dn->state_test(CDentry::STATE_FRAGMENTING));
+        dn->state_clear(CDentry::STATE_FRAGMENTING);
+        dn->put(CDentry::PIN_FRAGMENTING);
+      }
+    }
+
+    frag->unfreeze_dir();
+  }
+}
+
+/**
+ * Report whether the fragment operation covering a frozen dirfrag has all
+ * of its dirfrags frozen.
+ *
+ * Scans the `fragments` entries of the dirfrag's inode for one whose base
+ * fragment contains the dirfrag's fragment.  Aborts if there is none.
+ *
+ * @param dir  a dirfrag that is frozen
+ * @return the all_frozen flag of the matching fragment operation
+ */
+bool MDCache::fragment_are_all_frozen(CDir *dir)
+{
+  ceph_assert(dir->is_frozen_dir());
+
+  auto it = fragments.lower_bound(dirfrag_t(dir->ino(), 0));
+  while (it != fragments.end() && it->first.ino == dir->ino()) {
+    if (it->first.frag.contains(dir->get_frag()))
+      return it->second.all_frozen;
+    ++it;
+  }
+
+  // caller asked about a dirfrag that is not being fragmented
+  ceph_abort();
+  return false;
+}
+
+/**
+ * Increment num_remote_waiters on the fragment operation that covers the
+ * given dirfrag.
+ *
+ * Scans the `fragments` entries of the dirfrag's inode for one whose base
+ * fragment contains the dirfrag's fragment.  Aborts if there is none.
+ *
+ * @param dir  dirfrag taking part in a fragment operation
+ */
+void MDCache::fragment_freeze_inc_num_waiters(CDir *dir)
+{
+  auto it = fragments.lower_bound(dirfrag_t(dir->ino(), 0));
+  while (it != fragments.end() && it->first.ino == dir->ino()) {
+    if (it->first.frag.contains(dir->get_frag())) {
+      it->second.num_remote_waiters++;
+      return;
+    }
+    ++it;
+  }
+
+  // caller asked about a dirfrag that is not being fragmented
+  ceph_abort();
+}
+
+/**
+ * Cancel fragment operations whose freeze has made no progress.
+ *
+ * For every registered operation that is not fully frozen and whose
+ * dirfrags are all marked (STATE_DNPINNEDFRAG), the auth pins of the
+ * not-yet-frozen dirfrags are summed.  If that sum has not changed since
+ * before `now - mds_freeze_tree_timeout`, and either remote waiters exist
+ * or the parent dirfrag is itself freezing, the operation is removed from
+ * `fragments` and its dirfrags are unmarked and unfrozen.
+ */
+void MDCache::find_stale_fragment_freeze()
+{
+  dout(10) << "find_stale_fragment_freeze" << dendl;
+  // see comment in Migrator::find_stale_export_freeze()
+  utime_t now = ceph_clock_now();
+  utime_t cutoff = now;
+  cutoff -= g_conf()->mds_freeze_tree_timeout;
+
+  for (map<dirfrag_t,fragment_info_t>::iterator p = fragments.begin();
+       p != fragments.end(); ) {
+    dirfrag_t df = p->first;
+    fragment_info_t& info = p->second;
+    // advance before the body, which may erase the current entry
+    ++p;
+    if (info.all_frozen)
+      continue;
+    CDir *dir;
+    int total_auth_pins = 0;
+    for (const auto& d : info.dirs) {
+      dir = d;
+      // -1 marks "not every dirfrag has been marked yet"; skip this entry
+      if (!dir->state_test(CDir::STATE_DNPINNEDFRAG)) {
+	total_auth_pins = -1;
+	break;
+      }
+      if (dir->is_frozen_dir())
+	continue;
+      total_auth_pins += dir->get_auth_pins() + dir->get_dir_auth_pins();
+    }
+    if (total_auth_pins < 0)
+      continue;
+    // the pin count moved since the last check: record it and restart the clock
+    if (info.last_cum_auth_pins != total_auth_pins) {
+      info.last_cum_auth_pins = total_auth_pins;
+      info.last_cum_auth_pins_change = now;
+      continue;
+    }
+    // not stale for long enough yet
+    if (info.last_cum_auth_pins_change >= cutoff)
+      continue;
+    dir = info.dirs.front();
+    if (info.num_remote_waiters > 0 ||
+	(!dir->inode->is_root() && dir->get_parent_dir()->is_freezing())) {
+      dout(10) << " cancel fragmenting " << df << " bit " << info.bits << dendl;
+      // take the dirfrag list out of the entry before erasing it
+      std::vector<CDir*> dirs;
+      info.dirs.swap(dirs);
+      fragments.erase(df);
+      fragment_unmark_unfreeze_dirs(dirs);
+    }
+  }
+}
+
+/**
+ * Log completion context that calls MDCache::_fragment_logged() for a
+ * request; the completion code is ignored.
+ */
+class C_MDC_FragmentPrep : public MDCacheLogContext {
+  MDRequestRef mdr;
+
+public:
+  C_MDC_FragmentPrep(MDCache *cache, MDRequestRef& req)
+    : MDCacheLogContext(cache), mdr(req)
+  {}
+
+  void finish(int r) override {
+    mdcache->_fragment_logged(mdr);
+  }
+};
+
+/**
+ * Completion context that calls MDCache::_fragment_stored() for a request;
+ * the completion code is ignored.
+ */
+class C_MDC_FragmentStore : public MDCacheContext {
+  MDRequestRef mdr;
+
+public:
+  C_MDC_FragmentStore(MDCache *cache, MDRequestRef& req)
+    : MDCacheContext(cache), mdr(req)
+  {}
+
+  void finish(int r) override {
+    mdcache->_fragment_stored(mdr);
+  }
+};
+
+/**
+ * Log completion context that calls MDCache::_fragment_committed() with the
+ * stored base dirfrag and request; the completion code is ignored.
+ */
+class C_MDC_FragmentCommit : public MDCacheLogContext {
+  dirfrag_t basedirfrag;
+  MDRequestRef mdr;
+
+public:
+  C_MDC_FragmentCommit(MDCache *cache, dirfrag_t base, const MDRequestRef& req)
+    : MDCacheLogContext(cache),
+      basedirfrag(base),
+      mdr(req)
+  {}
+
+  void finish(int r) override {
+    mdcache->_fragment_committed(basedirfrag, mdr);
+  }
+};
+
+/**
+ * I/O completion context that calls MDCache::_fragment_old_purged() with
+ * the stored base dirfrag, bit count and request.  The completion code must
+ * be 0 or -CEPHFS_ENOENT.
+ */
+class C_IO_MDC_FragmentPurgeOld : public MDCacheIOContext {
+  dirfrag_t basedirfrag;
+  int bits;
+  MDRequestRef mdr;
+
+public:
+  C_IO_MDC_FragmentPurgeOld(MDCache *cache, dirfrag_t base, int nbits,
+                            const MDRequestRef& req)
+    : MDCacheIOContext(cache),
+      basedirfrag(base),
+      bits(nbits),
+      mdr(req)
+  {}
+
+  void finish(int rc) override {
+    ceph_assert(rc == 0 || rc == -CEPHFS_ENOENT);
+    mdcache->_fragment_old_purged(basedirfrag, bits, mdr);
+  }
+
+  void print(ostream& out) const override {
+    out << "fragment_purge_old(" << basedirfrag << ")";
+  }
+};
+
+/**
+ * Called once every source dirfrag of a fragment operation is frozen.
+ *
+ * Marks the operation as all_frozen and hands the request over to
+ * dispatch_fragment_dir().  If the operation is no longer registered in
+ * `fragments` for this request, the request is simply finished.
+ *
+ * @param mdr  the internal fragment request
+ * @param r    completion code of the freeze wait; must be 0
+ */
+void MDCache::fragment_frozen(MDRequestRef& mdr, int r)
+{
+  dirfrag_t basedirfrag = mdr->more()->fragment_base;
+  map<dirfrag_t,fragment_info_t>::iterator it = fragments.find(basedirfrag);
+  // the entry is gone or belongs to a different request
+  if (it == fragments.end() || it->second.mdr != mdr) {
+    dout(7) << "fragment_frozen " << basedirfrag << " must have aborted" << dendl;
+    request_finish(mdr);
+    return;
+  }
+
+  ceph_assert(r == 0);
+  fragment_info_t& info = it->second;
+  // print the inode itself (not its pointer value), as the sibling
+  // fragment_* functions do
+  dout(10) << "fragment_frozen " << basedirfrag.frag << " by " << info.bits
+	   << " on " << *info.dirs.front()->get_inode() << dendl;
+
+  info.all_frozen = true;
+  dispatch_fragment_dir(mdr);
+}
+
+/**
+ * Perform the in-memory refragmentation for a fragment operation and
+ * journal its PREPARE event.
+ *
+ * Acquires a wrlock on the inode's dirfragtreelock (plus scatter-gather
+ * locks on nestlock and filelock).  If the request was aborted, the
+ * dirfrag is requeued with the balancer, the dirfrags are unmarked and
+ * unfrozen and the operation is dropped.  Otherwise an EFragment
+ * OP_PREPARE entry is built (including rollback data for the original
+ * dirfrags), the dirfrags are split/merged via adjust_dir_fragments(), and
+ * the entry is submitted with a C_MDC_FragmentPrep completion.
+ *
+ * @param mdr  the internal fragment request
+ */
+void MDCache::dispatch_fragment_dir(MDRequestRef& mdr)
+{
+  dirfrag_t basedirfrag = mdr->more()->fragment_base;
+  map<dirfrag_t,fragment_info_t>::iterator it = fragments.find(basedirfrag);
+  // the entry is gone or belongs to a different request
+  if (it == fragments.end() || it->second.mdr != mdr) {
+    dout(7) << "dispatch_fragment_dir " << basedirfrag << " must have aborted" << dendl;
+    request_finish(mdr);
+    return;
+  }
+
+  fragment_info_t& info = it->second;
+  CInode *diri = info.dirs.front()->get_inode();
+
+  dout(10) << "dispatch_fragment_dir " << basedirfrag << " bits " << info.bits
+	   << " on " << *diri << dendl;
+
+  // a peer error is treated like an abort
+  if (mdr->more()->peer_error)
+    mdr->aborted = true;
+
+  if (!mdr->aborted) {
+    MutationImpl::LockOpVec lov;
+    lov.add_wrlock(&diri->dirfragtreelock);
+    // prevent a racing gather on any other scatterlocks too
+    lov.lock_scatter_gather(&diri->nestlock);
+    lov.lock_scatter_gather(&diri->filelock);
+    if (!mds->locker->acquire_locks(mdr, lov, NULL, true)) {
+      // locks not acquired: stop here unless the request got aborted, in
+      // which case fall through to the cleanup below
+      if (!mdr->aborted)
+	return;
+    }
+  }
+
+  if (mdr->aborted) {
+    dout(10) << " can't auth_pin " << *diri << ", requeuing dir "
+	     << info.dirs.front()->dirfrag() << dendl;
+    // hand the dirfrag back to the balancer and undo the freeze
+    if (info.bits > 0)
+      mds->balancer->queue_split(info.dirs.front(), false);
+    else
+      mds->balancer->queue_merge(info.dirs.front());
+    fragment_unmark_unfreeze_dirs(info.dirs);
+    fragments.erase(it);
+    request_finish(mdr);
+    return;
+  }
+
+  mdr->ls = mds->mdlog->get_current_segment();
+  EFragment *le = new EFragment(mds->mdlog, EFragment::OP_PREPARE, basedirfrag, info.bits);
+  mds->mdlog->start_entry(le);
+
+  // record each original dirfrag together with its fnode for rollback
+  for (const auto& dir : info.dirs) {
+    dirfrag_rollback rollback;
+    rollback.fnode = dir->fnode;
+    le->add_orig_frag(dir->get_frag(), &rollback);
+  }
+
+  // refragment
+  MDSContext::vec waiters;
+  adjust_dir_fragments(diri, info.dirs, basedirfrag.frag, info.bits,
+		       &info.resultfrags, waiters, false);
+  if (g_conf()->mds_debug_frag)
+    diri->verify_dirfrags();
+  mds->queue_waiters(waiters);
+
+  // none of the original fragments may still be a leaf of the fragtree
+  for (const auto& fg : le->orig_frags)
+    ceph_assert(!diri->dirfragtree.is_leaf(fg));
+
+  le->metablob.add_dir_context(info.resultfrags.front());
+  for (const auto& dir : info.resultfrags) {
+    if (diri->is_auth()) {
+      le->metablob.add_fragmented_dir(dir, false, false);
+    } else {
+      // we are not auth for the inode: flag the dirfrag STATE_DIRTYDFT and
+      // pass true as the last argument instead
+      dir->state_set(CDir::STATE_DIRTYDFT);
+      le->metablob.add_fragmented_dir(dir, false, true);
+    }
+  }
+
+  // dft lock
+  if (diri->is_auth()) {
+    // journal dirfragtree
+    auto pi = diri->project_inode(mdr);
+    pi.inode->version = diri->pre_dirty();
+    predirty_journal_parents(mdr, &le->metablob, diri, 0, PREDIRTY_PRIMARY);
+    journal_dirty_inode(mdr.get(), &le->metablob, diri);
+  } else {
+    mds->locker->mark_updated_scatterlock(&diri->dirfragtreelock);
+    mdr->ls->dirty_dirfrag_dirfragtree.push_back(&diri->item_dirty_dirfrag_dirfragtree);
+    mdr->add_updated_lock(&diri->dirfragtreelock);
+  }
+
+  /*
+  // filelock
+  mds->locker->mark_updated_scatterlock(&diri->filelock);
+  mut->ls->dirty_dirfrag_dir.push_back(&diri->item_dirty_dirfrag_dir);
+  mut->add_updated_lock(&diri->filelock);
+
+  // dirlock
+  mds->locker->mark_updated_scatterlock(&diri->nestlock);
+  mut->ls->dirty_dirfrag_nest.push_back(&diri->item_dirty_dirfrag_nest);
+  mut->add_updated_lock(&diri->nestlock);
+  */
+
+  add_uncommitted_fragment(basedirfrag, info.bits, le->orig_frags, mdr->ls);
+  mds->server->submit_mdlog_entry(le, new C_MDC_FragmentPrep(this, mdr),
+                                  mdr, __func__);
+  mds->mdlog->flush();
+}
+
+/**
+ * Called when the PREPARE event of a fragment operation has been logged.
+ *
+ * Applies the request's projected changes, then marks every resulting
+ * dirfrag dirty and new, auth-pins it, flags it STATE_FRAGMENTING and
+ * commits it.  When all commits complete, _fragment_stored() runs via
+ * C_MDC_FragmentStore.
+ *
+ * @param mdr  the internal fragment request; its operation must still be
+ *             registered in `fragments` (looked up with at())
+ */
+void MDCache::_fragment_logged(MDRequestRef& mdr)
+{
+  dirfrag_t basedirfrag = mdr->more()->fragment_base;
+  auto& info = fragments.at(basedirfrag);
+  CInode *diri = info.resultfrags.front()->get_inode();
+
+  dout(10) << "fragment_logged " << basedirfrag << " bits " << info.bits
+	   << " on " << *diri << dendl;
+  mdr->mark_event("prepare logged");
+
+  mdr->apply();  // mark scatterlock
+
+  // store resulting frags
+  MDSGatherBuilder gather(g_ceph_context, new C_MDC_FragmentStore(this, mdr));
+
+  for (const auto& dir : info.resultfrags) {
+    dout(10) << " storing result frag " << *dir << dendl;
+
+    dir->mark_dirty(mdr->ls);
+    dir->mark_new(mdr->ls);
+
+    // freeze and store them too
+    // (this auth pin is released in fragment_maybe_finish())
+    dir->auth_pin(this);
+    dir->state_set(CDir::STATE_FRAGMENTING);
+    dir->commit(0, gather.new_sub(), true);  // ignore authpinnability
+  }
+
+  gather.activate();
+}
+
+/**
+ * Called when the resulting dirfrags of a fragment operation have been
+ * stored.
+ *
+ * Sends an MMDSFragmentNotify (carrying replicas of the new dirfrags) to
+ * every eligible replica of the first resulting dirfrag, journals the
+ * COMMIT event, unpins the dentries and unfreezes the resulting dirfrags.
+ * Locks are dropped right away unless notify acks are outstanding.
+ *
+ * @param mdr  the internal fragment request; its operation must still be
+ *             registered in `fragments` (looked up with at())
+ */
+void MDCache::_fragment_stored(MDRequestRef& mdr)
+{
+  dirfrag_t basedirfrag = mdr->more()->fragment_base;
+  fragment_info_t &info = fragments.at(basedirfrag);
+  CDir *first = info.resultfrags.front();
+  CInode *diri = first->get_inode();
+
+  dout(10) << "fragment_stored " << basedirfrag << " bits " << info.bits
+	   << " on " << *diri << dendl;
+  mdr->mark_event("new frags stored");
+
+  // tell peers
+  // diri_auth is only set when we fragmented a subtree root whose inode is
+  // owned by another rank; that is the case in which acks are requested
+  mds_rank_t diri_auth = (first->is_subtree_root() && !diri->is_auth()) ?
+			  diri->authority().first : CDIR_AUTH_UNKNOWN;
+  for (const auto &p : first->get_replicas()) {
+    // skip ranks that have not reached rejoin, or that are in rejoin and
+    // still listed in rejoin_gather
+    if (mds->mdsmap->get_state(p.first) < MDSMap::STATE_REJOIN ||
+	(mds->mdsmap->get_state(p.first) == MDSMap::STATE_REJOIN &&
+	 rejoin_gather.count(p.first)))
+      continue;
+
+    auto notify = make_message<MMDSFragmentNotify>(basedirfrag, info.bits, mdr->reqid.tid);
+    if (diri_auth != CDIR_AUTH_UNKNOWN && // subtree root
+	diri_auth != p.first) { // not auth mds of diri
+      /*
+       * In the normal case, mds does not trim dir inode whose child dirfrags
+       * are likely being fragmented (see trim_inode()). But when fragmenting
+       * subtree roots, following race can happen:
+       *
+       * - mds.a (auth mds of dirfrag) sends fragment_notify message to
+       *   mds.c and drops wrlock on dirfragtreelock.
+       * - mds.b (auth mds of dir inode) changes dirfragtreelock state to
+       *   SYNC and send lock message mds.c
+       * - mds.c receives the lock message and changes dirfragtreelock state
+       *   to SYNC
+       * - mds.c trim dirfrag and dir inode from its cache
+       * - mds.c receives the fragment_notify message
+       *
+       * So we need to ensure replicas have received the notify, then unlock
+       * the dirfragtreelock.
+       */
+      notify->mark_ack_wanted();
+      info.notify_ack_waiting.insert(p.first);
+    }
+
+    // freshly replicate new dirs to peers
+    for (const auto& dir : info.resultfrags) {
+      encode_replica_dir(dir, p.first, notify->basebl);
+    }
+
+    mds->send_message_mds(notify, p.first);
+  }
+
+  // journal commit
+  EFragment *le = new EFragment(mds->mdlog, EFragment::OP_COMMIT, basedirfrag, info.bits);
+  mds->mdlog->start_submit_entry(le, new C_MDC_FragmentCommit(this, basedirfrag, mdr));
+
+
+  // unfreeze resulting frags
+  for (const auto& dir : info.resultfrags) {
+    dout(10) << " result frag " << *dir << dendl;
+
+    // drop the dentry pins taken in fragment_mark_and_complete()
+    for (auto &p : dir->items) {
+      CDentry *dn = p.second;
+      ceph_assert(dn->state_test(CDentry::STATE_FRAGMENTING));
+      dn->state_clear(CDentry::STATE_FRAGMENTING);
+      dn->put(CDentry::PIN_FRAGMENTING);
+    }
+
+    // unfreeze
+    dir->unfreeze_dir();
+  }
+
+  if (info.notify_ack_waiting.empty()) {
+    fragment_drop_locks(info);
+  } else {
+    // acks are still outstanding: only drop what must go for the unfreeze;
+    // the remaining locks are dropped in handle_fragment_notify_ack()
+    mds->locker->drop_locks_for_fragment_unfreeze(mdr.get());
+  }
+}
+
+/**
+ * Called when the COMMIT event of a fragment operation has been logged;
+ * removes the objects of the old dirfrags.
+ *
+ * For each old fragment recorded in uncommitted_fragments, an object
+ * mutation is issued against the metadata pool: the object of the root
+ * fragment (frag_t()) is truncated and its omap cleared, all other objects
+ * are removed.  When all mutations complete, _fragment_old_purged() runs
+ * via C_IO_MDC_FragmentPurgeOld on the MDS finisher.
+ *
+ * @param basedirfrag  base dirfrag of the operation; must be present in
+ *                     uncommitted_fragments (looked up with at())
+ * @param mdr          the internal fragment request; may be null
+ */
+void MDCache::_fragment_committed(dirfrag_t basedirfrag, const MDRequestRef& mdr)
+{
+  dout(10) << "fragment_committed " << basedirfrag << dendl;
+  if (mdr)
+    mdr->mark_event("commit logged");
+
+  ufragment &uf = uncommitted_fragments.at(basedirfrag);
+
+  // remove old frags
+  C_GatherBuilder gather(
+    g_ceph_context,
+    new C_OnFinisher(
+      new C_IO_MDC_FragmentPurgeOld(this, basedirfrag, uf.bits, mdr),
+      mds->finisher));
+
+  SnapContext nullsnapc;
+  object_locator_t oloc(mds->get_metadata_pool());
+  for (const auto& fg : uf.old_frags) {
+    object_t oid = CInode::get_object_name(basedirfrag.ino, fg, "");
+    ObjectOperation op;
+    if (fg == frag_t()) {
+      // backtrace object
+      // (emptied rather than removed)
+      dout(10) << " truncate orphan dirfrag " << oid << dendl;
+      op.truncate(0);
+      op.omap_clear();
+    } else {
+      dout(10) << " removing orphan dirfrag " << oid << dendl;
+      op.remove();
+    }
+    mds->objecter->mutate(oid, oloc, op, nullsnapc,
+			  ceph::real_clock::now(),
+			  0, gather.new_sub());
+  }
+
+  // there must have been at least one old fragment
+  ceph_assert(gather.has_subs());
+  gather.activate();
+}
+
+/**
+ * Called when the objects of the old dirfrags have been purged.
+ *
+ * Journals the FINISH event, finishes the uncommitted-fragment record and
+ * bumps the split/merge perf counter.  If a request is attached, the
+ * operation is flagged as finishing and completed right away unless notify
+ * acks are still outstanding.
+ *
+ * @param basedirfrag  base dirfrag of the operation
+ * @param bits         bit count of the operation; positive means split
+ * @param mdr          the internal fragment request; may be null
+ */
+void MDCache::_fragment_old_purged(dirfrag_t basedirfrag, int bits, const MDRequestRef& mdr)
+{
+  dout(10) << "fragment_old_purged " << basedirfrag << dendl;
+  if (mdr)
+    mdr->mark_event("old frags purged");
+
+  EFragment *le = new EFragment(mds->mdlog, EFragment::OP_FINISH, basedirfrag, bits);
+  mds->mdlog->start_submit_entry(le);
+
+  finish_uncommitted_fragment(basedirfrag, EFragment::OP_FINISH);
+
+  if (mds->logger) {
+    if (bits > 0) {
+      mds->logger->inc(l_mds_dir_split);
+    } else {
+      mds->logger->inc(l_mds_dir_merge);
+    }
+  }
+
+  if (mdr) {
+    auto it = fragments.find(basedirfrag);
+    ceph_assert(it != fragments.end());
+    it->second.finishing = true;
+    if (it->second.notify_ack_waiting.empty())
+      fragment_maybe_finish(it);
+    else
+      // NOTE(review): the event text below is misspelled ("wating"); it is
+      // runtime text and therefore left unchanged here.
+      mdr->mark_event("wating for notify acks");
+  }
+}
+
+/**
+ * Drop all locks held by a fragment operation's request and finish the
+ * request.
+ *
+ * @param info  the fragment operation
+ */
+void MDCache::fragment_drop_locks(fragment_info_t& info)
+{
+  MDRequestRef& req = info.mdr;
+
+  mds->locker->drop_locks(req.get());
+  request_finish(req);
+  //info.mdr.reset();
+}
+
+void MDCache::fragment_maybe_finish(const fragment_info_iterator& it)
+{
+  // Nothing to do until _fragment_old_purged() has flagged the op as finishing.
+  auto& info = it->second;
+  if (info.finishing) {
+    for (const auto &frag_dir : info.resultfrags) {
+      // unmark & auth_unpin
+      frag_dir->state_clear(CDir::STATE_FRAGMENTING);
+      frag_dir->auth_unpin(this);
+
+      // The new fragments may already exceed the split size (inserts can
+      // land between unfreezing and this point), so let the balancer
+      // consider fragmenting them again right away.
+      mds->balancer->maybe_fragment(frag_dir, false);
+    }
+    // drop the bookkeeping entry; `it` is invalid after this
+    fragments.erase(it);
+  }
+}
+
+
+void MDCache::handle_fragment_notify_ack(const cref_t<MMDSFragmentNotifyAck> &ack)
+{
+  dout(10) << "handle_fragment_notify_ack " << *ack << " from " << ack->get_source() << dendl;
+  mds_rank_t sender = mds_rank_t(ack->get_source().num());
+
+  // acks are ignored until this rank is at least active
+  if (mds->get_state() < MDSMap::STATE_ACTIVE)
+    return;
+
+  // the ack must match a tracked fragment op with the same tid
+  auto it = fragments.find(ack->get_base_dirfrag());
+  const bool stale = (it == fragments.end()) || (it->second.get_tid() != ack->get_tid());
+  if (stale) {
+    dout(10) << "handle_fragment_notify_ack obsolete message, dropping" << dendl;
+    return;
+  }
+  auto& waiting = it->second.notify_ack_waiting;
+  if (!waiting.erase(sender) || !waiting.empty())
+    return;  // sender was not awaited, or other ranks still owe an ack
+  fragment_drop_locks(it->second);
+  fragment_maybe_finish(it);
+}
+
+void MDCache::handle_fragment_notify(const cref_t<MMDSFragmentNotify> &notify)
+{
+  dout(10) << "handle_fragment_notify " << *notify << " from " << notify->get_source() << dendl;
+  mds_rank_t sender = mds_rank_t(notify->get_source().num());
+
+  // before rejoin the notification is neither applied nor acknowledged
+  if (mds->get_state() < MDSMap::STATE_REJOIN)
+    return;
+
+  // the directory inode being refragmented must be in our cache
+  CInode *diri = get_inode(notify->get_ino());
+  if (!diri)
+    ceph_abort();
+
+  frag_t base = notify->get_basefrag();
+  int bits = notify->get_bits();
+
+  // (disabled check of the local dirfragtree against the notification)
+/*
+    if ((bits < 0 && diri->dirfragtree.is_leaf(base)) ||
+	(bits > 0 && !diri->dirfragtree.is_leaf(base))) {
+      dout(10) << " dft " << diri->dirfragtree << " state doesn't match " << base << " by " << bits
+	       << ", must have found out during resolve/rejoin?  ignoring. " << *diri << dendl;
+      return;
+    }
+*/
+
+  // refragment
+  MDSContext::vec waiters;
+  std::vector<CDir*> newfrags;
+  adjust_dir_fragments(diri, base, bits, &newfrags, waiters, false);
+  if (g_conf()->mds_debug_frag)
+    diri->verify_dirfrags();
+
+  // collect anything that was waiting on the resulting frags
+  for (const auto& dir : newfrags)
+    diri->take_dir_waiting(dir->get_frag(), waiters);
+
+  // add new replica dirs values
+  for (auto p = notify->basebl.cbegin(); !p.end(); ) {
+    CDir *replica = nullptr;
+    decode_replica_dir(replica, p, diri, sender, waiters);
+  }
+
+  mds->queue_waiters(waiters);
+
+  if (notify->is_ack_wanted()) {
+    auto ack = make_message<MMDSFragmentNotifyAck>(notify->get_base_dirfrag(),
+					     notify->get_bits(), notify->get_tid());
+    mds->send_message_mds(ack, sender);
+  }
+}
+
+void MDCache::add_uncommitted_fragment(dirfrag_t basedirfrag, int bits, const frag_vec_t& old_frags,
+				       LogSegment *ls, bufferlist *rollback)
+{
+  dout(10) << "add_uncommitted_fragment: base dirfrag " << basedirfrag << " bits " << bits << dendl;
+  ceph_assert(uncommitted_fragments.count(basedirfrag) == 0);  // one pending record per base dirfrag
+  ufragment& rec = uncommitted_fragments[basedirfrag];
+  rec.ls = ls;
+  rec.bits = bits;
+  rec.old_frags = old_frags;
+  ls->uncommitted_fragments.insert(basedirfrag);  // the log segment tracks the record too
+  if (rollback != nullptr)
+    rec.rollback.swap(*rollback);  // exchanges contents with the caller's bufferlist
+}
+
+void MDCache::finish_uncommitted_fragment(dirfrag_t basedirfrag, int op)
+{
+  dout(10) << "finish_uncommitted_fragments: base dirfrag " << basedirfrag
+	   << " op " << EFragment::op_name(op) << dendl;
+  auto it = uncommitted_fragments.find(basedirfrag);
+  if (it == uncommitted_fragments.end())
+    return;  // nothing recorded for this dirfrag
+  ufragment& rec = it->second;
+  if (op != EFragment::OP_FINISH && !rec.old_frags.empty()) {
+    rec.committed = true;  // old frags are still listed: keep the record, just flag it
+    return;
+  }
+  rec.ls->uncommitted_fragments.erase(basedirfrag);
+  mds->queue_waiters(rec.waiters);
+  uncommitted_fragments.erase(it);
+}
+
+void MDCache::rollback_uncommitted_fragment(dirfrag_t basedirfrag, frag_vec_t&& old_frags)
+{
+  dout(10) << "rollback_uncommitted_fragment: base dirfrag " << basedirfrag
+           << " old_frags (" << old_frags << ")" << dendl;
+  auto it = uncommitted_fragments.find(basedirfrag);
+  if (it == uncommitted_fragments.end())
+    return;
+  ufragment& rec = it->second;
+  if (rec.old_frags.empty()) {  // no old frags were recorded: just drop the record
+    rec.ls->uncommitted_fragments.erase(basedirfrag);
+    uncommitted_fragments.erase(it);
+    return;
+  }
+  rec.old_frags = std::move(old_frags);  // adopt the caller's list
+  rec.committed = true;
+}
+
+void MDCache::wait_for_uncommitted_fragments(MDSContext* finisher)
+{
+  // one gather sub is parked on the waiter list of every pending record
+  MDSGatherBuilder gather(g_ceph_context, finisher);
+  for (auto& [dirfrag, ufrag] : uncommitted_fragments)
+    ufrag.waiters.push_back(gather.new_sub());
+  gather.activate();
+}
+
+struct C_MDC_FragmentRollback : public MDCacheLogContext {
+  MutationRef mut;  // mutation applied when the rollback log entry completes
+  C_MDC_FragmentRollback(MDCache *cache, MutationRef& mutation)
+    : MDCacheLogContext(cache), mut(mutation) {}
+  void finish(int r) override {
+    mut->apply();
+    get_mds()->locker->drop_locks(mut.get());
+    mut->cleanup();
+  }
+};
+
+void MDCache::rollback_uncommitted_fragments()
+{  // For each pending fragment record: purge if already committed, else journal a rollback.
+  dout(10) << "rollback_uncommitted_fragments: " << uncommitted_fragments.size() << " pending" << dendl;
+  for (map<dirfrag_t, ufragment>::iterator p = uncommitted_fragments.begin();
+       p != uncommitted_fragments.end();
+       ++p) {
+    ufragment &uf = p->second;
+    CInode *diri = get_inode(p->first.ino);
+    ceph_assert(diri);
+    // a record already marked committed only needs its old frags purged
+    if (uf.committed) {
+      _fragment_committed(p->first, MDRequestRef());
+      continue;
+    }
+
+    dout(10) << " rolling back " << p->first << " refragment by " << uf.bits << " bits" << dendl;
+    // otherwise build an OP_ROLLBACK log entry in the current segment
+    MutationRef mut(new MutationImpl());
+    mut->ls = mds->mdlog->get_current_segment();
+    EFragment *le = new EFragment(mds->mdlog, EFragment::OP_ROLLBACK, p->first, uf.bits);
+    mds->mdlog->start_entry(le);
+    bool diri_auth = (diri->authority() != CDIR_AUTH_UNDEF);
+    // leaves currently under the base frag; swapped into uf.old_frags at the end and then purged
+    frag_vec_t old_frags;
+    diri->dirfragtree.get_leaves_under(p->first.frag, old_frags);
+
+    std::vector<CDir*> resultfrags;
+    if (uf.old_frags.empty()) {
+      // created by old format EFragment
+      MDSContext::vec waiters;
+      adjust_dir_fragments(diri, p->first.frag, -uf.bits, &resultfrags, waiters, true);
+    } else {
+      auto bp = uf.rollback.cbegin();  // one dirfrag_rollback is decoded per old frag, in order
+      for (const auto& fg : uf.old_frags) {
+	CDir *dir = force_dir_fragment(diri, fg);
+	resultfrags.push_back(dir);
+
+	dirfrag_rollback rollback;
+	decode(rollback, bp);
+	// restore the saved fnode and mark the dirfrag dirty in this segment
+	dir->fnode = rollback.fnode;
+
+	dir->mark_dirty(mut->ls);
+	// stats differing from their accounted copies mark the parent's scatterlock updated
+	if (!(dir->get_fnode()->rstat == dir->get_fnode()->accounted_rstat)) {
+	  dout(10) << "    dirty nestinfo on " << *dir << dendl;
+	  mds->locker->mark_updated_scatterlock(&diri->nestlock);
+	  mut->ls->dirty_dirfrag_nest.push_back(&diri->item_dirty_dirfrag_nest);
+	  mut->add_updated_lock(&diri->nestlock);
+	}
+	if (!(dir->get_fnode()->fragstat == dir->get_fnode()->accounted_fragstat)) {
+	  dout(10) << "    dirty fragstat on " << *dir << dendl;
+	  mds->locker->mark_updated_scatterlock(&diri->filelock);
+	  mut->ls->dirty_dirfrag_dir.push_back(&diri->item_dirty_dirfrag_dir);
+	  mut->add_updated_lock(&diri->filelock);
+	}
+	// record the restored frag in the log entry
+	le->add_orig_frag(dir->get_frag());
+	le->metablob.add_dir_context(dir);
+	if (diri_auth) {
+	  le->metablob.add_fragmented_dir(dir, true, false);
+	} else {
+	  dout(10) << "    dirty dirfragtree on " << *dir << dendl;
+	  dir->state_set(CDir::STATE_DIRTYDFT);
+	  le->metablob.add_fragmented_dir(dir, true, true);
+	}
+      }
+    }
+    // auth for the inode: journal a new inode version; otherwise dirty the dirfragtree lock
+    if (diri_auth) {
+      auto pi = diri->project_inode(mut);
+      pi.inode->version = diri->pre_dirty();
+      predirty_journal_parents(mut, &le->metablob, diri, 0, PREDIRTY_PRIMARY);
+      le->metablob.add_primary_dentry(diri->get_projected_parent_dn(), diri, true);
+    } else {
+      mds->locker->mark_updated_scatterlock(&diri->dirfragtreelock);
+      mut->ls->dirty_dirfrag_dirfragtree.push_back(&diri->item_dirty_dirfrag_dirfragtree);
+      mut->add_updated_lock(&diri->dirfragtreelock);
+    }
+
+    if (g_conf()->mds_debug_frag)
+      diri->verify_dirfrags();
+    // none of the pre-rollback leaves may still be a leaf of the tree
+    for (const auto& leaf : old_frags) {
+      ceph_assert(!diri->dirfragtree.is_leaf(leaf));
+    }
+
+    mds->mdlog->submit_entry(le, new C_MDC_FragmentRollback(this, mut));
+    // the pre-rollback leaves become the frags that _fragment_committed() purges
+    uf.old_frags.swap(old_frags);
+    _fragment_committed(p->first, MDRequestRef());
+  }
+}
+
+void MDCache::force_readonly()
+{
+  if (is_readonly())
+    return;  // already read-only
+
+  dout(1) << "force file system read-only" << dendl;
+  mds->clog->warn() << "force file system read-only";
+
+  set_readonly();
+  mds->server->force_clients_readonly();
+
+  // revoke write caps
+  int visited = 0;
+  for (auto &entry : inode_map) {
+    CInode *in = entry.second;
+    if (in->is_head())
+      mds->locker->eval(in, CEPH_CAP_LOCKS);
+    ++visited;
+    if ((visited % mds->heartbeat_reset_grace()) == 0)
+      mds->heartbeat_reset();  // periodic heartbeat reset during the walk
+  }
+
+  mds->mdlog->flush();
+}
+
+
+// ==============================================================
+// debug crap
+
+void MDCache::show_subtrees(int dbl, bool force_print)
+{  // Debug-log the subtree map as an indented tree at level dbl, asserting its consistency.
+  if (g_conf()->mds_thrash_exports)
+    dbl += 15;  // raise the required debug level when mds_thrash_exports is set
+
+  //dout(10) << "show_subtrees" << dendl;
+
+  if (!g_conf()->subsys.should_gather(ceph_subsys_mds, dbl))
+    return;  // i won't print anything.
+
+  if (subtrees.empty()) {
+    dout(ceph::dout::need_dynamic(dbl)) << "show_subtrees - no subtrees"
+					<< dendl;
+    return;
+  }
+  // too many subtrees: print only the count unless forced or mds debug 25 is gathered
+  if (!force_print && subtrees.size() > SUBTREES_COUNT_THRESHOLD &&
+      !g_conf()->subsys.should_gather<ceph_subsys_mds, 25>()) {
+    dout(ceph::dout::need_dynamic(dbl)) << "number of subtrees = " << subtrees.size() << "; not "
+		"printing subtrees" << dendl;
+    return;
+  }
+
+  // root frags
+  std::vector<CDir*> basefrags;
+  for (set<CInode*>::iterator p = base_inodes.begin();
+       p != base_inodes.end();
+       ++p) 
+    (*p)->get_dirfrags(basefrags);
+  //dout(15) << "show_subtrees, base dirfrags " << basefrags << dendl;
+  dout(15) << "show_subtrees" << dendl;
+
+  // queue stuff
+  list<pair<CDir*,int> > q;
+  string indent;
+  set<CDir*> seen;
+
+  // calc max depth
+  for (const auto& dir : basefrags) {
+    q.emplace_back(dir, 0);
+  }
+
+  set<CDir*> subtrees_seen;
+  // first pass: depth-first walk recording the deepest nesting level and every subtree reached
+  unsigned int depth = 0;
+  while (!q.empty()) {
+    CDir *dir = q.front().first;
+    unsigned int d = q.front().second;
+    q.pop_front();
+
+    if (subtrees.count(dir) == 0) continue;  // not a subtree root
+
+    subtrees_seen.insert(dir);
+
+    if (d > depth) depth = d;
+
+    // sanity check
+    //dout(25) << "saw depth " << d << " " << *dir << dendl;
+    if (seen.count(dir)) dout(0) << "aah, already seen " << *dir << dendl;
+    ceph_assert(seen.count(dir) == 0);
+    seen.insert(dir);
+
+    // nested items?
+    if (!subtrees[dir].empty()) {
+      for (set<CDir*>::iterator p = subtrees[dir].begin();
+	   p != subtrees[dir].end();
+	   ++p) {
+	//dout(25) << " saw sub " << **p << dendl;
+	q.push_front(pair<CDir*,int>(*p, d+1));  // in this pass d counts nesting levels
+      }
+    }
+  }
+  // too deep: print only the depth unless forced or mds debug 25 is gathered
+  if (!force_print && depth > SUBTREES_DEPTH_THRESHOLD &&
+      !g_conf()->subsys.should_gather<ceph_subsys_mds, 25>()) {
+    dout(ceph::dout::need_dynamic(dbl)) << "max depth among subtrees = " << depth << "; not printing "
+		"subtrees" << dendl;
+    return;
+  }
+
+  // print tree
+  for (const auto& dir : basefrags) {
+    q.emplace_back(dir, 0);
+  }
+  // second pass: same walk, but d is now the indent width in characters
+  while (!q.empty()) {
+    CDir *dir = q.front().first;
+    int d = q.front().second;
+    q.pop_front();
+
+    if (subtrees.count(dir) == 0) continue;
+
+    // adjust indenter
+    while ((unsigned)d < indent.size()) 
+      indent.resize(d);  // shrink to this entry's width; the loop body runs at most once
+    
+    // pad
+    string pad = "______________________________________";
+    pad.resize(depth*2+1-indent.size());  // indent + pad always total depth*2+1 characters
+    if (!subtrees[dir].empty()) 
+      pad[0] = '.'; // parent
+
+
+    string auth;
+    if (dir->is_auth())
+      auth = "auth ";
+    else
+      auth = " rep ";
+    // dir_auth pair: only the first rank is shown when the second is CDIR_AUTH_UNKNOWN
+    char s[10];
+    if (dir->get_dir_auth().second == CDIR_AUTH_UNKNOWN)
+      snprintf(s, sizeof(s), "%2d   ", int(dir->get_dir_auth().first));
+    else
+      snprintf(s, sizeof(s), "%2d,%2d", int(dir->get_dir_auth().first), int(dir->get_dir_auth().second));
+    
+    // print
+    dout(ceph::dout::need_dynamic(dbl)) << indent << "|_" << pad << s
+					<< " " << auth << *dir << dendl;
+    // well-known dirs must belong to the inodes cached as root / myin / strays
+    if (dir->ino() == CEPH_INO_ROOT)
+      ceph_assert(dir->inode == root);
+    if (dir->ino() == MDS_INO_MDSDIR(mds->get_nodeid()))
+      ceph_assert(dir->inode == myin);
+    if (dir->inode->is_stray() && (MDS_INO_STRAY_OWNER(dir->ino()) == mds->get_nodeid()))
+      ceph_assert(strays[MDS_INO_STRAY_INDEX(dir->ino())] == dir->inode);
+
+    // nested items?
+    if (!subtrees[dir].empty()) {
+      // more at my level?
+      if (!q.empty() && q.front().second == d)
+	indent += "| ";
+      else
+	indent += "  ";
+
+      for (set<CDir*>::iterator p = subtrees[dir].begin();
+	   p != subtrees[dir].end();
+	   ++p) 
+	q.push_front(pair<CDir*,int>(*p, d+2));  // children sit two characters further in
+    }
+  }
+
+  // verify there isn't stray crap in subtree map
+  int lost = 0;
+  for (map<CDir*, set<CDir*> >::iterator p = subtrees.begin();
+       p != subtrees.end();
+       ++p) {
+    if (subtrees_seen.count(p->first)) continue;
+    dout(10) << "*** stray/lost entry in subtree map: " << *p->first << dendl;
+    lost++;
+  }
+  ceph_assert(lost == 0);  // every subtree must be reachable from a base inode's dirfrags
+}
+
+void MDCache::show_cache()
+{
+  // skip the whole walk unless mds debug level 7 is being gathered
+  if (!g_conf()->subsys.should_gather<ceph_subsys_mds, 7>())
+    return;
+  dout(7) << "show_cache" << dendl;
+
+  // log one inode, then each of its dirfrags and their dentries
+  auto log_inode = [this](CInode *in) {
+    if (!in->parent) {
+      dout(7) << " unlinked " << *in << dendl;
+    }
+    for (const auto& dir : in->get_dirfrags()) {
+      dout(7) << "  dirfrag " << *dir << dendl;
+      for (auto &item : dir->items) {
+	CDentry *dn = item.second;
+	dout(7) << "   dentry " << *dn << dendl;
+	CDentry::linkage_t *link = dn->get_linkage();
+	// only primary links with an inode attached get an inode line
+	if (!link->is_primary() || !link->get_inode())
+	  continue;
+	dout(7) << "    inode " << *link->get_inode() << dendl;
+      }
+    }
+  };
+
+  for (auto &entry : inode_map)
+    log_inode(entry.second);
+  for (auto &entry : snap_inode_map)
+    log_inode(entry.second);
+}
+
+void MDCache::cache_status(Formatter *f)
+{
+  f->open_object_section("cache");
+  // nested "pool" object holding the dump of the mds_co mempool
+  f->open_object_section("pool");
+  mempool::get_pool(mempool::mds_co::id).dump(f);
+  f->close_section();
+
+  f->close_section();  // closes "cache"
+}
+
+void MDCache::dump_tree(CInode *in, const int cur_depth, const int max_depth, Formatter *f) 
+{
+  ceph_assert(in);
+  // a negative max_depth disables the depth limit
+  const bool too_deep = (max_depth >= 0) && (cur_depth > max_depth);
+  if (too_deep)
+    return;
+
+  // post-order: linked child inodes are emitted before the inode itself
+  for (const auto &frag : in->get_dirfrags()) {
+    for (const auto &item : frag->items) {
+      CInode *child = item.second->get_linkage()->get_inode();
+      if (child)
+        dump_tree(child, cur_depth + 1, max_depth, f);
+    }
+  }
+  f->open_object_section("inode");
+  in->dump(f, CInode::DUMP_DEFAULT | CInode::DUMP_DIRFRAGS);
+  f->close_section();
+}
+
+int MDCache::dump_cache(std::string_view file_name)
+{
+  return dump_cache(file_name, NULL);  // no formatter: plain-text dump to file_name
+}
+
+int MDCache::dump_cache(Formatter *f)
+{
+  return dump_cache(std::string_view(""), f);  // empty file name; a non-null f receives the dump
+}
+
+/**
+ * Dump the metadata cache to Formatter f if one is provided, otherwise to a
+ * plain text file (fn, or a generated name). Returns 0 or a negative error.
+ */
+int MDCache::dump_cache(std::string_view fn, Formatter *f)
+{
+  int r = 0;
+
+  // dumping large caches may cause mds to hang or worse get killed.
+  // so, disallow the dump if the cache size exceeds the configured
+  // threshold, which is 1G for formatter and unlimited for file (note
+  // that this can be jacked up by the admin... and is nothing but foot
+  // shooting, but the option itself is for devs and hence dangerous to
+  // tune). TODO: remove this when fixed.
+  uint64_t threshold = f ?
+    g_conf().get_val<Option::size_t>("mds_dump_cache_threshold_formatter") :
+    g_conf().get_val<Option::size_t>("mds_dump_cache_threshold_file");
+  // a threshold of 0 disables the check
+  if (threshold && cache_size() > threshold) {
+    if (f) {
+      CachedStackStringStream css;
+      *css << "cache usage exceeds dump threshold";
+      f->open_object_section("result");
+      f->dump_string("error", css->strv());
+      f->close_section();
+    } else {
+      derr << "cache usage exceeds dump threshold" << dendl;
+      r = -CEPHFS_EINVAL;
+    }
+    return r;  // formatter case: r is still 0, the error is reported in the output
+  }
+
+  int fd = -1;
+
+  if (f) {
+    f->open_array_section("inodes");
+  } else {
+    char path[PATH_MAX] = "";
+    if (fn.length()) {
+      snprintf(path, sizeof path, "%.*s", static_cast<int>(fn.length()), fn.data());  // fn may lack a NUL terminator
+    } else {
+      snprintf(path, sizeof path, "cachedump.%d.mds%d", (int)mds->mdsmap->get_epoch(), int(mds->get_nodeid()));
+    }
+
+    dout(1) << "dump_cache to " << path << dendl;
+    // O_EXCL: an already existing file is never overwritten
+    fd = ::open(path, O_WRONLY|O_CREAT|O_EXCL|O_CLOEXEC, 0600);
+    if (fd < 0) {
+      const int err = errno;  // capture before logging, which may change errno
+      derr << "failed to open " << path << ": " << cpp_strerror(err) << dendl;
+      return -err;  // negative, like every other error returned here
+    }
+  }
+  // dumps one inode; returns 1 on success or the negative safe_write() result
+  auto dump_func = [fd, f](CInode *in) {
+    int r;
+    if (f) {
+      f->open_object_section("inode");
+      in->dump(f, CInode::DUMP_DEFAULT | CInode::DUMP_DIRFRAGS);
+      f->close_section();
+      return 1;
+    } 
+    CachedStackStringStream css;
+    *css << *in << std::endl;
+    auto sv = css->strv();
+    r = safe_write(fd, sv.data(), sv.size());
+    if (r < 0)
+      return r;
+    auto&& dfs = in->get_dirfrags();
+    for (auto &dir : dfs) {
+      CachedStackStringStream css2;
+      *css2 << " " << *dir << std::endl;
+      auto sv = css2->strv();
+      r = safe_write(fd, sv.data(), sv.size());
+      if (r < 0)
+        return r;
+      for (auto &p : dir->items) {
+	CDentry *dn = p.second;
+        CachedStackStringStream css3;
+        *css3 << "  " << *dn << std::endl;
+        auto sv = css3->strv();
+        r = safe_write(fd, sv.data(), sv.size());
+        if (r < 0)
+          return r;
+      }
+      dir->check_rstats();
+    }
+    return 1;
+  };
+  // head inodes first, then snapshot inodes; stop at the first write error
+  for (auto &p : inode_map) {
+    r = dump_func(p.second);
+    if (r < 0)
+      goto out;
+  }
+  for (auto &p : snap_inode_map) {
+    r = dump_func(p.second);
+    if (r < 0)
+      goto out;
+  }
+  r = 0;  // discard the positive per-inode result
+
+ out:
+  if (f) {
+    f->close_section();  // inodes
+  } else {
+    ::close(fd);
+  }
+  return r;
+}
+
+void C_MDS_RetryRequest::finish(int r)
+{
+  ++mdr->retry;  // count this re-dispatch on the request
+  cache->dispatch_request(mdr);
+}
+
+MDSContext *CF_MDS_RetryRequestFactory::build()
+{
+  if (!drop_locks)
+    return new C_MDS_RetryRequest(mdcache, mdr);
+  mdcache->mds->locker->drop_locks(mdr.get(), nullptr);  // release locks before handing out the retry context
+  mdr->drop_local_auth_pins();
+  return new C_MDS_RetryRequest(mdcache, mdr);
+}
+
+class C_MDS_EnqueueScrub : public Context
+{
+  std::string tag;       // scrub tag reported back on success
+  Formatter *formatter;  // receives the "results" section
+  Context *on_finish;    // optional chained completion
+public:
+  ScrubHeaderRef header;
+  C_MDS_EnqueueScrub(std::string_view tag, Formatter *f, Context *fin)
+    : tag(tag), formatter(f), on_finish(fin), header(nullptr) {}
+
+  void finish(int r) override {
+    formatter->open_object_section("results");
+    formatter->dump_int("return_code", r);
+    if (r == 0) {
+      formatter->dump_string("scrub_tag", tag);
+      formatter->dump_string("mode", "asynchronous");
+    }
+    formatter->close_section();
+
+    // r was reported through the formatter; the chained context always gets 0
+    if (on_finish)
+      on_finish->complete(0);
+  }
+};
+
+void MDCache::enqueue_scrub(
+    std::string_view path,
+    std::string_view tag,
+    bool force, bool recursive, bool repair,
+    Formatter *f, Context *fin)
+{
+  dout(10) << __func__ << " " << path << dendl;
+
+  // "~mdsdir" selects this rank's mdsdir and "~mds<N>" that of rank N; if no
+  // valid rank results, the ino check below falls back to set_path(path).
+  filepath fp;
+  if (path.compare(0, 4, "~mds") == 0) {
+    mds_rank_t rank = MDS_RANK_NONE;
+    if (path == "~mdsdir") {
+      rank = mds->get_nodeid();
+    } else {
+      std::string parse_err;
+      const auto parsed = strict_strtoll(path.substr(4), 10, &parse_err);
+      if (parse_err.empty())
+	rank = parsed;
+    }
+    if (rank >= 0 && rank < MAX_MDS)
+      fp.set_path("", MDS_INO_MDSDIR(rank));
+  }
+  if (fp.get_ino() == inodeno_t(0))
+    fp.set_path(path);
+
+  MDRequestRef mdr = request_start_internal(CEPH_MDS_OP_ENQUEUE_SCRUB);
+  mdr->set_filepath(fp);
+
+  // no tag supplied: generate a random UUID and flag the tag as internal
+  std::string tag_str(tag);
+  const bool is_internal = tag_str.empty();
+  if (is_internal) {
+    uuid_d uuid_gen;
+    uuid_gen.generate_random();
+    tag_str = uuid_gen.to_string();
+  }
+  auto *cs = new C_MDS_EnqueueScrub(tag_str, f, fin);
+  cs->header = std::make_shared<ScrubHeader>(tag_str, is_internal, force, recursive, repair);
+  mdr->internal_op_finish = cs;
+  enqueue_scrub_work(mdr);
+}
+
+void MDCache::enqueue_scrub_work(MDRequestRef& mdr)
+{
+  // resolve the request's filepath to an inode
+  CInode *in;
+  CF_MDS_RetryRequestFactory cf(this, mdr, true);
+  int r = path_traverse(mdr, cf, mdr->get_filepath(),
+			MDS_TRAVERSE_DISCOVER | MDS_TRAVERSE_RDLOCK_PATH,
+			nullptr, &in);
+  if (r > 0)
+    return;  // positive result: nothing more to do in this pass
+  if (r < 0) {
+    mds->server->respond_to_request(mdr, r);
+    return;
+  }
+
+  // Cannot scrub same dentry twice at same time
+  if (in->scrub_is_in_progress()) {
+    mds->server->respond_to_request(mdr, -CEPHFS_EBUSY);
+    return;
+  }
+  in->scrub_info();
+
+  auto *enqueue_ctx = static_cast<C_MDS_EnqueueScrub*>(mdr->internal_op_finish);
+  ScrubHeaderRef& header = enqueue_ctx->header;
+
+  const bool not_recursive = !header->get_recursive();
+  r = mds->scrubstack->enqueue(in, header, not_recursive);
+  mds->server->respond_to_request(mdr, r);
+}
+
+struct C_MDC_RespondInternalRequest : public MDCacheLogContext {
+  MDRequestRef mdr;  // request applied and answered when the log entry completes
+  C_MDC_RespondInternalRequest(MDCache *cache, MDRequestRef& req)
+    : MDCacheLogContext(cache), mdr(req) {}
+  void finish(int r) override {
+    mdr->apply();
+    get_mds()->server->respond_to_request(mdr, r);
+  }
+};
+
+struct C_MDC_ScrubRepaired : public MDCacheContext {
+  ScrubHeaderRef header;
+public:
+  C_MDC_ScrubRepaired(MDCache *cache, const ScrubHeaderRef& hdr)
+    : MDCacheContext(cache), header(hdr) {
+    header->inc_num_pending();  // balanced by the decrement in finish()
+  }
+  void finish(int r) override {
+    header->dec_num_pending();
+  }
+};
+
+void MDCache::repair_dirfrag_stats(CDir *dir)
+{
+  MDRequestRef mdr = request_start_internal(CEPH_MDS_OP_REPAIR_FRAGSTATS);
+  mdr->pin(dir);
+  mdr->internal_op_private = dir;
+  if (!dir->scrub_is_in_progress())
+    mdr->internal_op_finish = new C_MDSInternalNoop;
+  else  // count the repair as pending on the running scrub's header
+    mdr->internal_op_finish = new C_MDC_ScrubRepaired(this, dir->get_scrub_header());
+  repair_dirfrag_stats_work(mdr);
+}
+
+void MDCache::repair_dirfrag_stats_work(MDRequestRef& mdr)
+{  // Recompute a dirfrag's fragstat/rstat from its dentries and journal a fix if they differ.
+  CDir *dir = static_cast<CDir*>(mdr->internal_op_private);
+  dout(10) << __func__ << " " << *dir << dendl;
+
+  if (!dir->is_auth()) {
+    mds->server->respond_to_request(mdr, -CEPHFS_ESTALE);
+    return;
+  }
+  // cannot auth pin right now: wait for unfreeze, dropping locks and local pins meanwhile
+  if (!mdr->is_auth_pinned(dir) && !dir->can_auth_pin()) {
+    dir->add_waiter(CDir::WAIT_UNFREEZE, new C_MDS_RetryRequest(this, mdr));
+
+    mds->locker->drop_locks(mdr.get());
+    mdr->drop_local_auth_pins();
+    if (mdr->is_any_remote_auth_pin())
+      mds->locker->notify_freeze_waiter(dir);
+    return;
+  }
+
+  mdr->auth_pin(dir);
+
+  MutationImpl::LockOpVec lov;
+  CInode *diri = dir->inode;
+  lov.add_rdlock(&diri->dirfragtreelock);
+  lov.add_wrlock(&diri->nestlock);
+  lov.add_wrlock(&diri->filelock);
+  if (!mds->locker->acquire_locks(mdr, lov))
+    return;
+  // the whole dirfrag must be loaded before its dentries can be counted
+  if (!dir->is_complete()) {
+    dir->fetch(new C_MDS_RetryRequest(this, mdr));
+    return;
+  }
+  // recount, using only dentries whose last snap is CEPH_NOSNAP
+  frag_info_t frag_info;
+  nest_info_t nest_info;
+  for (auto it = dir->begin(); it != dir->end(); ++it) {
+    CDentry *dn = it->second;
+    if (dn->last != CEPH_NOSNAP)
+      continue;
+    CDentry::linkage_t *dnl = dn->get_projected_linkage();
+    if (dnl->is_primary()) {
+      CInode *in = dnl->get_inode();
+      nest_info.add(in->get_projected_inode()->accounted_rstat);
+      if (in->is_dir())
+	frag_info.nsubdirs++;
+      else
+	frag_info.nfiles++;
+    } else if (dnl->is_remote())  // remote links count as files and add no rstat
+      frag_info.nfiles++;
+  }
+  // compare the recomputed sums against the projected fnode
+  auto pf = dir->get_projected_fnode();
+  bool good_fragstat = frag_info.same_sums(pf->fragstat);
+  bool good_rstat = nest_info.same_sums(pf->rstat);
+  if (good_fragstat && good_rstat) {
+    dout(10) << __func__ << " no corruption found" << dendl;
+    mds->server->respond_to_request(mdr, 0);
+    return;
+  }
+  // mismatch: project a new fnode version and journal the corrected stats
+  auto _pf = dir->project_fnode(mdr);
+  _pf->version = dir->pre_dirty();
+  pf = _pf;
+
+  mdr->ls = mds->mdlog->get_current_segment();
+  EUpdate *le = new EUpdate(mds->mdlog, "repair_dirfrag");
+  mds->mdlog->start_entry(le);
+
+  if (!good_fragstat) {
+    if (pf->fragstat.mtime > frag_info.mtime)  // keep the later mtime / change_attr
+      frag_info.mtime = pf->fragstat.mtime;
+    if (pf->fragstat.change_attr > frag_info.change_attr)
+      frag_info.change_attr = pf->fragstat.change_attr;
+    _pf->fragstat = frag_info;
+    mds->locker->mark_updated_scatterlock(&diri->filelock);
+    mdr->ls->dirty_dirfrag_dir.push_back(&diri->item_dirty_dirfrag_dir);
+    mdr->add_updated_lock(&diri->filelock);
+  }
+
+  if (!good_rstat) {
+    if (pf->rstat.rctime > nest_info.rctime)  // keep the later rctime
+      nest_info.rctime = pf->rstat.rctime;
+    _pf->rstat = nest_info;
+    mds->locker->mark_updated_scatterlock(&diri->nestlock);
+    mdr->ls->dirty_dirfrag_nest.push_back(&diri->item_dirty_dirfrag_nest);
+    mdr->add_updated_lock(&diri->nestlock);
+  }
+
+  le->metablob.add_dir_context(dir);
+  le->metablob.add_dir(dir, true);
+  // the request is applied and answered when the log entry completes
+  mds->mdlog->submit_entry(le, new C_MDC_RespondInternalRequest(this, mdr));
+}
+
+void MDCache::repair_inode_stats(CInode *diri)
+{
+  MDRequestRef mdr = request_start_internal(CEPH_MDS_OP_REPAIR_INODESTATS);
+  mdr->auth_pin(diri); // already auth pinned by CInode::validate_disk_state()
+  mdr->internal_op_private = diri;
+  if (!diri->scrub_is_in_progress())
+    mdr->internal_op_finish = new C_MDSInternalNoop;
+  else  // count the repair as pending on the running scrub's header
+    mdr->internal_op_finish = new C_MDC_ScrubRepaired(this, diri->get_scrub_header());
+  repair_inode_stats_work(mdr);
+}
+
+void MDCache::repair_inode_stats_work(MDRequestRef& mdr)
+{  // Two phases: dirty filelock/nestlock under wrlocks, then rdlock and compare the stat sums.
+  CInode *diri = static_cast<CInode*>(mdr->internal_op_private);
+  dout(10) << __func__ << " " << *diri << dendl;
+
+  if (!diri->is_auth()) {
+    mds->server->respond_to_request(mdr, -CEPHFS_ESTALE);
+    return;
+  }
+  if (!diri->is_dir()) {
+    mds->server->respond_to_request(mdr, -CEPHFS_ENOTDIR);
+    return;
+  }
+
+  MutationImpl::LockOpVec lov;
+  // mdr->ls is set at the end of phase one, so a retried request skips straight to phase two
+  if (mdr->ls) // already marked filelock/nestlock dirty ?
+    goto do_rdlocks;
+
+  lov.add_rdlock(&diri->dirfragtreelock);
+  lov.add_wrlock(&diri->nestlock);
+  lov.add_wrlock(&diri->filelock);
+  if (!mds->locker->acquire_locks(mdr, lov))
+    return;
+
+  // Fetch all dirfrags and mark filelock/nestlock dirty. This will trigger
+  // the scatter-gather process, which will fix any fragstat/rstat errors.
+  {
+    frag_vec_t leaves;
+    diri->dirfragtree.get_leaves(leaves);
+    for (const auto& leaf : leaves) {
+      CDir *dir = diri->get_dirfrag(leaf);
+      if (!dir) {
+        ceph_assert(mdr->is_auth_pinned(diri));
+        dir = diri->get_or_open_dirfrag(this, leaf);
+      }
+      if (dir->get_version() == 0) {  // version 0: fetch the dirfrag and retry the request
+        ceph_assert(dir->is_auth());
+        dir->fetch(new C_MDS_RetryRequest(this, mdr));
+        return;
+      }
+    }
+  }
+
+  diri->state_set(CInode::STATE_REPAIRSTATS);
+  mdr->ls = mds->mdlog->get_current_segment();
+  mds->locker->mark_updated_scatterlock(&diri->filelock);
+  mdr->ls->dirty_dirfrag_dir.push_back(&diri->item_dirty_dirfrag_dir);
+  mds->locker->mark_updated_scatterlock(&diri->nestlock);
+  mdr->ls->dirty_dirfrag_nest.push_back(&diri->item_dirty_dirfrag_nest);
+  // drop the wrlocks taken above before requesting the rdlocks below
+  mds->locker->drop_locks(mdr.get());
+
+do_rdlocks:
+  // force the scatter-gather process
+  lov.clear();
+  lov.add_rdlock(&diri->dirfragtreelock);
+  lov.add_rdlock(&diri->nestlock);
+  lov.add_rdlock(&diri->filelock);
+  if (!mds->locker->acquire_locks(mdr, lov))
+    return;
+
+  diri->state_clear(CInode::STATE_REPAIRSTATS);
+  // sum the accounted stats of every leaf dirfrag
+  frag_info_t dir_info;
+  nest_info_t nest_info;
+  nest_info.rsubdirs = 1; // it gets one to account for self
+  if (const sr_t *srnode = diri->get_projected_srnode(); srnode)
+    nest_info.rsnaps = srnode->snaps.size();
+
+  {
+    frag_vec_t leaves;
+    diri->dirfragtree.get_leaves(leaves);
+    for (const auto& leaf : leaves) {
+      CDir *dir = diri->get_dirfrag(leaf);
+      ceph_assert(dir);
+      ceph_assert(dir->get_version() > 0);
+      dir_info.add(dir->get_fnode()->accounted_fragstat);
+      nest_info.add(dir->get_fnode()->accounted_rstat);
+    }
+  }
+  // a remaining mismatch is only logged; the request still completes with 0
+  if (!dir_info.same_sums(diri->get_inode()->dirstat) ||
+      !nest_info.same_sums(diri->get_inode()->rstat)) {
+    dout(10) << __func__ << " failed to fix fragstat/rstat on "
+	     << *diri << dendl;
+  }
+
+  mds->server->respond_to_request(mdr, 0);
+}
+
+void MDCache::rdlock_dirfrags_stats(CInode *diri, MDSInternalContext* fin)
+{
+  MDRequestRef mdr = request_start_internal(CEPH_MDS_OP_RDLOCK_FRAGSSTATS);
+  mdr->auth_pin(diri); // already auth pinned by CInode::validate_disk_state()
+  mdr->internal_op_private = diri;  // the work function reads the inode back from here
+  mdr->internal_op_finish = fin;
+  rdlock_dirfrags_stats_work(mdr);
+}
+
+void MDCache::rdlock_dirfrags_stats_work(MDRequestRef& mdr)
+{
+  CInode *diri = static_cast<CInode*>(mdr->internal_op_private);
+  dout(10) << __func__ << " " << *diri << dendl;
+  // the inode must be auth here and must be a directory
+  int err = 0;
+  if (!diri->is_auth())
+    err = -CEPHFS_ESTALE;
+  else if (!diri->is_dir())
+    err = -CEPHFS_ENOTDIR;
+  if (err != 0) {
+    mds->server->respond_to_request(mdr, err);
+    return;
+  }
+
+  MutationImpl::LockOpVec lov;
+  lov.add_rdlock(&diri->dirfragtreelock);
+  lov.add_rdlock(&diri->nestlock);
+  lov.add_rdlock(&diri->filelock);
+  if (!mds->locker->acquire_locks(mdr, lov))
+    return;  // locks not acquired in this pass
+  dout(10) << __func__ << " start dirfrags : " << *diri << dendl;
+  mds->server->respond_to_request(mdr, 0);
+}
+
+void MDCache::flush_dentry(std::string_view path, Context *fin)
+{
+  if (is_readonly()) {  // refuse on a read-only file system
+    dout(10) << __func__ << ": read-only FS" << dendl;
+    fin->complete(-CEPHFS_EROFS);
+    return;
+  }
+  dout(10) << "flush_dentry " << path << dendl;
+  filepath target(path);
+  MDRequestRef mdr = request_start_internal(CEPH_MDS_OP_FLUSH);
+  mdr->set_filepath(target);
+  mdr->internal_op_finish = fin;
+  flush_dentry_work(mdr);
+}
+
+class C_FinishIOMDR : public MDSContext {
+protected:
+  MDSRank *mds;
+  MDRequestRef mdr;  // request answered with the completion's result code
+  MDSRank *get_mds() override { return mds; }
+public:
+  C_FinishIOMDR(MDSRank *rank, MDRequestRef& req) : mds(rank), mdr(req) {}
+  void finish(int r) override { mds->server->respond_to_request(mdr, r); }
+};
+
+void MDCache::flush_dentry_work(MDRequestRef& mdr)
+{
+  // Look up the request's path; on a null result there is nothing to flush in this pass.
+  CInode *in = mds->server->rdlock_path_pin_ref(mdr, true);
+  if (!in)
+    return;
+
+  ceph_assert(in->is_auth());
+  in->flush(new C_FinishIOMDR(mds, mdr));  // the request is answered with the flush result
+}
+
+
+/**
+ * Initialize performance counters with global perfcounter
+ * collection.
+ */
+void MDCache::register_perfcounters()
+{
+    PerfCountersBuilder pcb(g_ceph_context, "mds_cache", l_mdc_first, l_mdc_last);
+
+    // Stray/purge statistics (each registered explicitly at PRIO_INTERESTING)
+    pcb.add_u64(l_mdc_num_strays, "num_strays", "Stray dentries", "stry",
+                PerfCountersBuilder::PRIO_INTERESTING);
+    pcb.add_u64(l_mdc_num_recovering_enqueued,
+                "num_recovering_enqueued", "Files waiting for recovery", "recy",
+                PerfCountersBuilder::PRIO_INTERESTING);
+    pcb.add_u64_counter(l_mdc_recovery_completed,
+                        "recovery_completed", "File recoveries completed", "recd",
+                        PerfCountersBuilder::PRIO_INTERESTING);
+
+    // useful recovery queue statistics (PRIO_USEFUL becomes the default from here on)
+    pcb.set_prio_default(PerfCountersBuilder::PRIO_USEFUL);
+    pcb.add_u64(l_mdc_num_recovering_processing, "num_recovering_processing",
+                "Files currently being recovered");
+    pcb.add_u64(l_mdc_num_recovering_prioritized, "num_recovering_prioritized",
+                "Files waiting for recovery with elevated priority");
+    pcb.add_u64_counter(l_mdc_recovery_started, "recovery_started",
+                        "File recoveries started");
+
+    // along with other stray dentries stats
+    pcb.add_u64(l_mdc_num_strays_delayed, "num_strays_delayed",
+                "Stray dentries delayed");
+    pcb.add_u64(l_mdc_num_strays_enqueuing, "num_strays_enqueuing",
+                "Stray dentries enqueuing for purge");
+    pcb.add_u64_counter(l_mdc_strays_created, "strays_created",
+                        "Stray dentries created");
+    pcb.add_u64_counter(l_mdc_strays_enqueued, "strays_enqueued",
+                        "Stray dentries enqueued for purge");
+    pcb.add_u64_counter(l_mdc_strays_reintegrated, "strays_reintegrated",
+                        "Stray dentries reintegrated");
+    pcb.add_u64_counter(l_mdc_strays_migrated, "strays_migrated",
+                        "Stray dentries migrated");
+
+    // low prio internal request stats (NOTE(review): no priority is set here, so the PRIO_USEFUL default above still applies)
+    pcb.add_u64_counter(l_mdss_ireq_enqueue_scrub, "ireq_enqueue_scrub",
+                        "Internal Request type enqueue scrub");
+    pcb.add_u64_counter(l_mdss_ireq_exportdir, "ireq_exportdir",
+                        "Internal Request type export dir");
+    pcb.add_u64_counter(l_mdss_ireq_flush, "ireq_flush",
+                        "Internal Request type flush");
+    pcb.add_u64_counter(l_mdss_ireq_fragmentdir, "ireq_fragmentdir",
+                        "Internal Request type fragmentdir");
+    pcb.add_u64_counter(l_mdss_ireq_fragstats, "ireq_fragstats",
+                        "Internal Request type frag stats");
+    pcb.add_u64_counter(l_mdss_ireq_inodestats, "ireq_inodestats",
+                        "Internal Request type inode stats");
+    // create the counters, register them globally, and share them with the recovery queue and stray manager
+    logger.reset(pcb.create_perf_counters());
+    g_ceph_context->get_perfcounters_collection()->add(logger.get());
+    recovery_queue.set_logger(logger.get());
+    stray_manager.set_logger(logger.get());
+}
+
+/**
+ * Hook to call whenever a reference to an inode/dentry is dropped or a
+ * trim of it is attempted.
+ *
+ * An inode that has no remaining links, is not a base inode and whose
+ * projected parent dentry lives in a stray directory is handed to the
+ * StrayManager. Nothing happens while the cache is read-only or the MDS
+ * state is <= STATE_REJOIN, or when the dentry is already being purged.
+ *
+ * @param in    inode to examine
+ * @param delay if true, queue the dentry with the StrayManager for later
+ *              evaluation instead of evaluating it right away.
+ */
+void MDCache::maybe_eval_stray(CInode *in, bool delay) {
+  if (in->get_inode()->nlink > 0 || in->is_base())
+    return;  // still linked, or a base inode
+  if (is_readonly() || mds->get_state() <= MDSMap::STATE_REJOIN)
+    return;  // read-only cache, or MDS state not beyond STATE_REJOIN
+
+  CDentry *parent_dn = in->get_projected_parent_dn();
+  // Already in the purging process: no need to re-evaluate.
+  if (parent_dn->state_test(CDentry::STATE_PURGING))
+    return;
+
+  // Only dentries that live in a stray directory are handed over.
+  if (!parent_dn->get_dir()->get_inode()->is_stray())
+    return;
+
+  if (delay)
+    stray_manager.queue_delayed(parent_dn);
+  else
+    stray_manager.eval_stray(parent_dn);
+}
+
+void MDCache::clear_dirty_bits_for_stray(CInode* diri) {
+  dout(10) << __func__ << " " << *diri << dendl;
+  ceph_assert(diri->get_projected_parent_dir()->inode->is_stray());
+  for (auto &frag : diri->get_dirfrags()) {
+    if (!frag->is_auth() || frag->is_frozen() || frag->is_freezing())
+      continue;  // non-auth, frozen or freezing dirfrags are left untouched
+    frag->try_remove_dentries_for_stray();
+  }
+  if (diri->snaprealm)
+    return;  // with a snaprealm the dirty rstat/scatter state is kept
+  if (diri->is_auth())
+    diri->clear_dirty_rstat();
+  diri->clear_scatter_dirty();
+}
+
+bool MDCache::dump_inode(Formatter *f, uint64_t number) {
+  CInode *const target = get_inode(number);
+  if (target == nullptr)
+    return false;  // not in the cache: nothing is written to f
+  // emit the inode as a single "inode" object section
+  f->open_object_section("inode");
+  target->dump(f, CInode::DUMP_DEFAULT | CInode::DUMP_PATH);
+  f->close_section();
+  return true;
+}
+
+void MDCache::handle_mdsmap(const MDSMap &mdsmap, const MDSMap &oldmap) {
+  const mds_rank_t max_mds = mdsmap.get_max_mds();
+
+  // Every new MDSMap is a chance to retry the inodes parked in
+  // export_pin_delayed_queue: pins below the new max_mds are released.
+  auto &delayed = export_pin_delayed_queue;
+  auto cur = delayed.begin();
+  while (cur != delayed.end()) {
+    auto *pinned = *cur;
+    const mds_rank_t export_pin = pinned->get_export_pin(false);
+    dout(10) << " delayed export_pin=" << export_pin << " on " << *pinned
+             << " max_mds=" << max_mds << dendl;
+    if (export_pin < max_mds) {
+      // leave the delayed state and queue the pin for processing
+      pinned->state_clear(CInode::STATE_DELAYEDEXPORTPIN);
+      cur = delayed.erase(cur);
+      pinned->queue_export_pin(export_pin);
+    } else {
+      ++cur;
+    }
+  }
+
+  if (max_mds != oldmap.get_max_mds()) {
+    dout(10) << "Checking ephemerally pinned directories for redistribute due to max_mds change." << dendl;
+    /* walk a copy so that removals from the set during the loop are harmless */
+    std::vector<CInode*> migrate(export_ephemeral_pins.begin(), export_ephemeral_pins.end());
+    for (auto *in : migrate)
+      in->maybe_export_pin();
+  }
+
+  // Recompute export_ephemeral_dist_frag_bits for the new max_mds.
+  unsigned bits = 0;
+  if (max_mds > 1) {
+    double want = g_conf().get_val<double>("mds_export_ephemeral_distributed_factor");
+    want *= max_mds;
+    // smallest bit count whose power of two reaches the truncated target
+    while ((1U << bits) < (unsigned)want)
+      ++bits;
+  }
+  export_ephemeral_dist_frag_bits = bits;
+}
+
+void MDCache::upkeep_main(void)  // main loop of the upkeep thread: periodic cache trim and free-memory release
+{
+  std::unique_lock lock(upkeep_mutex);
+  while (!upkeep_trim_shutdown.load()) {
+    auto now = clock::now();
+    auto since = now-upkeep_last_trim;
+    auto trim_interval = clock::duration(g_conf().get_val<std::chrono::seconds>("mds_cache_trim_interval"));
+    if (since >= trim_interval*.90) {  // trim once 90% of the configured interval has passed
+      lock.unlock(); /* lock order is mds_lock -> upkeep_mutex, so release ours before taking mds_lock */
+      std::scoped_lock mds_lock(mds->mds_lock);
+      lock.lock();
+      if (upkeep_trim_shutdown.load())  // re-check: the flag may have been set while upkeep_mutex was released
+        return;
+      check_memory_usage();
+      if (mds->is_cache_trimmable()) {
+        dout(20) << "upkeep thread trimming cache; last trim " << since << " ago" << dendl;
+        bool active_with_clients = mds->is_active() || mds->is_clientreplay() || mds->is_stopping();
+        if (active_with_clients) {
+          trim_client_leases();
+        }
+        if (is_open()) {
+          trim();
+        }
+        if (active_with_clients) {
+          auto recall_flags = Server::RecallFlags::ENFORCE_MAX|Server::RecallFlags::ENFORCE_LIVENESS;
+          if (cache_toofull()) {  // additionally request TRIM while the cache is too full
+            recall_flags = recall_flags|Server::RecallFlags::TRIM;
+          }
+          mds->server->recall_client_state(nullptr, recall_flags);
+        }
+        upkeep_last_trim = now = clock::now();
+      } else {
+        dout(10) << "cache not ready for trimming" << dendl;
+      }
+    } else {
+      trim_interval -= since;  // time left until the next trim is due
+    }
+    since = now-upkeep_last_release;
+    auto release_interval = clock::duration(g_conf().get_val<std::chrono::seconds>("mds_cache_release_free_interval"));
+    if (since >= release_interval*.90) {  // same 90% rule for releasing free heap memory
+      /* XXX not necessary once MDCache uses PriorityCache */
+      dout(10) << "releasing free memory" << dendl;
+      ceph_heap_release_free_memory();
+      upkeep_last_release = clock::now();
+    } else {
+      release_interval -= since;  // time left until the next release is due
+    }
+    auto interval = std::min(release_interval, trim_interval);  // wait for whichever is due first
+    dout(20) << "upkeep thread waiting interval " << interval << dendl;
+    upkeep_cvar.wait_for(lock, interval);  // returns early if upkeep_cvar is notified
+  }
+}
diff --git a/src/mds/MDCache.h b/src/mds/MDCache.h
new file mode 100644
index 000000000..56fe1164b
--- /dev/null
+++ b/src/mds/MDCache.h
@@ -0,0 +1,1366 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software 
+ * Foundation.  See file COPYING.
+ * 
+ */
+#ifndef CEPH_MDCACHE_H
+#define CEPH_MDCACHE_H
+
+#include <atomic>
+#include <string_view>
+#include <thread>
+
+#include "common/DecayCounter.h"
+#include "include/common_fwd.h"
+#include "include/types.h"
+#include "include/filepath.h"
+#include "include/elist.h"
+
+#include "messages/MCacheExpire.h"
+#include "messages/MClientQuota.h"
+#include "messages/MClientRequest.h"
+#include "messages/MClientSnap.h"
+#include "messages/MDentryLink.h"
+#include "messages/MDentryUnlink.h"
+#include "messages/MDirUpdate.h"
+#include "messages/MDiscover.h"
+#include "messages/MDiscoverReply.h"
+#include "messages/MGatherCaps.h"
+#include "messages/MGenericMessage.h"
+#include "messages/MInodeFileCaps.h"
+#include "messages/MLock.h"
+#include "messages/MMDSCacheRejoin.h"
+#include "messages/MMDSFindIno.h"
+#include "messages/MMDSFindInoReply.h"
+#include "messages/MMDSFragmentNotify.h"
+#include "messages/MMDSFragmentNotifyAck.h"
+#include "messages/MMDSOpenIno.h"
+#include "messages/MMDSOpenInoReply.h"
+#include "messages/MMDSResolve.h"
+#include "messages/MMDSResolveAck.h"
+#include "messages/MMDSPeerRequest.h"
+#include "messages/MMDSSnapUpdate.h"
+
+#include "osdc/Filer.h"
+#include "CInode.h"
+#include "CDentry.h"
+#include "CDir.h"
+#include "include/Context.h"
+#include "events/EMetaBlob.h"
+#include "RecoveryQueue.h"
+#include "StrayManager.h"
+#include "OpenFileTable.h"
+#include "MDSContext.h"
+#include "MDSMap.h"
+#include "Mutation.h"
+
+// Forward declarations of MDS types that the declarations below refer to
+// by name. Each type is declared exactly once.
+class ESubtreeMap;
+class MDSRank;
+class Migrator;
+class Session;
+
+
+enum {  // perf counter indices for the "mds_cache" set built in MDCache::register_perfcounters()
+  l_mdc_first = 3000,  // lower bound passed to PerfCountersBuilder (no counter is registered for it)
+  // "num_strays": how many inodes currently in stray dentries
+  l_mdc_num_strays,
+  // "num_strays_delayed": stray dentries currently delayed for purge due to refs
+  l_mdc_num_strays_delayed,
+  // "num_strays_enqueuing": stray dentries currently being enqueued for purge
+  l_mdc_num_strays_enqueuing,
+
+  // "strays_created": dentries that have ever been added to a stray dir
+  l_mdc_strays_created,
+  // "strays_enqueued": dentries that have been passed on to PurgeQueue
+  l_mdc_strays_enqueued,
+  // "strays_reintegrated": strays that have been reintegrated
+  l_mdc_strays_reintegrated,
+  // "strays_migrated": strays that have been migrated
+  l_mdc_strays_migrated,
+
+  // "num_recovering_processing": inode sizes currently being recovered
+  l_mdc_num_recovering_processing,
+  // "num_recovering_enqueued": inodes currently waiting to have their size recovered
+  l_mdc_num_recovering_enqueued,
+  // "num_recovering_prioritized": inodes waiting with elevated priority for recovery
+  l_mdc_num_recovering_prioritized,
+  // "recovery_started": inodes that ever started size recovery
+  l_mdc_recovery_started,
+  // "recovery_completed": inodes that ever completed size recovery
+  l_mdc_recovery_completed,
+
+  l_mdss_ireq_enqueue_scrub,  // "ireq_enqueue_scrub": internal requests of type enqueue scrub
+  l_mdss_ireq_exportdir,      // "ireq_exportdir": internal requests of type export dir
+  l_mdss_ireq_flush,          // "ireq_flush": internal requests of type flush
+  l_mdss_ireq_fragmentdir,    // "ireq_fragmentdir": internal requests of type fragmentdir
+  l_mdss_ireq_fragstats,      // "ireq_fragstats": internal requests of type frag stats
+  l_mdss_ireq_inodestats,     // "ireq_inodestats": internal requests of type inode stats
+
+  l_mdc_last,  // upper bound passed to PerfCountersBuilder; keep it as the final enumerator
+};
+
+// flags for path_traverse(); one bit each, so they can be OR-ed together
+static constexpr int MDS_TRAVERSE_DISCOVER        = 1 << 0;
+static constexpr int MDS_TRAVERSE_PATH_LOCKED     = 1 << 1;
+static constexpr int MDS_TRAVERSE_WANT_DENTRY     = 1 << 2;
+static constexpr int MDS_TRAVERSE_WANT_AUTH       = 1 << 3;
+static constexpr int MDS_TRAVERSE_RDLOCK_SNAP     = 1 << 4;
+static constexpr int MDS_TRAVERSE_RDLOCK_SNAP2    = 1 << 5;
+static constexpr int MDS_TRAVERSE_WANT_DIRLAYOUT  = 1 << 6;
+static constexpr int MDS_TRAVERSE_RDLOCK_PATH     = 1 << 7;
+static constexpr int MDS_TRAVERSE_XLOCK_DENTRY    = 1 << 8;
+static constexpr int MDS_TRAVERSE_RDLOCK_AUTHLOCK = 1 << 9;
+static constexpr int MDS_TRAVERSE_CHECK_LOCKCACHE = 1 << 10;
+
+
+// flags for predirty_journal_parents(); one bit each
+static constexpr int PREDIRTY_PRIMARY = 1 << 0; // primary dn, adjust nested accounting
+static constexpr int PREDIRTY_DIR     = 1 << 1; // update parent dir mtime/size
+static constexpr int PREDIRTY_SHALLOW = 1 << 2; // only go to immediate parent (for easier rollback)
+
+class MDCache {
+ public:
+  using expiremap = std::map<mds_rank_t, ref_t<MCacheExpire>>;  // one MCacheExpire message per rank
+
+  using clock = ceph::coarse_mono_clock;  // clock behind upkeep_last_trim / upkeep_last_release
+  using time = ceph::coarse_mono_time;
+
+  // -- discover --
+  struct discover_info_t {
+    ceph_tid_t tid = 0;
+    mds_rank_t mds = -1;
+    inodeno_t ino;
+    frag_t frag;
+    snapid_t snap = CEPH_NOSNAP;
+    filepath want_path;
+    CInode *basei = nullptr;  // holds a PIN_DISCOVERBASE pin while non-null
+    bool want_base_dir = false;
+    bool path_locked = false;
+    discover_info_t() {}
+    ~discover_info_t() {  // releases the pin taken by pin_base(), if any
+      if (basei != nullptr)
+	basei->put(MDSCacheObject::PIN_DISCOVERBASE);
+    }
+    // Remember b as the base inode and pin it with PIN_DISCOVERBASE.
+    void pin_base(CInode *b) {
+      b->get(MDSCacheObject::PIN_DISCOVERBASE);
+      basei = b;
+    }
+  };
+
+  // [reconnect/rejoin caps]
+  struct reconnected_cap_info_t {
+    inodeno_t realm_ino = 0;    // filled from cap_reconnect_t::capinfo.snaprealm
+    snapid_t snap_follows = 0;  // filled from cap_reconnect_t::snap_follows
+    int dirty_caps = 0;         // OR of the bits given to set_reconnected_dirty_caps()
+    bool snapflush = false;     // stays set once a snapflush has been reported
+    reconnected_cap_info_t() {}
+  };
+
+  // -- find_ino_peer --
+  struct find_ino_peer_info_t {
+    inodeno_t ino;
+    ceph_tid_t tid = 0;
+    MDSContext *fin = nullptr;            // NOTE(review): presumably completed when the search ends - confirm
+    bool path_locked = false;
+    mds_rank_t hint = MDS_RANK_NONE;      // MDS_RANK_NONE means no hint
+    mds_rank_t checking = MDS_RANK_NONE;  // NOTE(review): looks like the rank currently being asked - confirm
+    std::set<mds_rank_t> checked;         // NOTE(review): presumably the ranks already asked - confirm
+    find_ino_peer_info_t() {}
+  };
+
+  friend class C_MDC_RejoinOpenInoFinish;   // NOTE(review): presumably the context behind rejoin_open_ino_finish() - confirm
+  friend class C_MDC_RejoinSessionsOpened;  // NOTE(review): presumably the context behind rejoin_open_sessions_finish() - confirm
+
+  friend class Locker;
+  friend class Migrator;
+  friend class MDBalancer;
+
+  // StrayManager needs to be able to remove_inode() from us
+  // when it is done purging
+  friend class StrayManager;
+  // purge_queue_ is taken by reference; NOTE(review): presumably it must outlive the cache - confirm
+  explicit MDCache(MDSRank *m, PurgeQueue &purge_queue_);
+  ~MDCache();
+
+  // Configured cache memory limit (cache_memory_limit). Declared const like the
+  // other cache_* accessors so that it can be called on a const MDCache.
+  uint64_t cache_limit_memory(void) const { return cache_memory_limit; }
+  // How far usage is above the unreserved part of the limit, i.e.
+  // limit*(1-cache_reservation), as a fraction of that part; never negative.
+  double cache_toofull_ratio(void) const {
+    const double usable = cache_memory_limit*(1.0-cache_reservation);
+    const double excess = cache_size()-usable;
+    return fmax(0.0, excess/usable);
+  }
+  // True while cache_toofull_ratio() is positive.
+  bool cache_toofull(void) const { return cache_toofull_ratio() > 0.0; }
+  // Bytes currently allocated from the mds_co mempool.
+  uint64_t cache_size(void) const { return mempool::get_pool(mempool::mds_co::id).allocated_bytes(); }
+  // True when usage is above limit*cache_health_threshold.
+  bool cache_overfull(void) const { return cache_size() > cache_memory_limit*cache_health_threshold; }
+
+  void advance_stray();
+
+  // Bit count computed by handle_mdsmap(): 0 when max_mds <= 1, otherwise the
+  // smallest n with 2^n >= max_mds * mds_export_ephemeral_distributed_factor
+  // (the product is truncated to an unsigned value first).
+  unsigned get_ephemeral_dist_frag_bits() const { return export_ephemeral_dist_frag_bits; }
+
+  // Current value of the export_ephemeral_distributed_config flag.
+  bool get_export_ephemeral_distributed_config(void) const { return export_ephemeral_distributed_config; }
+
+  // Current value of the export_ephemeral_random_config flag.
+  bool get_export_ephemeral_random_config(void) const { return export_ephemeral_random_config; }
+
+  /**
+   * Hand a stray dentry (e.g. one that was just created) to the StrayManager
+   * for evaluation. dn must live in a stray directory (asserted); nothing is
+   * done if dn is already in STATE_PURGING.
+   */
+  void notify_stray(CDentry *dn) {
+    ceph_assert(dn->get_dir()->get_inode()->is_stray());
+    const bool purging = dn->state_test(CDentry::STATE_PURGING);
+    if (!purging)
+      stray_manager.eval_stray(dn);
+  }
+
+  mds_rank_t hash_into_rank_bucket(inodeno_t ino, frag_t fg=0);
+
+  void maybe_eval_stray(CInode *in, bool delay=false);  // delay=true queues the dentry with the StrayManager instead of evaluating it now
+  void clear_dirty_bits_for_stray(CInode* diri);        // asserts that diri's projected parent dir belongs to a stray inode
+
+  bool is_readonly() { return readonly; }               // current value of the readonly flag
+  void force_readonly();
+
+  static file_layout_t gen_default_file_layout(const MDSMap &mdsmap);
+  static file_layout_t gen_default_log_layout(const MDSMap &mdsmap);
+
+  void register_perfcounters();                         // builds and registers the "mds_cache" perf counters
+
+  void touch_client_lease(ClientLease *r, int pool, utime_t ttl) {
+    r->ttl = ttl;                                   // record the lease's new ttl
+    client_leases[pool].push_back(&r->item_lease);  // append the lease to client_leases[pool]
+  }
+
+  // Forwards to stray_manager.notify_stray_removed().
+  void notify_stray_removed() {
+    stray_manager.notify_stray_removed();
+  }
+
+  // Forwards to stray_manager.notify_stray_created().
+  void notify_stray_created() {
+    stray_manager.notify_stray_created();
+  }
+
+  // Forwards dn to stray_manager.eval_remote().
+  void eval_remote(CDentry *dn) {
+    stray_manager.eval_remote(dn);
+  }
+
+  void _send_discover(discover_info_t& dis);
+  discover_info_t& _create_discover(mds_rank_t mds) {  // new record in discovers, keyed by the next tid
+    const ceph_tid_t next_tid = ++discover_last_tid;
+    auto& info = discovers[next_tid];
+    info.mds = mds;
+    info.tid = next_tid;
+    return info;
+  }
+
+  void discover_base_ino(inodeno_t want_ino, MDSContext *onfinish, mds_rank_t from=MDS_RANK_NONE);  // NOTE(review): onfinish presumably fires when the discover ends - confirm
+  void discover_dir_frag(CInode *base, frag_t approx_fg, MDSContext *onfinish,
+			 mds_rank_t from=MDS_RANK_NONE);
+  void discover_path(CInode *base, snapid_t snap, filepath want_path, MDSContext *onfinish,  // want_path is taken by value
+		     bool path_locked=false, mds_rank_t from=MDS_RANK_NONE);
+  void discover_path(CDir *base, snapid_t snap, filepath want_path, MDSContext *onfinish,    // overload starting at a dirfrag; takes no `from` rank
+		     bool path_locked=false);
+  void kick_discovers(mds_rank_t who);  // after a failure.
+
+  // adjust subtree auth specification
+  //  dir->dir_auth
+  //  imports/exports/nested_exports
+  //  join/split subtrees as appropriate
+  bool is_subtrees() { return !subtrees.empty(); }  // true once at least one subtree is recorded
+  // Append every subtree root to c; a std::vector<CDir*> is grown once up front.
+  template<typename T>
+  void get_subtrees(T& c) {
+    if constexpr (std::is_same_v<T, std::vector<CDir*>>)
+      c.reserve(c.size() + subtrees.size());
+    for (const auto& entry : subtrees)
+      c.push_back(entry.first);
+  }
+  void adjust_subtree_auth(CDir *root, mds_authority_t auth, bool adjust_pop=true);
+  void adjust_subtree_auth(CDir *root, mds_rank_t a, mds_rank_t b=CDIR_AUTH_UNKNOWN) {  // convenience: builds mds_authority_t(a,b)
+    adjust_subtree_auth(root, mds_authority_t(a,b));
+  }
+  void adjust_bounded_subtree_auth(CDir *dir, const set<CDir*>& bounds, mds_authority_t auth);
+  void adjust_bounded_subtree_auth(CDir *dir, const set<CDir*>& bounds, mds_rank_t a) {  // convenience: second rank is CDIR_AUTH_UNKNOWN
+    adjust_bounded_subtree_auth(dir, bounds, mds_authority_t(a, CDIR_AUTH_UNKNOWN));
+  }
+  void adjust_bounded_subtree_auth(CDir *dir, const vector<dirfrag_t>& bounds, const mds_authority_t &auth);
+  void adjust_bounded_subtree_auth(CDir *dir, const vector<dirfrag_t>& bounds, mds_rank_t a) {  // same, with bounds given as dirfrag ids
+    adjust_bounded_subtree_auth(dir, bounds, mds_authority_t(a, CDIR_AUTH_UNKNOWN));
+  }
+  void map_dirfrag_set(const list<dirfrag_t>& dfs, set<CDir*>& result);  // result is an out-parameter
+  void try_subtree_merge(CDir *root);
+  void try_subtree_merge_at(CDir *root, set<CInode*> *to_eval, bool adjust_pop=true);
+  void eval_subtree_root(CInode *diri);
+  CDir *get_subtree_root(CDir *dir);
+  CDir *get_projected_subtree_root(CDir *dir);
+  // dir must be a key of subtrees (asserted); true if its entry there is empty.
+  bool is_leaf_subtree(CDir *dir) {
+    ceph_assert(subtrees.count(dir));
+    const auto& nested = subtrees[dir];
+    return nested.empty();
+  }
+  void remove_subtree(CDir *dir);
+  bool is_subtree(CDir *root) { return subtrees.count(root); }
+  void get_subtree_bounds(CDir *root, set<CDir*>& bounds);          // bounds is an out-parameter
+  void get_wouldbe_subtree_bounds(CDir *root, set<CDir*>& bounds);  // bounds is an out-parameter
+  void verify_subtree_bounds(CDir *root, const set<CDir*>& bounds);
+  void verify_subtree_bounds(CDir *root, const list<dirfrag_t>& bounds);  // overload taking the bounds as dirfrag ids
+
+  void project_subtree_rename(CInode *diri, CDir *olddir, CDir *newdir);
+  void adjust_subtree_after_rename(CInode *diri, CDir *olddir, bool pop);
+
+  // Roots of all subtrees whose root dirfrag reports is_auth().
+  auto get_auth_subtrees() {
+    std::vector<CDir*> roots;
+    for (const auto& entry : subtrees) {
+      CDir *root = entry.first;
+      if (root->is_auth())
+        roots.push_back(root);
+    }
+    return roots;
+  }
+
+  // Roots of all subtrees whose root dirfrag reports is_full_dir_auth().
+  auto get_fullauth_subtrees() {
+    std::vector<CDir*> roots;
+    for (const auto& entry : subtrees) {
+      CDir *root = entry.first;
+      if (root->is_full_dir_auth())
+        roots.push_back(root);
+    }
+    return roots;
+  }
+  // Number of subtrees whose root dirfrag reports is_full_dir_auth()
+  // (returned as std::size_t).
+  auto num_subtrees_fullauth() const {
+    std::size_t count = 0;
+    for (const auto& entry : subtrees) {
+      if (entry.first->is_full_dir_auth())
+        ++count;
+    }
+    return count;
+  }
+
+  // Number of subtrees whose root dirfrag reports is_full_dir_nonauth()
+  // (returned as std::size_t).
+  auto num_subtrees_fullnonauth() const {
+    std::size_t count = 0;
+    for (const auto& entry : subtrees) {
+      if (entry.first->is_full_dir_nonauth())
+        ++count;
+    }
+    return count;
+  }
+
+  // Total number of entries in subtrees, whatever
+  // their auth state.
+  auto num_subtrees() const { return subtrees.size(); }
+
+  int get_num_client_requests();
+
+  MDRequestRef request_start(const cref_t<MClientRequest>& req);
+  MDRequestRef request_start_peer(metareqid_t rid, __u32 attempt, const cref_t<Message> &m);
+  MDRequestRef request_start_internal(int op);  // op is a CEPH_MDS_OP_* code, e.g. CEPH_MDS_OP_FLUSH
+  bool have_request(metareqid_t rid) {  // true if rid is a key of active_requests
+    return active_requests.count(rid);
+  }
+  MDRequestRef request_get(metareqid_t rid);
+  void request_pin_ref(MDRequestRef& r, CInode *ref, vector<CDentry*>& trace);
+  void request_finish(MDRequestRef& mdr);
+  void request_forward(MDRequestRef& mdr, mds_rank_t mds, int port=0);
+  void dispatch_request(MDRequestRef& mdr);
+  void request_drop_foreign_locks(MDRequestRef& mdr);
+  void request_drop_non_rdlocks(MDRequestRef& r);
+  void request_drop_locks(MDRequestRef& r);
+  void request_cleanup(MDRequestRef& r);
+  
+  void request_kill(MDRequestRef& r);  // called when session closes
+
+  // journal/snap helpers
+  CInode *pick_inode_snap(CInode *in, snapid_t follows);
+  CInode *cow_inode(CInode *in, snapid_t last);
+  void journal_cow_dentry(MutationImpl *mut, EMetaBlob *metablob, CDentry *dn,
+                          snapid_t follows=CEPH_NOSNAP,
+			  CInode **pcow_inode=0, CDentry::linkage_t *dnl=0);  // pcow_inode and dnl are optional (null by default)
+  void journal_dirty_inode(MutationImpl *mut, EMetaBlob *metablob, CInode *in, snapid_t follows=CEPH_NOSNAP);
+
+  void project_rstat_inode_to_frag(const MutationRef& mut,
+				   CInode *cur, CDir *parent, snapid_t first,
+				   int linkunlink, SnapRealm *prealm);
+  void _project_rstat_inode_to_frag(const CInode::mempool_inode* inode, snapid_t ofirst, snapid_t last,
+				    CDir *parent, int linkunlink, bool update_inode);
+  void project_rstat_frag_to_inode(const nest_info_t& rstat, const nest_info_t& accounted_rstat,
+				   snapid_t ofirst, snapid_t last, CInode *pin, bool cow_head);
+  void broadcast_quota_to_client(CInode *in, client_t exclude_ct = -1, bool quota_change = false);
+  void predirty_journal_parents(MutationRef mut, EMetaBlob *blob,
+				CInode *in, CDir *parent,
+				int flags, int linkunlink=0,  // flags: OR of the PREDIRTY_* constants
+				snapid_t follows=CEPH_NOSNAP);
+
+  // peers
+  // Create or overwrite the uncommitted_leaders entry for reqid.
+  void add_uncommitted_leader(metareqid_t reqid, LogSegment *ls, set<mds_rank_t> &peers, bool safe=false) {
+    auto &entry = uncommitted_leaders[reqid];
+    entry.ls = ls;
+    entry.peers = peers;
+    entry.safe = safe;
+  }
+  void wait_for_uncommitted_leader(metareqid_t reqid, MDSContext *c) { uncommitted_leaders[reqid].waiters.push_back(c); }
+  bool have_uncommitted_leader(metareqid_t reqid, mds_rank_t from) {  // is `from` among the peers recorded for reqid?
+    auto it = uncommitted_leaders.find(reqid);
+    return it != uncommitted_leaders.end() && it->second.peers.count(from) != 0;
+  }
+  void log_leader_commit(metareqid_t reqid);
+  void logged_leader_update(metareqid_t reqid);
+  void _logged_leader_commit(metareqid_t reqid);
+  void committed_leader_peer(metareqid_t r, mds_rank_t from);
+  void finish_committed_leaders();
+
+  void add_uncommitted_peer(metareqid_t reqid, LogSegment*, mds_rank_t, MDPeerUpdate *su=nullptr);
+  void wait_for_uncommitted_peer(metareqid_t reqid, MDSContext *c) {  // uses at(): reqid must already have an entry
+    uncommitted_peers.at(reqid).waiters.push_back(c);
+  }
+  void finish_uncommitted_peer(metareqid_t reqid, bool assert_exist=true);
+  MDPeerUpdate* get_uncommitted_peer(metareqid_t reqid, mds_rank_t leader);
+  void _logged_peer_commit(mds_rank_t from, metareqid_t reqid);
+
+  void set_recovery_set(set<mds_rank_t>& s);
+  void handle_mds_failure(mds_rank_t who);
+  void handle_mds_recovery(mds_rank_t who);
+
+  void recalc_auth_bits(bool replay);
+  void remove_inode_recursive(CInode *in);
+
+  bool is_ambiguous_peer_update(metareqid_t reqid, mds_rank_t leader) {  // is reqid recorded under leader?
+    auto by_leader = ambiguous_peer_updates.find(leader);
+    if (by_leader == ambiguous_peer_updates.end())
+      return false;
+    return by_leader->second.count(reqid) != 0;
+  }
+  void add_ambiguous_peer_update(metareqid_t reqid, mds_rank_t leader) { ambiguous_peer_updates[leader].insert(reqid); }
+  void remove_ambiguous_peer_update(metareqid_t reqid, mds_rank_t leader) {  // (leader, reqid) must be recorded
+    auto p = ambiguous_peer_updates.find(leader);
+    ceph_assert(p != ambiguous_peer_updates.end());  // never dereference end() for an unknown leader
+    const auto erased = p->second.erase(reqid);
+    ceph_assert(erased == 1);                        // reqid has to be recorded under this leader
+    if (p->second.empty())                           // drop the leader once its last reqid is gone
+      ambiguous_peer_updates.erase(p);
+  }
+
+  // Record `leader` for reqid in resolve_need_rollback
+  // (overwrites any previous entry for the same reqid).
+  void add_rollback(metareqid_t reqid, mds_rank_t leader) { resolve_need_rollback[reqid] = leader; }
+  void finish_rollback(metareqid_t reqid, MDRequestRef& mdr);
+
+  // ambiguous imports
+  void add_ambiguous_import(dirfrag_t base, const vector<dirfrag_t>& bounds);
+  void add_ambiguous_import(CDir *base, const set<CDir*>& bounds);
+  // True if an ambiguous import is recorded for base.
+  bool have_ambiguous_import(dirfrag_t base) { return my_ambiguous_imports.count(base) != 0; }
+  // Copy the bounds recorded for the ambiguous import base into bounds
+  // (asserts that base is recorded).
+  void get_ambiguous_import_bounds(dirfrag_t base, vector<dirfrag_t>& bounds) {
+    ceph_assert(my_ambiguous_imports.count(base));
+    const auto& recorded = my_ambiguous_imports[base];
+    bounds = recorded;
+  }
+  void cancel_ambiguous_import(CDir *);
+  void finish_ambiguous_import(dirfrag_t dirino);
+  void resolve_start(MDSContext *resolve_done_);
+  void send_resolves();
+  // Calls send_subtree_resolves() only while resolves_pending is set.
+  void maybe_send_pending_resolves() { if (resolves_pending) send_subtree_resolves(); }
+  
+  void _move_subtree_map_bound(dirfrag_t df, dirfrag_t oldparent, dirfrag_t newparent,
+			       map<dirfrag_t,vector<dirfrag_t> >& subtrees);  // works on the map passed in (this parameter shadows the member named subtrees)
+  ESubtreeMap *create_subtree_map();
+
+  void clean_open_file_lists();
+  void dump_openfiles(Formatter *f);
+  bool dump_inode(Formatter *f, uint64_t number);  // false if inode `number` is not in the cache
+
+  void rejoin_start(MDSContext *rejoin_done_);
+  void rejoin_gather_finish();
+  void rejoin_send_rejoins();
+  // Store a copy of icr in cap_exports[ino] for client and set the entry's target; drop_path clears the copy's path.
+  void rejoin_export_caps(inodeno_t ino, client_t client, const cap_reconnect_t& icr, int target=-1, bool drop_path=false) {
+    auto& exported = cap_exports[ino];
+    exported.first = target;
+    auto& stored = (exported.second[client] = icr);
+    if (drop_path)
+      stored.path.clear();
+  }
+  // Store a copy of icr in cap_imports[ino][client][frommds]; drop_path clears the copy's path.
+  void rejoin_recovered_caps(inodeno_t ino, client_t client, const cap_reconnect_t& icr, mds_rank_t frommds=MDS_RANK_NONE, bool drop_path=false) {
+    auto& stored = (cap_imports[ino][client][frommds] = icr);
+    if (drop_path)
+      stored.path.clear();
+  }
+  // Record inst for a recovered client; emplace() keeps an entry that
+  // already exists for the client.
+  void rejoin_recovered_client(client_t client, const entity_inst_t& inst) { rejoin_client_map.emplace(client, inst); }
+  // True if cap_imports has any entry for ino.
+  bool rejoin_has_cap_reconnect(inodeno_t ino) const { return cap_imports.count(ino) != 0; }
+  // Adds ino to cap_imports_missing.
+  void add_replay_ino_alloc(inodeno_t ino) {
+    cap_imports_missing.insert(ino); // avoid opening ino during cache rejoin
+  }
+  // Record stored in cap_imports for (ino, client) under MDS_RANK_NONE, if any.
+  const cap_reconnect_t *get_replay_cap_reconnect(inodeno_t ino, client_t client) {
+    if (auto i = cap_imports.find(ino); i != cap_imports.end())
+      if (auto c = i->second.find(client); c != i->second.end())
+        if (auto r = c->second.find(MDS_RANK_NONE); r != c->second.end())
+          return &r->second;
+    return nullptr;  // one of the three levels has no matching entry
+  }
+  // Drop the whole cap_imports entry for ino. Expects exactly one client
+  // entry for ino, holding exactly one record for `client` (both asserted).
+  void remove_replay_cap_reconnect(inodeno_t ino, client_t client) {
+    ceph_assert(cap_imports[ino].size() == 1);
+    ceph_assert(cap_imports[ino][client].size() == 1);
+    cap_imports.erase(ino);
+  }
+  void wait_replay_cap_reconnect(inodeno_t ino, MDSContext *c) { cap_reconnect_waiters[ino].push_back(c); }
+
+  // Note the snaprealm ino and snap_follows of icr in reconnected_caps[ino][client].
+  void add_reconnected_cap(client_t client, inodeno_t ino, const cap_reconnect_t& icr) {
+    auto &rec = reconnected_caps[ino][client];
+    rec.snap_follows = icr.snap_follows;
+    rec.realm_ino = inodeno_t(icr.capinfo.snaprealm);
+  }
+  // Accumulate dirty cap bits for (ino, client); snapflush stays set once it was true.
+  void set_reconnected_dirty_caps(client_t client, inodeno_t ino, int dirty, bool snapflush) {
+    auto &rec = reconnected_caps[ino][client];
+    rec.dirty_caps |= dirty;
+    rec.snapflush = rec.snapflush || snapflush;
+  }
+  // Store seq in reconnected_snaprealms[ino][client], replacing any earlier value.
+  void add_reconnected_snaprealm(client_t client, inodeno_t ino, snapid_t seq) { reconnected_snaprealms[ino][client] = seq; }
+
+  void rejoin_open_ino_finish(inodeno_t ino, int ret);
+  void rejoin_prefetch_ino_finish(inodeno_t ino, int ret);
+  void rejoin_open_sessions_finish(map<client_t,pair<Session*,uint64_t> >& session_map);
+  bool process_imported_caps();
+  void choose_lock_states_and_reconnect_caps();
+  void prepare_realm_split(SnapRealm *realm, client_t client, inodeno_t ino,
+			   map<client_t,ref_t<MClientSnap>>& splits);
+  void prepare_realm_merge(SnapRealm *realm, SnapRealm *parent_realm, map<client_t,ref_t<MClientSnap>>& splits);
+  void send_snaps(map<client_t,ref_t<MClientSnap>>& splits);  // splits: MClientSnap message per client
+  Capability* rejoin_import_cap(CInode *in, client_t client, const cap_reconnect_t& icr, mds_rank_t frommds);
+  void finish_snaprealm_reconnect(client_t client, SnapRealm *realm, snapid_t seq,
+				  map<client_t,ref_t<MClientSnap>>& updates);
+  Capability* try_reconnect_cap(CInode *in, Session *session);
+  void export_remaining_imported_caps();
+
+  void do_cap_import(Session *session, CInode *in, Capability *cap,
+		     uint64_t p_cap_id, ceph_seq_t p_seq, ceph_seq_t p_mseq,
+		     int peer, int p_flags);
+  void do_delayed_cap_imports();
+  void rebuild_need_snapflush(CInode *head_in, SnapRealm *realm, client_t client,
+			      snapid_t snap_follows);
+  void open_snaprealms();
+
+  bool open_undef_inodes_dirfrags();
+  void opened_undef_inode(CInode *in);
+  void opened_undef_dirfrag(CDir *dir) {  // removes dir from rejoin_undef_dirfrags
+    rejoin_undef_dirfrags.erase(dir);
+  }
+
+  void reissue_all_caps();
+
+  void start_files_to_recover();
+  void do_file_recover();
+  void queue_file_recover(CInode *in);
+  void _queued_file_recover_cow(CInode *in, MutationRef& mut);
+
+  void handle_conf_change(const std::set<std::string>& changed, const MDSMap& mds_map);
+  
+  // debug
+  void log_stat();
+
+  // root inode
+  CInode *get_root() { return root; }  // plain accessors: return the stored pointers as they are
+  CInode *get_myin() { return myin; }
+
+  size_t get_cache_size() { return lru.lru_get_size(); }  // size reported by lru (not a byte count; see cache_size() for bytes)
+
+  // trimming
+  std::pair<bool, uint64_t> trim(uint64_t count=0);
+
+  bool trim_non_auth_subtree(CDir *directory);
+  void standby_trim_segment(LogSegment *ls);
+  void try_trim_non_auth_subtree(CDir *dir);
+  bool can_trim_non_auth_dirfrag(CDir *dir) {  // no ambiguous import for the dirfrag, inode not in uncommitted_peer_rename_olddir
+    const bool ambiguous = my_ambiguous_imports.count(dir->dirfrag()) != 0;
+    return !ambiguous && uncommitted_peer_rename_olddir.count(dir->inode) == 0;
+  }
+
+  /**
+   * For all unreferenced inodes, dirs, dentries below an inode, compose
+   * expiry messages.  This is used when giving up all replicas of entities
+   * for an MDS peer in the 'stopping' state, such that the peer can
+   * empty its cache and finish shutting down.
+   *
+   * We have to make sure we're only expiring un-referenced items to
+   * avoid interfering with ongoing stray-movement (we can't distinguish
+   * between the "moving my strays" and "waiting for my cache to empty"
+   * phases within 'stopping')
+   *
+   * @return false if we completed cleanly, true if caller should stop
+   *         expiring because we hit something with refs.
+   */
+  bool expire_recursive(CInode *in, expiremap& expiremap);
+
+  void trim_client_leases();
+  void check_memory_usage();
+
+  void shutdown_start();
+  void shutdown_check();
+  bool shutdown_pass();
+  bool shutdown();                    // clear cache (i.e. at shutdown)
+  bool shutdown_export_strays();
+  void shutdown_export_stray_finish(inodeno_t ino) {  // re-runs shutdown_export_strays() only if ino was in shutdown_exporting_strays
+    if (shutdown_exporting_strays.erase(ino))
+      shutdown_export_strays();
+  }
+
+  // inode_map
+  // snapid CEPH_NOSNAP is looked up in inode_map (keyed by ino), any other
+  // snapid in snap_inode_map (keyed by the full vinodeno_t).
+  bool have_inode(vinodeno_t vino) {
+    if (vino.snapid != CEPH_NOSNAP)
+      return snap_inode_map.count(vino) != 0;
+    return inode_map.count(vino.ino) != 0;
+  }
+  // Same check for an (ino, snap) pair; snap defaults to CEPH_NOSNAP.
+  bool have_inode(inodeno_t ino, snapid_t snap=CEPH_NOSNAP) { return have_inode(vinodeno_t(ino, snap)); }
+  // Cached inode for vino, or nullptr when it is not in the cache.
+  // CEPH_NOSNAP is served from inode_map, other snapids from snap_inode_map.
+  CInode* get_inode(vinodeno_t vino) {
+    if (vino.snapid != CEPH_NOSNAP) {
+      auto snap_it = snap_inode_map.find(vino);
+      return snap_it != snap_inode_map.end() ? snap_it->second : nullptr;
+    }
+    auto head_it = inode_map.find(vino.ino);
+    return head_it != inode_map.end() ? head_it->second : nullptr;
+  }
+  // Same lookup for an (ino, snapid) pair; s defaults to CEPH_NOSNAP.
+  CInode* get_inode(inodeno_t ino, snapid_t s=CEPH_NOSNAP) {
+    const vinodeno_t vino(ino, s);
+    return get_inode(vino);
+  }
+  CInode* lookup_snap_inode(vinodeno_t vino) {
+    auto it = snap_inode_map.lower_bound(vino);  // first entry whose key is not less than vino
+    if (it == snap_inode_map.end())
+      return nullptr;
+    CInode *cand = it->second;  // must be the same ino and start at or before the wanted snapid
+    return (cand->ino() == vino.ino && cand->first <= vino.snapid) ? cand : nullptr;
+  }
+
+  // Result of CInode::get_dirfrag(df.frag) on the cached inode df.ino; nullptr if that inode is not cached.
+  CDir* get_dirfrag(dirfrag_t df) {
+    CInode *diri = get_inode(df.ino);
+    return diri ? diri->get_dirfrag(df.frag) : nullptr;
+  }
+  // Dirfrag of inode ino that pick_dirfrag() selects for the name dn; nullptr if the inode is not cached.
+  CDir* get_dirfrag(inodeno_t ino, std::string_view dn) {
+    CInode *diri = get_inode(ino);
+    if (diri == nullptr)
+      return nullptr;
+    return diri->get_dirfrag(diri->pick_dirfrag(dn));
+  }
+  // Like get_dirfrag(df), but force_dir_fragment() is tried first; the plain
+  // lookup is only used when that returns no dirfrag.
+  CDir* get_force_dirfrag(dirfrag_t df, bool replay) {
+    CInode *diri = get_inode(df.ino);
+    if (diri == nullptr)
+      return nullptr;
+    if (CDir *forced = force_dir_fragment(diri, df.frag, replay))
+      return forced;
+    return diri->get_dirfrag(df.frag);
+  }
+
+  MDSCacheObject *get_object(const MDSCacheObjectInfo &info);
+
+  void add_inode(CInode *in);
+
+  void remove_inode(CInode *in);
+
+  // Refresh dn's LRU position: STATE_BOTTOMLRU dentries get a mid-touch in
+  // bottom_lru; otherwise auth dentries get lru_touch, non-auth lru_midtouch.
+  void touch_dentry(CDentry *dn) {
+    if (dn->state_test(CDentry::STATE_BOTTOMLRU))
+      bottom_lru.lru_midtouch(dn);
+    else if (dn->is_auth())
+      lru.lru_touch(dn);
+    else
+      lru.lru_midtouch(dn);
+  }
+  void touch_dentry_bottom(CDentry *dn) {
+    // dentries flagged STATE_BOTTOMLRU are left untouched
+    if (!dn->state_test(CDentry::STATE_BOTTOMLRU))
+      lru.lru_bottouch(dn);
+  }
+
+  // truncate
+  void truncate_inode(CInode *in, LogSegment *ls);
+  void _truncate_inode(CInode *in, LogSegment *ls);
+  void truncate_inode_finish(CInode *in, LogSegment *ls);
+  void truncate_inode_logged(CInode *in, MutationRef& mut);
+
+  void add_recovered_truncate(CInode *in, LogSegment *ls);
+  void remove_recovered_truncate(CInode *in, LogSegment *ls);
+  void start_recovered_truncates();
+
+  // purge unsafe inodes
+  void start_purge_inodes();
+  void purge_inodes(const interval_set<inodeno_t>& i, LogSegment *ls);
+
+  CDir *get_auth_container(CDir *in);
+  CDir *get_export_container(CDir *dir);
+  void find_nested_exports(CDir *dir, set<CDir*>& s);
+  void find_nested_exports_under(CDir *import, CDir *dir, set<CDir*>& s);
+
+  void init_layouts();
+  void create_unlinked_system_inode(CInode *in, inodeno_t ino,
+                                    int mode) const;
+  CInode *create_system_inode(inodeno_t ino, int mode);
+  CInode *create_root_inode();
+
+  void create_empty_hierarchy(MDSGather *gather);
+  void create_mydir_hierarchy(MDSGather *gather);
+
+  bool is_open() const { return open; }  // read-only query of the `open` flag, so callable on a const MDCache
+  void wait_for_open(MDSContext *c) {  // append c to the waiting_for_open list
+    waiting_for_open.push_back(c);
+  }
+
+  void open_root_inode(MDSContext *c);
+  void open_root();
+  void open_mydir_inode(MDSContext *c);
+  void open_mydir_frag(MDSContext *c);
+  void populate_mydir();
+
+  void _create_system_file(CDir *dir, std::string_view name, CInode *in, MDSContext *fin);
+  void _create_system_file_finish(MutationRef& mut, CDentry *dn,
+                                  version_t dpv, MDSContext *fin);
+
+  void open_foreign_mdsdir(inodeno_t ino, MDSContext *c);
+  CDir *get_stray_dir(CInode *in);
+
+  /**
+   * Find the given dentry (and whether it exists or not), its ancestors,
+   * and get them all into memory and usable on this MDS. This function
+   * makes a best-effort attempt to load everything; if it needs to
+   * go away and do something then it will put the request on a waitlist.
+   * It prefers the mdr, then the req, then the fin. (At least one of these
+   * must be non-null.)
+   *
+   * NOTE(review): "req" and "fin" are not parameters of the signature below
+   * (waiters are built through cf) -- this sentence looks stale; confirm.
+   *
+   * @param mdr The MDRequest associated with the path. Can be null.
+   * @param cf A MDSContextFactory for waiter building.
+   * @param path The path to traverse to.
+   *
+   * @param flags Specifies different lookup behaviors.
+   * By default, path_traverse() forwards the request to the auth MDS if that
+   * is appropriate (ie, if it doesn't know the contents of a directory).
+   * MDS_TRAVERSE_DISCOVER: Instead of forwarding request, path_traverse()
+   * attempts to look up the path from a different MDS (and bring them into
+   * its cache as replicas).
+   * MDS_TRAVERSE_PATH_LOCKED: path_traverse() will proceed when an xlocked
+   * dentry is encountered.
+   * MDS_TRAVERSE_WANT_DENTRY: Caller wants tail dentry. Add a null dentry if
+   * tail dentry does not exist. Returns 0 even if the tail dentry is null.
+   * MDS_TRAVERSE_WANT_AUTH: Always forward request to auth MDS of target inode
+   * or auth MDS of tail dentry (if MDS_TRAVERSE_WANT_DENTRY is set).
+   *
+   * @param pdnvec Data return parameter -- on success, contains a
+   * vector of dentries. On failure, is either empty or contains the
+   * full trace of traversable dentries.
+   * @param pin Data return parameter -- if successful, points to the inode
+   * associated with filepath. If unsuccessful, is null.
+   *
+   * @returns 0 on success, 1 on "not done yet", 2 on "forwarding", -errno otherwise.
+   * If it returns 1, the requester associated with this call has been placed
+   * on the appropriate waitlist, and it should unwind itself and back out.
+   * If it returns 2 the request has been forwarded, and again the requester
+   * should unwind itself and back out.
+   */
+  int path_traverse(MDRequestRef& mdr, MDSContextFactory& cf,
+		    const filepath& path, int flags,
+		    vector<CDentry*> *pdnvec, CInode **pin=nullptr);
+
+  CInode *cache_traverse(const filepath& path);
+
+  void open_remote_dirfrag(CInode *diri, frag_t fg, MDSContext *fin);
+  CInode *get_dentry_inode(CDentry *dn, MDRequestRef& mdr, bool projected=false);
+
+  bool parallel_fetch(map<inodeno_t,filepath>& pathmap, set<inodeno_t>& missing);
+  bool parallel_fetch_traverse_dir(inodeno_t ino, filepath& path, 
+				   set<CDir*>& fetch_queue, set<inodeno_t>& missing,
+				   C_GatherBuilder &gather_bld);
+
+  void open_remote_dentry(CDentry *dn, bool projected, MDSContext *fin,
+			  bool want_xlocked=false);
+  void _open_remote_dentry_finish(CDentry *dn, inodeno_t ino, MDSContext *fin,
+				  bool want_xlocked, int r);
+
+  void make_trace(vector<CDentry*>& trace, CInode *in);
+
+  void kick_open_ino_peers(mds_rank_t who);
+  void open_ino(inodeno_t ino, int64_t pool, MDSContext *fin,
+		bool want_replica=true, bool want_xlocked=false,
+		vector<inode_backpointer_t> *ancestors_hint=nullptr,
+		mds_rank_t auth_hint=MDS_RANK_NONE);
+
+  void find_ino_peers(inodeno_t ino, MDSContext *c,
+		      mds_rank_t hint=MDS_RANK_NONE, bool path_locked=false);
+  void _do_find_ino_peer(find_ino_peer_info_t& fip);
+  void handle_find_ino(const cref_t<MMDSFindIno> &m);
+  void handle_find_ino_reply(const cref_t<MMDSFindInoReply> &m);
+  void kick_find_ino_peers(mds_rank_t who);
+
+  SnapRealm *get_global_snaprealm() const { return global_snaprealm; }  // may be nullptr: the member is initialized to nullptr
+  void create_global_snaprealm();
+  void do_realm_invalidate_and_update_notify(CInode *in, int snapop, bool notify_clients=true);
+  void send_snap_update(CInode *in, version_t stid, int snap_op);
+  void handle_snap_update(const cref_t<MMDSSnapUpdate> &m);
+  void notify_global_snaprealm_update(int snap_op);
+
+  // -- stray --
+  void fetch_backtrace(inodeno_t ino, int64_t pool, bufferlist& bl, Context *fin);
+  uint64_t get_num_strays() const { return stray_manager.get_num_strays(); }  // forwards to the StrayManager member
+
+  // == messages ==
+  void dispatch(const cref_t<Message> &m);
+
+  void encode_replica_dir(CDir *dir, mds_rank_t to, bufferlist& bl);
+  void encode_replica_dentry(CDentry *dn, mds_rank_t to, bufferlist& bl);
+  void encode_replica_inode(CInode *in, mds_rank_t to, bufferlist& bl,
+		       uint64_t features);
+  
+  void decode_replica_dir(CDir *&dir, bufferlist::const_iterator& p, CInode *diri, mds_rank_t from, MDSContext::vec& finished);
+  void decode_replica_dentry(CDentry *&dn, bufferlist::const_iterator& p, CDir *dir, MDSContext::vec& finished);
+  void decode_replica_inode(CInode *&in, bufferlist::const_iterator& p, CDentry *dn, MDSContext::vec& finished);
+
+  void encode_replica_stray(CDentry *straydn, mds_rank_t who, bufferlist& bl);
+  void decode_replica_stray(CDentry *&straydn, CInode **in, const bufferlist &bl, mds_rank_t from);
+
+  // -- namespace --
+  void encode_remote_dentry_link(CDentry::linkage_t *dnl, bufferlist& bl);
+  void decode_remote_dentry_link(CDir *dir, CDentry *dn, bufferlist::const_iterator& p);
+  void send_dentry_link(CDentry *dn, MDRequestRef& mdr);
+  void send_dentry_unlink(CDentry *dn, CDentry *straydn, MDRequestRef& mdr, bool unlinking=false);
+
+  void wait_for_uncommitted_fragment(dirfrag_t dirfrag, MDSContext *c) {
+    uncommitted_fragments.at(dirfrag).waiters.push_back(c);  // at(): dirfrag must already have an entry (assuming `map` is std::map, at() throws std::out_of_range otherwise)
+  }
+  bool is_any_uncommitted_fragment() const {  // true while uncommitted_fragments holds at least one entry
+    return !uncommitted_fragments.empty();
+  }
+  void wait_for_uncommitted_fragments(MDSContext* finisher);
+  void rollback_uncommitted_fragments();
+
+  void split_dir(CDir *dir, int byn);
+  void merge_dir(CInode *diri, frag_t fg);
+
+  void find_stale_fragment_freeze();
+  void fragment_freeze_inc_num_waiters(CDir *dir);
+  bool fragment_are_all_frozen(CDir *dir);
+  int get_num_fragmenting_dirs() const { return fragments.size(); }  // number of entries in `fragments`; read-only, hence const
+
+  // -- updates --
+  //int send_inode_updates(CInode *in);
+  //void handle_inode_update(MInodeUpdate *m);
+
+  int send_dir_updates(CDir *in, bool bcast=false);
+  void handle_dir_update(const cref_t<MDirUpdate> &m);
+
+  // -- cache expiration --
+  void handle_cache_expire(const cref_t<MCacheExpire> &m);
+  void process_delayed_expire(CDir *dir);
+  void discard_delayed_expire(CDir *dir);
+
+  // -- mdsmap --
+  void handle_mdsmap(const MDSMap &mdsmap, const MDSMap &oldmap);
+
+  int dump_cache() { return dump_cache({}, nullptr); }  // calls the (filename, Formatter*) overload with an empty filename and no formatter
+  int dump_cache(std::string_view filename);
+  int dump_cache(Formatter *f);
+  void dump_tree(CInode *in, const int cur_depth, const int max_depth, Formatter *f);
+
+  void cache_status(Formatter *f);
+
+  void dump_resolve_status(Formatter *f) const;
+  void dump_rejoin_status(Formatter *f) const;
+
+  // == crap fns ==
+  void show_cache();
+  void show_subtrees(int dbl=10, bool force_print=false);
+
+  CInode *hack_pick_random_inode() {  // rand()-based pick from inode_map; asserts the map is non-empty
+    ceph_assert(!inode_map.empty());
+    auto it = inode_map.begin();
+    for (int steps = rand() % inode_map.size(); steps > 0; --steps)
+      ++it;
+    return it->second;
+  }
+
+  void flush_dentry(std::string_view path, Context *fin);
+  /**
+   * Create and start an OP_ENQUEUE_SCRUB
+   */
+  void enqueue_scrub(std::string_view path, std::string_view tag,
+                     bool force, bool recursive, bool repair,
+		     Formatter *f, Context *fin);
+  void repair_inode_stats(CInode *diri);
+  void repair_dirfrag_stats(CDir *dir);
+  void rdlock_dirfrags_stats(CInode *diri, MDSInternalContext *fin);
+
+  // my leader
+  MDSRank *mds;
+
+  // -- my cache --
+  LRU lru;   // dentry lru for expiring items from cache
+  LRU bottom_lru; // dentries that should be trimmed ASAP
+
+  DecayRate decayrate;
+
+  int num_shadow_inodes = 0;
+
+  int num_inodes_with_caps = 0;
+
+  unsigned max_dir_commit_size;
+
+  file_layout_t default_file_layout;
+  file_layout_t default_log_layout;
+
+  // -- client leases --
+  static constexpr std::size_t client_lease_pools = 3;
+  std::array<float, client_lease_pools> client_lease_durations{5.0, 30.0, 300.0};
+
+  // -- client caps --
+  uint64_t last_cap_id = 0;
+
+  map<ceph_tid_t, discover_info_t> discovers;
+  ceph_tid_t discover_last_tid = 0;
+
+  // waiters
+  map<int, map<inodeno_t, MDSContext::vec > > waiting_for_base_ino;
+
+  map<inodeno_t,map<client_t, reconnected_cap_info_t> > reconnected_caps;   // inode -> client -> snap_follows,realmino
+  map<inodeno_t,map<client_t, snapid_t> > reconnected_snaprealms;  // realmino -> client -> realmseq
+
+  //  realm inodes
+  set<CInode*> rejoin_pending_snaprealms;
+  // cap imports.  delayed snap parent opens.
+  map<client_t,set<CInode*> > delayed_imported_caps;
+
+  // subsystems
+  std::unique_ptr<Migrator> migrator;
+
+  bool did_shutdown_log_cap = false;
+
+  map<ceph_tid_t, find_ino_peer_info_t> find_ino_peer;
+  ceph_tid_t find_ino_peer_last_tid = 0;
+
+  // delayed cache expire
+  map<CDir*, expiremap> delayed_expire; // subtree root -> expire msg
+
+  /* Because exports may fail, this set lets us keep track of inodes that need exporting. */
+  std::set<CInode *> export_pin_queue;
+  std::set<CInode *> export_pin_delayed_queue;
+  std::set<CInode *> export_ephemeral_pins;
+
+  OpenFileTable open_file_table;
+
+  double export_ephemeral_random_max = 0.0;
+
+ protected:
+  // track leader requests whose peers haven't acknowledged commit
+  struct uleader {  // mapped type of uncommitted_leaders (keyed by metareqid_t)
+    uleader() {}
+    set<mds_rank_t> peers;  // the "peer set" of the request
+    LogSegment *ls = nullptr;
+    MDSContext::vec waiters;
+    bool safe = false;  // all three flags start out false
+    bool committing = false;
+    bool recovering = false;
+  };
+
+  struct upeer {  // mapped type of uncommitted_peers (keyed by metareqid_t)
+    upeer() {}
+    mds_rank_t leader = MDS_RANK_NONE;  // explicit default: the empty ctor used to leave this indeterminate
+    LogSegment *ls = nullptr;
+    MDPeerUpdate *su = nullptr;
+    MDSContext::vec waiters;
+  };
+
+  struct open_ino_info_t {  // mapped type of opening_inodes (keyed by inodeno_t)
+    open_ino_info_t() {}
+    vector<inode_backpointer_t> ancestors;
+    set<mds_rank_t> checked;
+    mds_rank_t checking = MDS_RANK_NONE;
+    mds_rank_t auth_hint = MDS_RANK_NONE;
+    bool check_peers = true;
+    bool fetch_backtrace = true;
+    bool discover = false;
+    bool want_replica = false;  // NOTE(review): presumably mirrors open_ino()'s want_replica argument -- confirm
+    bool want_xlocked = false;  // NOTE(review): presumably mirrors open_ino()'s want_xlocked argument -- confirm
+    version_t tid = 0;
+    int64_t pool = -1;
+    int last_err = 0;
+    MDSContext::vec waiters;
+  };
+
+  friend struct C_MDC_OpenInoTraverseDir;
+  friend struct C_MDC_OpenInoParentOpened;
+  friend struct C_MDC_RetryScanStray;
+
+  friend class C_IO_MDC_OpenInoBacktraceFetched;
+  friend class C_MDC_Join;
+  friend class C_MDC_RespondInternalRequest;
+
+  friend class EPeerUpdate;
+  friend class ECommitted;
+
+  void set_readonly() { readonly = true; }  // one-way switch: only ever sets the flag to true
+
+  void handle_resolve(const cref_t<MMDSResolve> &m);
+  void handle_resolve_ack(const cref_t<MMDSResolveAck> &m);
+  void process_delayed_resolve();
+  void discard_delayed_resolve(mds_rank_t who);
+  void maybe_resolve_finish();
+  void disambiguate_my_imports();
+  void disambiguate_other_imports();
+  void trim_unlinked_inodes();
+
+  void send_peer_resolves();
+  void send_subtree_resolves();
+  void maybe_finish_peer_resolve();
+
+  void rejoin_walk(CDir *dir, const ref_t<MMDSCacheRejoin> &rejoin);
+  void handle_cache_rejoin(const cref_t<MMDSCacheRejoin> &m);
+  void handle_cache_rejoin_weak(const cref_t<MMDSCacheRejoin> &m);
+  CInode* rejoin_invent_inode(inodeno_t ino, snapid_t last);
+  CDir* rejoin_invent_dirfrag(dirfrag_t df);
+  void handle_cache_rejoin_strong(const cref_t<MMDSCacheRejoin> &m);
+  void rejoin_scour_survivor_replicas(mds_rank_t from, const cref_t<MMDSCacheRejoin> &ack,
+				      set<vinodeno_t>& acked_inodes,
+				      set<SimpleLock *>& gather_locks);
+  void handle_cache_rejoin_ack(const cref_t<MMDSCacheRejoin> &m);
+  void rejoin_send_acks();
+  void rejoin_trim_undef_inodes();
+  void maybe_send_pending_rejoins() {
+    if (!rejoins_pending) return;  // flag not set: nothing to send
+    rejoin_send_rejoins();
+  }
+
+  void touch_inode(CInode *in) {  // touch_dentry() on in's parent dentry, when it has one
+    if (in->get_parent_dn())  // NOTE(review): the guard checks get_parent_dn() but the projected parent dn is what gets touched -- confirm intended
+      touch_dentry(in->get_projected_parent_dn());
+  }
+
+  void inode_remove_replica(CInode *in, mds_rank_t rep, bool rejoin,
+			    set<SimpleLock *>& gather_locks);
+  void dentry_remove_replica(CDentry *dn, mds_rank_t rep, set<SimpleLock *>& gather_locks);
+
+  void rename_file(CDentry *srcdn, CDentry *destdn);
+
+  void _open_ino_backtrace_fetched(inodeno_t ino, bufferlist& bl, int err);
+  void _open_ino_parent_opened(inodeno_t ino, int ret);
+  void _open_ino_traverse_dir(inodeno_t ino, open_ino_info_t& info, int err);
+  void _open_ino_fetch_dir(inodeno_t ino, const cref_t<MMDSOpenIno> &m, CDir *dir, bool parent);
+  int open_ino_traverse_dir(inodeno_t ino, const cref_t<MMDSOpenIno> &m,
+			    const vector<inode_backpointer_t>& ancestors,
+			    bool discover, bool want_xlocked, mds_rank_t *hint);
+  void open_ino_finish(inodeno_t ino, open_ino_info_t& info, int err);
+  void do_open_ino(inodeno_t ino, open_ino_info_t& info, int err);
+  void do_open_ino_peer(inodeno_t ino, open_ino_info_t& info);
+  void handle_open_ino(const cref_t<MMDSOpenIno> &m, int err=0);
+  void handle_open_ino_reply(const cref_t<MMDSOpenInoReply> &m);
+
+  void scan_stray_dir(dirfrag_t next=dirfrag_t());
+  // -- replicas --
+  void handle_discover(const cref_t<MDiscover> &dis);
+  void handle_discover_reply(const cref_t<MDiscoverReply> &m);
+  void handle_dentry_link(const cref_t<MDentryLink> &m);
+  void handle_dentry_unlink(const cref_t<MDentryUnlink> &m);
+  void handle_dentry_unlink_ack(const cref_t<MDentryUnlinkAck> &m);
+
+  int dump_cache(std::string_view fn, Formatter *f);
+
+  void flush_dentry_work(MDRequestRef& mdr);
+  /**
+   * Resolve path to a dentry and pass it onto the ScrubStack.
+   *
+   * TODO: return enough information to the original mdr formatter
+   * and completion that they can subsequently check the progress of
+   * this scrub (we won't block them on a whole scrub as it can take a very
+   * long time)
+   */
+  void enqueue_scrub_work(MDRequestRef& mdr);
+  void repair_inode_stats_work(MDRequestRef& mdr);
+  void repair_dirfrag_stats_work(MDRequestRef& mdr);
+  void rdlock_dirfrags_stats_work(MDRequestRef& mdr);
+
+  ceph::unordered_map<inodeno_t,CInode*> inode_map;  // map of head inodes by ino
+  map<vinodeno_t, CInode*> snap_inode_map;  // map of snap inodes by vinodeno_t (ino + snapid)
+  CInode *root = nullptr; // root inode
+  CInode *myin = nullptr; // .ceph/mds%d dir
+
+  bool readonly = false;
+
+  int stray_index = 0;
+  int stray_fragmenting_index = -1;
+
+  set<CInode*> base_inodes;
+
+  std::unique_ptr<PerfCounters> logger;
+
+  Filer filer;
+  std::array<xlist<ClientLease*>, client_lease_pools> client_leases{};
+
+  /* subtree keys and each tree's non-recursive nested subtrees (the "bounds") */
+  map<CDir*,set<CDir*> > subtrees;
+  map<CInode*,list<pair<CDir*,CDir*> > > projected_subtree_renames;  // renamed ino -> target dir
+
+  // -- requests --
+  ceph::unordered_map<metareqid_t, MDRequestRef> active_requests;
+
+  // -- recovery --
+  set<mds_rank_t> recovery_set;
+
+  // [resolve]
+  // from EImportStart w/o EImportFinish during journal replay
+  map<dirfrag_t, vector<dirfrag_t> > my_ambiguous_imports;
+  // from MMDSResolves
+  map<mds_rank_t, map<dirfrag_t, vector<dirfrag_t> > > other_ambiguous_imports;
+
+  map<CInode*, int> uncommitted_peer_rename_olddir;  // peer: preserve the non-auth dir until seeing commit.
+  map<CInode*, int> uncommitted_peer_unlink;  // peer: preserve the unlinked inode until seeing commit.
+
+  map<metareqid_t, uleader> uncommitted_leaders;         // leader: req -> peer set
+  map<metareqid_t, upeer> uncommitted_peers;  // peer: preserve the peer req until seeing commit.
+
+  set<metareqid_t> pending_leaders;
+  map<int, set<metareqid_t> > ambiguous_peer_updates;
+
+  bool resolves_pending = false;
+  set<mds_rank_t> resolve_gather;	// nodes i need resolves from
+  set<mds_rank_t> resolve_ack_gather;	// nodes i need a resolve_ack from
+  set<version_t> resolve_snapclient_commits;
+  map<metareqid_t, mds_rank_t> resolve_need_rollback;  // rollbacks i'm writing to the journal
+  map<mds_rank_t, cref_t<MMDSResolve>> delayed_resolve;
+
+  // [rejoin]
+  bool rejoins_pending = false;
+  set<mds_rank_t> rejoin_gather;      // nodes from whom i need a rejoin
+  set<mds_rank_t> rejoin_sent;        // nodes i sent a rejoin to
+  set<mds_rank_t> rejoin_ack_sent;    // nodes i sent a rejoin ack to
+  set<mds_rank_t> rejoin_ack_gather;  // nodes from whom i need a rejoin ack
+  map<mds_rank_t,map<inodeno_t,map<client_t,Capability::Import> > > rejoin_imported_caps;
+  map<inodeno_t,pair<mds_rank_t,map<client_t,Capability::Export> > > rejoin_peer_exports;
+
+  map<client_t,entity_inst_t> rejoin_client_map;
+  map<client_t,client_metadata_t> rejoin_client_metadata_map;
+  map<client_t,pair<Session*,uint64_t> > rejoin_session_map;
+
+  map<inodeno_t,pair<mds_rank_t,map<client_t,cap_reconnect_t> > > cap_exports; // ino -> target, client -> capex
+
+  map<inodeno_t,map<client_t,map<mds_rank_t,cap_reconnect_t> > > cap_imports;  // ino -> client -> frommds -> capex
+  set<inodeno_t> cap_imports_missing;
+  map<inodeno_t, MDSContext::vec > cap_reconnect_waiters;
+  int cap_imports_num_opening = 0;
+
+  set<CInode*> rejoin_undef_inodes;
+  set<CInode*> rejoin_potential_updated_scatterlocks;
+  set<CDir*>   rejoin_undef_dirfrags;
+  map<mds_rank_t, set<CInode*> > rejoin_unlinked_inodes;
+
+  vector<CInode*> rejoin_recover_q, rejoin_check_q;
+  list<SimpleLock*> rejoin_eval_locks;
+  MDSContext::vec rejoin_waiters;
+
+  std::unique_ptr<MDSContext> rejoin_done;
+  std::unique_ptr<MDSContext> resolve_done;
+
+  ceph_tid_t open_ino_last_tid = 0;
+  map<inodeno_t,open_ino_info_t> opening_inodes;
+
+  StrayManager stray_manager;
+
+ private:
+  // -- fragmenting --
+  struct ufragment {  // mapped type of uncommitted_fragments (keyed by dirfrag_t)
+    ufragment() {}
+    int bits = 0;
+    bool committed = false;
+    LogSegment *ls = nullptr;
+    MDSContext::vec waiters;  // wait_for_uncommitted_fragment() appends here
+    frag_vec_t old_frags;
+    bufferlist rollback;
+  };
+
+  struct fragment_info_t {  // mapped type of `fragments` (keyed by dirfrag_t)
+    fragment_info_t() {}
+    bool is_fragmenting() const { return !resultfrags.empty(); }  // true once resultfrags is non-empty
+    uint64_t get_tid() const { return mdr ? mdr->reqid.tid : 0; }  // 0 when no request is attached
+    int bits = 0;  // zero-initialized like ufragment::bits; the empty ctor used to leave it indeterminate
+    std::vector<CDir*> dirs;
+    std::vector<CDir*> resultfrags;
+    MDRequestRef mdr;
+    set<mds_rank_t> notify_ack_waiting;
+    bool finishing = false;
+
+    // for deadlock detection
+    bool all_frozen = false;
+    utime_t last_cum_auth_pins_change;
+    int last_cum_auth_pins = 0;
+    int num_remote_waiters = 0;	// number of remote authpin waiters
+  };
+
+  typedef map<dirfrag_t,fragment_info_t>::iterator fragment_info_iterator;
+
+  friend class EFragment;
+  friend class C_MDC_FragmentFrozen;
+  friend class C_MDC_FragmentMarking;
+  friend class C_MDC_FragmentPrep;
+  friend class C_MDC_FragmentStore;
+  friend class C_MDC_FragmentCommit;
+  friend class C_MDC_FragmentRollback;
+  friend class C_IO_MDC_FragmentPurgeOld;
+
+  // -- subtrees --
+  static const unsigned int SUBTREES_COUNT_THRESHOLD = 5;
+  static const unsigned int SUBTREES_DEPTH_THRESHOLD = 5;
+
+  CInode *get_stray() {  // the entry of `strays` selected by stray_index
+    return strays[stray_index];
+  }
+
+  void identify_files_to_recover();
+
+  std::pair<bool, uint64_t> trim_lru(uint64_t count, expiremap& expiremap);
+  bool trim_dentry(CDentry *dn, expiremap& expiremap);
+  void trim_dirfrag(CDir *dir, CDir *con, expiremap& expiremap);
+  bool trim_inode(CDentry *dn, CInode *in, CDir *con, expiremap&);
+  void send_expire_messages(expiremap& expiremap);
+  void trim_non_auth();      // trim out trimmable non-auth items
+
+  void adjust_dir_fragments(CInode *diri, frag_t basefrag, int bits,
+			    std::vector<CDir*>* frags, MDSContext::vec& waiters, bool replay);
+  void adjust_dir_fragments(CInode *diri,
+			    const std::vector<CDir*>& srcfrags,
+			    frag_t basefrag, int bits,
+			    std::vector<CDir*>* resultfrags,
+			    MDSContext::vec& waiters,
+			    bool replay);
+  CDir *force_dir_fragment(CInode *diri, frag_t fg, bool replay=true);
+  void get_force_dirfrag_bound_set(const vector<dirfrag_t>& dfs, set<CDir*>& bounds);
+
+  bool can_fragment(CInode *diri, const std::vector<CDir*>& dirs);
+  void fragment_freeze_dirs(const std::vector<CDir*>& dirs);
+  void fragment_mark_and_complete(MDRequestRef& mdr);
+  void fragment_frozen(MDRequestRef& mdr, int r);
+  void fragment_unmark_unfreeze_dirs(const std::vector<CDir*>& dirs);
+  void fragment_drop_locks(fragment_info_t &info);
+  void fragment_maybe_finish(const fragment_info_iterator& it);
+  void dispatch_fragment_dir(MDRequestRef& mdr);
+  void _fragment_logged(MDRequestRef& mdr);
+  void _fragment_stored(MDRequestRef& mdr);
+  void _fragment_committed(dirfrag_t f, const MDRequestRef& mdr);
+  void _fragment_old_purged(dirfrag_t f, int bits, const MDRequestRef& mdr);
+
+  void handle_fragment_notify(const cref_t<MMDSFragmentNotify> &m);
+  void handle_fragment_notify_ack(const cref_t<MMDSFragmentNotifyAck> &m);
+
+  void add_uncommitted_fragment(dirfrag_t basedirfrag, int bits, const frag_vec_t& old_frag,
+				LogSegment *ls, bufferlist *rollback=NULL);
+  void finish_uncommitted_fragment(dirfrag_t basedirfrag, int op);
+  void rollback_uncommitted_fragment(dirfrag_t basedirfrag, frag_vec_t&& old_frags);
+
+  void upkeep_main(void);
+
+  uint64_t cache_memory_limit;
+  double cache_reservation;
+  double cache_health_threshold;
+  std::array<CInode *, NUM_STRAY> strays{}; // my stray dir
+
+  bool export_ephemeral_distributed_config;
+  bool export_ephemeral_random_config;
+  unsigned export_ephemeral_dist_frag_bits;
+
+  // File size recovery
+  RecoveryQueue recovery_queue;
+
+  // shutdown
+  set<inodeno_t> shutdown_exporting_strays;
+  pair<dirfrag_t, string> shutdown_export_next;
+
+  bool opening_root = false, open = false;
+  MDSContext::vec waiting_for_open;
+
+  // -- snaprealms --
+  SnapRealm *global_snaprealm = nullptr;
+
+  map<dirfrag_t, ufragment> uncommitted_fragments;
+
+  map<dirfrag_t,fragment_info_t> fragments;
+
+  DecayCounter trim_counter;
+
+  std::thread upkeeper;
+  ceph::mutex upkeep_mutex = ceph::make_mutex("MDCache::upkeep_mutex");
+  ceph::condition_variable upkeep_cvar;
+  time upkeep_last_trim = time::min();
+  time upkeep_last_release = time::min();
+  std::atomic<bool> upkeep_trim_shutdown{false};
+};
+
+class C_MDS_RetryRequest : public MDSInternalContext {
+ public:
+  C_MDS_RetryRequest(MDCache *c, MDRequestRef& r) : MDSInternalContext(c->mds), cache(c), mdr(r) {}
+  void finish(int r) override;  // defined out of line
+ private:
+  MDCache *cache;    // cache passed to the constructor
+  MDRequestRef mdr;  // copy of the request reference taken at construction
+};
+
+// Context factory that stores a cache pointer, a request reference and a
+// drop_locks flag; build() is defined out of line.
+class CF_MDS_RetryRequestFactory : public MDSContextFactory {
+  MDCache *mdcache;
+  MDRequestRef mdr;
+  bool drop_locks;
+public:
+  CF_MDS_RetryRequestFactory(MDCache *cache, MDRequestRef &mdr, bool dl) : mdcache(cache), mdr(mdr), drop_locks(dl) {}
+  MDSContext *build() override;
+};
+
+#endif
diff --git a/src/mds/MDLog.cc b/src/mds/MDLog.cc
new file mode 100644
index 000000000..79143021e
--- /dev/null
+++ b/src/mds/MDLog.cc
@@ -0,0 +1,1523 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software 
+ * Foundation.  See file COPYING.
+ * 
+ */
+
+#include "MDSRank.h"
+#include "MDLog.h"
+#include "MDCache.h"
+#include "LogEvent.h"
+#include "MDSContext.h"
+
+#include "osdc/Journaler.h"
+#include "mds/JournalPointer.h"
+
+#include "common/entity_name.h"
+#include "common/perf_counters.h"
+#include "common/Cond.h"
+
+#include "events/ESubtreeMap.h"
+
+#include "common/config.h"
+#include "common/errno.h"
+#include "include/ceph_assert.h"
+
+#define dout_context g_ceph_context
+#define dout_subsys ceph_subsys_mds
+#undef dout_prefix
+#define dout_prefix *_dout << "mds." << mds->get_nodeid() << ".log "
+
+// cons/des
+MDLog::~MDLog()
+{
+  delete journaler; journaler = nullptr;  // delete on a null pointer is a no-op, so no guard is needed
+  if (logger != nullptr) {
+    g_ceph_context->get_perfcounters_collection()->remove(logger);  // unregister before freeing
+    delete logger;
+    logger = nullptr;
+  }
+}
+
+
+void MDLog::create_logger()
+{
+  PerfCountersBuilder plb(g_ceph_context, "mds_log", l_mdl_first, l_mdl_last);
+  // Builds the "mds_log" counter set and adds it to the context's collection.
+  plb.add_u64_counter(l_mdl_evadd, "evadd", "Events submitted", "subm",
+                      PerfCountersBuilder::PRIO_INTERESTING);
+  plb.add_u64(l_mdl_ev, "ev", "Events", "evts",
+              PerfCountersBuilder::PRIO_INTERESTING);
+  plb.add_u64(l_mdl_seg, "seg", "Segments", "segs",
+              PerfCountersBuilder::PRIO_INTERESTING);
+
+  plb.set_prio_default(PerfCountersBuilder::PRIO_USEFUL);
+  plb.add_u64(l_mdl_evexg, "evexg", "Expiring events");
+  plb.add_u64(l_mdl_evexd, "evexd", "Current expired events");
+  plb.add_u64(l_mdl_segexg, "segexg", "Expiring segments");
+  plb.add_u64(l_mdl_segexd, "segexd", "Current expired segments");
+  plb.add_u64_counter(l_mdl_replayed, "replayed", "Events replayed",
+                      "repl", PerfCountersBuilder::PRIO_INTERESTING);
+  plb.add_time_avg(l_mdl_jlat, "jlat", "Journaler flush latency");
+  plb.add_u64_counter(l_mdl_evex, "evex", "Total expired events");
+  plb.add_u64_counter(l_mdl_evtrm, "evtrm", "Trimmed events");
+  plb.add_u64_counter(l_mdl_segadd, "segadd", "Segments added");
+  plb.add_u64_counter(l_mdl_segex, "segex", "Total expired segments");
+  plb.add_u64_counter(l_mdl_segtrm, "segtrm", "Trimmed segments");
+
+  plb.set_prio_default(PerfCountersBuilder::PRIO_DEBUGONLY);
+  plb.add_u64(l_mdl_expos, "expos", "Journaler expire position");
+  plb.add_u64(l_mdl_wrpos, "wrpos", "Journaler write position");
+  plb.add_u64(l_mdl_rdpos, "rdpos", "Journaler read position");
+
+  // create the counters and register them; ~MDLog() removes and frees them
+  logger = plb.create_perf_counters();
+  g_ceph_context->get_perfcounters_collection()->add(logger);
+}
+
+void MDLog::set_write_iohint(unsigned iohint_flags)  // forwards the flags to the journaler
+{
+  journaler->set_write_iohint(iohint_flags);  // dereferences journaler unconditionally, so it must already exist
+}
+
+class C_MDL_WriteError : public MDSIOContextBase {
+  protected:
+  MDLog *mdlog;
+  MDSRank *get_mds() override {return mdlog->mds;}
+
+  void finish(int r) override {
+    MDSRank *mds = get_mds();
+    // assume journal is reliable, so don't choose action based on
+    // g_conf()->mds_action_on_write_error.
+    if (r == -CEPHFS_EBLOCKLISTED) {
+      derr << "we have been blocklisted (fenced), respawning..." << dendl;
+      mds->respawn();
+    } else {
+      derr << "unhandled error " << cpp_strerror(r) << ", shutting down..." << dendl;
+      // Although it's possible that this could be something transient,
+      // it's severe and scary, so disable this rank until an administrator
+      // intervenes.
+      mds->clog->error() << "Unhandled journal write error on MDS rank " <<
+        mds->get_nodeid() << ": " << cpp_strerror(r) << ", shutting down.";
+      mds->damaged();
+      ceph_abort();  // damaged should never return
+    }
+  }
+
+  public:
+  explicit C_MDL_WriteError(MDLog *m) :
+    MDSIOContextBase(false), mdlog(m) {}
+  void print(ostream& out) const override {
+    out << "mdlog_write_error";
+  }
+};
+
+
+void MDLog::write_head(MDSContext *c)
+{
+  // A non-null c is wrapped in a C_IO_Wrapper; otherwise the journaler is
+  // handed a null completion.
+  Context *fin = nullptr;
+  if (c) fin = new C_IO_Wrapper(mds, c);
+  journaler->write_head(fin);
+}
+
+uint64_t MDLog::get_read_pos() const  // forwards to the journaler's get_read_pos()
+{
+  return journaler->get_read_pos(); 
+}
+
+uint64_t MDLog::get_write_pos() const  // forwards to the journaler's get_write_pos()
+{
+  return journaler->get_write_pos(); 
+}
+
+uint64_t MDLog::get_safe_pos() const  // "safe" here is the journaler's write-safe position
+{
+  return journaler->get_write_safe_pos(); 
+}
+
+
+
+void MDLog::create(MDSContext *c)
+{
+  dout(5) << "create empty log" << dendl;
+  // c becomes the gather's finisher; the gather has two subs (journal head write, JournalPointer save)
+  C_GatherBuilder gather(g_ceph_context);
+  // This requires an OnFinisher wrapper because Journaler will call back the completion for write_head while holding its own lock.
+  // XXX: should that maybe be handled inside Journaler instead?
+  gather.set_finisher(new C_IO_Wrapper(mds, c));
+
+  // The inode of the default Journaler we will create
+  ino = MDS_INO_LOG_OFFSET + mds->get_nodeid();
+
+  // Instantiate Journaler and start async write to RADOS
+  ceph_assert(journaler == NULL);
+  journaler = new Journaler("mdlog", ino, mds->get_metadata_pool(),
+                            CEPH_FS_ONDISK_MAGIC, mds->objecter, logger,
+                            l_mdl_jlat, mds->finisher);
+  ceph_assert(journaler->is_readonly());
+  journaler->set_write_error_handler(new C_MDL_WriteError(this));
+  journaler->set_writeable();
+  journaler->create(&mds->mdcache->default_log_layout, g_conf()->mds_journal_format);
+  journaler->write_head(gather.new_sub());
+
+  // Async write JournalPointer to RADOS
+  JournalPointer jp(mds->get_nodeid(), mds->get_metadata_pool());
+  jp.front = ino;
+  jp.back = 0;
+  jp.save(mds->objecter, gather.new_sub());
+
+  gather.activate();
+  // publish the fresh journal's expire/write positions to the perf counters
+  logger->set(l_mdl_expos, journaler->get_expire_pos());
+  logger->set(l_mdl_wrpos, journaler->get_write_pos());
+
+  submit_thread.create("md_submit");
+}
+
+void MDLog::open(MDSContext *c)
+{
+  dout(5) << "open discovering log bounds" << dendl;
+
+  ceph_assert(!recovery_thread.is_started());  // the recovery thread must not already be running
+  recovery_thread.set_completion(c);  // c is handed to the recovery thread as its completion
+  recovery_thread.create("md_recov_open");
+
+  submit_thread.create("md_submit");
+  // either append() or replay() will follow.
+}
+
+/**
+ * Last step of MDLog::reopen(): once recovery_thread has finished, call
+ * append() on the log and then complete the caller's context with r.
+ */
+class C_ReopenComplete : public MDSInternalContext {
+  MDLog *mdlog;             // log that finish() calls append() on
+  MDSContext *on_complete;  // completed by finish() with its r
+public:
+  C_ReopenComplete(MDLog *mdlog_, MDSContext *on_complete_)
+    : MDSInternalContext(mdlog_->mds), mdlog(mdlog_), on_complete(on_complete_) {}
+  void finish(int r) override {
+    mdlog->append();
+    on_complete->complete(r);
+  }
+};
+
+/**
+ * Given that open() has been called in the past, go through the journal
+ * recovery procedure again, potentially reformatting the journal if it
+ * was in an old format.
+ */
+void MDLog::reopen(MDSContext *c)
+{
+  dout(5) << "reopen" << dendl;
+
+  // Because we will call append() at the completion of this, check that we have already
+  // read the whole journal.
+  ceph_assert(journaler != NULL);
+  ceph_assert(journaler->get_read_pos() == journaler->get_write_pos());
+
+  delete journaler;  // _recovery_thread() asserts journaler == NULL and builds a new one
+  journaler = NULL;
+
+  // recovery_thread was started at some point in the past.  Although
+  // it has called its completion if we made it back here, it might
+  // still not have been cleaned up: join it.
+  recovery_thread.join();
+
+  recovery_thread.set_completion(new C_ReopenComplete(this, c));  // runs append(), then c
+  recovery_thread.create("md_recov_reopen");
+}
+
+void MDLog::append()  // position the journaler at its write position and make it writeable
+{
+  dout(5) << "append positioning at end and marking writeable" << dendl;
+  journaler->set_read_pos(journaler->get_write_pos());  // read position := write position
+  journaler->set_expire_pos(journaler->get_write_pos());  // expire position := write position
+  
+  journaler->set_writeable();
+
+  logger->set(l_mdl_expos, journaler->get_write_pos());  // mirror the new expire position in the counter
+}
+
+
+
+// -------------------------------------------------
+
+void MDLog::_start_entry(LogEvent *e)
+{
+  // Caller holds submit_mutex, and only one event may be open at a time.
+  ceph_assert(ceph_mutex_is_locked_by_me(submit_mutex));
+  ceph_assert(cur_event == NULL);
+
+  cur_event = e;
+  ++event_seq;
+
+  // Stamp the metablob (if any) with the new event seq and get_last_segment_seq().
+  if (EMetaBlob *blob = e->get_metablob()) {
+    blob->event_seq = event_seq;
+    blob->last_subtree_map = get_last_segment_seq();
+  }
+}
+
+void MDLog::cancel_entry(LogEvent *le)
+{
+  ceph_assert(cur_event == le);  // only the currently open event can be cancelled
+  cur_event = nullptr;
+  delete le;                     // the cancelled event is freed here
+}
+
+void MDLog::_submit_entry(LogEvent *le, MDSLogContextBase *c)  // queue le (c may be NULL) in pending_events
+{
+  ceph_assert(ceph_mutex_is_locked_by_me(submit_mutex));
+  ceph_assert(!mds->is_any_replay());
+  ceph_assert(!mds_is_shutting_down);
+
+  ceph_assert(le == cur_event);  // le must be the event opened by _start_entry()
+  cur_event = NULL;
+
+  // let the event register itself in the segment
+  ceph_assert(!segments.empty());
+  LogSegment *ls = segments.rbegin()->second;  // segment with the highest key in the map
+  ls->num_events++;
+
+  le->_segment = ls;
+  le->update_segment();
+  le->set_stamp(ceph_clock_now());
+
+  mdsmap_up_features = mds->mdsmap->get_up_features();  // read by _submit_thread() when encoding
+  pending_events[ls->seq].push_back(PendingEvent(le, c));
+  num_events++;
+
+  if (logger) {
+    logger->inc(l_mdl_evadd);
+    logger->set(l_mdl_ev, num_events);
+  }
+
+  unflushed++;
+  
+  uint64_t period = journaler->get_layout_period();
+  // start a new segment?
+  if (le->get_type() == EVENT_SUBTREEMAP ||
+      (le->get_type() == EVENT_IMPORTFINISH && mds->is_resolve())) {
+    // avoid infinite loop when ESubtreeMap is very large.
+    // do not insert ESubtreeMap among EImportFinish events that finish
+    // disambiguate imports. Because the ESubtreeMap reflects the subtree
+    // state when all EImportFinish events are replayed.
+  } else if (ls->end/period != ls->offset/period ||
+	     ls->num_events >= g_conf()->mds_log_events_per_segment) {
+    dout(10) << "submit_entry also starting new segment: last = "
+	     << ls->seq  << "/" << ls->offset << ", event seq = " << event_seq << dendl;
+    _start_new_segment();  // segment spans a layout-period boundary or holds enough events
+  } else if (g_conf()->mds_debug_subtrees &&
+	     le->get_type() != EVENT_SUBTREEMAP_TEST) {
+    // debug: journal this every time to catch subtree replay bugs.
+    // use a different event id so it doesn't get interpreted as a
+    // LogSegment boundary on replay.
+    LogEvent *sle = mds->mdcache->create_subtree_map();
+    sle->set_type(EVENT_SUBTREEMAP_TEST);
+    _submit_entry(sle, NULL);  // recursion ends: the type check above skips EVENT_SUBTREEMAP_TEST
+  }
+}
+
+/**
+ * Invoked on the flush after each entry submitted
+ */
+class C_MDL_Flushed : public MDSLogContextBase {
+protected:
+  MDLog *mdlog;
+  MDSRank *get_mds() override {return mdlog->mds;}
+  MDSContext *wrapped;  // optional context that receives the result; may be NULL
+
+  void finish(int r) override {
+    if (wrapped)
+      wrapped->complete(r);  // forward the result unchanged
+  }
+
+public:
+  C_MDL_Flushed(MDLog *m, MDSContext *w)  // wraps w; the write position is not set here
+    : mdlog(m), wrapped(w) {}
+  C_MDL_Flushed(MDLog *m, uint64_t wp) : mdlog(m), wrapped(NULL) {  // nothing wrapped, only records wp
+    set_write_pos(wp);
+  }
+};
+
+void MDLog::_submit_thread()  // loop that moves queued PendingEvents into the journaler
+{
+  dout(10) << "_submit_thread start" << dendl;
+
+  std::unique_lock locker{submit_mutex};
+
+  while (!mds->is_daemon_stopping()) {
+    if (g_conf()->mds_log_pause) {  // paused by config: sleep until notified
+      submit_cond.wait(locker);
+      continue;
+    }
+
+    map<uint64_t,list<PendingEvent> >::iterator it = pending_events.begin();
+    if (it == pending_events.end()) {  // nothing queued: sleep until notified
+      submit_cond.wait(locker);
+      continue;
+    }
+
+    if (it->second.empty()) {  // this key's queue is drained: drop it and look again
+      pending_events.erase(it);
+      continue;
+    }
+
+    int64_t features = mdsmap_up_features;  // copied while submit_mutex is held
+    PendingEvent data = it->second.front();
+    it->second.pop_front();
+
+    locker.unlock();  // the journaler calls below run without submit_mutex
+
+    if (data.le) {
+      LogEvent *le = data.le;
+      LogSegment *ls = le->_segment;
+      // encode it, with event type
+      bufferlist bl;
+      le->encode_with_header(bl, features);
+
+      uint64_t write_pos = journaler->get_write_pos();
+
+      le->set_start_off(write_pos);
+      if (le->get_type() == EVENT_SUBTREEMAP)
+	ls->offset = write_pos;  // a subtree map event defines its segment's offset
+
+      dout(5) << "_submit_thread " << write_pos << "~" << bl.length()
+	      << " : " << *le << dendl;
+
+      // journal it.
+      const uint64_t new_write_pos = journaler->append_entry(bl);  // bl is destroyed.
+      ls->end = new_write_pos;
+
+      MDSLogContextBase *fin;
+      if (data.fin) {
+	fin = dynamic_cast<MDSLogContextBase*>(data.fin);
+	ceph_assert(fin);  // completions queued with an event must be MDSLogContextBase
+	fin->set_write_pos(new_write_pos);
+      } else {
+	fin = new C_MDL_Flushed(this, new_write_pos);  // no caller completion: only record the position
+      }
+
+      journaler->wait_for_flush(fin);
+
+      if (data.flush)
+	journaler->flush();
+
+      if (logger)
+	logger->set(l_mdl_wrpos, ls->end);
+
+      delete le;  // the event is freed once it has been appended
+    } else {
+      if (data.fin) {  // event-less entry with a completion, as queued by wait_for_safe()
+	MDSContext* fin =
+		dynamic_cast<MDSContext*>(data.fin);
+	ceph_assert(fin);
+	C_MDL_Flushed *fin2 = new C_MDL_Flushed(this, fin);
+	fin2->set_write_pos(journaler->get_write_pos());
+	journaler->wait_for_flush(fin2);
+      }
+      if (data.flush)  // event-less entry requesting a flush, as queued by flush()
+	journaler->flush();
+    }
+
+    locker.lock();  // re-take submit_mutex before touching unflushed and looping
+    if (data.flush)
+      unflushed = 0;
+    else if (data.le)
+      unflushed++;
+  }
+}
+
+void MDLog::wait_for_safe(MDSContext *c)
+{
+  // Queue c behind any pending events; otherwise wait on the journaler directly.
+  bool no_pending = true;
+  {
+    std::lock_guard l(submit_mutex);  // scoped: released even if push_back() throws
+    if (!pending_events.empty()) {
+      pending_events.rbegin()->second.push_back(PendingEvent(NULL, c));
+      no_pending = false;
+      submit_cond.notify_all();
+    }
+  }
+  // Nothing was queued: register with the journaler, outside submit_mutex.
+  if (no_pending && c)
+    journaler->wait_for_flush(new C_IO_Wrapper(mds, c));
+}
+
+void MDLog::flush()
+{
+  bool do_flush = false;  // true when this caller must flush the journaler itself
+  {
+    std::lock_guard l(submit_mutex);  // scoped: released even if push_back() throws
+    do_flush = unflushed > 0;
+    unflushed = 0;
+    if (!pending_events.empty()) {
+      // Events are still queued: append a flush marker for the submit thread instead.
+      pending_events.rbegin()->second.push_back(PendingEvent(NULL, NULL, true));
+      do_flush = false;
+      submit_cond.notify_all();
+    }
+  }
+  if (do_flush)
+    journaler->flush();  // called outside submit_mutex, as before
+}
+
+void MDLog::kick_submitter()
+{
+  std::lock_guard guard{submit_mutex};  // notify while holding the submit mutex
+  submit_cond.notify_all();
+}
+
+void MDLog::cap()  // record that the MDS is shutting down
+{
+  dout(5) << "mark mds is shutting down" << dendl;
+  mds_is_shutting_down = true;  // _submit_entry() asserts this flag is false
+}
+
+void MDLog::shutdown()  // stop the journaler and join the log's threads; called with mds_lock held
+{
+  ceph_assert(ceph_mutex_is_locked_by_me(mds->mds_lock));
+
+  dout(5) << "shutdown" << dendl;
+  if (submit_thread.is_started()) {
+    ceph_assert(mds->is_daemon_stopping());  // the submit loop exits only once this is true
+
+    if (submit_thread.am_self()) {
+      // Called suicide from the thread: trust it to do no work after
+      // returning from suicide, and subsequently respect mds->is_daemon_stopping()
+      // and fall out of its loop.
+    } else {
+      mds->mds_lock.unlock();
+      // Because MDS::stopping is true, it's safe to drop mds_lock: nobody else
+      // picking it up will do anything with it.
+
+      submit_mutex.lock();
+      submit_cond.notify_all();  // wake the submit thread so it re-checks is_daemon_stopping()
+      submit_mutex.unlock();
+
+      mds->mds_lock.lock();
+
+      submit_thread.join();  // NOTE(review): joined with mds_lock held, unlike the joins below -- confirm
+    }
+  }
+
+  // Replay thread can be stuck inside e.g. Journaler::wait_for_readable,
+  // so we need to shutdown the journaler first.
+  if (journaler) {
+    journaler->shutdown();
+  }
+
+  if (replay_thread.is_started() && !replay_thread.am_self()) {
+    mds->mds_lock.unlock();  // mds_lock is dropped for the duration of the join
+    replay_thread.join();
+    mds->mds_lock.lock();
+  }
+
+  if (recovery_thread.is_started() && !recovery_thread.am_self()) {
+    mds->mds_lock.unlock();  // _recovery_thread() takes mds_lock, so it must be free here
+    recovery_thread.join();
+    mds->mds_lock.lock();
+  }
+}
+
+
+// -----------------------------
+// segments
+
+void MDLog::_start_new_segment()  // both callees assert that submit_mutex is held
+{
+  _prepare_new_segment();  // adds the new LogSegment to segments
+  _journal_segment_subtree_map(NULL);  // journals a subtree map with no completion
+}
+
+void MDLog::_prepare_new_segment()
+{
+  ceph_assert(ceph_mutex_is_locked_by_me(submit_mutex));
+
+  // The new segment is keyed by the sequence number after the current event_seq.
+  const uint64_t next_seq = event_seq + 1;
+  dout(7) << __func__ << " seq " << next_seq << dendl;
+  segments[next_seq] = new LogSegment(next_seq);
+
+  // Reflect the new segment in the perf counters.
+  logger->inc(l_mdl_segadd);
+  logger->set(l_mdl_seg, segments.size());
+
+  mds->mdcache->advance_stray();  // adjust to next stray dir
+}
+
+void MDLog::_journal_segment_subtree_map(MDSContext *onsync)
+{
+  ceph_assert(ceph_mutex_is_locked_by_me(submit_mutex));
+  dout(7) << __func__ << dendl;
+
+  // Submit a subtree map tagged with get_last_segment_seq(); onsync is wrapped in C_MDL_Flushed.
+  ESubtreeMap *subtree_map = mds->mdcache->create_subtree_map();
+  subtree_map->event_seq = get_last_segment_seq();
+  _submit_entry(subtree_map, new C_MDL_Flushed(this, onsync));
+}
+
+// Completion passed to open_file_table.commit(): trims expired segments when fired.
+class C_OFT_Committed : public MDSInternalContext {
+  MDLog *log;
+  uint64_t committed_seq;  // stored at construction; finish() does not read it
+public:
+  C_OFT_Committed(MDLog *l, uint64_t s)
+    : MDSInternalContext(l->mds), log(l), committed_seq(s) {}
+  // The result code is ignored.
+  void finish(int) override { log->trim_expired_segments(); }
+};
+
+void MDLog::try_to_commit_open_file_table(uint64_t last_seq)  // entered and left with submit_mutex held
+{
+  ceph_assert(ceph_mutex_is_locked_by_me(submit_mutex));
+
+  if (mds_is_shutting_down) // shutting down the MDS
+    return;
+
+  if (mds->mdcache->open_file_table.is_any_committing())  // a commit is already in progress
+    return;
+
+  // dirty items may exist even when no new log event has been added
+  if (mds->mdcache->open_file_table.is_any_dirty() ||
+      last_seq > mds->mdcache->open_file_table.get_committed_log_seq()) {
+    submit_mutex.unlock();  // submit_mutex is dropped around commit() and re-taken below
+    mds->mdcache->open_file_table.commit(new C_OFT_Committed(this, last_seq),
+                                         last_seq, CEPH_MSG_PRIO_HIGH);
+    submit_mutex.lock();
+  }
+}
+
+void MDLog::trim(int m)  // m >= 0 overrides mds_log_max_events for this call
+{
+  unsigned max_segments = g_conf()->mds_log_max_segments;
+  int max_events = g_conf()->mds_log_max_events;
+  if (m >= 0)
+    max_events = m;
+
+  if (mds->mdcache->is_readonly()) {
+    dout(10) << "trim, ignoring read-only FS" <<  dendl;
+    return;
+  }
+
+  // Clamp max_events to not be smaller than events per segment
+  if (max_events > 0 && max_events <= g_conf()->mds_log_events_per_segment) {
+    max_events = g_conf()->mds_log_events_per_segment + 1;
+  }
+
+  submit_mutex.lock();  // released by _trim_expired_segments() or the early return below
+
+  // trim!
+  dout(10) << "trim " 
+	   << segments.size() << " / " << max_segments << " segments, " 
+	   << num_events << " / " << max_events << " events"
+	   << ", " << expiring_segments.size() << " (" << expiring_events << ") expiring"
+	   << ", " << expired_segments.size() << " (" << expired_events << ") expired"
+	   << dendl;
+
+  if (segments.empty()) {
+    submit_mutex.unlock();
+    return;
+  }
+
+  // hack: only trim for a few seconds at a time
+  utime_t stop = ceph_clock_now();
+  stop += 2.0;
+
+  // Priority scales from LOW towards HIGH with the number of expiring segments.
+  int op_prio = CEPH_MSG_PRIO_LOW +
+		(CEPH_MSG_PRIO_HIGH - CEPH_MSG_PRIO_LOW) *
+		expiring_segments.size() / max_segments;
+  if (op_prio > CEPH_MSG_PRIO_HIGH)
+    op_prio = CEPH_MSG_PRIO_HIGH;
+
+  unsigned new_expiring_segments = 0;
+  // Cap on concurrently expiring segments; 0 means the cap is not applied below.
+  unsigned max_expiring_segments = 0;
+  if (pre_segments_size > 0){
+    max_expiring_segments = max_segments/2;
+    ceph_assert(segments.size() >= pre_segments_size);  // checked in all builds: the subtraction below is unsigned
+    max_expiring_segments = std::max<unsigned>(max_expiring_segments,segments.size() - pre_segments_size);
+  }
+  
+  map<uint64_t,LogSegment*>::iterator p = segments.begin();
+  while (p != segments.end()) {
+    if (stop < ceph_clock_now())
+      break;
+
+    unsigned num_remaining_segments = (segments.size() - expired_segments.size() - expiring_segments.size());
+    if ((num_remaining_segments <= max_segments) &&
+	(max_events < 0 || num_events - expiring_events - expired_events <= max_events))
+      break;
+
+    // Do not trim too many segments at once for peak workload. If mds keeps creating N segments each tick,
+    // the upper bound of 'num_remaining_segments - max_segments' is '2 * N'
+    if (new_expiring_segments * 2 > num_remaining_segments)
+      break;
+
+    if (max_expiring_segments > 0 &&
+	expiring_segments.size() >= max_expiring_segments)
+      break;
+    
+    // look at first segment
+    LogSegment *ls = p->second;
+    ceph_assert(ls);
+    ++p;
+    
+    if (pending_events.count(ls->seq) ||
+	ls->end > safe_pos) {
+      dout(5) << "trim segment " << ls->seq << "/" << ls->offset << ", not fully flushed yet, safe "
+	      << journaler->get_write_safe_pos() << " < end " << ls->end << dendl;
+      break;
+    }
+
+    if (expiring_segments.count(ls)) {
+      dout(5) << "trim already expiring segment " << ls->seq << "/" << ls->offset
+	      << ", " << ls->num_events << " events" << dendl;
+    } else if (expired_segments.count(ls)) {
+      dout(5) << "trim already expired segment " << ls->seq << "/" << ls->offset
+	      << ", " << ls->num_events << " events" << dendl;
+    } else {
+      ceph_assert(expiring_segments.count(ls) == 0);
+      new_expiring_segments++;
+      expiring_segments.insert(ls);
+      expiring_events += ls->num_events;
+      submit_mutex.unlock();  // try_expire() takes submit_mutex itself when it needs it
+
+      uint64_t last_seq = ls->seq;  // saved before try_expire(), then used to re-seek p
+      try_expire(ls, op_prio);
+
+      submit_mutex.lock();
+      p = segments.lower_bound(last_seq + 1);  // the map may have changed while unlocked
+    }
+  }
+
+  try_to_commit_open_file_table(get_last_segment_seq());
+
+  // discard expired segments and unlock submit_mutex
+  _trim_expired_segments();
+}
+
+// Gather finisher installed by try_expire(): calls back into _maybe_expired().
+class C_MaybeExpiredSegment : public MDSInternalContext {
+  MDLog *log;
+  LogSegment *segment;
+  int prio;
+public:
+  C_MaybeExpiredSegment(MDLog *mdl, LogSegment *s, int p)
+    : MDSInternalContext(mdl->mds), log(mdl), segment(s), prio(p) {}
+  void finish(int res) override {
+    if (res < 0) log->mds->handle_write_error(res);  // report the error, then still re-check
+    log->_maybe_expired(segment, prio);
+  }
+};
+
+/**
+ * Like MDLog::trim, but instead of trimming to max_segments, trim all but the latest
+ * segment.  Returns 0, or -CEPHFS_EAGAIN if a visited segment still has pending events.
+ */
+int MDLog::trim_all()
+{
+  submit_mutex.lock();  // released by _trim_expired_segments() or on the EAGAIN return
+
+  dout(10) << __func__ << ": "
+	   << segments.size()
+           << "/" << expiring_segments.size()
+           << "/" << expired_segments.size() << dendl;
+
+  uint64_t last_seq = 0;  // stays 0 when there are no segments, so the loop below is skipped
+  if (!segments.empty()) {
+    last_seq = get_last_segment_seq();
+    try_to_commit_open_file_table(last_seq);
+  }
+
+  map<uint64_t,LogSegment*>::iterator p = segments.begin();
+  while (p != segments.end() &&
+	 p->first < last_seq &&
+	 p->second->end < safe_pos) { // next segment should have been started
+    LogSegment *ls = p->second;
+    ++p;
+
+    // Caller should have flushed journaler before calling this
+    if (pending_events.count(ls->seq)) {
+      dout(5) << __func__ << ": segment " << ls->seq << " has pending events" << dendl;
+      submit_mutex.unlock();
+      return -CEPHFS_EAGAIN;
+    }
+
+    if (expiring_segments.count(ls)) {
+      dout(5) << "trim already expiring segment " << ls->seq << "/" << ls->offset
+	      << ", " << ls->num_events << " events" << dendl;
+    } else if (expired_segments.count(ls)) {
+      dout(5) << "trim already expired segment " << ls->seq << "/" << ls->offset
+	      << ", " << ls->num_events << " events" << dendl;
+    } else {
+      ceph_assert(expiring_segments.count(ls) == 0);
+      expiring_segments.insert(ls);
+      expiring_events += ls->num_events;
+      submit_mutex.unlock();  // try_expire() takes submit_mutex itself when it needs it
+
+      uint64_t next_seq = ls->seq + 1;  // saved before try_expire(), then used to re-seek p
+      try_expire(ls, CEPH_MSG_PRIO_DEFAULT);
+
+      submit_mutex.lock();
+      p = segments.lower_bound(next_seq);  // the map may have changed while unlocked
+    }
+  }
+
+  _trim_expired_segments();  // also unlocks submit_mutex
+
+  return 0;
+}
+
+
+void MDLog::try_expire(LogSegment *ls, int op_prio)  // callers in this file release submit_mutex first
+{
+  MDSGatherBuilder gather_bld(g_ceph_context);
+  ls->try_to_expire(mds, gather_bld, op_prio);  // may add sub-contexts to the gather
+
+  if (gather_bld.has_subs()) {  // still waiting on something: re-check when the gather finishes
+    dout(5) << "try_expire expiring segment " << ls->seq << "/" << ls->offset << dendl;
+    gather_bld.set_finisher(new C_MaybeExpiredSegment(this, ls, op_prio));
+    gather_bld.activate();
+  } else {  // nothing to wait for: drop ls from the expiring set and hand it to _expired()
+    dout(10) << "try_expire expired segment " << ls->seq << "/" << ls->offset << dendl;
+    submit_mutex.lock();
+    ceph_assert(expiring_segments.count(ls));
+    expiring_segments.erase(ls);
+    expiring_events -= ls->num_events;
+    _expired(ls);
+    submit_mutex.unlock();
+  }
+  
+  logger->set(l_mdl_segexg, expiring_segments.size());  // NOTE(review): read without submit_mutex -- confirm intended
+  logger->set(l_mdl_evexg, expiring_events);
+}
+
+void MDLog::_maybe_expired(LogSegment *ls, int op_prio)  // invoked from C_MaybeExpiredSegment::finish()
+{
+  if (mds->mdcache->is_readonly()) {  // a read-only FS stops the expiry retry here
+    dout(10) << "_maybe_expired, ignoring read-only FS" <<  dendl;
+    return;
+  }
+
+  dout(10) << "_maybe_expired segment " << ls->seq << "/" << ls->offset
+	   << ", " << ls->num_events << " events" << dendl;
+  try_expire(ls, op_prio);  // try again with the same priority
+}
+
+void MDLog::_trim_expired_segments()  // entered with submit_mutex held; RELEASES it before returning
+{
+  ceph_assert(ceph_mutex_is_locked_by_me(submit_mutex));
+
+  uint64_t oft_committed_seq = mds->mdcache->open_file_table.get_committed_log_seq();
+
+  // trim expired segments?
+  bool trimmed = false;
+  while (!segments.empty()) {
+    LogSegment *ls = segments.begin()->second;  // segment with the lowest key
+    if (!expired_segments.count(ls)) {
+      dout(10) << "_trim_expired_segments waiting for " << ls->seq << "/" << ls->offset
+	       << " to expire" << dendl;
+      break;
+    }
+
+    if (!mds_is_shutting_down && ls->seq >= oft_committed_seq) {  // not yet covered by the open file table commit
+      dout(10) << "_trim_expired_segments open file table committedseq " << oft_committed_seq
+	       << " <= " << ls->seq << "/" << ls->offset << dendl;
+      break;
+    }
+    
+    dout(10) << "_trim_expired_segments trimming expired "
+	     << ls->seq << "/0x" << std::hex << ls->offset << std::dec << dendl;
+    expired_events -= ls->num_events;
+    expired_segments.erase(ls);
+    if (pre_segments_size > 0)
+      pre_segments_size--;
+    num_events -= ls->num_events;
+      
+    // this was the oldest segment, adjust expire pos
+    if (journaler->get_expire_pos() < ls->end) {
+      journaler->set_expire_pos(ls->end);
+      logger->set(l_mdl_expos, ls->end);
+    } else {
+      logger->set(l_mdl_expos, ls->offset);
+    }
+    
+    logger->inc(l_mdl_segtrm);
+    logger->inc(l_mdl_evtrm, ls->num_events);
+    
+    segments.erase(ls->seq);
+    delete ls;  // the segment is freed here, after its map entry is erased
+    trimmed = true;
+  }
+
+  submit_mutex.unlock();  // the caller's lock is released here
+
+  if (trimmed)
+    journaler->write_head(0);  // write the journal head (no completion) after trimming
+}
+
+void MDLog::trim_expired_segments()  // locking wrapper around _trim_expired_segments()
+{
+  submit_mutex.lock();
+  _trim_expired_segments();  // unlocks submit_mutex itself, so there is no unlock here
+}
+
+void MDLog::_expired(LogSegment *ls)  // move ls to the expired set, unless it is the current segment
+{
+  ceph_assert(ceph_mutex_is_locked_by_me(submit_mutex));
+
+  dout(5) << "_expired segment " << ls->seq << "/" << ls->offset
+	  << ", " << ls->num_events << " events" << dendl;
+
+  if (!mds_is_shutting_down && ls == peek_current_segment()) {  // the current segment is kept unless shutting down
+    dout(5) << "_expired not expiring " << ls->seq << "/" << ls->offset
+	    << ", last one and !mds_is_shutting_down" << dendl;
+  } else {
+    // expired.
+    expired_segments.insert(ls);
+    expired_events += ls->num_events;
+
+    // Trigger all waiters
+    finish_contexts(g_ceph_context, ls->expiry_waiters);
+    
+    logger->inc(l_mdl_evex, ls->num_events);
+    logger->inc(l_mdl_segex);
+  }
+
+  logger->set(l_mdl_ev, num_events);  // these counters are refreshed on both paths
+  logger->set(l_mdl_evexd, expired_events);
+  logger->set(l_mdl_seg, segments.size());
+  logger->set(l_mdl_segexd, expired_segments.size());
+}
+
+
+
+void MDLog::replay(MDSContext *c)  // c (may be NULL) is completed now if empty, else queued as a waiter
+{
+  ceph_assert(journaler->is_active());
+  ceph_assert(journaler->is_readonly());
+
+  // empty?
+  if (journaler->get_read_pos() == journaler->get_write_pos()) {
+    dout(10) << "replay - journal empty, done." << dendl;
+    mds->mdcache->trim();
+    if (mds->is_standby_replay())
+      mds->update_mlogger();
+    if (c) {
+      c->complete(0);  // nothing to replay: complete immediately with success
+    }
+    return;
+  }
+
+  // add waiter
+  if (c)
+    waitfor_replay.push_back(c);
+
+  // go!
+  dout(10) << "replay start, from " << journaler->get_read_pos()
+	   << " to " << journaler->get_write_pos() << dendl;
+
+  ceph_assert(num_events == 0 || already_replayed);  // events may already exist only after a prior replay
+  if (already_replayed) {
+    // Ensure previous instance of ReplayThread is joined before
+    // we create another one
+    replay_thread.join();
+  }
+  already_replayed = true;
+
+  replay_thread.create("md_log_replay");
+}
+
+
+/**
+ * Resolve the JournalPointer object to a journal file, and
+ * instantiate a Journaler object.  This may re-write the journal
+ * if the journal in RADOS appears to be in an old format.
+ *
+ * This is a separate thread because of the way it is initialized from inside
+ * the mds lock, which is also the global objecter lock -- rather than split
+ * it up into hard-to-read async operations linked up by contexts, it blocks.
+ *
+ * When this function completes, the `journaler` attribute will be set to
+ * a Journaler instance using the latest available serialization format.
+ */
+void MDLog::_recovery_thread(MDSContext *completion)
+{
+  ceph_assert(journaler == NULL);
+  if (g_conf()->mds_journal_format > JOURNAL_FORMAT_MAX) {
+      dout(0) << "Configuration value for mds_journal_format is out of bounds, max is "
+              << JOURNAL_FORMAT_MAX << dendl;
+
+      // Oh dear, something unreadable in the store for this rank: require
+      // operator intervention.
+      mds->damaged_unlocked();
+      ceph_abort();  // damaged should not return
+  }
+
+  // First, read the pointer object.
+  // If the pointer object is not present, then create it with
+  // front = default ino and back = null
+  JournalPointer jp(mds->get_nodeid(), mds->get_metadata_pool());
+  const int read_result = jp.load(mds->objecter);
+  if (read_result == -CEPHFS_ENOENT) {
+    inodeno_t const default_log_ino = MDS_INO_LOG_OFFSET + mds->get_nodeid();
+    jp.front = default_log_ino;
+    int write_result = jp.save(mds->objecter);
+    // Nothing graceful we can do for this
+    ceph_assert(write_result >= 0);
+  } else if (read_result == -CEPHFS_EBLOCKLISTED) {
+    derr << "Blocklisted during JournalPointer read!  Respawning..." << dendl;
+    mds->respawn();
+    ceph_abort(); // Should be unreachable because respawn calls execv
+  } else if (read_result != 0) {
+    mds->clog->error() << "failed to read JournalPointer: " << read_result
+                       << " (" << cpp_strerror(read_result) << ")";
+    mds->damaged_unlocked();
+    ceph_abort();  // Should be unreachable because damaged() calls respawn()
+  }
+
+  // If the back pointer is non-null, that means that a journal
+  // rewrite failed part way through.  Erase the back journal
+  // to clean up.
+  if (jp.back) {
+    if (mds->is_standby_replay()) {
+      dout(1) << "Journal " << jp.front << " is being rewritten, "
+        << "cannot replay in standby until an active MDS completes rewrite" << dendl;
+      std::lock_guard l(mds->mds_lock);
+      if (mds->is_daemon_stopping()) {
+        return;
+      }
+      completion->complete(-CEPHFS_EAGAIN);
+      return;
+    }
+    dout(1) << "Erasing journal " << jp.back << dendl;
+    C_SaferCond erase_waiter;
+    Journaler back("mdlog", jp.back, mds->get_metadata_pool(),
+        CEPH_FS_ONDISK_MAGIC, mds->objecter, logger, l_mdl_jlat,
+        mds->finisher);
+
+    // Read all about this journal (header + extents)
+    C_SaferCond recover_wait;
+    back.recover(&recover_wait);
+    int recovery_result = recover_wait.wait();
+    if (recovery_result == -CEPHFS_EBLOCKLISTED) {
+      derr << "Blocklisted during journal recovery!  Respawning..." << dendl;
+      mds->respawn();
+      ceph_abort(); // Should be unreachable because respawn calls execv
+    } else if (recovery_result != 0) {
+      // Journaler.recover succeeds if no journal objects are present: an error
+      // means something worse like a corrupt header, which we can't handle here.
+      mds->clog->error() << "Error recovering journal " << jp.back << ": "  // the back journal is the one being recovered
+        << cpp_strerror(recovery_result);
+      mds->damaged_unlocked();
+      ceph_assert(recovery_result == 0); // Unreachable because damaged() calls respawn()
+    }
+
+    // We could read journal, so we can erase it.
+    back.erase(&erase_waiter);
+    int erase_result = erase_waiter.wait();
+
+    // If we are successful, or find no data, we can update the JournalPointer to
+    // reflect that the back journal is gone.
+    if (erase_result != 0 && erase_result != -CEPHFS_ENOENT) {
+      derr << "Failed to erase journal " << jp.back << ": " << cpp_strerror(erase_result) << dendl;
+    } else {
+      dout(1) << "Successfully erased journal, updating journal pointer" << dendl;
+      jp.back = 0;
+      int write_result = jp.save(mds->objecter);
+      // Nothing graceful we can do for this
+      ceph_assert(write_result >= 0);
+    }
+  }
+
+  /* Read the header from the front journal */
+  Journaler *front_journal = new Journaler("mdlog", jp.front,
+      mds->get_metadata_pool(), CEPH_FS_ONDISK_MAGIC, mds->objecter,
+      logger, l_mdl_jlat, mds->finisher);
+
+  // Assign to ::journaler so that we can be aborted by ::shutdown while
+  // waiting for journaler recovery
+  {
+    std::lock_guard l(mds->mds_lock);
+    journaler = front_journal;
+  }
+
+  C_SaferCond recover_wait;
+  front_journal->recover(&recover_wait);
+  dout(4) << "Waiting for journal " << jp.front << " to recover..." << dendl;
+  int recovery_result = recover_wait.wait();
+  dout(4) << "Journal " << jp.front << " recovered." << dendl;
+
+  if (recovery_result == -CEPHFS_EBLOCKLISTED) {
+    derr << "Blocklisted during journal recovery!  Respawning..." << dendl;
+    mds->respawn();
+    ceph_abort(); // Should be unreachable because respawn calls execv
+  } else if (recovery_result != 0) {
+    mds->clog->error() << "Error recovering journal " << jp.front << ": "
+      << cpp_strerror(recovery_result);
+    mds->damaged_unlocked();
+    ceph_assert(recovery_result == 0); // Unreachable because damaged() calls respawn()
+  }
+
+  /* Check whether the front journal format is acceptable or needs re-write */
+  if (front_journal->get_stream_format() > JOURNAL_FORMAT_MAX) {
+    dout(0) << "Journal " << jp.front << " is in unknown format " << front_journal->get_stream_format()
+            << ", does this MDS daemon require upgrade?" << dendl;
+    {
+      std::lock_guard l(mds->mds_lock);
+      if (mds->is_daemon_stopping()) {
+        journaler = NULL;
+        delete front_journal;
+        return;
+      }
+      completion->complete(-CEPHFS_EINVAL);
+    }
+  } else if (mds->is_standby_replay() || front_journal->get_stream_format() >= g_conf()->mds_journal_format) {
+    /* The journal is of configured format, or we are in standbyreplay and will
+     * tolerate replaying old journals until we have to go active. Use front_journal as
+     * our journaler attribute and complete */
+    dout(4) << "Recovered journal " << jp.front << " in format " << front_journal->get_stream_format() << dendl;
+    {
+      std::lock_guard l(mds->mds_lock);
+      journaler->set_write_error_handler(new C_MDL_WriteError(this));
+      if (mds->is_daemon_stopping()) {
+        return;
+      }
+      completion->complete(0);
+    }
+  } else {
+    /* Hand off to reformat routine, which will ultimately set the
+     * completion when it has done its thing */
+    dout(1) << "Journal " << jp.front << " has old format "
+      << front_journal->get_stream_format() << ", it will now be updated" << dendl;
+    _reformat_journal(jp, front_journal, completion);
+  }
+}
+
+/**
+ * Blocking rewrite of the journal to a new file, followed by
+ * swap of journal pointer to point to the new one.
+ *
+ * We write the new journal to the 'back' journal from the JournalPointer,
+ * swapping pointers to make that one the front journal only when we have
+ * safely completed.
+ */
+void MDLog::_reformat_journal(JournalPointer const &jp_in, Journaler *old_journal, MDSContext *completion)
+{
+  ceph_assert(!jp_in.is_null());
+  ceph_assert(completion != NULL);
+  ceph_assert(old_journal != NULL);
+
+  JournalPointer jp = jp_in;
+
+  /* Set JournalPointer.back to the location we will write the new journal */
+  inodeno_t primary_ino = MDS_INO_LOG_OFFSET + mds->get_nodeid();
+  inodeno_t secondary_ino = MDS_INO_LOG_BACKUP_OFFSET + mds->get_nodeid();
+  jp.back = (jp.front == primary_ino ? secondary_ino : primary_ino);
+  int write_result = jp.save(mds->objecter);
+  ceph_assert(write_result == 0);
+
+  /* Create the new Journaler file */
+  Journaler *new_journal = new Journaler("mdlog", jp.back,
+      mds->get_metadata_pool(), CEPH_FS_ONDISK_MAGIC, mds->objecter, logger, l_mdl_jlat, mds->finisher);
+  dout(4) << "Writing new journal header " << jp.back << dendl;
+  file_layout_t new_layout = old_journal->get_layout();
+  new_journal->set_writeable();
+  new_journal->create(&new_layout, g_conf()->mds_journal_format);
+
+  /* Write the new journal header to RADOS */
+  C_SaferCond write_head_wait;
+  new_journal->write_head(&write_head_wait);
+  write_head_wait.wait();
+
+  // Read in the old journal, and whenever we have readable events,
+  // write them to the new journal.
+  int r = 0;
+
+  // In old format journals before event_seq was introduced, the serialized
+  // offset of a SubtreeMap message in the log is used as the unique ID for
+  // a log segment.  Because we change serialization, this will end up changing
+  // for us, so we have to explicitly update the fields that point back to that
+  // log segment.
+  std::map<LogSegment::seq_t, LogSegment::seq_t> segment_pos_rewrite;
+
+  // The logic in here borrowed from replay_thread expects mds_lock to be held,
+  // e.g. between checking readable and doing wait_for_readable so that journaler
+  // state doesn't change in between.
+  uint32_t events_transcribed = 0;
+  while (1) {
+    while (!old_journal->is_readable() &&
+	   old_journal->get_read_pos() < old_journal->get_write_pos() &&
+	   !old_journal->get_error()) {
+
+      // Issue a journal prefetch
+      C_SaferCond readable_waiter;
+      old_journal->wait_for_readable(&readable_waiter);
+
+      // Wait for a journal prefetch to complete
+      readable_waiter.wait();
+    }
+    if (old_journal->get_error()) {
+      r = old_journal->get_error();
+      dout(0) << "_replay journaler got error " << r << ", aborting" << dendl;
+      break;
+    }
+
+    if (!old_journal->is_readable() &&
+	old_journal->get_read_pos() == old_journal->get_write_pos())
+      break;
+
+    // Read one serialized LogEvent
+    ceph_assert(old_journal->is_readable());
+    bufferlist bl;
+    uint64_t le_pos = old_journal->get_read_pos();
+    bool r = old_journal->try_read_entry(bl);
+    if (!r && old_journal->get_error())
+      continue;
+    ceph_assert(r);
+
+    // Update segment_pos_rewrite
+    auto le = LogEvent::decode_event(bl.cbegin());
+    if (le) {
+      bool modified = false;
+
+      if (le->get_type() == EVENT_SUBTREEMAP ||
+          le->get_type() == EVENT_RESETJOURNAL) {
+        auto sle = dynamic_cast<ESubtreeMap*>(le.get());
+        if (sle == NULL || sle->event_seq == 0) {
+          // A non-explicit event seq: the effective sequence number 
+          // of this segment is it's position in the old journal and
+          // the new effective sequence number will be its position
+          // in the new journal.
+          segment_pos_rewrite[le_pos] = new_journal->get_write_pos();
+          dout(20) << __func__ << " discovered segment seq mapping "
+            << le_pos << " -> " << new_journal->get_write_pos() << dendl;
+        }
+      } else {
+        event_seq++;
+      }
+
+      // Rewrite segment references if necessary
+      EMetaBlob *blob = le->get_metablob();
+      if (blob) {
+        modified = blob->rewrite_truncate_finish(mds, segment_pos_rewrite);
+      }
+
+      // Zero-out expire_pos in subtreemap because offsets have changed
+      // (expire_pos is just an optimization so it's safe to eliminate it)
+      if (le->get_type() == EVENT_SUBTREEMAP
+          || le->get_type() == EVENT_SUBTREEMAP_TEST) {
+        auto& sle = dynamic_cast<ESubtreeMap&>(*le);
+        dout(20) << __func__ << " zeroing expire_pos in subtreemap event at "
+          << le_pos << " seq=" << sle.event_seq << dendl;
+        sle.expire_pos = 0;
+        modified = true;
+      }
+
+      if (modified) {
+        bl.clear();
+        le->encode_with_header(bl, mds->mdsmap->get_up_features());
+      }
+    } else {
+      // Failure from LogEvent::decode, our job is to change the journal wrapper,
+      // not validate the contents, so pass it through.
+      dout(1) << __func__ << " transcribing un-decodable LogEvent at old position "
+        << old_journal->get_read_pos() << ", new position " << new_journal->get_write_pos()
+        << dendl;
+    }
+
+    // Write (buffered, synchronous) one serialized LogEvent
+    events_transcribed += 1;
+    new_journal->append_entry(bl);
+  }
+
+  dout(1) << "Transcribed " << events_transcribed << " events, flushing new journal" << dendl;
+  C_SaferCond flush_waiter;
+  new_journal->flush(&flush_waiter);
+  flush_waiter.wait();
+
+  // If failed to rewrite journal, leave the part written journal
+  // as garbage to be cleaned up next startup.
+  ceph_assert(r == 0);
+
+  /* Now that the new journal is safe, we can flip the pointers */
+  inodeno_t const tmp = jp.front;
+  jp.front = jp.back;
+  jp.back = tmp;
+  write_result = jp.save(mds->objecter);
+  ceph_assert(write_result == 0);
+
+  /* Delete the old journal to free space */
+  dout(1) << "New journal flushed, erasing old journal" << dendl;
+  C_SaferCond erase_waiter;
+  old_journal->erase(&erase_waiter);
+  int erase_result = erase_waiter.wait();
+  ceph_assert(erase_result == 0);
+  {
+    std::lock_guard l(mds->mds_lock);
+    if (mds->is_daemon_stopping()) {
+      delete new_journal;
+      return;
+    }
+    ceph_assert(journaler == old_journal);
+    journaler = NULL;
+    delete old_journal;
+
+    /* Update the pointer to reflect we're back in clean single journal state. */
+    jp.back = 0;
+    write_result = jp.save(mds->objecter);
+    ceph_assert(write_result == 0);
+
+    /* Reset the Journaler object to its default state */
+    dout(1) << "Journal rewrite complete, continuing with normal startup" << dendl;
+    if (mds->is_daemon_stopping()) {
+      delete new_journal;
+      return;
+    }
+    journaler = new_journal;
+    journaler->set_readonly();
+    journaler->set_write_error_handler(new C_MDL_WriteError(this));
+
+    /* Trigger completion */
+    if (mds->is_daemon_stopping()) {
+      return;
+    }
+    completion->complete(0);
+  }
+}
+
+
+// Body of the replay thread (runs on its own thread, not the MDS main thread).
+//
+// Reads entries from the journaler one at a time until the read position
+// reaches the write position or the journaler reports an error.  Each entry
+// is decoded into a LogEvent, attributed to a LogSegment, and replayed while
+// holding mds->mds_lock.  When the loop ends, the contexts queued in
+// waitfor_replay are finished with the resulting code `r` (0 on success,
+// -CEPHFS_EAGAIN or another journaler error otherwise).
+//
+// The function returns early, without finishing the waiters, whenever
+// mds->is_daemon_stopping() is observed under mds_lock.
+void MDLog::_replay_thread()
+{
+  dout(10) << "_replay_thread start" << dendl;
+
+  // loop
+  // Overall result handed to the waitfor_replay contexts at the end.
+  int r = 0;
+  while (1) {
+    // wait for read?
+    // Block until the journaler has something readable, has caught up with
+    // the write position, or has hit an error.
+    while (!journaler->is_readable() &&
+	   journaler->get_read_pos() < journaler->get_write_pos() &&
+	   !journaler->get_error()) {
+      C_SaferCond readable_waiter;
+      journaler->wait_for_readable(&readable_waiter);
+      r = readable_waiter.wait();
+    }
+    if (journaler->get_error()) {
+      // Translate the journaler error: some cases are retryable (-CEPHFS_EAGAIN),
+      // others mark the rank damaged.  Every path through this branch leaves
+      // the replay loop.
+      r = journaler->get_error();
+      dout(0) << "_replay journaler got error " << r << ", aborting" << dendl;
+      if (r == -CEPHFS_ENOENT) {
+        if (mds->is_standby_replay()) {
+          // journal has been trimmed by somebody else
+          r = -CEPHFS_EAGAIN;
+        } else {
+          mds->clog->error() << "missing journal object";
+          mds->damaged_unlocked();
+          ceph_abort();  // Should be unreachable because damaged() calls respawn()
+        }
+      } else if (r == -CEPHFS_EINVAL) {
+        if (journaler->get_read_pos() < journaler->get_expire_pos()) {
+          // this should only happen if you're following somebody else
+          if(journaler->is_readonly()) {
+            dout(0) << "expire_pos is higher than read_pos, returning CEPHFS_EAGAIN" << dendl;
+            r = -CEPHFS_EAGAIN;
+          } else {
+            mds->clog->error() << "invalid journaler offsets";
+            mds->damaged_unlocked();
+            ceph_abort();  // Should be unreachable because damaged() calls respawn()
+          }
+        } else {
+          /* re-read head and check it
+           * Given that replay happens in a separate thread and
+           * the MDS is going to either shut down or restart when
+           * we return this error, doing it synchronously is fine
+           * -- as long as we drop the main mds lock--. */
+          C_SaferCond reread_fin;
+          journaler->reread_head(&reread_fin);
+          int err = reread_fin.wait();
+          if (err) {
+            if (err == -CEPHFS_ENOENT && mds->is_standby_replay()) {
+              r = -CEPHFS_EAGAIN;
+              dout(1) << "Journal header went away while in standby replay, journal rewritten?"
+                      << dendl;
+              break;
+            } else {
+                dout(0) << "got error while reading head: " << cpp_strerror(err)
+                        << dendl;
+
+                mds->clog->error() << "error reading journal header";
+                mds->damaged_unlocked();
+                ceph_abort();  // Should be unreachable because damaged() calls
+                            // respawn()
+            }
+          }
+          // Header re-read succeeded: drop segments that are now behind the
+          // (possibly advanced) expire position before re-checking offsets.
+	  standby_trim_segments();
+          if (journaler->get_read_pos() < journaler->get_expire_pos()) {
+            dout(0) << "expire_pos is higher than read_pos, returning CEPHFS_EAGAIN" << dendl;
+            r = -CEPHFS_EAGAIN;
+          }
+        }
+      }
+      break;
+    }
+
+    // Nothing readable and fully caught up: replay is complete.
+    if (!journaler->is_readable() &&
+	journaler->get_read_pos() == journaler->get_write_pos())
+      break;
+    
+    ceph_assert(journaler->is_readable() || mds->is_daemon_stopping());
+    
+    // read it
+    uint64_t pos = journaler->get_read_pos();
+    bufferlist bl;
+    // NOTE: this `bool r` shadows the outer `int r` for the rest of the loop
+    // body; the outer result code is not modified below this point.
+    bool r = journaler->try_read_entry(bl);
+    // A failed read with a journaler error is handled by the error branch at
+    // the top of the next iteration.
+    if (!r && journaler->get_error())
+      continue;
+    ceph_assert(r);
+    
+    // unpack event
+    auto le = LogEvent::decode_event(bl.cbegin());
+    if (!le) {
+      // Undecodable entry: log a hexdump, then either skip it (when
+      // mds_log_skip_corrupt_events is set) or mark the rank damaged.
+      dout(0) << "_replay " << pos << "~" << bl.length() << " / " << journaler->get_write_pos() 
+	      << " -- unable to decode event" << dendl;
+      dout(0) << "dump of unknown or corrupt event:\n";
+      bl.hexdump(*_dout);
+      *_dout << dendl;
+
+      mds->clog->error() << "corrupt journal event at " << pos << "~"
+                         << bl.length() << " / "
+                         << journaler->get_write_pos();
+      if (g_conf()->mds_log_skip_corrupt_events) {
+        continue;
+      } else {
+        mds->damaged_unlocked();
+        ceph_abort();  // Should be unreachable because damaged() calls
+                    // respawn()
+      }
+
+    }
+    le->set_start_off(pos);
+
+    // new segment?
+    // Subtree-map and reset-journal events open a new LogSegment.  The
+    // segment's sequence number is the event's explicit event_seq when it has
+    // one (> 0), otherwise the event's journal offset.
+    if (le->get_type() == EVENT_SUBTREEMAP ||
+	le->get_type() == EVENT_RESETJOURNAL) {
+      auto sle = dynamic_cast<ESubtreeMap*>(le.get());
+      if (sle && sle->event_seq > 0)
+	event_seq = sle->event_seq;
+      else
+	event_seq = pos;
+      segments[event_seq] = new LogSegment(event_seq, pos);
+      logger->set(l_mdl_seg, segments.size());
+    } else {
+      event_seq++;
+    }
+
+    // have we seen an import map yet?
+    // Events read before the first segment has been opened are logged and
+    // skipped; they are never replayed.
+    if (segments.empty()) {
+      dout(10) << "_replay " << pos << "~" << bl.length() << " / " << journaler->get_write_pos() 
+	       << " " << le->get_stamp() << " -- waiting for subtree_map.  (skipping " << *le << ")" << dendl;
+    } else {
+      dout(10) << "_replay " << pos << "~" << bl.length() << " / " << journaler->get_write_pos() 
+	       << " " << le->get_stamp() << ": " << *le << dendl;
+      le->_segment = get_current_segment();    // replay may need this
+      le->_segment->num_events++;
+      le->_segment->end = journaler->get_read_pos();
+      num_events++;
+
+      {
+        // The event is applied under mds_lock; bail out entirely if the
+        // daemon started stopping in the meantime.
+        std::lock_guard l(mds->mds_lock);
+        if (mds->is_daemon_stopping()) {
+          return;
+        }
+        logger->inc(l_mdl_replayed);
+        le->replay(mds);
+      }
+    }
+
+    logger->set(l_mdl_rdpos, pos);
+  }
+
+  // done!
+  if (r == 0) {
+    ceph_assert(journaler->get_read_pos() == journaler->get_write_pos());
+    dout(10) << "_replay - complete, " << num_events
+	     << " events" << dendl;
+
+    logger->set(l_mdl_expos, journaler->get_expire_pos());
+  }
+
+  safe_pos = journaler->get_write_safe_pos();
+
+  dout(10) << "_replay_thread kicking waiters" << dendl;
+  {
+    std::lock_guard l(mds->mds_lock);
+    if (mds->is_daemon_stopping()) {
+      return;
+    }
+    pre_segments_size = segments.size();  // get num of logs when replay is finished
+    finish_contexts(g_ceph_context, waitfor_replay, r);  
+  }
+
+  dout(10) << "_replay_thread finish" << dendl;
+}
+
+// Drop log segments that lie entirely at or before the journaler's expire
+// position.  The newest remaining segment is always kept, even if expired.
+// When at least one segment was dropped, the metadata cache is trimmed
+// afterwards.
+void MDLog::standby_trim_segments()
+{
+  dout(10) << "standby_trim_segments" << dendl;
+  const uint64_t trim_limit = journaler->get_expire_pos();
+  dout(10) << " expire_pos=" << trim_limit << dendl;
+
+  mds->mdcache->open_file_table.trim_destroyed_inos(trim_limit);
+
+  bool dropped_any = false;
+  for (;;) {
+    if (!have_any_segments()) {
+      break;
+    }
+
+    LogSegment *oldest = get_oldest_segment();
+    const auto oldest_len = oldest->end - oldest->offset;
+    dout(10) << " segment seq=" << oldest->seq << " " << oldest->offset <<
+      "~" << oldest_len << dendl;
+
+    // Segments are visited oldest-first, so the first unexpired one ends the scan.
+    if (oldest->end > trim_limit) {
+      dout(10) << " won't remove, not expired!" << dendl;
+      break;
+    }
+
+    // Never leave the log without any segment at all.
+    if (segments.size() == 1) {
+      dout(10) << " won't remove, last segment!" << dendl;
+      break;
+    }
+
+    dout(10) << " removing segment" << dendl;
+    mds->mdcache->standby_trim_segment(oldest);
+    remove_oldest_segment();
+    dropped_any = true;
+  }
+
+  if (!dropped_any) {
+    dout(20) << " removed no segments!" << dendl;
+    return;
+  }
+
+  dout(20) << " calling mdcache->trim!" << dendl;
+  mds->mdcache->trim();
+}
+
+// Emit a "replay_status" object section describing journal positions and the
+// number of events/segments seen so far.  Positions are reported as 0 when no
+// journaler is attached.
+void MDLog::dump_replay_status(Formatter *f) const
+{
+  f->open_object_section("replay_status");
+
+  uint64_t rd = 0;
+  uint64_t wr = 0;
+  uint64_t ex = 0;
+  if (journaler) {
+    rd = journaler->get_read_pos();
+    wr = journaler->get_write_pos();
+    ex = journaler->get_expire_pos();
+  }
+
+  f->dump_unsigned("journal_read_pos", rd);
+  f->dump_unsigned("journal_write_pos", wr);
+  f->dump_unsigned("journal_expire_pos", ex);
+  f->dump_unsigned("num_events", get_num_events());
+  f->dump_unsigned("num_segments", get_num_segments());
+
+  f->close_section();
+}
diff --git a/src/mds/MDLog.h b/src/mds/MDLog.h
new file mode 100644
index 000000000..55bb4c142
--- /dev/null
+++ b/src/mds/MDLog.h
@@ -0,0 +1,312 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software 
+ * Foundation.  See file COPYING.
+ * 
+ */
+#ifndef CEPH_MDLOG_H
+#define CEPH_MDLOG_H
+
+#include "include/common_fwd.h"
+
+// Indices of the MDLog performance counters, passed to logger->set() and
+// logger->inc().  l_mdl_first and l_mdl_last bracket the range and are not
+// counters themselves.
+// NOTE(review): descriptions marked "presumably" are inferred from the
+// enumerator names only -- confirm against MDLog::create_logger().
+enum {
+  l_mdl_first = 5000,
+  l_mdl_evadd,     // presumably: events added
+  l_mdl_evex,      // presumably: events expired
+  l_mdl_evtrm,     // presumably: events trimmed
+  l_mdl_ev,        // presumably: current number of events
+  l_mdl_evexg,     // presumably: events currently expiring
+  l_mdl_evexd,     // presumably: events currently expired
+  l_mdl_segadd,    // presumably: segments added
+  l_mdl_segex,     // presumably: segments expired
+  l_mdl_segtrm,    // presumably: segments trimmed
+  l_mdl_seg,       // number of segments; replay sets it to segments.size()
+  l_mdl_segexg,    // presumably: segments currently expiring
+  l_mdl_segexd,    // presumably: segments currently expired
+  l_mdl_expos,     // journal expire position; set when replay completes
+  l_mdl_wrpos,     // presumably: journal write position
+  l_mdl_rdpos,     // start offset of the entry most recently read by replay
+  l_mdl_jlat,      // presumably: journaler flush latency
+  l_mdl_replayed,  // incremented once per event replayed
+  l_mdl_last,
+};
+
+#include "include/types.h"
+#include "include/Context.h"
+
+#include "MDSContext.h"
+#include "common/Cond.h"
+#include "common/Finisher.h"
+#include "common/Thread.h"
+
+#include "LogSegment.h"
+
+#include <list>
+#include <map>
+
+class Journaler;
+class JournalPointer;
+class LogEvent;
+class MDSRank;
+class LogSegment;
+class ESubtreeMap;
+
+/**
+ * MDLog - the metadata server's journal manager.
+ *
+ * Owns the Journaler and the ordered map of LogSegments, and exposes the
+ * entry points for submitting, flushing, trimming and replaying log events.
+ * Three helper threads (replay, recovery, submit) each call back into a
+ * private MDLog method.
+ *
+ * Methods prefixed with an underscore that have an unprefixed twin (for
+ * example _start_entry / start_entry) are the variants that expect
+ * submit_mutex to already be held; the unprefixed twin takes the lock.
+ */
+class MDLog {
+public:
+  explicit MDLog(MDSRank *m) : mds(m),
+                      replay_thread(this),
+                      recovery_thread(this),
+                      submit_thread(this) {}
+  ~MDLog();
+
+  const std::set<LogSegment*> &get_expiring_segments() const
+  {
+    return expiring_segments;
+  }
+
+  void create_logger();
+  void set_write_iohint(unsigned iohint_flags);
+
+  void start_new_segment() {
+    std::lock_guard l(submit_mutex);
+    _start_new_segment();
+  }
+  void prepare_new_segment() {
+    std::lock_guard l(submit_mutex);
+    _prepare_new_segment();
+  }
+  // Journal a subtree map for the current segment.  When a completion is
+  // supplied the log is also flushed (after submit_mutex has been released).
+  void journal_segment_subtree_map(MDSContext *onsync=nullptr) {
+    {
+      std::lock_guard l{submit_mutex};
+      _journal_segment_subtree_map(onsync);
+    }
+    if (onsync)
+      flush();
+  }
+
+  // Newest segment, or nullptr when there are no segments.
+  LogSegment *peek_current_segment() {
+    return segments.empty() ? nullptr : segments.rbegin()->second;
+  }
+
+  // Newest segment; asserts that at least one segment exists.
+  LogSegment *get_current_segment() { 
+    ceph_assert(!segments.empty());
+    return segments.rbegin()->second;
+  }
+
+  // Segment with the given sequence number, or nullptr if there is none.
+  // Uses a single map lookup.
+  LogSegment *get_segment(LogSegment::seq_t seq) {
+    auto it = segments.find(seq);
+    return it != segments.end() ? it->second : nullptr;
+  }
+
+  bool have_any_segments() const {
+    return !segments.empty();
+  }
+
+  void flush_logger();
+
+  size_t get_num_events() const { return num_events; }
+  size_t get_num_segments() const { return segments.size(); }
+
+  uint64_t get_read_pos() const;
+  uint64_t get_write_pos() const;
+  uint64_t get_safe_pos() const;
+  Journaler *get_journaler() { return journaler; }
+  bool empty() const { return segments.empty(); }
+
+  bool is_capped() const { return mds_is_shutting_down; }
+  void cap();
+
+  void kick_submitter();
+  void shutdown();
+
+  void _start_entry(LogEvent *e);
+  void start_entry(LogEvent *e) {
+    std::lock_guard l(submit_mutex);
+    _start_entry(e);
+  }
+  void cancel_entry(LogEvent *e);
+  void _submit_entry(LogEvent *e, MDSLogContextBase *c);
+  // Queue an already-started entry and wake the submit thread.
+  void submit_entry(LogEvent *e, MDSLogContextBase *c = nullptr) {
+    std::lock_guard l(submit_mutex);
+    _submit_entry(e, c);
+    submit_cond.notify_all();
+  }
+  // start_entry() + submit_entry() under a single acquisition of submit_mutex.
+  void start_submit_entry(LogEvent *e, MDSLogContextBase *c = nullptr) {
+    std::lock_guard l(submit_mutex);
+    _start_entry(e);
+    _submit_entry(e, c);
+    submit_cond.notify_all();
+  }
+  bool entry_is_open() const { return cur_event != nullptr; }
+
+  void wait_for_safe( MDSContext *c );
+  void flush();
+  bool is_flushed() const {
+    return unflushed == 0;
+  }
+
+  void trim_expired_segments();
+  void trim(int max=-1);
+  int trim_all();
+  // True when no segment is in the expiring or expired set.
+  bool expiry_done() const
+  {
+    return expiring_segments.empty() && expired_segments.empty();
+  }
+
+  void create(MDSContext *onfinish);  // fresh, empty log! 
+  void open(MDSContext *onopen);      // append() or replay() to follow!
+  void reopen(MDSContext *onopen);
+  void append();
+  void replay(MDSContext *onfinish);
+
+  void standby_trim_segments();
+
+  void dump_replay_status(Formatter *f) const;
+
+  MDSRank *mds;
+  // replay state
+  std::map<inodeno_t, std::set<inodeno_t>> pending_exports;
+
+protected:
+  // An event waiting in pending_events, with its optional completion and a
+  // flag requesting a flush.
+  struct PendingEvent {
+    PendingEvent(LogEvent *e, MDSContext *c, bool f=false) : le(e), fin(c), flush(f) {}
+    LogEvent *le;
+    MDSContext *fin;
+    bool flush;
+  };
+
+  // -- replay --
+  // Thread whose body is MDLog::_replay_thread().
+  class ReplayThread : public Thread {
+  public:
+    explicit ReplayThread(MDLog *l) : log(l) {}
+    void* entry() override {
+      log->_replay_thread();
+      return nullptr;
+    }
+  private:
+    MDLog *log;
+  } replay_thread;
+
+  // Journal recovery/rewrite logic
+  // Thread whose body is MDLog::_recovery_thread(); the completion must be
+  // set with set_completion() before the thread runs if one is wanted.
+  class RecoveryThread : public Thread {
+  public:
+    explicit RecoveryThread(MDLog *l) : log(l) {}
+    void set_completion(MDSContext *c) {completion = c;}
+    void* entry() override {
+      log->_recovery_thread(completion);
+      return nullptr;
+    }
+  private:
+    MDLog *log;
+    MDSContext *completion = nullptr;
+  } recovery_thread;
+
+  // Thread whose body is MDLog::_submit_thread().
+  class SubmitThread : public Thread {
+  public:
+    explicit SubmitThread(MDLog *l) : log(l) {}
+    void* entry() override {
+      log->_submit_thread();
+      return nullptr;
+    }
+  private:
+    MDLog *log;
+  } submit_thread;
+
+  friend class ReplayThread;
+  friend class C_MDL_Replay;
+  friend class MDSLogContextBase;
+  friend class SubmitThread;
+  // -- subtreemaps --
+  friend class ESubtreeMap;
+  friend class MDCache;
+
+  void _replay();         // old way
+  void _replay_thread();  // new way
+
+  void _recovery_thread(MDSContext *completion);
+  void _reformat_journal(JournalPointer const &jp, Journaler *old_journal, MDSContext *completion);
+
+  // Advance safe_pos; it must never move backwards.
+  void set_safe_pos(uint64_t pos)
+  {
+    std::lock_guard l(submit_mutex);
+    ceph_assert(pos >= safe_pos);
+    safe_pos = pos;
+  }
+
+  void _submit_thread();
+
+  uint64_t get_last_segment_seq() const {
+    ceph_assert(!segments.empty());
+    return segments.rbegin()->first;
+  }
+  // Oldest segment; asserts that at least one segment exists, matching
+  // get_current_segment() (dereferencing begin() of an empty map is undefined).
+  LogSegment *get_oldest_segment() {
+    ceph_assert(!segments.empty());
+    return segments.begin()->second;
+  }
+  // Delete the oldest segment and remove it from the map; asserts that at
+  // least one segment exists.
+  void remove_oldest_segment() {
+    ceph_assert(!segments.empty());
+    auto p = segments.begin();
+    delete p->second;
+    segments.erase(p);
+  }
+
+  int num_events = 0; // in events
+  int unflushed = 0;
+  bool mds_is_shutting_down = false;
+
+  // Log position which is persistent *and* for which
+  // submit_entry wait_for_safe callbacks have already
+  // been called.
+  uint64_t safe_pos = 0;
+
+  inodeno_t ino;
+  Journaler *journaler = nullptr;
+
+  PerfCounters *logger = nullptr;
+
+  bool already_replayed = false;
+
+  // Contexts finished by the replay thread once replay ends.
+  MDSContext::vec waitfor_replay;
+
+  // -- segments --
+  // Segments keyed by sequence number; begin() is the oldest, rbegin() the
+  // newest.  The map owns the LogSegment objects.
+  std::map<uint64_t,LogSegment*> segments;
+  std::set<LogSegment*> expiring_segments;
+  std::set<LogSegment*> expired_segments;
+  std::size_t pre_segments_size = 0;            // the num of segments when the mds finished replay-journal, to calc the num of segments growing
+  uint64_t event_seq = 0;
+  int expiring_events = 0;
+  int expired_events = 0;
+
+  int64_t mdsmap_up_features = 0;
+  std::map<uint64_t,std::list<PendingEvent> > pending_events; // log segment -> event list
+  ceph::mutex submit_mutex = ceph::make_mutex("MDLog::submit_mutex");
+  ceph::condition_variable submit_cond;
+
+private:
+  friend class C_MaybeExpiredSegment;
+  friend class C_MDL_Flushed;
+  friend class C_OFT_Committed;
+
+  // -- segments --
+  void _start_new_segment();
+  void _prepare_new_segment();
+  void _journal_segment_subtree_map(MDSContext *onsync);
+
+  void try_to_commit_open_file_table(uint64_t last_seq);
+
+  void try_expire(LogSegment *ls, int op_prio);
+  void _maybe_expired(LogSegment *ls, int op_prio);
+  void _expired(LogSegment *ls);
+  void _trim_expired_segments();
+  void write_head(MDSContext *onfinish);
+
+  // -- events --
+  // Event started with _start_entry() and not yet submitted or cancelled.
+  LogEvent *cur_event = nullptr;
+};
+#endif
diff --git a/src/mds/MDSAuthCaps.cc b/src/mds/MDSAuthCaps.cc
new file mode 100644
index 000000000..b78ebd661
--- /dev/null
+++ b/src/mds/MDSAuthCaps.cc
@@ -0,0 +1,469 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2014 Red Hat
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software 
+ * Foundation.  See file COPYING.
+ * 
+ */
+
+#include <string_view>
+
+#include <errno.h>
+
+#include <boost/spirit/include/qi.hpp>
+#include <boost/spirit/include/phoenix_operator.hpp>
+#include <boost/spirit/include/phoenix.hpp>
+
+#include "common/debug.h"
+#include "MDSAuthCaps.h"
+#include "mdstypes.h"
+#include "include/ipaddr.h"
+
+#define dout_subsys ceph_subsys_mds
+
+#undef dout_prefix
+#define dout_prefix *_dout << "MDSAuthCap "
+
+using std::ostream;
+using std::string;
+using std::vector;
+namespace qi = boost::spirit::qi;
+namespace ascii = boost::spirit::ascii;
+namespace phoenix = boost::phoenix;
+
+// Boost.Spirit Qi grammar that parses an MDS capability string into an
+// MDSAuthCaps.  The start rule is `mdscaps`: a list of `grant` clauses
+// separated by ';' or ','.  Each grant is the literal "allow", a capability
+// spec, an optional match clause and an optional "network <addr>" suffix.
+//
+// Alternatives written with `|` are tried in the order listed, so the order
+// of the branches in `match` and `capspec` is significant.
+template <typename Iterator>
+struct MDSCapParser : qi::grammar<Iterator, MDSAuthCaps()>
+{
+  MDSCapParser() : MDSCapParser::base_type(mdscaps)
+  {
+    using qi::attr;
+    using qi::bool_;
+    using qi::char_;
+    using qi::int_;
+    using qi::uint_;
+    using qi::lexeme;
+    using qi::alnum;
+    using qi::_val;
+    using qi::_1;
+    using qi::_2;
+    using qi::_3;
+    using qi::eps;
+    using qi::lit;
+
+    // One or more space, newline or tab characters.
+    spaces = +(lit(' ') | lit('\n') | lit('\t'));
+
+    // A path may be double-quoted, single-quoted, or a bare run of the
+    // characters listed in unquoted_path.
+    quoted_path %=
+      lexeme[lit("\"") >> *(char_ - '"') >> '"'] | 
+      lexeme[lit("'") >> *(char_ - '\'') >> '\''];
+    unquoted_path %= +char_("a-zA-Z0-9_./-");
+    network_str %= +char_("/.:a-fA-F0-9][");
+    fs_name_str %= +char_("a-zA-Z0-9_.-");
+
+    // match := [path=<path>] [uid=<uid> [gids=<gid>[,<gid>...]]
+    // TODO: allow fsname, and root_squash to be specified with uid, and gidlist
+    // Every keyword rule below consumes the whitespace that precedes it.
+    // gidlist and fs_name are wrapped in the optional operator `-`, so they
+    // also succeed (with an empty attribute) when the keyword is absent.
+    path %= (spaces >> lit("path") >> lit('=') >> (quoted_path | unquoted_path));
+    uid %= (spaces >> lit("uid") >> lit('=') >> uint_);
+    uintlist %= (uint_ % lit(','));
+    gidlist %= -(spaces >> lit("gids") >> lit('=') >> uintlist);
+    fs_name %= -(spaces >> lit("fsname") >> lit('=') >> fs_name_str);
+    root_squash %= (spaces >> lit("root_squash") >> attr(true));
+    // The whole match clause is optional.  The placeholders _1.._3 refer to
+    // the sub-rule attributes in the order written, so e.g. the first branch
+    // calls MDSCapMatch(path, fs_name, root_squash).
+    match = -(
+             (fs_name >> path >> root_squash)[_val = phoenix::construct<MDSCapMatch>(_2, _1, _3)] |
+	     (uid >> gidlist)[_val = phoenix::construct<MDSCapMatch>(_1, _2)] |
+	     (path >> uid >> gidlist)[_val = phoenix::construct<MDSCapMatch>(_1, _2, _3)] |
+             (fs_name >> path)[_val = phoenix::construct<MDSCapMatch>(_2, _1)] |
+             (fs_name >> root_squash)[_val = phoenix::construct<MDSCapMatch>(std::string(), _1, _2)] |
+             (path >> root_squash)[_val = phoenix::construct<MDSCapMatch>(_1, std::string(), _2)] |
+             (path)[_val = phoenix::construct<MDSCapMatch>(_1)] |
+             (root_squash)[_val = phoenix::construct<MDSCapMatch>(std::string(), std::string(), _1)] |
+             (fs_name)[_val = phoenix::construct<MDSCapMatch>(std::string(),
+							      _1)]);
+
+    // capspec = * | r[w][f][p][s]
+    // Longer literals are listed before the shorter ones they begin with
+    // (e.g. "rwfps" before "rw" before "r") so the longest spelling is tried
+    // first.
+    capspec = spaces >> (
+        lit("*")[_val = MDSCapSpec(MDSCapSpec::ALL)]
+        |
+        lit("all")[_val = MDSCapSpec(MDSCapSpec::ALL)]
+        |
+        (lit("rwfps"))[_val = MDSCapSpec(MDSCapSpec::RWFPS)]
+        |
+        (lit("rwps"))[_val = MDSCapSpec(MDSCapSpec::RWPS)]
+        |
+        (lit("rwfp"))[_val = MDSCapSpec(MDSCapSpec::RWFP)]
+        |
+        (lit("rwfs"))[_val = MDSCapSpec(MDSCapSpec::RWFS)]
+        |
+        (lit("rwp"))[_val = MDSCapSpec(MDSCapSpec::RWP)]
+        |
+        (lit("rws"))[_val = MDSCapSpec(MDSCapSpec::RWS)]
+        |
+        (lit("rwf"))[_val = MDSCapSpec(MDSCapSpec::RWF)]
+        |
+        (lit("rw"))[_val = MDSCapSpec(MDSCapSpec::RW)]
+        |
+        (lit("r"))[_val = MDSCapSpec(MDSCapSpec::READ)]
+        );
+
+    // grant := "allow" capspec [match] ["network" <network_str>]
+    grant = lit("allow") >> (capspec >> match >>
+			     -(spaces >> lit("network") >> spaces >> network_str))
+      [_val = phoenix::construct<MDSCapGrant>(_1, _2, _3)];
+    // Grants are separated by ';' or ',', optionally surrounded by spaces.
+    grants %= (grant % (*lit(' ') >> (lit(';') | lit(',')) >> *lit(' ')));
+    mdscaps = grants  [_val = phoenix::construct<MDSAuthCaps>(_1)]; 
+  }
+  // Rules, grouped by the attribute type each one synthesizes.
+  qi::rule<Iterator> spaces;
+  qi::rule<Iterator, string()> quoted_path, unquoted_path, network_str;
+  qi::rule<Iterator, string()> fs_name_str, fs_name, path;
+  qi::rule<Iterator, bool()> root_squash;
+  qi::rule<Iterator, MDSCapSpec()> capspec;
+  qi::rule<Iterator, uint32_t()> uid;
+  qi::rule<Iterator, std::vector<uint32_t>() > uintlist;
+  qi::rule<Iterator, std::vector<uint32_t>() > gidlist;
+  qi::rule<Iterator, MDSCapMatch()> match;
+  qi::rule<Iterator, MDSCapGrant()> grant;
+  qi::rule<Iterator, std::vector<MDSCapGrant>()> grants;
+  qi::rule<Iterator, MDSAuthCaps()> mdscaps;
+};
+
+// Canonicalize `path` in place.  Currently this only strips every leading
+// '/' so the stored path is relative; a path made up solely of slashes
+// becomes empty.
+void MDSCapMatch::normalize_path()
+{
+  // drop any leading /
+  // A single erase replaces the previous one-character-at-a-time substr()
+  // loop.  find_first_not_of() yields npos when the string is empty or all
+  // slashes, and erase(0, npos) then clears the whole string.
+  path.erase(0, path.find_first_not_of('/'));
+
+  // Not implemented yet:
+  // drop dup //
+  // drop .
+  // drop ..
+}
+
+// Decide whether this match clause applies to the given caller and path.
+//
+// When the clause names a uid, the caller's uid must equal it and, if the
+// clause also lists gids, the caller's primary gid or one of the gids in
+// caller_gid_list (may be null) must be in that list.  Independently of the
+// uid, target_path must satisfy match_path().
+bool MDSCapMatch::match(std::string_view target_path,
+			const int caller_uid,
+			const int caller_gid,
+			const vector<uint64_t> *caller_gid_list) const
+{
+  const bool uid_restricted = (uid != MDS_AUTH_UID_ANY);
+
+  if (uid_restricted && uid != caller_uid) {
+    return false;
+  }
+
+  if (uid_restricted && !gids.empty()) {
+    // True when `candidate` appears in this clause's gid list.
+    const auto listed = [this](auto candidate) {
+      return std::find(gids.begin(), gids.end(), candidate) != gids.end();
+    };
+
+    bool group_ok = listed(caller_gid);
+    if (!group_ok && caller_gid_list) {
+      group_ok = std::any_of(caller_gid_list->begin(),
+			     caller_gid_list->end(), listed);
+    }
+    if (!group_ok) {
+      return false;
+    }
+  }
+
+  return match_path(target_path);
+}
+
+// Return true when target_path lies at or below this clause's path.
+//
+// An empty clause path matches everything.  Otherwise target_path must start
+// with the clause path and the prefix must end on a path-component boundary.
+bool MDSCapMatch::match_path(std::string_view target_path) const
+{
+  if (path.empty()) {
+    return true;
+  }
+
+  // Compare only the leading path.length() characters.  The previous
+  // find(path) != 0 test gave the same answer but searched the whole target
+  // for the prefix before rejecting a mismatch.
+  if (target_path.compare(0, path.length(), std::string_view(path)) != 0) {
+    return false;
+  }
+
+  // if path doesn't already have a trailing /, make sure the target
+  // does so that path=/foo doesn't match target_path=/food
+  if (target_path.length() > path.length() &&
+      path.back() != '/' &&
+      target_path[path.length()] != '/') {
+    return false;
+  }
+
+  return true;
+}
+
+// Parse the textual `network` restriction of this grant into network_parsed
+// and network_prefix, and record in network_valid whether parsing succeeded.
+void MDSCapGrant::parse_network()
+{
+  const char *spec = network.c_str();
+  network_valid = ::parse_network(spec, &network_parsed, &network_prefix);
+}
+
+/**
+ * Report whether at least one grant's path restriction covers inode_path.
+ *
+ * This is only a necessary condition for access: the uid, gid and mode
+ * checks are left to the full is_capable().
+ */
+bool MDSAuthCaps::path_capable(std::string_view inode_path) const
+{
+  return std::any_of(grants.begin(), grants.end(),
+		     [inode_path](const auto &candidate) {
+		       return candidate.match.match_path(inode_path);
+		     });
+}
+
+/**
+ * For a given filesystem path, query whether this capability carries
+ * authorization to read or write.
+ *
+ * This is true if any of the 'grant' clauses in the capability match the
+ * requested path + op.
+ *
+ * Grants are examined in order and the first one that authorizes the
+ * request wins; a grant that fails any check is skipped, not treated as a
+ * denial.
+ *
+ * @param inode_path       path of the inode being accessed
+ * @param inode_uid        owner uid of the inode
+ * @param inode_gid        owner gid of the inode
+ * @param inode_mode       unix mode bits of the inode
+ * @param caller_uid       uid of the requesting client
+ * @param caller_gid       primary gid of the requesting client
+ * @param caller_gid_list  additional gids of the client; may be null
+ * @param mask             bitwise OR of the MAY_* flags being requested
+ * @param new_uid          target uid, consulted only for MAY_CHOWN
+ * @param new_gid          target gid, consulted only for MAY_CHGRP
+ * @param addr             client address, checked against network-restricted grants
+ * @return true if some grant authorizes the request, false otherwise
+ */
+bool MDSAuthCaps::is_capable(std::string_view inode_path,
+			     uid_t inode_uid, gid_t inode_gid,
+			     unsigned inode_mode,
+			     uid_t caller_uid, gid_t caller_gid,
+			     const vector<uint64_t> *caller_gid_list,
+			     unsigned mask,
+			     uid_t new_uid, gid_t new_gid,
+			     const entity_addr_t& addr) const
+{
+  // Debug logging is only possible once a CephContext has been recorded.
+  if (cct)
+    ldout(cct, 10) << __func__ << " inode(path /" << inode_path
+		   << " owner " << inode_uid << ":" << inode_gid
+		   << " mode 0" << std::oct << inode_mode << std::dec
+		   << ") by caller " << caller_uid << ":" << caller_gid
+// << "[" << caller_gid_list << "]";
+		   << " mask " << mask
+		   << " new " << new_uid << ":" << new_gid
+		   << " cap: " << *this << dendl;
+
+  for (const auto& grant : grants) {
+    // A network-restricted grant is skipped when its network string failed
+    // to parse or does not contain the client's address.
+    if (grant.network.size() &&
+	(!grant.network_valid ||
+	 !network_contains(grant.network_parsed,
+			   grant.network_prefix,
+			   addr))) {
+      continue;
+    }
+
+    // The grant must match caller + path, and its spec must allow the
+    // requested read/execute and write bits.
+    if (grant.match.match(inode_path, caller_uid, caller_gid, caller_gid_list) &&
+	grant.spec.allows(mask & (MAY_READ|MAY_EXECUTE), mask & MAY_WRITE)) {
+      // root_squash: a caller with uid 0 or gid 0 gets no write access from
+      // this grant.
+      if (grant.match.root_squash && ((caller_uid == 0) || (caller_gid == 0)) &&
+          (mask & MAY_WRITE)) {
+	    continue;
+      }
+      // we have a match; narrow down GIDs to those specifically allowed here
+      vector<uint64_t> gids;
+      if (std::find(grant.match.gids.begin(), grant.match.gids.end(), caller_gid) !=
+	  grant.match.gids.end()) {
+	gids.push_back(caller_gid);
+      }
+      if (caller_gid_list) {
+	// std::set_intersection requires both ranges to be sorted.  parse()
+	// sorts grant.match.gids; assumes caller_gid_list arrives sorted --
+	// TODO confirm against callers.
+	std::set_intersection(grant.match.gids.begin(), grant.match.gids.end(),
+			      caller_gid_list->begin(), caller_gid_list->end(),
+			      std::back_inserter(gids));
+	std::sort(gids.begin(), gids.end());
+      }
+      
+
+      // Spec is non-allowing if caller asked for set pool but spec forbids it
+      if (mask & MAY_SET_VXATTR) {
+        if (!grant.spec.allow_set_vxattr()) {
+          continue;
+        }
+      }
+
+      if (mask & MAY_SNAPSHOT) {
+        if (!grant.spec.allow_snapshot()) {
+          continue;
+        }
+      }
+
+      if (mask & MAY_FULL) {
+        if (!grant.spec.allow_full()) {
+          continue;
+        }
+      }
+
+      // check unix permissions?
+      // A grant that is not tied to a uid authorizes the request without
+      // consulting the inode's owner or mode bits.
+      if (grant.match.uid == MDSCapMatch::MDS_AUTH_UID_ANY) {
+        return true;
+      }
+
+      // chown/chgrp
+      if (mask & MAY_CHOWN) {
+	if (new_uid != caller_uid ||   // you can't chown to someone else
+	    inode_uid != caller_uid) { // you can't chown from someone else
+	  continue;
+	}
+      }
+      if (mask & MAY_CHGRP) {
+	// you can only chgrp *to* one of your groups... if you own the file.
+	if (inode_uid != caller_uid ||
+	    std::find(gids.begin(), gids.end(), new_gid) ==
+	    gids.end()) {
+	  continue;
+	}
+      }
+
+      // Classic owner / group / other check: select the permission class that
+      // applies to the caller, then require every requested bit in that class.
+      if (inode_uid == caller_uid) {
+        if ((!(mask & MAY_READ) || (inode_mode & S_IRUSR)) &&
+	    (!(mask & MAY_WRITE) || (inode_mode & S_IWUSR)) &&
+	    (!(mask & MAY_EXECUTE) || (inode_mode & S_IXUSR))) {
+          return true;
+        }
+      } else if (std::find(gids.begin(), gids.end(),
+			   inode_gid) != gids.end()) {
+        if ((!(mask & MAY_READ) || (inode_mode & S_IRGRP)) &&
+	    (!(mask & MAY_WRITE) || (inode_mode & S_IWGRP)) &&
+	    (!(mask & MAY_EXECUTE) || (inode_mode & S_IXGRP))) {
+          return true;
+        }
+      } else {
+        if ((!(mask & MAY_READ) || (inode_mode & S_IROTH)) &&
+	    (!(mask & MAY_WRITE) || (inode_mode & S_IWOTH)) &&
+	    (!(mask & MAY_EXECUTE) || (inode_mode & S_IXOTH))) {
+          return true;
+        }
+      }
+    }
+  }
+
+  // No grant authorized the request.
+  return false;
+}
+
+// Replace every existing grant with a single one that carries the ALL spec,
+// a default-constructed match clause and no network restriction.
+void MDSAuthCaps::set_allow_all()
+{
+  grants.clear();
+
+  const MDSCapSpec everything(MDSCapSpec::ALL);
+  const MDSCapMatch anything;
+  grants.push_back(MDSCapGrant(everything, anything, {}));
+}
+
+// Parse a capability string into this object.
+//
+// @param c    context recorded in `cct`; is_capable() uses it for debug logging
+// @param str  capability text, e.g. "allow rw path=/foo"
+// @param err  optional stream that receives a message when parsing fails
+// @return true on success; on failure all grants are cleared and false is
+//         returned
+bool MDSAuthCaps::parse(CephContext *c, std::string_view str, ostream *err)
+{
+  // Special case for legacy caps
+  if (str == "allow") {
+    grants.clear();
+    grants.push_back(MDSCapGrant(MDSCapSpec(MDSCapSpec::RWPS), MDSCapMatch(),
+				 {}));
+    // Record the context here as well; previously this early return left
+    // `cct` untouched, unlike every other path through parse().
+    cct = c;
+    return true;
+  }
+
+  auto iter = str.begin();
+  auto end = str.end();
+  MDSCapParser<decltype(iter)> g;
+
+  bool r = qi::phrase_parse(iter, end, g, ascii::space, *this);
+  cct = c;  // set after parser self-assignment
+  // Success requires both a match and that the whole input was consumed.
+  if (r && iter == end) {
+    for (auto& grant : grants) {
+      // Keep the gid list sorted, as the set_intersection in is_capable() needs.
+      std::sort(grant.match.gids.begin(), grant.match.gids.end());
+      grant.parse_network();
+    }
+    return true;
+  } else {
+    // Make sure no grants are kept after parsing failed!
+    grants.clear();
+
+    if (err)
+      *err << "mds capability parse failed, stopped at '"
+	   << std::string(iter, end)
+           << "' of '" << str << "'";
+    return false; 
+  }
+}
+
+
+// True when some grant both matches everything and has an allow-all spec.
+bool MDSAuthCaps::allow_all() const
+{
+  return std::any_of(grants.begin(), grants.end(),
+		     [](const auto &candidate) {
+		       return candidate.match.is_match_all() &&
+			      candidate.spec.allow_all();
+		     });
+}
+
+
+// Print a match clause as the sequence of " key=value" tokens that are set:
+// fsname, path (re-prefixed with '/'), root_squash, then uid and its
+// comma-separated gids.  Nothing is printed for unset fields.
+ostream &operator<<(ostream &out, const MDSCapMatch &match)
+{
+  if (!match.fs_name.empty()) {
+    out << " fsname=" << match.fs_name;
+  }
+  if (!match.path.empty()) {
+    out << " path=\"/" << match.path << "\"";
+  }
+  if (match.root_squash) {
+    out << " root_squash";
+  }
+
+  // uid and gids are only shown for uid-restricted clauses.
+  if (match.uid == MDSCapMatch::MDS_AUTH_UID_ANY) {
+    return out;
+  }
+  out << " uid=" << match.uid;
+
+  if (match.gids.empty()) {
+    return out;
+  }
+  out << " gids=";
+  auto cur = match.gids.begin();
+  out << *cur;
+  for (++cur; cur != match.gids.end(); ++cur) {
+    out << ',';
+    out << *cur;
+  }
+
+  return out;
+}
+
+
+// Print a capability spec: "*" for allow-all, otherwise the letters of the
+// individual permissions it carries, in the fixed order r, w, f, p, s.
+ostream &operator<<(ostream &out, const MDSCapSpec &spec)
+{
+  if (spec.allow_all()) {
+    out << "*";
+    return out;
+  }
+
+  if (spec.allow_read())
+    out << "r";
+  if (spec.allow_write())
+    out << "w";
+  if (spec.allow_full())
+    out << "f";
+  if (spec.allow_set_vxattr())
+    out << "p";
+  if (spec.allow_snapshot())
+    out << "s";
+
+  return out;
+}
+
+
+// Print a grant as "allow <spec><match>", followed by " network <addr>" when
+// the grant has a network restriction.
+ostream &operator<<(ostream &out, const MDSCapGrant &grant)
+{
+  out << "allow " << grant.spec << grant.match;
+  if (!grant.network.empty()) {
+    out << " network " << grant.network;
+  }
+  return out;
+}
+
+
+// Print the capability as "MDSAuthCaps[<grant>, <grant>, ...]".
+ostream &operator<<(ostream &out, const MDSAuthCaps &cap)
+{
+  out << "MDSAuthCaps[";
+
+  bool need_separator = false;
+  for (const auto &entry : cap.grants) {
+    if (need_separator) {
+      out << ", ";
+    }
+    out << entry;
+    need_separator = true;
+  }
+
+  out << "]";
+  return out;
+}
+
diff --git a/src/mds/MDSAuthCaps.h b/src/mds/MDSAuthCaps.h
new file mode 100644
index 000000000..395c921fd
--- /dev/null
+++ b/src/mds/MDSAuthCaps.h
@@ -0,0 +1,238 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2014 Red Hat
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software 
+ * Foundation.  See file COPYING.
+ * 
+ */
+#ifndef MDS_AUTH_CAPS_H
+#define MDS_AUTH_CAPS_H
+
+#include <ostream>
+#include <string>
+#include <string_view>
+#include <vector>
+
+#include "include/common_fwd.h"
+#include "include/types.h"
+#include "common/debug.h"
+
+#include "mdstypes.h"
+
+// unix-style capability bits, combined into access-check masks (bit 3 unused)
+enum {
+  MAY_READ	= (1 << 0),
+  MAY_WRITE 	= (1 << 1),
+  MAY_EXECUTE	= (1 << 2),
+  MAY_CHOWN	= (1 << 4),
+  MAY_CHGRP	= (1 << 5),
+  MAY_SET_VXATTR = (1 << 6),
+  MAY_SNAPSHOT	= (1 << 7),
+  MAY_FULL	= (1 << 8),
+};
+
+// what we can do
+struct MDSCapSpec {
+  // individual permission bits; ALL is the wildcard that implies the rest
+  static const unsigned ALL		= (1 << 0);
+  static const unsigned READ		= (1 << 1);
+  static const unsigned WRITE		= (1 << 2);
+  // setting vxattrs (layout, quota, etc)
+  static const unsigned SET_VXATTR	= (1 << 3);
+  // mksnap/rmsnap
+  static const unsigned SNAPSHOT	= (1 << 4);
+  // bypassing the osd full check
+  static const unsigned FULL		= (1 << 5);
+
+  // combinations: R(ead), W(rite), F(ull), P (set vxattr), S(napshot)
+  static const unsigned RW		= (READ|WRITE);
+  static const unsigned RWF		= (RW|FULL);
+  static const unsigned RWP		= (RW|SET_VXATTR);
+  static const unsigned RWS		= (RW|SNAPSHOT);
+  static const unsigned RWFP		= (RWF|SET_VXATTR);
+  static const unsigned RWFS		= (RWF|SNAPSHOT);
+  static const unsigned RWPS		= (RWP|SNAPSHOT);
+  static const unsigned RWFPS		= (RWFP|SNAPSHOT);
+
+  MDSCapSpec() = default;
+  // ALL implies every individual permission, so fold those bits in
+  MDSCapSpec(unsigned _caps)
+    : caps((_caps & ALL) ? (_caps | RWFPS) : _caps) {}
+
+  // plain bit tests
+  bool allow_all() const {
+    return (caps & ALL) != 0;
+  }
+  bool allow_read() const {
+    return (caps & READ) != 0;
+  }
+  bool allow_write() const {
+    return (caps & WRITE) != 0;
+  }
+  bool allow_snapshot() const {
+    return (caps & SNAPSHOT) != 0;
+  }
+  bool allow_set_vxattr() const {
+    return (caps & SET_VXATTR) != 0;
+  }
+  bool allow_full() const {
+    return (caps & FULL) != 0;
+  }
+
+  // true if every access requested through r / w is permitted
+  bool allows(bool r, bool w) const {
+    if (allow_all())
+      return true;
+    const bool read_ok = !r || allow_read();
+    const bool write_ok = !w || allow_write();
+    return read_ok && write_ok;
+  }
+private:
+  unsigned caps = 0;
+};
+
+// conditions before we are allowed to do it
+struct MDSCapMatch {
+  // sentinel uid meaning "not restricted to a particular uid"
+  static const int64_t MDS_AUTH_UID_ANY = -1;
+
+  // fs_name stays empty unless given; a path, when given, is normalized
+  MDSCapMatch() : uid(MDS_AUTH_UID_ANY) {}
+
+  MDSCapMatch(int64_t uid_, std::vector<gid_t>& gids_)
+    : uid(uid_), gids(gids_) {}
+
+  explicit MDSCapMatch(const std::string &path_)
+    : uid(MDS_AUTH_UID_ANY), path(path_) { normalize_path(); }
+
+  explicit MDSCapMatch(std::string path, std::string fs_name)
+    : uid(MDS_AUTH_UID_ANY), path(std::move(path)),
+      fs_name(std::move(fs_name))
+  {
+    normalize_path();
+  }
+
+  explicit MDSCapMatch(std::string path, std::string fs_name, bool root_squash_)
+    : uid(MDS_AUTH_UID_ANY), path(std::move(path)),
+      fs_name(std::move(fs_name)), root_squash(root_squash_)
+  {
+    normalize_path();
+  }
+
+  MDSCapMatch(const std::string& path_, int64_t uid_, std::vector<gid_t>& gids_)
+    : uid(uid_), gids(gids_), path(path_) { normalize_path(); }
+
+  void normalize_path();
+
+  // true when neither a uid nor a path restriction is set
+  bool is_match_all() const {
+    return path.empty() && uid == MDS_AUTH_UID_ANY;
+  }
+
+  // does this match apply to the given file and caller uid:gid?
+  bool match(std::string_view target_path,
+	     const int caller_uid,
+	     const int caller_gid,
+	     const std::vector<uint64_t> *caller_gid_list) const;
+
+  /**
+   * Path-only pre-check: tells whether the path *might* be accessible;
+   * the actual permission depends on the stronger check in match().
+   *
+   * @param target_path filesystem path without leading '/'
+   */
+  bool match_path(std::string_view target_path) const;
+
+  int64_t uid;       // Require UID to be equal to this, if !=MDS_AUTH_UID_ANY
+  std::vector<gid_t> gids;  // Use these GIDs
+  std::string path;  // Require path to be child of this (may be "" or "/" for any)
+  std::string fs_name;
+  bool root_squash=false;
+};
+
+// One grant: a spec, the conditions under which it applies and,
+// optionally, the network it is limited to.
+struct MDSCapGrant {
+  MDSCapGrant() {}
+  MDSCapGrant(const MDSCapSpec &spec_, const MDSCapMatch &match_,
+	      boost::optional<std::string> n)
+    : spec(spec_), match(match_) {
+    if (!n)
+      return;   // no network restriction given
+    network = *n;
+    parse_network();
+  }
+
+  void parse_network();
+
+  MDSCapSpec spec;
+  MDSCapMatch match;
+  std::string network;   // textual network restriction, empty if none
+  entity_addr_t network_parsed;
+  unsigned network_prefix = 0;
+  bool network_valid = true;
+};
+
+class MDSAuthCaps
+{
+public:
+  MDSAuthCaps() = default;
+  explicit MDSAuthCaps(CephContext *cct_) : cct(cct_) {}
+
+  // used by spirit/phoenix; this ctor doesn't need a cct.
+  explicit MDSAuthCaps(const std::vector<MDSCapGrant>& grants_) : grants(grants_) {}
+
+  // drop all grants
+  void clear() { grants.clear(); }
+
+  void set_allow_all();
+  bool parse(CephContext *cct, std::string_view str, std::ostream *err);
+
+  bool allow_all() const;
+  bool is_capable(std::string_view inode_path,
+		  uid_t inode_uid, gid_t inode_gid, unsigned inode_mode,
+		  uid_t uid, gid_t gid, const std::vector<uint64_t> *caller_gid_list,
+		  unsigned mask, uid_t new_uid, gid_t new_gid,
+		  const entity_addr_t& addr) const;
+  bool path_capable(std::string_view inode_path) const;
+
+  // Is any of the MAY_READ / MAY_WRITE accesses requested in `mask`
+  // granted on the file system `fs_name`?  A grant is considered when
+  // its fs_name equals `fs_name`, is empty or is "*".  Caps that allow
+  // everything are always capable.
+  bool fs_name_capable(std::string_view fs_name, unsigned mask) const {
+    if (allow_all())
+      return true;
+
+    const bool want_read = (mask & MAY_READ) != 0;
+    const bool want_write = (mask & MAY_WRITE) != 0;
+    for (const MDSCapGrant &g : grants) {
+      const std::string &name = g.match.fs_name;
+      if (!(name == fs_name || name.empty() || name == "*"))
+	continue;
+      if ((want_read && g.spec.allow_read()) ||
+	  (want_write && g.spec.allow_write()))
+	return true;
+    }
+
+    return false;
+  }
+
+  friend std::ostream &operator<<(std::ostream &out, const MDSAuthCaps &cap);
+private:
+  CephContext *cct = nullptr;
+  std::vector<MDSCapGrant> grants;
+};
+
+std::ostream &operator<<(std::ostream &out, const MDSCapMatch &match);
+std::ostream &operator<<(std::ostream &out, const MDSCapSpec &spec);
+std::ostream &operator<<(std::ostream &out, const MDSCapGrant &grant);
+std::ostream &operator<<(std::ostream &out, const MDSAuthCaps &cap);
+
+#endif // MDS_AUTH_CAPS_H
diff --git a/src/mds/MDSCacheObject.cc b/src/mds/MDSCacheObject.cc
new file mode 100644
index 000000000..626623a81
--- /dev/null
+++ b/src/mds/MDSCacheObject.cc
@@ -0,0 +1,137 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+// vim: ts=8 sw=2 smarttab
+
+#include "MDSCacheObject.h"
+#include "MDSContext.h"
+#include "common/Formatter.h"
+
+std::string_view MDSCacheObject::generic_pin_name(int p) const {  // name of a generic PIN_* value
+  switch (p) {
+    case PIN_REPLICATED: return "replicated";
+    case PIN_DIRTY: return "dirty";
+    case PIN_LOCK: return "lock";
+    case PIN_REQUEST: return "request";
+    case PIN_WAITER: return "waiter";
+    case PIN_DIRTYSCATTERED: return "dirtyscattered";
+    case PIN_AUTHPIN: return "authpin";
+    case PIN_PTRWAITER: return "ptrwaiter";
+    case PIN_TEMPEXPORTING: return "tempexporting";
+    case PIN_CLIENTLEASE: return "clientlease";
+    case PIN_DISCOVERBASE: return "discoverbase";
+    case PIN_SCRUBQUEUE: return "scrubqueue";
+    default: ceph_abort(); return std::string_view();  // any other value aborts
+  }
+}
+
+void MDSCacheObject::finish_waiting(uint64_t mask, int result) {
+  MDSContext::vec woken;  // filled by take_waiting() for this mask
+  take_waiting(mask, woken);
+  finish_contexts(g_ceph_context, woken, result);
+}
+
+// Dump the state common to all cache objects: authority and replication
+// info, auth pin / freeze status and the reference counts.
+void MDSCacheObject::dump(ceph::Formatter *f) const
+{
+  f->dump_bool("is_auth", is_auth());
+
+  // only meaningful on the auth: replica rank -> nonce
+  f->open_object_section("auth_state");
+  f->open_object_section("replicas");
+  for (const auto &replica : get_replicas()) {
+    CachedStackStringStream css;
+    *css << replica.first;
+    f->dump_int(css->strv(), replica.second);
+  }
+  f->close_section(); // replicas
+  f->close_section(); // auth_state
+
+  // only meaningful on a replica: the authority pair and our nonce
+  f->open_object_section("replica_state");
+  f->open_array_section("authority");
+  f->dump_int("first", authority().first);
+  f->dump_int("second", authority().second);
+  f->close_section(); // authority
+  f->dump_unsigned("replica_nonce", get_replica_nonce());
+  f->close_section(); // replica_state
+
+  f->dump_int("auth_pins", auth_pins);
+  f->dump_bool("is_frozen", is_frozen());
+  f->dump_bool("is_freezing", is_freezing());
+
+#ifdef MDS_REF_SET
+  // per-pin breakdown of the reference count
+  f->open_object_section("pins");
+  for (const auto &pin : ref_map) {
+    f->dump_int(pin_name(pin.first), pin.second);
+  }
+  f->close_section(); // pins
+#endif
+
+  f->dump_int("nref", ref);
+}
+
+// Emit one "state" string per generic state bit that is set.  Subclasses
+// use this when printing their specialized states too.
+void MDSCacheObject::dump_states(ceph::Formatter *f) const
+{
+  const auto emit = [&](unsigned bit, const char *label) {
+    if (state_test(bit)) f->dump_string("state", label);
+  };
+  emit(STATE_AUTH, "auth");
+  emit(STATE_DIRTY, "dirty");
+  emit(STATE_NOTIFYREF, "notifyref");
+  emit(STATE_REJOINING, "rejoining");
+  emit(STATE_REJOINUNDEF, "rejoinundef");
+}
+
+bool MDSCacheObject::is_waiter_for(uint64_t mask, uint64_t min) {  // any waiter whose tag shares a bit with mask?
+  if (!min) {  // no lower bound given: derive one from mask
+    min = mask;
+    while (min & (min-1))  // while more than one bit is set
+      min &= min-1;        //  clear the lowest set bit (ends with mask's highest bit)
+  }
+  for (auto p = waiting.lower_bound(min); p != waiting.end(); ++p) {  // NOTE(review): tags below min are never visited - confirm intended for multi-bit masks
+    if (p->first & mask) return true;
+    if (p->first > mask) return false;  // tags are sorted; stop once past mask
+  }
+  return false;
+}
+
+void MDSCacheObject::take_waiting(uint64_t mask, MDSContext::vec& ls) {  // move waiters whose tag intersects mask into ls
+  if (waiting.empty()) return;
+
+  // ordered waiters (seq > 0) are keyed by seq here so that they are appended in the order they were added.
+  std::map<uint64_t, MDSContext*> ordered_waiters;
+
+  for (auto it = waiting.begin(); it != waiting.end(); ) {
+    if (it->first & mask) {
+        if (it->second.first > 0) {  // non-zero seq, as add_waiter() assigns for WAIT_ORDERED
+          ordered_waiters.insert(it->second);
+        } else {  // unordered: goes to ls right away, ahead of all ordered ones
+          ls.push_back(it->second.second);
+        }
+//      pdout(10,g_conf()->debug_mds) << (mdsco_db_line_prefix(this))
+//                                 << "take_waiting mask " << hex << mask << dec << " took " << it->second
+//                                 << " tag " << hex << it->first << dec
+//                                 << " on " << *this
+//                                 << dendl;
+        waiting.erase(it++);  // step the iterator before its element is erased
+    } else {
+//      pdout(10,g_conf()->debug_mds) << "take_waiting mask " << hex << mask << dec << " SKIPPING " << it->second
+//                                 << " tag " << hex << it->first << dec
+//                                 << " on " << *this 
+//                                 << dendl;
+        ++it;
+    }
+  }
+  for (auto it = ordered_waiters.begin(); it != ordered_waiters.end(); ++it) {  // ascending seq
+    ls.push_back(it->second);
+  }
+  if (waiting.empty()) {  // no waiters left: drop the PIN_WAITER taken in add_waiter()
+    put(PIN_WAITER);
+    waiting.clear();
+  }
+}
+
+uint64_t MDSCacheObject::last_wait_seq = 0;
diff --git a/src/mds/MDSCacheObject.h b/src/mds/MDSCacheObject.h
new file mode 100644
index 000000000..53d33460b
--- /dev/null
+++ b/src/mds/MDSCacheObject.h
@@ -0,0 +1,342 @@
+#ifndef CEPH_MDSCACHEOBJECT_H
+#define CEPH_MDSCACHEOBJECT_H
+
+#include <ostream>
+#include <string_view>
+
+#include "common/config.h"
+
+#include "include/Context.h"
+#include "include/ceph_assert.h"
+#include "include/mempool.h"
+#include "include/types.h"
+#include "include/xlist.h"
+
+#include "mdstypes.h"
+#include "MDSContext.h"
+#include "include/elist.h"
+
+#define MDS_REF_SET      // define me for improved debug output, sanity checking
+//#define MDS_AUTHPIN_SET  // define me for debugging auth pin leaks
+//#define MDS_VERIFY_FRAGSTAT    // do (slow) sanity checking on frags
+
+/*
+ * for metadata leases to clients
+ */
+class MLock;
+class SimpleLock;
+class MDSCacheObject;
+class MDSContext;
+
+namespace ceph {
+class Formatter;
+}
+
+struct ClientLease {
+  ClientLease(client_t c, MDSCacheObject *p) :
+    client(c), parent(p),
+    item_session_lease(this),
+    item_lease(this) { }
+  ClientLease() = delete;
+
+  client_t client;
+  MDSCacheObject *parent;
+
+  ceph_seq_t seq = 0;
+  utime_t ttl;
+  xlist<ClientLease*>::item item_session_lease; // per-session list
+  xlist<ClientLease*>::item item_lease;         // global list
+};
+
+// print hack
+struct mdsco_db_line_prefix {
+  explicit mdsco_db_line_prefix(MDSCacheObject *o) : object(o) {}
+  MDSCacheObject *object;
+};
+
+class MDSCacheObject {
+ public:
+  typedef mempool::mds_co::compact_map<mds_rank_t,unsigned> replica_map_type;
+
+  struct ptr_lt {
+    bool operator()(const MDSCacheObject* l, const MDSCacheObject* r) const {
+      return l->is_lt(r);
+    }
+  };
+
+  // -- pins --
+  const static int PIN_REPLICATED =  1000;
+  const static int PIN_DIRTY      =  1001;
+  const static int PIN_LOCK       = -1002;
+  const static int PIN_REQUEST    = -1003;
+  const static int PIN_WAITER     =  1004;
+  const static int PIN_DIRTYSCATTERED = -1005;
+  static const int PIN_AUTHPIN    =  1006;
+  static const int PIN_PTRWAITER  = -1007;
+  const static int PIN_TEMPEXPORTING = 1008;  // temp pin between encode_ and finish_export
+  static const int PIN_CLIENTLEASE = 1009;
+  static const int PIN_DISCOVERBASE = 1010;
+  static const int PIN_SCRUBQUEUE = 1011;     // for scrub of inode and dir
+
+  // -- state --
+  const static int STATE_AUTH      = (1<<30);
+  const static int STATE_DIRTY     = (1<<29);
+  const static int STATE_NOTIFYREF = (1<<28); // notify dropping ref drop through _put()
+  const static int STATE_REJOINING = (1<<27);  // replica has not joined w/ primary copy
+  const static int STATE_REJOINUNDEF = (1<<26);  // contents undefined.
+
+  // -- wait --
+  const static uint64_t WAIT_ORDERED	 = (1ull<<61);
+  const static uint64_t WAIT_SINGLEAUTH  = (1ull<<60);
+  const static uint64_t WAIT_UNFREEZE    = (1ull<<59); // pka AUTHPINNABLE
+
+  elist<MDSCacheObject*>::item item_scrub;   // for scrub inode or dir
+
+  MDSCacheObject() {}
+  virtual ~MDSCacheObject() {}
+
+  std::string_view generic_pin_name(int p) const;
+
+  // printing
+  virtual void print(std::ostream& out) = 0;
+  virtual std::ostream& print_db_line_prefix(std::ostream& out) { 
+    return out << "mdscacheobject(" << this << ") "; 
+  }
+
+  unsigned get_state() const { return state; }
+  unsigned state_test(unsigned mask) const { return (state & mask); }
+  void state_clear(unsigned mask) { state &= ~mask; }
+  void state_set(unsigned mask) { state |= mask; }
+  void state_reset(unsigned s) { state = s; }
+
+  bool is_auth() const { return state_test(STATE_AUTH); }
+  bool is_dirty() const { return state_test(STATE_DIRTY); }
+  bool is_clean() const { return !is_dirty(); }
+  bool is_rejoining() const { return state_test(STATE_REJOINING); }
+
+  // --------------------------------------------
+  // authority
+  virtual mds_authority_t authority() const = 0;
+  virtual bool is_ambiguous_auth() const {
+    return authority().second != CDIR_AUTH_UNKNOWN;
+  }
+
+  int get_num_ref(int by = -1) const {
+#ifdef MDS_REF_SET
+    // A non-negative `by` selects that pin type's own count (0 when it
+    // has no entry); any negative value, such as the default, falls
+    // through to the total.
+    if (by >= 0) {
+      const auto it = ref_map.find(by);
+      return it == ref_map.end() ? 0 : it->second;
+    }
+#endif
+    return ref;
+  }
+  virtual std::string_view pin_name(int by) const = 0;
+  //bool is_pinned_by(int by) { return ref_set.count(by); }
+  //multiset<int>& get_ref_set() { return ref_set; }
+
+  virtual void last_put() {}
+  virtual void bad_put(int by) {
+#ifdef MDS_REF_SET
+    ceph_assert(ref_map[by] > 0);
+#endif
+    ceph_assert(ref > 0);
+  }
+  virtual void _put() {}
+  void put(int by) {
+#ifdef MDS_REF_SET
+    if (ref == 0 || ref_map[by] == 0) {
+#else
+    if (ref == 0) {
+#endif
+      bad_put(by);
+    } else {
+      ref--;
+#ifdef MDS_REF_SET
+      ref_map[by]--;
+#endif
+      if (ref == 0)
+	last_put();
+      if (state_test(STATE_NOTIFYREF))
+	_put();
+    }
+  }
+
+  virtual void first_get() {}
+  virtual void bad_get(int by) {
+#ifdef MDS_REF_SET
+    ceph_assert(by < 0 || ref_map[by] == 0);
+#endif
+    ceph_abort();
+  }
+  void get(int by) {
+    // going from zero to one reference triggers the first_get() hook
+    if (ref == 0)
+      first_get();
+    ref++;
+#ifdef MDS_REF_SET
+    // operator[] value-initializes a missing counter to 0 before the bump
+    ref_map[by]++;
+#endif
+  }
+
+  void print_pin_set(std::ostream& out) const {
+#ifdef MDS_REF_SET
+    for(auto const &p : ref_map) {
+      out << " " << pin_name(p.first) << "=" << p.second;
+    }
+#else
+    out << " nref=" << ref;
+#endif
+  }
+
+  int get_num_auth_pins() const { return auth_pins; }
+#ifdef MDS_AUTHPIN_SET
+  void print_authpin_set(std::ostream& out) const {
+    out << " (" << auth_pin_set << ")";
+  }
+#endif
+
+  void dump_states(ceph::Formatter *f) const;
+  void dump(ceph::Formatter *f) const;
+
+  // auth pins
+  enum {
+    // can_auth_pin() error codes
+    ERR_NOT_AUTH = 1,
+    ERR_EXPORTING_TREE,
+    ERR_FRAGMENTING_DIR,
+    ERR_EXPORTING_INODE,
+  };
+  virtual bool can_auth_pin(int *err_code=nullptr) const = 0;
+  virtual void auth_pin(void *who) = 0;
+  virtual void auth_unpin(void *who) = 0;
+  virtual bool is_frozen() const = 0;
+  virtual bool is_freezing() const = 0;
+  virtual bool is_freezing_or_frozen() const {
+    return is_frozen() || is_freezing();
+  }
+
+  bool is_replicated() const { return !get_replicas().empty(); }
+  bool is_replica(mds_rank_t mds) const { return get_replicas().count(mds); }
+  int num_replicas() const { return get_replicas().size(); }
+  unsigned add_replica(mds_rank_t mds) {
+    auto &replicas = get_replicas();
+    if (replicas.count(mds))
+      return ++replicas[mds];  // known replica: bump its nonce
+    if (replicas.empty()) get(PIN_REPLICATED);  // first replica takes the pin
+    return replicas[mds] = 1;
+  }
+  void add_replica(mds_rank_t mds, unsigned nonce) {
+    if (get_replicas().empty())
+      get(PIN_REPLICATED);
+    get_replicas()[mds] = nonce;
+  }
+  unsigned get_replica_nonce(mds_rank_t mds) {
+    ceph_assert(get_replicas().count(mds));
+    return get_replicas()[mds];
+  }
+  void remove_replica(mds_rank_t mds) {
+    ceph_assert(get_replicas().count(mds));
+    get_replicas().erase(mds);
+    if (get_replicas().empty()) {
+      put(PIN_REPLICATED);
+    }
+  }
+  void clear_replica_map() {
+    if (!get_replicas().empty())
+      put(PIN_REPLICATED);
+    replica_map.clear();
+  }
+  replica_map_type& get_replicas() { return replica_map; }
+  const replica_map_type& get_replicas() const { return replica_map; }
+  void list_replicas(std::set<mds_rank_t>& ls) const {
+    for (const auto &p : get_replicas()) {
+      ls.insert(p.first);
+    }
+  }
+
+  unsigned get_replica_nonce() const { return replica_nonce; }
+  void set_replica_nonce(unsigned n) { replica_nonce = n; }
+
+  bool is_waiter_for(uint64_t mask, uint64_t min=0);
+  virtual void add_waiter(uint64_t mask, MDSContext *c) {
+    if (waiting.empty())
+      get(PIN_WAITER);
+
+    uint64_t seq = 0;
+    if (mask & WAIT_ORDERED) {
+      seq = ++last_wait_seq;
+      mask &= ~WAIT_ORDERED;
+    }
+    waiting.insert(std::pair<uint64_t, std::pair<uint64_t, MDSContext*> >(
+			    mask,
+			    std::pair<uint64_t, MDSContext*>(seq, c)));
+//    pdout(10,g_conf()->debug_mds) << (mdsco_db_line_prefix(this)) 
+//			       << "add_waiter " << hex << mask << dec << " " << c
+//			       << " on " << *this
+//			       << dendl;
+    
+  }
+  virtual void take_waiting(uint64_t mask, MDSContext::vec& ls);
+  void finish_waiting(uint64_t mask, int result = 0);
+
+  // ---------------------------------------------
+  // locking
+  // noop unless overloaded.
+  virtual SimpleLock* get_lock(int type) { ceph_abort(); return 0; }
+  virtual void set_object_info(MDSCacheObjectInfo &info) { ceph_abort(); }
+  virtual void encode_lock_state(int type, ceph::buffer::list& bl) { ceph_abort(); }
+  virtual void decode_lock_state(int type, const ceph::buffer::list& bl) { ceph_abort(); }
+  virtual void finish_lock_waiters(int type, uint64_t mask, int r=0) { ceph_abort(); }
+  virtual void add_lock_waiter(int type, uint64_t mask, MDSContext *c) { ceph_abort(); }
+  virtual bool is_lock_waiting(int type, uint64_t mask) { ceph_abort(); return false; }
+
+  virtual void clear_dirty_scattered(int type) { ceph_abort(); }
+
+  // ---------------------------------------------
+  // ordering
+  virtual bool is_lt(const MDSCacheObject *r) const = 0;
+
+  // state
+ protected:
+  __u32 state = 0;     // state bits
+
+  // pins
+  __s32      ref = 0;       // reference count
+#ifdef MDS_REF_SET
+  mempool::mds_co::flat_map<int,int> ref_map;
+#endif
+
+  int auth_pins = 0;
+#ifdef MDS_AUTHPIN_SET
+  mempool::mds_co::multiset<void*> auth_pin_set;
+#endif
+
+  // replication (across mds cluster)
+  unsigned replica_nonce = 0; // [replica] defined on replica
+    replica_map_type replica_map;   // [auth] mds -> nonce
+
+  // ---------------------------------------------
+  // waiting
+ private:
+  mempool::mds_co::compact_multimap<uint64_t, std::pair<uint64_t, MDSContext*>> waiting;
+  static uint64_t last_wait_seq;
+};
+
+std::ostream& operator<<(std::ostream& out, const mdsco_db_line_prefix& o);
+// printer
+std::ostream& operator<<(std::ostream& out, const MDSCacheObject &o);
+
+inline std::ostream& operator<<(std::ostream& out, MDSCacheObject &o) {
+  o.print(out);
+  return out;
+}
+
+inline std::ostream& operator<<(std::ostream& out, const mdsco_db_line_prefix& o) {
+  o.object->print_db_line_prefix(out);
+  return out;
+}
+#endif
diff --git a/src/mds/MDSContext.cc b/src/mds/MDSContext.cc
new file mode 100644
index 000000000..210c836b1
--- /dev/null
+++ b/src/mds/MDSContext.cc
@@ -0,0 +1,145 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2012 Red Hat
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software 
+ * Foundation.  See file COPYING.
+ * 
+ */
+
+
+#include "MDSRank.h"
+
+#include "MDSContext.h"
+
+#include "common/dout.h"
+#define dout_context g_ceph_context
+#define dout_subsys ceph_subsys_mds
+
+void MDSContext::complete(int r) {
+  MDSRank *mds = get_mds();
+  ceph_assert(mds != nullptr);
+  ceph_assert(ceph_mutex_is_locked_by_me(mds->mds_lock));  // caller must already hold mds_lock
+  dout(10) << "MDSContext::complete: " << typeid(*this).name() << dendl;
+  mds->heartbeat_reset();  // reset the heartbeat before running the completion
+  return Context::complete(r);
+}
+
+void MDSInternalContextWrapper::finish(int r)
+{
+  fin->complete(r);
+}
+
+struct MDSIOContextList {
+  elist<MDSIOContextBase*> list;
+  ceph::spinlock lock;
+  MDSIOContextList() : list(member_offset(MDSIOContextBase, list_item)) {}
+  ~MDSIOContextList() {
+    list.clear(); // avoid assertion in elist's destructor
+  }
+} ioctx_list;
+
+MDSIOContextBase::MDSIOContextBase(bool track)
+{
+  created_at = ceph::coarse_mono_clock::now();
+  if (track) {
+    // RAII guard: hold the registry spinlock only while linking ourselves in
+    std::lock_guard l(ioctx_list.lock);
+    ioctx_list.list.push_back(&list_item);
+  }
+}
+
+MDSIOContextBase::~MDSIOContextBase()
+{
+  // RAII guard: unlink from the registry under its spinlock
+  std::lock_guard l(ioctx_list.lock);
+  list_item.remove_myself();
+}
+
+// Count tracked IO contexts created before `cutoff` (capped at MAX_COUNT+1).
+// Returns true and fills slow_count/oldest when at least one is found.
+bool MDSIOContextBase::check_ios_in_flight(ceph::coarse_mono_time cutoff,
+					   std::string& slow_count,
+					   ceph::coarse_mono_time& oldest)
+{
+  static const unsigned MAX_COUNT = 100;
+  unsigned slow = 0;
+  {
+    std::lock_guard l(ioctx_list.lock);  // spinlock covers the list walk only
+    for (elist<MDSIOContextBase*>::iterator p = ioctx_list.list.begin(); !p.end(); ++p) {
+      MDSIOContextBase *c = *p;
+      if (c->created_at >= cutoff)
+        break;
+      ++slow;
+      if (slow > MAX_COUNT)
+        break;
+      if (slow == 1)
+        oldest = c->created_at;
+    }
+  }
+  if (slow == 0)
+    return false;
+
+  if (slow > MAX_COUNT)
+    slow_count = std::to_string(MAX_COUNT) + "+";
+  else
+    slow_count = std::to_string(slow);
+  return true;
+}
+
+void MDSIOContextBase::complete(int r) {
+  MDSRank *mds = get_mds();
+
+  dout(10) << "MDSIOContextBase::complete: " << typeid(*this).name() << dendl;
+  ceph_assert(mds != NULL);
+  // Note, MDSIOContext is passed outside the MDS and, strangely, we grab the
+  // lock here when MDSContext::complete would otherwise assume the lock is
+  // already acquired.
+  std::lock_guard l(mds->mds_lock);
+
+  if (mds->is_daemon_stopping()) {
+    dout(4) << "MDSIOContextBase::complete: dropping for stopping "
+            << typeid(*this).name() << dendl;
+    return;
+  }
+
+  // An OSD op may get stuck and then time out after "rados_osd_op_timeout"
+  // (or fail because we were blocklisted); the MDS has no sensible way to
+  // recover from that here, so just respawn it.
+  if (r == -CEPHFS_EBLOCKLISTED || r == -CEPHFS_ETIMEDOUT) {
+    derr << "MDSIOContextBase: failed with " << r << ", restarting..." << dendl;
+    mds->respawn();
+  } else {
+    MDSContext::complete(r);
+  }
+}
+
+void MDSLogContextBase::complete(int r) {
+  MDLog *mdlog = get_mds()->mdlog;
+  uint64_t safe_pos = write_pos;
+  pre_finish(r);
+  // MDSIOContextBase::complete() frees this, hence the local copies above
+  MDSIOContextBase::complete(r);
+  // safe_pos must be updated after the MDSIOContextBase::complete() call
+  mdlog->set_safe_pos(safe_pos);
+}
+
+void MDSIOContextWrapper::finish(int r)
+{
+  fin->complete(r);
+}
+
+void C_IO_Wrapper::complete(int r)
+{
+  if (async) {  // async is still set: clear it and queue ourselves on the MDS finisher
+    async = false;
+    get_mds()->finisher->queue(this, r);
+  } else {  // async already cleared: run the normal IO completion
+    MDSIOContext::complete(r);
+  }
+}
diff --git a/src/mds/MDSContext.h b/src/mds/MDSContext.h
new file mode 100644
index 000000000..319af50d3
--- /dev/null
+++ b/src/mds/MDSContext.h
@@ -0,0 +1,212 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2012 Red Hat
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software 
+ * Foundation.  See file COPYING.
+ * 
+ */
+
+
+#ifndef MDS_CONTEXT_H
+#define MDS_CONTEXT_H
+
+#include <vector>
+#include <deque>
+
+#include "include/Context.h"
+#include "include/elist.h"
+#include "include/spinlock.h"
+#include "common/ceph_time.h"
+
+class MDSRank;
+
+/**
+ * Completion which has access to a reference to the global MDS instance.
+ *
+ * This class exists so that Context subclasses can provide the MDS pointer
+ * from a pointer they already had, e.g. MDCache or Locker, rather than
+ * necessarily having to carry around an extra MDS* pointer. 
+ */
+class MDSContext : public Context
+{
+public:
+template<template<typename> class A>
+  using vec_alloc = std::vector<MDSContext*, A<MDSContext*>>;
+  using vec = vec_alloc<std::allocator>;
+
+template<template<typename> class A>
+  using que_alloc = std::deque<MDSContext*, A<MDSContext*>>;
+  using que = que_alloc<std::allocator>;
+
+  void complete(int r) override;
+  virtual MDSRank *get_mds() = 0;
+};
+
+/* Children of this could have used multiple inheritance with MDSHolder and
+ * MDSContext but then get_mds() would be ambiguous.
+ */
+template<class T>
+class MDSHolder : public T
+{
+public:
+  MDSRank* get_mds() override {
+    return mds;
+  }
+
+protected:
+  MDSHolder() = delete;
+  MDSHolder(MDSRank* mds) : mds(mds) {
+    ceph_assert(mds != nullptr);
+  }
+
+  MDSRank* mds;
+};
+
+/**
+ * General purpose, lets you pass in an MDS pointer.
+ */
+class MDSInternalContext : public MDSHolder<MDSContext>
+{
+public:
+  MDSInternalContext() = delete;
+
+protected:
+  explicit MDSInternalContext(MDSRank *mds_) : MDSHolder(mds_) {}
+};
+
+/**
+ * Wrap a regular Context up as an Internal context. Useful
+ * if you're trying to work with one of our more generic frameworks.
+ */
+class MDSInternalContextWrapper : public MDSInternalContext
+{
+protected:
+  Context *fin = nullptr;
+  void finish(int r) override;
+public:
+  MDSInternalContextWrapper(MDSRank *m, Context *c) : MDSInternalContext(m), fin(c) {}
+};
+
+class MDSIOContextBase : public MDSContext
+{
+public:
+  MDSIOContextBase(bool track=true);
+  virtual ~MDSIOContextBase();
+  MDSIOContextBase(const MDSIOContextBase&) = delete;
+  MDSIOContextBase& operator=(const MDSIOContextBase&) = delete;
+
+  void complete(int r) override;
+
+  virtual void print(std::ostream& out) const = 0;
+
+  static bool check_ios_in_flight(ceph::coarse_mono_time cutoff,
+				  std::string& slow_count,
+				  ceph::coarse_mono_time& oldest);
+private:
+  ceph::coarse_mono_time created_at;
+  elist<MDSIOContextBase*>::item list_item;
+  
+  friend struct MDSIOContextList;
+};
+
+/**
+ * Completion for an log operation, takes big MDSRank lock
+ * before executing finish function. Update log's safe pos
+ * after finish function return.
+ */
+class MDSLogContextBase : public MDSIOContextBase
+{
+protected:
+  uint64_t write_pos = 0;
+public:
+  MDSLogContextBase() = default;
+  void complete(int r) final;
+  void set_write_pos(uint64_t wp) { write_pos = wp; }
+  virtual void pre_finish(int r) {}
+  void print(std::ostream& out) const override {
+    out << "log_event(" << write_pos << ")";
+  }
+};
+
+/**
+ * Completion for an I/O operation, takes big MDSRank lock
+ * before executing finish function.
+ */
+class MDSIOContext : public MDSHolder<MDSIOContextBase>
+{
+public:
+  explicit MDSIOContext(MDSRank *mds_) : MDSHolder(mds_) {}
+};
+
+/**
+ * Wrap a regular Context up as an IO Context. Useful
+ * if you're trying to work with one of our more generic frameworks.
+ */
+class MDSIOContextWrapper : public MDSHolder<MDSIOContextBase>
+{
+protected:
+  Context *fin;
+public:
+  MDSIOContextWrapper(MDSRank *m, Context *c) : MDSHolder(m), fin(c) {}
+  void finish(int r) override;
+  void print(std::ostream& out) const override {
+    out << "io_context_wrapper(" << fin << ")";
+  }
+};
+
+/**
+ * No-op for callers expecting MDSInternalContext
+ */
+class C_MDSInternalNoop : public MDSContext
+{
+public:
+  void finish(int r) override {}
+  void complete(int r) override { delete this; }
+protected:
+  MDSRank* get_mds() override final {ceph_abort();}
+};
+
+
+/**
+ * This class is used where you have an MDSInternalContext but
+ * you sometimes want to call it back from an I/O completion.
+ */
+class C_IO_Wrapper : public MDSIOContext
+{
+public:
+  C_IO_Wrapper(MDSRank *mds_, MDSContext *wrapped_) :
+    MDSIOContext(mds_), async(true), wrapped(wrapped_) {
+    ceph_assert(wrapped != nullptr);
+  }
+
+  // a context that finish() never passed on is still ours to free
+  ~C_IO_Wrapper() override {
+    delete wrapped;   // deleting nullptr is a no-op
+    wrapped = nullptr;
+  }
+  void complete(int r) final;
+  void print(std::ostream& out) const override {
+    out << "io_wrapper(" << wrapped << ")";
+  }
+
+protected:
+  void finish(int r) override {
+    wrapped->complete(r);
+    wrapped = nullptr;   // passed on; the destructor must not free it
+  }
+  bool async;
+  MDSContext *wrapped;
+};
+
+using MDSGather = C_GatherBase<MDSContext, C_MDSInternalNoop>;
+using MDSGatherBuilder = C_GatherBuilderBase<MDSContext, MDSGather>;
+
+using MDSContextFactory = ContextFactory<MDSContext>;
+
+#endif  // MDS_CONTEXT_H
diff --git a/src/mds/MDSContinuation.h b/src/mds/MDSContinuation.h
new file mode 100644
index 000000000..e01522522
--- /dev/null
+++ b/src/mds/MDSContinuation.h
@@ -0,0 +1,33 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2014 Red Hat
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+ 
+#include "common/Continuation.h"
+#include "mds/Mutation.h"
+#include "mds/Server.h"
+
+#include "MDSContext.h"
+ 
+class MDSContinuation : public Continuation {
+protected:
+  Server *server;
+
+  // Wrap the callback for `stage` in an MDSInternalContextWrapper bound to
+  // the server's MDS.
+  MDSInternalContext *get_internal_callback(int stage) {
+    return new MDSInternalContextWrapper(
+      server->mds,
+      get_callback(stage));
+  }
+
+  // Wrap the callback for `stage` in an MDSIOContextWrapper bound to the
+  // server's MDS.
+  MDSIOContextBase *get_io_callback(int stage) {
+    return new MDSIOContextWrapper(
+      server->mds,
+      get_callback(stage));
+  }
+
+public:
+  // The base Continuation is constructed with a null completion context.
+  MDSContinuation(Server *srv)
+    : Continuation(nullptr),
+      server(srv)
+  {}
+};
diff --git a/src/mds/MDSDaemon.cc b/src/mds/MDSDaemon.cc
new file mode 100644
index 000000000..4aa4815d1
--- /dev/null
+++ b/src/mds/MDSDaemon.cc
@@ -0,0 +1,1141 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+
+#include <unistd.h>
+
+#include "include/compat.h"
+#include "include/types.h"
+#include "include/str_list.h"
+
+#include "common/Clock.h"
+#include "common/HeartbeatMap.h"
+#include "common/Timer.h"
+#include "common/ceph_argparse.h"
+#include "common/config.h"
+#include "common/entity_name.h"
+#include "common/errno.h"
+#include "common/perf_counters.h"
+#include "common/signal.h"
+#include "common/version.h"
+
+#include "global/signal_handler.h"
+
+#include "msg/Messenger.h"
+#include "mon/MonClient.h"
+
+#include "osdc/Objecter.h"
+
+#include "MDSMap.h"
+
+#include "MDSDaemon.h"
+#include "Server.h"
+#include "Locker.h"
+
+#include "SnapServer.h"
+#include "SnapClient.h"
+
+#include "events/ESession.h"
+#include "events/ESubtreeMap.h"
+
+#include "auth/AuthAuthorizeHandler.h"
+#include "auth/RotatingKeyRing.h"
+#include "auth/KeyRing.h"
+
+#include "perfglue/cpu_profiler.h"
+#include "perfglue/heap_profiler.h"
+
+// Logging setup for this file: log through g_ceph_context under the mds
+// subsystem, and start every line with "mds.<name> " (where `name` is
+// whatever identifier of that name is in scope at the log statement).
+#define dout_context g_ceph_context
+#define dout_subsys ceph_subsys_mds
+#undef dout_prefix
+#define dout_prefix *_dout << "mds." << name << ' '
+// Make cmd_getval usable unqualified in the command handlers below.
+using TOPNSPC::common::cmd_getval;
+// constructor / destructor
+MDSDaemon::MDSDaemon(std::string_view n, Messenger *m, MonClient *mc,
+                     boost::asio::io_context& ioctx)
+  : Dispatcher(m->cct),
+    timer(m->cct, mds_lock),
+    gss_ktfile_client(m->cct->_conf.get_val<std::string>("gss_ktab_client_file")),
+    beacon(m->cct, mc, n),
+    name(n),
+    messenger(m),
+    monc(mc),
+    ioctx(ioctx),
+    mgrc(m->cct, m, &mc->monmap),
+    log_client(m->cct, messenger, &mc->monmap, LogClient::NO_FLAGS),
+    starttime(mono_clock::now())
+{
+  // No command line has been recorded yet.
+  orig_argc = 0;
+  orig_argv = nullptr;
+
+  clog = log_client.create_channel();
+
+  if (!gss_ktfile_client.empty()) {
+    // A client keytab was configured: export it so GSSAPI picks it up, and
+    // insist that the export succeeds.
+    //
+    // The default client keytab, when present and readable, is used to
+    // obtain initial credentials automatically for GSSAPI client
+    // applications; the principal of its first entry is the default.
+    // Its location is looked up in this order:
+    //   1. the KRB5_CLIENT_KTNAME environment variable,
+    //   2. the default_client_keytab_name profile variable in [libdefaults],
+    //   3. the hardcoded default, DEFCKTNAME.
+    const int32_t set_result =
+      setenv("KRB5_CLIENT_KTNAME", gss_ktfile_client.c_str(), 1);
+    ceph_assert(set_result == 0);
+  }
+
+  // Start from an empty map.
+  mdsmap.reset(new MDSMap);
+}
+
+MDSDaemon::~MDSDaemon()
+{
+  // Tear down the rank (if any) while holding mds_lock.
+  std::lock_guard lock(mds_lock);
+
+  delete mds_rank;
+  mds_rank = nullptr;
+}
+
+class MDSSocketHook : public AdminSocketHook {
+  // Daemon that receives every command routed through this hook.
+  MDSDaemon *mds;
+
+public:
+  explicit MDSSocketHook(MDSDaemon *m)
+    : mds(m)
+  {}
+
+  // The synchronous entry point is not supported by this hook; reaching it
+  // aborts. Commands are expected to arrive through call_async().
+  int call(std::string_view command,
+           const cmdmap_t& cmdmap,
+           Formatter *f,
+           std::ostream& errss,
+           ceph::buffer::list& out) override
+  {
+    ceph_abort("should go to call_async");
+  }
+
+  // Pass the command and its completion callback on to the daemon.
+  void call_async(std::string_view command,
+                  const cmdmap_t& cmdmap,
+                  Formatter *f,
+                  const bufferlist& inbl,
+                  std::function<void(int,const std::string&,bufferlist&)> on_finish) override
+  {
+    mds->asok_command(command, cmdmap, f, inbl, on_finish);
+  }
+};
+
+/**
+ * Handle a command delivered through the admin socket hook.
+ *
+ * "status", "exit", "respawn", "heap" and "cpu_profiler" are served here.
+ * Any other command is passed to mds_rank->handle_asok_command() together
+ * with on_finish when a rank exists; in that case this function returns
+ * without calling on_finish itself.
+ *
+ * @param command    command prefix to execute
+ * @param cmdmap     parsed command arguments
+ * @param f          formatter receiving structured output
+ * @param inbl       input payload, forwarded to the rank handler
+ * @param on_finish  completion callback, invoked here as
+ *                   on_finish(r, text accumulated in ss, outbl) for every
+ *                   path that does not delegate to the rank
+ */
+void MDSDaemon::asok_command(
+  std::string_view command,
+  const cmdmap_t& cmdmap,
+  Formatter *f,
+  const bufferlist& inbl,
+  std::function<void(int,const std::string&,bufferlist&)> on_finish)
+{
+  dout(1) << "asok_command: " << command << " " << cmdmap
+	  << " (starting...)" << dendl;
+
+  // Result reported when no branch below sets one explicitly.
+  int r = -CEPHFS_ENOSYS;
+  bufferlist outbl;
+  CachedStackStringStream css;
+  auto& ss = *css;
+  if (command == "status") {
+    dump_status(f);
+    r = 0;
+  } else if (command == "exit") {
+    outbl.append("Exiting...\n");
+    r = 0;
+    std::thread t([this](){
+		    // Wait a little to improve chances of caller getting
+		    // our response before seeing us disappear from mdsmap
+		    sleep(1);
+		    std::lock_guard l(mds_lock);
+		    suicide();
+		  });
+    t.detach();
+  } else if (command == "respawn") {
+    outbl.append("Respawning...\n");
+    r = 0;
+    std::thread t([this](){
+		    // Wait a little to improve chances of caller getting
+		    // our response before seeing us disappear from mdsmap
+		    sleep(1);
+		    std::lock_guard l(mds_lock);
+		    respawn();
+		  });
+    t.detach();
+  } else if (command == "heap") {
+    if (!ceph_using_tcmalloc()) {
+      ss << "not using tcmalloc";
+      r = -CEPHFS_EOPNOTSUPP;
+    } else {
+      string heapcmd;
+      cmd_getval(cmdmap, "heapcmd", heapcmd);
+      vector<string> heapcmd_vec;
+      get_str_vec(heapcmd, heapcmd_vec);
+      // The optional "value" argument is appended as one more word.
+      string value;
+      if (cmd_getval(cmdmap, "value", value)) {
+	heapcmd_vec.push_back(value);
+      }
+      std::stringstream outss;
+      ceph_heap_profiler_handle_command(heapcmd_vec, outss);
+      outbl.append(outss);
+      r = 0;
+    }
+  } else if (command == "cpu_profiler") {
+    string arg;
+    cmd_getval(cmdmap, "arg", arg);
+    vector<string> argvec;
+    get_str_vec(arg, argvec);
+    cpu_profiler_handle_command(argvec, ss);
+    // The command was handled; without this the caller would be told
+    // -CEPHFS_ENOSYS even though the profiler request was carried out.
+    r = 0;
+  } else {
+    if (mds_rank == NULL) {
+      dout(1) << "Can't run that command on an inactive MDS!" << dendl;
+      f->dump_string("error", "mds_not_active");
+    } else {
+      try {
+	// The rank handler takes over on_finish, so return right after.
+	mds_rank->handle_asok_command(command, cmdmap, f, inbl, on_finish);
+	return;
+      } catch (const TOPNSPC::common::bad_cmd_get& e) {
+	ss << e.what();
+	r = -CEPHFS_EINVAL;
+      }
+    }
+  }
+  on_finish(r, ss.str(), outbl);
+}
+
+// Emit a "status" object describing this daemon into the formatter.
+void MDSDaemon::dump_status(Formatter *f)
+{
+  f->open_object_section("status");
+  f->dump_stream("cluster_fsid") << monc->get_fsid();
+
+  // Without a rank, report MDS_RANK_NONE.
+  if (!mds_rank) {
+    f->dump_int("whoami", MDS_RANK_NONE);
+  } else {
+    f->dump_int("whoami", mds_rank->get_nodeid());
+  }
+
+  f->dump_int("id", monc->get_global_id());
+  f->dump_string("want_state", ceph_mds_state_name(beacon.get_want_state()));
+
+  // State recorded for our gid in the current MDSMap.
+  const auto map_state =
+    mdsmap->get_state_gid(mds_gid_t(monc->get_global_id()));
+  f->dump_string("state", ceph_mds_state_name(map_state));
+
+  // Rank-level status is dumped under mds_lock.
+  if (mds_rank) {
+    std::lock_guard l(mds_lock);
+    mds_rank->dump_status(f);
+  }
+
+  f->dump_unsigned("mdsmap_epoch", mdsmap->get_epoch());
+
+  // OSD map epochs come from the rank; report zero when there is none.
+  if (!mds_rank) {
+    f->dump_unsigned("osdmap_epoch", 0);
+    f->dump_unsigned("osdmap_epoch_barrier", 0);
+  } else {
+    f->dump_unsigned("osdmap_epoch", mds_rank->get_osd_epoch());
+    f->dump_unsigned("osdmap_epoch_barrier", mds_rank->get_osd_epoch_barrier());
+  }
+
+  f->dump_float("uptime", get_uptime().count());
+
+  f->close_section(); // status
+}
+
+/**
+ * Create the admin socket hook and register every MDS command on it.
+ *
+ * Must only be called while no hook exists (asserted). Each registration
+ * is required to succeed; every result is checked with ceph_assert so a
+ * failure is never silently overwritten by the next registration.
+ */
+void MDSDaemon::set_up_admin_socket()
+{
+  int r;
+  AdminSocket *admin_socket = g_ceph_context->get_admin_socket();
+  ceph_assert(asok_hook == nullptr);
+  asok_hook = new MDSSocketHook(this);
+  r = admin_socket->register_command("status", asok_hook,
+                                     "high-level status of MDS");
+  ceph_assert(r == 0);
+  r = admin_socket->register_command("dump_ops_in_flight", asok_hook,
+                                     "show the ops currently in flight");
+  ceph_assert(r == 0);
+  r = admin_socket->register_command("ops", asok_hook,
+                                     "show the ops currently in flight");
+  ceph_assert(r == 0);
+  r = admin_socket->register_command("dump_blocked_ops",
+                                     asok_hook,
+                                     "show the blocked ops currently in flight");
+  ceph_assert(r == 0);
+  r = admin_socket->register_command("dump_historic_ops",
+                                     asok_hook,
+                                     "show recent ops");
+  ceph_assert(r == 0);
+  r = admin_socket->register_command("dump_historic_ops_by_duration",
+                                     asok_hook,
+                                     "show recent ops, sorted by op duration");
+  ceph_assert(r == 0);
+  r = admin_socket->register_command("scrub_path name=path,type=CephString "
+                                     "name=scrubops,type=CephChoices,"
+                                     "strings=force|recursive|repair,n=N,req=false "
+                                     "name=tag,type=CephString,req=false",
+                                     asok_hook,
+                                     "scrub an inode and output results");
+  ceph_assert(r == 0);
+  r = admin_socket->register_command("scrub start "
+                                     "name=path,type=CephString "
+                                     "name=scrubops,type=CephChoices,strings=force|recursive|repair,n=N,req=false "
+                                     "name=tag,type=CephString,req=false",
+                                     asok_hook,
+                                     "scrub and inode and output results");
+  ceph_assert(r == 0);
+  r = admin_socket->register_command("scrub abort",
+                                     asok_hook,
+                                     "Abort in progress scrub operations(s)");
+  ceph_assert(r == 0);
+  r = admin_socket->register_command("scrub pause",
+                                     asok_hook,
+                                     "Pause in progress scrub operations(s)");
+  ceph_assert(r == 0);
+  r = admin_socket->register_command("scrub resume",
+                                     asok_hook,
+                                     "Resume paused scrub operations(s)");
+  ceph_assert(r == 0);
+  r = admin_socket->register_command("scrub status",
+                                     asok_hook,
+                                     "Status of scrub operations(s)");
+  ceph_assert(r == 0);
+  r = admin_socket->register_command("tag path name=path,type=CephString"
+                                     " name=tag,type=CephString",
+                                     asok_hook,
+                                     "Apply scrub tag recursively");
+  ceph_assert(r == 0);
+  r = admin_socket->register_command("flush_path name=path,type=CephString",
+                                     asok_hook,
+                                     "flush an inode (and its dirfrags)");
+  ceph_assert(r == 0);
+  r = admin_socket->register_command("export dir "
+                                     "name=path,type=CephString "
+                                     "name=rank,type=CephInt",
+                                     asok_hook,
+                                     "migrate a subtree to named MDS");
+  ceph_assert(r == 0);
+  r = admin_socket->register_command("dump cache name=path,type=CephString,req=false",
+                                     asok_hook,
+                                     "dump metadata cache (optionally to a file)");
+  ceph_assert(r == 0);
+  r = admin_socket->register_command("cache drop "
+                                     "name=timeout,type=CephInt,range=0,req=false",
+                                     asok_hook,
+                                     "trim cache and optionally request client to release all caps and flush the journal");
+  ceph_assert(r == 0);
+  r = admin_socket->register_command("cache status",
+                                     asok_hook,
+                                     "show cache status");
+  ceph_assert(r == 0);
+  r = admin_socket->register_command("dump tree "
+                                     "name=root,type=CephString,req=true "
+                                     "name=depth,type=CephInt,req=false ",
+                                     asok_hook,
+                                     "dump metadata cache for subtree");
+  ceph_assert(r == 0);
+  r = admin_socket->register_command("dump loads",
+                                     asok_hook,
+                                     "dump metadata loads");
+  ceph_assert(r == 0);
+  r = admin_socket->register_command("dump snaps name=server,type=CephChoices,strings=--server,req=false",
+                                     asok_hook,
+                                     "dump snapshots");
+  ceph_assert(r == 0);
+  r = admin_socket->register_command("session ls "
+                                     "name=cap_dump,type=CephBool,req=false "
+                                     "name=filters,type=CephString,n=N,req=false ",
+                                     asok_hook,
+                                     "List client sessions based on a filter");
+  ceph_assert(r == 0);
+  r = admin_socket->register_command("client ls "
+                                     "name=cap_dump,type=CephBool,req=false "
+                                     "name=filters,type=CephString,n=N,req=false ",
+                                     asok_hook,
+                                     "List client sessions based on a filter");
+  ceph_assert(r == 0);
+  r = admin_socket->register_command("session evict name=filters,type=CephString,n=N,req=false",
+                                     asok_hook,
+                                     "Evict client session(s) based on a filter");
+  ceph_assert(r == 0);
+  r = admin_socket->register_command("client evict name=filters,type=CephString,n=N,req=false",
+                                     asok_hook,
+                                     "Evict client session(s) based on a filter");
+  ceph_assert(r == 0);
+  r = admin_socket->register_command("session kill name=client_id,type=CephString",
+                                     asok_hook,
+                                     "Evict a client session by id");
+  ceph_assert(r == 0);
+  r = admin_socket->register_command("session ls name=cap_dump,type=CephBool,req=false",
+                                     asok_hook,
+                                     "Enumerate connected CephFS clients");
+  ceph_assert(r == 0);
+  r = admin_socket->register_command("session config "
+                                     "name=client_id,type=CephInt,req=true "
+                                     "name=option,type=CephString,req=true "
+                                     "name=value,type=CephString,req=false ",
+                                     asok_hook,
+                                     "Config a CephFS client session");
+  ceph_assert(r == 0);
+  r = admin_socket->register_command("client config "
+                                     "name=client_id,type=CephInt,req=true "
+                                     "name=option,type=CephString,req=true "
+                                     "name=value,type=CephString,req=false ",
+                                     asok_hook,
+                                     "Config a CephFS client session");
+  ceph_assert(r == 0);
+  r = admin_socket->register_command("damage ls",
+                                     asok_hook,
+                                     "List detected metadata damage");
+  ceph_assert(r == 0);
+  r = admin_socket->register_command("damage rm "
+                                     "name=damage_id,type=CephInt",
+                                     asok_hook,
+                                     "Remove a damage table entry");
+  ceph_assert(r == 0);
+  r = admin_socket->register_command("osdmap barrier name=target_epoch,type=CephInt",
+                                     asok_hook,
+                                     "Wait until the MDS has this OSD map epoch");
+  ceph_assert(r == 0);
+  r = admin_socket->register_command("flush journal",
+                                     asok_hook,
+                                     "Flush the journal to the backing store");
+  ceph_assert(r == 0);
+  r = admin_socket->register_command("force_readonly",
+                                     asok_hook,
+                                     "Force MDS to read-only mode");
+  ceph_assert(r == 0);
+  r = admin_socket->register_command("get subtrees",
+                                     asok_hook,
+                                     "Return the subtree map");
+  ceph_assert(r == 0);
+  r = admin_socket->register_command("dirfrag split "
+                                     "name=path,type=CephString,req=true "
+                                     "name=frag,type=CephString,req=true "
+                                     "name=bits,type=CephInt,req=true ",
+                                     asok_hook,
+                                     "Fragment directory by path");
+  ceph_assert(r == 0);
+  r = admin_socket->register_command("dirfrag merge "
+                                     "name=path,type=CephString,req=true "
+                                     "name=frag,type=CephString,req=true",
+                                     asok_hook,
+                                     "De-fragment directory by path");
+  ceph_assert(r == 0);
+  r = admin_socket->register_command("dirfrag ls "
+                                     "name=path,type=CephString,req=true",
+                                     asok_hook,
+                                     "List fragments in directory");
+  ceph_assert(r == 0);
+  r = admin_socket->register_command("openfiles ls",
+                                     asok_hook,
+                                     "List the opening files and their caps");
+  ceph_assert(r == 0);
+  r = admin_socket->register_command("dump inode "
+                                     "name=number,type=CephInt,req=true",
+                                     asok_hook,
+                                     "dump inode by inode number");
+  ceph_assert(r == 0);
+  r = admin_socket->register_command("exit",
+                                     asok_hook,
+                                     "Terminate this MDS");
+  // Check this result before the next registration overwrites r.
+  ceph_assert(r == 0);
+  r = admin_socket->register_command("respawn",
+                                     asok_hook,
+                                     "Respawn this MDS");
+  ceph_assert(r == 0);
+  r = admin_socket->register_command(
+    "heap " \
+    "name=heapcmd,type=CephChoices,strings=" \
+    "dump|start_profiler|stop_profiler|release|get_release_rate|set_release_rate|stats " \
+    "name=value,type=CephString,req=false",
+    asok_hook,
+    "show heap usage info (available only if compiled with tcmalloc)");
+  ceph_assert(r == 0);
+  r = admin_socket->register_command(
+    "cpu_profiler " \
+    "name=arg,type=CephChoices,strings=status|flush",
+    asok_hook,
+    "run cpu profiling on daemon");
+  ceph_assert(r == 0);
+}
+
+// Unregister every command bound to our hook, then destroy the hook.
+void MDSDaemon::clean_up_admin_socket()
+{
+  AdminSocket *admin_socket = g_ceph_context->get_admin_socket();
+  admin_socket->unregister_commands(asok_hook);
+
+  delete asok_hook;
+  asok_hook = nullptr;
+}
+
+/**
+ * Bring the daemon up: register dispatchers, initialise and authenticate
+ * the MonClient, wait for rotating keys, subscribe to the mdsmap, set up
+ * the admin socket, then start the timer, the beacon and the tick.
+ *
+ * @return 0 on success or when the daemon was already asked to terminate;
+ *         the negative error from monc->init() / monc->authenticate() on
+ *         failure; -CEPHFS_ETIMEDOUT when rotating keys cannot be obtained;
+ *         -CEPHFS_ENOSYS on Windows.
+ */
+int MDSDaemon::init()
+{
+#ifdef _WIN32
+  // Some file related flags and types are stubbed on Windows. In order to avoid
+  // incorrect behavior, we're going to prevent the MDS from running on Windows
+  // until those limitations are addressed. MDS clients, however, are allowed
+  // to run on Windows.
+  derr << "The Ceph MDS does not support running on Windows at the moment."
+       << dendl;
+  return -CEPHFS_ENOSYS;
+#endif // _WIN32
+
+  // Log the sizes of the main in-memory structures at debug level 10.
+  dout(10) << "Dumping misc struct sizes:" << dendl;
+  dout(10) << sizeof(MDSCacheObject) << "\tMDSCacheObject" << dendl;
+  dout(10) << sizeof(CInode) << "\tCInode" << dendl;
+  dout(10) << sizeof(elist<void*>::item) << "\telist<>::item" << dendl;
+  dout(10) << sizeof(CInode::mempool_inode) << "\tinode" << dendl;
+  dout(10) << sizeof(CInode::mempool_old_inode) << "\told_inode" << dendl;
+  dout(10) << sizeof(nest_info_t) << "\tnest_info_t" << dendl;
+  dout(10) << sizeof(frag_info_t) << "\tfrag_info_t" << dendl;
+  dout(10) << sizeof(SimpleLock) << "\tSimpleLock" << dendl;
+  dout(10) << sizeof(ScatterLock) << "\tScatterLock" << dendl;
+  dout(10) << sizeof(CDentry) << "\tCDentry" << dendl;
+  dout(10) << sizeof(elist<void*>::item) << "\telist<>::item" << dendl;
+  dout(10) << sizeof(SimpleLock) << "\tSimpleLock" << dendl;
+  dout(10) << sizeof(CDir) << "\tCDir" << dendl;
+  dout(10) << sizeof(elist<void*>::item) << "\telist<>::item" << dendl;
+  dout(10) << sizeof(fnode_t) << "\tfnode_t" << dendl;
+  dout(10) << sizeof(nest_info_t) << "\tnest_info_t" << dendl;
+  dout(10) << sizeof(frag_info_t) << "\tfrag_info_t" << dendl;
+  dout(10) << sizeof(Capability) << "\tCapability" << dendl;
+  dout(10) << sizeof(xlist<void*>::item) << "\txlist<>::item" << dendl;
+
+  // The beacon is added to the dispatcher chain before the daemon itself.
+  messenger->add_dispatcher_tail(&beacon);
+  messenger->add_dispatcher_tail(this);
+
+  // init monc
+  monc->set_messenger(messenger);
+
+  monc->set_want_keys(CEPH_ENTITY_TYPE_MON | CEPH_ENTITY_TYPE_OSD |
+                      CEPH_ENTITY_TYPE_MDS | CEPH_ENTITY_TYPE_MGR);
+  int r = 0;
+  r = monc->init();
+  if (r < 0) {
+    derr << "ERROR: failed to init monc: " << cpp_strerror(-r) << dendl;
+    // suicide() asserts that mds_lock is held.
+    mds_lock.lock();
+    suicide();
+    mds_lock.unlock();
+    return r;
+  }
+
+  messenger->set_auth_client(monc);
+  messenger->set_auth_server(monc);
+  monc->set_handle_authentication_dispatcher(this);
+
+  // tell monc about log_client so it will know about mon session resets
+  monc->set_log_client(&log_client);
+
+  r = monc->authenticate();
+  if (r < 0) {
+    derr << "ERROR: failed to authenticate: " << cpp_strerror(-r) << dendl;
+    mds_lock.lock();
+    suicide();
+    mds_lock.unlock();
+    return r;
+  }
+
+  // Keep waiting for rotating service keys; give up once the number of
+  // failed attempts exceeds max_rotating_auth_attempts.
+  int rotating_auth_attempts = 0;
+  auto rotating_auth_timeout =
+    g_conf().get_val<int64_t>("rotating_keys_bootstrap_timeout");
+  while (monc->wait_auth_rotating(rotating_auth_timeout) < 0) {
+    if (++rotating_auth_attempts <= g_conf()->max_rotating_auth_attempts) {
+      derr << "unable to obtain rotating service keys; retrying" << dendl;
+      continue;
+    }
+    derr << "ERROR: failed to refresh rotating keys, "
+         << "maximum retry time reached." << dendl;
+    std::lock_guard locker{mds_lock};
+    suicide();
+    return -CEPHFS_ETIMEDOUT;
+  }
+
+  mds_lock.lock();
+  // A want_state of DNE means shutdown was already requested; stop here.
+  if (beacon.get_want_state() == CEPH_MDS_STATE_DNE) {
+    dout(4) << __func__ << ": terminated already, dropping out" << dendl;
+    mds_lock.unlock();
+    return 0;
+  }
+
+  monc->sub_want("mdsmap", 0, 0);
+  monc->renew_subs();
+
+  mds_lock.unlock();
+
+  // Set up admin socket before taking mds_lock, so that ordering
+  // is consistent (later we take mds_lock within asok callbacks)
+  set_up_admin_socket();
+  std::lock_guard locker{mds_lock};
+  // Re-check for termination: the lock was released while the admin socket
+  // was being set up.
+  if (beacon.get_want_state() == MDSMap::STATE_DNE) {
+    suicide();  // we could do something more graceful here
+    dout(4) << __func__ << ": terminated already, dropping out" << dendl;
+    return 0; 
+  }
+
+  timer.init();
+
+  beacon.init(*mdsmap);
+  // No rank is held yet, so the messenger name uses MDS_RANK_NONE.
+  messenger->set_myname(entity_name_t::MDS(MDS_RANK_NONE));
+
+  // schedule tick
+  reset_tick();
+  return 0;
+}
+
+// (Re)arm the periodic tick: cancel any pending event and schedule a new
+// one mds_tick_interval from now.
+void MDSDaemon::reset_tick()
+{
+  // Drop the previously scheduled tick, if there is one.
+  if (tick_event) {
+    timer.cancel_event(tick_event);
+  }
+
+  // The callback verifies that this thread holds mds_lock before ticking.
+  tick_event = timer.add_event_after(
+    g_conf()->mds_tick_interval,
+    new LambdaContext(
+      [this](int) {
+        ceph_assert(ceph_mutex_is_locked_by_me(mds_lock));
+        tick();
+      }));
+}
+
+// Periodic timer callback: re-arm the timer, then tick the rank if present.
+void MDSDaemon::tick()
+{
+  // Schedule the next tick first.
+  reset_tick();
+
+  // Nothing else to do without a rank.
+  if (!mds_rank) {
+    return;
+  }
+
+  // Call through to the rank's tick function.
+  mds_rank->tick();
+}
+
+/**
+ * Handle an MCommand message.
+ *
+ * A command that passes the capability check and parses as JSON is queued
+ * on the admin socket as a tell command. Otherwise an MCommandReply
+ * carrying the error code and message is sent back on the connection.
+ */
+void MDSDaemon::handle_command(const cref_t<MCommand> &m)
+{
+  // The connection's private data is expected to be the Session.
+  auto priv = m->get_connection()->get_priv();
+  auto session = static_cast<Session *>(priv.get());
+  ceph_assert(session != NULL);
+
+  int r = 0;
+  cmdmap_t cmdmap;
+  CachedStackStringStream css;
+  auto& ss = *css;
+  bufferlist outbl;
+
+  // If someone is using a closed session for sending commands (e.g.
+  // the ceph CLI) then we should feel free to clean up this connection
+  // as soon as we've sent them a response.
+  const bool live_session =
+    session->get_state_seq() > 0 &&
+    mds_rank &&
+    mds_rank->sessionmap.get_session(session->info.inst.name);
+
+  if (!live_session) {
+    // This session only existed to issue commands, so terminate it
+    // as soon as we can.
+    ceph_assert(session->is_closed());
+    session->get_connection()->mark_disposable();
+  }
+  // Drop our reference to the private data; `session` is still read below.
+  // NOTE(review): presumably the connection keeps the session alive —
+  // confirm.
+  priv.reset();
+
+  if (!session->auth_caps.allow_all()) {
+    dout(1) << __func__
+      << ": received command from client without `tell` capability: "
+      << *m->get_connection()->peer_addrs << dendl;
+
+    ss << "permission denied";
+    r = -CEPHFS_EACCES;
+  } else if (m->cmd.empty()) {
+    r = -CEPHFS_EINVAL;
+    ss << "no command given";
+  } else if (!TOPNSPC::common::cmdmap_from_json(m->cmd, &cmdmap, ss)) {
+    // The parser was handed ss for its own error output.
+    r = -CEPHFS_EINVAL;
+  } else {
+    // Valid command: queue it on the admin socket; no reply is sent here.
+    cct->get_admin_socket()->queue_tell_command(m);
+    return;
+  }
+
+  // Error path: reply directly with the code and message gathered above.
+  auto reply = make_message<MCommandReply>(r, ss.str());
+  reply->set_tid(m->get_tid());
+  reply->set_data(outbl);
+  m->get_connection()->send_message2(reply);
+}
+
+/**
+ * Process a new MDSMap from the given message.
+ *
+ * Maps that are not newer than the current one are discarded. Otherwise the
+ * map is decoded and checked for compatibility, the daemon's own state
+ * transition is evaluated (removal, first appearance, standby, rank
+ * assignment), removed peers are marked down, and the map is passed to the
+ * rank (created here if needed) and to the beacon.
+ */
+void MDSDaemon::handle_mds_map(const cref_t<MMDSMap> &m)
+{
+  version_t epoch = m->get_epoch();
+
+  // is it new?
+  if (epoch <= mdsmap->get_epoch()) {
+    dout(5) << "handle_mds_map old map epoch " << epoch << " <= "
+            << mdsmap->get_epoch() << ", discarding" << dendl;
+    return;
+  }
+
+  dout(1) << "Updating MDS map to version " << epoch << " from " << m->get_source() << dendl;
+
+  // keep old map, for a moment
+  std::unique_ptr<MDSMap> oldmap;
+  oldmap.swap(mdsmap);
+
+  // decode and process
+  mdsmap.reset(new MDSMap);
+  mdsmap->decode(m->get_encoded());
+
+  // Tell the MonClient which mdsmap epoch has been received.
+  monc->sub_got("mdsmap", mdsmap->get_epoch());
+
+  // verify compatset
+  CompatSet mdsmap_compat(MDSMap::get_compat_set_all());
+  dout(10) << "     my compat " << mdsmap_compat << dendl;
+  dout(10) << " mdsmap compat " << mdsmap->compat << dendl;
+  if (!mdsmap_compat.writeable(mdsmap->compat)) {
+    dout(0) << "handle_mds_map mdsmap compatset " << mdsmap->compat
+	    << " not writeable with daemon features " << mdsmap_compat
+	    << ", killing myself" << dendl;
+    suicide();
+    return;
+  }
+
+  // Calculate my effective rank (either my owned rank or the rank I'm
+  // following if STATE_STANDBY_REPLAY)
+  const auto addrs = messenger->get_myaddrs();
+  const auto myid = monc->get_global_id();
+  const auto mygid = mds_gid_t(myid);
+  const auto whoami = mdsmap->get_rank_gid(mygid);
+  const auto old_state = oldmap->get_state_gid(mygid);
+  const auto new_state = mdsmap->get_state_gid(mygid);
+  const auto incarnation = mdsmap->get_inc_gid(mygid);
+  dout(10) << "my gid is " << myid << dendl;
+  dout(10) << "map says I am mds." << whoami << "." << incarnation
+	   << " state " << ceph_mds_state_name(new_state) << dendl;
+  dout(10) << "msgr says I am " << addrs << dendl;
+
+  // If we're removed from the MDSMap, stop all processing.
+  using DS = MDSMap::DaemonState;
+  if (old_state != DS::STATE_NULL && new_state == DS::STATE_NULL) {
+    const auto& oldinfo = oldmap->get_info_gid(mygid);
+    dout(1) << "Map removed me " << oldinfo
+            << " from cluster; respawning! See cluster/monitor logs for details." << dendl;
+    respawn();
+  }
+
+  if (old_state == DS::STATE_NULL && new_state != DS::STATE_NULL) {
+    /* The MDS has been added to the FSMap, now we can init the MgrClient */
+    mgrc.init();
+    messenger->add_dispatcher_tail(&mgrc);
+    monc->sub_want("mgrmap", 0, 0);
+    monc->renew_subs(); /* MgrMap receipt drives connection to ceph-mgr */
+  }
+
+  // mark down any failed peers
+  for (const auto& [gid, info] : oldmap->get_mds_info()) {
+    if (mdsmap->get_mds_info().count(gid) == 0) {
+      dout(10) << " peer mds gid " << gid << " removed from map" << dendl;
+      messenger->mark_down_addrs(info.addrs);
+    }
+  }
+
+  if (whoami == MDS_RANK_NONE) {
+    // We do not hold a rank:
+    dout(10) <<  __func__ << ": handling map in rankless mode" << dendl;
+
+    if (new_state == DS::STATE_STANDBY) {
+      /* Note: STATE_BOOT is never an actual state in the FSMap. The Monitors
+       * generally mark a new MDS as STANDBY (although it's possible to
+       * immediately be assigned a rank).
+       */
+      if (old_state == DS::STATE_NULL) {
+        dout(1) << "Monitors have assigned me to become a standby." << dendl;
+        beacon.set_want_state(*mdsmap, new_state);
+      } else if (old_state == DS::STATE_STANDBY) {
+        dout(5) << "I am still standby" << dendl;
+      }
+    } else if (new_state == DS::STATE_NULL) {
+      /* We are not in the MDSMap yet! Keep waiting: */
+      ceph_assert(beacon.get_want_state() == DS::STATE_BOOT);
+      dout(10) << "not in map yet" << dendl;
+    } else {
+      /* We moved to standby somehow from another state */
+      ceph_abort("invalid transition to standby");
+    }
+  } else {
+    // Did we already hold a different rank?  MDSMonitor shouldn't try
+    // to change that out from under me!
+    if (mds_rank && whoami != mds_rank->get_nodeid()) {
+      derr << "Invalid rank transition " << mds_rank->get_nodeid() << "->"
+           << whoami << dendl;
+      respawn();
+    }
+
+    // Did I previously not hold a rank?  Initialize!
+    if (mds_rank == NULL) {
+      // The two LambdaContexts give the rank a way to trigger respawn()
+      // and suicide() on this daemon.
+      mds_rank = new MDSRankDispatcher(whoami, m->map_fs_name, mds_lock, clog,
+          timer, beacon, mdsmap, messenger, monc, &mgrc,
+          new LambdaContext([this](int r){respawn();}),
+          new LambdaContext([this](int r){suicide();}),
+	  ioctx);
+      dout(10) <<  __func__ << ": initializing MDS rank "
+               << mds_rank->get_nodeid() << dendl;
+      mds_rank->init();
+    }
+
+    // MDSRank is active: let him process the map, we have no say.
+    dout(10) <<  __func__ << ": handling map as rank "
+             << mds_rank->get_nodeid() << dendl;
+    mds_rank->handle_mds_map(m, *oldmap);
+  }
+
+  // The beacon is notified of the new map last, in both rankless and
+  // ranked mode.
+  beacon.notify_mdsmap(*mdsmap);
+}
+
+// React to SIGINT/SIGTERM by shutting the daemon down, unless a shutdown
+// is already in progress.
+void MDSDaemon::handle_signal(int signum)
+{
+  ceph_assert(signum == SIGINT || signum == SIGTERM);
+  derr << "*** got signal " << sig_str(signum) << " ***" << dendl;
+
+  std::lock_guard l(mds_lock);
+  if (!stopping) {
+    suicide();
+  }
+}
+
+/**
+ * Shut the daemon down.
+ *
+ * Requires mds_lock to be held and must not be called more than once (both
+ * asserted). Cancels the tick, removes the admin socket hook, announces
+ * STATE_DNE through the beacon, then shuts down the beacon, the MgrClient
+ * (if initialised) and either the rank or, when there is no rank, the
+ * timer, MonClient and messenger.
+ */
+void MDSDaemon::suicide()
+{
+  ceph_assert(ceph_mutex_is_locked(mds_lock));
+  
+  // make sure we don't suicide twice
+  ceph_assert(stopping == false);
+  stopping = true;
+
+  dout(1) << "suicide! Wanted state "
+          << ceph_mds_state_name(beacon.get_want_state()) << dendl;
+
+  // Stop the periodic tick.
+  if (tick_event) {
+    timer.cancel_event(tick_event);
+    tick_event = 0;
+  }
+
+  clean_up_admin_socket();
+
+  // Notify the Monitors (MDSMonitor) that we're dying, so that it doesn't have
+  // to wait for us to go laggy. Only do this if we're actually in the MDSMap,
+  // because otherwise the MDSMonitor will drop our message.
+  beacon.set_want_state(*mdsmap, MDSMap::STATE_DNE);
+  if (!mdsmap->is_dne_gid(mds_gid_t(monc->get_global_id()))) {
+    beacon.send_and_wait(1);
+  }
+  beacon.shutdown();
+
+  if (mgrc.is_initialized())
+    mgrc.shutdown();
+
+  if (mds_rank) {
+    mds_rank->shutdown();
+  } else {
+    // No rank: shut the shared services down directly.
+    timer.shutdown();
+
+    monc->shutdown();
+    messenger->shutdown();
+  }
+}
+
+/**
+ * Replace the running process with a fresh copy of the same executable,
+ * started with the original arguments (orig_argc / orig_argv).
+ *
+ * Does not return: either execv() succeeds, or the process exits
+ * (mds_valgrind_exit) or aborts.
+ */
+void MDSDaemon::respawn()
+{
+  // --- WARNING TO FUTURE COPY/PASTERS ---
+  // You must also add a call like
+  //
+  //   ceph_pthread_setname(pthread_self(), "ceph-mds");
+  //
+  // to main() so that /proc/$pid/stat field 2 contains "(ceph-mds)"
+  // instead of "(exe)", so that killall (and log rotation) will work.
+
+  dout(1) << "respawn!" << dendl;
+
+  /* Dump recent in case the MDS was stuck doing something which caused it to
+   * be removed from the MDSMap leading to respawn. */
+  g_ceph_context->_log->dump_recent();
+
+  /* valgrind can't handle execve; just exit and let QA infra restart */
+  if (g_conf().get_val<bool>("mds_valgrind_exit")) {
+    _exit(0);
+  }
+
+  // Build a NULL-terminated copy of the original argument vector, logging
+  // each entry.
+  char *new_argv[orig_argc+1];
+  dout(1) << " e: '" << orig_argv[0] << "'" << dendl;
+  for (int i=0; i<orig_argc; i++) {
+    new_argv[i] = (char *)orig_argv[i];
+    dout(1) << " " << i << ": '" << orig_argv[i] << "'" << dendl;
+  }
+  new_argv[orig_argc] = NULL;
+
+  /* Determine the path to our executable, test if Linux /proc/self/exe exists.
+   * This allows us to exec the same executable even if it has since been
+   * unlinked.
+   */
+  char exe_path[PATH_MAX] = "";
+#ifdef PROCPREFIX
+  if (readlink(PROCPREFIX "/proc/self/exe", exe_path, PATH_MAX-1) != -1) {
+    // The resolved target is only logged; exe_path is then overwritten with
+    // the /proc path itself, which is what gets exec'ed.
+    dout(1) << "respawning with exe " << exe_path << dendl;
+    strcpy(exe_path, PROCPREFIX "/proc/self/exe");
+  } else {
+#else
+  {
+#endif
+    /* Print CWD for the user's interest */
+    char buf[PATH_MAX];
+    char *cwd = getcwd(buf, sizeof(buf));
+    ceph_assert(cwd);
+    dout(1) << " cwd " << cwd << dendl;
+
+    /* Fall back to a best-effort: just running in our CWD */
+    // exe_path was zero-initialised and at most PATH_MAX-1 bytes are copied,
+    // so the result stays NUL-terminated.
+    strncpy(exe_path, orig_argv[0], PATH_MAX-1);
+  }
+
+  dout(1) << " exe_path " << exe_path << dendl;
+
+  unblock_all_signals(NULL);
+  execv(exe_path, new_argv);
+
+  // Only reached when execv() failed.
+  dout(0) << "respawn execv " << orig_argv[0]
+	  << " failed with " << cpp_strerror(errno) << dendl;
+
+  // We have to assert out here, because suicide() returns, and callers
+  // to respawn expect it never to return.
+  ceph_abort();
+}
+
+
+
+/**
+ * Messenger entry point for incoming messages.
+ *
+ * @param m the received message
+ * @return true when the message was consumed here (handled or deliberately
+ *         discarded), false when this dispatcher did not take it.
+ */
+bool MDSDaemon::ms_dispatch2(const ref_t<Message> &m)
+{
+  std::lock_guard locker(mds_lock);
+  if (stopping)
+    return false;
+
+  // Drop out early if shutting down: swallow the message without acting.
+  const bool going_away = (beacon.get_want_state() == CEPH_MDS_STATE_DNE);
+  if (going_away) {
+    dout(10) << " stopping, discarding " << *m << dendl;
+    return true;
+  }
+
+  // Daemon-level ("core") messages are tried first.
+  if (handle_core_message(m))
+    return true;
+
+  // Not core: hand it to the rank, if we hold one.
+  return mds_rank != nullptr && mds_rank->ms_dispatch(m);
+}
+
+/*
+ * high priority messages we always process
+ */
+
+// Peer-type filter for use inside a bool-returning message handler that has
+// a message handle named `m` in scope.  If the message has a connection and
+// that connection's peer type shares no bit with the `peers` mask, the
+// message is logged and the *enclosing function* returns true, i.e. the
+// message is reported as handled and goes no further.  Messages without a
+// connection are not filtered.
+#define ALLOW_MESSAGES_FROM(peers)                                      \
+  do {                                                                  \
+    if (m->get_connection() && (m->get_connection()->get_peer_type() & (peers)) == 0) { \
+      dout(0) << __FILE__ << "." << __LINE__ << ": filtered out request, peer=" \
+              << m->get_connection()->get_peer_type() << " allowing="   \
+              << #peers << " message=" << *m << dendl;                  \
+      return true;                                                      \
+    }                                                                   \
+  } while (0)
+
+/**
+ * Handle messages addressed to the daemon itself rather than to a rank.
+ *
+ * @param m incoming message
+ * @return true if the message type was recognised here (including messages
+ *         dropped by the ALLOW_MESSAGES_FROM peer filter); false if the
+ *         caller should try the rank dispatcher instead.
+ */
+bool MDSDaemon::handle_core_message(const cref_t<Message> &m)
+{
+  switch (m->get_type()) {
+  case CEPH_MSG_MON_MAP:
+    ALLOW_MESSAGES_FROM(CEPH_ENTITY_TYPE_MON);
+    break;
+
+    // MDS
+  case CEPH_MSG_MDS_MAP:
+    ALLOW_MESSAGES_FROM(CEPH_ENTITY_TYPE_MON | CEPH_ENTITY_TYPE_MDS);
+    handle_mds_map(ref_cast<MMDSMap>(m));
+    break;
+
+  case MSG_REMOVE_SNAPS:
+    ALLOW_MESSAGES_FROM(CEPH_ENTITY_TYPE_MON);
+    // mds_rank is null while no rank is held; there is no snap server to
+    // forward to in that case, so the message is dropped instead of
+    // dereferencing a null pointer.
+    if (mds_rank) {
+      mds_rank->snapserver->handle_remove_snaps(ref_cast<MRemoveSnaps>(m));
+    } else {
+      dout(10) << " no rank held, discarding " << *m << dendl;
+    }
+    break;
+
+    // OSD
+  case MSG_COMMAND:
+    handle_command(ref_cast<MCommand>(m));
+    break;
+  case CEPH_MSG_OSD_MAP:
+    ALLOW_MESSAGES_FROM(CEPH_ENTITY_TYPE_MON | CEPH_ENTITY_TYPE_OSD);
+
+    if (mds_rank) {
+      mds_rank->handle_osd_map();
+    }
+    break;
+
+  case MSG_MON_COMMAND:
+    ALLOW_MESSAGES_FROM(CEPH_ENTITY_TYPE_MON);
+    clog->warn() << "dropping `mds tell` command from legacy monitor";
+    break;
+
+  default:
+    // Not a daemon-level message type.
+    return false;
+  }
+  return true;
+}
+
+// Dispatcher callback for a connection event; intentionally does nothing.
+void MDSDaemon::ms_handle_connect(Connection *con)
+{
+}
+
+/**
+ * Dispatcher callback for a connection reset.
+ *
+ * Only client connections are considered.  A connection with no Session
+ * attached is marked down; one whose Session is already closed is marked
+ * down and detached from that Session.  Always returns false.
+ */
+bool MDSDaemon::ms_handle_reset(Connection *con)
+{
+  if (con->get_peer_type() != CEPH_ENTITY_TYPE_CLIENT)
+    return false;
+
+  std::lock_guard locker(mds_lock);
+  if (stopping)
+    return false;
+
+  dout(5) << "ms_handle_reset on " << con->get_peer_socket_addr() << dendl;
+  if (beacon.get_want_state() == CEPH_MDS_STATE_DNE)
+    return false;
+
+  auto priv = con->get_priv();
+  auto session = static_cast<Session *>(priv.get());
+  if (!session) {
+    // Nothing attached to this connection: just tear it down.
+    con->mark_down();
+    return false;
+  }
+
+  if (session->is_closed()) {
+    dout(3) << "ms_handle_reset closing connection for session " << session->info.inst << dendl;
+    con->mark_down();
+    con->set_priv(nullptr);
+  }
+  return false;
+}
+
+
+/**
+ * Dispatcher callback for a reset initiated by the remote side.
+ *
+ * Only client connections are considered.  If the attached Session is
+ * already closed the connection is marked down and detached from it; a
+ * connection without a Session is left untouched.
+ */
+void MDSDaemon::ms_handle_remote_reset(Connection *con)
+{
+  if (con->get_peer_type() != CEPH_ENTITY_TYPE_CLIENT)
+    return;
+
+  std::lock_guard locker(mds_lock);
+  if (stopping)
+    return;
+
+  dout(5) << "ms_handle_remote_reset on " << con->get_peer_socket_addr() << dendl;
+  if (beacon.get_want_state() == CEPH_MDS_STATE_DNE)
+    return;
+
+  auto priv = con->get_priv();
+  auto session = static_cast<Session *>(priv.get());
+  if (!session || !session->is_closed())
+    return;
+
+  dout(3) << "ms_handle_remote_reset closing connection for session " << session->info.inst << dendl;
+  con->mark_down();
+  con->set_priv(nullptr);
+}
+
+// Dispatcher callback for a refused connection; currently a no-op that
+// always returns false.
+bool MDSDaemon::ms_handle_refused(Connection *con)
+{
+  // do nothing for now
+  return false;
+}
+
+/**
+ * Fill `caps` from the capability info attached to a connection.
+ *
+ * `caps` is cleared first.  With allow_all set it is switched to allow-all;
+ * otherwise the capability string is decoded from info.caps and parsed.
+ *
+ * @return true on success; false if the buffer could not be decoded or the
+ *         capability string failed to parse.
+ */
+bool MDSDaemon::parse_caps(const AuthCapsInfo& info, MDSAuthCaps& caps)
+{
+  caps.clear();
+  if (info.allow_all) {
+    caps.set_allow_all();
+    return true;
+  }
+
+  string auth_cap_str;
+  auto bp = info.caps.begin();
+  try {
+    decode(auth_cap_str, bp);
+  } catch (const buffer::error&) {
+    dout(1) << __func__ << ": cannot decode auth caps buffer of length " << info.caps.length() << dendl;
+    return false;
+  }
+
+  dout(10) << __func__ << ": parsing auth_cap_str='" << auth_cap_str << "'" << dendl;
+  CachedStackStringStream cs;
+  const bool parsed = caps.parse(g_ceph_context, auth_cap_str, cs.get());
+  if (!parsed) {
+    dout(1) << __func__ << ": auth cap parse error: " << cs->strv() << " parsing '" << auth_cap_str << "'" << dendl;
+  }
+  return parsed;
+}
+
+/**
+ * Check that the peer's capability info parses.
+ *
+ * @return 0 if parse_caps() accepts the connection's caps, -1 otherwise.
+ */
+int MDSDaemon::ms_handle_authentication(Connection *con)
+{
+  /* N.B. without mds_lock! */
+  MDSAuthCaps scratch;  // parse result is discarded; only validity matters
+  if (parse_caps(con->get_peer_caps_info(), scratch))
+    return 0;
+  return -1;
+}
+
+/**
+ * Dispatcher callback for an accepted connection: attach a Session to it.
+ *
+ * Reuses the rank's existing Session for the peer when one is found,
+ * otherwise allocates a fresh one.  The Session's auth caps are then
+ * (re)parsed from the connection and, if the Session was bound to a
+ * different connection, it is re-pointed at `con` and any messages queued
+ * in preopen_out_queue are sent.
+ */
+void MDSDaemon::ms_handle_accept(Connection *con)
+{
+  entity_name_t n(con->get_peer_type(), con->get_peer_global_id());
+  std::lock_guard l(mds_lock);
+  if (stopping) {
+    return;
+  }
+
+  // We allow connections and assign Session instances to connections
+  // even if we have not been assigned a rank, because clients with
+  // "allow *" are allowed to connect and do 'tell' operations before
+  // we have a rank.
+  Session *s = nullptr;
+  if (mds_rank) {
+    // If we do hold a rank, see if this is an existing client establishing
+    // a new connection, rather than a new client
+    s = mds_rank->sessionmap.get_session(n);
+  }
+
+  // Wire up a Session* to this connection
+  // It doesn't go into a SessionMap instance until it sends an explicit
+  // request to open a session (initial state of Session is `closed`)
+  if (!s) {
+    s = new Session(con);
+    dout(10) << " new session " << s << " for " << s->info.inst
+	     << " con " << con << dendl;
+    // presumably `false` makes the connection adopt the initial reference
+    // rather than take an extra one -- TODO confirm against RefCountedPtr
+    con->set_priv(RefCountedPtr{s, false});
+    if (mds_rank) {
+      mds_rank->kick_waiters_for_any_client_connection();
+    }
+  } else {
+    dout(10) << " existing session " << s << " for " << s->info.inst
+	     << " existing con " << s->get_connection()
+	     << ", new/authorizing con " << con << dendl;
+    con->set_priv(RefCountedPtr{s});
+  }
+
+  // s is non-null on both paths above, so no further null checks are needed.
+  parse_caps(con->get_peer_caps_info(), s->auth_caps);
+
+  dout(10) << "ms_handle_accept " << con->get_peer_socket_addr() << " con " << con << " session " << s << dendl;
+  if (s->get_connection() != con) {
+    dout(10) << " session connection " << s->get_connection()
+	     << " -> " << con << dendl;
+    s->set_connection(con);
+
+    // send out any queued messages
+    while (!s->preopen_out_queue.empty()) {
+      con->send_message2(s->preopen_out_queue.front());
+      s->preopen_out_queue.pop_front();
+    }
+  }
+}
+
+// True when no rank is held, or when the held rank reports itself stopped.
+bool MDSDaemon::is_clean_shutdown()
+{
+  return !mds_rank || mds_rank->is_stopped();
+}
diff --git a/src/mds/MDSDaemon.h b/src/mds/MDSDaemon.h
new file mode 100644
index 000000000..97162cc88
--- /dev/null
+++ b/src/mds/MDSDaemon.h
@@ -0,0 +1,161 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software 
+ * Foundation.  See file COPYING.
+ * 
+ */
+
+#ifndef CEPH_MDS_H
+#define CEPH_MDS_H
+
+#include <string_view>
+
+#include "messages/MCommand.h"
+#include "messages/MCommandReply.h"
+#include "messages/MGenericMessage.h"
+#include "messages/MMDSMap.h"
+#include "messages/MMonCommand.h"
+
+#include "common/LogClient.h"
+#include "common/ceph_mutex.h"
+#include "common/fair_mutex.h"
+#include "common/Timer.h"
+#include "include/Context.h"
+#include "include/types.h"
+#include "mgr/MgrClient.h"
+#include "msg/Dispatcher.h"
+
+#include "Beacon.h"
+#include "MDSMap.h"
+#include "MDSRank.h"
+
+#define CEPH_MDS_PROTOCOL    36 /* cluster internal */
+
+class Messenger;
+class MonClient;
+
+/**
+ * Daemon-level Dispatcher of the MDS process.
+ *
+ * Implements the messenger callbacks (ms_*) and the daemon-wide operations
+ * declared below, and keeps a pointer to the rank dispatcher (mds_rank),
+ * which is nullptr by default.
+ */
+class MDSDaemon : public Dispatcher {
+ public:
+  MDSDaemon(std::string_view n, Messenger *m, MonClient *mc,
+	    boost::asio::io_context& ioctx);
+
+  ~MDSDaemon() override;
+
+  // Value of starttime (monotonic clock).
+  mono_time get_starttime() const {
+    return starttime;
+  }
+  // Time elapsed on the monotonic clock since starttime, in seconds.
+  chrono::duration<double> get_uptime() const {
+    mono_time now = mono_clock::now();
+    return chrono::duration<double>(now-starttime);
+  }
+
+  // handle a signal (e.g., SIGTERM)
+  void handle_signal(int signum);
+
+  int init();
+
+  /**
+   * Hint at whether we were shut down gracefully (i.e. we were only
+   * in standby, or our rank was stopped).  Should be removed once
+   * we handle shutdown properly (e.g. clear out all message queues)
+   * such that deleting xlists doesn't assert.
+   */
+  bool is_clean_shutdown();
+
+  /* Global MDS lock: every time someone takes this, they must
+   * also check the `stopping` flag.  If stopping is true, you
+   * must either do nothing and immediately drop the lock, or
+   * never drop the lock again (i.e. call respawn()) */
+  ceph::fair_mutex mds_lock{"MDSDaemon::mds_lock"};
+  bool stopping = false;
+
+  class CommonSafeTimer<ceph::fair_mutex> timer;
+  std::string gss_ktfile_client{};
+
+  // Saved command line of this process.
+  // NOTE(review): not assigned anywhere in this header; presumably filled in
+  // by the code that constructs the daemon -- confirm.
+  int orig_argc;
+  const char **orig_argv;
+
+
+ protected:
+  // admin socket handling
+  friend class MDSSocketHook;
+
+  // special message types
+  friend class C_MDS_Send_Command_Reply;
+
+  void reset_tick();
+  void wait_for_omap_osds();
+
+  void set_up_admin_socket();
+  void clean_up_admin_socket();
+  void check_ops_in_flight(); // send off any slow ops to monitor
+  void asok_command(
+    std::string_view command,
+    const cmdmap_t& cmdmap,
+    Formatter *f,
+    const bufferlist &inbl,
+    std::function<void(int,const std::string&,bufferlist&)> on_finish);
+
+  void dump_status(Formatter *f);
+
+  /**
+   * Terminate this daemon process.
+   *
+   * This function will return, but once it does so the calling thread
+   * must do no more work as all subsystems will have been shut down.
+   */
+  void suicide();
+
+  /**
+   * Start a new daemon process with the same command line parameters that
+   * this process was run with, then terminate this process
+   */
+  void respawn();
+
+  void tick();
+
+  // Returns true when the message was consumed at daemon level.
+  bool handle_core_message(const cref_t<Message> &m);
+  
+  void handle_command(const cref_t<MCommand> &m);
+  void handle_mds_map(const cref_t<MMDSMap> &m);
+
+  Beacon beacon;
+
+  std::string name;
+
+  // Raw pointers supplied through the constructor.
+  // NOTE(review): ownership/lifetime is not visible here -- confirm.
+  Messenger    *messenger;
+  MonClient    *monc;
+  boost::asio::io_context& ioctx;
+  MgrClient     mgrc;
+  std::unique_ptr<MDSMap> mdsmap;
+  LogClient    log_client;
+  LogChannelRef clog;
+
+  // Rank-level dispatcher; nullptr by default.
+  MDSRankDispatcher *mds_rank = nullptr;
+
+  // tick and other timer fun
+  Context *tick_event = nullptr;
+  class MDSSocketHook *asok_hook = nullptr;
+
+ private:
+  // Dispatcher interface
+  bool ms_dispatch2(const ref_t<Message> &m) override;
+  int ms_handle_authentication(Connection *con) override;
+  void ms_handle_accept(Connection *con) override;
+  void ms_handle_connect(Connection *con) override;
+  bool ms_handle_reset(Connection *con) override;
+  void ms_handle_remote_reset(Connection *con) override;
+  bool ms_handle_refused(Connection *con) override;
+
+  bool parse_caps(const AuthCapsInfo&, MDSAuthCaps&);
+
+  // Initialised to the clock's zero value.
+  // NOTE(review): presumably set to the real start time elsewhere -- confirm.
+  mono_time starttime = mono_clock::zero();
+};
+
+#endif
diff --git a/src/mds/MDSMap.cc b/src/mds/MDSMap.cc
new file mode 100644
index 000000000..f611d86a5
--- /dev/null
+++ b/src/mds/MDSMap.cc
@@ -0,0 +1,1146 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software 
+ * Foundation.  See file COPYING.
+ * 
+ */
+
+#include <ostream>
+
+#include "common/debug.h"
+#include "mon/health_check.h"
+
+#include "MDSMap.h"
+
+using std::dec;
+using std::hex;
+using std::list;
+using std::make_pair;
+using std::map;
+using std::multimap;
+using std::ostream;
+using std::pair;
+using std::string;
+using std::set;
+
+using ceph::bufferlist;
+using ceph::Formatter;
+
+#define dout_context g_ceph_context
+#define dout_subsys ceph_subsys_
+
+// features
+// Compat set with every incompat feature listed below (including INLINE);
+// the compat and ro_compat sets are left empty.
+CompatSet MDSMap::get_compat_set_all() {
+  CompatSet::FeatureSet compat_features;
+  CompatSet::FeatureSet ro_features;
+
+  CompatSet::FeatureSet incompat_features;
+  incompat_features.insert(MDS_FEATURE_INCOMPAT_BASE);
+  incompat_features.insert(MDS_FEATURE_INCOMPAT_CLIENTRANGES);
+  incompat_features.insert(MDS_FEATURE_INCOMPAT_FILELAYOUT);
+  incompat_features.insert(MDS_FEATURE_INCOMPAT_DIRINODE);
+  incompat_features.insert(MDS_FEATURE_INCOMPAT_ENCODING);
+  incompat_features.insert(MDS_FEATURE_INCOMPAT_OMAPDIRFRAG);
+  incompat_features.insert(MDS_FEATURE_INCOMPAT_INLINE);
+  incompat_features.insert(MDS_FEATURE_INCOMPAT_NOANCHOR);
+  incompat_features.insert(MDS_FEATURE_INCOMPAT_FILE_LAYOUT_V2);
+  incompat_features.insert(MDS_FEATURE_INCOMPAT_SNAPREALM_V2);
+
+  return CompatSet(compat_features, ro_features, incompat_features);
+}
+
+// Default compat set: the same incompat features as get_compat_set_all()
+// except MDS_FEATURE_INCOMPAT_INLINE; compat and ro_compat stay empty.
+CompatSet MDSMap::get_compat_set_default() {
+  CompatSet::FeatureSet compat_features;
+  CompatSet::FeatureSet ro_features;
+
+  CompatSet::FeatureSet incompat_features;
+  incompat_features.insert(MDS_FEATURE_INCOMPAT_BASE);
+  incompat_features.insert(MDS_FEATURE_INCOMPAT_CLIENTRANGES);
+  incompat_features.insert(MDS_FEATURE_INCOMPAT_FILELAYOUT);
+  incompat_features.insert(MDS_FEATURE_INCOMPAT_DIRINODE);
+  incompat_features.insert(MDS_FEATURE_INCOMPAT_ENCODING);
+  incompat_features.insert(MDS_FEATURE_INCOMPAT_OMAPDIRFRAG);
+  incompat_features.insert(MDS_FEATURE_INCOMPAT_NOANCHOR);
+  incompat_features.insert(MDS_FEATURE_INCOMPAT_FILE_LAYOUT_V2);
+  incompat_features.insert(MDS_FEATURE_INCOMPAT_SNAPREALM_V2);
+
+  return CompatSet(compat_features, ro_features, incompat_features);
+}
+
+// base (pre v0.20)
+// Minimal compat set: only MDS_FEATURE_INCOMPAT_BASE in the incompat set.
+CompatSet MDSMap::get_compat_set_base() {
+  CompatSet::FeatureSet compat_features;
+  CompatSet::FeatureSet ro_features;
+  CompatSet::FeatureSet incompat_features;
+  incompat_features.insert(MDS_FEATURE_INCOMPAT_BASE);
+
+  return CompatSet(compat_features, ro_features, incompat_features);
+}
+
+// pre-v16.2.5 CompatSet in MDS beacon
+// Fixed feature list standing in for daemons that predate the compat field
+// (used by mds_info_t::decode() for struct_v < 10).  Keep this list as is
+// even if get_compat_set_all() grows.
+CompatSet MDSMap::get_compat_set_v16_2_4() {
+  CompatSet::FeatureSet compat_features;
+  CompatSet::FeatureSet ro_features;
+
+  CompatSet::FeatureSet incompat_features;
+  incompat_features.insert(MDS_FEATURE_INCOMPAT_BASE);
+  incompat_features.insert(MDS_FEATURE_INCOMPAT_CLIENTRANGES);
+  incompat_features.insert(MDS_FEATURE_INCOMPAT_FILELAYOUT);
+  incompat_features.insert(MDS_FEATURE_INCOMPAT_DIRINODE);
+  incompat_features.insert(MDS_FEATURE_INCOMPAT_ENCODING);
+  incompat_features.insert(MDS_FEATURE_INCOMPAT_OMAPDIRFRAG);
+  incompat_features.insert(MDS_FEATURE_INCOMPAT_INLINE);
+  incompat_features.insert(MDS_FEATURE_INCOMPAT_NOANCHOR);
+  incompat_features.insert(MDS_FEATURE_INCOMPAT_FILE_LAYOUT_V2);
+  incompat_features.insert(MDS_FEATURE_INCOMPAT_SNAPREALM_V2);
+
+  return CompatSet(compat_features, ro_features, incompat_features);
+}
+
+// Emit this daemon's info as fields of the formatter's current section.
+// "laggy_since" is only written when it differs from a default utime_t.
+void MDSMap::mds_info_t::dump(Formatter *f) const
+{
+  // identity and state
+  f->dump_unsigned("gid", global_id);
+  f->dump_string("name", name);
+  f->dump_int("rank", rank);
+  f->dump_int("incarnation", inc);
+  f->dump_stream("state") << ceph_mds_state_name(state);
+  f->dump_int("state_seq", state_seq);
+
+  // addressing
+  f->dump_stream("addr") << addrs.get_legacy_str();
+  f->dump_object("addrs", addrs);
+
+  f->dump_int("join_fscid", join_fscid);
+  if (laggy_since != utime_t()) {
+    f->dump_stream("laggy_since") << laggy_since;
+  }
+
+  f->open_array_section("export_targets");
+  for (const auto& target : export_targets) {
+    f->dump_int("mds", target);
+  }
+  f->close_section();
+
+  f->dump_unsigned("features", mds_features);
+  f->dump_unsigned("flags", flags);
+  f->dump_object("compat", compat);
+}
+
+// Write a one-line, bracketed, human-readable description to `o`.
+// Optional parts (laggy, export targets, frozen, join_fscid) appear only
+// when they apply.
+void MDSMap::mds_info_t::dump(std::ostream& o) const
+{
+  o << "[mds." << name << "{" <<  rank << ":" << global_id << "}";
+  o << " state " << ceph_mds_state_name(state);
+  o << " seq " << state_seq;
+
+  if (laggy())
+    o << " laggy since " << laggy_since;
+  if (!export_targets.empty())
+    o << " export targets " << export_targets;
+  if (is_frozen())
+    o << " frozen";
+  if (join_fscid != FS_CLUSTER_ID_NONE)
+    o << " join_fscid=" << join_fscid;
+
+  o << " addr " << addrs << " compat ";
+  compat.printlite(o);
+  o << "]";
+}
+
+// Append two heap-allocated samples to `ls`: a default-constructed one and
+// one with gid, name and rank filled in.  The pointers are handed to `ls`.
+void MDSMap::mds_info_t::generate_test_instances(std::list<mds_info_t*>& ls)
+{
+  ls.push_back(new mds_info_t());
+
+  mds_info_t *named = new mds_info_t();
+  named->global_id = 1;
+  named->name = "test_instance";
+  named->rank = 0;
+  ls.push_back(named);
+}
+
+/**
+ * Emit the map as fields of the formatter's current section.
+ *
+ * The "up" section holds one "mds_<rank>" key per up rank (value: gid); the
+ * "info" section holds one "gid_<gid>" object per known daemon.
+ *
+ * @param f formatter to write to; must not be null.
+ */
+void MDSMap::dump(Formatter *f) const
+{
+  f->dump_int("epoch", epoch);
+  f->dump_unsigned("flags", flags);
+  f->dump_unsigned("ever_allowed_features", ever_allowed_features);
+  f->dump_unsigned("explicitly_allowed_features", explicitly_allowed_features);
+  f->dump_stream("created") << created;
+  f->dump_stream("modified") << modified;
+  f->dump_int("tableserver", tableserver);
+  f->dump_int("root", root);
+  f->dump_int("session_timeout", session_timeout);
+  f->dump_int("session_autoclose", session_autoclose);
+  f->open_object_section("required_client_features");
+  cephfs_dump_features(f, required_client_features);
+  f->close_section();
+  f->dump_int("max_file_size", max_file_size);
+  f->dump_int("last_failure", last_failure);
+  f->dump_int("last_failure_osd_epoch", last_failure_osd_epoch);
+  f->open_object_section("compat");
+  compat.dump(f);
+  f->close_section();
+  f->dump_int("max_mds", max_mds);
+  f->open_array_section("in");
+  for (const auto& rank : in)
+    f->dump_int("mds", rank);
+  f->close_section();
+  f->open_object_section("up");
+  for (const auto& [rank, gid] : up) {
+    // "mds_" + up to 11 characters for a 32-bit int + '\0'.  snprintf keeps
+    // the write inside the buffer whatever the value is.
+    char s[16];
+    snprintf(s, sizeof(s), "mds_%d", int(rank));
+    f->dump_int(s, gid);
+  }
+  f->close_section();
+  f->open_array_section("failed");
+  for (const auto& rank : failed)
+    f->dump_int("mds", rank);
+  f->close_section();
+  f->open_array_section("damaged");
+  for (const auto& rank : damaged)
+    f->dump_int("mds", rank);
+  f->close_section();
+  f->open_array_section("stopped");
+  for (const auto& rank : stopped)
+    f->dump_int("mds", rank);
+  f->close_section();
+  f->open_object_section("info");
+  for (const auto& [gid, info] : mds_info) {
+    char s[25]; // 'gid_' + len(str(ULLONG_MAX)) + '\0'
+    snprintf(s, sizeof(s), "gid_%llu", (long long unsigned)gid);
+    f->open_object_section(s);
+    info.dump(f);
+    f->close_section();
+  }
+  f->close_section();
+  f->open_array_section("data_pools");
+  for (const auto& p: data_pools)
+    f->dump_int("pool", p);
+  f->close_section();
+  f->dump_int("metadata_pool", metadata_pool);
+  f->dump_bool("enabled", enabled);
+  f->dump_string("fs_name", fs_name);
+  f->dump_string("balancer", balancer);
+  // negative values are reported as 0
+  f->dump_int("standby_count_wanted", std::max(0, standby_count_wanted));
+}
+
+// Append one heap-allocated sample MDSMap to `ls`.
+void MDSMap::generate_test_instances(std::list<MDSMap*>& ls)
+{
+  auto *sample = new MDSMap();
+
+  sample->max_mds = 1;
+  sample->compat = get_compat_set_all();
+
+  // pools
+  sample->data_pools.push_back(0);
+  sample->metadata_pool = 1;
+  sample->cas_pool = 2;
+
+  // these aren't the defaults, just in case anybody gets confused
+  sample->session_timeout = 61;
+  sample->session_autoclose = 301;
+  sample->max_file_size = 1<<24;
+
+  ls.push_back(sample);
+}
+
+// Write the map as tab-separated "key<TAB>value" lines, followed by one
+// line per known daemon ordered by (rank, inc-1).
+void MDSMap::print(ostream& out) const
+{
+  out << "fs_name\t" << fs_name << "\n"
+      << "epoch\t" << epoch << "\n"
+      << "flags\t" << hex << flags << dec << "\n"
+      << "created\t" << created << "\n"
+      << "modified\t" << modified << "\n"
+      << "tableserver\t" << tableserver << "\n"
+      << "root\t" << root << "\n";
+  out << "session_timeout\t" << session_timeout << "\n";
+  out << "session_autoclose\t" << session_autoclose << "\n";
+  out << "max_file_size\t" << max_file_size << "\n";
+  out << "required_client_features\t" << cephfs_stringify_features(required_client_features) << "\n";
+  out << "last_failure\t" << last_failure << "\n";
+  out << "last_failure_osd_epoch\t" << last_failure_osd_epoch << "\n";
+  out << "compat\t" << compat << "\n"
+      << "max_mds\t" << max_mds << "\n";
+  out << "in\t" << in << "\n";
+  out << "up\t" << up << "\n";
+  out << "failed\t" << failed << "\n";
+  out << "damaged\t" << damaged << "\n";
+  out << "stopped\t" << stopped << "\n";
+  out << "data_pools\t" << data_pools << "\n"
+      << "metadata_pool\t" << metadata_pool << "\n"
+      << "inline_data\t" << (inline_data_enabled ? "enabled" : "disabled") << "\n"
+      << "balancer\t" << balancer << "\n"
+      << "standby_count_wanted\t" << std::max(0, standby_count_wanted) << "\n";
+
+  // Index the daemons by (rank, inc-1) so they print in that order.
+  multimap< pair<mds_rank_t, unsigned>, mds_gid_t > ordered;
+  for (const auto& [gid, info] : mds_info) {
+    ordered.insert(std::make_pair(
+          std::make_pair(info.rank, info.inc-1), gid));
+  }
+
+  for (const auto& entry : ordered) {
+    out << mds_info.at(entry.second) << "\n";
+  }
+}
+
+/**
+ * Summarise the map either through a formatter or as text.
+ *
+ * @param f   if non-null, structured output goes here and `out` is unused
+ * @param out text destination, dereferenced only when `f` is null
+ */
+void MDSMap::print_summary(Formatter *f, ostream *out) const
+{
+  map<mds_rank_t,string> by_rank;
+  map<string,int> by_state;
+
+  if (f) {
+    f->dump_unsigned("epoch", get_epoch());
+    f->dump_unsigned("up", up.size());
+    f->dump_unsigned("in", in.size());
+    f->dump_unsigned("max", max_mds);
+    f->open_array_section("by_rank");
+  } else {
+    *out << "e" << get_epoch() << ": " << up.size() << "/" << in.size() << "/" << max_mds << " up";
+  }
+
+  for (const auto& entry : mds_info) {
+    const auto& info = entry.second;
+
+    string status = ceph_mds_state_name(info.state);
+    if (info.laggy())
+      status += "(laggy or crashed)";
+
+    // Daemons without a rank, and standby-replay daemons, are only counted
+    // per state.
+    const bool listed_by_rank =
+      info.rank >= 0 && info.state != MDSMap::STATE_STANDBY_REPLAY;
+    if (!listed_by_rank) {
+      ++by_state[status];
+      continue;
+    }
+
+    if (f) {
+      f->open_object_section("mds");
+      f->dump_unsigned("rank", info.rank);
+      f->dump_string("name", info.name);
+      f->dump_string("status", status);
+      f->close_section();
+    } else {
+      by_rank[info.rank] = info.name + "=" + status;
+    }
+  }
+
+  if (f) {
+    f->close_section();
+  } else if (!by_rank.empty()) {
+    *out << " " << by_rank;
+  }
+
+  // Per-state counts, in reverse key order.
+  for (auto p = by_state.rbegin(); p != by_state.rend(); ++p) {
+    if (f)
+      f->dump_unsigned(p->first.c_str(), p->second);
+    else
+      *out << ", " << p->second << " " << p->first;
+  }
+
+  if (!failed.empty()) {
+    if (f)
+      f->dump_unsigned("failed", failed.size());
+    else
+      *out << ", " << failed.size() << " failed";
+  }
+
+  if (!damaged.empty()) {
+    if (f)
+      f->dump_unsigned("damaged", damaged.size());
+    else
+      *out << ", " << damaged.size() << " damaged";
+  }
+  // stopped ranks are deliberately not reported here
+}
+
+/**
+ * Append health summary lines (and, optionally, per-item detail lines).
+ *
+ * @param summary receives one entry per detected condition
+ * @param detail  if non-null, receives the per-rank / per-daemon entries
+ */
+void MDSMap::get_health(list<pair<health_status_t,string> >& summary,
+			list<pair<health_status_t,string> > *detail) const
+{
+  // Failed ranks
+  if (!failed.empty()) {
+    const bool several = failed.size() > 1;
+    CachedStackStringStream msg;
+    *msg << "mds rank" << (several ? "s " : " ") << failed
+         << (several ? " have" : " has") << " failed";
+    summary.emplace_back(HEALTH_ERR, msg->str());
+    if (detail) {
+      for (const auto& rank : failed) {
+        CachedStackStringStream line;
+        *line << "mds." << rank << " has failed";
+        detail->emplace_back(HEALTH_ERR, line->str());
+      }
+    }
+  }
+
+  // Damaged ranks
+  if (!damaged.empty()) {
+    const bool several = damaged.size() > 1;
+    CachedStackStringStream msg;
+    *msg << "mds rank" << (several ? "s " : " ") << damaged
+         << (several ? " are" : " is") << " damaged";
+    summary.emplace_back(HEALTH_ERR, msg->str());
+    if (detail) {
+      for (const auto& rank : damaged) {
+        CachedStackStringStream line;
+        *line << "mds." << rank << " is damaged";
+        detail->emplace_back(HEALTH_ERR, line->str());
+      }
+    }
+  }
+
+  // Degraded cluster: name each up rank that is still recovering
+  if (is_degraded()) {
+    summary.emplace_back(HEALTH_WARN, "mds cluster is degraded");
+    if (detail) {
+      detail->emplace_back(HEALTH_WARN, "mds cluster is degraded");
+      for (mds_rank_t rank = mds_rank_t(0); rank < get_max_mds(); rank++) {
+        if (!is_up(rank))
+          continue;
+        const auto& info = mds_info.at(up.find(rank)->second);
+        CachedStackStringStream line;
+        // Streams the common prefix followed by the given state text.
+        auto describe = [&](const char *activity) {
+          *line << "mds." << info.name << " at " << info.addrs
+                << " rank " << rank << activity;
+        };
+        if (is_resolve(rank))
+          describe(" is resolving");
+        if (is_replay(rank))
+          describe(" is replaying journal");
+        if (is_rejoin(rank))
+          describe(" is rejoining");
+        if (is_reconnect(rank))
+          describe(" is reconnecting to clients");
+        if (line->strv().length())
+          detail->emplace_back(HEALTH_WARN, line->str());
+      }
+    }
+  }
+
+  // NOTE(review): this entry is added unconditionally, so every call
+  // reports a HEALTH_WARN "<fs_name> max_mds <n>" line -- confirm that this
+  // is intended.
+  {
+    CachedStackStringStream msg;
+    *msg << fs_name << " max_mds " << max_mds;
+    summary.emplace_back(HEALTH_WARN, msg->str());
+  }
+
+  // Fewer up ranks than max_mds
+  if ((mds_rank_t)up.size() < max_mds) {
+    CachedStackStringStream msg;
+    *msg << fs_name << " has " << up.size()
+         << " active MDS(s), but has max_mds of " << max_mds;
+    summary.emplace_back(HEALTH_WARN, msg->str());
+  }
+
+  // Laggy daemons among the up ranks
+  set<string> laggy;
+  for (const auto& up_entry : up) {
+    const auto& info = mds_info.at(up_entry.second);
+    if (!info.laggy())
+      continue;
+    laggy.insert(info.name);
+    if (detail) {
+      CachedStackStringStream line;
+      *line << "mds." << info.name << " at " << info.addrs
+            << " is laggy/unresponsive";
+      detail->emplace_back(HEALTH_WARN, line->str());
+    }
+  }
+
+  if (!laggy.empty()) {
+    CachedStackStringStream msg;
+    *msg << "mds " << laggy
+         << ((laggy.size() > 1) ? " are" : " is")
+         << " laggy";
+    summary.emplace_back(HEALTH_WARN, msg->str());
+  }
+
+  // Multiple active ranks together with possibly old snapshots
+  if (get_max_mds() > 1 &&
+      was_snaps_ever_allowed() && !allows_multimds_snaps()) {
+    CachedStackStringStream msg;
+    *msg << "multi-active mds while there are snapshots possibly created by pre-mimic MDS";
+    summary.emplace_back(HEALTH_WARN, msg->str());
+  }
+}
+
+/**
+ * Add this filesystem's health checks to `checks`.
+ *
+ * Checks emitted: MDS_DAMAGE, FS_DEGRADED, MDS_UP_LESS_THAN_MAX,
+ * MDS_ALL_DOWN, MULTIMDS_WITH_OLDSNAPS and FS_INLINE_DATA_DEPRECATED, each
+ * only when its condition holds.
+ *
+ * @param checks destination map; must not be null.
+ */
+void MDSMap::get_health_checks(health_check_map_t *checks) const
+{
+  // MDS_DAMAGE
+  if (!damaged.empty()) {
+    health_check_t& check = checks->get_or_add("MDS_DAMAGE", HEALTH_ERR,
+					       "%num% mds daemon%plurals% damaged",
+					       damaged.size());
+    for (const auto& p : damaged) {
+      CachedStackStringStream css;
+      *css << "fs " << fs_name << " mds." << p << " is damaged";
+      check.detail.push_back(css->str());
+    }
+  }
+
+  // FS_DEGRADED
+  if (is_degraded()) {
+    health_check_t& fscheck = checks->get_or_add(
+      "FS_DEGRADED", HEALTH_WARN,
+      "%num% filesystem%plurals% %isorare% degraded", 1);
+    CachedStackStringStream css;
+    *css << "fs " << fs_name << " is degraded";
+    fscheck.detail.push_back(css->str());
+    // A per-rank description used to be assembled here into a local list
+    // that was never attached to any check; that dead work has been removed.
+    // The emitted health check is unchanged.
+  }
+
+  // MDS_UP_LESS_THAN_MAX
+  if ((mds_rank_t)get_num_in_mds() < get_max_mds()) {
+    health_check_t& check = checks->add(
+      "MDS_UP_LESS_THAN_MAX", HEALTH_WARN,
+      "%num% filesystem%plurals% %isorare% online with fewer MDS than max_mds", 1);
+    CachedStackStringStream css;
+    *css << "fs " << fs_name << " has " << get_num_in_mds()
+         << " MDS online, but wants " << get_max_mds();
+    check.detail.push_back(css->str());
+  }
+
+  // MDS_ALL_DOWN
+  if ((mds_rank_t)get_num_up_mds() == 0 && get_max_mds() > 0) {
+    health_check_t &check = checks->add(
+      "MDS_ALL_DOWN", HEALTH_ERR,
+      "%num% filesystem%plurals% %isorare% offline", 1);
+    CachedStackStringStream css;
+    *css << "fs " << fs_name << " is offline because no MDS is active for it.";
+    check.detail.push_back(css->str());
+  }
+
+  // MULTIMDS_WITH_OLDSNAPS
+  if (get_max_mds() > 1 &&
+      was_snaps_ever_allowed() && !allows_multimds_snaps()) {
+    health_check_t &check = checks->add(
+      "MULTIMDS_WITH_OLDSNAPS", HEALTH_ERR,
+      "%num% filesystem%plurals% %isorare% multi-active mds with old snapshots", 1);
+    CachedStackStringStream css;
+    *css << "multi-active mds while there are snapshots possibly created by pre-mimic MDS";
+    check.detail.push_back(css->str());
+  }
+
+  // FS_INLINE_DATA_DEPRECATED
+  if (get_inline_data_enabled()) {
+    health_check_t &check = checks->add(
+      "FS_INLINE_DATA_DEPRECATED", HEALTH_WARN,
+      "%num% filesystem%plurals% with deprecated feature inline_data", 1);
+    CachedStackStringStream css;
+    *css << "fs " << fs_name << " has deprecated feature inline_data enabled.";
+    check.detail.push_back(css->str());
+  }
+}
+
+/**
+ * Encode this mds_info_t inside an ENCODE_START/ENCODE_FINISH envelope.
+ *
+ * Encodes as version 10, or as version 7 when `features` lacks
+ * SERVER_NAUTILUS.  The field order here must match decode().
+ *
+ * @param bl       buffer to append to
+ * @param features feature bits used to choose the version and address form
+ */
+void MDSMap::mds_info_t::encode_versioned(bufferlist& bl, uint64_t features) const
+{
+  __u8 v = 10;
+  if (!HAVE_FEATURE(features, SERVER_NAUTILUS)) {
+    v = 7;
+  }
+  // second argument is presumably the oldest decoder version that can still
+  // read this encoding -- TODO confirm against ENCODE_START
+  ENCODE_START(v, 4, bl);
+  encode(global_id, bl);
+  encode(name, bl);
+  encode(rank, bl);
+  encode(inc, bl);
+  encode((int32_t)state, bl);
+  encode(state_seq, bl);
+  // Versions below 8 carry only the single legacy address.
+  if (v < 8) {
+    encode(addrs.legacy_addr(), bl, features);
+  } else {
+    encode(addrs, bl, features);
+  }
+  encode(laggy_since, bl);
+  // The next two values are fixed placeholders for fields that decode()
+  // reads and discards.
+  encode(MDS_RANK_NONE, bl); /* standby_for_rank */
+  encode(std::string(), bl); /* standby_for_name */
+  encode(export_targets, bl);
+  encode(mds_features, bl);
+  encode(join_fscid, bl); /* formerly: standby_for_fscid */
+  // placeholder bool; decode() reads it as standby_replay and discards it
+  encode(false, bl);
+  if (v >= 9) {
+    encode(flags, bl);
+  }
+  if (v >= 10) {
+    encode(compat, bl);
+  }
+  ENCODE_FINISH(bl);
+}
+
+/**
+ * Encode this mds_info_t in the bare struct_v 3 layout: a single version
+ * byte followed by the fields, with no ENCODE_START/ENCODE_FINISH envelope.
+ * Fields after export_targets (features, join_fscid, flags, compat) are
+ * not written.
+ */
+void MDSMap::mds_info_t::encode_unversioned(bufferlist& bl) const
+{
+  __u8 struct_v = 3;
+  using ceph::encode;
+  encode(struct_v, bl);
+  encode(global_id, bl);
+  encode(name, bl);
+  encode(rank, bl);
+  encode(inc, bl);
+  encode((int32_t)state, bl);
+  encode(state_seq, bl);
+  // only the legacy address, encoded with an empty feature set
+  encode(addrs.legacy_addr(), bl, 0);
+  encode(laggy_since, bl);
+  // fixed placeholders; decode() reads these two as standby_for_rank and
+  // standby_for_name and discards them
+  encode(MDS_RANK_NONE, bl);
+  encode(std::string(), bl);
+  encode(export_targets, bl);
+}
+
+/**
+ * Decode an mds_info_t written by encode_versioned() or
+ * encode_unversioned().
+ *
+ * Fields are read in encoding order; those introduced in a later struct_v
+ * are only read when the encoded version is new enough, and are otherwise
+ * left untouched -- except compat, which falls back to
+ * get_compat_set_v16_2_4().
+ */
+void MDSMap::mds_info_t::decode(bufferlist::const_iterator& bl)
+{
+  // presumably accepts both the enveloped and the bare legacy layout --
+  // TODO confirm against DECODE_START_LEGACY_COMPAT_LEN
+  DECODE_START_LEGACY_COMPAT_LEN(10, 4, 4, bl);
+  decode(global_id, bl);
+  decode(name, bl);
+  decode(rank, bl);
+  decode(inc, bl);
+  // state travels as a plain int32_t
+  int32_t raw_state;
+  decode(raw_state, bl);
+  state = (MDSMap::DaemonState)raw_state;
+  decode(state_seq, bl);
+  decode(addrs, bl);
+  decode(laggy_since, bl);
+  // The next two fields are consumed to keep the stream aligned and then
+  // dropped.
+  {
+    mds_rank_t standby_for_rank;
+    decode(standby_for_rank, bl);
+  }
+  {
+    std::string standby_for_name;
+    decode(standby_for_name, bl);
+  }
+  if (struct_v >= 2)
+    decode(export_targets, bl);
+  if (struct_v >= 5)
+    decode(mds_features, bl);
+  if (struct_v >= 6) {
+    decode(join_fscid, bl);
+  }
+  if (struct_v >= 7) {
+    // consumed and dropped, like the standby_for_* fields above
+    bool standby_replay;
+    decode(standby_replay, bl);
+  }
+  if (struct_v >= 9) {
+    decode(flags, bl);
+  }
+  if (struct_v >= 10) {
+    decode(compat, bl);
+  } else {
+    // older encodings carry no compat set; assume the pre-v16.2.5 one
+    compat = MDSMap::get_compat_set_v16_2_4();
+  }
+  DECODE_FINISH(bl);
+}
+
+std::string MDSMap::mds_info_t::human_name() const
+{
+  // Long-form daemon label for cluster log text, as in
+  // "daemon mds.myhost restarted" or "Activating daemon mds.myhost".
+  const char prefix[] = "daemon mds.";
+  return prefix + name;
+}
+
+void MDSMap::encode(bufferlist& bl, uint64_t features) const
+{
+  std::map<mds_rank_t,int32_t> inc;  // Legacy field, fake it so that
+                                     // old-mon peers have something sane
+                                     // during upgrade
+  for (const auto rank : in) {
+    inc.insert(std::make_pair(rank, epoch));  // every "in" rank reports the current epoch
+  }
+
+  using ceph::encode;
+  if ((features & CEPH_FEATURE_PGID64) == 0) {  // peer lacks PGID64: unframed v2 layout
+    __u16 v = 2;
+    encode(v, bl);
+    encode(epoch, bl);
+    encode(flags, bl);
+    encode(last_failure, bl);
+    encode(root, bl);
+    encode(session_timeout, bl);
+    encode(session_autoclose, bl);
+    encode(max_file_size, bl);
+    encode(max_mds, bl);
+    __u32 n = mds_info.size();
+    encode(n, bl);
+    for (map<mds_gid_t, mds_info_t>::const_iterator i = mds_info.begin();
+	i != mds_info.end(); ++i) {
+      encode(i->first, bl);
+      encode(i->second, bl, features);
+    }
+    n = data_pools.size();
+    encode(n, bl);
+    for (const auto p: data_pools) {
+      n = p;  // pool id is narrowed to __u32 in this layout
+      encode(n, bl);
+    }
+
+    int32_t m = cas_pool;  // cas_pool goes out as int32_t in this layout
+    encode(m, bl);
+    return;
+  } else if ((features & CEPH_FEATURE_MDSENC) == 0) {  // peer lacks MDSENC: unframed v3 layout
+    __u16 v = 3;
+    encode(v, bl);
+    encode(epoch, bl);
+    encode(flags, bl);
+    encode(last_failure, bl);
+    encode(root, bl);
+    encode(session_timeout, bl);
+    encode(session_autoclose, bl);
+    encode(max_file_size, bl);
+    encode(max_mds, bl);
+    __u32 n = mds_info.size();
+    encode(n, bl);
+    for (map<mds_gid_t, mds_info_t>::const_iterator i = mds_info.begin();
+	i != mds_info.end(); ++i) {
+      encode(i->first, bl);
+      encode(i->second, bl, features);
+    }
+    encode(data_pools, bl);
+    encode(cas_pool, bl);
+
+    __u16 ev = 5;  // extended-section version written for this layout
+    encode(ev, bl);
+    encode(compat, bl);
+    encode(metadata_pool, bl);
+    encode(created, bl);
+    encode(modified, bl);
+    encode(tableserver, bl);
+    encode(in, bl);
+    encode(inc, bl);
+    encode(up, bl);
+    encode(failed, bl);
+    encode(stopped, bl);
+    encode(last_failure_osd_epoch, bl);
+    return;
+  }
+
+  ENCODE_START(5, 4, bl);  // current layout, framed by ENCODE_START/ENCODE_FINISH
+  encode(epoch, bl);
+  encode(flags, bl);
+  encode(last_failure, bl);
+  encode(root, bl);
+  encode(session_timeout, bl);
+  encode(session_autoclose, bl);
+  encode(max_file_size, bl);
+  encode(max_mds, bl);
+  encode(mds_info, bl, features);
+  encode(data_pools, bl);
+  encode(cas_pool, bl);
+
+  __u16 ev = 16;  // extended-section version; decode() gates the later fields on it
+  encode(ev, bl);
+  encode(compat, bl);
+  encode(metadata_pool, bl);
+  encode(created, bl);
+  encode(modified, bl);
+  encode(tableserver, bl);
+  encode(in, bl);
+  encode(inc, bl);
+  encode(up, bl);
+  encode(failed, bl);
+  encode(stopped, bl);
+  encode(last_failure_osd_epoch, bl);
+  encode(ever_allowed_features, bl);
+  encode(explicitly_allowed_features, bl);
+  encode(inline_data_enabled, bl);
+  encode(enabled, bl);
+  encode(fs_name, bl);
+  encode(damaged, bl);
+  encode(balancer, bl);
+  encode(standby_count_wanted, bl);
+  encode(old_max_mds, bl);
+  {
+    ceph_release_t min_compat_client = ceph_release_t::unknown;  // always "unknown"; decode() acts on it only for ev < 16
+    encode(min_compat_client, bl);
+  }
+  encode(required_client_features, bl);
+  ENCODE_FINISH(bl);
+}
+
+void MDSMap::sanitize(const std::function<bool(int64_t pool)>& pool_exists)
+{
+  /* Before stricter checking existed, a data pool could be removed while
+   * it was still listed in the MDSMap.  Once the data pools are decoded,
+   * drop every listed pool for which pool_exists() returns false.
+   */
+  auto cur = data_pools.begin();
+  while (cur != data_pools.end()) {
+    if (pool_exists(*cur)) {
+      ++cur;
+      continue;
+    }
+    dout(0) << "removed non-existant data pool " << *cur << " from MDSMap" << dendl;
+    cur = data_pools.erase(cur);
+  }
+}
+
+void MDSMap::decode(bufferlist::const_iterator& p)
+{
+  std::map<mds_rank_t,int32_t> inc;  // Legacy field, parse and drop
+
+  cached_up_features = 0;  // zero makes get_up_features() recompute for the new contents
+  DECODE_START_LEGACY_COMPAT_LEN_16(5, 4, 4, p);
+  decode(epoch, p);
+  decode(flags, p);
+  decode(last_failure, p);
+  decode(root, p);
+  decode(session_timeout, p);
+  decode(session_autoclose, p);
+  decode(max_file_size, p);
+  decode(max_mds, p);
+  decode(mds_info, p);
+  if (struct_v < 3) {  // legacy layout: pool ids were written as 32-bit values
+    __u32 n;
+    decode(n, p);
+    while (n--) {
+      __u32 m;
+      decode(m, p);
+      data_pools.push_back(m);  // NOTE(review): appends; data_pools is not cleared first on this path
+    }
+    __s32 s;
+    decode(s, p);
+    cas_pool = s;
+  } else {
+    decode(data_pools, p);
+    decode(cas_pool, p);
+  }
+
+  // kclient ignores everything from here
+  __u16 ev = 1;  // extended-section version; stays 1 when struct_v < 2
+  if (struct_v >= 2)
+    decode(ev, p);
+  if (ev >= 3)
+    decode(compat, p);
+  else
+    compat = get_compat_set_base();
+  if (ev < 5) {
+    __u32 n;  // metadata pool id was 32-bit before ev 5
+    decode(n, p);
+    metadata_pool = n;
+  } else {
+    decode(metadata_pool, p);
+  }
+  decode(created, p);
+  decode(modified, p);
+  decode(tableserver, p);
+  decode(in, p);
+  decode(inc, p);
+  decode(up, p);
+  decode(failed, p);
+  decode(stopped, p);
+  if (ev >= 4)
+    decode(last_failure_osd_epoch, p);
+  if (ev >= 6) {
+    if (ev < 10) {
+      // previously this was a bool about snaps, not a flag map
+      bool flag;
+      decode(flag, p);
+      ever_allowed_features = flag ? CEPH_MDSMAP_ALLOW_SNAPS : 0;
+      decode(flag, p);
+      explicitly_allowed_features = flag ? CEPH_MDSMAP_ALLOW_SNAPS : 0;
+    } else {
+      decode(ever_allowed_features, p);
+      decode(explicitly_allowed_features, p);
+    }
+  } else {
+    ever_allowed_features = 0;
+    explicitly_allowed_features = 0;
+  }
+  if (ev >= 7)
+    decode(inline_data_enabled, p);
+
+  if (ev >= 8) {
+    ceph_assert(struct_v >= 5);
+    decode(enabled, p);
+    decode(fs_name, p);
+  } else {
+    if (epoch > 1) {
+      // If an MDS has ever been started, epoch will be greater than 1,
+      // assume filesystem is enabled.
+      enabled = true;
+    } else {
+      // Upgrading from a cluster that never used an MDS, switch off
+      // filesystem until it's explicitly enabled.
+      enabled = false;
+    }
+  }
+
+  if (ev >= 9) {
+    decode(damaged, p);
+  }
+
+  if (ev >= 11) {
+    decode(balancer, p);
+  }
+
+  if (ev >= 12) {
+    decode(standby_count_wanted, p);
+  }
+
+  if (ev >= 13) {
+    decode(old_max_mds, p);
+  }
+
+  if (ev >= 14) {
+    ceph_release_t min_compat_client;
+    if (ev == 14) {
+      int8_t r;  // ev 14 stored the release as a signed byte; negative maps to unknown
+      decode(r, p);
+      if (r < 0) {
+	min_compat_client = ceph_release_t::unknown;
+      } else {
+	min_compat_client = ceph_release_t{static_cast<uint8_t>(r)};
+      }
+    } else if (ev >= 15) {
+      decode(min_compat_client, p);
+    }
+    if (ev >= 16) {
+      decode(required_client_features, p);
+    } else {
+      set_min_compat_client(min_compat_client);  // derive the feature set from the old release field
+    }
+  }
+
+  /* All MDS since at least v14.0.0 understand INLINE */
+  /* TODO: remove after R is released */
+  compat.incompat.insert(MDS_FEATURE_INCOMPAT_INLINE);
+
+  for (auto& p: mds_info) {  // NOTE: this p shadows the iterator parameter inside the loop
+    static const CompatSet empty;
+    auto& info = p.second;
+    if (empty.compare(info.compat) == 0) {
+      /* bootstrap old compat; mds_info_t::decode does not have access to MDSMap */
+      info.compat = compat;
+    }
+    /* All MDS since at least v14.0.0 understand INLINE */
+    /* TODO: remove after R is released */
+    info.compat.incompat.insert(MDS_FEATURE_INCOMPAT_INLINE);
+  }
+
+  DECODE_FINISH(p);
+}
+
+MDSMap::availability_t MDSMap::is_cluster_available() const
+{
+  // Epoch 0: seen from a client, this MDSMap instance was never actually
+  // initialized from the mons, so the client should wait for a real map.
+  if (epoch == 0)
+    return TRANSIENT_UNAVAILABLE;
+
+  // A rank marked damaged is unavailable until an operator intervenes.
+  if (!damaged.empty())
+    return STUCK_UNAVAILABLE;
+
+  // No ranks created at all: the filesystem is not initialized.
+  if (in.empty())
+    return STUCK_UNAVAILABLE;
+
+  // Any up rank whose daemon is laggy makes the cluster look stuck.
+  for (const auto& rank : in) {
+    const auto holder = up.find(rank);
+    if (holder == up.end()) {
+      // Rank is in but not up; there is no daemon to inspect for it.
+      continue;
+    }
+    if (mds_info.at(holder->second).laggy()) {
+      // Laggy might only be transient, but standbys are not visible in
+      // this map, so there is no way of knowing whether one is around to
+      // replace the laggy daemon.
+      return STUCK_UNAVAILABLE;
+    }
+  }
+
+  const bool anyone_active = get_num_mds(CEPH_MDS_STATE_ACTIVE) > 0;
+  if (anyone_active) {
+    // Nobody looks stuck, so tell the client to go ahead and try
+    // mounting.  This may include e.g. one MDS failing over while
+    // another is active: the client should start talking to the active
+    // one and let the transiently-unavailable one catch up later.
+    return AVAILABLE;
+  }
+
+  // Nothing indicates a stuck cluster, but nobody is active (yet).  That
+  // would ideally be TRANSIENT_UNAVAILABLE; however, because standbys are
+  // no longer part of the MDSMap, transient and stuck cannot be told
+  // apart reliably, so always say stuck so that the client doesn't block.
+  return STUCK_UNAVAILABLE;
+}
+
+bool MDSMap::state_transition_valid(DaemonState prev, DaemonState next)
+{
+  // Staying in place is always allowed, and so is being marked damaged.
+  if (next == prev || next == MDSMap::STATE_DAMAGED)
+    return true;
+
+  if (prev == MDSMap::STATE_BOOT)
+    return next == MDSMap::STATE_STANDBY;
+  if (prev == MDSMap::STATE_STANDBY) {
+    return next == MDSMap::STATE_STANDBY_REPLAY ||
+           next == MDSMap::STATE_REPLAY ||
+           next == MDSMap::STATE_CREATING ||
+           next == MDSMap::STATE_STARTING;
+  }
+  if (prev == MDSMap::STATE_CREATING || prev == MDSMap::STATE_STARTING)
+    return next == MDSMap::STATE_ACTIVE;
+  if (prev == MDSMap::STATE_STANDBY_REPLAY)
+    return next == MDSMap::STATE_REPLAY;
+  if (prev == MDSMap::STATE_REPLAY)
+    return next == MDSMap::STATE_RESOLVE || next == MDSMap::STATE_RECONNECT;
+
+  if (prev >= MDSMap::STATE_RESOLVE && prev < MDSMap::STATE_ACTIVE) {
+    // From resolve up to (but excluding) active, the only step normally
+    // allowed is to the next state in the sequence.  Rejoin is special:
+    // it may also go straight to active (no need to do client replay) or
+    // to stopped (no subtrees).
+    const bool leaving_rejoin = (prev == MDSMap::STATE_REJOIN);
+    if (leaving_rejoin &&
+        (next == MDSMap::STATE_ACTIVE || next == MDSMap::STATE_STOPPED))
+      return true;
+    return next == prev + 1;
+  }
+  if (prev == MDSMap::STATE_ACTIVE)
+    return next == MDSMap::STATE_STOPPING;
+  if (prev == MDSMap::STATE_STOPPING)
+    return next == MDSMap::STATE_STOPPED;
+
+  derr << __func__ << ": Unknown prev state "
+       << ceph_mds_state_name(prev) << "(" << prev << ")" << dendl;
+  return false;
+}
+
+bool MDSMap::check_health(mds_rank_t standby_daemon_count)
+{
+  /* Happens during health checks by the mons.  While standby_count_wanted
+   * is still unset (-1, the default), switch it to 1 once the FS has an
+   * active MDS and at least one standby is available or replaying.
+   * During initial creation of the FS there are no actives yet, so the
+   * default is left alone.  Returns true only when the value was changed.
+   */
+  std::set<mds_rank_t> replaying, serving;
+  get_standby_replay_mds_set(replaying);
+  get_active_mds_set(serving);
+  const mds_rank_t spare =
+    static_cast<mds_rank_t>(replaying.size()) + standby_daemon_count;
+
+  if (standby_count_wanted != -1 || serving.empty() || spare <= 0)
+    return false;
+
+  set_standby_count_wanted(1);
+  return true;
+}
+
+mds_gid_t MDSMap::find_mds_gid_by_name(std::string_view s) const {
+  // The first daemon (in gid order) whose name equals s wins.
+  for (auto it = mds_info.cbegin(); it != mds_info.cend(); ++it)
+    if (it->second.name == s)
+      return it->first;
+  // No daemon in this map carries that name.
+  return MDS_GID_NONE;
+}
+
+unsigned MDSMap::get_num_mds(int state) const {
+  // Counts the daemons in mds_info whose state equals the given value.
+  unsigned matches = 0;
+  for (const auto& entry : mds_info)
+    if (entry.second.state == state)
+      ++matches;
+  return matches;
+}
+
+void MDSMap::get_up_mds_set(std::set<mds_rank_t>& s) const {
+  // Adds every rank present as a key in `up`; whatever s already
+  // holds is kept.
+  for (const auto& assignment : up)
+    s.insert(assignment.first);
+}
+
+uint64_t MDSMap::get_up_features() {
+  // Zero doubles as "not computed yet"; anything else is a cached answer.
+  if (cached_up_features)
+    return cached_up_features;
+  // Intersect mds_features across the daemons holding an up rank.  With
+  // no up ranks the cache is left at zero.
+  bool seeded = false;
+  for (const auto& assignment : up) {
+    const auto holder = mds_info.find(assignment.second);
+    ceph_assert(holder != mds_info.end());
+    if (seeded) {
+      cached_up_features &= holder->second.mds_features;
+    } else {
+      cached_up_features = holder->second.mds_features;
+      seeded = true;
+    }
+  }
+  return cached_up_features;
+}
+
+void MDSMap::get_recovery_mds_set(std::set<mds_rank_t>& s) const {
+  // s := failed + damaged + ranks of daemons in [STATE_REPLAY, STATE_STOPPING].
+  s = failed;
+  s.insert(damaged.begin(), damaged.end());
+  for (const auto& entry : mds_info)
+    if (STATE_REPLAY <= entry.second.state && entry.second.state <= STATE_STOPPING)
+      s.insert(entry.second.rank);
+}
+
+void MDSMap::get_mds_set_lower_bound(std::set<mds_rank_t>& s, DaemonState first) const {
+  // Adds the rank of each daemon whose state lies within
+  // [first, STATE_STOPPING]; s is not cleared beforehand.
+  for (const auto& entry : mds_info)
+    if (first <= entry.second.state && entry.second.state <= STATE_STOPPING)
+      s.insert(entry.second.rank);
+}
+
+void MDSMap::get_mds_set(std::set<mds_rank_t>& s, DaemonState state) const {
+  // Adds the rank of each daemon that is exactly in the given state;
+  // s is not cleared beforehand.
+  for (const auto& entry : mds_info)
+    if (entry.second.state == state)
+      s.insert(entry.second.rank);
+}
+
+mds_gid_t MDSMap::get_standby_replay(mds_rank_t r) const {
+  // First daemon (in gid order) that is in standby-replay for rank r.
+  for (auto it = mds_info.begin(); it != mds_info.end(); ++it)
+    if (it->second.state == STATE_STANDBY_REPLAY && it->second.rank == r)
+      return it->first;
+  // Rank r has no standby-replay follower in this map.
+  return MDS_GID_NONE;
+}
+
+bool MDSMap::is_degraded() const {
+  // Any failed or damaged rank, or any degraded daemon, degrades the map.
+  if (failed.size() > 0 || damaged.size() > 0)
+    return true;
+  for (auto it = mds_info.begin(); it != mds_info.end(); ++it)
+    if (it->second.is_degraded())
+      return true;
+  return false;
+}
+
+void MDSMap::set_min_compat_client(ceph_release_t version)
+{
+  // Baseline MDS-required bits plus the bit of the newest release reached.
+  const auto reaches = [&version](ceph_release_t r) { return version >= r; };
+  vector<size_t> wanted = CEPHFS_FEATURES_MDS_REQUIRED;
+  if (reaches(ceph_release_t::octopus))
+    wanted.push_back(CEPHFS_FEATURE_OCTOPUS);
+  else if (reaches(ceph_release_t::nautilus))
+    wanted.push_back(CEPHFS_FEATURE_NAUTILUS);
+  else if (reaches(ceph_release_t::mimic))
+    wanted.push_back(CEPHFS_FEATURE_MIMIC);
+  else if (reaches(ceph_release_t::luminous))
+    wanted.push_back(CEPHFS_FEATURE_LUMINOUS);
+  else if (reaches(ceph_release_t::kraken))
+    wanted.push_back(CEPHFS_FEATURE_KRAKEN);
+  else if (reaches(ceph_release_t::jewel))
+    wanted.push_back(CEPHFS_FEATURE_JEWEL);
+  std::sort(wanted.begin(), wanted.end());
+  required_client_features = feature_bitset_t(wanted);
+}
diff --git a/src/mds/MDSMap.h b/src/mds/MDSMap.h
new file mode 100644
index 000000000..3b2ad1bfd
--- /dev/null
+++ b/src/mds/MDSMap.h
@@ -0,0 +1,652 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software 
+ * Foundation.  See file COPYING.
+ * 
+ */
+
+#ifndef CEPH_MDSMAP_H
+#define CEPH_MDSMAP_H
+
+#include <algorithm>
+#include <map>
+#include <set>
+#include <string>
+#include <string_view>
+
+#include <errno.h>
+
+#include "include/types.h"
+#include "include/ceph_features.h"
+#include "include/health.h"
+#include "include/CompatSet.h"
+#include "include/common_fwd.h"
+
+#include "common/Clock.h"
+#include "common/Formatter.h"
+#include "common/ceph_releases.h"
+#include "common/config.h"
+
+#include "mds/mdstypes.h"
+#include "mds/cephfs_features.h"
+
+static inline const auto MDS_FEATURE_INCOMPAT_BASE = CompatSet::Feature(1, "base v0.20");  // each constant below pairs a numeric id (1..10) with a display name
+static inline const auto MDS_FEATURE_INCOMPAT_CLIENTRANGES = CompatSet::Feature(2, "client writeable ranges");
+static inline const auto MDS_FEATURE_INCOMPAT_FILELAYOUT = CompatSet::Feature(3, "default file layouts on dirs");
+static inline const auto MDS_FEATURE_INCOMPAT_DIRINODE = CompatSet::Feature(4, "dir inode in separate object");
+static inline const auto MDS_FEATURE_INCOMPAT_ENCODING = CompatSet::Feature(5, "mds uses versioned encoding");
+static inline const auto MDS_FEATURE_INCOMPAT_OMAPDIRFRAG = CompatSet::Feature(6, "dirfrag is stored in omap");
+static inline const auto MDS_FEATURE_INCOMPAT_INLINE = CompatSet::Feature(7, "mds uses inline data");
+static inline const auto MDS_FEATURE_INCOMPAT_NOANCHOR = CompatSet::Feature(8, "no anchor table");
+static inline const auto MDS_FEATURE_INCOMPAT_FILE_LAYOUT_V2 = CompatSet::Feature(9, "file layout v2");
+static inline const auto MDS_FEATURE_INCOMPAT_SNAPREALM_V2 = CompatSet::Feature(10, "snaprealm v2");
+
+#define MDS_FS_NAME_DEFAULT "cephfs"
+
+class health_check_map_t;  // forward declaration; only used by pointer in get_health_checks()
+
+class MDSMap {
+public:
+  /* These states are the union of the set of possible states of an MDS daemon,
+   * and the set of possible states of an MDS rank. See
+   * doc/cephfs/mds-states.rst for state descriptions and a visual state diagram, and
+   * doc/cephfs/mds-state-diagram.dot to update the diagram.
+   */
+  typedef enum {
+    // States of an MDS daemon not currently holding a rank
+    // ====================================================
+    STATE_NULL     =   CEPH_MDS_STATE_NULL,                                  // null value for fns returning this type.
+    STATE_BOOT     =   CEPH_MDS_STATE_BOOT,                // up, boot announcement.  destiny unknown.
+    STATE_STANDBY  =   CEPH_MDS_STATE_STANDBY,             // up, idle.  waiting for assignment by monitor.
+
+    // States of an MDS rank, and of any MDS daemon holding that rank
+    // ==============================================================
+    STATE_STANDBY_REPLAY = CEPH_MDS_STATE_STANDBY_REPLAY,  // up, replaying active node, ready to take over and not serving clients. Note: Up to two MDS hold the rank being replayed.
+    STATE_STOPPED  =   CEPH_MDS_STATE_STOPPED,        // down, once existed, but no subtrees. empty log.  may not be held by a daemon.
+
+    STATE_CREATING  =  CEPH_MDS_STATE_CREATING,       // up, creating MDS instance (new journal, idalloc..).
+    STATE_STARTING  =  CEPH_MDS_STATE_STARTING,       // up, starting prior stopped MDS instance.
+
+    STATE_REPLAY    =  CEPH_MDS_STATE_REPLAY,         // up, starting prior failed instance. scanning journal.
+    STATE_RESOLVE   =  CEPH_MDS_STATE_RESOLVE,        // up, disambiguating distributed operations (import, rename, etc.)
+    STATE_RECONNECT =  CEPH_MDS_STATE_RECONNECT,      // up, reconnect to clients
+    STATE_REJOIN    =  CEPH_MDS_STATE_REJOIN,         // up, replayed journal, rejoining distributed cache
+    STATE_CLIENTREPLAY = CEPH_MDS_STATE_CLIENTREPLAY, // up, client replay; counted as degraded by mds_info_t::is_degraded()
+    STATE_ACTIVE =     CEPH_MDS_STATE_ACTIVE,         // up, active
+    STATE_STOPPING  =  CEPH_MDS_STATE_STOPPING,       // up, exporting metadata (-> standby or out)
+    STATE_DNE       =  CEPH_MDS_STATE_DNE,             // down, rank does not exist
+
+    // State which a daemon may send to MDSMonitor in its beacon
+    // to indicate that offline repair is required.  Daemon must stop
+    // immediately after indicating this state.
+    STATE_DAMAGED   = CEPH_MDS_STATE_DAMAGED
+
+    /*
+     * In addition to explicit states, an MDS rank is implicitly in state:
+     *  - STOPPED if it is not currently associated with an MDS daemon gid but it
+     *    is in MDSMap::stopped
+     *  - FAILED if it is not currently associated with an MDS daemon gid but it
+     *    is in MDSMap::failed
+     *  - DNE if it is not currently associated with an MDS daemon gid and it is
+     *    missing from both MDSMap::failed and MDSMap::stopped
+     */
+  } DaemonState;
+
+  typedef enum  // result type of is_cluster_available()
+  {
+    AVAILABLE = 0,  // some daemon is active and nothing looks stuck
+    TRANSIENT_UNAVAILABLE = 1,  // clients should wait for an updated map
+    STUCK_UNAVAILABLE = 2  // no way seen for the cluster to recover on its own
+
+  } availability_t;
+
+  struct mds_info_t {  // per-daemon record; MDSMap keeps these in mds_info keyed by gid
+    enum mds_flags : uint64_t {
+      FROZEN = 1 << 0,  // bit toggled by freeze()/unfreeze()
+    };
+
+    mds_info_t() = default;
+
+    bool laggy() const { return !(laggy_since == utime_t()); }  // laggy iff laggy_since differs from a default utime_t
+    void clear_laggy() { laggy_since = utime_t(); }
+
+    bool is_degraded() const {
+      return STATE_REPLAY <= state && state <= STATE_CLIENTREPLAY;  // inclusive range check on the enum value
+    }
+
+    void freeze() { flags |= mds_flags::FROZEN; }
+    void unfreeze() { flags &= ~mds_flags::FROZEN; }
+    bool is_frozen() const { return flags&mds_flags::FROZEN; }
+
+    const entity_addrvec_t& get_addrs() const {
+      return addrs;
+    }
+
+    void encode(ceph::buffer::list& bl, uint64_t features) const {
+      if ((features & CEPH_FEATURE_MDSENC) == 0 ) encode_unversioned(bl);  // peer lacks MDSENC: unversioned layout
+      else encode_versioned(bl, features);
+    }
+    void decode(ceph::buffer::list::const_iterator& p);
+    void dump(ceph::Formatter *f) const;
+    void dump(std::ostream&) const;
+
+    // The long form name for use in cluster log messages
+    std::string human_name() const;
+
+    static void generate_test_instances(std::list<mds_info_t*>& ls);
+
+    mds_gid_t global_id = MDS_GID_NONE;
+    std::string name;
+    mds_rank_t rank = MDS_RANK_NONE;
+    int32_t inc = 0;
+    MDSMap::DaemonState state = STATE_STANDBY;
+    version_t state_seq = 0;
+    entity_addrvec_t addrs;
+    utime_t laggy_since;  // default-constructed value means "not laggy" (see laggy())
+    std::set<mds_rank_t> export_targets;
+    fs_cluster_id_t join_fscid = FS_CLUSTER_ID_NONE;
+    uint64_t mds_features = 0;
+    uint64_t flags = 0;  // bitmask of mds_flags values
+    CompatSet compat;
+  private:
+    void encode_versioned(ceph::buffer::list& bl, uint64_t features) const;
+    void encode_unversioned(ceph::buffer::list& bl) const;
+  };
+
+  friend class MDSMonitor;
+  friend class Filesystem;
+  friend class FSMap;
+
+  static CompatSet get_compat_set_all();
+  static CompatSet get_compat_set_default();
+  static CompatSet get_compat_set_base(); // pre v0.20
+  static CompatSet get_compat_set_v16_2_4(); // pre-v16.2.5 CompatSet in MDS beacon
+
+  static MDSMap create_null_mdsmap() {
+    /* Default map stamped with the largest epoch, so it's always bigger than whatever the MDS has. */
+    MDSMap placeholder;
+    placeholder.epoch = std::numeric_limits<decltype(placeholder.epoch)>::max();
+    return placeholder;
+  }
+
+  bool get_inline_data_enabled() const { return inline_data_enabled; }
+  void set_inline_data_enabled(bool enabled) { inline_data_enabled = enabled; }  // the parameter shadows the `enabled` member here
+
+  utime_t get_session_timeout() const {
+    return utime_t(session_timeout,0);  // stored value wrapped in a utime_t; second ctor argument is 0
+  }
+  void set_session_timeout(uint32_t t) {
+    session_timeout = t;
+  }
+
+  utime_t get_session_autoclose() const {
+    return utime_t(session_autoclose, 0);  // stored value wrapped in a utime_t; second ctor argument is 0
+  }
+  void set_session_autoclose(uint32_t t) {
+    session_autoclose = t;
+  }
+
+  uint64_t get_max_filesize() const { return max_file_size; }
+  void set_max_filesize(uint64_t m) { max_file_size = m; }
+
+  void set_min_compat_client(ceph_release_t version);
+
+  void add_required_client_feature(size_t bit) {
+    required_client_features.insert(bit);  // marks feature bit `bit` as required
+  }
+  void remove_required_client_feature(size_t bit) {
+    required_client_features.erase(bit);  // drops feature bit `bit` from the required set
+  }
+  const auto& get_required_client_features() const {
+    return required_client_features;  // returned by reference to the member, not copied
+  }
+  
+  int get_flags() const { return flags; }
+  bool test_flag(int f) const { return flags & f; }  // true when any bit of f is set in flags
+  void set_flag(int f) { flags |= f; }
+  void clear_flag(int f) { flags &= ~f; }
+
+  std::string_view get_fs_name() const {return fs_name;}  // non-owning view of the fs_name member
+
+  void set_snaps_allowed() {
+    set_flag(CEPH_MDSMAP_ALLOW_SNAPS);
+    ever_allowed_features |= CEPH_MDSMAP_ALLOW_SNAPS;  // history bit; clear_snaps_allowed() leaves it set
+    explicitly_allowed_features |= CEPH_MDSMAP_ALLOW_SNAPS;
+  }
+  void clear_snaps_allowed() { clear_flag(CEPH_MDSMAP_ALLOW_SNAPS); }  // only the live flag is cleared
+  bool allows_snaps() const { return test_flag(CEPH_MDSMAP_ALLOW_SNAPS); }
+  bool was_snaps_ever_allowed() const { return ever_allowed_features & CEPH_MDSMAP_ALLOW_SNAPS; }
+
+  void set_standby_replay_allowed() {
+    set_flag(CEPH_MDSMAP_ALLOW_STANDBY_REPLAY);
+    ever_allowed_features |= CEPH_MDSMAP_ALLOW_STANDBY_REPLAY;  // history bit; the clear_ call leaves it set
+    explicitly_allowed_features |= CEPH_MDSMAP_ALLOW_STANDBY_REPLAY;
+  }
+  void clear_standby_replay_allowed() { clear_flag(CEPH_MDSMAP_ALLOW_STANDBY_REPLAY); }  // only the live flag is cleared
+  bool allows_standby_replay() const { return test_flag(CEPH_MDSMAP_ALLOW_STANDBY_REPLAY); }
+  bool was_standby_replay_ever_allowed() const { return ever_allowed_features & CEPH_MDSMAP_ALLOW_STANDBY_REPLAY; }
+
+  void set_multimds_snaps_allowed() {
+    set_flag(CEPH_MDSMAP_ALLOW_MULTIMDS_SNAPS);
+    ever_allowed_features |= CEPH_MDSMAP_ALLOW_MULTIMDS_SNAPS;  // history bit; the clear_ call leaves it set
+    explicitly_allowed_features |= CEPH_MDSMAP_ALLOW_MULTIMDS_SNAPS;
+  }
+  void clear_multimds_snaps_allowed() { clear_flag(CEPH_MDSMAP_ALLOW_MULTIMDS_SNAPS); }  // only the live flag is cleared
+  bool allows_multimds_snaps() const { return test_flag(CEPH_MDSMAP_ALLOW_MULTIMDS_SNAPS); }
+
+  epoch_t get_epoch() const { return epoch; }
+  void inc_epoch() { epoch++; }
+
+  bool get_enabled() const { return enabled; }
+
+  const utime_t& get_created() const { return created; }
+  void set_created(utime_t ct) { modified = created = ct; }  // also resets `modified` to the same time
+  const utime_t& get_modified() const { return modified; }
+  void set_modified(utime_t mt) { modified = mt; }
+
+  epoch_t get_last_failure() const { return last_failure; }
+  epoch_t get_last_failure_osd_epoch() const { return last_failure_osd_epoch; }
+
+  mds_rank_t get_max_mds() const { return max_mds; }
+  void set_max_mds(mds_rank_t m) { max_mds = m; }
+  void set_old_max_mds() { old_max_mds = max_mds; }  // snapshots the current max_mds
+  mds_rank_t get_old_max_mds() const { return old_max_mds; }
+
+  mds_rank_t get_standby_count_wanted(mds_rank_t standby_daemon_count) const {
+    // Shortfall: wanted standbys minus (standby-replay + supplied) ones, floored at 0.
+    ceph_assert(standby_daemon_count >= 0);
+    std::set<mds_rank_t> replaying;
+    get_standby_replay_mds_set(replaying);
+    const mds_rank_t have = static_cast<mds_rank_t>(replaying.size()) + standby_daemon_count;
+    return std::max(0, std::max(0, standby_count_wanted) - have);
+  }
+  void set_standby_count_wanted(mds_rank_t n) { standby_count_wanted = n; }
+  bool check_health(mds_rank_t standby_daemon_count);
+
+  const std::string get_balancer() const { return balancer; }  // returns a copy of the member
+  void set_balancer(std::string val) { balancer.assign(val); }
+
+  mds_rank_t get_tableserver() const { return tableserver; }
+  mds_rank_t get_root() const { return root; }
+
+  const std::vector<int64_t> &get_data_pools() const { return data_pools; }
+  int64_t get_first_data_pool() const { return *data_pools.begin(); }  // NOTE(review): dereferences begin(); data_pools must not be empty
+  int64_t get_metadata_pool() const { return metadata_pool; }
+  bool is_data_pool(int64_t poolid) const {
+    // True when poolid appears anywhere in data_pools (linear search).
+    const auto last = data_pools.end();
+    const auto hit = std::find(data_pools.begin(), last, poolid);
+    return hit != last;
+  }
+
+  bool pool_in_use(int64_t poolid) const {
+    return get_enabled() && (is_data_pool(poolid) || metadata_pool == poolid);  // never true while the map is not enabled
+  }
+
+  const auto& get_mds_info() const { return mds_info; }
+  const auto& get_mds_info_gid(mds_gid_t gid) const {
+    return mds_info.at(gid);  // at() throws if gid is not in mds_info
+  }
+  const mds_info_t& get_mds_info(mds_rank_t m) const {
+    ceph_assert(up.count(m) && mds_info.count(up.at(m)));  // rank must be up and its gid must be known
+    return mds_info.at(up.at(m));
+  }
+  mds_gid_t find_mds_gid_by_name(std::string_view s) const;
+
+  // counts
+  unsigned get_num_in_mds() const {
+    return in.size();  // number of ranks in the `in` set
+  }
+  unsigned get_num_up_mds() const {
+    return up.size();  // number of ranks with an entry in `up`
+  }
+  mds_rank_t get_last_in_mds() const {
+    auto p = in.rbegin();  // last element of the ordered `in` set, if there is one
+    return p == in.rend() ? MDS_RANK_NONE : *p;
+  }
+  int get_num_failed_mds() const {
+    return failed.size();
+  }
+  unsigned get_num_standby_replay_mds() const {
+    /* Number of daemons in mds_info whose state is STATE_STANDBY_REPLAY.
+     *
+     * Delegates to get_num_mds(), which performs the same
+     * "count daemons in a given state" scan, instead of repeating the
+     * loop here; the returned value is unchanged.
+     */
+    return get_num_mds(MDSMap::STATE_STANDBY_REPLAY);
+  }
+  unsigned get_num_mds(int state) const;
+  // data pools
+  void add_data_pool(int64_t poolid) {
+    data_pools.push_back(poolid);  // appended as-is; no check for an existing entry
+  }
+  int remove_data_pool(int64_t poolid) {
+    // Erases the first entry equal to poolid; -CEPHFS_ENOENT when absent.
+    const auto victim = std::find(data_pools.begin(), data_pools.end(), poolid);
+    const bool present = (victim != data_pools.end());
+    if (present) data_pools.erase(victim);
+    return present ? 0 : -CEPHFS_ENOENT;
+  }
+
+  // sets
+  void get_mds_set(std::set<mds_rank_t>& s) const {
+    s = in;  // overwrites s with a copy of the `in` set
+  }
+  void get_up_mds_set(std::set<mds_rank_t>& s) const;
+  void get_active_mds_set(std::set<mds_rank_t>& s) const {
+    get_mds_set(s, MDSMap::STATE_ACTIVE);  // adds to s; existing contents are kept
+  }
+  void get_standby_replay_mds_set(std::set<mds_rank_t>& s) const {
+    get_mds_set(s, MDSMap::STATE_STANDBY_REPLAY);  // adds to s; existing contents are kept
+  }
+  void get_failed_mds_set(std::set<mds_rank_t>& s) const {
+    s = failed;  // overwrites s
+  }
+  void get_damaged_mds_set(std::set<mds_rank_t>& s) const {
+    s = damaged;  // overwrites s
+  }
+
+  // features
+  uint64_t get_up_features();
+
+  /** Add the ranks which are in but not up (failed or damaged) to *s;
+   *  s must be non-null and keeps whatever it already held. */
+  void get_down_mds_set(std::set<mds_rank_t> *s) const
+  {
+    ceph_assert(s != nullptr);
+    std::set<mds_rank_t>& out = *s;
+    out.insert(failed.begin(), failed.end());
+    out.insert(damaged.begin(), damaged.end());
+  }
+
+  int get_failed() const {
+    if (!failed.empty()) return *failed.begin();  // first (lowest) element of the failed set
+    return -1;  // no failed rank
+  }
+  void get_stopped_mds_set(std::set<mds_rank_t>& s) const {
+    s = stopped;  // overwrites s
+  }
+  void get_recovery_mds_set(std::set<mds_rank_t>& s) const;
+
+  void get_mds_set_lower_bound(std::set<mds_rank_t>& s, DaemonState first) const;
+  void get_mds_set(std::set<mds_rank_t>& s, DaemonState state) const;
+
+  void get_health(std::list<std::pair<health_status_t,std::string> >& summary,
+		  std::list<std::pair<health_status_t,std::string> > *detail) const;
+
+  void get_health_checks(health_check_map_t *checks) const;
+
+  /**
+   * Return indication of whether cluster is available.  This is a
+   * heuristic for clients to see if they should bother waiting to talk to
+   * MDSs, or whether they should error out at startup/mount.
+   *
+   * A TRANSIENT_UNAVAILABLE result indicates that the cluster is in a
+   * transition state like replaying, or is potentially about to fail over.
+   * Clients should wait for an updated map before making a final decision
+   * about whether the filesystem is mountable.
+   *
+   * A STUCK_UNAVAILABLE result indicates that we can't see a way that
+   * the cluster is about to recover on its own, so it'll probably require
+   * administrator intervention: clients should probably not bother trying
+   * to mount.
+   */
+  availability_t is_cluster_available() const;
+
+  /**
+   * Whether the rank states permit resizing: the map must not be degraded
+   * and no daemon may be creating, starting or stopping.
+   */
+  bool is_resizeable() const {
+    if (is_degraded()) return false;
+    if (get_num_mds(CEPH_MDS_STATE_CREATING) != 0) return false;
+    if (get_num_mds(CEPH_MDS_STATE_STARTING) != 0) return false;
+    return get_num_mds(CEPH_MDS_STATE_STOPPING) == 0;
+  }
+
+  // mds states
+  bool is_down(mds_rank_t m) const { return up.count(m) == 0; }
+  bool is_up(mds_rank_t m) const { return up.count(m); }
+  bool is_in(mds_rank_t m) const { return up.count(m) || failed.count(m); }  // up or failed; the `in` set is not consulted
+  bool is_out(mds_rank_t m) const { return !is_in(m); }
+
+  bool is_failed(mds_rank_t m) const   { return failed.count(m); }
+  bool is_stopped(mds_rank_t m) const    { return stopped.count(m); }
+
+  bool is_dne(mds_rank_t m) const      { return in.count(m) == 0; }  // rank absent from the `in` set
+  bool is_dne_gid(mds_gid_t gid) const     { return mds_info.count(gid) == 0; }  // gid absent from mds_info
+
+  /**
+   * State of the daemon with the given GID, or STATE_NULL when the GID is
+   * not present in mds_info.
+   */
+  auto get_state_gid(mds_gid_t gid) const {
+    const auto entry = mds_info.find(gid);
+    const bool known = (entry != mds_info.end());
+    return known ? entry->second.state : STATE_NULL;
+  }
+
+  /**
+   * Get MDS rank state if the rank is up, else STATE_NULL
+   */
+  auto get_state(mds_rank_t m) const {
+    // Resolve rank -> gid first; a rank with no gid has no daemon state.
+    const auto rank_entry = up.find(m);
+    return rank_entry != up.end() ? get_state_gid(rank_entry->second)
+                                  : STATE_NULL;
+  }
+
+  /**
+   * Get the gid of the daemon holding rank r.
+   *
+   * Uses std::map::at(), so this throws std::out_of_range when the rank
+   * has no entry in `up`.
+   */
+  auto get_gid(mds_rank_t r) const {
+    return up.at(r);
+  }
+  /**
+   * Get the mds_info_t of the daemon holding rank m.
+   *
+   * Both lookups use std::map::at(): throws std::out_of_range if the rank
+   * is not in `up` or its gid is not in `mds_info`.
+   */
+  const auto& get_info(mds_rank_t m) const {
+    return mds_info.at(up.at(m));
+  }
+  /**
+   * Get the mds_info_t for a gid; throws std::out_of_range if the gid is
+   * not present in `mds_info`.
+   */
+  const auto& get_info_gid(mds_gid_t gid) const {
+    return mds_info.at(gid);
+  }
+
+  // Per-rank state predicates. Each one compares get_state(m) against a
+  // DaemonState value; get_state() yields STATE_NULL for a rank that is
+  // not up.
+  bool is_boot(mds_rank_t m) const {
+    return get_state(m) == STATE_BOOT;
+  }
+  // True while the rank is creating, starting or replaying.
+  bool is_bootstrapping(mds_rank_t m) const {
+    const auto state = get_state(m);
+    return state == STATE_CREATING || state == STATE_STARTING ||
+           state == STATE_REPLAY;
+  }
+  bool is_creating(mds_rank_t m) const {
+    return get_state(m) == STATE_CREATING;
+  }
+  bool is_starting(mds_rank_t m) const {
+    return get_state(m) == STATE_STARTING;
+  }
+  bool is_replay(mds_rank_t m) const {
+    return get_state(m) == STATE_REPLAY;
+  }
+  bool is_resolve(mds_rank_t m) const {
+    return get_state(m) == STATE_RESOLVE;
+  }
+  bool is_reconnect(mds_rank_t m) const {
+    return get_state(m) == STATE_RECONNECT;
+  }
+  bool is_rejoin(mds_rank_t m) const {
+    return get_state(m) == STATE_REJOIN;
+  }
+  bool is_clientreplay(mds_rank_t m) const {
+    return get_state(m) == STATE_CLIENTREPLAY;
+  }
+  bool is_active(mds_rank_t m) const {
+    return get_state(m) == STATE_ACTIVE;
+  }
+  bool is_stopping(mds_rank_t m) const {
+    return get_state(m) == STATE_STOPPING;
+  }
+  bool is_active_or_stopping(mds_rank_t m) const {
+    const auto state = get_state(m);
+    return state == STATE_ACTIVE || state == STATE_STOPPING;
+  }
+  bool is_clientreplay_or_active_or_stopping(mds_rank_t m) const {
+    const auto state = get_state(m);
+    return state == STATE_CLIENTREPLAY || state == STATE_ACTIVE ||
+           state == STATE_STOPPING;
+  }
+
+  mds_gid_t get_standby_replay(mds_rank_t r) const;
+  // True when get_standby_replay(r) returns something other than
+  // MDS_GID_NONE, i.e. a standby-replay gid was found for rank r.
+  bool has_standby_replay(mds_rank_t r) const {
+    return get_standby_replay(r) != MDS_GID_NONE;
+  }
+
+  // A rank is followable when it is up, its daemon info is known and not
+  // degraded, and no standby-replay daemon is already attached to it.
+  bool is_followable(mds_rank_t r) const {
+    const auto rank_entry = up.find(r);
+    if (rank_entry == up.end()) {
+      return false;
+    }
+    const auto info_entry = mds_info.find(rank_entry->second);
+    if (info_entry == mds_info.end()) {
+      return false;
+    }
+    return !info_entry->second.is_degraded() && !has_standby_replay(r);
+  }
+
+  // Unknown gids are reported as not laggy.
+  bool is_laggy_gid(mds_gid_t gid) const {
+    const auto info_entry = mds_info.find(gid);
+    if (info_entry == mds_info.end()) {
+      return false;
+    }
+    return info_entry->second.laggy();
+  }
+
+  // degraded = some recovery in process.  fixes active membership and
+  // recovery_set.
+  bool is_degraded() const;
+  // At least one rank is recorded in the `failed` set.
+  bool is_any_failed() const {
+    return !failed.empty();
+  }
+  // At least one rank is recorded in the `damaged` set.
+  bool is_any_damaged() const {
+    return !damaged.empty();
+  }
+  // Some daemon is in resolve, none is still in replay, and no rank is
+  // failed or damaged.
+  bool is_resolving() const {
+    if (!(get_num_mds(STATE_RESOLVE) > 0)) {
+      return false;
+    }
+    if (get_num_mds(STATE_REPLAY) != 0) {
+      return false;
+    }
+    return failed.empty() && damaged.empty();
+  }
+  // Nodes are rejoining cache state: some daemon is in rejoin, none is in
+  // replay, reconnect or resolve, and no rank is failed or damaged.
+  bool is_rejoining() const {
+    if (!(get_num_mds(STATE_REJOIN) > 0)) {
+      return false;
+    }
+    if (get_num_mds(STATE_REPLAY) != 0) {
+      return false;
+    }
+    if (get_num_mds(STATE_RECONNECT) != 0) {
+      return false;
+    }
+    if (get_num_mds(STATE_RESOLVE) != 0) {
+      return false;
+    }
+    return failed.empty() && damaged.empty();
+  }
+  // The whole map counts as stopped when no rank is up.
+  bool is_stopped() const {
+    return up.empty();
+  }
+
+  /**
+   * Get whether a rank is 'up', i.e. has
+   * an MDS daemon's entity_inst_t associated
+   * with it.
+   */
+  bool have_inst(mds_rank_t m) const {
+    // Equivalent to is_up(): the rank has an entry in `up`.
+    return up.find(m) != up.end();
+  }
+
+  /**
+   * Get the MDS daemon entity_inst_t for a rank
+   * known to be up.
+   */
+  entity_addrvec_t get_addrs(mds_rank_t m) const {
+    // Both lookups use at(): std::out_of_range is thrown if the rank is
+    // not up or its gid has no info entry.
+    const auto& holder_gid = up.at(m);
+    const auto& holder_info = mds_info.at(holder_gid);
+    return holder_info.get_addrs();
+  }
+
+  // Rank recorded for a gid, or MDS_RANK_NONE when the gid is unknown.
+  mds_rank_t get_rank_gid(mds_gid_t gid) const {
+    const auto info_entry = mds_info.find(gid);
+    if (info_entry == mds_info.end()) {
+      return MDS_RANK_NONE;
+    }
+    return info_entry->second.rank;
+  }
+
+  /**
+   * Get MDS rank incarnation if the rank is up, else MDS_GID_NONE
+   */
+  mds_gid_t get_incarnation(mds_rank_t m) const {
+    const auto rank_entry = up.find(m);
+    if (rank_entry != up.end()) {
+      // The incarnation (an int) is returned through the mds_gid_t type.
+      return static_cast<mds_gid_t>(get_inc_gid(rank_entry->second));
+    }
+    return MDS_GID_NONE;
+  }
+
+  // Incarnation recorded for a gid, or -1 when the gid is unknown.
+  int get_inc_gid(mds_gid_t gid) const {
+    const auto info_entry = mds_info.find(gid);
+    if (info_entry == mds_info.end()) {
+      return -1;
+    }
+    return info_entry->second.inc;
+  }
+  void encode(ceph::buffer::list& bl, uint64_t features) const;
+  void decode(ceph::buffer::list::const_iterator& p);
+  // Convenience overload: decode from the start of a whole buffer list by
+  // delegating to the iterator-based decode().
+  void decode(const ceph::buffer::list& bl) {
+    auto p = bl.cbegin();
+    decode(p);
+  }
+  void sanitize(const std::function<bool(int64_t pool)>& pool_exists);
+
+  void print(std::ostream& out) const;
+  void print_summary(ceph::Formatter *f, std::ostream *out) const;
+
+  void dump(ceph::Formatter *f) const;
+  static void generate_test_instances(std::list<MDSMap*>& ls);
+
+  static bool state_transition_valid(DaemonState prev, DaemonState next);
+
+  CompatSet compat;
+protected:
+  // base map
+  epoch_t epoch = 0;
+  bool enabled = false;
+  std::string fs_name = MDS_FS_NAME_DEFAULT;
+  uint32_t flags = CEPH_MDSMAP_DEFAULTS; // flags
+  epoch_t last_failure = 0;  // mds epoch of last failure
+  epoch_t last_failure_osd_epoch = 0; // osd epoch of last failure; any mds entering replay needs
+                                  // at least this osdmap to ensure the blocklist propagates.
+  utime_t created;
+  utime_t modified;
+
+  mds_rank_t tableserver = 0;   // which MDS has snaptable
+  mds_rank_t root = 0;          // which MDS has root directory
+
+  __u32 session_timeout = 60;
+  __u32 session_autoclose = 300;
+  uint64_t max_file_size = 1ULL<<40; /* 1TB */
+
+  feature_bitset_t required_client_features;
+
+  std::vector<int64_t> data_pools;  // file data pools available to clients (via an ioctl).  first is the default.
+  int64_t cas_pool = -1;            // where CAS objects go
+  int64_t metadata_pool = -1;       // where fs metadata objects go
+
+  /*
+   * in: the set of logical mds #'s that define the cluster.  this is the set
+   *     of mds's the metadata may be distributed over.
+   * up: map from logical mds #'s to the addrs filling those roles.
+   * failed: subset of @in that are failed.
+   * stopped: set of nodes that have been initialized, but are not active.
+   *
+   *    @up + @failed = @in.  @in * @stopped = {}.
+   */
+
+  mds_rank_t max_mds = 1; /* The maximum number of active MDSes. Also, the maximum rank. */
+  mds_rank_t old_max_mds = 0; /* Value to restore when MDS cluster is marked up */
+  mds_rank_t standby_count_wanted = -1;
+  std::string balancer;    /* The name/version of the mantle balancer (i.e. the rados obj name) */
+
+  std::set<mds_rank_t> in;              // currently defined cluster
+
+  // which ranks are failed, stopped, damaged (i.e. not held by a daemon)
+  std::set<mds_rank_t> failed, stopped, damaged;
+  std::map<mds_rank_t, mds_gid_t> up;        // who is in those roles
+  std::map<mds_gid_t, mds_info_t> mds_info;
+
+  uint8_t ever_allowed_features = 0; //< bitmap of features the cluster has allowed
+  uint8_t explicitly_allowed_features = 0; //< bitmap of features explicitly enabled
+
+  bool inline_data_enabled = false;
+
+  uint64_t cached_up_features = 0;
+
+};
+WRITE_CLASS_ENCODER_FEATURES(MDSMap::mds_info_t)
+WRITE_CLASS_ENCODER_FEATURES(MDSMap)
+
+// Stream an MDSMap as its plain-text summary (no Formatter is supplied).
+inline std::ostream& operator<<(std::ostream &out, const MDSMap &m) {
+  m.print_summary(nullptr, &out);
+  return out;
+}
+
+// Stream an mds_info_t by delegating to its ostream-based dump().
+inline std::ostream& operator<<(std::ostream& o, const MDSMap::mds_info_t& info) {
+  info.dump(o);
+  return o;
+}
+#endif
diff --git a/src/mds/MDSPerfMetricTypes.h b/src/mds/MDSPerfMetricTypes.h
new file mode 100644
index 000000000..78b838c89
--- /dev/null
+++ b/src/mds/MDSPerfMetricTypes.h
@@ -0,0 +1,418 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_MDS_PERF_METRIC_TYPES_H
+#define CEPH_MDS_PERF_METRIC_TYPES_H
+
+#include <ostream>
+
+#include "include/denc.h"
+#include "include/utime.h"
+#include "mdstypes.h"
+
+// Kind of metrics update. Metrics::update_type stores one of these values
+// as a uint32_t and defaults to UPDATE_TYPE_REFRESH.
+enum UpdateType : uint32_t {
+  UPDATE_TYPE_REFRESH = 0,
+  UPDATE_TYPE_REMOVE,    // implicitly 1
+};
+
+// Hit/miss counter pair (presumably client capability cache hits and
+// misses -- confirm against the reporting client).
+struct CapHitMetric {
+  uint64_t hits = 0;
+  uint64_t misses = 0;
+
+  // Wire format v1: hits, then misses.
+  DENC(CapHitMetric, v, p) {
+    DENC_START(1, 1, p);
+    denc(v.hits, p);
+    denc(v.misses, p);
+    DENC_FINISH(p);
+  }
+
+  // Emit both counters as unsigned fields on the formatter.
+  void dump(Formatter *f) const {
+    f->dump_unsigned("hits", hits);
+    f->dump_unsigned("misses", misses);
+  }
+
+  friend std::ostream& operator<<(std::ostream& os, const CapHitMetric &m) {
+    return os << "{hits=" << m.hits << ", misses=" << m.misses << "}";
+  }
+};
+
+// Read latency sample together with running statistics (mean, sum of
+// squares, sample count).
+struct ReadLatencyMetric {
+  utime_t lat;
+  utime_t mean;
+  // Zero-initialized: the DENC body below only touches sq_sum/count for
+  // struct_v >= 3, so without defaults they would be indeterminate after
+  // decoding an older payload and then be read by dump()/operator<<.
+  uint64_t sq_sum = 0;
+  uint64_t count = 0;
+  bool updated = false;
+
+  // Wire format: v1 = lat; v2 adds updated; v3 adds mean, sq_sum, count.
+  DENC(ReadLatencyMetric, v, p) {
+    DENC_START(3, 1, p);
+    denc(v.lat, p);
+    if (struct_v >= 2)
+      denc(v.updated, p);
+    if (struct_v >= 3) {
+      denc(v.mean, p);
+      denc(v.sq_sum, p);
+      denc(v.count, p);
+    }
+    DENC_FINISH(p);
+  }
+
+  // Note: `updated` is not part of the dump output.
+  void dump(Formatter *f) const {
+    f->dump_object("read_latency", lat);
+    f->dump_object("avg_read_alatency", mean);
+    f->dump_unsigned("sq_sum", sq_sum);
+    f->dump_unsigned("count", count);
+  }
+
+  friend std::ostream& operator<<(std::ostream& os, const ReadLatencyMetric &metric) {
+    os << "{latency=" << metric.lat << ", avg_latency=" << metric.mean
+       << ", sq_sum=" << metric.sq_sum << ", count=" << metric.count << "}";
+    return os;
+  }
+};
+
+// Write latency sample together with running statistics (mean, sum of
+// squares, sample count).
+struct WriteLatencyMetric {
+  utime_t lat;
+  utime_t mean;
+  // Zero-initialized: the DENC body below only touches sq_sum/count for
+  // struct_v >= 3, so without defaults they would be indeterminate after
+  // decoding an older payload and then be read by dump()/operator<<.
+  uint64_t sq_sum = 0;
+  uint64_t count = 0;
+  bool updated = false;
+
+  // Wire format: v1 = lat; v2 adds updated; v3 adds mean, sq_sum, count.
+  DENC(WriteLatencyMetric, v, p) {
+    DENC_START(3, 1, p);
+    denc(v.lat, p);
+    if (struct_v >= 2)
+      denc(v.updated, p);
+    if (struct_v >= 3) {
+      denc(v.mean, p);
+      denc(v.sq_sum, p);
+      denc(v.count, p);
+    }
+    DENC_FINISH(p);
+  }
+
+  // Note: `updated` is not part of the dump output.
+  void dump(Formatter *f) const {
+    f->dump_object("write_latency", lat);
+    f->dump_object("avg_write_alatency", mean);
+    f->dump_unsigned("sq_sum", sq_sum);
+    f->dump_unsigned("count", count);
+  }
+
+  friend std::ostream& operator<<(std::ostream& os, const WriteLatencyMetric &metric) {
+    os << "{latency=" << metric.lat << ", avg_latency=" << metric.mean
+       << ", sq_sum=" << metric.sq_sum << ", count=" << metric.count  << "}";
+    return os;
+  }
+};
+
+// Metadata latency sample together with running statistics (mean, sum of
+// squares, sample count).
+struct MetadataLatencyMetric {
+  utime_t lat;
+  utime_t mean;
+  // Zero-initialized: the DENC body below only touches sq_sum/count for
+  // struct_v >= 3, so without defaults they would be indeterminate after
+  // decoding an older payload and then be read by dump()/operator<<.
+  uint64_t sq_sum = 0;
+  uint64_t count = 0;
+  bool updated = false;
+
+  // Wire format: v1 = lat; v2 adds updated; v3 adds mean, sq_sum, count.
+  DENC(MetadataLatencyMetric, v, p) {
+    DENC_START(3, 1, p);
+    denc(v.lat, p);
+    if (struct_v >= 2)
+      denc(v.updated, p);
+    if (struct_v >= 3) {
+      denc(v.mean, p);
+      denc(v.sq_sum, p);
+      denc(v.count, p);
+    }
+    DENC_FINISH(p);
+  }
+
+  // Note: `updated` is not part of the dump output.
+  void dump(Formatter *f) const {
+    f->dump_object("metadata_latency", lat);
+    f->dump_object("avg_metadata_alatency", mean);
+    f->dump_unsigned("sq_sum", sq_sum);
+    f->dump_unsigned("count", count);
+  }
+
+  friend std::ostream& operator<<(std::ostream& os, const MetadataLatencyMetric &metric) {
+    os << "{latency=" << metric.lat << ", avg_latency=" << metric.mean
+       << ", sq_sum=" << metric.sq_sum << ", count=" << metric.count << "}";
+    return os;
+  }
+};
+
+// Hit/miss counter pair for dentry leases, plus an `updated` flag that is
+// encoded on the wire but not dumped or printed.
+struct DentryLeaseHitMetric {
+  uint64_t hits = 0;
+  uint64_t misses = 0;
+  bool updated = false;
+
+  // Wire format v1: hits, misses, updated.
+  DENC(DentryLeaseHitMetric, v, p) {
+    DENC_START(1, 1, p);
+    denc(v.hits, p);
+    denc(v.misses, p);
+    denc(v.updated, p);
+    DENC_FINISH(p);
+  }
+
+  // Emit both counters as unsigned fields on the formatter.
+  void dump(Formatter *f) const {
+    f->dump_unsigned("hits", hits);
+    f->dump_unsigned("misses", misses);
+  }
+
+  friend std::ostream& operator<<(std::ostream& os, const DentryLeaseHitMetric &m) {
+    return os << "{hits=" << m.hits << ", misses=" << m.misses << "}";
+  }
+};
+
+// Opened-files counter alongside a total-inodes counter, plus an
+// `updated` flag that is encoded on the wire but not dumped or printed.
+struct OpenedFilesMetric {
+  uint64_t opened_files = 0;
+  uint64_t total_inodes = 0;
+  bool updated = false;
+
+  // Wire format v1: opened_files, total_inodes, updated.
+  DENC(OpenedFilesMetric, v, p) {
+    DENC_START(1, 1, p);
+    denc(v.opened_files, p);
+    denc(v.total_inodes, p);
+    denc(v.updated, p);
+    DENC_FINISH(p);
+  }
+
+  // Emit both counters as unsigned fields on the formatter.
+  void dump(Formatter *f) const {
+    f->dump_unsigned("opened_files", opened_files);
+    f->dump_unsigned("total_inodes", total_inodes);
+  }
+
+  friend std::ostream& operator<<(std::ostream& os, const OpenedFilesMetric &m) {
+    return os << "{opened_files=" << m.opened_files
+              << ", total_inodes=" << m.total_inodes << "}";
+  }
+};
+
+// Pinned-icaps counter alongside a total-inodes counter, plus an
+// `updated` flag that is encoded on the wire but not dumped or printed.
+struct PinnedIcapsMetric {
+  uint64_t pinned_icaps = 0;
+  uint64_t total_inodes = 0;
+  bool updated = false;
+
+  // Wire format v1: pinned_icaps, total_inodes, updated.
+  DENC(PinnedIcapsMetric, v, p) {
+    DENC_START(1, 1, p);
+    denc(v.pinned_icaps, p);
+    denc(v.total_inodes, p);
+    denc(v.updated, p);
+    DENC_FINISH(p);
+  }
+
+  // Emit both counters as unsigned fields on the formatter.
+  void dump(Formatter *f) const {
+    f->dump_unsigned("pinned_icaps", pinned_icaps);
+    f->dump_unsigned("total_inodes", total_inodes);
+  }
+
+  friend std::ostream& operator<<(std::ostream& os, const PinnedIcapsMetric &m) {
+    return os << "{pinned_icaps=" << m.pinned_icaps
+              << ", total_inodes=" << m.total_inodes << "}";
+  }
+};
+
+// Opened-inodes counter alongside a total-inodes counter, plus an
+// `updated` flag that is encoded on the wire but not dumped or printed.
+struct OpenedInodesMetric {
+  uint64_t opened_inodes = 0;
+  uint64_t total_inodes = 0;
+  bool updated = false;
+
+  // Wire format v1: opened_inodes, total_inodes, updated.
+  DENC(OpenedInodesMetric, v, p) {
+    DENC_START(1, 1, p);
+    denc(v.opened_inodes, p);
+    denc(v.total_inodes, p);
+    denc(v.updated, p);
+    DENC_FINISH(p);
+  }
+
+  // Emit both counters as unsigned fields on the formatter.
+  void dump(Formatter *f) const {
+    f->dump_unsigned("opened_inodes", opened_inodes);
+    f->dump_unsigned("total_inodes", total_inodes);
+  }
+
+  friend std::ostream& operator<<(std::ostream& os, const OpenedInodesMetric &m) {
+    return os << "{opened_inodes=" << m.opened_inodes
+              << ", total_inodes=" << m.total_inodes << "}";
+  }
+};
+
+// Read I/O totals (operation count and size), plus an `updated` flag that
+// is encoded on the wire but not dumped or printed.
+struct ReadIoSizesMetric {
+  uint64_t total_ops = 0;
+  uint64_t total_size = 0;
+  bool updated = false;
+
+  // Wire format v1: total_ops, total_size, updated.
+  DENC(ReadIoSizesMetric, v, p) {
+    DENC_START(1, 1, p);
+    denc(v.total_ops, p);
+    denc(v.total_size, p);
+    denc(v.updated, p);
+    DENC_FINISH(p);
+  }
+
+  // Emit both totals as unsigned fields on the formatter.
+  void dump(Formatter *f) const {
+    f->dump_unsigned("total_ops", total_ops);
+    f->dump_unsigned("total_size", total_size);
+  }
+
+  friend std::ostream& operator<<(std::ostream& os, const ReadIoSizesMetric &m) {
+    return os << "{total_ops=" << m.total_ops
+              << ", total_size=" << m.total_size << "}";
+  }
+};
+
+// Write I/O totals (operation count and size), plus an `updated` flag
+// that is encoded on the wire but not dumped or printed.
+struct WriteIoSizesMetric {
+  uint64_t total_ops = 0;
+  uint64_t total_size = 0;
+  bool updated = false;
+
+  // Wire format v1: total_ops, total_size, updated.
+  DENC(WriteIoSizesMetric, v, p) {
+    DENC_START(1, 1, p);
+    denc(v.total_ops, p);
+    denc(v.total_size, p);
+    denc(v.updated, p);
+    DENC_FINISH(p);
+  }
+
+  // Emit both totals as unsigned fields on the formatter.
+  void dump(Formatter *f) const {
+    f->dump_unsigned("total_ops", total_ops);
+    f->dump_unsigned("total_size", total_size);
+  }
+
+  friend std::ostream& operator<<(std::ostream& os, const WriteIoSizesMetric &m) {
+    return os << "{total_ops=" << m.total_ops
+              << ", total_size=" << m.total_size << "}";
+  }
+};
+
+WRITE_CLASS_DENC(CapHitMetric)
+WRITE_CLASS_DENC(ReadLatencyMetric)
+WRITE_CLASS_DENC(WriteLatencyMetric)
+WRITE_CLASS_DENC(MetadataLatencyMetric)
+WRITE_CLASS_DENC(DentryLeaseHitMetric)
+WRITE_CLASS_DENC(OpenedFilesMetric)
+WRITE_CLASS_DENC(PinnedIcapsMetric)
+WRITE_CLASS_DENC(OpenedInodesMetric)
+WRITE_CLASS_DENC(ReadIoSizesMetric)
+WRITE_CLASS_DENC(WriteIoSizesMetric)
+
+// metrics that are forwarded to the MDS by client(s).
+// Aggregate of all per-client metric records plus the update type.
+// The DENC body defines the wire layout; fields were appended over four
+// struct versions, so their order there must not be changed.
+struct Metrics {
+  // metrics
+  CapHitMetric cap_hit_metric;
+  ReadLatencyMetric read_latency_metric;
+  WriteLatencyMetric write_latency_metric;
+  MetadataLatencyMetric metadata_latency_metric;
+  DentryLeaseHitMetric dentry_lease_metric;
+  OpenedFilesMetric opened_files_metric;
+  PinnedIcapsMetric pinned_icaps_metric;
+  OpenedInodesMetric opened_inodes_metric;
+  ReadIoSizesMetric read_io_sizes_metric;
+  WriteIoSizesMetric write_io_sizes_metric;
+
+  // metric update type
+  uint32_t update_type = UpdateType::UPDATE_TYPE_REFRESH;
+
+  // Wire format:
+  //   v1: update_type, cap hit, read/write/metadata latency
+  //   v2: + dentry lease
+  //   v3: + opened files, pinned icaps, opened inodes
+  //   v4: + read/write io sizes
+  DENC(Metrics, v, p) {
+    DENC_START(4, 1, p);
+    denc(v.update_type, p);
+    denc(v.cap_hit_metric, p);
+    denc(v.read_latency_metric, p);
+    denc(v.write_latency_metric, p);
+    denc(v.metadata_latency_metric, p);
+    if (struct_v >= 2) {
+      denc(v.dentry_lease_metric, p);
+    }
+    if (struct_v >= 3) {
+      denc(v.opened_files_metric, p);
+      denc(v.pinned_icaps_metric, p);
+      denc(v.opened_inodes_metric, p);
+    }
+    if (struct_v >= 4) {
+      denc(v.read_io_sizes_metric, p);
+      denc(v.write_io_sizes_metric, p);
+    }
+    DENC_FINISH(p);
+  }
+
+  // Dump the update type followed by every metric as a nested object.
+  void dump(Formatter *f) const {
+    f->dump_int("update_type", static_cast<uint32_t>(update_type));
+    f->dump_object("cap_hit_metric", cap_hit_metric);
+    f->dump_object("read_latency_metric", read_latency_metric);
+    f->dump_object("write_latency_metric", write_latency_metric);
+    f->dump_object("metadata_latency_metric", metadata_latency_metric);
+    f->dump_object("dentry_lease_metric", dentry_lease_metric);
+    f->dump_object("opened_files_metric", opened_files_metric);
+    f->dump_object("pinned_icaps_metric", pinned_icaps_metric);
+    f->dump_object("opened_inodes_metric", opened_inodes_metric);
+    f->dump_object("read_io_sizes_metric", read_io_sizes_metric);
+    f->dump_object("write_io_sizes_metric", write_io_sizes_metric);
+  }
+
+  // Single-line textual form; note the labels here differ from the keys
+  // used by dump() for the first five metrics.
+  friend std::ostream& operator<<(std::ostream& os, const Metrics& metrics) {
+    os << "[update_type=" << metrics.update_type << ", metrics={"
+       << "cap_hit_metric=" << metrics.cap_hit_metric
+       << ", read_latency=" << metrics.read_latency_metric
+       << ", write_latency=" << metrics.write_latency_metric
+       << ", metadata_latency=" << metrics.metadata_latency_metric
+       << ", dentry_lease=" << metrics.dentry_lease_metric
+       << ", opened_files_metric=" << metrics.opened_files_metric
+       << ", pinned_icaps_metric=" << metrics.pinned_icaps_metric
+       << ", opened_inodes_metric=" << metrics.opened_inodes_metric
+       << ", read_io_sizes_metric=" << metrics.read_io_sizes_metric
+       << ", write_io_sizes_metric=" << metrics.write_io_sizes_metric
+       << "}]";
+    return os;
+  }
+};
+WRITE_CLASS_DENC(Metrics)
+
+// Payload carrying a sequence number, the sending rank and the metrics
+// recorded for each client instance.
+struct metrics_message_t {
+  version_t seq = 0;
+  mds_rank_t rank = MDS_RANK_NONE;
+  std::map<entity_inst_t, Metrics> client_metrics_map;
+
+  metrics_message_t() {}
+  metrics_message_t(version_t seq, mds_rank_t rank)
+    : seq(seq),
+      rank(rank) {}
+
+  // Wire format v1: seq, rank, then the per-client map (feature-aware).
+  void encode(bufferlist &bl, uint64_t features) const {
+    using ceph::encode;
+    ENCODE_START(1, 1, bl);
+    encode(seq, bl);
+    encode(rank, bl);
+    encode(client_metrics_map, bl, features);
+    ENCODE_FINISH(bl);
+  }
+
+  // Mirror of encode(): fields are read back in the same order.
+  void decode(bufferlist::const_iterator &iter) {
+    using ceph::decode;
+    DECODE_START(1, iter);
+    decode(seq, iter);
+    decode(rank, iter);
+    decode(client_metrics_map, iter);
+    DECODE_FINISH(iter);
+  }
+
+  // Dump seq and rank, then a "client"/"metrics" object pair per entry.
+  void dump(Formatter *f) const {
+    f->dump_unsigned("seq", seq);
+    f->dump_int("rank", rank);
+    for (const auto &entry : client_metrics_map) {
+      f->dump_object("client", entry.first);
+      f->dump_object("metrics", entry.second);
+    }
+  }
+
+  friend std::ostream& operator<<(std::ostream& os, const metrics_message_t &msg) {
+    os << "[sequence=" << msg.seq << ", rank=" << msg.rank
+       << ", metrics=" << msg.client_metrics_map << "]";
+    return os;
+  }
+};
+
+WRITE_CLASS_ENCODER_FEATURES(metrics_message_t)
+
+#endif // CEPH_MDS_PERF_METRIC_TYPES_H
diff --git a/src/mds/MDSPinger.cc b/src/mds/MDSPinger.cc
new file mode 100644
index 000000000..bc63a22f9
--- /dev/null
+++ b/src/mds/MDSPinger.cc
@@ -0,0 +1,103 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "common/dout.h"
+
+#include "mds/MDSRank.h"
+#include "mds/MDSPinger.h"
+
+#define dout_context g_ceph_context
+#define dout_subsys ceph_subsys_mds
+#undef dout_prefix
+#define dout_prefix *_dout << "mds.pinger " << __func__
+
+// Store the (non-owning) pointer to the rank used to send ping messages.
+MDSPinger::MDSPinger(MDSRank *mds)
+  : mds{mds}
+{
+}
+
+// Send a ping carrying the next sequence number to `rank` at `addr`,
+// creating the rank's ping state on first use and recording the send time
+// of the sequence so pong_received() can match it later.
+void MDSPinger::send_ping(mds_rank_t rank, const entity_addrvec_t &addr) {
+  dout(10) << ": rank=" << rank << dendl;
+
+  std::scoped_lock locker(lock);
+  // try_emplace only constructs a PingState (whose initializer samples the
+  // clock) when the rank has no entry yet; emplace(rank, PingState()) built
+  // and discarded a temporary on every ping to an already-known rank.
+  auto [it, inserted] = ping_state_by_rank.try_emplace(rank);
+  if (inserted) {
+    dout(20) << ": init ping pong state for rank=" << rank << dendl;
+  }
+
+  auto &ping_state = it->second;
+  // Post-increment: this ping uses the current sequence number.
+  auto last_seq = ping_state.last_seq++;
+
+  ping_state.seq_time_map.emplace(last_seq, clock::now());
+
+  dout(10) << ": sending ping with sequence=" << last_seq << " to rank="
+           << rank << dendl;
+  mds->send_message_mds(make_message<MMDSPing>(last_seq), addr);
+}
+
+// Record a pong from `rank` for sequence `seq`. Returns false when the
+// rank was never pinged or the sequence is not outstanding; otherwise the
+// send time of that ping becomes the rank's last acked time and all older
+// outstanding sequences are discarded.
+bool MDSPinger::pong_received(mds_rank_t rank, version_t seq) {
+  dout(10) << ": rank=" << rank << ", sequence=" << seq << dendl;
+
+  std::scoped_lock locker(lock);
+  auto rank_it = ping_state_by_rank.find(rank);
+  if (rank_it == ping_state_by_rank.end()) {
+    // this *might* just happen on mds failover when a non-rank-0 mds
+    // acks back a ping message from an earlier rank 0 mds to a newly
+    // appointed rank 0 mds (possible?).
+    // or when non rank 0 active MDSs begin sending metric updates before
+    // rank 0 can start pinging it (although, that should resolve out soon).
+    dout(10) << ": received pong from rank=" << rank << " to which ping was never"
+             << " sent (ignoring...)." << dendl;
+    return false;
+  }
+
+  auto &state = rank_it->second;
+  auto &outstanding = state.seq_time_map;
+  // look up the send time recorded for the acked sequence
+  auto seq_it = outstanding.find(seq);
+  if (seq_it == outstanding.end()) {
+    // rank still bootstrapping
+    dout(10) << ": pong received for unknown ping sequence " << seq
+             << ", rank " << rank << " should catch up soon." << dendl;
+    return false;
+  }
+
+  state.last_acked_time = seq_it->second;
+  // drop every sequence older than the acked one; the acked entry itself
+  // stays in the map (erase's end iterator is exclusive)
+  outstanding.erase(outstanding.begin(), seq_it);
+
+  return true;
+}
+
+// Forget all ping state for `rank`; send_ping() re-creates it on the next
+// ping to that rank.
+void MDSPinger::reset_ping(mds_rank_t rank) {
+  dout(10) << ": rank=" << rank << dendl;
+
+  std::scoped_lock locker(lock);
+  // erase-by-key reports how many entries were removed; zero means this
+  // rank never had any ping state.
+  if (ping_state_by_rank.erase(rank) == 0) {
+    dout(10) << ": rank=" << rank << " was never sent ping request." << dendl;
+  }
+}
+
+// Report whether `rank` has gone longer than the "mds_ping_grace" config
+// value (seconds) since its last acknowledged ping. A rank that was never
+// pinged is logged as an error and reported as not lagging.
+bool MDSPinger::is_rank_lagging(mds_rank_t rank) {
+  dout(10) << ": rank=" << rank << dendl;
+
+  std::scoped_lock locker(lock);
+  auto state_it = ping_state_by_rank.find(rank);
+  if (state_it == ping_state_by_rank.end()) {
+    derr << ": rank=" << rank << " was never sent ping request." << dendl;
+    return false;
+  }
+
+  const auto &last_acked = state_it->second.last_acked_time;
+  // elapsed time since the last ack, as fractional seconds
+  const auto elapsed =
+    std::chrono::duration<double>(clock::now() - last_acked).count();
+  const auto grace =
+    g_conf().get_val<std::chrono::seconds>("mds_ping_grace").count();
+  if (elapsed > grace) {
+    dout(5) << ": rank=" << rank << " is lagging a pong response (last ack time is "
+            <<  last_acked << ")" << dendl;
+    return true;
+  }
+
+  return false;
+}
diff --git a/src/mds/MDSPinger.h b/src/mds/MDSPinger.h
new file mode 100644
index 000000000..51c3ebeeb
--- /dev/null
+++ b/src/mds/MDSPinger.h
@@ -0,0 +1,61 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_MDS_PINGER_H
+#define CEPH_MDS_PINGER_H
+
+#include <map>
+
+#include "include/types.h"
+
+#include "msg/msg_types.h"
+#include "common/ceph_mutex.h"
+#include "common/ceph_time.h"
+#include "messages/MMDSPing.h"
+
+#include "mdstypes.h"
+
+class MDSRank;
+
+// Tracks ping/pong exchanges with MDS ranks: per rank it keeps the next
+// sequence number, the send time of each outstanding ping and the send
+// time of the most recently acknowledged ping.
+class MDSPinger {
+public:
+  MDSPinger(MDSRank *mds);
+
+  // send a ping message to an mds rank. initialize ping state if
+  // required.
+  void send_ping(mds_rank_t rank, const entity_addrvec_t &addr);
+
+  // check if a pong response is valid. a pong response from an
+  // mds is valid if at least one ping message was sent to the
+  // mds and the sequence number in the pong is outstanding.
+  bool pong_received(mds_rank_t rank, version_t seq);
+
+  // reset the ping state for a given rank
+  void reset_ping(mds_rank_t rank);
+
+  // check if a rank is lagging (based on pong response) responding
+  // to a ping message.
+  bool is_rank_lagging(mds_rank_t rank);
+
+private:
+  using clock = ceph::coarse_mono_clock;
+  using time = ceph::coarse_mono_time;
+
+  // Initial Sequence Number (ISN) of the first ping message sent
+  // by rank 0 to other active ranks (including itself).
+  static constexpr uint64_t MDS_PINGER_ISN = 1;
+
+  struct PingState {
+    // sequence number the next ping will carry
+    version_t last_seq = MDS_PINGER_ISN;
+    // send time of each ping sequence that is still tracked
+    std::map<version_t, time> seq_time_map;
+    // initialized to construction time, later set to the send time of
+    // the most recently acknowledged ping
+    time last_acked_time = clock::now();
+  };
+
+  MDSRank *mds;
+  // drop this lock when calling ->send_message_mds() else mds might
+  // deadlock
+  ceph::mutex lock = ceph::make_mutex("MDSPinger::lock");
+  std::map<mds_rank_t, PingState> ping_state_by_rank;
+};
+
+#endif // CEPH_MDS_PINGER_H
diff --git a/src/mds/MDSRank.cc b/src/mds/MDSRank.cc
new file mode 100644
index 000000000..bae7706f6
--- /dev/null
+++ b/src/mds/MDSRank.cc
@@ -0,0 +1,3861 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2015 Red Hat
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+
+#include <string_view>
+#include <typeinfo>
+#include "common/debug.h"
+#include "common/errno.h"
+#include "common/likely.h"
+#include "common/async/blocked_completion.h"
+
+#include "messages/MClientRequestForward.h"
+#include "messages/MMDSLoadTargets.h"
+#include "messages/MMDSTableRequest.h"
+#include "messages/MMDSMetrics.h"
+
+#include "mgr/MgrClient.h"
+
+#include "MDSDaemon.h"
+#include "MDSMap.h"
+#include "MetricAggregator.h"
+#include "SnapClient.h"
+#include "SnapServer.h"
+#include "MDBalancer.h"
+#include "Migrator.h"
+#include "Locker.h"
+#include "InoTable.h"
+#include "mon/MonClient.h"
+#include "common/HeartbeatMap.h"
+#include "ScrubStack.h"
+
+
+#include "MDSRank.h"
+
+#define dout_context g_ceph_context
+#define dout_subsys ceph_subsys_mds
+#undef dout_prefix
+#define dout_prefix *_dout << "mds." << whoami << '.' << incarnation << ' '
+using TOPNSPC::common::cmd_getval;
+// Multi-step journal flush, driven as a chain of callbacks:
+//   send -> flush_mdlog -> clear_mdlog -> trim_mdlog -> expire_segments
+//        -> trim_segments -> trim_expired_segments -> write_journal_head
+// A step that fails writes a message to *ss and calls complete() with the
+// error code; finish() forwards the final result to on_finish.
+class C_Flush_Journal : public MDSInternalContext {
+public:
+  // whoami/incarnation are copied from the rank only so that the dout
+  // prefix macro can be used inside this class.
+  C_Flush_Journal(MDCache *mdcache, MDLog *mdlog, MDSRank *mds,
+                  std::ostream *ss, Context *on_finish)
+    : MDSInternalContext(mds),
+      mdcache(mdcache), mdlog(mdlog), ss(ss), on_finish(on_finish),
+      whoami(mds->whoami), incarnation(mds->incarnation) {
+  }
+
+  // Entry point; must be called with mds_lock held. Completes immediately
+  // with -CEPHFS_EROFS on a read-only cache, or with 0 when the MDS is not
+  // active; otherwise starts the flush chain.
+  void send() {
+    assert(ceph_mutex_is_locked(mds->mds_lock));
+
+    dout(20) << __func__ << dendl;
+
+    if (mdcache->is_readonly()) {
+      dout(5) << __func__ << ": read-only FS" << dendl;
+      complete(-CEPHFS_EROFS);
+      return;
+    }
+
+    if (!mds->is_active()) {
+      dout(5) << __func__ << ": MDS not active, no-op" << dendl;
+      complete(0);
+      return;
+    }
+
+    flush_mdlog();
+  }
+
+private:
+
+  // Step 1: start a new log segment, flush, and wait for the log to be
+  // safe before continuing in handle_flush_mdlog().
+  void flush_mdlog() {
+    dout(20) << __func__ << dendl;
+
+    // I need to seal off the current segment, and then mark all
+    // previous segments for expiry
+    mdlog->start_new_segment();
+
+    Context *ctx = new LambdaContext([this](int r) {
+        handle_flush_mdlog(r);
+      });
+
+    // Flush initially so that all the segments older than our new one
+    // will be eligible for expiry
+    mdlog->flush();
+    mdlog->wait_for_safe(new MDSInternalContextWrapper(mds, ctx));
+  }
+
+  // Abort the chain on a flush error, otherwise move on to clear_mdlog().
+  void handle_flush_mdlog(int r) {
+    dout(20) << __func__ << ": r=" << r << dendl;
+
+    if (r != 0) {
+      *ss << "Error " << r << " (" << cpp_strerror(r) << ") while flushing journal";
+      complete(r);
+      return;
+    }
+
+    clear_mdlog();
+  }
+
+  // Step 2: wait for the log to be safe a second time (see comment below).
+  void clear_mdlog() {
+    dout(20) << __func__ << dendl;
+
+    Context *ctx = new LambdaContext([this](int r) {
+        handle_clear_mdlog(r);
+      });
+
+    // Because we may not be the last wait_for_safe context on MDLog,
+    // and subsequent contexts might wake up in the middle of our
+    // later trim_all and interfere with expiry (by e.g. marking
+    // dirs/dentries dirty on previous log segments), we run a second
+    // wait_for_safe here. See #10368
+    mdlog->wait_for_safe(new MDSInternalContextWrapper(mds, ctx));
+  }
+
+  // Abort the chain on error, otherwise move on to trim_mdlog().
+  void handle_clear_mdlog(int r) {
+    dout(20) << __func__ << ": r=" << r << dendl;
+
+    if (r != 0) {
+      *ss << "Error " << r << " (" << cpp_strerror(r) << ") while flushing journal";
+      complete(r);
+      return;
+    }
+
+    trim_mdlog();
+  }
+
+  // Step 3: ask the log to trim everything; abort on a non-zero result.
+  void trim_mdlog() {
+    // Put all the old log segments into expiring or expired state
+    dout(5) << __func__ << ": beginning segment expiry" << dendl;
+
+    int ret = mdlog->trim_all();
+    if (ret != 0) {
+      *ss << "Error " << ret << " (" << cpp_strerror(ret) << ") while trimming log";
+      complete(ret);
+      return;
+    }
+
+    expire_segments();
+  }
+
+  // Step 4: wait for every currently expiring segment to expire. If there
+  // is nothing to wait for, continue straight to trim_segments().
+  void expire_segments() {
+    dout(20) << __func__ << dendl;
+
+    // Attach contexts to wait for all expiring segments to expire
+    MDSGatherBuilder expiry_gather(g_ceph_context);
+
+    const auto &expiring_segments = mdlog->get_expiring_segments();
+    for (auto p : expiring_segments) {
+      p->wait_for_expiry(expiry_gather.new_sub());
+    }
+    dout(5) << __func__ << ": waiting for " << expiry_gather.num_subs_created()
+            << " segments to expire" << dendl;
+
+    if (!expiry_gather.has_subs()) {
+      trim_segments();
+      return;
+    }
+
+    Context *ctx = new LambdaContext([this](int r) {
+        handle_expire_segments(r);
+      });
+    expiry_gather.set_finisher(new MDSInternalContextWrapper(mds, ctx));
+    expiry_gather.activate();
+  }
+
+  // Expiry waits must not report errors; continue to trim_segments().
+  void handle_expire_segments(int r) {
+    dout(20) << __func__ << ": r=" << r << dendl;
+
+    ceph_assert(r == 0); // MDLog is not allowed to raise errors via
+                         // wait_for_expiry
+    trim_segments();
+  }
+
+  // Step 5: hand trim_expired_segments() to mds->finisher via
+  // C_OnFinisher; the queued lambda takes mds_lock itself before calling
+  // it (presumably because it then runs outside the caller's locked
+  // context -- confirm against C_OnFinisher).
+  void trim_segments() {
+    dout(20) << __func__ << dendl;
+
+    Context *ctx = new C_OnFinisher(new LambdaContext([this](int) {
+          std::lock_guard locker(mds->mds_lock);
+          trim_expired_segments();
+        }), mds->finisher);
+    ctx->complete(0);
+  }
+
+  // Step 6: drop the expired segments, logging the journaler's
+  // expire/trimmed positions before and after, then write the head.
+  void trim_expired_segments() {
+    dout(5) << __func__ << ": expiry complete, expire_pos/trim_pos is now "
+            << std::hex << mdlog->get_journaler()->get_expire_pos() << "/"
+            << mdlog->get_journaler()->get_trimmed_pos() << dendl;
+
+    // Now everyone I'm interested in is expired
+    mdlog->trim_expired_segments();
+
+    dout(5) << __func__ << ": trim complete, expire_pos/trim_pos is now "
+            << std::hex << mdlog->get_journaler()->get_expire_pos() << "/"
+            << mdlog->get_journaler()->get_trimmed_pos() << dendl;
+
+    write_journal_head();
+  }
+
+  // Step 7: write the journal header; the callback takes mds_lock before
+  // running handle_write_head().
+  void write_journal_head() {
+    dout(20) << __func__ << dendl;
+
+    Context *ctx = new LambdaContext([this](int r) {
+        std::lock_guard locker(mds->mds_lock);
+        handle_write_head(r);
+      });
+    // Flush the journal header so that readers will start from after
+    // the flushed region
+    mdlog->get_journaler()->write_head(ctx);
+  }
+
+  // Final step: report a header write error (if any) and complete with r.
+  void handle_write_head(int r) {
+    if (r != 0) {
+      *ss << "Error " << r << " (" << cpp_strerror(r) << ") while writing header";
+    } else {
+      dout(5) << __func__ << ": write_head complete, all done!" << dendl;
+    }
+
+    complete(r);
+  }
+
+  // Forward the overall result to the caller-supplied completion.
+  void finish(int r) override {
+    dout(20) << __func__ << ": r=" << r << dendl;
+    on_finish->complete(r);
+  }
+
+  MDCache *mdcache;
+  MDLog *mdlog;
+  std::ostream *ss;       // receives human-readable error messages
+  Context *on_finish;     // completed with the final result in finish()
+
+  // so as to use dout
+  mds_rank_t whoami;
+  int incarnation;
+};
+
+// Multi-stage context that recalls client state, flushes the journal, trims
+// the cache and finally dumps cache status.  Every stage writes a section
+// into the supplied Formatter (inside a top-level "result" object) and
+// on_finish is completed from finish() with the overall return code.
+class C_Drop_Cache : public MDSInternalContext {
+public:
+  // recall_timeout bounds the client-recall stage in seconds; 0 means no
+  // limit.  The recall clock starts at construction time.
+  C_Drop_Cache(Server *server, MDCache *mdcache, MDLog *mdlog,
+               MDSRank *mds, uint64_t recall_timeout,
+               Formatter *f, Context *on_finish)
+    : MDSInternalContext(mds),
+      server(server), mdcache(mdcache), mdlog(mdlog),
+      recall_timeout(recall_timeout), recall_start(mono_clock::now()),
+      f(f), on_finish(on_finish),
+      whoami(mds->whoami), incarnation(mds->incarnation) {
+  }
+
+  // Entry point: opens the "result" section and starts the recall stage.
+  void send() {
+    // not really a hard requirement here, but lets ensure this in
+    // case we change the logic here.
+    ceph_assert(ceph_mutex_is_locked(mds->mds_lock));
+
+    dout(20) << __func__ << dendl;
+    f->open_object_section("result");
+    recall_client_state();
+  }
+
+private:
+  // context which completes itself (with -CEPHFS_ETIMEDOUT) after a specified
+  // timeout or when explicitly completed, whichever comes first. Note
+  // that the context does not destroy itself after completion -- it
+  // needs to be explicitly freed.
+  //
+  // A timeout of 0 disables the timer entirely.
+  //
+  // NOTE(review): nothing visible in this class ever frees the instances
+  // created in recall_client_state(); confirm whether that is intended.
+  class C_ContextTimeout : public MDSInternalContext {
+  public:
+    C_ContextTimeout(MDSRank *mds, uint64_t timeout, Context *on_finish)
+      : MDSInternalContext(mds),
+        timeout(timeout),
+        on_finish(on_finish) {
+    }
+    ~C_ContextTimeout() {
+      ceph_assert(timer_task == nullptr);
+    }
+
+    // Arm the timer (no-op when timeout is 0).
+    void start_timer() {
+      if (!timeout) {
+        return;
+      }
+
+      timer_task = new LambdaContext([this](int) {
+          timer_task = nullptr;
+          complete(-CEPHFS_ETIMEDOUT);
+        });
+      mds->timer.add_event_after(timeout, timer_task);
+    }
+
+    // Hand r to on_finish exactly once; later calls find it already taken.
+    void finish(int r) override {
+      Context *ctx = nullptr;
+      {
+        std::lock_guard locker(lock);
+        std::swap(on_finish, ctx);
+      }
+      if (ctx != nullptr) {
+        ctx->complete(r);
+      }
+    }
+    // Overridden so that completion does not delete this object.
+    void complete(int r) override {
+      if (timer_task != nullptr) {
+        mds->timer.cancel_event(timer_task);
+        // the event has been handed back to the timer; do not keep a
+        // pointer to it around
+        timer_task = nullptr;
+      }
+
+      finish(r);
+    }
+
+    uint64_t timeout;
+    ceph::mutex lock = ceph::make_mutex("mds::context::timeout");
+    Context *on_finish = nullptr;
+    Context *timer_task = nullptr;
+  };
+
+  // One trim pass over the cache; accumulates the trimmed count and returns
+  // (throttled, count) as reported by MDCache::trim().
+  auto do_trim() {
+    auto [throttled, count] = mdcache->trim(UINT64_MAX);
+    dout(10) << __func__
+             << (throttled ? " (throttled)" : "")
+             << " trimmed " << count << " caps" << dendl;
+    dentries_trimmed += count;
+    return std::make_pair(throttled, count);
+  }
+
+  // Recall stage.  Re-arms itself (at most one second apart) while recalls
+  // are throttled or still making progress and the timeout has not passed;
+  // otherwise waits for the outstanding recalls, bounded by what is left of
+  // recall_timeout.
+  void recall_client_state() {
+    dout(20) << __func__ << dendl;
+    auto now = mono_clock::now();
+    auto duration = std::chrono::duration<double>(now-recall_start).count();
+
+    MDSGatherBuilder gather(g_ceph_context);
+    auto flags = Server::RecallFlags::STEADY|Server::RecallFlags::TRIM;
+    auto [throttled, count] = server->recall_client_state(&gather, flags);
+    dout(10) << __func__
+             << (throttled ? " (throttled)" : "")
+             << " recalled " << count << " caps" << dendl;
+
+    caps_recalled += count;
+    if ((throttled || count > 0) && (recall_timeout == 0 || duration < recall_timeout)) {
+      C_ContextTimeout *ctx = new C_ContextTimeout(
+        mds, 1, new LambdaContext([this](int) {
+          recall_client_state();
+      }));
+      ctx->start_timer();
+      gather.set_finisher(new MDSInternalContextWrapper(mds, ctx));
+      gather.activate();
+      mdlog->flush(); /* use down-time to incrementally flush log */
+      do_trim(); /* use down-time to incrementally trim cache */
+    } else {
+      if (!gather.has_subs()) {
+        return handle_recall_client_state(0);
+      } else if (recall_timeout > 0 && duration >= recall_timeout) {
+        // out of time: let the outstanding recalls finish on their own
+        gather.set_finisher(new C_MDSInternalNoop);
+        gather.activate();
+        return handle_recall_client_state(-CEPHFS_ETIMEDOUT);
+      } else {
+        // Whole seconds left in the recall window.  A sub-second remainder
+        // must not truncate to 0, because C_ContextTimeout treats 0 as
+        // "no timeout" and we would then wait without bound.
+        uint64_t remaining = 0;
+        if (recall_timeout > 0) {
+          remaining = static_cast<uint64_t>(recall_timeout - duration);
+          if (remaining == 0) {
+            remaining = 1;
+          }
+        }
+        C_ContextTimeout *ctx = new C_ContextTimeout(
+          mds, remaining, new LambdaContext([this](int r) {
+              handle_recall_client_state(r);
+            }));
+
+        ctx->start_timer();
+        gather.set_finisher(new MDSInternalContextWrapper(mds, ctx));
+        gather.activate();
+      }
+    }
+  }
+
+  // Report the recall outcome and continue with the journal flush.
+  void handle_recall_client_state(int r) {
+    dout(20) << __func__ << ": r=" << r << dendl;
+
+    // client recall section
+    f->open_object_section("client_recall");
+    f->dump_int("return_code", r);
+    f->dump_string("message", cpp_strerror(r));
+    f->dump_int("recalled", caps_recalled);
+    f->close_section();
+
+    // we can still continue after recall timeout
+    flush_journal();
+  }
+
+  // Journal stage: delegate to C_Flush_Journal, which reports errors in ss.
+  void flush_journal() {
+    dout(20) << __func__ << dendl;
+
+    Context *ctx = new LambdaContext([this](int r) {
+        handle_flush_journal(r);
+      });
+
+    C_Flush_Journal *flush_journal = new C_Flush_Journal(mdcache, mdlog, mds, &ss, ctx);
+    flush_journal->send();
+  }
+
+  // A failed flush aborts the sequence; success continues with the trim.
+  void handle_flush_journal(int r) {
+    dout(20) << __func__ << ": r=" << r << dendl;
+
+    if (r != 0) {
+      cmd_err(f, ss.str());
+      complete(r);
+      return;
+    }
+
+    // journal flush section
+    f->open_object_section("flush_journal");
+    f->dump_int("return_code", r);
+    f->dump_string("message", ss.str());
+    f->close_section();
+
+    trim_cache();
+  }
+
+  // Trim stage: retry after a second while the trim is throttled but still
+  // making progress, otherwise report.
+  void trim_cache() {
+    dout(20) << __func__ << dendl;
+
+    auto [throttled, count] = do_trim();
+    if (throttled && count > 0) {
+      auto timer = new LambdaContext([this](int) {
+        trim_cache();
+      });
+      mds->timer.add_event_after(1.0, timer);
+    } else {
+      cache_status();
+    }
+  }
+
+  // Last stage: dump the trim total and the cache status, then complete.
+  void cache_status() {
+    dout(20) << __func__ << dendl;
+
+    f->open_object_section("trim_cache");
+    f->dump_int("trimmed", dentries_trimmed);
+    f->close_section();
+
+    // cache status section
+    mdcache->cache_status(f);
+
+    complete(0);
+  }
+
+  // Adds the total duration, closes the "result" section opened by send()
+  // (or re-opened by cmd_err()) and notifies on_finish.
+  void finish(int r) override {
+    dout(20) << __func__ << ": r=" << r << dendl;
+
+    auto d = std::chrono::duration<double>(mono_clock::now()-recall_start);
+    f->dump_float("duration", d.count());
+
+    f->close_section();
+    on_finish->complete(r);
+  }
+
+  Server *server;
+  MDCache *mdcache;
+  MDLog *mdlog;
+  uint64_t recall_timeout;
+  mono_time recall_start;
+  Formatter *f;
+  Context *on_finish;
+
+  std::stringstream ss;
+  uint64_t caps_recalled = 0;
+  uint64_t dentries_trimmed = 0;
+
+  // so as to use dout
+  mds_rank_t whoami;
+  int incarnation;
+
+  // Discard whatever has been formatted so far and start a fresh "result"
+  // object holding only the error text.  The section is deliberately left
+  // open: finish() appends the duration and closes it, keeping the
+  // open/close calls balanced on the error path too.
+  void cmd_err(Formatter *f, std::string_view err) {
+    f->reset();
+    f->open_object_section("result");
+    f->dump_string("error", err);
+  }
+};
+
+// Construct the rank: bind the daemon-owned references (lock, timer, beacon,
+// mdsmap, ...), create the Objecter, and then build the per-rank subsystems
+// (finisher, cache, log, balancer, scrub stack, tables, server, locker).
+// The respawn/suicide hooks are stored here and deleted in the destructor.
+MDSRank::MDSRank(
+    mds_rank_t whoami_,
+    std::string fs_name_,
+    ceph::fair_mutex &mds_lock_,
+    LogChannelRef &clog_,
+    CommonSafeTimer<ceph::fair_mutex> &timer_,
+    Beacon &beacon_,
+    std::unique_ptr<MDSMap>& mdsmap_,
+    Messenger *msgr,
+    MonClient *monc_,
+    MgrClient *mgrc,
+    Context *respawn_hook_,
+    Context *suicide_hook_,
+    boost::asio::io_context& ioc) :
+    cct(msgr->cct), mds_lock(mds_lock_), clog(clog_),
+    timer(timer_), mdsmap(mdsmap_),
+    objecter(new Objecter(g_ceph_context, msgr, monc_, ioc)),
+    damage_table(whoami_), sessionmap(this),
+    op_tracker(g_ceph_context, g_conf()->mds_enable_op_tracker,
+               g_conf()->osd_num_op_tracker_shard),
+    progress_thread(this), whoami(whoami_), fs_name(fs_name_),
+    // the purge queue reports write errors back to us; the callback takes
+    // mds_lock itself before calling handle_write_error()
+    purge_queue(g_ceph_context, whoami_,
+      mdsmap_->get_metadata_pool(), objecter,
+      new LambdaContext([this](int r) {
+	  std::lock_guard l(mds_lock);
+	  handle_write_error(r);
+	}
+      )
+    ),
+    metrics_handler(cct, this),
+    beacon(beacon_),
+    messenger(msgr), monc(monc_), mgrc(mgrc),
+    respawn_hook(respawn_hook_),
+    suicide_hook(suicide_hook_),
+    starttime(mono_clock::now()),
+    ioc(ioc)
+{
+  // register the constructing thread with the heartbeat map
+  hb = g_ceph_context->get_heartbeat_map()->add_worker("MDSRank", pthread_self());
+
+  // The metadata pool won't change in the whole life time
+  // of the fs, with this we can get rid of the mds_lock
+  // in many places too.
+  metadata_pool = mdsmap->get_metadata_pool();
+
+  purge_queue.update_op_limit(*mdsmap);
+
+  objecter->unset_honor_pool_full();
+
+  finisher = new Finisher(cct, "MDSRank", "MR_Finisher");
+
+  mdcache = new MDCache(this, purge_queue);
+  mdlog = new MDLog(this);
+  balancer = new MDBalancer(this, messenger, monc);
+
+  scrubstack = new ScrubStack(mdcache, clog, finisher);
+
+  inotable = new InoTable(this);
+  snapserver = new SnapServer(this, monc);
+  snapclient = new SnapClient(this);
+
+  server = new Server(this, &metrics_handler);
+  locker = new Locker(this, mdcache);
+
+  // cache config values used by the heartbeat and op-tracker paths
+  _heartbeat_reset_grace = g_conf().get_val<uint64_t>("mds_heartbeat_reset_grace");
+  heartbeat_grace = g_conf().get_val<double>("mds_heartbeat_grace");
+  op_tracker.set_complaint_and_threshold(cct->_conf->mds_op_complaint_time,
+                                         cct->_conf->mds_op_log_threshold);
+  op_tracker.set_history_size_and_duration(cct->_conf->mds_op_history_size,
+                                           cct->_conf->mds_op_history_duration);
+
+  schedule_update_timer_task();
+}
+
+// Tear down everything the constructor created, in the same order as
+// before: heartbeat worker, subsystems, perf counters, finisher, hooks and
+// finally the objecter.
+MDSRank::~MDSRank()
+{
+  // `delete` on a null pointer is a no-op, so no per-member null checks
+  // are needed; every pointer is cleared right after it is freed.
+  auto destroy = [](auto *&ptr) {
+    delete ptr;
+    ptr = nullptr;
+  };
+
+  if (hb != nullptr) {
+    g_ceph_context->get_heartbeat_map()->remove_worker(hb);
+  }
+
+  destroy(scrubstack);
+  destroy(mdcache);
+  destroy(mdlog);
+  destroy(balancer);
+  destroy(inotable);
+  destroy(snapserver);
+  destroy(snapclient);
+
+  destroy(server);
+  destroy(locker);
+
+  // perf counters must be unregistered from the collection before deletion
+  if (logger != nullptr) {
+    g_ceph_context->get_perfcounters_collection()->remove(logger);
+    destroy(logger);
+  }
+  if (mlogger != nullptr) {
+    g_ceph_context->get_perfcounters_collection()->remove(mlogger);
+    destroy(mlogger);
+  }
+
+  destroy(finisher);
+  destroy(suicide_hook);
+  destroy(respawn_hook);
+  destroy(objecter);
+}
+
+// Bring the rank's runtime pieces up: objecter (registered at the head of
+// the messenger's dispatchers), logging, the initial OSDMap handling, the
+// progress thread, the purge queue and the finisher.
+void MDSRankDispatcher::init()
+{
+  objecter->init();
+  messenger->add_dispatcher_head(objecter);
+
+  objecter->start();
+
+  update_log_config();
+  create_logger();
+
+  // Expose the OSDMap (already populated during MDS::init) to anyone
+  // who is interested in it.
+  handle_osd_map();
+
+  progress_thread.create("mds_rank_progr");
+
+  purge_queue.init();
+
+  finisher->start();
+}
+
+// Reconcile our locally tracked export targets with the set recorded in the
+// MDSMap: drop targets whose counter has decayed to (almost) nothing and
+// send the monitor an updated list whenever the two views differ.
+void MDSRank::update_targets()
+{
+  // get MonMap's idea of my export_targets
+  const set<mds_rank_t>& map_targets = mdsmap->get_mds_info(get_nodeid()).export_targets;
+
+  dout(20) << "updating export targets, currently " << map_targets.size() << " ranks are targets" << dendl;
+
+  bool need_update = false;
+  set<mds_rank_t> live_targets;
+
+  for (auto p = export_targets.begin(); p != export_targets.end(); ) {
+    const mds_rank_t rank = p->first;
+    auto &counter = p->second;
+    dout(20) << "export target mds." << rank << " is " << counter << dendl;
+
+    const double val = counter.get();
+    if (val <= 0.01) {
+      // decayed away: forget this target and tell the monitor
+      dout(15) << "export target mds." << rank << " is no longer an export target" << dendl;
+      p = export_targets.erase(p);
+      need_update = true;
+      continue;
+    }
+
+    if (map_targets.count(rank) == 0) {
+      dout(15) << "export target mds." << rank << " not in map's export_targets" << dendl;
+      need_update = true;
+    }
+    live_targets.insert(rank);
+    ++p;
+  }
+
+  if (live_targets.size() < map_targets.size()) {
+    dout(15) << "export target map holds stale targets, sending update" << dendl;
+    need_update = true;
+  }
+
+  if (!need_update) {
+    return;
+  }
+
+  dout(15) << "updating export_targets, now " << live_targets.size() << " ranks are targets" << dendl;
+  auto m = make_message<MMDSLoadTargets>(mds_gid_t(monc->get_global_id()), live_targets);
+  monc->send_mon_message(m.detach());
+}
+
+// Bump the decay counter of export target `rank` by `amount`, creating the
+// counter on first use.  A negative amount selects a default derived from
+// the configured decay rate.
+void MDSRank::hit_export_target(mds_rank_t rank, double amount)
+{
+  const double rate = g_conf()->mds_bal_target_decay;
+  if (amount < 0.0) {
+    /* a good default for "i am trying to keep this export_target active" */
+    amount = 100.0/rate;
+  }
+
+  auto inserted = export_targets.emplace(std::piecewise_construct,
+                                         std::forward_as_tuple(rank),
+                                         std::forward_as_tuple(DecayRate(rate)));
+  const bool is_new = inserted.second;
+  auto &counter = inserted.first->second;
+  counter.hit(amount);
+
+  if (is_new) {
+    dout(15) << "hit export target (new) is " << counter << dendl;
+  } else {
+    dout(15) << "hit export target is " << counter << dendl;
+  }
+}
+
+// Completion for a monitor command: remembers the command text and, once
+// finished, passes result, command and output to
+// MDSRank::_mon_command_finish().
+class C_MDS_MonCommand : public MDSInternalContext {
+  std::string cmd;
+public:
+  // Output buffer for the command; callers pass its address to the code
+  // that runs the command.
+  std::string outs;
+
+  C_MDS_MonCommand(MDSRank *rank, std::string_view command)
+    : MDSInternalContext(rank), cmd(command) {}
+
+  void finish(int r) override {
+    mds->_mon_command_finish(r, cmd, outs);
+  }
+};
+
+// Log the outcome of a monitor command: success at level 1, failure (with
+// the returned errno and output text) at level 0.
+void MDSRank::_mon_command_finish(int r, std::string_view cmd, std::string_view outs)
+{
+  if (r >= 0) {
+    dout(1) << __func__ << ": mon command " << cmd << " succeed" << dendl;
+    return;
+  }
+
+  dout(0) << __func__ << ": mon command " << cmd << " failed with errno " << r
+          << " (" << outs << ")" << dendl;
+}
+
+// Send the monitors an "fs set <fs> allow_multimds_snaps true" command.
+// A function-local static ensures the command is sent at most once per
+// process.
+void MDSRank::set_mdsmap_multimds_snaps_allowed()
+{
+  static bool already_sent = false;
+  if (already_sent) {
+    return;
+  }
+
+  // hand-assembled JSON command
+  CachedStackStringStream css;
+  *css << "{\"prefix\":\"fs set\", \"fs_name\":\"" <<  mdsmap->get_fs_name() << "\", "
+       << "\"var\":\"allow_multimds_snaps\", \"val\":\"true\", "
+       << "\"confirm\":\"--yes-i-am-really-a-mds\"}";
+  std::vector<std::string> cmd = {css->str()};
+
+  dout(0) << __func__ << ": sending mon command: " << cmd[0] << dendl;
+
+  // the completion owns the output string that start_mon_command fills in
+  auto *fin = new C_MDS_MonCommand(this, cmd[0]);
+  monc->start_mon_command(cmd, {}, nullptr, &fin->outs, new C_IO_Wrapper(this, fin));
+
+  already_sent = true;
+}
+
+// Upkeep pass over the rank's subsystems.  Resets the heartbeat, then --
+// unless the beacon reports us laggy -- flushes/trims the log, services
+// sessions and locks, runs balancer and snap-table work when active, drives
+// the shutdown sequence when stopping, and finally refreshes the health
+// information held by the beacon.
+void MDSRankDispatcher::tick()
+{
+  heartbeat_reset();
+
+  if (beacon.is_laggy()) {
+    dout(1) << "skipping upkeep work because connection to Monitors appears laggy" << dendl;
+    return;
+  }
+
+  check_ops_in_flight();
+
+  // Wake up thread in case we used to be laggy and have waiting_for_nolaggy
+  // messages to progress.
+  progress_thread.signal();
+
+  // make sure mds log flushes, trims periodically
+  mdlog->flush();
+
+  // update average session uptime
+  sessionmap.update_average_session_age();
+
+  if (is_active() || is_stopping()) {
+    mdlog->trim();  // NOT during recovery!
+  }
+
+  // session and lock upkeep
+  if (is_clientreplay() || is_active() || is_stopping()) {
+    server->find_idle_sessions();
+    server->evict_cap_revoke_non_responders();
+    locker->tick();
+  }
+
+  // log
+  if (logger) {
+    logger->set(l_mds_subtrees, mdcache->num_subtrees());
+    mdcache->log_stat();
+  }
+
+  if (is_reconnect())
+    server->reconnect_tick();
+
+  if (is_active()) {
+    balancer->tick();
+    mdcache->find_stale_fragment_freeze();
+    mdcache->migrator->find_stale_export_freeze();
+
+    // only the rank acting as table server does the snap server checks
+    if (mdsmap->get_tableserver() == whoami) {
+      snapserver->check_osd_map(false);
+      // Filesystem was created by pre-mimic mds. Allow multi-active mds after
+      // all old snapshots are deleted.
+      if (!mdsmap->allows_multimds_snaps() &&
+	  snapserver->can_allow_multimds_snaps()) {
+	set_mdsmap_multimds_snaps_allowed();
+      }
+    }
+
+    // scrub status is advanced by rank 0 only
+    if (whoami == 0)
+      scrubstack->advance_scrub_status();
+  }
+
+  if (is_active() || is_stopping()) {
+    update_targets();
+  }
+
+  // shut down?
+  if (is_stopping()) {
+    // NOTE(review): the log was already trimmed above for the stopping
+    // state; this second trim looks deliberate but is worth confirming.
+    mdlog->trim();
+    if (mdcache->shutdown_pass()) {
+      uint64_t pq_progress = 0 ;
+      uint64_t pq_total = 0;
+      size_t pq_in_flight = 0;
+      if (!purge_queue.drain(&pq_progress, &pq_total, &pq_in_flight)) {
+        dout(7) << "shutdown_pass=true, but still waiting for purge queue"
+                << dendl;
+        // This takes unbounded time, so we must indicate progress
+        // to the administrator: we do it in a slightly imperfect way
+        // by sending periodic (tick frequency) clog messages while
+        // in this state.
+        clog->info() << "MDS rank " << whoami << " waiting for purge queue ("
+          << std::dec << pq_progress << "/" << pq_total << " " << pq_in_flight
+          << " files purging" << ")";
+      } else {
+        dout(7) << "shutdown_pass=true, finished w/ shutdown, moving to "
+                   "down:stopped" << dendl;
+        stopping_done();
+      }
+    }
+    else {
+      dout(7) << "shutdown_pass=false" << dendl;
+    }
+  }
+
+  // Expose ourselves to Beacon to update health indicators
+  beacon.notify_health(this);
+}
+
+// Stop the rank: mark it stopping, then shut the subsystems down in
+// dependency order.  mds_lock is dropped around finisher->stop() and
+// messenger->shutdown() and re-acquired before returning.
+void MDSRankDispatcher::shutdown()
+{
+  // It should never be possible for shutdown to get called twice, because
+  // anyone picking up mds_lock checks if stopping is true and drops
+  // out if it is.
+  ceph_assert(stopping == false);
+  stopping = true;
+
+  dout(1) << __func__ << ": shutting down rank " << whoami << dendl;
+
+  g_conf().remove_observer(this);
+
+  timer.shutdown();
+
+  // MDLog has to shut down before the finisher, because some of its
+  // threads block on IOs that require finisher to complete.
+  mdlog->shutdown();
+
+  // shut down cache
+  mdcache->shutdown();
+
+  purge_queue.shutdown();
+
+  // shutdown metrics handler/updater -- this is ok even if it was not
+  // inited.
+  metrics_handler.shutdown();
+
+  // shutdown metric aggregator
+  if (metric_aggregator != nullptr) {
+    metric_aggregator->shutdown();
+  }
+
+  // stop the finisher without holding mds_lock
+  mds_lock.unlock();
+  finisher->stop(); // no flushing
+  mds_lock.lock();
+
+  if (objecter->initialized)
+    objecter->shutdown();
+
+  monc->shutdown();
+
+  op_tracker.on_shutdown();
+
+  progress_thread.shutdown();
+
+  // release mds_lock for finisher/messenger threads (e.g.
+  // MDSDaemon::ms_handle_reset called from Messenger).
+  mds_lock.unlock();
+
+  // shut down messenger
+  messenger->shutdown();
+
+  mds_lock.lock();
+
+  // Workaround unclean shutdown: HeartbeatMap will assert if
+  // worker is not removed (as we do in ~MDS), but ~MDS is not
+  // always called after suicide.
+  if (hb) {
+    g_ceph_context->get_heartbeat_map()->remove_worker(hb);
+    hb = NULL;
+  }
+}
+
+/**
+ * Context adaptor for the simplest kind of callback: on completion it
+ * invokes a no-argument, void-returning MDSRank member function.  The
+ * completion code is ignored.
+ */
+class C_MDS_VoidFn : public MDSInternalContext
+{
+  using fn_ptr = void (MDSRank::*)();
+  protected:
+   fn_ptr fn;
+  public:
+  C_MDS_VoidFn(MDSRank *mds_, fn_ptr fn_)
+    : MDSInternalContext(mds_), fn(fn_)
+  {
+    // both the target object and the member pointer must be set
+    ceph_assert(mds_);
+    ceph_assert(fn_);
+  }
+
+  void finish(int) override
+  {
+    (mds->*fn)();
+  }
+};
+
+// Map a table id to its client.  Only the snap table has one;
+// TABLE_ANCHOR yields a null pointer and any other id aborts.
+MDSTableClient *MDSRank::get_table_client(int t)
+{
+  if (t == TABLE_SNAP) {
+    return snapclient;
+  }
+  if (t == TABLE_ANCHOR) {
+    return nullptr;
+  }
+  ceph_abort();
+}
+
+// Map a table id to its server.  Only the snap table has one;
+// TABLE_ANCHOR yields a null pointer and any other id aborts.
+MDSTableServer *MDSRank::get_table_server(int t)
+{
+  if (t == TABLE_SNAP) {
+    return snapserver;
+  }
+  if (t == TABLE_ANCHOR) {
+    return nullptr;
+  }
+  ceph_abort();
+}
+
+// Fire the suicide hook once; after that the pointer is cleared and
+// further calls do nothing.
+void MDSRank::suicide()
+{
+  if (suicide_hook == nullptr) {
+    return;
+  }
+  suicide_hook->complete(0);
+  suicide_hook = nullptr;
+}
+
+// Fire the respawn hook once; after that the pointer is cleared and
+// further calls do nothing.
+void MDSRank::respawn()
+{
+  if (respawn_hook == nullptr) {
+    return;
+  }
+  respawn_hook->complete(0);
+  respawn_hook = nullptr;
+}
+
+// Report this rank as DAMAGED to the monitors and respawn.  Must be called
+// with mds_lock held by the calling thread and with a valid rank.
+void MDSRank::damaged()
+{
+  ceph_assert(whoami != MDS_RANK_NONE);
+  ceph_assert(ceph_mutex_is_locked_by_me(mds_lock));
+
+  beacon.set_want_state(*mdsmap, MDSMap::STATE_DAMAGED);
+  monc->flush_log();  // Flush any clog error from before we were called
+  beacon.notify_health(this);  // Include latest status in our swan song
+  // send the beacon, bounded by mds_mon_shutdown_timeout
+  beacon.send_and_wait(g_conf()->mds_mon_shutdown_timeout);
+
+  // It's okay if we timed out and the mon didn't get our beacon, because
+  // another daemon (or ourselves after respawn) will eventually take the
+  // rank and report DAMAGED again when it hits same problem we did.
+
+  respawn();  // Respawn into standby in case mon has other work for us
+}
+
+// Variant of damaged() for callers that do not hold mds_lock: takes the
+// lock for the duration of the call.
+void MDSRank::damaged_unlocked()
+{
+  std::scoped_lock locker(mds_lock);
+  damaged();
+}
+
+// React to a failed write.  Being blocklisted always leads to a respawn;
+// any other error is handled according to mds_action_on_write_error:
+//   >= 2  respawn, 1  force the cache read-only, otherwise just log.
+void MDSRank::handle_write_error(int err)
+{
+  if (err == -CEPHFS_EBLOCKLISTED) {
+    derr << "we have been blocklisted (fenced), respawning..." << dendl;
+    respawn();
+    return;
+  }
+
+  if (g_conf()->mds_action_on_write_error >= 2) {
+    // NOTE(review): the message says "suicide" but the action taken is
+    // respawn(); left as-is because the text may be matched by tooling.
+    derr << "unhandled write error " << cpp_strerror(err) << ", suicide..." << dendl;
+    respawn();
+    return;
+  }
+
+  if (g_conf()->mds_action_on_write_error == 1) {
+    derr << "unhandled write error " << cpp_strerror(err) << ", force readonly..." << dendl;
+    mdcache->force_readonly();
+    return;
+  }
+
+  // ignore;
+  derr << "unhandled write error " << cpp_strerror(err) << ", ignore..." << dendl;
+}
+
+// Same as handle_write_error(), for callers that do not hold mds_lock.
+void MDSRank::handle_write_error_with_lock(int err)
+{
+  std::lock_guard locker(mds_lock);
+  handle_write_error(err);
+}
+
+// Thread body: under mds_lock, sleep until there is queued work (finished
+// contexts, or deferred messages while not laggy) or the rank is stopping;
+// drain the queues and repeat until stopping.
+void *MDSRank::ProgressThread::entry()
+{
+  std::unique_lock l(mds->mds_lock);
+
+  for (;;) {
+    cond.wait(l, [this] {
+      if (mds->stopping || !mds->finished_queue.empty()) {
+        return true;
+      }
+      return !mds->waiting_for_nolaggy.empty() && !mds->beacon.is_laggy();
+    });
+
+    if (mds->stopping) {
+      return nullptr;
+    }
+
+    mds->_advance_queues();
+  }
+}
+
+
+// Stop the progress thread.  Requires mds_lock to be held by the caller and
+// mds->stopping to be set already.
+void MDSRank::ProgressThread::shutdown()
+{
+  ceph_assert(ceph_mutex_is_locked_by_me(mds->mds_lock));
+  ceph_assert(mds->stopping);
+
+  if (am_self()) {
+    // Stopping is set, we will fall out of our main loop naturally
+    return;
+  }
+
+  // Kick the thread to notice mds->stopping, and join it.  The lock is
+  // released while joining and re-taken afterwards.
+  cond.notify_all();
+  mds->mds_lock.unlock();
+  if (is_started()) {
+    join();
+  }
+  mds->mds_lock.lock();
+}
+
+// Messenger entry point.  Messages from other MDSs must be MMDSOp
+// instances (asserted); messages from clients refresh the session's
+// last_seen stamp.  The message is then passed to _dispatch() with the
+// dispatch depth raised for the duration of the call.
+bool MDSRankDispatcher::ms_dispatch(const cref_t<Message> &m)
+{
+  if (m->get_source().is_mds()) {
+    const Message *msg = m.get();
+    const auto *op = dynamic_cast<const MMDSOp*>(msg);
+    if (op == nullptr) {
+      dout(0) << typeid(*msg).name() << " is not an MMDSOp type" << dendl;
+    }
+    ceph_assert(op);
+  } else if (m->get_source().is_client()) {
+    auto *session = static_cast<Session*>(m->get_connection()->get_priv().get());
+    if (session != nullptr) {
+      session->last_seen = Session::clock::now();
+    }
+  }
+
+  inc_dispatch_depth();
+  const bool handled = _dispatch(m, true);
+  dec_dispatch_depth();
+  return handled;
+}
+
+// Core dispatch routine.
+//
+// Returns true when the message was consumed (handled, deferred or dropped
+// as stale) and false when its type is not one this rank handles.  new_msg
+// is true for messages arriving from the messenger; such messages are
+// queued behind any already-deferred ones to preserve ordering.
+//
+// Only the outermost dispatch (dispatch_depth <= 1) goes on to drain the
+// queues and run the optional export/fragment thrashing hooks.
+bool MDSRank::_dispatch(const cref_t<Message> &m, bool new_msg)
+{
+  if (is_stale_message(m)) {
+    return true;
+  }
+  // do not proceed if this message cannot be handled
+  if (!is_valid_message(m)) {
+    return false;
+  }
+
+  if (beacon.is_laggy()) {
+    dout(5) << " laggy, deferring " << *m << dendl;
+    waiting_for_nolaggy.push_back(m);
+  } else if (new_msg && !waiting_for_nolaggy.empty()) {
+    dout(5) << " there are deferred messages, deferring " << *m << dendl;
+    waiting_for_nolaggy.push_back(m);
+  } else {
+    handle_message(m);
+    heartbeat_reset();
+  }
+
+  // nested dispatch: leave the housekeeping to the outermost call
+  if (dispatch_depth > 1)
+    return true;
+
+  // finish any triggered contexts
+  _advance_queues();
+
+  if (beacon.is_laggy()) {
+    // We've gone laggy during dispatch, don't do any
+    // more housekeeping
+    return true;
+  }
+
+  // hack: thrash exports
+  // NOTE(review): `start` and `now` are only consumed by the commented-out
+  // elapsed-time check below.
+  static utime_t start;
+  utime_t now = ceph_clock_now();
+  if (start == utime_t())
+    start = now;
+  /*double el = now - start;
+  if (el > 30.0 &&
+    el < 60.0)*/
+  for (int i=0; i<g_conf()->mds_thrash_exports; i++) {
+    set<mds_rank_t> s;
+    if (!is_active()) break;
+    mdsmap->get_mds_set(s, MDSMap::STATE_ACTIVE);
+    if (s.size() < 2 || CInode::count() < 10)
+      break;  // need peers for this to work.
+    // don't pile up more exports than the thrash setting allows
+    if (mdcache->migrator->get_num_exporting() > g_conf()->mds_thrash_exports * 5 ||
+	mdcache->migrator->get_export_queue_size() > g_conf()->mds_thrash_exports * 10)
+      break;
+
+    dout(7) << "mds thrashing exports pass " << (i+1) << "/" << g_conf()->mds_thrash_exports << dendl;
+
+    // pick a random dir inode
+    // NOTE(review): the result is dereferenced unchecked -- presumably
+    // never null once CInode::count() >= 10; confirm.
+    CInode *in = mdcache->hack_pick_random_inode();
+
+    auto&& ls = in->get_dirfrags();
+    if (!ls.empty()) {	// must be an open dir.
+      const auto& dir = ls[rand() % ls.size()];
+      if (!dir->get_parent_dir()) continue;    // must be linked.
+      if (!dir->is_auth()) continue;           // must be auth.
+
+      // pick a random active rank other than ourselves; terminates because
+      // the set holds at least two ranks here
+      mds_rank_t dest;
+      do {
+        int k = rand() % s.size();
+        set<mds_rank_t>::iterator p = s.begin();
+        while (k--) ++p;
+        dest = *p;
+      } while (dest == whoami);
+      mdcache->migrator->export_dir_nicely(dir,dest);
+    }
+  }
+  // hack: thrash fragments
+  for (int i=0; i<g_conf()->mds_thrash_fragments; i++) {
+    if (!is_active()) break;
+    if (mdcache->get_num_fragmenting_dirs() > 5 * g_conf()->mds_thrash_fragments) break;
+    dout(7) << "mds thrashing fragments pass " << (i+1) << "/" << g_conf()->mds_thrash_fragments << dendl;
+
+    // pick a random dir inode
+    CInode *in = mdcache->hack_pick_random_inode();
+
+    auto&& ls = in->get_dirfrags();
+    if (ls.empty()) continue;                // must be an open dir.
+    CDir *dir = ls.front();
+    if (!dir->get_parent_dir()) continue;    // must be linked.
+    if (!dir->is_auth()) continue;           // must be auth.
+    frag_t fg = dir->get_frag();
+    // always split the root frag; deeper frags are split with probability
+    // 1 / 2^bits and queued for merging otherwise
+    if ((fg == frag_t() || (rand() % (1 << fg.bits()) == 0))) {
+      mdcache->split_dir(dir, 1);
+    } else {
+      balancer->queue_merge(dir);
+    }
+  }
+
+  // hack: force hash root?
+  /*
+  if (false &&
+      mdcache->get_root() &&
+      mdcache->get_root()->dir &&
+      !(mdcache->get_root()->dir->is_hashed() ||
+        mdcache->get_root()->dir->is_hashing())) {
+    dout(0) << "hashing root" << dendl;
+    mdcache->migrator->hash_dir(mdcache->get_root()->dir);
+  }
+  */
+
+  update_mlogger();
+  return true;
+}
+
+// Publish the current object counts (and cumulative increment/decrement
+// totals) for inodes, dirs, dentries and capabilities to the memory perf
+// counters, if those exist.
+void MDSRank::update_mlogger()
+{
+  if (!mlogger) {
+    return;
+  }
+
+  // live counts
+  mlogger->set(l_mdm_ino, CInode::count());
+  mlogger->set(l_mdm_dir, CDir::count());
+  mlogger->set(l_mdm_dn, CDentry::count());
+  mlogger->set(l_mdm_cap, Capability::count());
+
+  // cumulative increments / decrements
+  mlogger->set(l_mdm_inoa, CInode::increments());
+  mlogger->set(l_mdm_inos, CInode::decrements());
+  mlogger->set(l_mdm_dira, CDir::increments());
+  mlogger->set(l_mdm_dirs, CDir::decrements());
+  mlogger->set(l_mdm_dna, CDentry::increments());
+  mlogger->set(l_mdm_dns, CDentry::decrements());
+  mlogger->set(l_mdm_capa, Capability::increments());
+  mlogger->set(l_mdm_caps, Capability::decrements());
+}
+
+// Tell whether the rank knows how to handle this message: anything on the
+// cache or migrator port, plus an explicit list of message types.
+bool MDSRank::is_valid_message(const cref_t<Message> &m) {
+  const int type = m->get_type();
+  const int port = type & 0xff00;
+
+  if (port == MDS_PORT_CACHE || port == MDS_PORT_MIGRATOR) {
+    return true;
+  }
+
+  switch (type) {
+  case CEPH_MSG_CLIENT_SESSION:
+  case CEPH_MSG_CLIENT_RECONNECT:
+  case CEPH_MSG_CLIENT_RECLAIM:
+  case CEPH_MSG_CLIENT_REQUEST:
+  case MSG_MDS_PEER_REQUEST:
+  case MSG_MDS_HEARTBEAT:
+  case MSG_MDS_TABLE_REQUEST:
+  case MSG_MDS_LOCK:
+  case MSG_MDS_INODEFILECAPS:
+  case MSG_MDS_SCRUB:
+  case MSG_MDS_SCRUB_STATS:
+  case CEPH_MSG_CLIENT_CAPS:
+  case CEPH_MSG_CLIENT_CAPRELEASE:
+  case CEPH_MSG_CLIENT_LEASE:
+    return true;
+  default:
+    return false;
+  }
+}
+
+/*
+ * lower priority messages we defer if we seem laggy
+ */
+
+/*
+ * ALLOW_MESSAGES_FROM(peers): peer-type filter for use inside a void
+ * function that has the message `m` in scope.  If the message has a
+ * connection whose peer type shares no bit with `peers`, the message is
+ * logged at level 0 and the enclosing function returns.  Messages without
+ * a connection pass the check.
+ */
+#define ALLOW_MESSAGES_FROM(peers)                                      \
+  do {                                                                  \
+    if (m->get_connection() && (m->get_connection()->get_peer_type() & (peers)) == 0) { \
+      dout(0) << __FILE__ << "." << __LINE__ << ": filtered out request, peer=" << m->get_connection()->get_peer_type() \
+              << " allowing=" << #peers << " message=" << *m << dendl;  \
+      return;                                                           \
+    }                                                                   \
+  } while (0)
+
+/**
+ * Route a message to the subsystem responsible for its type.
+ *
+ * Each case first checks the connection's peer type with
+ * ALLOW_MESSAGES_FROM; a message from a disallowed peer type is logged and
+ * dropped.  CEPH_MSG_CLIENT_REQUEST is the one type accepted without a
+ * peer-type check.  Unrecognized types are logged and ignored.
+ */
+void MDSRank::handle_message(const cref_t<Message> &m)
+{
+  int port = m->get_type() & 0xff00;
+
+  switch (port) {
+  case MDS_PORT_CACHE:
+    ALLOW_MESSAGES_FROM(CEPH_ENTITY_TYPE_MDS);
+    mdcache->dispatch(m);
+    break;
+
+  case MDS_PORT_MIGRATOR:
+    ALLOW_MESSAGES_FROM(CEPH_ENTITY_TYPE_MDS);
+    mdcache->migrator->dispatch(m);
+    break;
+
+  default:
+    switch (m->get_type()) {
+      // SERVER
+    case CEPH_MSG_CLIENT_SESSION:
+    case CEPH_MSG_CLIENT_RECONNECT:
+    case CEPH_MSG_CLIENT_RECLAIM:
+      ALLOW_MESSAGES_FROM(CEPH_ENTITY_TYPE_CLIENT);
+      [[fallthrough]];
+    case CEPH_MSG_CLIENT_REQUEST:
+      server->dispatch(m);
+      break;
+    case MSG_MDS_PEER_REQUEST:
+      ALLOW_MESSAGES_FROM(CEPH_ENTITY_TYPE_MDS);
+      server->dispatch(m);
+      break;
+
+    case MSG_MDS_HEARTBEAT:
+      ALLOW_MESSAGES_FROM(CEPH_ENTITY_TYPE_MDS);
+      balancer->proc_message(m);
+      break;
+
+    case MSG_MDS_TABLE_REQUEST:
+      ALLOW_MESSAGES_FROM(CEPH_ENTITY_TYPE_MDS);
+      {
+        const cref_t<MMDSTableRequest> &req = ref_cast<MMDSTableRequest>(m);
+        // Negative ops go to the table client, everything else to the
+        // table server.  Both lookups return NULL for TABLE_ANCHOR, so a
+        // request naming that table is dropped instead of dereferencing a
+        // null pointer.
+        if (req->op < 0) {
+          MDSTableClient *table_client = get_table_client(req->table);
+          if (table_client) {
+            table_client->handle_request(req);
+          } else {
+            derr << "no table client for " << *m << ", dropping" << dendl;
+          }
+        } else {
+          MDSTableServer *table_server = get_table_server(req->table);
+          if (table_server) {
+            table_server->handle_request(req);
+          } else {
+            derr << "no table server for " << *m << ", dropping" << dendl;
+          }
+        }
+      }
+      break;
+
+    case MSG_MDS_LOCK:
+    case MSG_MDS_INODEFILECAPS:
+      ALLOW_MESSAGES_FROM(CEPH_ENTITY_TYPE_MDS);
+      locker->dispatch(m);
+      break;
+
+    case CEPH_MSG_CLIENT_CAPS:
+    case CEPH_MSG_CLIENT_CAPRELEASE:
+    case CEPH_MSG_CLIENT_LEASE:
+      ALLOW_MESSAGES_FROM(CEPH_ENTITY_TYPE_CLIENT);
+      locker->dispatch(m);
+      break;
+
+    case MSG_MDS_SCRUB:
+    case MSG_MDS_SCRUB_STATS:
+      ALLOW_MESSAGES_FROM(CEPH_ENTITY_TYPE_MDS);
+      scrubstack->dispatch(m);
+      break;
+
+    default:
+      derr << "unrecognized message " << *m << dendl;
+    }
+  }
+}
+
+/**
+ * Drain finished_queue, then process messages deferred in
+ * waiting_for_nolaggy.
+ *
+ * The deferred-message loop stops early as soon as the beacon reports us
+ * laggy, so that queue may be left non-empty.  Caller must hold mds_lock.
+ */
+void MDSRank::_advance_queues()
+{
+  ceph_assert(ceph_mutex_is_locked_by_me(mds_lock));
+
+  if (!finished_queue.empty()) {
+    dout(7) << "mds has " << finished_queue.size() << " queued contexts" << dendl;
+  }
+  while (!finished_queue.empty()) {
+    auto fin = finished_queue.front();
+    finished_queue.pop_front();
+
+    dout(10) << " finish " << fin << dendl;
+    fin->complete(0);
+
+    heartbeat_reset();
+  }
+
+  // stop if we're laggy now!
+  while (!waiting_for_nolaggy.empty() && !beacon.is_laggy()) {
+    cref_t<Message> deferred = waiting_for_nolaggy.front();
+    waiting_for_nolaggy.pop_front();
+
+    if (!is_stale_message(deferred)) {
+      dout(7) << " processing laggy deferred " << *deferred << dendl;
+      ceph_assert(is_valid_message(deferred));
+      handle_message(deferred);
+    }
+
+    heartbeat_reset();
+  }
+}
+
+/**
+ * Refresh this rank's heartbeat timeout.
+ *
+ * Call this when you take mds_lock, or periodically if you're going to
+ * hold the lock for a long time (e.g. iterating over clients/inodes)
+ */
+void MDSRank::heartbeat_reset()
+{
+  // Any thread might jump into mds_lock and call us immediately
+  // after a call to suicide() completes, in which case MDSRank::hb
+  // has been freed and we are a no-op.
+  if (hb == nullptr) {
+    ceph_assert(stopping);
+    return;
+  }
+
+  // NB not enabling suicide grace, because the mon takes care of killing us
+  // (by blocklisting us) when we fail to send beacons, and it's simpler to
+  // only have one way of dying.
+  const auto grace = ceph::make_timespan(heartbeat_grace);
+  const auto suicide_grace = ceph::timespan::zero();
+  g_ceph_context->get_heartbeat_map()->reset_timeout(hb, grace, suicide_grace);
+}
+
+// Decide whether a message must be dropped because it comes from an MDS
+// that is down or that arrived on a connection other than the one we hold
+// for that rank.  MDS maps are always let through, as are cache-expire
+// messages whose source address still matches the map.
+bool MDSRank::is_stale_message(const cref_t<Message> &m) const
+{
+  // only messages from other MDSs can be stale
+  if (!m->get_source().is_mds()) {
+    return false;
+  }
+
+  // from bad mds?
+  const mds_rank_t from = mds_rank_t(m->get_source().num());
+  bool bad = mdsmap->is_down(from);
+  if (!bad) {
+    // FIXME: this is a convoluted check.  we should be maintaining a nice
+    // clean map of current ConnectionRefs for current mdses!!!
+    auto c = messenger->connect_to(CEPH_ENTITY_TYPE_MDS,
+                                   mdsmap->get_addrs(from));
+    if (c != m->get_connection()) {
+      bad = true;
+      dout(5) << " mds." << from << " should be " << c << " "
+              << c->get_peer_addrs() << " but this message is "
+              << m->get_connection() << " " << m->get_source_addrs()
+              << dendl;
+    }
+  }
+
+  if (!bad) {
+    return false;
+  }
+
+  // bogus mds?
+  if (m->get_type() == CEPH_MSG_MDS_MAP) {
+    dout(5) << "got " << *m << " from old/bad/imposter mds " << m->get_source()
+            << ", but it's an mdsmap, looking at it" << dendl;
+    return false;
+  }
+  if (m->get_type() == MSG_MDS_CACHEEXPIRE &&
+      mdsmap->get_addrs(from) == m->get_source_addrs()) {
+    dout(5) << "got " << *m << " from down mds " << m->get_source()
+            << ", but it's a cache_expire, looking at it" << dendl;
+    return false;
+  }
+
+  dout(5) << "got " << *m << " from down/old/bad/imposter mds " << m->get_source()
+          << ", dropping" << dendl;
+  return true;
+}
+
+/**
+ * Look up the Session attached to the connection a message arrived on.
+ *
+ * When the attached session is closed and the sessionmap holds a different
+ * session for the same entity name (one that was imported in the
+ * meantime), the imported session takes over: it receives the auth name,
+ * connection, queued pre-open messages, auth caps and last_seen of the
+ * attached session and is installed as the connection's priv.
+ *
+ * @param m message whose connection is inspected
+ * @return the session to use (no reference is carried for the caller), or
+ *         a null pointer if the connection has no session attached
+ */
+Session *MDSRank::get_session(const cref_t<Message> &m)
+{
+  // do not carry ref
+  auto session = static_cast<Session *>(m->get_connection()->get_priv().get());
+  if (!session) {
+    dout(20) << "get_session dne for " << m->get_source_inst() << dendl;
+    return session;
+  }
+
+  dout(20) << "get_session have " << session << " " << session->info.inst
+           << " state " << session->get_state_name() << dendl;
+
+  // New sessions start closed, so only a closed session can have been
+  // superseded by an imported, open one.
+  if (!session->is_closed()) {
+    return session;
+  }
+
+  Session *imported = sessionmap.get_session(session->info.inst.name);
+  if (!imported || imported == session) {
+    return session;
+  }
+
+  dout(10) << __func__ << " replacing connection bootstrap session "
+           << session << " with imported session " << imported
+           << dendl;
+  // the auth names are copied rather than asserted to be equal
+  imported->info.auth_name = session->info.auth_name;
+  ceph_assert(session->info.inst == imported->info.inst);
+  imported->set_connection(session->get_connection().get());
+
+  // send out any queued messages
+  auto &queued = session->preopen_out_queue;
+  while (!queued.empty()) {
+    imported->get_connection()->send_message2(std::move(queued.front()));
+    queued.pop_front();
+  }
+
+  imported->auth_caps = session->auth_caps;
+  imported->last_seen = session->last_seen;
+  ceph_assert(session->get_nref() == 1);
+  imported->get_connection()->set_priv(imported->get());
+  return imported;
+}
+
+/**
+ * Send a message over an existing connection.
+ *
+ * @param m message to send
+ * @param c connection to send it on; must not be null
+ */
+void MDSRank::send_message(const ref_t<Message>& m, const ConnectionRef& c)
+{
+  // a null connection is treated as a caller bug
+  ceph_assert(c);
+
+  c->send_message2(m);
+}
+
+/**
+ * Context that calls MDSRank::send_message_mds() again for a stored
+ * message and target rank.  The completion code is ignored.
+ */
+class C_MDS_RetrySendMessageMDS : public MDSInternalContext {
+  mds_rank_t who;      // rank the message is addressed to
+  ref_t<Message> m;    // message to (re)send
+
+public:
+  C_MDS_RetrySendMessageMDS(MDSRank* mds, mds_rank_t who, ref_t<Message> m)
+    : MDSInternalContext(mds),
+      who(who),
+      m(std::move(m))
+  {}
+
+  void finish(int r) override {
+    mds->send_message_mds(m, who);
+  }
+};
+
+
+/**
+ * Send a message to the MDS holding rank @p mds in the current mdsmap.
+ *
+ * - If the rank is not up, the message is dropped.
+ * - If the rank is bootstrapping, the send is deferred: a retry context is
+ *   registered with wait_for_bootstrapped_peer() and calls this function
+ *   again when it fires.
+ * - Otherwise the message is sent to the rank's addresses.  If the target
+ *   is not ourselves and the epoch recorded for it in peer_mdsmap_epoch is
+ *   older than our mdsmap epoch, the current mdsmap is sent first and the
+ *   recorded epoch is updated.
+ *
+ * @param m   message to send
+ * @param mds destination rank
+ */
+void MDSRank::send_message_mds(const ref_t<Message>& m, mds_rank_t mds)
+{
+  if (!mdsmap->is_up(mds)) {
+    dout(10) << "send_message_mds mds." << mds << " not up, dropping " << *m << dendl;
+    return;
+  } else if (mdsmap->is_bootstrapping(mds)) {
+    // note the leading space: without it the function name and "mds." run
+    // together in the log line
+    dout(5) << __func__ << " mds." << mds << " is bootstrapping, deferring " << *m << dendl;
+    wait_for_bootstrapped_peer(mds, new C_MDS_RetrySendMessageMDS(this, mds, m));
+    return;
+  }
+
+  // send mdsmap first?
+  auto addrs = mdsmap->get_addrs(mds);
+  if (mds != whoami && peer_mdsmap_epoch[mds] < mdsmap->get_epoch()) {
+    auto _m = make_message<MMDSMap>(monc->get_fsid(), *mdsmap,
+				    std::string(mdsmap->get_fs_name()));
+    send_message_mds(_m, addrs);
+    peer_mdsmap_epoch[mds] = mdsmap->get_epoch();
+  }
+
+  // send message
+  send_message_mds(m, addrs);
+}
+
+/**
+ * Send a message to an MDS identified by its address vector.
+ *
+ * @param m    message to send
+ * @param addr addresses of the destination MDS
+ */
+void MDSRank::send_message_mds(const ref_t<Message>& m, const entity_addrvec_t &addr)
+{
+  // take a reference of our own and detach it for the messenger; the
+  // caller's reference is left untouched
+  ref_t<Message> extra_ref(m);
+  messenger->send_to_mds(extra_ref.detach(), addr);
+}
+
+/**
+ * Redirect a client request to another rank.
+ *
+ * The request itself is not forwarded; the client is sent an
+ * MClientRequestForward naming the target rank and told to resend.
+ *
+ * @param m   client request to redirect
+ * @param mds rank the client should talk to; must not be our own rank
+ */
+void MDSRank::forward_message_mds(const cref_t<MClientRequest>& m, mds_rank_t mds)
+{
+  ceph_assert(mds != whoami);
+
+  /*
+   * Non-idempotent requests must not actually be forwarded; the client has
+   * to do it.  Although the MDS will ignore duplicate requests, the
+   * affected metadata may migrate, in which case the new authority won't
+   * have the metareq_id in the completed request map.
+   *
+   * NEW: always make the client resend!
+   */
+  bool client_must_resend = true;  //!creq->can_forward();
+
+  // tell the client where it should go
+  Session *session = get_session(m);
+  auto fwd = make_message<MClientRequestForward>(
+    m->get_tid(), mds, m->get_num_fwd()+1, client_must_resend);
+  send_message_client(fwd, session);
+}
+
+/**
+ * Send a counted message to a client identified by its client id.
+ *
+ * The message is only logged, not sent, if the sessionmap has no session
+ * for that client.
+ *
+ * @param m      message to send
+ * @param client id of the destination client
+ */
+void MDSRank::send_message_client_counted(const ref_t<Message>& m, client_t client)
+{
+  Session *session = sessionmap.get_session(entity_name_t::CLIENT(client.v));
+  if (!session) {
+    dout(10) << "send_message_client_counted no session for client." << client << " " << *m << dendl;
+    return;
+  }
+
+  send_message_client_counted(m, session);
+}
+
+/**
+ * Send a counted message to the client session attached to a connection.
+ *
+ * The message is only logged, not sent, if the connection has no session
+ * attached.
+ *
+ * @param m          message to send
+ * @param connection connection whose priv is expected to be the Session
+ */
+void MDSRank::send_message_client_counted(const ref_t<Message>& m, const ConnectionRef& connection)
+{
+  // do not carry ref
+  auto session = static_cast<Session *>(connection->get_priv().get());
+  if (!session) {
+    // another Connection took over the Session
+    dout(10) << "send_message_client_counted has no session for " << m->get_source_inst() << dendl;
+    return;
+  }
+
+  send_message_client_counted(m, session);
+}
+
+/**
+ * Send a message to a client session and bump the session's push sequence.
+ *
+ * If the session has no connection yet, the message is appended to its
+ * preopen_out_queue instead of being sent.
+ *
+ * @param m       message to send
+ * @param session destination session
+ */
+void MDSRank::send_message_client_counted(const ref_t<Message>& m, Session* session)
+{
+  const version_t seq = session->inc_push_seq();
+  dout(10) << "send_message_client_counted " << session->info.inst.name << " seq "
+           << seq << " " << *m << dendl;
+
+  if (!session->get_connection()) {
+    // no connection yet: queue the message on the session
+    session->preopen_out_queue.push_back(m);
+    return;
+  }
+
+  session->get_connection()->send_message2(m);
+}
+
+/**
+ * Send a message to a client session without touching its push sequence.
+ *
+ * If the session has no connection yet, the message is appended to its
+ * preopen_out_queue instead of being sent.
+ *
+ * @param m       message to send
+ * @param session destination session
+ */
+void MDSRank::send_message_client(const ref_t<Message>& m, Session* session)
+{
+  dout(10) << "send_message_client " << session->info.inst << " " << *m << dendl;
+
+  if (!session->get_connection()) {
+    // no connection yet: queue the message on the session
+    session->preopen_out_queue.push_back(m);
+    return;
+  }
+
+  session->get_connection()->send_message2(m);
+}
+
+/**
+ * Store the OSD epoch barrier.
+ *
+ * This is used whenever a RADOS operation has been cancelled or a RADOS
+ * client has been blocklisted, to cause the MDS and any clients to wait
+ * for this OSD epoch before using any new caps.
+ *
+ * See doc/cephfs/eviction
+ *
+ * @param e OSD map epoch to record as the barrier
+ */
+void MDSRank::set_osd_epoch_barrier(epoch_t e)
+{
+  dout(4) << __func__ << ": epoch=" << e << dendl;
+
+  osd_epoch_barrier = e;
+}
+
+/**
+ * Dispatch a message again through _dispatch().
+ *
+ * @param m message to dispatch
+ */
+void MDSRank::retry_dispatch(const cref_t<Message> &m)
+{
+  // the dispatch depth is raised for the duration of the call and lowered
+  // again afterwards
+  inc_dispatch_depth();
+  _dispatch(m, false);
+  dec_dispatch_depth();
+}
+
+/**
+ * Ask the messenger for the maximum age of its dispatch queue.
+ *
+ * @param now time the age is measured against
+ * @return whatever Messenger::get_dispatch_queue_max_age() reports
+ */
+double MDSRank::get_dispatch_queue_max_age(utime_t now) const
+{
+  const double max_age = messenger->get_dispatch_queue_max_age(now);
+  return max_age;
+}
+
+/**
+ * @return the current value of the `stopping` flag
+ */
+bool MDSRank::is_daemon_stopping() const
+{
+  const bool daemon_stopping = stopping;
+  return daemon_stopping;
+}
+
+/**
+ * Ask for a transition to a new daemon state.
+ *
+ * The wanted state is recorded in the beacon and a beacon is sent right
+ * away.
+ *
+ * @param s state we want to move to
+ */
+void MDSRank::request_state(MDSMap::DaemonState s)
+{
+  dout(3) << "request_state " << ceph_mds_state_name(s) << dendl;
+
+  beacon.set_want_state(*mdsmap, s);
+  beacon.send();
+}
+
+
+/**
+ * Context that continues the boot sequence: on completion it calls
+ * MDSRank::boot_start() with the stored step and the completion code.
+ */
+class C_MDS_BootStart : public MDSInternalContext {
+public:
+  C_MDS_BootStart(MDSRank *m, MDSRank::BootStep n)
+    : MDSInternalContext(m),
+      nextstep(n)
+  {}
+
+  void finish(int r) override {
+    mds->boot_start(nextstep, r);
+  }
+
+private:
+  MDSRank::BootStep nextstep;  // step to run once this context completes
+};
+
+
+/**
+ * Run one step of the boot sequence used while starting or replaying.
+ *
+ * Steps that load things asynchronously collect their sub-operations in a
+ * gather whose finisher is a C_MDS_BootStart for the following step, so
+ * this function is re-entered with that step and the gather's result.
+ *
+ * @param step boot step to perform
+ * @param r    result of the previous step; negative values are errors
+ */
+void MDSRank::boot_start(BootStep step, int r)
+{
+  // First deal with a failure reported by the previous step
+  if (r < 0) {
+    if (is_standby_replay() && (r == -CEPHFS_EAGAIN)) {
+      dout(0) << "boot_start encountered an error CEPHFS_EAGAIN"
+              << ", respawning since we fell behind journal" << dendl;
+      respawn();
+    } else if (r == -CEPHFS_EINVAL || r == -CEPHFS_ENOENT) {
+      // Invalid or absent data, indicates damaged on-disk structures
+      clog->error() << "Error loading MDS rank " << whoami << ": "
+                    << cpp_strerror(r);
+      damaged();
+      ceph_assert(r == 0);  // Unreachable, damaged() calls respawn()
+    } else if (r == -CEPHFS_EROFS) {
+      // not fatal: carry on with the requested step
+      dout(0) << "boot error forcing transition to read-only; MDS will try to continue" << dendl;
+    } else {
+      // Completely unexpected error, give up and die
+      dout(0) << "boot_start encountered an error, failing" << dendl;
+      suicide();
+      return;
+    }
+  }
+
+  ceph_assert(is_starting() || is_any_replay());
+
+  switch (step) {
+  case MDS_BOOT_INITIAL:
+    {
+      mdcache->init_layouts();
+
+      MDSGatherBuilder loads(g_ceph_context,
+                             new C_MDS_BootStart(this, MDS_BOOT_OPEN_ROOT));
+
+      dout(2) << "Booting: " << step << ": opening inotable" << dendl;
+      inotable->set_rank(whoami);
+      inotable->load(loads.new_sub());
+
+      dout(2) << "Booting: " << step << ": opening sessionmap" << dendl;
+      sessionmap.set_rank(whoami);
+      sessionmap.load(loads.new_sub());
+
+      dout(2) << "Booting: " << step << ": opening mds log" << dendl;
+      mdlog->open(loads.new_sub());
+
+      if (is_starting()) {
+        // when starting, the purge queue is part of the gather
+        dout(2) << "Booting: " << step << ": opening purge queue" << dendl;
+        purge_queue.open(new C_IO_Wrapper(this, loads.new_sub()));
+      } else if (!standby_replaying) {
+        // otherwise these are opened without a completion, unless we are
+        // standby-replaying, in which case they are not opened here at all
+        dout(2) << "Booting: " << step << ": opening purge queue (async)" << dendl;
+        purge_queue.open(nullptr);
+        dout(2) << "Booting: " << step << ": loading open file table (async)" << dendl;
+        mdcache->open_file_table.load(nullptr);
+      }
+
+      if (mdsmap->get_tableserver() == whoami) {
+        dout(2) << "Booting: " << step << ": opening snap table" << dendl;
+        snapserver->set_rank(whoami);
+        snapserver->load(loads.new_sub());
+      }
+
+      loads.activate();
+    }
+    break;
+
+  case MDS_BOOT_OPEN_ROOT:
+    {
+      dout(2) << "Booting: " << step << ": loading/discovering base inodes" << dendl;
+
+      MDSGatherBuilder opens(g_ceph_context,
+                             new C_MDS_BootStart(this, MDS_BOOT_PREPARE_LOG));
+
+      if (is_starting()) {
+        // load mydir frag for the first log segment (creating subtree map)
+        mdcache->open_mydir_frag(opens.new_sub());
+      } else {
+        mdcache->open_mydir_inode(opens.new_sub());
+      }
+
+      mdcache->create_global_snaprealm();
+
+      if (whoami == mdsmap->get_root()) {
+        // load root inode off disk if we are auth
+        mdcache->open_root_inode(opens.new_sub());
+      } else if (is_any_replay()) {
+        // replay.  make up fake root inode to start with
+        mdcache->create_root_inode();
+      }
+
+      opens.activate();
+    }
+    break;
+
+  case MDS_BOOT_PREPARE_LOG:
+    {
+      if (is_any_replay()) {
+        dout(2) << "Booting: " << step << ": replaying mds log" << dendl;
+        MDSGatherBuilder replays(g_ceph_context,
+                                 new C_MDS_BootStart(this, MDS_BOOT_REPLAY_DONE));
+
+        if (!standby_replaying) {
+          dout(2) << "Booting: " << step << ": waiting for purge queue recovered" << dendl;
+          purge_queue.wait_for_recovery(new C_IO_Wrapper(this, replays.new_sub()));
+        }
+
+        mdlog->replay(replays.new_sub());
+        replays.activate();
+      } else {
+        dout(2) << "Booting: " << step << ": positioning at end of old mds log" << dendl;
+        mdlog->append();
+        starting_done();
+      }
+    }
+    break;
+
+  case MDS_BOOT_REPLAY_DONE:
+    {
+      ceph_assert(is_any_replay());
+
+      // Sessiontable and inotable should be in sync after replay, validate
+      // that they are consistent.
+      validate_sessions();
+
+      replay_done();
+    }
+    break;
+  }
+}
+
+/**
+ * Check every loaded session against the inotable.
+ *
+ * Identifies sessions whose state is inconsistent with other state after
+ * they have been loaded from rados during startup.
+ * Mitigates bugs like: http://tracker.ceph.com/issues/16842
+ *
+ * Each session's preallocated inodes must equal its free preallocated
+ * inodes (asserted), and must not intersect the inotable's free set.  Every
+ * intersection is reported to the cluster log; if any was found, the rank
+ * is marked damaged.
+ *
+ * The caller must hold mds_lock.
+ */
+void MDSRank::validate_sessions()
+{
+  ceph_assert(ceph_mutex_is_locked_by_me(mds_lock));
+  bool valid = true;
+
+  for (const auto &i : sessionmap.get_sessions()) {
+    Session *session = i.second;
+    ceph_assert(session->info.prealloc_inos == session->free_prealloc_inos);
+
+    // filled by intersects_free(); not reported in the message below
+    interval_set<inodeno_t> badones;
+    if (inotable->intersects_free(session->info.prealloc_inos, &badones)) {
+      // leading space keeps the session description and "loaded" apart
+      clog->error() << "client " << *session
+		    << " loaded with preallocated inodes that are inconsistent with inotable";
+      valid = false;
+    }
+  }
+
+  if (!valid) {
+    damaged();
+    // as in boot_start(), the assert documents that damaged() is not
+    // expected to return
+    ceph_assert(valid);
+  }
+}
+
+/**
+ * Finish the "starting" state: request the active state, start a new log
+ * segment and sync the snaptable cache.
+ */
+void MDSRank::starting_done()
+{
+  dout(3) << "starting_done" << dendl;
+  ceph_assert(is_starting());
+
+  request_state(MDSMap::STATE_ACTIVE);
+  mdlog->start_new_segment();
+
+  // sync snaptable cache; nobody waits for the result
+  snapclient->sync(new C_MDSInternalNoop);
+}
+
+
+/**
+ * Compute the recovery set from the mdsmap, without ourselves, and hand it
+ * to the cache.
+ */
+void MDSRank::calc_recovery_set()
+{
+  // initialize gather sets
+  set<mds_rank_t> recovering;
+  mdsmap->get_recovery_mds_set(recovering);
+  recovering.erase(whoami);  // we never wait for ourselves
+
+  mdcache->set_recovery_set(recovering);
+
+  dout(1) << " recovery set is " << recovering << dendl;
+}
+
+/**
+ * Enter a replay state.
+ *
+ * Notes whether we replay as standby-replay, then starts the boot sequence
+ * as soon as the OSD map has reached the mdsmap's last failure OSD epoch.
+ * If it has not, the boot sequence is started from a context that fires
+ * when that map arrives.
+ */
+void MDSRank::replay_start()
+{
+  dout(1) << "replay_start" << dendl;
+
+  if (is_standby_replay()) {
+    standby_replaying = true;
+    // "mds_standby_replay_damaged" forces a damaged() call here
+    if (unlikely(g_conf().get_val<bool>("mds_standby_replay_damaged"))) {
+      damaged();
+    }
+  }
+
+  // Check if we need to wait for a newer OSD map before starting
+  const bool have_new_enough_osdmap = objecter->with_osdmap(
+    [this](const OSDMap& osdmap) {
+      return osdmap.get_epoch() >= mdsmap->get_last_failure_osd_epoch();
+    });
+
+  if (have_new_enough_osdmap) {
+    boot_start();
+    return;
+  }
+
+  dout(1) << " waiting for osdmap " << mdsmap->get_last_failure_osd_epoch()
+          << " (which blocklists prior instance)" << dendl;
+  Context *on_map = new C_IO_Wrapper(this, new C_MDS_BootStart(this, MDS_BOOT_INITIAL));
+  objecter->wait_for_map(
+    mdsmap->get_last_failure_osd_epoch(),
+    lambdafy(on_map));
+}
+
+
+/**
+ * IO context that calls MDSRank::_standby_replay_restart_finish() with the
+ * completion code and the journal read position stored at construction.
+ */
+class MDSRank::C_MDS_StandbyReplayRestartFinish : public MDSIOContext {
+public:
+  C_MDS_StandbyReplayRestartFinish(MDSRank *mds_, uint64_t old_read_pos_)
+    : MDSIOContext(mds_),
+      old_read_pos(old_read_pos_)
+  {}
+
+  void finish(int r) override {
+    mds->_standby_replay_restart_finish(r, old_read_pos);
+  }
+
+  void print(ostream& out) const override {
+    out << "standby_replay_restart";
+  }
+
+private:
+  uint64_t old_read_pos;  // read position passed in when the context was created
+};
+
+/**
+ * Continue a standby-replay pass after the journal head has been re-read.
+ *
+ * @param r            result of re-reading the journal head; passed on to
+ *                     boot_start()
+ * @param old_read_pos journal read position from before the re-read
+ */
+void MDSRank::_standby_replay_restart_finish(int r, uint64_t old_read_pos)
+{
+  const bool fell_behind =
+    old_read_pos < mdlog->get_journaler()->get_trimmed_pos();
+
+  if (fell_behind) {
+    dout(0) << "standby MDS fell behind active MDS journal's expire_pos, restarting" << dendl;
+    /* we're too far back, and this is easier than
+       trying to reset everything in the cache, etc */
+    respawn();
+  } else {
+    mdlog->standby_trim_segments();
+    boot_start(MDS_BOOT_PREPARE_LOG, r);
+  }
+}
+
+/**
+ * Context that starts another standby-replay pass.  It asserts that it
+ * completed with a zero result.
+ */
+class MDSRank::C_MDS_StandbyReplayRestart : public MDSInternalContext {
+public:
+  explicit C_MDS_StandbyReplayRestart(MDSRank *m)
+    : MDSInternalContext(m)
+  {}
+
+  void finish(int r) override {
+    ceph_assert(!r);
+    mds->standby_replay_restart();
+  }
+};
+
+/**
+ * Start another replay pass.
+ *
+ * While still standby-replaying this simply re-reads the journal head.
+ * Otherwise this is the final takeover pass: it waits for the OSD map to
+ * reach the mdsmap's last failure OSD epoch, then re-reads the journal
+ * head and opens the purge queue and the open file table.
+ */
+void MDSRank::standby_replay_restart()
+{
+  if (standby_replaying) {
+    /* Go around for another pass of replaying in standby */
+    dout(5) << "Restarting replay as standby-replay" << dendl;
+    mdlog->get_journaler()->reread_head_and_probe(
+      new C_MDS_StandbyReplayRestartFinish(
+        this,
+        mdlog->get_journaler()->get_read_pos()));
+    return;
+  }
+
+  /* We are transitioning out of standby: wait for OSD map update
+     before making final pass */
+  dout(1) << "standby_replay_restart (final takeover pass)" << dendl;
+  const bool have_new_enough_osdmap = objecter->with_osdmap(
+    [this](const OSDMap& osdmap) {
+      return osdmap.get_epoch() >= mdsmap->get_last_failure_osd_epoch();
+    });
+
+  if (!have_new_enough_osdmap) {
+    // come back here once the required map has arrived
+    auto on_map = new C_IO_Wrapper(this, new C_MDS_StandbyReplayRestart(this));
+    dout(1) << " waiting for osdmap " << mdsmap->get_last_failure_osd_epoch()
+            << " (which blocklists prior instance)" << dendl;
+    objecter->wait_for_map(mdsmap->get_last_failure_osd_epoch(),
+                           lambdafy(on_map));
+    return;
+  }
+
+  mdlog->get_journaler()->reread_head_and_probe(
+    new C_MDS_StandbyReplayRestartFinish(
+      this,
+      mdlog->get_journaler()->get_read_pos()));
+
+  dout(1) << " opening purge_queue (async)" << dendl;
+  purge_queue.open(nullptr);
+  dout(1) << " opening open_file_table (async)" << dendl;
+  mdcache->open_file_table.load(nullptr);
+}
+
+/**
+ * Called when a journal replay pass has finished.
+ *
+ * - Still in standby-replay: schedule the next pass on the timer.
+ * - Just left standby-replay: make one final pass.
+ * - Otherwise: reformat the journal if its stream format is older than
+ *   configured (and come back here), else make the journal writeable,
+ *   apply the optional wipe/skip settings and request the next state
+ *   (reconnect when we are the only rank and none has failed, resolve
+ *   otherwise).
+ */
+void MDSRank::replay_done()
+{
+  if (standby_replaying) {
+    dout(5) << "Finished replaying journal as standby-replay" << dendl;
+  } else {
+    dout(1) << "Finished replaying journal" << dendl;
+  }
+
+  if (is_standby_replay()) {
+    // The replay was done in standby state, and we are still in that state
+    ceph_assert(standby_replaying);
+    dout(10) << "setting replay timer" << dendl;
+    timer.add_event_after(g_conf()->mds_replay_interval,
+                          new C_MDS_StandbyReplayRestart(this));
+    return;
+  }
+
+  if (standby_replaying) {
+    // The replay was done in standby state, we have now _left_ that state
+    dout(10) << " last replay pass was as a standby; making final pass" << dendl;
+    standby_replaying = false;
+    standby_replay_restart();
+    return;
+  }
+
+  // Replay is complete, journal read should be up to date
+  ceph_assert(mdlog->get_journaler()->get_read_pos() == mdlog->get_journaler()->get_write_pos());
+  ceph_assert(!is_standby_replay());
+
+  // Reformat and come back here
+  if (mdlog->get_journaler()->get_stream_format() < g_conf()->mds_journal_format) {
+    dout(4) << "reformatting journal on standby-replay->replay transition" << dendl;
+    mdlog->reopen(new C_MDS_BootStart(this, MDS_BOOT_REPLAY_DONE));
+    return;
+  }
+
+  dout(1) << "making mds journal writeable" << dendl;
+  mdlog->get_journaler()->set_writeable();
+  mdlog->get_journaler()->trim_tail();
+
+  if (mdsmap->get_tableserver() == whoami &&
+      snapserver->upgrade_format()) {
+    dout(1) << "upgrading snaptable format" << dendl;
+    snapserver->save(new C_MDSInternalNoop);
+  }
+
+  // optional maintenance actions, each driven by its own config option
+  if (g_conf()->mds_wipe_sessions) {
+    dout(1) << "wiping out client sessions" << dendl;
+    sessionmap.wipe();
+    sessionmap.save(new C_MDSInternalNoop);
+  }
+  if (g_conf()->mds_wipe_ino_prealloc) {
+    dout(1) << "wiping out ino prealloc from sessions" << dendl;
+    sessionmap.wipe_ino_prealloc();
+    sessionmap.save(new C_MDSInternalNoop);
+  }
+  if (g_conf()->mds_skip_ino) {
+    inodeno_t skip = g_conf()->mds_skip_ino;
+    dout(1) << "skipping " << skip << " inodes" << dendl;
+    inotable->skip_inos(skip);
+    inotable->save(new C_MDSInternalNoop);
+  }
+
+  const bool alone = mdsmap->get_num_in_mds() == 1 &&
+                     mdsmap->get_num_failed_mds() == 0;  // just me!
+  if (alone) {
+    dout(2) << "i am alone, moving to state reconnect" << dendl;
+    request_state(MDSMap::STATE_RECONNECT);
+    // sync snaptable cache
+    snapclient->sync(new C_MDSInternalNoop);
+  } else {
+    dout(2) << "i am not alone, moving to state resolve" << dendl;
+    request_state(MDSMap::STATE_RESOLVE);
+  }
+}
+
+/**
+ * Log the call and roll back uncommitted fragment operations in the cache.
+ */
+void MDSRank::reopen_log()
+{
+  dout(1) << "reopen_log" << dendl;
+
+  mdcache->rollback_uncommitted_fragments();
+}
+
+/**
+ * Enter the resolve state: reopen the log, compute the recovery set, start
+ * the cache's resolve (which ends in resolve_done()) and wake everything
+ * waiting for resolve.
+ */
+void MDSRank::resolve_start()
+{
+  dout(1) << "resolve_start" << dendl;
+
+  reopen_log();
+  calc_recovery_set();
+
+  auto *on_resolved = new C_MDS_VoidFn(this, &MDSRank::resolve_done);
+  mdcache->resolve_start(on_resolved);
+
+  finish_contexts(g_ceph_context, waiting_for_resolve);
+}
+
+/**
+ * Resolve has finished: request the reconnect state and sync the snaptable
+ * cache.
+ */
+void MDSRank::resolve_done()
+{
+  dout(1) << "resolve_done" << dendl;
+
+  request_state(MDSMap::STATE_RECONNECT);
+
+  // sync snaptable cache; nobody waits for the result
+  snapclient->sync(new C_MDSInternalNoop);
+}
+
+/**
+ * Have the server apply the blocklist to client sessions.
+ *
+ * If that killed any session, the OSD epoch barrier is set to @p epoch.
+ *
+ * @param addrs blocklist entries; only their count is used here, for the
+ *              log message
+ * @param epoch OSD map epoch the blocklist was taken from
+ */
+void MDSRank::apply_blocklist(const std::set<entity_addr_t> &addrs, epoch_t epoch) {
+  auto victims = server->apply_blocklist();
+
+  dout(4) << __func__ << ": killed " << victims << ", blocklisted sessions ("
+          << addrs.size() << " blocklist entries, "
+          << sessionmap.get_sessions().size() << ")" << dendl;
+
+  if (!victims) {
+    return;
+  }
+
+  set_osd_epoch_barrier(epoch);
+}
+
+
+/**
+ * Enter the reconnect state.
+ *
+ * Reopens the log when coming straight from replay, applies the current
+ * OSD blocklist, asks the server to reconnect clients (which ends in
+ * reconnect_done()) and wakes everything waiting for reconnect.
+ */
+void MDSRank::reconnect_start()
+{
+  dout(1) << "reconnect_start" << dendl;
+
+  if (last_state == MDSMap::STATE_REPLAY) {
+    reopen_log();
+  }
+
+  // Drop any blocklisted clients from the SessionMap before going
+  // into reconnect, so that we don't wait for them.
+  objecter->enable_blocklist_events();
+
+  epoch_t osd_epoch = 0;
+  std::set<entity_addr_t> blocklisted;
+  std::set<entity_addr_t> ranges;  // fetched, but not used any further here
+  objecter->with_osdmap([&](const OSDMap& osdmap) {
+    osdmap.get_blocklist(&blocklisted, &ranges);
+    osd_epoch = osdmap.get_epoch();
+  });
+
+  apply_blocklist(blocklisted, osd_epoch);
+
+  auto *on_reconnected = new C_MDS_VoidFn(this, &MDSRank::reconnect_done);
+  server->reconnect_clients(on_reconnected);
+
+  finish_contexts(g_ceph_context, waiting_for_reconnect);
+}
+/**
+ * Reconnect has finished: request the rejoin state.
+ */
+void MDSRank::reconnect_done()
+{
+  dout(1) << "reconnect_done" << dendl;
+
+  // move to rejoin state
+  request_state(MDSMap::STATE_REJOIN);
+}
+
+/**
+ * Log the call and have the cache send its rejoin messages.
+ */
+void MDSRank::rejoin_joint_start()
+{
+  dout(1) << "rejoin_joint_start" << dendl;
+
+  mdcache->rejoin_send_rejoins();
+}
+/**
+ * Enter the rejoin state: start the cache's rejoin (which ends in
+ * rejoin_done()) and wake everything waiting for rejoin.
+ */
+void MDSRank::rejoin_start()
+{
+  dout(1) << "rejoin_start" << dendl;
+
+  auto *on_rejoined = new C_MDS_VoidFn(this, &MDSRank::rejoin_done);
+  mdcache->rejoin_start(on_rejoined);
+
+  finish_contexts(g_ceph_context, waiting_for_rejoin);
+}
+/**
+ * Rejoin has finished; decide which state to request next.
+ *
+ * Re-queues itself while the cache still has uncommitted fragments.  With
+ * no subtrees at all, rank 0 is marked damaged and any other rank asks to
+ * be stopped.  Otherwise the clientreplay state is requested if there are
+ * requests to replay or sessions pending reclaim, and the active state if
+ * there are none.
+ */
+void MDSRank::rejoin_done()
+{
+  dout(1) << "rejoin_done" << dendl;
+  mdcache->show_subtrees();
+  mdcache->show_cache();
+
+  if (mdcache->is_any_uncommitted_fragment()) {
+    dout(1) << " waiting for uncommitted fragments" << dendl;
+    mdcache->wait_for_uncommitted_fragments(new C_MDS_VoidFn(this, &MDSRank::rejoin_done));
+    return;
+  }
+
+  // funny case: is our cache empty?  no subtrees?
+  if (!mdcache->is_subtrees()) {
+    if (whoami != 0) {
+      dout(1) << " empty cache, no subtrees, leaving cluster" << dendl;
+      request_state(MDSMap::STATE_STOPPED);
+      return;
+    }
+
+    // The root should always have a subtree!
+    clog->error() << "No subtrees found for root MDS rank!";
+    damaged();
+    ceph_assert(mdcache->is_subtrees());
+    return;
+  }
+
+  if (!replay_queue.empty() || server->get_num_pending_reclaim()) {
+    // remember whether there is anything left to replay
+    replaying_requests_done = replay_queue.empty();
+    request_state(MDSMap::STATE_CLIENTREPLAY);
+    return;
+  }
+
+  request_state(MDSMap::STATE_ACTIVE);
+}
+
+/**
+ * Enter the clientreplay state: wake everything waiting for replay and
+ * queue the first request to be replayed.
+ */
+void MDSRank::clientreplay_start()
+{
+  dout(1) << "clientreplay_start" << dendl;
+
+  // kick waiters
+  finish_contexts(g_ceph_context, waiting_for_replay);
+
+  queue_one_replay();
+}
+
+/**
+ * Queue the next request waiting to be replayed, if there is one.
+ *
+ * When the replay queue is empty, the log is flushed the first time this is
+ * noticed, and maybe_clientreplay_done() is called.
+ *
+ * @return true if a request was queued, false if the queue was empty
+ */
+bool MDSRank::queue_one_replay()
+{
+  if (replay_queue.empty()) {
+    if (!replaying_requests_done) {
+      // first time we find the queue empty
+      replaying_requests_done = true;
+      mdlog->flush();
+    }
+    maybe_clientreplay_done();
+    return false;
+  }
+
+  queue_waiter(replay_queue.front());
+  replay_queue.pop_front();
+  return true;
+}
+
+/**
+ * Finish clientreplay if nothing is left to do.
+ *
+ * Only acts while we are in clientreplay and still want that state.  Once
+ * all requests have been replayed and no session is pending reclaim,
+ * clientreplay_done() is scheduled for when the log is safe; otherwise the
+ * outstanding work is logged.
+ */
+void MDSRank::maybe_clientreplay_done()
+{
+  if (!is_clientreplay() || get_want_state() != MDSMap::STATE_CLIENTREPLAY) {
+    return;
+  }
+
+  // don't go to active if there are session waiting for being reclaimed
+  if (replaying_requests_done && !server->get_num_pending_reclaim()) {
+    mdlog->wait_for_safe(new C_MDS_VoidFn(this, &MDSRank::clientreplay_done));
+    return;
+  }
+
+  dout(1) << " still have " << replay_queue.size() + static_cast<int>(!replaying_requests_done)
+          << " requests need to be replayed, " << server->get_num_pending_reclaim()
+          << " sessions need to be reclaimed" << dendl;
+}
+
+/**
+ * Clientreplay has finished: request the active state.
+ */
+void MDSRank::clientreplay_done()
+{
+  dout(1) << "clientreplay_done" << dendl;
+
+  request_state(MDSMap::STATE_ACTIVE);
+}
+
+/**
+ * Enter the active state.
+ *
+ * Opens the root when coming from creating or starting, sets up the
+ * metrics handler (and, on rank 0, the metric aggregator), cleans up cache
+ * state, reissues caps and wakes everything waiting for replay and for
+ * active.
+ */
+void MDSRank::active_start()
+{
+  dout(1) << "active_start" << dendl;
+
+  const bool fresh_boot = last_state == MDSMap::STATE_CREATING ||
+                          last_state == MDSMap::STATE_STARTING;
+  if (fresh_boot) {
+    mdcache->open_root();
+  }
+
+  dout(10) << __func__ << ": initializing metrics handler" << dendl;
+  metrics_handler.init();
+  messenger->add_dispatcher_tail(&metrics_handler);
+
+  // metric aggregation is solely done by rank 0
+  if (is_rank0()) {
+    dout(10) << __func__ << ": initializing metric aggregator" << dendl;
+    ceph_assert(metric_aggregator == nullptr);
+    metric_aggregator = std::make_unique<MetricAggregator>(cct, this, mgrc);
+    metric_aggregator->init();
+    messenger->add_dispatcher_tail(metric_aggregator.get());
+  }
+
+  mdcache->clean_open_file_lists();
+  mdcache->export_remaining_imported_caps();
+
+  // kick waiters: first those waiting for replay ...
+  finish_contexts(g_ceph_context, waiting_for_replay);
+
+  mdcache->reissue_all_caps();
+
+  // ... then those waiting for active
+  finish_contexts(g_ceph_context, waiting_for_active);
+}
+
+/**
+ * Called once we have reached clientreplay or active after recovering.
+ *
+ * Nothing is done when the previous state was "creating".  Otherwise the
+ * cache starts its recovered truncates, inode purges and file recoveries,
+ * and populates mydir.
+ *
+ * @param oldstate state we were in before the transition
+ */
+void MDSRank::recovery_done(int oldstate)
+{
+  dout(1) << "recovery_done -- successful recovery!" << dendl;
+  ceph_assert(is_clientreplay() || is_active());
+
+  if (oldstate != MDSMap::STATE_CREATING) {
+    mdcache->start_recovered_truncates();
+    mdcache->start_purge_inodes();
+    mdcache->start_files_to_recover();
+
+    mdcache->populate_mydir();
+  }
+}
+
+/**
+ * Creation has finished: request the active state and sync the snaptable
+ * cache.
+ */
+void MDSRank::creating_done()
+{
+  dout(1) << "creating_done" << dendl;
+
+  request_state(MDSMap::STATE_ACTIVE);
+
+  // sync snaptable cache; nobody waits for the result
+  snapclient->sync(new C_MDSInternalNoop);
+}
+
+/**
+ * Create the on-disk state of a brand new rank.
+ *
+ * Everything that is written is collected in one gather; creating_done()
+ * runs when the gather completes.
+ */
+void MDSRank::boot_create()
+{
+  dout(3) << "boot_create" << dendl;
+
+  MDSGatherBuilder creates(g_ceph_context,
+                           new C_MDS_VoidFn(this, &MDSRank::creating_done));
+
+  mdcache->init_layouts();
+
+  inotable->set_rank(whoami);
+  sessionmap.set_rank(whoami);
+
+  // start with a fresh journal
+  dout(10) << "boot_create creating fresh journal" << dendl;
+  mdlog->create(creates.new_sub());
+
+  // open new journal segment, but do not journal subtree map (yet)
+  mdlog->prepare_new_segment();
+
+  // only the rank holding the root creates the root hierarchy
+  if (whoami == mdsmap->get_root()) {
+    dout(3) << "boot_create creating fresh hierarchy" << dendl;
+    mdcache->create_empty_hierarchy(creates.get());
+  }
+
+  dout(3) << "boot_create creating mydir hierarchy" << dendl;
+  mdcache->create_mydir_hierarchy(creates.get());
+
+  dout(3) << "boot_create creating global snaprealm" << dendl;
+  mdcache->create_global_snaprealm();
+
+  // fixme: fake out inotable (reset, pretend loaded)
+  dout(10) << "boot_create creating fresh inotable table" << dendl;
+  inotable->reset();
+  inotable->save(creates.new_sub());
+
+  // write empty sessionmap
+  sessionmap.save(creates.new_sub());
+
+  // Create empty purge queue
+  purge_queue.create(new C_IO_Wrapper(this, creates.new_sub()));
+
+  // initialize tables
+  if (mdsmap->get_tableserver() == whoami) {
+    dout(10) << "boot_create creating fresh snaptable" << dendl;
+    snapserver->set_rank(whoami);
+    snapserver->reset();
+    snapserver->save(creates.new_sub());
+  }
+
+  // asserts (aborting here) when mds_kill_create_at is set to 1
+  ceph_assert(g_conf()->mds_kill_create_at != 1);
+
+  // ok now journal it
+  mdlog->journal_segment_subtree_map(creates.new_sub());
+  mdlog->flush();
+
+  // Usually we do this during reconnect, but creation skips that.
+  objecter->enable_blocklist_events();
+
+  creates.activate();
+}
+
+/**
+ * Enter the stopping state.
+ *
+ * If we are the only rank in the mdsmap and still have sessions, every
+ * client session is evicted first.  Then the cache starts shutting down.
+ */
+void MDSRank::stopping_start()
+{
+  dout(2) << "Stopping..." << dendl;
+
+  if (mdsmap->get_num_in_mds() == 1 && !sessionmap.empty()) {
+    // Collect the client sessions first and evict them in a second pass
+    // (presumably so the session map is not changed while it is iterated).
+    std::vector<Session*> victims;
+    for (const auto& entry : sessionmap.get_sessions()) {
+      if (entry.first.is_client()) {
+        victims.push_back(entry.second);
+      }
+    }
+
+    dout(20) << __func__ << " matched " << victims.size() << " sessions" << dendl;
+    ceph_assert(!victims.empty());
+
+    C_GatherBuilder evictions(g_ceph_context, new C_MDSInternalNoop);
+    for (Session *victim : victims) {
+      CachedStackStringStream css;  // passed to evict_client(); not read here
+      evict_client(victim->get_client().v, false,
+                   g_conf()->mds_session_blocklist_on_evict, *css,
+                   evictions.new_sub());
+    }
+    evictions.activate();
+  }
+
+  mdcache->shutdown_start();
+}
+
+/**
+ * Stopping has finished: request the stopped state.
+ */
+void MDSRank::stopping_done()
+{
+  dout(2) << "Finished stopping..." << dendl;
+
+  // tell monitor we shut down cleanly.
+  request_state(MDSMap::STATE_STOPPED);
+}
+
+void MDSRankDispatcher::handle_mds_map(
+    const cref_t<MMDSMap> &m,
+    const MDSMap &oldmap)
+{
+  // I am only to be passed MDSMaps in which I hold a rank
+  ceph_assert(whoami != MDS_RANK_NONE);
+
+  mds_gid_t mds_gid = mds_gid_t(monc->get_global_id());
+  MDSMap::DaemonState oldstate = oldmap.get_state_gid(mds_gid);
+  if (oldstate == MDSMap::STATE_NULL) {
+    // monitor may skip sending me the STANDBY map (e.g. if paxos_propose_interval is high)
+    // Assuming I have passed STANDBY state if I got a rank in the first map.
+    oldstate = MDSMap::STATE_STANDBY;
+  }
+  // I should not miss map update
+  ceph_assert(state == oldstate);
+  state = mdsmap->get_state_gid(mds_gid);
+  if (state != oldstate) {
+    last_state = oldstate;
+    incarnation = mdsmap->get_inc_gid(mds_gid);
+  }
+
+  version_t epoch = m->get_epoch();
+
+  // note source's map version
+  if (m->get_source().is_mds() &&
+      peer_mdsmap_epoch[mds_rank_t(m->get_source().num())] < epoch) {
+    dout(15) << " peer " << m->get_source()
+	     << " has mdsmap epoch >= " << epoch
+	     << dendl;
+    peer_mdsmap_epoch[mds_rank_t(m->get_source().num())] = epoch;
+  }
+
+  // Validate state transitions while I hold a rank
+  if (!MDSMap::state_transition_valid(oldstate, state)) {
+    derr << "Invalid state transition " << ceph_mds_state_name(oldstate)
+      << "->" << ceph_mds_state_name(state) << dendl;
+    respawn();
+  }
+
+  if (oldstate != state) {
+    // update messenger.
+    auto sleep_rank_change = g_conf().get_val<double>("mds_sleep_rank_change");
+    if (unlikely(sleep_rank_change > 0)) {
+      // This is to trigger a race where another rank tries to connect to this
+      // MDS before an update to the messenger "myname" is processed. This race
+      // should be closed by ranks holding messages until the rank is out of a
+      // "bootstrapping" state.
+      usleep(sleep_rank_change);
+    } if (state == MDSMap::STATE_STANDBY_REPLAY) {
+      dout(1) << "handle_mds_map i am now mds." << mds_gid << "." << incarnation
+          << " replaying mds." << whoami << "." << incarnation << dendl;
+      messenger->set_myname(entity_name_t::MDS(mds_gid));
+    } else {
+      dout(1) << "handle_mds_map i am now mds." << whoami << "." << incarnation << dendl;
+      messenger->set_myname(entity_name_t::MDS(whoami));
+    }
+  }
+
+  // tell objecter my incarnation
+  if (objecter->get_client_incarnation() != incarnation)
+    objecter->set_client_incarnation(incarnation);
+
+  if (mdsmap->get_required_client_features() != oldmap.get_required_client_features())
+    server->update_required_client_features();
+
+  // for debug
+  if (g_conf()->mds_dump_cache_on_map)
+    mdcache->dump_cache();
+
+  cluster_degraded = mdsmap->is_degraded();
+
+  // mdsmap and oldmap can be discontinuous. failover might happen in the missing mdsmap.
+  // the 'restart' set tracks ranks that have restarted since the old mdsmap
+  set<mds_rank_t> restart;
+  // replaying mds does not communicate with other ranks
+  if (state >= MDSMap::STATE_RESOLVE) {
+    // did someone fail?
+    //   new down?
+    set<mds_rank_t> olddown, down;
+    oldmap.get_down_mds_set(&olddown);
+    mdsmap->get_down_mds_set(&down);
+    for (const auto& r : down) {
+      if (oldmap.have_inst(r) && olddown.count(r) == 0) {
+	messenger->mark_down_addrs(oldmap.get_addrs(r));
+	handle_mds_failure(r);
+      }
+    }
+
+    // did someone fail?
+    //   did their addr/inst change?
+    set<mds_rank_t> up;
+    mdsmap->get_up_mds_set(up);
+    for (const auto& r : up) {
+      auto& info = mdsmap->get_info(r);
+      if (oldmap.have_inst(r)) {
+	auto& oldinfo = oldmap.get_info(r);
+	if (info.inc != oldinfo.inc) {
+	  messenger->mark_down_addrs(oldinfo.get_addrs());
+	  if (info.state == MDSMap::STATE_REPLAY ||
+	      info.state == MDSMap::STATE_RESOLVE) {
+	    restart.insert(r);
+	    handle_mds_failure(r);
+	  } else {
+	    ceph_assert(info.state == MDSMap::STATE_STARTING ||
+		   info.state == MDSMap::STATE_ACTIVE);
+	    // -> stopped (missing) -> starting -> active
+	    restart.insert(r);
+	    mdcache->migrator->handle_mds_failure_or_stop(r);
+	    if (mdsmap->get_tableserver() == whoami)
+	      snapserver->handle_mds_failure_or_stop(r);
+	  }
+	}
+      } else {
+	if (info.state == MDSMap::STATE_REPLAY ||
+	    info.state == MDSMap::STATE_RESOLVE) {
+	  // -> starting/creating (missing) -> active (missing) -> replay -> resolve
+	  restart.insert(r);
+	  handle_mds_failure(r);
+	} else {
+	  ceph_assert(info.state == MDSMap::STATE_CREATING ||
+		 info.state == MDSMap::STATE_STARTING ||
+		 info.state == MDSMap::STATE_ACTIVE);
+	}
+      }
+    }
+  }
+
+  // did it change?
+  if (oldstate != state) {
+    dout(1) << "handle_mds_map state change "
+	    << ceph_mds_state_name(oldstate) << " --> "
+	    << ceph_mds_state_name(state) << dendl;
+    beacon.set_want_state(*mdsmap, state);
+
+    if (oldstate == MDSMap::STATE_STANDBY_REPLAY) {
+        dout(10) << "Monitor activated us! Deactivating replay loop" << dendl;
+        assert (state == MDSMap::STATE_REPLAY);
+    } else {
+      // did i just recover?
+      if ((is_active() || is_clientreplay()) &&
+          (oldstate == MDSMap::STATE_CREATING ||
+	   oldstate == MDSMap::STATE_REJOIN ||
+	   oldstate == MDSMap::STATE_RECONNECT))
+        recovery_done(oldstate);
+
+      if (is_active()) {
+        active_start();
+      } else if (is_any_replay()) {
+        replay_start();
+      } else if (is_resolve()) {
+        resolve_start();
+      } else if (is_reconnect()) {
+        reconnect_start();
+      } else if (is_rejoin()) {
+	rejoin_start();
+      } else if (is_clientreplay()) {
+        clientreplay_start();
+      } else if (is_creating()) {
+        boot_create();
+      } else if (is_starting()) {
+        boot_start();
+      } else if (is_stopping()) {
+        ceph_assert(oldstate == MDSMap::STATE_ACTIVE);
+        stopping_start();
+      }
+    }
+  }
+
+  // RESOLVE
+  // is someone else newly resolving?
+  if (state >= MDSMap::STATE_RESOLVE) {
+    // recover snaptable
+    if (mdsmap->get_tableserver() == whoami) {
+      if (oldstate < MDSMap::STATE_RESOLVE) {
+	set<mds_rank_t> s;
+	mdsmap->get_mds_set_lower_bound(s, MDSMap::STATE_RESOLVE);
+	snapserver->finish_recovery(s);
+      } else {
+	set<mds_rank_t> old_set, new_set;
+	oldmap.get_mds_set_lower_bound(old_set, MDSMap::STATE_RESOLVE);
+	mdsmap->get_mds_set_lower_bound(new_set, MDSMap::STATE_RESOLVE);
+	for (const auto& r : new_set) {
+	  if (r == whoami)
+	    continue; // not me
+	  if (!old_set.count(r) || restart.count(r)) {  // newly so?
+	    snapserver->handle_mds_recovery(r);
+	  }
+	}
+      }
+    }
+
+    if ((!oldmap.is_resolving() || !restart.empty()) && mdsmap->is_resolving()) {
+      set<mds_rank_t> resolve;
+      mdsmap->get_mds_set(resolve, MDSMap::STATE_RESOLVE);
+      dout(10) << " resolve set is " << resolve << dendl;
+      calc_recovery_set();
+      mdcache->send_resolves();
+    }
+  }
+
+  // REJOIN
+  // is everybody finally rejoining?
+  if (state >= MDSMap::STATE_REJOIN) {
+    // did we start?
+    if (!oldmap.is_rejoining() && mdsmap->is_rejoining())
+      rejoin_joint_start();
+
+    // did we finish?
+    if (g_conf()->mds_dump_cache_after_rejoin &&
+	oldmap.is_rejoining() && !mdsmap->is_rejoining())
+      mdcache->dump_cache();      // for DEBUG only
+
+    if (oldstate >= MDSMap::STATE_REJOIN ||
+	oldstate == MDSMap::STATE_STARTING) {
+      // ACTIVE|CLIENTREPLAY|REJOIN => we can discover from them.
+      set<mds_rank_t> olddis, dis;
+      oldmap.get_mds_set_lower_bound(olddis, MDSMap::STATE_REJOIN);
+      mdsmap->get_mds_set_lower_bound(dis, MDSMap::STATE_REJOIN);
+      for (const auto& r : dis) {
+	if (r == whoami)
+	  continue; // not me
+	if (!olddis.count(r) || restart.count(r)) {  // newly so?
+	  mdcache->kick_discovers(r);
+	  mdcache->kick_open_ino_peers(r);
+	}
+      }
+    }
+  }
+
+  if (oldmap.is_degraded() && !cluster_degraded && state >= MDSMap::STATE_ACTIVE) {
+    dout(1) << "cluster recovered." << dendl;
+    auto it = waiting_for_active_peer.find(MDS_RANK_NONE);
+    if (it != waiting_for_active_peer.end()) {
+      queue_waiters(it->second);
+      waiting_for_active_peer.erase(it);
+    }
+  }
+
+  // did someone leave a "bootstrapping" state? We can't connect until then to
+  // allow messenger "myname" updates.
+  {
+    std::vector<mds_rank_t> erase;
+    for (auto& [rank, queue] : waiting_for_bootstrapping_peer) {
+      auto state = mdsmap->get_state(rank);
+      if (state > MDSMap::STATE_REPLAY) {
+        queue_waiters(queue);
+        erase.push_back(rank);
+      }
+    }
+    for (const auto& rank : erase) {
+      waiting_for_bootstrapping_peer.erase(rank);
+    }
+  }
+  // for testing...
+  if (unlikely(g_conf().get_val<bool>("mds_connect_bootstrapping"))) {
+    std::set<mds_rank_t> bootstrapping;
+    mdsmap->get_mds_set(bootstrapping, MDSMap::STATE_REPLAY);
+    mdsmap->get_mds_set(bootstrapping, MDSMap::STATE_CREATING);
+    mdsmap->get_mds_set(bootstrapping, MDSMap::STATE_STARTING);
+    for (const auto& rank : bootstrapping) {
+      auto m = make_message<MMDSMap>(monc->get_fsid(), *mdsmap);
+      send_message_mds(std::move(m), rank);
+    }
+  }
+
+  // did someone go active?
+  if (state >= MDSMap::STATE_CLIENTREPLAY &&
+      oldstate >= MDSMap::STATE_CLIENTREPLAY) {
+    set<mds_rank_t> oldactive, active;
+    oldmap.get_mds_set_lower_bound(oldactive, MDSMap::STATE_CLIENTREPLAY);
+    mdsmap->get_mds_set_lower_bound(active, MDSMap::STATE_CLIENTREPLAY);
+    for (const auto& r : active) {
+      if (r == whoami)
+	continue; // not me
+      if (!oldactive.count(r) || restart.count(r))  // newly so?
+	handle_mds_recovery(r);
+    }
+  }
+
+  if (is_clientreplay() || is_active() || is_stopping()) {
+    // did anyone stop?
+    set<mds_rank_t> oldstopped, stopped;
+    oldmap.get_stopped_mds_set(oldstopped);
+    mdsmap->get_stopped_mds_set(stopped);
+    for (const auto& r : stopped)
+      if (oldstopped.count(r) == 0) {     // newly so?
+	mdcache->migrator->handle_mds_failure_or_stop(r);
+	if (mdsmap->get_tableserver() == whoami)
+	  snapserver->handle_mds_failure_or_stop(r);
+      }
+  }
+
+  {
+    map<epoch_t,MDSContext::vec >::iterator p = waiting_for_mdsmap.begin();
+    while (p != waiting_for_mdsmap.end() && p->first <= mdsmap->get_epoch()) {
+      MDSContext::vec ls;
+      ls.swap(p->second);
+      waiting_for_mdsmap.erase(p++);
+      queue_waiters(ls);
+    }
+  }
+
+  if (is_active()) {
+    // Before going active, set OSD epoch barrier to latest (so that
+    // we don't risk handing out caps to clients with old OSD maps that
+    // might not include barriers from the previous incarnation of this MDS)
+    set_osd_epoch_barrier(objecter->with_osdmap(
+			    std::mem_fn(&OSDMap::get_epoch)));
+
+    /* Now check if we should hint to the OSD that a read may follow */
+    if (mdsmap->has_standby_replay(whoami))
+      mdlog->set_write_iohint(0);
+    else
+      mdlog->set_write_iohint(CEPH_OSD_OP_FLAG_FADVISE_DONTNEED);
+  }
+
+  if (oldmap.get_max_mds() != mdsmap->get_max_mds()) {
+    purge_queue.update_op_limit(*mdsmap);
+  }
+
+  if (mdsmap->get_inline_data_enabled() && !oldmap.get_inline_data_enabled())
+    dout(0) << "WARNING: inline_data support has been deprecated and will be removed in a future release" << dendl;
+
+  mdcache->handle_mdsmap(*mdsmap, oldmap);
+
+  if (metric_aggregator != nullptr) {
+    metric_aggregator->notify_mdsmap(*mdsmap);
+  }
+  metrics_handler.notify_mdsmap(*mdsmap);
+}
+
+/**
+ * Handle the recovery of peer rank `who`: let the cache know, then
+ * queue every context that was waiting for that peer to become active.
+ */
+void MDSRank::handle_mds_recovery(mds_rank_t who)
+{
+  dout(5) << "handle_mds_recovery mds." << who << dendl;
+
+  mdcache->handle_mds_recovery(who);
+
+  // operator[] yields an (empty) entry even when nothing was waiting;
+  // the entry is dropped again immediately afterwards.
+  auto& parked = waiting_for_active_peer[who];
+  queue_waiters(parked);
+  waiting_for_active_peer.erase(who);
+}
+
+/**
+ * Fan the failure of rank `who` out to the cache, the snap server (only
+ * when this rank is the table server), the snap client and the scrub
+ * stack.  A failure notification about our own rank is logged and ignored.
+ */
+void MDSRank::handle_mds_failure(mds_rank_t who)
+{
+  const bool about_myself = (who == whoami);
+  if (about_myself) {
+    dout(5) << "handle_mds_failure for myself; not doing anything" << dendl;
+    return;
+  }
+
+  dout(5) << "handle_mds_failure mds." << who << dendl;
+
+  mdcache->handle_mds_failure(who);
+
+  // only the table server rank owns a snap server that needs to hear this
+  if (mdsmap->get_tableserver() == whoami) {
+    snapserver->handle_mds_failure_or_stop(who);
+  }
+
+  snapclient->handle_mds_failure(who);
+  scrubstack->handle_mds_failure(who);
+}
+
+/**
+ * Execute one asok command against this rank and report the outcome
+ * through on_finish.
+ *
+ * Most commands run synchronously and fall through to the `out:` label,
+ * where on_finish is invoked with the return code `r`, the text collected
+ * in `css` and `outbl`.  The commands "session evict"/"client evict",
+ * "scrub start"/"scrub_start", "scrub abort", "scrub pause" and
+ * "cache drop" hand on_finish to an asynchronous path and return early
+ * without touching `out:`.
+ *
+ * @param command    command prefix selecting the branch below
+ * @param cmdmap     parsed command arguments, read with cmd_getval()
+ * @param f          formatter that receives structured output
+ * @param inbl       input payload (not used by any branch here)
+ * @param on_finish  completion callback (return code, message, output)
+ */
+void MDSRankDispatcher::handle_asok_command(
+  std::string_view command,
+  const cmdmap_t& cmdmap,
+  Formatter *f,
+  const bufferlist &inbl,
+  std::function<void(int,const std::string&,bufferlist&)> on_finish)
+{
+  int r = 0;
+  CachedStackStringStream css;
+  bufferlist outbl;
+  if (command == "dump_ops_in_flight" ||
+      command == "ops") {
+    if (!op_tracker.dump_ops_in_flight(f)) {
+      *css << "op_tracker disabled; set mds_enable_op_tracker=true to enable";
+    }
+  } else if (command == "dump_blocked_ops") {
+    if (!op_tracker.dump_ops_in_flight(f, true)) {
+      *css << "op_tracker disabled; set mds_enable_op_tracker=true to enable";
+    }
+  } else if (command == "dump_historic_ops") {
+    if (!op_tracker.dump_historic_ops(f)) {
+      *css << "op_tracker disabled; set mds_enable_op_tracker=true to enable";
+    }
+  } else if (command == "dump_historic_ops_by_duration") {
+    if (!op_tracker.dump_historic_ops(f, true)) {
+      *css << "op_tracker disabled; set mds_enable_op_tracker=true to enable";
+    }
+  } else if (command == "osdmap barrier") {
+    int64_t target_epoch = 0;
+    bool got_val = cmd_getval(cmdmap, "target_epoch", target_epoch);
+
+    if (!got_val) {
+      *css << "no target epoch given";
+      r = -CEPHFS_EINVAL;
+      goto out;
+    }
+    {
+      // the barrier is set under mds_lock; the wait below runs unlocked
+      std::lock_guard l(mds_lock);
+      set_osd_epoch_barrier(target_epoch);
+    }
+    // NOTE(review): ec is filled in by the wait but never inspected.
+    boost::system::error_code ec;
+    dout(4) << __func__ << ": possibly waiting for OSD epoch " << target_epoch << dendl;
+    objecter->wait_for_map(target_epoch, ceph::async::use_blocked[ec]);
+  } else if (command == "session ls" ||
+             command == "client ls") {
+    std::lock_guard l(mds_lock);
+    bool cap_dump = false;
+    std::vector<std::string> filter_args;
+    cmd_getval(cmdmap, "cap_dump", cap_dump);
+    cmd_getval(cmdmap, "filters", filter_args);
+
+    SessionFilter filter;
+    r = filter.parse(filter_args, css.get());
+    if (r != 0) {
+      goto out;
+    }
+    dump_sessions(filter, f, cap_dump);
+  } else if (command == "session evict" ||
+             command == "client evict") {
+    std::lock_guard l(mds_lock);
+    std::vector<std::string> filter_args;
+    cmd_getval(cmdmap, "filters", filter_args);
+
+    SessionFilter filter;
+    r = filter.parse(filter_args, css.get());
+    if (r != 0) {
+      r = -CEPHFS_EINVAL;
+      goto out;
+    }
+    // evict_clients() takes over on_finish
+    evict_clients(filter, on_finish);
+    return;
+  } else if (command == "session kill") {
+    std::string client_id;
+    if (!cmd_getval(cmdmap, "client_id", client_id)) {
+      *css << "Invalid client_id specified";
+      r = -CEPHFS_ENOENT;
+      goto out;
+    }
+    std::lock_guard l(mds_lock);
+    bool evicted = evict_client(strtol(client_id.c_str(), 0, 10), true,
+        g_conf()->mds_session_blocklist_on_evict, *css);
+    if (!evicted) {
+      dout(15) << css->strv() << dendl;
+      r = -CEPHFS_ENOENT;
+    }
+  } else if (command == "session config" ||
+             command == "client config") {
+    int64_t client_id = 0;
+    std::string option;
+    std::string value;
+
+    // Without a client id there is nothing to configure; bail out instead
+    // of handing an indeterminate id to config_client().
+    if (!cmd_getval(cmdmap, "client_id", client_id)) {
+      *css << "Invalid client_id specified";
+      r = -CEPHFS_EINVAL;
+      goto out;
+    }
+    cmd_getval(cmdmap, "option", option);
+    bool got_value = cmd_getval(cmdmap, "value", value);
+
+    std::lock_guard l(mds_lock);
+    r = config_client(client_id, !got_value, option, value, *css);
+  } else if (command == "scrub start" ||
+             command == "scrub_start") {
+    if (whoami != 0) {
+      *css << "Not rank 0";
+      r = -CEPHFS_EXDEV;
+      goto out;
+    }
+
+    string path;
+    string tag;
+    vector<string> scrubop_vec;
+    cmd_getval(cmdmap, "scrubops", scrubop_vec);
+    cmd_getval(cmdmap, "path", path);
+    cmd_getval(cmdmap, "tag", tag);
+
+    // run on the finisher; on_finish fires from the scrub completion
+    finisher->queue(
+      new LambdaContext(
+        [this, on_finish, f, path, tag, scrubop_vec](int r) {
+          command_scrub_start(
+            f, path, tag, scrubop_vec,
+            new LambdaContext(
+              [on_finish](int r) {
+                bufferlist outbl;
+                on_finish(r, {}, outbl);
+              }));
+        }));
+    return;
+  } else if (command == "scrub abort") {
+    if (whoami != 0) {
+      *css << "Not rank 0";
+      r = -CEPHFS_EXDEV;
+      goto out;
+    }
+
+    finisher->queue(
+      new LambdaContext(
+        [this, on_finish, f](int r) {
+          command_scrub_abort(
+            f,
+            new LambdaContext(
+              [on_finish, f](int r) {
+                bufferlist outbl;
+                f->open_object_section("result");
+                f->dump_int("return_code", r);
+                f->close_section();
+                on_finish(r, {}, outbl);
+              }));
+        }));
+    return;
+  } else if (command == "scrub pause") {
+    if (whoami != 0) {
+      *css << "Not rank 0";
+      r = -CEPHFS_EXDEV;
+      goto out;
+    }
+
+    finisher->queue(
+      new LambdaContext(
+        [this, on_finish, f](int r) {
+          command_scrub_pause(
+            f,
+            new LambdaContext(
+              [on_finish, f](int r) {
+                bufferlist outbl;
+                f->open_object_section("result");
+                f->dump_int("return_code", r);
+                f->close_section();
+                on_finish(r, {}, outbl);
+              }));
+        }));
+    return;
+  } else if (command == "scrub resume") {
+    if (whoami != 0) {
+      *css << "Not rank 0";
+      r = -CEPHFS_EXDEV;
+      goto out;
+    }
+    command_scrub_resume(f);
+  } else if (command == "scrub status") {
+    command_scrub_status(f);
+  } else if (command == "tag path") {
+    if (whoami != 0) {
+      *css << "Not rank 0";
+      r = -CEPHFS_EXDEV;
+      goto out;
+    }
+    string path;
+    cmd_getval(cmdmap, "path", path);
+    string tag;
+    cmd_getval(cmdmap, "tag", tag);
+    command_tag_path(f, path, tag);
+  } else if (command == "flush_path") {
+    string path;
+    cmd_getval(cmdmap, "path", path);
+    command_flush_path(f, path);
+  } else if (command == "flush journal") {
+    command_flush_journal(f);
+  } else if (command == "get subtrees") {
+    command_get_subtrees(f);
+  } else if (command == "export dir") {
+    string path;
+    if(!cmd_getval(cmdmap, "path", path)) {
+      *css << "malformed path";
+      r = -CEPHFS_EINVAL;
+      goto out;
+    }
+    int64_t rank;
+    if(!cmd_getval(cmdmap, "rank", rank)) {
+      *css << "malformed rank";
+      r = -CEPHFS_EINVAL;
+      goto out;
+    }
+    command_export_dir(f, path, (mds_rank_t)rank);
+  } else if (command == "dump cache") {
+    std::lock_guard l(mds_lock);
+    string path;
+    // no path: dump through the formatter; otherwise dump to that path
+    if (!cmd_getval(cmdmap, "path", path)) {
+      r = mdcache->dump_cache(f);
+    } else {
+      r = mdcache->dump_cache(path);
+    }
+  } else if (command == "cache drop") {
+    int64_t timeout = 0;
+    cmd_getval(cmdmap, "timeout", timeout);
+    finisher->queue(
+      new LambdaContext(
+        [this, on_finish, f, timeout](int r) {
+          command_cache_drop(
+            timeout, f,
+            new LambdaContext(
+              [on_finish](int r) {
+                bufferlist outbl;
+                on_finish(r, {}, outbl);
+              }));
+        }));
+    return;
+  } else if (command == "cache status") {
+    std::lock_guard l(mds_lock);
+    mdcache->cache_status(f);
+  } else if (command == "dump tree") {
+    command_dump_tree(cmdmap, *css, f);
+  } else if (command == "dump loads") {
+    std::lock_guard l(mds_lock);
+    r = balancer->dump_loads(f);
+  } else if (command == "dump snaps") {
+    std::lock_guard l(mds_lock);
+    string server;
+    cmd_getval(cmdmap, "server", server);
+    if (server == "--server") {
+      if (mdsmap->get_tableserver() == whoami) {
+        snapserver->dump(f);
+      } else {
+        r = -CEPHFS_EXDEV;
+        *css << "Not snapserver";
+      }
+    } else {
+      r = snapclient->dump_cache(f);
+    }
+  } else if (command == "force_readonly") {
+    std::lock_guard l(mds_lock);
+    mdcache->force_readonly();
+  } else if (command == "dirfrag split") {
+    // NOTE(review): the bool results of the dirfrag helpers are not
+    // reflected in r; a failure is only visible through the text in css.
+    command_dirfrag_split(cmdmap, *css);
+  } else if (command == "dirfrag merge") {
+    command_dirfrag_merge(cmdmap, *css);
+  } else if (command == "dirfrag ls") {
+    command_dirfrag_ls(cmdmap, *css, f);
+  } else if (command == "openfiles ls") {
+    command_openfiles_ls(f);
+  } else if (command == "dump inode") {
+    command_dump_inode(f, cmdmap, *css);
+  } else if (command == "damage ls") {
+    std::lock_guard l(mds_lock);
+    damage_table.dump(f);
+  } else if (command == "damage rm") {
+    std::lock_guard l(mds_lock);
+    // Read into a real int64_t and convert explicitly rather than
+    // reinterpreting the damage id's storage through a reference cast.
+    int64_t id = 0;
+    if (!cmd_getval(cmdmap, "damage_id", id)) {
+      r = -CEPHFS_EINVAL;
+      goto out;
+    }
+    damage_table.erase(static_cast<damage_entry_id_t>(id));
+  } else {
+    r = -CEPHFS_ENOSYS;
+  }
+out:
+  on_finish(r, css->str(), outbl);
+}
+
+/**
+ * Evict every client session selected by `filter`.
+ *
+ * This function drops the mds_lock, so don't do anything with
+ * MDSRank after calling it (we could have gone into shutdown): just
+ * send your result back to the calling client and finish.
+ *
+ * on_finish is called right away with -CEPHFS_EAGAIN while the MDS is in
+ * a replay state and with 0 when no session matches; otherwise it is
+ * called from the gather completion once all evictions have reported in.
+ */
+void MDSRankDispatcher::evict_clients(
+  const SessionFilter &filter,
+  std::function<void(int,const std::string&,bufferlist&)> on_finish)
+{
+  bufferlist outbl;
+  if (is_any_replay()) {
+    on_finish(-CEPHFS_EAGAIN, "MDS is replaying log", outbl);
+    return;
+  }
+
+  // pick the client sessions the filter selects
+  const auto reconnecting = std::bind(&Server::waiting_for_reconnect, server,
+                                      std::placeholders::_1);
+  std::vector<Session*> victims;
+  for (const auto& [name, session] : sessionmap.get_sessions()) {
+    if (name.is_client() && filter.match(*session, reconnecting)) {
+      victims.push_back(session);
+    }
+  }
+
+  dout(20) << __func__ << " matched " << victims.size() << " sessions" << dendl;
+
+  if (victims.empty()) {
+    on_finish(0, {}, outbl);
+    return;
+  }
+
+  // one sub-context per victim; the finisher reports the gathered result
+  Context *all_done = new LambdaContext([on_finish](int r) {
+    bufferlist bl;
+    on_finish(r, {}, bl);
+  });
+  C_GatherBuilder gather(g_ceph_context, all_done);
+  for (Session *victim : victims) {
+    CachedStackStringStream css;  // per-eviction message text, not reported
+    evict_client(victim->get_client().v, false,
+                 g_conf()->mds_session_blocklist_on_evict, *css, gather.new_sub());
+  }
+  gather.activate();
+}
+
+/**
+ * Emit a "sessions" array holding one "session" object for every client
+ * session accepted by `filter`; `cap_dump` is passed on to Session::dump().
+ */
+void MDSRankDispatcher::dump_sessions(const SessionFilter &filter, Formatter *f, bool cap_dump) const
+{
+  const auto reconnecting = std::bind(&Server::waiting_for_reconnect, server,
+                                      std::placeholders::_1);
+
+  // Dump sessions, decorated with recovery/replay status
+  f->open_array_section("sessions");
+  for (auto& [name, s] : sessionmap.get_sessions()) {
+    // non-client entities and filtered-out sessions are skipped
+    const bool wanted = name.is_client() && filter.match(*s, reconnecting);
+    if (wanted) {
+      f->open_object_section("session");
+      s->dump(f, cap_dump);
+      f->close_section();
+    }
+  }
+  f->close_section(); // sessions
+}
+
+/**
+ * Translate the textual scrub options into flags and enqueue a scrub of
+ * `path` under mds_lock.  Entries of `scrubop_vec` other than "force",
+ * "recursive" and "repair" are ignored.
+ */
+void MDSRank::command_scrub_start(Formatter *f,
+                                  std::string_view path, std::string_view tag,
+                                  const vector<string>& scrubop_vec, Context *on_finish)
+{
+  bool force = false;
+  bool recursive = false;
+  bool repair = false;
+  for (const auto& flag : scrubop_vec) {
+    force = force || (flag == "force");
+    recursive = recursive || (flag == "recursive");
+    repair = repair || (flag == "repair");
+  }
+
+  std::lock_guard locker(mds_lock);
+  // scrub_dentry() finishers will dump the data for us; we're done!
+  mdcache->enqueue_scrub(path, tag, force, recursive, repair, f, on_finish);
+}
+
+/**
+ * Enqueue a forced, recursive, non-repairing scrub of `path` carrying
+ * `tag` and block until it completes.  The completion code is discarded.
+ */
+void MDSRank::command_tag_path(Formatter *f,
+    std::string_view path, std::string_view tag)
+{
+  C_SaferCond done;
+  {
+    // hold mds_lock only while enqueueing, not while waiting
+    std::lock_guard locker(mds_lock);
+    mdcache->enqueue_scrub(path, tag,
+                           /* force */ true, /* recursive */ true,
+                           /* repair */ false, f, &done);
+  }
+  done.wait();
+}
+
+/**
+ * Ask the scrub stack, under mds_lock, to abort scrubbing; `on_finish`
+ * is handed to the scrub stack.  The formatter is not used here.
+ */
+void MDSRank::command_scrub_abort(Formatter *f, Context *on_finish)
+{
+  std::lock_guard locker(mds_lock);
+  scrubstack->scrub_abort(on_finish);
+}
+
+/**
+ * Ask the scrub stack, under mds_lock, to pause scrubbing; `on_finish`
+ * is handed to the scrub stack.  The formatter is not used here.
+ */
+void MDSRank::command_scrub_pause(Formatter *f, Context *on_finish)
+{
+  std::lock_guard locker(mds_lock);
+  scrubstack->scrub_pause(on_finish);
+}
+
+/**
+ * Resume scrubbing under mds_lock and report the scrub stack's return
+ * value as "return_code" inside a "result" object.
+ */
+void MDSRank::command_scrub_resume(Formatter *f)
+{
+  std::lock_guard locker(mds_lock);
+  const int rc = scrubstack->scrub_resume();
+
+  f->open_object_section("result");
+  f->dump_int("return_code", rc);
+  f->close_section();
+}
+
+/**
+ * Have the scrub stack write its status into `f` while holding mds_lock.
+ */
+void MDSRank::command_scrub_status(Formatter *f)
+{
+  std::lock_guard locker(mds_lock);
+  scrubstack->scrub_status(f);
+}
+
+/**
+ * Flush the dentry at `path`, wait for the flush to complete and report
+ * its result as "return_code" inside a "results" object.
+ */
+void MDSRank::command_flush_path(Formatter *f, std::string_view path)
+{
+  C_SaferCond flushed;
+  {
+    // the lock covers only the submission; the wait happens unlocked
+    std::lock_guard locker(mds_lock);
+    mdcache->flush_dentry(path, &flushed);
+  }
+  const int rc = flushed.wait();
+
+  f->open_object_section("results");
+  f->dump_int("return_code", rc);
+  f->close_section(); // results
+}
+
+// Synchronous wrapper around the asynchronous "journal flush" context:
+// start a C_Flush_Journal under mds_lock, wait for it without the lock,
+// then report its message text and return code in a "result" object.
+void MDSRank::command_flush_journal(Formatter *f)
+{
+  ceph_assert(f != nullptr);
+
+  C_SaferCond flushed;
+  CachedStackStringStream css;
+  {
+    std::lock_guard locker(mds_lock);
+    auto *flusher = new C_Flush_Journal(mdcache, mdlog, this, css.get(), &flushed);
+    flusher->send();
+  }
+  const int rc = flushed.wait();
+
+  f->open_object_section("result");
+  f->dump_string("message", css->strv());
+  f->dump_int("return_code", rc);
+  f->close_section();
+}
+
+/**
+ * Dump every subtree known to the cache as one "subtree" object inside a
+ * "subtrees" array: authority pair, export pin information and the
+ * directory's own dump.  Runs entirely under mds_lock.
+ */
+void MDSRank::command_get_subtrees(Formatter *f)
+{
+  ceph_assert(f != nullptr);
+  std::lock_guard locker(mds_lock);
+
+  std::vector<CDir*> subtrees;
+  mdcache->get_subtrees(subtrees);
+
+  f->open_array_section("subtrees");
+  for (const auto& subtree : subtrees) {
+    f->open_object_section("subtree");
+
+    f->dump_bool("is_auth", subtree->is_auth());
+    const auto auth = subtree->get_dir_auth();
+    f->dump_int("auth_first", auth.first);
+    f->dump_int("auth_second", auth.second);
+
+    // negative pin values are reported as -1; the two bools tell which
+    // ephemeral kind, if any, the raw value stood for
+    const mds_rank_t export_pin = subtree->inode->get_export_pin(false);
+    const auto shown_pin = export_pin >= 0 ? export_pin : -1;
+    f->dump_int("export_pin", shown_pin);
+    f->dump_bool("distributed_ephemeral_pin", export_pin == MDS_RANK_EPHEMERAL_DIST);
+    f->dump_bool("random_ephemeral_pin", export_pin == MDS_RANK_EPHEMERAL_RAND);
+
+    f->dump_int("export_pin_target", subtree->get_export_pin(false));
+
+    f->open_object_section("dir");
+    subtree->dump(f);
+    f->close_section();
+
+    f->close_section();
+  }
+  f->close_section();
+}
+
+
+/**
+ * Formatter-facing wrapper around _command_export_dir(): runs the export
+ * and reports its return value as "return_code" in a "results" object.
+ */
+void MDSRank::command_export_dir(Formatter *f,
+    std::string_view path,
+    mds_rank_t target)
+{
+  const int rc = _command_export_dir(path, target);
+
+  f->open_object_section("results");
+  f->dump_int("return_code", rc);
+  f->close_section(); // results
+}
+
+/**
+ * Start exporting the directory at `path` to rank `target`.
+ *
+ * Runs under mds_lock.
+ *
+ * @return 0 once the export has been handed to the migrator;
+ *         -CEPHFS_ENOENT if `target` is this rank, not up or not in, or
+ *         if `path` cannot be resolved by cache_traverse();
+ *         -CEPHFS_EINVAL if the inode has no frag_t() dirfrag or this
+ *         rank is not auth for it.
+ */
+int MDSRank::_command_export_dir(
+    std::string_view path,
+    mds_rank_t target)
+{
+  std::lock_guard l(mds_lock);
+  filepath fp(path);
+
+  // exporting to ourselves or to a rank that is not up and in is pointless
+  if (target == whoami || !mdsmap->is_up(target) || !mdsmap->is_in(target)) {
+    derr << "bad MDS target " << target << dendl;
+    return -CEPHFS_ENOENT;
+  }
+
+  CInode *in = mdcache->cache_traverse(fp);
+  if (!in) {
+    derr << "bad path '" << path << "'" << dendl;
+    return -CEPHFS_ENOENT;
+  }
+  CDir *dir = in->get_dirfrag(frag_t());
+  if (!dir || !(dir->is_auth())) {
+    derr << "bad export_dir path dirfrag frag_t() or dir not auth" << dendl;
+    return -CEPHFS_EINVAL;
+  }
+
+  mdcache->migrator->export_dir(dir, target);
+  return 0;
+}
+
+/**
+ * Dump the cached tree below "root" (default "/") into an "inodes"
+ * array, descending at most "depth" levels (-1 when not given).  If the
+ * root inode is not in cache, a message is written to `ss` and nothing
+ * is dumped.
+ */
+void MDSRank::command_dump_tree(const cmdmap_t &cmdmap, std::ostream &ss, Formatter *f)
+{
+  std::string root;
+  cmd_getval(cmdmap, "root", root);
+  if (root.empty()) {
+    root = "/";
+  }
+
+  int64_t depth;
+  if (!cmd_getval(cmdmap, "depth", depth)) {
+    depth = -1;
+  }
+
+  std::lock_guard locker(mds_lock);
+  CInode *start = mdcache->cache_traverse(filepath(root.c_str()));
+  if (start == nullptr) {
+    ss << "root inode is not in cache";
+    return;
+  }
+
+  f->open_array_section("inodes");
+  mdcache->dump_tree(start, 0, depth, f);
+  f->close_section();
+}
+
+/**
+ * Resolve the "path" and "frag" command arguments to a cached dirfrag
+ * this rank is auth for.
+ *
+ * @return the dirfrag, or nullptr after writing the reason to `ss`
+ *         (missing argument, inode or frag not in cache, unparsable
+ *         frag, or not auth).
+ */
+CDir *MDSRank::_command_dirfrag_get(
+    const cmdmap_t &cmdmap,
+    std::ostream &ss)
+{
+  std::string path;
+  if (!cmd_getval(cmdmap, "path", path)) {
+    ss << "missing path argument";
+    return nullptr;
+  }
+
+  std::string frag_str;
+  if (!cmd_getval(cmdmap, "frag", frag_str)) {
+    ss << "missing frag argument";
+    return nullptr;
+  }
+
+  CInode *diri = mdcache->cache_traverse(filepath(path.c_str()));
+  if (diri == nullptr) {
+    // TODO really we should load something in if it's not in cache,
+    // but the infrastructure is harder, and we might still be unable
+    // to act on it if someone else is auth.
+    ss << "directory '" << path << "' inode not in cache";
+    return nullptr;
+  }
+
+  frag_t fg;
+  if (!fg.parse(frag_str.c_str())) {
+    ss << "frag " << frag_str << " failed to parse";
+    return nullptr;
+  }
+
+  CDir *dirfrag = diri->get_dirfrag(fg);
+  if (dirfrag == nullptr) {
+    ss << "frag " << diri->ino() << "/" << fg << " not in cache ("
+          "use `dirfrag ls` to see if it should exist)";
+    return nullptr;
+  }
+
+  if (!dirfrag->is_auth()) {
+    ss << "frag " << dirfrag->dirfrag() << " not auth (auth = "
+       << dirfrag->authority() << ")";
+    return nullptr;
+  }
+
+  return dirfrag;
+}
+
+/**
+ * Split the dirfrag named by the "path"/"frag" arguments by "bits" bits.
+ * Runs under mds_lock.
+ *
+ * @return true if the split was handed to the cache; false, with the
+ *         reason in `ss`, if "bits" is missing or not positive or the
+ *         dirfrag could not be resolved.
+ */
+bool MDSRank::command_dirfrag_split(
+    cmdmap_t cmdmap,
+    std::ostream &ss)
+{
+  std::lock_guard locker(mds_lock);
+
+  int64_t bits = 0;
+  if (!cmd_getval(cmdmap, "bits", bits)) {
+    ss << "missing bits argument";
+    return false;
+  }
+  if (bits <= 0) {
+    ss << "must split by >0 bits";
+    return false;
+  }
+
+  CDir *target = _command_dirfrag_get(cmdmap, ss);
+  if (target == nullptr) {
+    // _command_dirfrag_get() already explained why in ss
+    return false;
+  }
+
+  mdcache->split_dir(target, bits);
+  return true;
+}
+
+/**
+ * Merge the fragments under "frag" of the directory at "path".
+ * Runs under mds_lock.
+ *
+ * @return true if the merge was handed to the cache; false, with the
+ *         reason in `ss`, if an argument is missing, the inode is not in
+ *         cache or the frag string does not parse.
+ */
+bool MDSRank::command_dirfrag_merge(
+    cmdmap_t cmdmap,
+    std::ostream &ss)
+{
+  std::lock_guard locker(mds_lock);
+
+  std::string path;
+  if (!cmd_getval(cmdmap, "path", path)) {
+    ss << "missing path argument";
+    return false;
+  }
+
+  std::string frag_str;
+  if (!cmd_getval(cmdmap, "frag", frag_str)) {
+    ss << "missing frag argument";
+    return false;
+  }
+
+  CInode *diri = mdcache->cache_traverse(filepath(path.c_str()));
+  if (diri == nullptr) {
+    ss << "directory '" << path << "' inode not in cache";
+    return false;
+  }
+
+  frag_t fg;
+  if (!fg.parse(frag_str.c_str())) {
+    ss << "frag " << frag_str << " failed to parse";
+    return false;
+  }
+
+  mdcache->merge_dir(diri, fg);
+  return true;
+}
+
+/**
+ * List the leaf fragments of the directory at "path" as a "frags" array;
+ * each entry carries the frag's value, bits and a "<hex value>/<bits>"
+ * string.  Runs under mds_lock.
+ *
+ * @return true on success; false, with the reason in `ss`, if "path" is
+ *         missing or the inode is not in cache.
+ */
+bool MDSRank::command_dirfrag_ls(
+    cmdmap_t cmdmap,
+    std::ostream &ss,
+    Formatter *f)
+{
+  std::lock_guard locker(mds_lock);
+
+  std::string path;
+  if (!cmd_getval(cmdmap, "path", path)) {
+    ss << "missing path argument";
+    return false;
+  }
+
+  CInode *diri = mdcache->cache_traverse(filepath(path.c_str()));
+  if (diri == nullptr) {
+    ss << "directory inode not in cache";
+    return false;
+  }
+
+  // NB using get_leaves_under instead of get_dirfrags to give
+  // you the list of what dirfrags may exist, not which are in cache
+  frag_vec_t leaves;
+  f->open_array_section("frags");
+  diri->dirfragtree.get_leaves_under(frag_t(), leaves);
+  for (const auto& leaf : leaves) {
+    CachedStackStringStream text;
+    f->open_object_section("frag");
+    f->dump_int("value", leaf.value());
+    f->dump_int("bits", leaf.bits());
+    *text << std::hex << leaf.value() << "/" << std::dec << leaf.bits();
+    f->dump_string("str", text->strv());
+    f->close_section();
+  }
+  f->close_section();
+
+  return true;
+}
+
+/**
+ * Have the cache dump its open files into `f` while holding mds_lock.
+ */
+void MDSRank::command_openfiles_ls(Formatter *f)
+{
+  std::lock_guard locker(mds_lock);
+  mdcache->dump_openfiles(f);
+}
+
+/**
+ * Dump the inode given by the "number" argument into `f`.  Runs under
+ * mds_lock; a missing argument or a failed dump is reported through `ss`.
+ */
+void MDSRank::command_dump_inode(Formatter *f, const cmdmap_t &cmdmap, std::ostream &ss)
+{
+  std::lock_guard locker(mds_lock);
+
+  int64_t number;
+  if (!cmd_getval(cmdmap, "number", number)) {
+    ss << "missing inode number";
+    return;
+  }
+
+  if (!mdcache->dump_inode(f, number)) {
+    ss << "dump inode failed, wrong inode number or the inode is not cached";
+  }
+}
+
+/**
+ * Dump the filesystem name, the status section that belongs to the
+ * current state (replay, resolve, reconnect, rejoin or clientreplay;
+ * nothing for any other state) and finally the rank uptime.
+ */
+void MDSRank::dump_status(Formatter *f) const
+{
+  f->dump_string("fs_name", fs_name);
+
+  switch (state) {
+  case MDSMap::STATE_REPLAY:
+  case MDSMap::STATE_STANDBY_REPLAY:
+    mdlog->dump_replay_status(f);
+    break;
+  case MDSMap::STATE_RESOLVE:
+    mdcache->dump_resolve_status(f);
+    break;
+  case MDSMap::STATE_RECONNECT:
+    server->dump_reconnect_status(f);
+    break;
+  case MDSMap::STATE_REJOIN:
+    mdcache->dump_rejoin_status(f);
+    break;
+  case MDSMap::STATE_CLIENTREPLAY:
+    dump_clientreplay_status(f);
+    break;
+  default:
+    // no state-specific section for the remaining states
+    break;
+  }
+
+  f->dump_float("rank_uptime", get_uptime().count());
+}
+
+/**
+ * Dump a "clientreplay_status" object holding the replay queue length
+ * and the cache's count of client requests.
+ */
+void MDSRank::dump_clientreplay_status(Formatter *f) const
+{
+  const auto queued = replay_queue.size();
+  const auto in_progress = mdcache->get_num_client_requests();
+
+  f->open_object_section("clientreplay_status");
+  f->dump_unsigned("clientreplay_queue", queued);
+  f->dump_unsigned("active_replay", in_progress);
+  f->close_section();
+}
+
+/**
+ * Re-read the log client options and, when parsing returns 0, push them
+ * into the cluster log client.  The monitor map is logged either way.
+ */
+void MDSRankDispatcher::update_log_config()
+{
+  map<string,string> log_to_monitors, log_to_syslog;
+  map<string,string> log_channel, log_prio;
+  map<string,string> log_to_graylog, log_to_graylog_host, log_to_graylog_port;
+  uuid_d fsid;
+  string host;
+
+  const int parsed = parse_log_client_options(
+    g_ceph_context, log_to_monitors, log_to_syslog,
+    log_channel, log_prio, log_to_graylog,
+    log_to_graylog_host, log_to_graylog_port,
+    fsid, host);
+  if (parsed == 0) {
+    clog->update_config(log_to_monitors, log_to_syslog,
+                        log_channel, log_prio, log_to_graylog,
+                        log_to_graylog_host, log_to_graylog_port,
+                        fsid, host);
+  }
+  dout(10) << __func__ << " log_to_monitors " << log_to_monitors << dendl;
+}
+
+void MDSRank::create_logger()
+{
+  dout(10) << "create_logger" << dendl;
+  {
+    PerfCountersBuilder mds_plb(g_ceph_context, "mds", l_mds_first, l_mds_last);
+
+    // super useful (high prio) perf stats
+    mds_plb.add_u64_counter(l_mds_request, "request", "Requests", "req",
+                            PerfCountersBuilder::PRIO_CRITICAL);
+    mds_plb.add_time_avg(l_mds_reply_latency, "reply_latency", "Reply latency", "rlat",
+                         PerfCountersBuilder::PRIO_CRITICAL);
+    mds_plb.add_u64(l_mds_inodes, "inodes", "Inodes", "inos",
+                    PerfCountersBuilder::PRIO_CRITICAL);
+    mds_plb.add_u64_counter(l_mds_forward, "forward", "Forwarding request", "fwd",
+                            PerfCountersBuilder::PRIO_INTERESTING);
+    mds_plb.add_u64(l_mds_caps, "caps", "Capabilities", "caps",
+                    PerfCountersBuilder::PRIO_INTERESTING);
+    mds_plb.add_u64_counter(l_mds_exported_inodes, "exported_inodes", "Exported inodes",
+                            "exi", PerfCountersBuilder::PRIO_INTERESTING);
+    mds_plb.add_u64_counter(l_mds_imported_inodes, "imported_inodes", "Imported inodes",
+                            "imi", PerfCountersBuilder::PRIO_INTERESTING);
+    mds_plb.add_u64_counter(l_mds_slow_reply, "slow_reply", "Slow replies", "slr",
+                              PerfCountersBuilder::PRIO_INTERESTING);
+
+    // caps msg stats
+    mds_plb.add_u64_counter(l_mdss_handle_client_caps, "handle_client_caps",
+                           "Client caps msg", "hcc", PerfCountersBuilder::PRIO_INTERESTING);
+    mds_plb.add_u64_counter(l_mdss_handle_client_caps_dirty, "handle_client_caps_dirty",
+                           "Client dirty caps msg", "hccd", PerfCountersBuilder::PRIO_INTERESTING);
+    mds_plb.add_u64_counter(l_mdss_handle_client_cap_release, "handle_client_cap_release",
+                           "Client cap release msg", "hccr", PerfCountersBuilder::PRIO_INTERESTING);
+    mds_plb.add_u64_counter(l_mdss_process_request_cap_release, "process_request_cap_release",
+                           "Process request cap release", "prcr", PerfCountersBuilder::PRIO_INTERESTING);
+    mds_plb.add_u64_counter(l_mdss_ceph_cap_op_revoke, "ceph_cap_op_revoke",
+                           "Revoke caps", "crev", PerfCountersBuilder::PRIO_INTERESTING);
+    mds_plb.add_u64_counter(l_mdss_ceph_cap_op_grant, "ceph_cap_op_grant",
+                           "Grant caps", "cgra", PerfCountersBuilder::PRIO_INTERESTING);
+    mds_plb.add_u64_counter(l_mdss_ceph_cap_op_trunc, "ceph_cap_op_trunc",
+                           "caps truncate notify", "ctru", PerfCountersBuilder::PRIO_INTERESTING);
+    mds_plb.add_u64_counter(l_mdss_ceph_cap_op_flushsnap_ack, "ceph_cap_op_flushsnap_ack",
+                           "caps truncate notify", "cfsa", PerfCountersBuilder::PRIO_INTERESTING);
+    mds_plb.add_u64_counter(l_mdss_ceph_cap_op_flush_ack, "ceph_cap_op_flush_ack",
+                           "caps truncate notify", "cfa", PerfCountersBuilder::PRIO_INTERESTING);
+    mds_plb.add_u64_counter(l_mdss_handle_inode_file_caps, "handle_inode_file_caps",
+                           "Inter mds caps msg", "hifc", PerfCountersBuilder::PRIO_INTERESTING);
+
+    // useful dir/inode/subtree stats
+    mds_plb.set_prio_default(PerfCountersBuilder::PRIO_USEFUL);
+    mds_plb.add_u64(l_mds_root_rfiles, "root_rfiles", "root inode rfiles");
+    mds_plb.add_u64(l_mds_root_rbytes, "root_rbytes", "root inode rbytes");
+    mds_plb.add_u64(l_mds_root_rsnaps, "root_rsnaps", "root inode rsnaps");
+    mds_plb.add_u64_counter(l_mds_dir_fetch, "dir_fetch", "Directory fetch");
+    mds_plb.add_u64_counter(l_mds_dir_commit, "dir_commit", "Directory commit");
+    mds_plb.add_u64_counter(l_mds_dir_split, "dir_split", "Directory split");
+    mds_plb.add_u64_counter(l_mds_dir_merge, "dir_merge", "Directory merge");
+    mds_plb.add_u64(l_mds_inodes_pinned, "inodes_pinned", "Inodes pinned");
+    mds_plb.add_u64(l_mds_inodes_expired, "inodes_expired", "Inodes expired");
+    mds_plb.add_u64(l_mds_inodes_with_caps, "inodes_with_caps",
+                    "Inodes with capabilities");
+    mds_plb.add_u64(l_mds_subtrees, "subtrees", "Subtrees");
+    mds_plb.add_u64(l_mds_load_cent, "load_cent", "Load per cent");
+    mds_plb.add_u64_counter(l_mds_openino_dir_fetch, "openino_dir_fetch",
+                            "OpenIno incomplete directory fetchings");
+
+    // low prio stats
+    mds_plb.set_prio_default(PerfCountersBuilder::PRIO_DEBUGONLY);
+    mds_plb.add_u64_counter(l_mds_reply, "reply", "Replies");
+    mds_plb.add_u64(l_mds_inodes_top, "inodes_top", "Inodes on top");
+    mds_plb.add_u64(l_mds_inodes_bottom, "inodes_bottom", "Inodes on bottom");
+    mds_plb.add_u64(
+      l_mds_inodes_pin_tail, "inodes_pin_tail", "Inodes on pin tail");
+    mds_plb.add_u64_counter(l_mds_traverse, "traverse", "Traverses");
+    mds_plb.add_u64_counter(l_mds_traverse_hit, "traverse_hit", "Traverse hits");
+    mds_plb.add_u64_counter(l_mds_traverse_forward, "traverse_forward",
+                            "Traverse forwards");
+    mds_plb.add_u64_counter(l_mds_traverse_discover, "traverse_discover",
+                            "Traverse directory discovers");
+    mds_plb.add_u64_counter(l_mds_traverse_dir_fetch, "traverse_dir_fetch",
+                            "Traverse incomplete directory content fetchings");
+    mds_plb.add_u64_counter(l_mds_traverse_remote_ino, "traverse_remote_ino",
+                            "Traverse remote dentries");
+    mds_plb.add_u64_counter(l_mds_traverse_lock, "traverse_lock",
+                            "Traverse locks");
+    mds_plb.add_u64(l_mds_dispatch_queue_len, "q", "Dispatch queue length");
+    mds_plb.add_u64_counter(l_mds_exported, "exported", "Exports");
+    mds_plb.add_u64_counter(l_mds_imported, "imported", "Imports");
+    mds_plb.add_u64_counter(l_mds_openino_backtrace_fetch, "openino_backtrace_fetch",
+                            "OpenIno backtrace fetchings");
+    mds_plb.add_u64_counter(l_mds_openino_peer_discover, "openino_peer_discover",
+                            "OpenIno peer inode discovers");
+
+    // scrub stats
+    mds_plb.add_u64(l_mds_scrub_backtrace_fetch, "scrub_backtrace_fetch",
+                    "Scrub backtrace fetchings");
+    mds_plb.add_u64(l_mds_scrub_set_tag, "scrub_set_tag",
+                    "Scrub set tags");
+    mds_plb.add_u64(l_mds_scrub_backtrace_repaired, "scrub_backtrace_repaired",
+                    "Scrub backtraces repaired");
+    mds_plb.add_u64(l_mds_scrub_inotable_repaired, "scrub_inotable_repaired",
+                    "Scrub inotable repaired");
+    mds_plb.add_u64(l_mds_scrub_dir_inodes, "scrub_dir_inodes",
+                    "Scrub directory inodes");
+    mds_plb.add_u64(l_mds_scrub_dir_base_inodes, "scrub_dir_base_inodes",
+                    "Scrub directory base inodes");
+    mds_plb.add_u64(l_mds_scrub_dirfrag_rstats, "scrub_dirfrag_rstats",
+                    "Scrub dirfrags rstates");
+    mds_plb.add_u64(l_mds_scrub_file_inodes, "scrub_file_inodes",
+                    "Scrub file inodes");
+
+    logger = mds_plb.create_perf_counters();
+    g_ceph_context->get_perfcounters_collection()->add(logger);
+  }
+
+  {
+    PerfCountersBuilder mdm_plb(g_ceph_context, "mds_mem", l_mdm_first, l_mdm_last);
+    mdm_plb.add_u64(l_mdm_ino, "ino", "Inodes", "ino",
+                    PerfCountersBuilder::PRIO_INTERESTING);
+    mdm_plb.add_u64(l_mdm_dn, "dn", "Dentries", "dn",
+                    PerfCountersBuilder::PRIO_INTERESTING);
+
+    mdm_plb.set_prio_default(PerfCountersBuilder::PRIO_USEFUL);
+    mdm_plb.add_u64_counter(l_mdm_inoa, "ino+", "Inodes opened");
+    mdm_plb.add_u64_counter(l_mdm_inos, "ino-", "Inodes closed");
+    mdm_plb.add_u64(l_mdm_dir, "dir", "Directories");
+    mdm_plb.add_u64_counter(l_mdm_dira, "dir+", "Directories opened");
+    mdm_plb.add_u64_counter(l_mdm_dirs, "dir-", "Directories closed");
+    mdm_plb.add_u64_counter(l_mdm_dna, "dn+", "Dentries opened");
+    mdm_plb.add_u64_counter(l_mdm_dns, "dn-", "Dentries closed");
+    mdm_plb.add_u64(l_mdm_cap, "cap", "Capabilities");
+    mdm_plb.add_u64_counter(l_mdm_capa, "cap+", "Capabilities added");
+    mdm_plb.add_u64_counter(l_mdm_caps, "cap-", "Capabilities removed");
+    mdm_plb.add_u64(l_mdm_heap, "heap", "Heap size");
+
+    mdm_plb.set_prio_default(PerfCountersBuilder::PRIO_DEBUGONLY);
+    mdm_plb.add_u64(l_mdm_rss, "rss", "RSS");
+
+    mlogger = mdm_plb.create_perf_counters();
+    g_ceph_context->get_perfcounters_collection()->add(mlogger);
+  }
+
+  mdlog->create_logger();
+  server->create_logger();
+  purge_queue.create_logger();
+  sessionmap.register_perfcounters();
+  mdcache->register_perfcounters();
+}
+
+void MDSRank::check_ops_in_flight()
+{
+  // Ask the op tracker for a summary line, per-op warnings and the slow count.
+  int num_slow = 0;
+  string headline;
+  vector<string> details;
+  const bool complain =
+    op_tracker.check_ops_in_flight(&headline, details, &num_slow);
+  if (complain) {
+    // The summary goes to the cluster log first, then one entry per warning.
+    clog->warn() << headline;
+    for (const auto& detail : details)
+      clog->warn() << detail;
+  }
+  mds_slow_req_count = num_slow;  // recorded even when nothing was logged
+}
+
+void MDSRankDispatcher::handle_osd_map()
+{
+  // Only an active rank that is also the table server asks the snap server
+  // to check the OSD map.
+  const bool serves_tables =
+    is_active() && mdsmap->get_tableserver() == whoami;
+  if (serves_tables)
+    snapserver->check_osd_map(true);
+
+  server->handle_osd_map();
+  purge_queue.update_op_limit(*mdsmap);
+
+  // Reaching replay through standby-replay is fine: the reconnect state
+  // journals blocklisted clients, because `replay_done` opens the journal
+  // for writing before the rank moves on to up:resolve.
+  if (!is_any_replay()) {
+    std::set<entity_addr_t> fresh_blocklist;
+    objecter->consume_blocklist_events(&fresh_blocklist);
+    const auto map_epoch = objecter->with_osdmap(
+      [](const OSDMap& osdmap) { return osdmap.get_epoch(); });
+    apply_blocklist(fresh_blocklist, map_epoch);
+  }
+
+  // The objecter normally requests OSDMap updates only on use; ask for them
+  // unconditionally so policy based on the FULL flag sees the latest map.
+  objecter->maybe_request_map();
+}
+
+int MDSRank::config_client(int64_t session_id, bool remove,
+			   const std::string& option, const std::string& value,
+			   std::ostream& ss)
+{
+  // Set, or with `remove` erase, the "timeout" entry of the session's client_metadata
+  // (sessionmap is not marked dirty).  Returns 0 or -CEPHFS_*; `ss` gets the reason.
+  Session *session = sessionmap.get_session(entity_name_t(CEPH_ENTITY_TYPE_CLIENT, session_id));
+  if (!session) {
+    ss << "session " << session_id << " not in sessionmap!";
+    return -CEPHFS_ENOENT;
+  }
+  if (option != "timeout") {
+    ss << "Invalid config option: " << option;
+    return -CEPHFS_EINVAL;
+  }
+  if (remove) {
+    auto it = session->info.client_metadata.find("timeout");
+    if (it == session->info.client_metadata.end()) {
+      ss << "Nonexistent config: " << option;
+      return -CEPHFS_ENODATA;
+    }
+    session->info.client_metadata.erase(it);
+    return 0;
+  }
+  // strtoul() consumes nothing from an empty string yet leaves *end == '\0',
+  // so also require that at least one character was actually parsed.
+  char *end = nullptr;
+  strtoul(value.c_str(), &end, 0);
+  if (end == value.c_str() || *end) {
+    ss << "Invalid config for timeout: " << value;
+    return -CEPHFS_EINVAL;
+  }
+  session->info.client_metadata[option] = value;
+  return 0;
+}
+
+bool MDSRank::evict_client(int64_t session_id,
+    bool wait, bool blocklist, std::ostream& err_ss,
+    Context *on_killed)
+{  // Evict client session `session_id`, optionally blocklisting its address first.
+  ceph_assert(ceph_mutex_is_locked_by_me(mds_lock));
+  // `wait` blocks the caller until done; `on_killed` is handed to kill_session().
+  // Mutually exclusive args
+  ceph_assert(!(wait && on_killed != nullptr));
+  // Returns false (with the reason in err_ss) only for the two early-outs below.
+  if (is_any_replay()) {
+    err_ss << "MDS is replaying log";
+    return false;
+  }
+
+  Session *session = sessionmap.get_session(
+      entity_name_t(CEPH_ENTITY_TYPE_CLIENT, session_id));
+  if (!session) {
+    err_ss << "session " << session_id << " not in sessionmap!";
+    return false;
+  }
+
+  auto& addr = session->info.inst.addr;
+  {
+    CachedStackStringStream css;
+    *css << "Evicting " << (blocklist ? "(and blocklisting) " : "")
+         << "client session " << session_id << " (" << addr << ")";
+    dout(1) << css->strv() << dendl;
+    clog->info() << css->strv();
+  }
+  // NOTE: the mon command string below is built even when `blocklist` is false.
+  dout(4) << "Preparing blocklist command... (wait=" << wait << ")" << dendl;
+  CachedStackStringStream css;
+  *css << "{\"prefix\":\"osd blocklist\", \"blocklistop\":\"add\",";
+  *css << "\"addr\":\"";
+  *css << addr;
+  *css << "\"}";
+  std::vector<std::string> cmd = {css->str()};
+  // Looks the session up again by id (it may be gone by the time this runs) and kills it.
+  auto kill_client_session = [this, session_id, wait, on_killed](){
+    ceph_assert(ceph_mutex_is_locked_by_me(mds_lock));
+    Session *session = sessionmap.get_session(
+        entity_name_t(CEPH_ENTITY_TYPE_CLIENT, session_id));
+    if (session) {
+      if (on_killed || !wait) {
+        server->kill_session(session, on_killed);
+      } else {
+        C_SaferCond on_safe;
+        server->kill_session(session, &on_safe);
+        // mds_lock is released for the duration of the wait and re-taken after.
+        mds_lock.unlock();
+        on_safe.wait();
+        mds_lock.lock();
+      }
+    } else {
+      dout(1) << "session " << session_id << " was removed while we waited "
+      "for blocklist" << dendl;
+
+      // Even though it wasn't us that removed it, kick our completion
+      // as the session has been removed.
+      if (on_killed) {
+        on_killed->complete(0);
+      }
+    }
+  };
+  // Sends the mon blocklist command; `fn` runs later under mds_lock, after the latest OSDMap.
+  auto apply_blocklist = [this, cmd](std::function<void ()> fn){
+    ceph_assert(ceph_mutex_is_locked_by_me(mds_lock));
+
+    Context *on_blocklist_done = new LambdaContext([this, fn](int r) {  // NOTE(review): r is not checked
+      objecter->wait_for_latest_osdmap(
+      lambdafy((new C_OnFinisher(
+         new LambdaContext([this, fn](int r) {
+              std::lock_guard l(mds_lock);
+              auto epoch = objecter->with_osdmap([](const OSDMap &o){
+                  return o.get_epoch();
+              });
+
+              set_osd_epoch_barrier(epoch);
+
+              fn();
+            }), finisher)
+      )));
+    });
+
+    dout(4) << "Sending mon blocklist command: " << cmd[0] << dendl;
+    monc->start_mon_command(cmd, {}, nullptr, nullptr, on_blocklist_done);
+  };
+  // wait: block here (with mds_lock dropped) around each step; otherwise chain the steps.
+  if (wait) {
+    if (blocklist) {
+      C_SaferCond inline_ctx;
+      apply_blocklist([&inline_ctx](){inline_ctx.complete(0);});
+      mds_lock.unlock();
+      inline_ctx.wait();
+      mds_lock.lock();
+    }
+
+    // We dropped mds_lock, so check that session still exists
+    session = sessionmap.get_session(entity_name_t(CEPH_ENTITY_TYPE_CLIENT,
+          session_id));
+    if (!session) {
+      dout(1) << "session " << session_id << " was removed while we waited "
+                 "for blocklist" << dendl;
+      return true;
+    }
+    kill_client_session();
+  } else {
+    if (blocklist) {
+      apply_blocklist(kill_client_session);
+    } else {
+      kill_client_session();
+    }
+  }
+
+  return true;
+}
+
+// Constructs the MDSRank base from the very same arguments, forwarded in
+// declaration order, and then registers this object with g_conf() as a
+// config observer.
+MDSRankDispatcher::MDSRankDispatcher(
+    mds_rank_t whoami_,
+    std::string fs_name_,
+    ceph::fair_mutex &mds_lock_,
+    LogChannelRef &clog_,
+    CommonSafeTimer<ceph::fair_mutex> &timer_,
+    Beacon &beacon_,
+    std::unique_ptr<MDSMap> &mdsmap_,
+    Messenger *msgr, MonClient *monc_, MgrClient *mgrc,
+    Context *respawn_hook_, Context *suicide_hook_,
+    boost::asio::io_context& ioc)
+  : MDSRank(whoami_, fs_name_, mds_lock_, clog_, timer_, beacon_, mdsmap_,
+            msgr, monc_, mgrc, respawn_hook_, suicide_hook_, ioc)
+{
+  g_conf().add_observer(this);
+}
+
+void MDSRank::command_cache_drop(uint64_t timeout, Formatter *f, Context *on_finish)
+{
+  // Start a C_Drop_Cache request while holding mds_lock.
+  dout(20) << __func__ << dendl;
+  std::lock_guard guard(mds_lock);
+  auto *drop_req = new C_Drop_Cache(server, mdcache, mdlog, this, timeout, f, on_finish);
+  drop_req->send();
+}
+
+epoch_t MDSRank::get_osd_epoch() const {
+  // Epoch of the OSDMap the objecter currently holds.
+  return objecter->with_osdmap([](const OSDMap& o) { return o.get_epoch(); });
+}
+
+// Null-terminated table of config option names; function-static, so the pointer stays valid.
+const char** MDSRankDispatcher::get_tracked_conf_keys() const {
+  static const char* tracked_keys[] = {
+    "clog_to_graylog",
+    "clog_to_graylog_host",
+    "clog_to_graylog_port",
+    "clog_to_monitors",
+    "clog_to_syslog",
+    "clog_to_syslog_facility",
+    "clog_to_syslog_level",
+    "fsid",
+    "host",
+    "mds_bal_fragment_dirs",
+    "mds_bal_fragment_interval",
+    "mds_cache_memory_limit",
+    "mds_cache_mid",
+    "mds_cache_reservation",
+    "mds_cache_trim_decay_rate",
+    "mds_cap_revoke_eviction_timeout",
+    "mds_dump_cache_threshold_file",
+    "mds_dump_cache_threshold_formatter",
+    "mds_enable_op_tracker",
+    "mds_export_ephemeral_random",
+    "mds_export_ephemeral_random_max",
+    "mds_export_ephemeral_distributed",
+    "mds_health_cache_threshold",
+    "mds_inject_migrator_session_race",
+    "mds_log_pause",
+    "mds_max_export_size",
+    "mds_max_purge_files",
+    "mds_forward_all_requests_to_auth",
+    "mds_max_purge_ops",
+    "mds_max_purge_ops_per_pg",
+    "mds_max_snaps_per_dir",
+    "mds_op_complaint_time",
+    "mds_op_history_duration",
+    "mds_op_history_size",
+    "mds_op_log_threshold",
+    "mds_recall_max_decay_rate",
+    "mds_recall_warning_decay_rate",
+    "mds_request_load_average_decay_rate",
+    "mds_session_cache_liveness_decay_rate",
+    "mds_heartbeat_reset_grace",
+    "mds_heartbeat_grace",
+    "mds_session_cap_acquisition_decay_rate",
+    "mds_max_caps_per_client",
+    "mds_session_cap_acquisition_throttle",
+    "mds_session_max_caps_throttle_ratio",
+    "mds_cap_acquisition_throttle_retry_request_time",
+    "mds_alternate_name_max",
+    "mds_dir_max_entries",
+    nullptr
+  };
+  return tracked_keys;
+}
+
+void MDSRankDispatcher::handle_conf_change(const ConfigProxy& conf, const std::set<std::string>& changed)
+{
+  // XXX with or without mds_lock!
+  // Was the named option part of this change set?
+  const auto touched = [&changed](const char *key) {
+    return changed.count(key) > 0;
+  };
+  // Values cached on this object or in the op tracker are refreshed inline.
+  if (touched("mds_heartbeat_reset_grace"))
+    _heartbeat_reset_grace = conf.get_val<uint64_t>("mds_heartbeat_reset_grace");
+  if (touched("mds_heartbeat_grace"))
+    heartbeat_grace = conf.get_val<double>("mds_heartbeat_grace");
+  if (touched("mds_op_complaint_time") || touched("mds_op_log_threshold"))
+    op_tracker.set_complaint_and_threshold(conf->mds_op_complaint_time, conf->mds_op_log_threshold);
+  if (touched("mds_op_history_size") || touched("mds_op_history_duration"))
+    op_tracker.set_history_size_and_duration(conf->mds_op_history_size, conf->mds_op_history_duration);
+  if (touched("mds_enable_op_tracker"))
+    op_tracker.set_tracking(conf->mds_enable_op_tracker);
+
+  // Any cluster-log option, or host/fsid, triggers a single update_log_config().
+  static const char *const log_keys[] = {
+    "clog_to_monitors", "clog_to_syslog", "clog_to_syslog_level",
+    "clog_to_syslog_facility", "clog_to_graylog", "clog_to_graylog_host",
+    "clog_to_graylog_port", "host", "fsid",
+  };
+  for (const char *key : log_keys) {
+    if (touched(key)) {
+      update_log_config();
+      break;
+    }
+  }
+
+  // The rest is queued on the finisher, which takes mds_lock itself; `changed` is copied.
+  finisher->queue(new LambdaContext([this, changed](int) {
+    std::scoped_lock lock(mds_lock);
+    dout(10) << "flushing conf change to components: " << changed << dendl;
+    if (changed.count("mds_log_pause") && !g_conf()->mds_log_pause) {
+      mdlog->kick_submitter();
+    }
+    sessionmap.handle_conf_change(changed);
+    server->handle_conf_change(changed);
+    mdcache->handle_conf_change(changed, *mdsmap);
+    purge_queue.handle_conf_change(changed, *mdsmap);
+  }));
+}
+
+void MDSRank::get_task_status(std::map<std::string, std::string> *status)
+{
+  dout(20) << __func__ << dendl;
+  // The scrub summary is the only task reported so far; nothing is added when idle.
+  const std::string_view summary = scrubstack->scrub_summary();
+  if (ScrubStack::is_idle(summary))
+    return;
+  send_status = true;  // send_task_status() only pushes an update when this is set
+  status->emplace(SCRUB_STATUS_KEY, summary);
+}
+
+void MDSRank::schedule_update_timer_task()
+{
+  dout(20) << __func__ << dendl;
+  // Arm a timer event that calls send_task_status() after the configured
+  // "mds_task_status_update_interval".
+  const auto interval = g_conf().get_val<double>("mds_task_status_update_interval");
+  timer.add_event_after(interval, new LambdaContext([this](int) { send_task_status(); }));
+}
+
+void MDSRank::send_task_status()
+{
+  // Collect the current task status; get_task_status() sets send_status
+  // whenever it has something to report.
+  std::map<std::string, std::string> task_status;
+  get_task_status(&task_status);
+  if (send_status) {
+    // An empty map is still sent this once; after that nothing is sent until send_status is set again.
+    if (task_status.empty()) {
+      send_status = false;
+    }
+    dout(20) << __func__ << ": updating " << task_status.size() << " status keys" << dendl;
+    const int r = mgrc->service_daemon_update_task_status(std::move(task_status));
+    if (r < 0) {
+      derr << ": failed to update service daemon status: " << cpp_strerror(r) << dendl;
+    }
+  }
+  schedule_update_timer_task();  // always re-arm, whether or not an update was sent
+}
diff --git a/src/mds/MDSRank.h b/src/mds/MDSRank.h
new file mode 100644
index 000000000..31d9992be
--- /dev/null
+++ b/src/mds/MDSRank.h
@@ -0,0 +1,715 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2015 Red Hat
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software 
+ * Foundation.  See file COPYING.
+ * 
+ */
+
+#ifndef MDS_RANK_H_
+#define MDS_RANK_H_
+
+#include <string_view>
+
+#include <boost/asio/io_context.hpp>
+
+#include "common/DecayCounter.h"
+#include "common/LogClient.h"
+#include "common/Timer.h"
+#include "common/fair_mutex.h"
+#include "common/TrackedOp.h"
+#include "common/ceph_mutex.h"
+
+#include "include/common_fwd.h"
+
+#include "messages/MClientRequest.h"
+#include "messages/MCommand.h"
+#include "messages/MMDSMap.h"
+
+#include "Beacon.h"
+#include "DamageTable.h"
+#include "MDSMap.h"
+#include "SessionMap.h"
+#include "MDCache.h"
+#include "MDLog.h"
+#include "MDSContext.h"
+#include "PurgeQueue.h"
+#include "Server.h"
+#include "MetricsHandler.h"
+#include "osdc/Journaler.h"
+
+// Full .h import instead of forward declaration for PerfCounter, for the
+// benefit of those including this header and using MDSRank::logger
+#include "common/perf_counters.h"
+
+enum {  // perf counter ids registered by MDSRank::create_logger()
+  l_mds_first = 2000,  // start of the id range; l_mds_last closes it
+  l_mds_request,
+  l_mds_reply,  // "Replies"
+  l_mds_reply_latency,
+  l_mds_slow_reply,  // "Slow replies"
+  l_mds_forward,  // "Forwarding request"
+  l_mds_dir_fetch,  // "Directory fetch"
+  l_mds_dir_commit,  // "Directory commit"
+  l_mds_dir_split,  // "Directory split"
+  l_mds_dir_merge,  // "Directory merge"
+  l_mds_inodes,  // "Inodes"
+  l_mds_inodes_top,  // "Inodes on top"
+  l_mds_inodes_bottom,  // "Inodes on bottom"
+  l_mds_inodes_pin_tail,  // "Inodes on pin tail"
+  l_mds_inodes_pinned,  // "Inodes pinned"
+  l_mds_inodes_expired,  // "Inodes expired"
+  l_mds_inodes_with_caps,  // "Inodes with capabilities"
+  l_mds_caps,  // "Capabilities"
+  l_mds_subtrees,  // "Subtrees"
+  l_mds_traverse,  // "Traverses"
+  l_mds_traverse_hit,  // "Traverse hits"
+  l_mds_traverse_forward,  // "Traverse forwards"
+  l_mds_traverse_discover,  // "Traverse directory discovers"
+  l_mds_traverse_dir_fetch,  // "Traverse incomplete directory content fetchings"
+  l_mds_traverse_remote_ino,  // "Traverse remote dentries"
+  l_mds_traverse_lock,  // "Traverse locks"
+  l_mds_load_cent,  // "Load per cent"
+  l_mds_dispatch_queue_len,  // "Dispatch queue length" (counter name "q")
+  l_mds_exported,  // "Exports"
+  l_mds_exported_inodes,  // "Exported inodes"
+  l_mds_imported,  // "Imports"
+  l_mds_imported_inodes,  // "Imported inodes"
+  l_mds_openino_dir_fetch,  // "OpenIno incomplete directory fetchings"
+  l_mds_openino_backtrace_fetch,  // "OpenIno backtrace fetchings"
+  l_mds_openino_peer_discover,  // "OpenIno peer inode discovers"
+  l_mds_root_rfiles,  // "root inode rfiles"
+  l_mds_root_rbytes,  // "root inode rbytes"
+  l_mds_root_rsnaps,  // "root inode rsnaps"
+  l_mds_scrub_backtrace_fetch,  // "Scrub backtrace fetchings"
+  l_mds_scrub_set_tag,  // "Scrub set tags"
+  l_mds_scrub_backtrace_repaired,  // "Scrub backtraces repaired"
+  l_mds_scrub_inotable_repaired,  // "Scrub inotable repaired"
+  l_mds_scrub_dir_inodes,  // "Scrub directory inodes"
+  l_mds_scrub_dir_base_inodes,  // "Scrub directory base inodes"
+  l_mds_scrub_dirfrag_rstats,  // "Scrub dirfrags rstates"
+  l_mds_scrub_file_inodes,  // "Scrub file inodes"
+  l_mdss_handle_inode_file_caps,  // "Inter mds caps msg"
+  l_mdss_ceph_cap_op_revoke,  // "Revoke caps"
+  l_mdss_ceph_cap_op_grant,  // "Grant caps"
+  l_mdss_ceph_cap_op_trunc,  // "caps truncate notify"
+  l_mdss_ceph_cap_op_flushsnap_ack,  // NOTE(review): registered as "caps truncate notify" -- looks copy-pasted
+  l_mdss_ceph_cap_op_flush_ack,  // NOTE(review): registered as "caps truncate notify" -- looks copy-pasted
+  l_mdss_handle_client_caps,  // "Client caps msg"
+  l_mdss_handle_client_caps_dirty,  // "Client dirty caps msg"
+  l_mdss_handle_client_cap_release,  // "Client cap release msg"
+  l_mdss_process_request_cap_release,  // "Process request cap release"
+  l_mds_last,  // end of the id range
+};
+
+// memory utilization ("mds_mem" perf counters registered by MDSRank::create_logger())
+enum {
+  l_mdm_first = 2500,  // lower bound handed to the "mds_mem" PerfCountersBuilder
+  l_mdm_ino,  // "Inodes"
+  l_mdm_inoa,  // "Inodes opened" (counter name "ino+")
+  l_mdm_inos,  // "Inodes closed" (counter name "ino-")
+  l_mdm_dir,  // "Directories"
+  l_mdm_dira,  // "Directories opened" (counter name "dir+")
+  l_mdm_dirs,  // "Directories closed" (counter name "dir-")
+  l_mdm_dn,  // "Dentries"
+  l_mdm_dna,  // "Dentries opened" (counter name "dn+")
+  l_mdm_dns,  // "Dentries closed" (counter name "dn-")
+  l_mdm_cap,  // "Capabilities"
+  l_mdm_capa,  // "Capabilities added" (counter name "cap+")
+  l_mdm_caps,  // "Capabilities removed" (counter name "cap-")
+  l_mdm_rss,  // "RSS"
+  l_mdm_heap,  // "Heap size"
+  l_mdm_last,  // upper bound handed to the "mds_mem" PerfCountersBuilder
+};
+
+namespace ceph {
+  struct heartbeat_handle_d;
+}
+
+class Locker;
+class MDCache;
+class MDLog;
+class MDBalancer;
+class InoTable;
+class SnapServer;
+class SnapClient;
+class MDSTableServer;
+class MDSTableClient;
+class Messenger;
+class MetricAggregator;
+class Objecter;
+class MonClient;
+class MgrClient;
+class Finisher;
+class ScrubStack;
+class C_ExecAndReply;
+
+/**
+ * The public part of this class's interface is what's exposed to all
+ * the various subsystems (server, mdcache, etc), such as pointers
+ * to the other subsystems, and message-sending calls.
+ */
+class MDSRank {
+  public:
+    friend class C_Flush_Journal;
+    friend class C_Drop_Cache;
+    friend class C_CacheDropExecAndReply;
+    friend class C_ScrubExecAndReply;
+    friend class C_ScrubControlExecAndReply;
+
+    CephContext *cct;
+
+    MDSRank(
+        mds_rank_t whoami_,
+	std::string fs_name_,
+        ceph::fair_mutex &mds_lock_,
+        LogChannelRef &clog_,
+        CommonSafeTimer<ceph::fair_mutex> &timer_,
+        Beacon &beacon_,
+        std::unique_ptr<MDSMap> & mdsmap_,
+        Messenger *msgr,
+        MonClient *monc_,
+        MgrClient *mgrc,
+        Context *respawn_hook_,
+        Context *suicide_hook_,
+	boost::asio::io_context& ioc);
+
+    mds_rank_t get_nodeid() const { return whoami; }
+    std::string_view get_fs_name() const { return fs_name; }
+    int64_t get_metadata_pool() const
+    {
+        return metadata_pool;
+    }
+
+    mono_time get_starttime() const {
+      return starttime;
+    }
+    chrono::duration<double> get_uptime() const {
+      // Time elapsed on mono_clock since `starttime`, as fractional seconds.
+      return chrono::duration<double>(mono_clock::now() - starttime);
+    }
+
+    bool is_daemon_stopping() const;
+
+    MDSTableClient *get_table_client(int t);
+    MDSTableServer *get_table_server(int t);
+
+    Session *get_session(client_t client) {
+      return sessionmap.get_session(entity_name_t::CLIENT(client.v));
+    }
+    Session *get_session(const cref_t<Message> &m);
+
+    MDSMap::DaemonState get_state() const { return state; } 
+    MDSMap::DaemonState get_want_state() const { return beacon.get_want_state(); } 
+
+    bool is_creating() const { return state == MDSMap::STATE_CREATING; }
+    bool is_starting() const { return state == MDSMap::STATE_STARTING; }
+    bool is_standby() const { return state == MDSMap::STATE_STANDBY; }
+    bool is_replay() const { return state == MDSMap::STATE_REPLAY; }
+    bool is_standby_replay() const { return state == MDSMap::STATE_STANDBY_REPLAY; }
+    bool is_resolve() const { return state == MDSMap::STATE_RESOLVE; }
+    bool is_reconnect() const { return state == MDSMap::STATE_RECONNECT; }
+    bool is_rejoin() const { return state == MDSMap::STATE_REJOIN; }
+    bool is_clientreplay() const { return state == MDSMap::STATE_CLIENTREPLAY; }
+    bool is_active() const { return state == MDSMap::STATE_ACTIVE; }
+    bool is_stopping() const { return state == MDSMap::STATE_STOPPING; }
+    bool is_any_replay() const { return (is_replay() || is_standby_replay()); }
+    bool is_stopped() const { return mdsmap->is_stopped(whoami); }
+    bool is_cluster_degraded() const { return cluster_degraded; }
+    bool allows_multimds_snaps() const { return mdsmap->allows_multimds_snaps(); }
+
+    bool is_cache_trimmable() const {
+      return is_standby_replay() || is_clientreplay() || is_active() || is_stopping();
+    }
+
+    void handle_write_error(int err);
+    void handle_write_error_with_lock(int err);
+
+    void update_mlogger();
+
+    void queue_waiter(MDSContext *c) {
+      finished_queue.push_back(c);
+      progress_thread.signal();
+    }
+    void queue_waiter_front(MDSContext *c) {
+      finished_queue.push_front(c);
+      progress_thread.signal();
+    }
+    void queue_waiters(MDSContext::vec& ls) {  // appends `ls` in order; leaves it empty
+      MDSContext::vec drained;
+      drained.swap(ls);
+      for (const auto& ctx : drained) finished_queue.push_back(ctx);
+      progress_thread.signal();
+    }
+    void queue_waiters_front(MDSContext::vec& ls) {  // prepends `ls` in its original order; leaves it empty
+      MDSContext::vec drained;
+      drained.swap(ls);
+      for (auto it = drained.rbegin(); it != drained.rend(); ++it) finished_queue.push_front(*it);
+      progress_thread.signal();
+    }
+
+    // Daemon lifetime functions: these guys break the abstraction
+    // and call up into the parent MDSDaemon instance.  It's kind
+    // of unavoidable: if we want any depth into our calls 
+    // to be able to e.g. tear down the whole process, we have to
+    // have a reference going all the way down.
+    // >>>
+    void suicide();
+    void respawn();
+    // <<<
+
+    /**
+     * Call this periodically if inside a potentially long running piece
+     * of code while holding the mds_lock
+     */
+    void heartbeat_reset();
+    int heartbeat_reset_grace(int count=1) {
+      return count * _heartbeat_reset_grace;
+    }
+
+    /**
+     * Report state DAMAGED to the mon, and then pass on to respawn().  Call
+     * this when an unrecoverable error is encountered while attempting
+     * to load an MDS rank's data structures.  This is *not* for use with
+     * errors affecting normal dirfrag/inode objects -- they should be handled
+     * through cleaner scrub/repair mechanisms.
+     *
+     * Callers must already hold mds_lock.
+     */
+    void damaged();
+
+    /**
+     * Wrapper around `damaged` for users who are not
+     * already holding mds_lock.
+     *
+     * Callers must not already hold mds_lock.
+     */
+    void damaged_unlocked();
+
+    double last_cleared_laggy() const {
+      return beacon.last_cleared_laggy();
+    }
+
+    double get_dispatch_queue_max_age(utime_t now) const;
+
+    void send_message_mds(const ref_t<Message>& m, mds_rank_t mds);
+    void send_message_mds(const ref_t<Message>& m, const entity_addrvec_t &addr);
+    void forward_message_mds(const cref_t<MClientRequest>& req, mds_rank_t mds);
+    void send_message_client_counted(const ref_t<Message>& m, client_t client);
+    void send_message_client_counted(const ref_t<Message>& m, Session* session);
+    void send_message_client_counted(const ref_t<Message>& m, const ConnectionRef& connection);
+    void send_message_client(const ref_t<Message>& m, Session* session);
+    void send_message(const ref_t<Message>& m, const ConnectionRef& c);
+
+    void wait_for_bootstrapped_peer(mds_rank_t who, MDSContext *c) {
+      waiting_for_bootstrapping_peer[who].push_back(c);
+    }
+    void wait_for_active_peer(mds_rank_t who, MDSContext *c) { 
+      waiting_for_active_peer[who].push_back(c);
+    }
+    void wait_for_cluster_recovered(MDSContext *c) {
+      ceph_assert(cluster_degraded);
+      waiting_for_active_peer[MDS_RANK_NONE].push_back(c);
+    }
+
+    void wait_for_any_client_connection(MDSContext *c) {
+      waiting_for_any_client_connection.push_back(c);
+    }
+    void kick_waiters_for_any_client_connection(void) {
+      finish_contexts(g_ceph_context, waiting_for_any_client_connection);
+    }
+    void wait_for_active(MDSContext *c) {
+      waiting_for_active.push_back(c);
+    }
+    void wait_for_replay(MDSContext *c) { 
+      waiting_for_replay.push_back(c); 
+    }
+    void wait_for_rejoin(MDSContext *c) {
+      waiting_for_rejoin.push_back(c);
+    }
+    void wait_for_reconnect(MDSContext *c) {
+      waiting_for_reconnect.push_back(c);
+    }
+    void wait_for_resolve(MDSContext *c) {
+      waiting_for_resolve.push_back(c);
+    }
+    void wait_for_mdsmap(epoch_t e, MDSContext *c) {
+      waiting_for_mdsmap[e].push_back(c);
+    }
+    void enqueue_replay(MDSContext *c) {
+      replay_queue.push_back(c);
+    }
+
+    bool queue_one_replay();
+    void maybe_clientreplay_done();
+
+    void set_osd_epoch_barrier(epoch_t e);
+    epoch_t get_osd_epoch_barrier() const {return osd_epoch_barrier;}
+    epoch_t get_osd_epoch() const;
+
+    ceph_tid_t issue_tid() { return ++last_tid; }
+
+    MDSMap *get_mds_map() { return mdsmap.get(); }
+
+    uint64_t get_num_requests() const { return logger->get(l_mds_request); }
+  
+    int get_mds_slow_req_count() const { return mds_slow_req_count; }
+
+    void dump_status(Formatter *f) const;
+
+    void hit_export_target(mds_rank_t rank, double amount=-1.0);
+    bool is_export_target(mds_rank_t rank) {  // is `rank` in my MDSMap export_targets?
+      const auto& my_info = mdsmap->get_mds_info(get_nodeid());
+      return my_info.export_targets.count(rank) > 0;
+    }
+
+    bool evict_client(int64_t session_id, bool wait, bool blocklist,
+                      std::ostream& ss, Context *on_killed=nullptr);
+    int config_client(int64_t session_id, bool remove,
+		      const std::string& option, const std::string& value,
+		      std::ostream& ss);
+
+    // Reference to global MDS::mds_lock, so that users of MDSRank don't
+    // carry around references to the outer MDS, and we can substitute
+    // a separate lock here in future potentially.
+    ceph::fair_mutex &mds_lock;
+
+    // Reference to global cluster log client, just to avoid initialising
+    // a separate one here.
+    LogChannelRef &clog;
+
+    // Reference to global timer utility, because MDSRank and MDSDaemon
+    // currently both use the same mds_lock, so it makes sense for them
+    // to share a timer.
+    CommonSafeTimer<ceph::fair_mutex> &timer;
+
+    std::unique_ptr<MDSMap> &mdsmap; /* MDSDaemon::mdsmap */
+
+    Objecter *objecter;
+
+    // sub systems
+    Server *server = nullptr;
+    MDCache *mdcache = nullptr;
+    Locker *locker = nullptr;
+    MDLog *mdlog = nullptr;
+    MDBalancer *balancer = nullptr;
+    ScrubStack *scrubstack = nullptr;
+    DamageTable damage_table;
+
+    InoTable *inotable = nullptr;
+
+    SnapServer *snapserver = nullptr;
+    SnapClient *snapclient = nullptr;
+
+    SessionMap sessionmap;
+
+    PerfCounters *logger = nullptr, *mlogger = nullptr;
+    OpTracker op_tracker;
+
+    // The last different state I held before current
+    MDSMap::DaemonState last_state = MDSMap::STATE_BOOT;
+    // The state assigned to me by the MDSMap
+    MDSMap::DaemonState state = MDSMap::STATE_STANDBY;
+
+    bool cluster_degraded = false;
+
+    Finisher *finisher;
+  protected:
+    typedef enum {
+      // The MDSMap is available, configure default layouts and structures
+      MDS_BOOT_INITIAL = 0,
+      // We are ready to open some inodes
+      MDS_BOOT_OPEN_ROOT,
+      // We are ready to do a replay if needed
+      MDS_BOOT_PREPARE_LOG,
+      // Replay is complete
+      MDS_BOOT_REPLAY_DONE
+    } BootStep;
+
+    class ProgressThread : public Thread {
+      public:
+      explicit ProgressThread(MDSRank *mds_) : mds(mds_) {}
+      void * entry() override;
+      void shutdown();
+      void signal() {cond.notify_all();}
+      private:
+      MDSRank *mds;
+      std::condition_variable_any cond;
+    } progress_thread;
+
+    class C_MDS_StandbyReplayRestart;
+    class C_MDS_StandbyReplayRestartFinish;
+    // Friended to access retry_dispatch
+    friend class C_MDS_RetryMessage;
+    friend class C_MDS_BootStart;
+    friend class C_MDS_InternalBootStart;
+    friend class C_MDS_MonCommand;
+
+    const mds_rank_t whoami;
+    std::string fs_name;
+
+    ~MDSRank();
+
+    void inc_dispatch_depth() { dispatch_depth += 1; }  // raise the dispatch_depth counter by one
+    void dec_dispatch_depth() { dispatch_depth -= 1; }  // lower the dispatch_depth counter by one
+    void retry_dispatch(const cref_t<Message> &m);
+    bool is_valid_message(const cref_t<Message> &m);
+    void handle_message(const cref_t<Message> &m);
+    void _advance_queues();
+    bool _dispatch(const cref_t<Message> &m, bool new_msg);
+    bool is_stale_message(const cref_t<Message> &m) const;
+
+    /**
+     * Emit clog warnings for any ops reported as warnings by optracker
+     */
+    void check_ops_in_flight();
+
+    /**
+     * Create this rank's perf counters (presumably logger/mlogger; see the definition)
+     */
+    void create_logger();
+
+    void dump_clientreplay_status(Formatter *f) const;
+    void command_scrub_start(Formatter *f,
+                             std::string_view path, std::string_view tag,
+                             const vector<string>& scrubop_vec, Context *on_finish);
+    void command_tag_path(Formatter *f, std::string_view path,
+                          std::string_view tag);
+    // scrub control commands
+    void command_scrub_abort(Formatter *f, Context *on_finish);
+    void command_scrub_pause(Formatter *f, Context *on_finish);
+    void command_scrub_resume(Formatter *f);
+    void command_scrub_status(Formatter *f);
+
+    void command_flush_path(Formatter *f, std::string_view path);
+    void command_flush_journal(Formatter *f);
+    void command_get_subtrees(Formatter *f);
+    void command_export_dir(Formatter *f,
+        std::string_view path, mds_rank_t dest);
+    bool command_dirfrag_split(
+        cmdmap_t cmdmap,
+        std::ostream &ss);
+    bool command_dirfrag_merge(
+        cmdmap_t cmdmap,
+        std::ostream &ss);
+    bool command_dirfrag_ls(
+        cmdmap_t cmdmap,
+        std::ostream &ss,
+        Formatter *f);
+    int _command_export_dir(std::string_view path, mds_rank_t dest);
+    CDir *_command_dirfrag_get(
+        const cmdmap_t &cmdmap,
+        std::ostream &ss);
+    void command_openfiles_ls(Formatter *f);
+    void command_dump_tree(const cmdmap_t &cmdmap, std::ostream &ss, Formatter *f);
+    void command_dump_inode(Formatter *f, const cmdmap_t &cmdmap, std::ostream &ss);
+    void command_cache_drop(uint64_t timeout, Formatter *f, Context *on_finish);
+
+    // FIXME the state machine logic should be separable from the dispatch
+    // logic that calls it.
+    // >>>
+    void calc_recovery_set();
+    void request_state(MDSMap::DaemonState s);
+
+    void boot_create();             // i am new mds.
+    void boot_start(BootStep step=MDS_BOOT_INITIAL, int r=0);    // starting|replay
+
+    void replay_start();
+    void creating_done();
+    void starting_done();
+    void replay_done();
+    void standby_replay_restart();
+    void _standby_replay_restart_finish(int r, uint64_t old_read_pos);
+
+    void reopen_log();
+
+    void resolve_start();
+    void resolve_done();
+    void reconnect_start();
+    void reconnect_done();
+    void rejoin_joint_start();
+    void rejoin_start();
+    void rejoin_done();
+    void recovery_done(int oldstate);
+    void clientreplay_start();
+    void clientreplay_done();
+    void active_start();
+    void stopping_start();
+    void stopping_done();
+
+    void validate_sessions();
+
+    void handle_mds_recovery(mds_rank_t who);
+    void handle_mds_failure(mds_rank_t who);
+
+    /* Update MDSMap export_targets for this rank. Called on ::tick(). */
+    void update_targets();
+
+    void _mon_command_finish(int r, std::string_view cmd, std::string_view outs);
+    void set_mdsmap_multimds_snaps_allowed();
+
+    Context *create_async_exec_context(C_ExecAndReply *ctx);
+
+    // blocklist the provided addrs and set OSD epoch barrier
+    // with the provided epoch.
+    void apply_blocklist(const std::set<entity_addr_t> &addrs, epoch_t epoch);
+
+    // Incarnation as seen in MDSMap at the point where a rank is
+    // assigned.
+    int incarnation = 0;
+
+    // Flag to indicate we entered shutdown: anyone seeing this to be true
+    // after taking mds_lock must drop out.
+    bool stopping = false;
+
+    // PurgeQueue is only used by StrayManager, but it is owned by MDSRank
+    // because its init/shutdown happens at the top level.
+    PurgeQueue purge_queue;
+
+    MetricsHandler metrics_handler;
+    std::unique_ptr<MetricAggregator> metric_aggregator;
+
+    list<cref_t<Message>> waiting_for_nolaggy;
+    MDSContext::que finished_queue;
+    // Dispatch, retry, queues
+    int dispatch_depth = 0;
+
+    ceph::heartbeat_handle_d *hb = nullptr;  // Heartbeat for threads using mds_lock
+    double heartbeat_grace;
+    int _heartbeat_reset_grace;
+
+    map<mds_rank_t, version_t> peer_mdsmap_epoch;
+
+    ceph_tid_t last_tid = 0;    // for mds-initiated requests (e.g. stray rename)
+
+    MDSContext::vec waiting_for_active, waiting_for_replay, waiting_for_rejoin,
+				waiting_for_reconnect, waiting_for_resolve;
+    MDSContext::vec waiting_for_any_client_connection;
+    MDSContext::que replay_queue;
+    bool replaying_requests_done = false;
+
+    map<mds_rank_t, MDSContext::vec> waiting_for_active_peer;
+    map<mds_rank_t, MDSContext::vec> waiting_for_bootstrapping_peer;
+    map<epoch_t, MDSContext::vec> waiting_for_mdsmap;
+
+    epoch_t osd_epoch_barrier = 0;
+
+    // Reference to the beacon (not const) so that we can behave
+    // differently when it's laggy.
+    Beacon &beacon;
+
+    int mds_slow_req_count = 0;
+
+    map<mds_rank_t,DecayCounter> export_targets; /* targets this MDS is exporting to or wants/tries to */
+
+    Messenger *messenger;
+    MonClient *monc;
+    MgrClient *mgrc;
+
+    Context *respawn_hook;
+    Context *suicide_hook;
+
+    bool standby_replaying = false;  // true if current replay pass is in standby-replay mode
+private:
+    bool send_status = true;
+
+    // The metadata pool won't change in the whole life time of the fs,
+    // with this we can get rid of the mds_lock in many places too.
+    int64_t metadata_pool = -1;
+
+    // "task" string that gets displayed in ceph status
+    inline static const std::string SCRUB_STATUS_KEY = "scrub status";
+
+    void get_task_status(std::map<std::string, std::string> *status);
+    void schedule_update_timer_task();
+    void send_task_status();
+
+    bool is_rank0() const {  // true only when this daemon holds rank 0
+      return static_cast<mds_rank_t>(0) == whoami;
+    }
+
+    mono_time starttime = mono_clock::zero();
+    boost::asio::io_context& ioc;
+};
+
+/* This expects to be given a reference which it is responsible for.
+ * The finish function calls functions which
+ * will put the Message exactly once.*/
+// Context that re-submits a stored message through MDSRank::retry_dispatch().
+class C_MDS_RetryMessage : public MDSInternalContext {
+protected:
+  cref_t<Message> m;  // the message to dispatch again
+public:
+  C_MDS_RetryMessage(MDSRank *mds, const cref_t<Message> &m)
+    : MDSInternalContext(mds), m(m) {}
+  // The result code is not consulted; the message is always retried.
+  void finish(int) override { get_mds()->retry_dispatch(m); }
+};
+
+// Factory producing a fresh C_MDS_RetryMessage for the same rank and message
+// each time build() is called.
+class CF_MDS_RetryMessageFactory : public MDSContextFactory {
+  MDSRank *mds;
+  cref_t<Message> m;
+public:
+  CF_MDS_RetryMessageFactory(MDSRank *mds, const cref_t<Message> &m)
+    : mds(mds), m(m) {}
+
+  // Returns a context allocated with new; this class does not keep or free it.
+  MDSContext *build() { return new C_MDS_RetryMessage(mds, m); }
+};
+
+/**
+ * The aspect of MDSRank exposed to MDSDaemon but not subsystems: i.e.
+ * the service/dispatcher stuff like init/shutdown that subsystems should
+ * never touch.
+ */
+class MDSRankDispatcher : public MDSRank, public md_config_obs_t
+{
+public:
+  MDSRankDispatcher(
+      mds_rank_t whoami_,
+      std::string fs_name,
+      ceph::fair_mutex &mds_lock_,  // taken by reference, not owned here
+      LogChannelRef &clog_,
+      CommonSafeTimer<ceph::fair_mutex> &timer_,
+      Beacon &beacon_,
+      std::unique_ptr<MDSMap> &mdsmap_,  // reference to the caller's unique_ptr
+      Messenger *msgr,
+      MonClient *monc_,
+      MgrClient *mgrc,
+      Context *respawn_hook_,
+      Context *suicide_hook_,
+      boost::asio::io_context& ioc);
+  // Lifecycle entry points; bodies are defined out of line.
+  void init();
+  void tick();
+  void shutdown();
+  void handle_asok_command(
+    std::string_view command,
+    const cmdmap_t& cmdmap,
+    Formatter *f,
+    const bufferlist &inbl,
+    std::function<void(int,const std::string&,bufferlist&)> on_finish);
+  void handle_mds_map(const cref_t<MMDSMap> &m, const MDSMap &oldmap);
+  void handle_osd_map();
+  void update_log_config();
+  // Overrides (presumably of md_config_obs_t -- confirm against its declaration).
+  const char** get_tracked_conf_keys() const override final;
+  void handle_conf_change(const ConfigProxy& conf, const std::set<std::string>& changed) override;
+  // Session inspection/eviction, each restricted by a SessionFilter.
+  void dump_sessions(const SessionFilter &filter, Formatter *f, bool cap_dump=false) const;
+  void evict_clients(const SessionFilter &filter,
+		     std::function<void(int,const std::string&,bufferlist&)> on_finish);
+
+  // Call into me from MDS::ms_dispatch
+  bool ms_dispatch(const cref_t<Message> &m);
+};
+
+#endif // MDS_RANK_H_
diff --git a/src/mds/MDSTable.cc b/src/mds/MDSTable.cc
new file mode 100644
index 000000000..679633f0c
--- /dev/null
+++ b/src/mds/MDSTable.cc
@@ -0,0 +1,201 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software 
+ * Foundation.  See file COPYING.
+ * 
+ */
+
+#include "MDSTable.h"
+
+#include "MDSRank.h"
+#include "MDLog.h"
+
+#include "osdc/Filer.h"
+
+#include "include/types.h"
+
+#include "common/config.h"
+#include "common/errno.h"
+#include "common/Finisher.h"
+
+#include "include/ceph_assert.h"
+
+
+#define dout_context g_ceph_context
+#define dout_subsys ceph_subsys_mds
+#undef dout_prefix
+#define dout_prefix *_dout << "mds." << rank << "." << table_name << ": "
+
+
+// Base for table I/O completions: keeps the owning table and exposes its MDSRank.
+class MDSTableIOContext : public MDSIOContextBase
+{
+public:
+  // The table pointer must be non-null; this is asserted at construction.
+  explicit MDSTableIOContext(MDSTable *ida_) : ida(ida_) { ceph_assert(ida != nullptr); }
+protected:
+  MDSTable *ida;
+  MDSRank *get_mds() override { return ida->mds; }
+};
+
+
+// Write completion for MDSTable::save(): hands the result to save_2().
+class C_IO_MT_Save : public MDSTableIOContext {
+  version_t version;  // passed to save_2() together with the result code
+public:
+  C_IO_MT_Save(MDSTable *i, version_t v) : MDSTableIOContext(i), version(v) {}
+  void finish(int r) override { ida->save_2(r, version); }
+  // Writes "table_save(<table name>)" to out.
+  void print(ostream& out) const override {
+    out << "table_save(" << ida->table_name << ")";
+  }
+};
+
+// Write the table object out asynchronously.  A non-zero v names the version
+// the caller needs; onfinish (may be null) is completed from save_2().
+void MDSTable::save(MDSContext *onfinish, version_t v)
+{
+  const bool covered_by_inflight = (v > 0) && (v <= committing_version);
+  if (covered_by_inflight) {
+    dout(10) << "save v " << version << " - already saving "
+             << committing_version << " >= needed " << v << dendl;
+    if (onfinish)
+      waitfor_save[v].push_back(onfinish);
+    return;
+  }
+
+  dout(10) << "save v " << version << dendl;
+  ceph_assert(is_active());
+
+  // Payload: the current version followed by the subclass-encoded state.
+  bufferlist payload;
+  encode(version, payload);
+  encode_state(payload);
+
+  committing_version = version;
+  if (onfinish)
+    waitfor_save[version].push_back(onfinish);
+
+  // write (async); the completion is wrapped in C_OnFinisher with mds->finisher
+  SnapContext snapc;
+  object_t oid = get_object_name();
+  object_locator_t oloc(mds->get_metadata_pool());
+  auto *on_written = new C_OnFinisher(new C_IO_MT_Save(this, version), mds->finisher);
+  mds->objecter->write_full(oid, oloc, snapc, payload, ceph::real_clock::now(), 0, on_written);
+}
+
+// Completion of save(): r is the write result, v the version that was written.
+void MDSTable::save_2(int r, version_t v)
+{
+  if (r < 0) {
+    dout(1) << "save error " << r << " v " << v << dendl;
+    mds->clog->error() << "failed to store table " << table_name << " object,"
+                       << " errno " << r;
+    mds->handle_write_error(r);
+    return;
+  }
+
+  dout(10) << "save_2 v " << v << dendl;
+  committed_version = v;
+
+  // Gather the waiters registered for every version <= v (the map is ordered),
+  // drop them from waitfor_save, then complete them with result 0.
+  MDSContext::vec ready;
+  const auto last = waitfor_save.upper_bound(v);
+  for (auto it = waitfor_save.begin(); it != last; ++it)
+    ready.insert(ready.end(), it->second.begin(), it->second.end());
+  waitfor_save.erase(waitfor_save.begin(), last);
+  finish_contexts(g_ceph_context, ready, 0);
+}
+
+
+void MDSTable::reset()
+{
+  reset_state();                // pure-virtual hook implemented by the subclass
+  projected_version = version;  // uses whatever `version` is after reset_state()
+  state = STATE_ACTIVE;         // is_active() is true from here on
+}
+
+
+
+// -----------------------
+
+// Read completion for MDSTable::load(): holds the read buffer and passes it to load_2().
+class C_IO_MT_Load : public MDSTableIOContext {
+public:
+  Context *onfinish;  // forwarded to load_2()
+  bufferlist bl;      // load() passes &bl as the destination of the read
+  C_IO_MT_Load(MDSTable *i, Context *o) : MDSTableIOContext(i), onfinish(o) {}
+  void finish(int r) override { ida->load_2(r, bl, onfinish); }
+  // Writes "table_load(<table name>)" to out.
+  void print(ostream& out) const override {
+    out << "table_load(" << ida->table_name << ")";
+  }
+};
+
+// Object name: "mds<rank>_<table_name>" when per_mds, else "mds_<table_name>".
+object_t MDSTable::get_object_name() const
+{
+  // Assembled in a std::string instead of a fixed char[50], so an unusually
+  // long table_name is no longer silently truncated by snprintf().
+  std::string n = per_mds ? "mds" + std::to_string(int(rank)) : std::string("mds");
+  n.append("_").append(table_name);
+  return object_t(n.c_str());
+}
+
+// Begin an asynchronous read of the table object; load_2() gets the result.
+void MDSTable::load(MDSContext *onfinish)
+{
+  dout(10) << "load" << dendl;
+  ceph_assert(is_undef());  // must still be in STATE_UNDEF
+  state = STATE_OPENING;
+  // The completion object carries the buffer that the read fills in.
+  auto *loader = new C_IO_MT_Load(this, onfinish);
+  object_t oid = get_object_name();
+  object_locator_t oloc(mds->get_metadata_pool());
+  auto *fin = new C_OnFinisher(loader, mds->finisher);
+  mds->objecter->read_full(oid, oloc, CEPH_NOSNAP, &loader->bl, 0, fin);
+}
+
+// Completion of load(): decode version + state from bl, then complete onfinish with 0.
+void MDSTable::load_2(int r, bufferlist& bl, Context *onfinish)
+{
+  ceph_assert(is_opening());
+  state = STATE_ACTIVE;
+  if (r == -CEPHFS_EBLOCKLISTED) {
+    mds->respawn();
+    return;
+  }
+  if (r < 0) {
+    derr << "load_2 could not read table: " << r << dendl;
+    mds->clog->error() << "error reading table object '" << get_object_name()
+                       << "' " << r << " (" << cpp_strerror(r) << ")";
+    mds->damaged();
+    ceph_assert(r >= 0);  // Should be unreachable because damaged() calls respawn()
+  }
+
+  dout(10) << "load_2 got " << bl.length() << " bytes" << dendl;
+  auto p = bl.cbegin();
+  try {
+    decode(version, p);
+    projected_version = committed_version = version;
+    dout(10) << "load_2 loaded v" << version << dendl;
+    decode_state(p);
+  } catch (const buffer::error &e) {
+    mds->clog->error() << "error decoding table object '" << get_object_name()
+                       << "': " << e.what();
+    mds->damaged();
+    // r >= 0 always holds here, so asserting on it could never fire; abort
+    // outright so a partially decoded table is never handed to onfinish.
+    ceph_abort_msg("damaged() returned after table decode failure");
+  }
+  if (onfinish)
+    onfinish->complete(0);
+}
diff --git a/src/mds/MDSTable.h b/src/mds/MDSTable.h
new file mode 100644
index 000000000..07e12f574
--- /dev/null
+++ b/src/mds/MDSTable.h
@@ -0,0 +1,91 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software 
+ * Foundation.  See file COPYING.
+ * 
+ */
+
+#ifndef CEPH_MDSTABLE_H
+#define CEPH_MDSTABLE_H
+
+#include "mdstypes.h"
+#include "mds_table_types.h"
+#include "include/buffer_fwd.h"
+
+#include "MDSContext.h"
+
+class MDSRank;
+
+// Base class for a versioned table persisted as a single object; subclasses
+// provide the payload through reset_state()/encode_state()/decode_state().
+class MDSTable {
+public:
+  friend class C_IO_MT_Load;
+  friend class C_IO_MT_Save;
+
+  MDSTable(MDSRank *m, std::string_view n, bool is_per_mds)
+    : mds(m), table_name(n), per_mds(is_per_mds) {}
+  virtual ~MDSTable() = default;
+
+  // The rank appears in log prefixes and in the object name of per-mds tables.
+  void set_rank(mds_rank_t r) { rank = r; }
+
+  // Accessors for the four version counters kept by the table.
+  version_t get_version() const { return version; }
+  version_t get_committed_version() const { return committed_version; }
+  version_t get_committing_version() const { return committing_version; }
+  version_t get_projected_version() const { return projected_version; }
+
+  // Overwrite both the current and the projected version with v.
+  void force_replay_version(version_t v) { version = projected_version = v; }
+
+  //version_t project_version() { return ++projected_version; }
+  //version_t inc_version() { return ++version; }
+
+  // load/save from disk (hack)
+  bool is_undef() const { return STATE_UNDEF == state; }
+  bool is_opening() const { return STATE_OPENING == state; }
+  bool is_active() const { return STATE_ACTIVE == state; }
+
+  // reset(): run reset_state() and become active without reading anything.
+  void reset();
+  // save(): write the table out; onfinish, if given, is completed by save_2().
+  void save(MDSContext *onfinish=nullptr, version_t need=0);
+  void save_2(int r, version_t v);
+
+  // Saves only when the table is active; no completion is requested.
+  void shutdown() { if (is_active()) save(nullptr); }
+
+  object_t get_object_name() const;
+  // load() starts the read; load_2() receives the result and decodes it.
+  void load(MDSContext *onfinish);
+  void load_2(int, bufferlist&, Context *onfinish);
+
+  // child must overload these
+  virtual void reset_state() = 0;
+  virtual void decode_state(bufferlist::const_iterator& p) = 0;
+  virtual void encode_state(bufferlist& bl) const = 0;
+
+  MDSRank *mds;
+protected:
+  static const int STATE_UNDEF   = 0;
+  static const int STATE_OPENING = 1;
+  static const int STATE_ACTIVE  = 2;
+  //static const int STATE_COMMITTING = 3;
+
+  std::string table_name;
+  bool per_mds;
+  mds_rank_t rank = MDS_RANK_NONE;
+  int state = STATE_UNDEF;
+
+  version_t version = 0, committing_version = 0, committed_version = 0, projected_version = 0;
+  map<version_t, MDSContext::vec > waitfor_save;  // waiters keyed by the version they need
+};
+#endif
diff --git a/src/mds/MDSTableClient.cc b/src/mds/MDSTableClient.cc
new file mode 100644
index 000000000..38f212218
--- /dev/null
+++ b/src/mds/MDSTableClient.cc
@@ -0,0 +1,262 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software 
+ * Foundation.  See file COPYING.
+ * 
+ */
+
+#include "MDSMap.h"
+
+#include "MDSContext.h"
+#include "msg/Messenger.h"
+
+#include "MDSRank.h"
+#include "MDLog.h"
+#include "LogSegment.h"
+
+#include "MDSTableClient.h"
+#include "events/ETableClient.h"
+
+#include "common/config.h"
+
+#define dout_context g_ceph_context
+#define dout_subsys ceph_subsys_mds
+#undef dout_prefix
+#define dout_prefix *_dout << "mds." << mds->get_nodeid() << ".tableclient(" << get_mdstable_name(table) << ") "
+
+
+// Log completion for a journaled ACK: reports `tid` back to the table client.
+class C_LoggedAck : public MDSLogContextBase {
+  MDSTableClient *tc;
+  version_t tid;
+  MDSRank *get_mds() override { return tc->mds; }
+public:
+  C_LoggedAck(MDSTableClient *a, version_t t) : tc(a), tid(t) {}
+  // The result code is ignored; only the tid is passed on.
+  void finish(int) override { tc->_logged_ack(tid); }
+};
+
+
+// Dispatch one table-server message; before RESOLVE it is retried later or dropped.
+void MDSTableClient::handle_request(const cref_t<MMDSTableRequest> &m)
+{
+  dout(10) << "handle_request " << *m << dendl;
+  ceph_assert(m->table == table);
+
+  if (mds->get_state() < MDSMap::STATE_RESOLVE) {
+    if (mds->get_want_state() == CEPH_MDS_STATE_RESOLVE)
+      mds->wait_for_resolve(new C_MDS_RetryMessage(mds, m));
+    return;
+  }
+
+  const version_t tid = m->get_tid();
+  const uint64_t reqid = m->reqid;
+
+  switch (m->op) {
+  case TABLESERVER_OP_QUERY_REPLY:
+    handle_query_result(m);
+    break;
+
+  case TABLESERVER_OP_NOTIFY_PREP:
+    ceph_assert(g_conf()->mds_kill_mdstable_at != 9);
+    handle_notify_prep(m);
+    break;
+
+  case TABLESERVER_OP_AGREE:
+    if (auto pp = pending_prepare.find(reqid); pp != pending_prepare.end()) {
+      dout(10) << "got agree on " << reqid << " atid " << tid << dendl;
+      ceph_assert(g_conf()->mds_kill_mdstable_at != 3);
+
+      // Fill the out-params, move pending -> prepared, then complete the context.
+      _pending_prepare &prep = pp->second;
+      MDSContext *onfinish = prep.onfinish;
+      *prep.ptid = tid;
+      if (prep.pbl)
+        *prep.pbl = m->bl;
+      pending_prepare.erase(pp);
+      prepared_update[tid] = reqid;
+      if (onfinish)
+        onfinish->complete(0);
+    }
+    else if (prepared_update.count(tid)) {
+      dout(10) << "got duplicated agree on " << reqid << " atid " << tid << dendl;
+      ceph_assert(prepared_update[tid] == reqid);
+      ceph_assert(!server_ready);
+    }
+    else if (pending_commit.count(tid)) {
+      dout(10) << "stray agree on " << reqid << " tid " << tid
+               << ", already committing, will resend COMMIT" << dendl;
+      ceph_assert(!server_ready);
+      // will re-send commit when receiving the server ready message
+    }
+    else {
+      dout(10) << "stray agree on " << reqid << " tid " << tid
+               << ", sending ROLLBACK" << dendl;
+      ceph_assert(!server_ready);
+      auto req = make_message<MMDSTableRequest>(table, TABLESERVER_OP_ROLLBACK, 0, tid);
+      mds->send_message_mds(req, mds->get_mds_map()->get_tableserver());
+    }
+    break;
+
+  case TABLESERVER_OP_ACK:
+    if (auto pc = pending_commit.find(tid);
+        pc != pending_commit.end() &&
+        pc->second->pending_commit_tids[table].count(tid)) {
+      dout(10) << "got ack on tid " << tid << ", logging" << dendl;
+      ceph_assert(g_conf()->mds_kill_mdstable_at != 7);
+
+      // remove from committing list
+      pc->second->pending_commit_tids[table].erase(tid);
+      pending_commit.erase(pc);
+
+      // log ACK.
+      mds->mdlog->start_submit_entry(new ETableClient(table, TABLESERVER_OP_ACK, tid),
+                                     new C_LoggedAck(this, tid));
+    } else {
+      dout(10) << "got stray ack on tid " << tid << ", ignoring" << dendl;
+    }
+    break;
+
+  case TABLESERVER_OP_SERVER_READY:
+    ceph_assert(!server_ready);
+    server_ready = true;
+
+    if (last_reqid == ~0ULL)
+      last_reqid = reqid;
+
+    resend_queries();
+    resend_prepares();
+    resend_commits();
+    break;
+
+  default:
+    ceph_abort_msg("unrecognized mds_table_client request op");
+  }
+}
+
+
+void MDSTableClient::_logged_ack(version_t tid)
+{
+  dout(10) << "_logged_ack " << tid << dendl;
+  auto waiting = ack_waiters.find(tid);
+  if (waiting == ack_waiters.end())
+    return;  // no waiters registered for this tid
+  dout(15) << "kicking ack waiters on tid " << tid << dendl;
+  mds->queue_waiters(waiting->second);  // kick any waiters (LogSegment trim)
+  ack_waiters.erase(waiting);
+}
+
+// Queue a PREPARE for `mutation`; ptid/pbl/onfinish are used when AGREE arrives.
+void MDSTableClient::_prepare(bufferlist& mutation, version_t *ptid, bufferlist *pbl,
+                              MDSContext *onfinish)
+{
+  if (last_reqid == ~0ULL) {
+    dout(10) << "tableserver is not ready yet, waiting for request id" << dendl;
+    waiting_for_reqid.emplace_back(onfinish, ptid, pbl, mutation);
+    return;
+  }
+
+  const uint64_t reqid = ++last_reqid;
+  dout(10) << "_prepare " << reqid << dendl;
+  _pending_prepare &slot = pending_prepare[reqid];
+  slot.mutation = mutation;
+  slot.ptid = ptid;
+  slot.pbl = pbl;
+  slot.onfinish = onfinish;
+  if (!server_ready) {
+    dout(10) << "tableserver is not ready yet, deferring request" << dendl;
+    return;
+  }
+  auto req = make_message<MMDSTableRequest>(table, TABLESERVER_OP_PREPARE, reqid);  // send message
+  req->bl = mutation;
+  mds->send_message_mds(req, mds->get_mds_map()->get_tableserver());
+}
+
+// Record tid as committing under log segment ls; send COMMIT if the server is ready.
+void MDSTableClient::commit(version_t tid, LogSegment *ls)
+{
+  dout(10) << "commit " << tid << dendl;
+
+  auto prepared = prepared_update.find(tid);
+  ceph_assert(prepared != prepared_update.end());
+  prepared_update.erase(prepared);
+
+  ceph_assert(pending_commit.count(tid) == 0);
+  pending_commit[tid] = ls;
+  ls->pending_commit_tids[table].insert(tid);
+  notify_commit(tid);
+
+  ceph_assert(g_conf()->mds_kill_mdstable_at != 4);
+  if (!server_ready) {
+    dout(10) << "tableserver is not ready yet, deferring request" << dendl;
+    return;
+  }
+  auto req = make_message<MMDSTableRequest>(table, TABLESERVER_OP_COMMIT, 0, tid);  // send message
+  mds->send_message_mds(req, mds->get_mds_map()->get_tableserver());
+}
+
+
+
+// recovery
+
+// Recovery path: register tid as a pending commit attached to log segment ls.
+void MDSTableClient::got_journaled_agree(version_t tid, LogSegment *ls)
+{
+  dout(10) << "got_journaled_agree " << tid << dendl;
+  pending_commit[tid] = ls;
+  ls->pending_commit_tids[table].insert(tid);
+  notify_commit(tid);
+}
+
+void MDSTableClient::got_journaled_ack(version_t tid)
+{
+  dout(10) << "got_journaled_ack " << tid << dendl;
+  if (auto committing = pending_commit.find(tid); committing != pending_commit.end()) {
+    committing->second->pending_commit_tids[table].erase(tid);  // drop from the log segment
+    pending_commit.erase(committing);                           // and from our own map
+  }
+}
+
+// Re-send COMMIT for every tid still recorded in pending_commit.
+void MDSTableClient::resend_commits()
+{
+  for (const auto& entry : pending_commit) {
+    const version_t tid = entry.first;
+    dout(10) << "resending commit on " << tid << dendl;
+    auto req = make_message<MMDSTableRequest>(table, TABLESERVER_OP_COMMIT, 0, tid);
+    mds->send_message_mds(req, mds->get_mds_map()->get_tableserver());
+  }
+}
+
+// Assign request ids to prepares that were queued without one, then send a
+// PREPARE for every entry of pending_prepare.
+void MDSTableClient::resend_prepares()
+{
+  // Each queued request takes the next consecutive request id.
+  for (; !waiting_for_reqid.empty(); waiting_for_reqid.pop_front())
+    pending_prepare[++last_reqid] = waiting_for_reqid.front();
+
+  for (const auto& entry : pending_prepare) {
+    const uint64_t reqid = entry.first;
+    dout(10) << "resending prepare on " << reqid << dendl;
+    auto req = make_message<MMDSTableRequest>(table, TABLESERVER_OP_PREPARE, reqid);
+    req->bl = entry.second.mutation;
+    mds->send_message_mds(req, mds->get_mds_map()->get_tableserver());
+  }
+}
+
+// Clear server_ready when the failed rank is the current table server.
+void MDSTableClient::handle_mds_failure(mds_rank_t who)
+{
+  if (who == mds->get_mds_map()->get_tableserver()) {
+    dout(7) << "tableserver mds." << who << " fails" << dendl;
+    server_ready = false;
+  }
+}
diff --git a/src/mds/MDSTableClient.h b/src/mds/MDSTableClient.h
new file mode 100644
index 000000000..2952ec406
--- /dev/null
+++ b/src/mds/MDSTableClient.h
@@ -0,0 +1,99 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software 
+ * Foundation.  See file COPYING.
+ * 
+ */
+
+#ifndef CEPH_MDSTABLECLIENT_H
+#define CEPH_MDSTABLECLIENT_H
+
+#include "include/types.h"
+#include "MDSContext.h"
+#include "mds_table_types.h"
+
+#include "messages/MMDSTableRequest.h"
+
+class MDSRank;
+class LogSegment;
+
+// Client side of the table request exchange (prepare / agree / commit / ack) for one table.
+class MDSTableClient {
+public:
+  MDSTableClient(MDSRank *m, int tab) : mds(m), table(tab) {}
+  virtual ~MDSTableClient() = default;
+
+  void handle_request(const cref_t<MMDSTableRequest> &m);
+
+  void _prepare(bufferlist& mutation, version_t *ptid, bufferlist *pbl, MDSContext *onfinish);
+  void commit(version_t tid, LogSegment *ls);
+
+  void resend_commits();
+  void resend_prepares();
+
+  // for recovery (by me)
+  void got_journaled_agree(version_t tid, LogSegment *ls);
+  void got_journaled_ack(version_t tid);
+
+  // True once tid is no longer tracked in pending_commit.
+  bool has_committed(version_t tid) const {
+    return pending_commit.find(tid) == pending_commit.end();
+  }
+  // Register c under tid; _logged_ack(tid) queues it.
+  void wait_for_ack(version_t tid, MDSContext *c) { ack_waiters[tid].push_back(c); }
+
+  // Snapshot of the tids currently present in pending_commit.
+  set<version_t> get_journaled_tids() const {
+    set<version_t> tids;
+    for (const auto& entry : pending_commit)
+      tids.insert(entry.first);
+    return tids;
+  }
+
+  void handle_mds_failure(mds_rank_t mds);
+
+  // child must implement
+  virtual void resend_queries() = 0;
+  virtual void handle_query_result(const cref_t<MMDSTableRequest> &m) = 0;
+  virtual void handle_notify_prep(const cref_t<MMDSTableRequest> &m) = 0;
+  virtual void notify_commit(version_t tid) = 0;
+protected:
+  // prepares
+  struct _pending_prepare {
+    _pending_prepare() {}
+    _pending_prepare(MDSContext *c, version_t *pt, bufferlist *pb, bufferlist& m) :
+      onfinish(c), ptid(pt), pbl(pb), mutation(m) {}
+
+    MDSContext *onfinish = nullptr;  // completed when the AGREE is handled
+    version_t *ptid = nullptr;       // out: receives the tid carried by the AGREE
+    bufferlist *pbl = nullptr;       // out (optional): receives the AGREE's bl
+    bufferlist mutation;             // payload sent with the PREPARE
+  };
+
+  friend class C_LoggedAck;
+
+  void handle_reply(class MMDSTableQuery *m);
+  void _logged_ack(version_t tid);
+
+  MDSRank *mds;
+  int table;
+
+  uint64_t last_reqid = ~0ULL;  // ~0ULL means no request id has been learned yet
+  bool server_ready = false;
+
+  map<uint64_t, _pending_prepare> pending_prepare;  // reqid -> request awaiting AGREE
+  map<version_t, uint64_t> prepared_update;         // tid -> reqid, agreed but not committed
+  list<_pending_prepare> waiting_for_reqid;         // queued until a request id is known
+
+  // pending commits
+  map<version_t, LogSegment*> pending_commit;     // tid -> log segment holding the commit
+  map<version_t, MDSContext::vec > ack_waiters;   // tid -> contexts queued by _logged_ack()
+};
+#endif
diff --git a/src/mds/MDSTableServer.cc b/src/mds/MDSTableServer.cc
new file mode 100644
index 000000000..3666e0db9
--- /dev/null
+++ b/src/mds/MDSTableServer.cc
@@ -0,0 +1,373 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software 
+ * Foundation.  See file COPYING.
+ * 
+ */
+
+#include "MDSTableServer.h"
+#include "MDSRank.h"
+#include "MDLog.h"
+#include "msg/Messenger.h"
+
+#include "events/ETableServer.h"
+
+#define dout_context g_ceph_context
+#define dout_subsys ceph_subsys_mds
+#undef dout_prefix
+#define dout_prefix *_dout << "mds." << rank << ".tableserver(" << get_mdstable_name(table) << ") "
+
+void MDSTableServer::handle_request(const cref_t<MMDSTableRequest> &req)
+{
+  ceph_assert(req->op >= 0);  // the dispatch below is purely on the request's op code
+  switch (req->op) {
+  case TABLESERVER_OP_QUERY:      handle_query(req);      break;
+  case TABLESERVER_OP_PREPARE:    handle_prepare(req);    break;
+  case TABLESERVER_OP_COMMIT:     handle_commit(req);     break;
+  case TABLESERVER_OP_ROLLBACK:   handle_rollback(req);   break;
+  case TABLESERVER_OP_NOTIFY_ACK: handle_notify_ack(req); break;
+  default: ceph_abort_msg("unrecognized mds_table_server request op");
+  }
+}
+
+class C_Prepare : public MDSLogContextBase {
+  MDSTableServer *server;
+  cref_t<MMDSTableRequest> req;  // keeps the request alive until finish() runs
+  version_t tid;
+  MDSRank *get_mds() override { return server->mds; }
+public:
+  // Context given to MDLog::submit_entry() by handle_prepare(); r is taken by reference (one refcount copy).
+  C_Prepare(MDSTableServer *s, const cref_t<MMDSTableRequest> &r, version_t v) : server(s), req(r), tid(v) {}
+  void finish(int r) override {
+    server->_prepare_logged(req, tid);
+  }
+};
+
+// prepare
+void MDSTableServer::handle_prepare(const cref_t<MMDSTableRequest> &req)
+{
+  dout(7) << "handle_prepare " << *req << dendl;
+  const mds_rank_t sender = mds_rank_t(req->get_source().num());
+
+  ceph_assert(g_conf()->mds_kill_mdstable_at != 1);
+
+  const auto tid = ++projected_version;  // the new projected version doubles as this prepare's tid
+
+  auto *entry = new ETableServer(table, TABLESERVER_OP_PREPARE, req->reqid, sender, tid, tid);
+  // journal the requested mutation; C_Prepare::finish() then calls _prepare_logged()
+  mds->mdlog->start_entry(entry);
+  entry->mutation = req->bl;
+  mds->mdlog->submit_entry(entry, new C_Prepare(this, req, tid));
+  mds->mdlog->flush();
+}
+
+void MDSTableServer::_prepare_logged(const cref_t<MMDSTableRequest> &req, version_t tid)
+{
+  // Called from C_Prepare::finish(): apply the prepare, then send or defer the AGREE reply.
+  dout(7) << "_prepare_logged " << *req << " tid " << tid << dendl;
+  mds_rank_t from = mds_rank_t(req->get_source().num());
+
+  ceph_assert(g_conf()->mds_kill_mdstable_at != 2);
+  _note_prepare(from, req->reqid);  // bumps version and records the pending entry for `from`
+  bufferlist out;
+  _prepare(req->bl, req->reqid, from, out);
+  ceph_assert(version == tid);
+
+  auto reply = make_message<MMDSTableRequest>(table, TABLESERVER_OP_AGREE, req->reqid, tid);
+  reply->bl = std::move(out);
+
+  if (_notify_prep(tid)) {  // park the AGREE until every active client acks (handle_notify_ack)
+    auto& p = pending_notifies[tid];
+    p.notify_ack_gather = active_clients;
+    p.mds = from;
+    p.reply = reply;
+  } else {
+    mds->send_message_mds(reply, from);
+  }
+}
+
+void MDSTableServer::handle_notify_ack(const cref_t<MMDSTableRequest> &m)
+{
+  dout(7) << __func__ << " " << *m << dendl;
+  const mds_rank_t from = mds_rank_t(m->get_source().num());
+  const version_t tid = m->get_tid();
+
+  auto it = pending_notifies.find(tid);
+  if (it == pending_notifies.end())
+    return;  // nothing is waiting on this tid; the ack is dropped silently
+  auto &info = it->second;
+  if (!info.notify_ack_gather.erase(from)) {
+    dout(0) << "got unexpected notify ack for tid " <<  tid << " from mds." << from << dendl;
+    return;
+  }
+  if (!info.notify_ack_gather.empty())
+    return;  // other ranks still have to ack
+  if (info.onfinish)
+    info.onfinish->complete(0);
+  else
+    mds->send_message_mds(info.reply, info.mds);
+  pending_notifies.erase(it);
+}
+
+// Context handed to MDLog together with a COMMIT entry by handle_commit();
+// finish() passes the original request on to _commit_logged().
+class C_Commit : public MDSLogContextBase {
+  MDSTableServer *server;
+  cref_t<MMDSTableRequest> req;
+  MDSRank *get_mds() override { return server->mds; }
+public:
+  C_Commit(MDSTableServer *s, const cref_t<MMDSTableRequest> &r) : server(s), req(r) {}
+  void finish(int r) override { server->_commit_logged(req); }
+};
+
+// commit
+void MDSTableServer::handle_commit(const cref_t<MMDSTableRequest> &req)
+{
+  dout(7) << "handle_commit " << *req << dendl;
+
+  const version_t tid = req->get_tid();
+
+  if (!pending_for_mds.count(tid)) {
+    if (tid <= version) {
+      // not pending and not ahead of the table version: just ack it (again)
+      dout(0) << "got commit for tid " << tid << " <= " << version
+	      << ", already committed, sending ack." << dendl;
+      auto reply = make_message<MMDSTableRequest>(table, TABLESERVER_OP_ACK, req->reqid, tid);
+      mds->send_message(reply, req->get_connection());
+    } else {
+      // wtf.
+      dout(0) << "got commit for tid " << tid << " > " << version << dendl;
+      ceph_assert(tid <= version);
+    }
+    return;
+  }
+
+  if (committing_tids.count(tid)) {
+    dout(0) << "got commit for tid " << tid << ", already committing, waiting." << dendl;
+    return;
+  }
+
+  ceph_assert(g_conf()->mds_kill_mdstable_at != 5);
+
+  projected_version++;
+  committing_tids.insert(tid);
+  mds->mdlog->start_submit_entry(new ETableServer(table, TABLESERVER_OP_COMMIT, 0, MDS_RANK_NONE,
+						  tid, projected_version),
+				 new C_Commit(this, req));
+}
+
+void MDSTableServer::_commit_logged(const cref_t<MMDSTableRequest> &req)
+{
+  dout(7) << "_commit_logged, sending ACK" << dendl;
+
+  ceph_assert(g_conf()->mds_kill_mdstable_at != 6);
+  const version_t committed = req->get_tid();
+
+  // the tid is no longer in flight, nor pending for its mds
+  committing_tids.erase(committed);
+  pending_for_mds.erase(committed);
+  _commit(committed, req);
+  _note_commit(committed);
+
+  auto ack = make_message<MMDSTableRequest>(table, TABLESERVER_OP_ACK, req->reqid, committed);
+  mds->send_message_mds(ack, mds_rank_t(req->get_source().num()));
+}
+
+// Context handed to MDLog together with a ROLLBACK entry by handle_rollback();
+// finish() passes the original request on to _rollback_logged().
+class C_Rollback : public MDSLogContextBase {
+  MDSTableServer *server;
+  cref_t<MMDSTableRequest> req;
+  MDSRank *get_mds() override { return server->mds; }
+public:
+  C_Rollback(MDSTableServer *s, const cref_t<MMDSTableRequest> &r) : server(s), req(r) {}
+  void finish(int r) override { server->_rollback_logged(req); }
+};
+
+// ROLLBACK
+void MDSTableServer::handle_rollback(const cref_t<MMDSTableRequest> &req)
+{
+  dout(7) << "handle_rollback " << *req << dendl;
+
+  ceph_assert(g_conf()->mds_kill_mdstable_at != 8);
+  const version_t tid = req->get_tid();
+  // the tid must be pending and must not already have a commit/rollback in flight
+  ceph_assert(pending_for_mds.count(tid));
+  ceph_assert(!committing_tids.count(tid));
+
+  committing_tids.insert(tid);
+  const auto new_version = ++projected_version;
+  auto *entry = new ETableServer(table, TABLESERVER_OP_ROLLBACK, 0, MDS_RANK_NONE, tid, new_version);
+
+  mds->mdlog->start_submit_entry(entry, new C_Rollback(this, req));
+}
+
+void MDSTableServer::_rollback_logged(const cref_t<MMDSTableRequest> &req)
+{
+  dout(7) << "_rollback_logged " << *req << dendl;
+
+  const version_t rolled_back = req->get_tid();
+
+  // drop the bookkeeping, then undo the prepare; no reply message is sent from here
+  committing_tids.erase(rolled_back);
+  pending_for_mds.erase(rolled_back);
+  _rollback(rolled_back);
+  _note_rollback(rolled_back);
+}
+
+
+
+// SERVER UPDATE
+// Context handed to MDLog by do_server_update(); it holds its own copy of the
+// update payload and passes it to _server_update_logged() from finish().
+class C_ServerUpdate : public MDSLogContextBase {
+  MDSTableServer *server;
+  bufferlist bl;
+  MDSRank *get_mds() override { return server->mds; }
+public:
+  C_ServerUpdate(MDSTableServer *s, bufferlist &b)  : server(s), bl(b) {}
+  void finish(int r) override { server->_server_update_logged(bl); }
+};
+
+void MDSTableServer::do_server_update(bufferlist& bl)
+{
+  dout(10) << "do_server_update len " << bl.length() << dendl;
+
+  const auto new_version = ++projected_version;
+  // same argument order as the other ETableServer entries: reqid 0, no rank, tid 0
+  auto *entry = new ETableServer(table, TABLESERVER_OP_SERVER_UPDATE, 0, MDS_RANK_NONE, 0, new_version);
+  mds->mdlog->start_entry(entry);
+  entry->mutation = bl;
+  mds->mdlog->submit_entry(entry, new C_ServerUpdate(this, bl));
+}
+
+void MDSTableServer::_server_update_logged(bufferlist& bl)
+{
+  dout(10) << "_server_update_logged len " << bl.length() << dendl;
+  _server_update(bl);  // subclass hook; the base-class implementation aborts
+  _note_server_update(bl);  // bumps version
+}
+
+// recovery
+
+// Completion created by finish_recovery() and stored as a pending notify's
+// onfinish; completing it runs _do_server_recovery().
+class C_ServerRecovery : public MDSContext {
+  MDSTableServer *server;
+  MDSRank *get_mds() override { return server->mds; }
+public:
+  C_ServerRecovery(MDSTableServer *s)  : server(s) {}
+  void finish(int r) override { server->_do_server_recovery(); }
+};
+
+void MDSTableServer::_do_server_recovery()
+{
+  dout(7) << __func__ << " " << active_clients <<  dendl;
+  map<mds_rank_t, uint64_t> next_reqids;
+
+  // resend the AGREE of every pending prepare whose owner is an active client
+  for (const auto &entry : pending_for_mds) {
+    const auto &pending = entry.second;
+    if (active_clients.count(pending.mds) == 0)
+      continue;
+    uint64_t &next = next_reqids[pending.mds];
+    if (pending.reqid >= next)
+      next = pending.reqid + 1;
+    auto agree = make_message<MMDSTableRequest>(table, TABLESERVER_OP_AGREE, pending.reqid, pending.tid);
+    _get_reply_buffer(pending.tid, &agree->bl);
+    mds->send_message_mds(agree, pending.mds);
+  }
+
+  // every active client gets SERVER_READY carrying the reqid computed above (0 if it had none)
+  for (const auto &rank : active_clients) {
+    auto ready = make_message<MMDSTableRequest>(table, TABLESERVER_OP_SERVER_READY, next_reqids[rank]);
+    mds->send_message_mds(ready, rank);
+  }
+  recovered = true;
+}
+
+void MDSTableServer::finish_recovery(set<mds_rank_t>& active)
+{
+  dout(7) << __func__ << dendl;
+
+  active_clients = active;
+
+  if (pending_for_mds.empty() || !_notify_prep(version)) {
+    _do_server_recovery();
+    return;
+  }
+  // don't know if survivor mds have received all 'notify prep' messages, so it was
+  // requested again above; recovery proceeds once every active client has acked.
+  auto& notify = pending_notifies[version];
+  notify.notify_ack_gather = active_clients;
+  notify.mds = MDS_RANK_NONE;
+  notify.onfinish = new C_ServerRecovery(this);
+}
+
+void MDSTableServer::handle_mds_recovery(mds_rank_t who)
+{
+  dout(7) << "handle_mds_recovery mds." << who << dendl;
+
+  active_clients.insert(who);
+  if (!recovered) {
+    dout(7) << " still not recovered, delaying" << dendl;
+    return;
+  }
+
+  // resend the AGREE of each prepare still pending for the recovered mds and
+  // track the largest reqid seen, so that SERVER_READY can carry reqid + 1
+  uint64_t next_reqid = 0;
+  for (const auto &entry : pending_for_mds) {
+    const auto &pending = entry.second;
+    if (pending.mds != who)
+      continue;
+    ceph_assert(!pending_notifies.count(pending.tid));
+    if (pending.reqid >= next_reqid)
+      next_reqid = pending.reqid + 1;
+    auto agree = make_message<MMDSTableRequest>(table, TABLESERVER_OP_AGREE, pending.reqid, pending.tid);
+    _get_reply_buffer(pending.tid, &agree->bl);
+    mds->send_message_mds(agree, who);
+  }
+
+  auto ready = make_message<MMDSTableRequest>(table, TABLESERVER_OP_SERVER_READY, next_reqid);
+  mds->send_message_mds(ready, who);
+}
+
+void MDSTableServer::handle_mds_failure_or_stop(mds_rank_t who)
+{
+  dout(7) << __func__ << " mds." << who << dendl;
+
+  active_clients.erase(who);
+  list<ref_t<MMDSTableRequest>> rollback;
+  for (auto it = pending_notifies.begin(); it != pending_notifies.end(); ) {
+    auto &info = it->second;
+    if (info.mds == who) {
+      // its reply was never sent: drop the notify and roll the prepare back below
+      rollback.push_back(info.reply);
+      it = pending_notifies.erase(it);
+      continue;
+    }
+    // the failed mds will reload snaptable when it recovers, so stop waiting for its ack
+    if (info.notify_ack_gather.erase(who) == 0 || !info.notify_ack_gather.empty()) {
+      ++it;
+      continue;
+    }
+    if (info.onfinish)
+      info.onfinish->complete(0);
+    else
+      mds->send_message_mds(info.reply, info.mds);
+    it = pending_notifies.erase(it);
+  }
+
+  for (auto &req : rollback) {
+    req->op = TABLESERVER_OP_ROLLBACK;
+    handle_rollback(req);
+  }
+}
diff --git a/src/mds/MDSTableServer.h b/src/mds/MDSTableServer.h
new file mode 100644
index 000000000..84dc5b87d
--- /dev/null
+++ b/src/mds/MDSTableServer.h
@@ -0,0 +1,127 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software 
+ * Foundation.  See file COPYING.
+ * 
+ */
+
+#ifndef CEPH_MDSTABLESERVER_H
+#define CEPH_MDSTABLESERVER_H
+
+#include "MDSTable.h"
+#include "MDSContext.h"
+
+#include "messages/MMDSTableRequest.h"
+
+class MDSTableServer : public MDSTable {
+public:
+  friend class C_ServerRecovery;
+
+  MDSTableServer(MDSRank *m, int tab) :
+    MDSTable(m, get_mdstable_name(tab), false), table(tab) {}
+  ~MDSTableServer() override {}
+  // hooks for the concrete table; the pure virtual ones must be implemented
+  virtual void handle_query(const cref_t<MMDSTableRequest> &m) = 0;
+  virtual void _prepare(const bufferlist &bl, uint64_t reqid, mds_rank_t bymds, bufferlist& out) = 0;
+  virtual void _get_reply_buffer(version_t tid, bufferlist *pbl) const = 0;
+  virtual void _commit(version_t tid, cref_t<MMDSTableRequest> req) = 0;
+  virtual void _rollback(version_t tid) = 0;
+  virtual void _server_update(bufferlist& bl) { ceph_abort(); }
+  virtual bool _notify_prep(version_t tid) { return false; }  // default: no notify phase
+  // bookkeeping: each _note_* bumps version; with replay=true projected_version follows it
+  void _note_prepare(mds_rank_t mds, uint64_t reqid, bool replay=false) {
+    version++;
+    if (replay)
+      projected_version = version;
+    pending_for_mds[version].mds = mds;
+    pending_for_mds[version].reqid = reqid;
+    pending_for_mds[version].tid = version;
+  }
+  void _note_commit(uint64_t tid, bool replay=false) {
+    version++;
+    if (replay)
+      projected_version = version;
+    pending_for_mds.erase(tid);
+  }
+  void _note_rollback(uint64_t tid, bool replay=false) {
+    version++;
+    if (replay)
+      projected_version = version;
+    pending_for_mds.erase(tid);
+  }
+  void _note_server_update(bufferlist& bl, bool replay=false) {
+    version++;
+    if (replay)
+      projected_version = version;
+  }
+
+  void reset_state() override {
+    pending_for_mds.clear();
+    ++version;
+  }
+
+  void handle_request(const cref_t<MMDSTableRequest> &m);
+  void do_server_update(bufferlist& bl);
+
+  virtual void encode_server_state(bufferlist& bl) const = 0;
+  virtual void decode_server_state(bufferlist::const_iterator& bl) = 0;
+
+  void encode_state(bufferlist& bl) const override {
+    encode_server_state(bl);  // subclass state first, then the pending table
+    encode(pending_for_mds, bl);
+  }
+  void decode_state(bufferlist::const_iterator& bl) override {
+    decode_server_state(bl);  // same order as encode_state()
+    decode(pending_for_mds, bl);
+  }
+
+  // recovery
+  void finish_recovery(set<mds_rank_t>& active);
+  void _do_server_recovery();
+
+  void handle_mds_recovery(mds_rank_t who);
+  void handle_mds_failure_or_stop(mds_rank_t who);
+protected:
+  int table;
+  bool recovered = false;
+  set<mds_rank_t> active_clients;
+private:
+  struct notify_info_t {
+    notify_info_t() {}
+    set<mds_rank_t> notify_ack_gather;  // ranks whose ack is still outstanding
+    mds_rank_t mds = MDS_RANK_NONE;  // rank the parked reply is sent to; never left indeterminate
+    ref_t<MMDSTableRequest> reply = nullptr;  // parked reply, sent once the gather set is empty
+    MDSContext *onfinish = nullptr;  // when set, completed instead of sending reply
+  };
+
+  friend class C_Prepare;
+  friend class C_Commit;
+  friend class C_Rollback;
+  friend class C_ServerUpdate;
+
+  void handle_prepare(const cref_t<MMDSTableRequest> &m);
+  void _prepare_logged(const cref_t<MMDSTableRequest> &m, version_t tid);
+
+  void handle_commit(const cref_t<MMDSTableRequest> &m);
+  void _commit_logged(const cref_t<MMDSTableRequest> &m);
+
+  void handle_rollback(const cref_t<MMDSTableRequest> &m);
+  void _rollback_logged(const cref_t<MMDSTableRequest> &m);
+
+  void _server_update_logged(bufferlist& bl);
+
+  void handle_notify_ack(const cref_t<MMDSTableRequest> &m);
+
+  map<version_t,mds_table_pending_t> pending_for_mds;  // ** child should encode this! **
+  set<version_t> committing_tids;  // tids with a COMMIT or ROLLBACK entry submitted but not yet logged
+
+  map<version_t, notify_info_t> pending_notifies;
+};
+#endif
diff --git a/src/mds/Mantle.cc b/src/mds/Mantle.cc
new file mode 100644
index 000000000..653d47d66
--- /dev/null
+++ b/src/mds/Mantle.cc
@@ -0,0 +1,137 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2016 Michael Sevilla <mikesevilla3@gmail.com>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+
+#include "mdstypes.h"
+#include "MDSRank.h"
+#include "Mantle.h"
+#include "msg/Messenger.h"
+#include "common/Clock.h"
+#include "CInode.h"
+
+#define dout_context g_ceph_context
+#undef dout_prefix
+#define dout_prefix *_dout << "mds.mantle "
+#define mantle_dout(lvl) \
+  do {\
+    auto subsys = ceph_subsys_mds;\
+    if ((dout_context)->_conf->subsys.should_gather(ceph_subsys_mds_balancer, lvl)) {\
+      subsys = ceph_subsys_mds_balancer;\
+    }\
+    dout_impl(dout_context, ceph::dout::need_dynamic(subsys), lvl) dout_prefix
+
+#define mantle_dendl dendl; } while (0)
+
+
+static int dout_wrapper(lua_State *L)
+{
+  int level = luaL_checkinteger(L, 1);  // first Lua argument: the log level
+  lua_concat(L, lua_gettop(L)-1);  // join all remaining arguments into one string, left at index 2
+  mantle_dout(ceph::dout::need_dynamic(level)) << lua_tostring(L, 2)
+					       << mantle_dendl;
+  return 0;  // no values are returned to the Lua caller
+}
+
+int Mantle::balance(std::string_view script,
+                    mds_rank_t whoami,
+                    const std::vector<std::map<std::string, double>> &metrics,
+                    std::map<mds_rank_t, double> &my_targets)
+{
+  lua_settop(L, 0); /* clear the stack */
+  /* Run script with the globals whoami/mds set; add the table it returns to my_targets. */
+  /* load the balancer from a NUL-terminated copy: a string_view need not be terminated */
+  if (luaL_loadstring(L, std::string(script).c_str())) {
+    mantle_dout(0) << "WARNING: mantle could not load balancer: "
+            << lua_tostring(L, -1) << mantle_dendl;
+    return -CEPHFS_EINVAL;
+  }
+
+  /* tell the balancer which mds is making the decision */
+  lua_pushinteger(L, static_cast<lua_Integer>(whoami));
+  lua_setglobal(L, "whoami");
+
+  /* global mds metrics to hold all dictionaries */
+  lua_newtable(L);
+
+  /* build one table per mds (index i) holding that mds's metrics */
+  for (size_t i=0; i < metrics.size(); i++) {
+    lua_newtable(L);
+
+    /* push values into this mds's table; setfield assigns key/pops val */
+    for (const auto &it : metrics[i]) {
+      lua_pushnumber(L, it.second);
+      lua_setfield(L, -2, it.first.c_str());
+    }
+
+    /* store the table on top of the stack as entry i of the global mds table at stack[-2]; seti pops it */
+    lua_seti(L, -2, i);
+  }
+
+  /* set the name of the global mds table */
+  lua_setglobal(L, "mds");
+
+  ceph_assert(lua_gettop(L) == 1); /* only the loaded chunk is left on the stack */
+  if (lua_pcall(L, 0, 1, 0) != LUA_OK) {
+    mantle_dout(0) << "WARNING: mantle could not execute script: "
+            << lua_tostring(L, -1) << mantle_dendl;
+    return -CEPHFS_EINVAL;
+  }
+
+  /* parse response by iterating over Lua stack */
+  if (lua_istable(L, -1) == 0) {
+    mantle_dout(0) << "WARNING: mantle script returned a malformed response" << mantle_dendl;
+    return -CEPHFS_EINVAL;
+  }
+
+  /* fill in return value; entries written before a malformed one stay in my_targets */
+  for (lua_pushnil(L); lua_next(L, -2); lua_pop(L, 1)) {
+    if (!lua_isinteger(L, -2) || !lua_isnumber(L, -1)) {
+      mantle_dout(0) << "WARNING: mantle script returned a malformed response" << mantle_dendl;
+      return -CEPHFS_EINVAL;
+    }
+    mds_rank_t rank(lua_tointeger(L, -2));
+    my_targets[rank] = lua_tonumber(L, -1);
+  }
+
+  return 0; /* every failure path above returns -CEPHFS_EINVAL */
+}
+
+Mantle::Mantle (void)
+{
+  /* build lua vm state */
+  L = luaL_newstate();
+  if (!L) {
+    mantle_dout(0) << "WARNING: mantle could not load Lua state" << mantle_dendl;
+    throw std::bad_alloc();
+  }
+
+  /* standard Lua libraries made available to balancer policies */
+  static const luaL_Reg policy_libs[] = {
+    {"_G", luaopen_base},
+    {LUA_COLIBNAME, luaopen_coroutine},
+    {LUA_STRLIBNAME, luaopen_string},
+    {LUA_MATHLIBNAME, luaopen_math},
+    {LUA_TABLIBNAME, luaopen_table},
+    {LUA_UTF8LIBNAME, luaopen_utf8},
+  };
+
+  /* open each library as a global, then pop the module copy that
+   * luaL_requiref leaves on the stack */
+  for (const auto &lib : policy_libs) {
+    luaL_requiref(L, lib.name, lib.func, 1);
+    lua_pop(L, 1);
+  }
+
+  /* setup debugging */
+  lua_register(L, "BAL_LOG", dout_wrapper);
+}
diff --git a/src/mds/Mantle.h b/src/mds/Mantle.h
new file mode 100644
index 000000000..ffc1843a5
--- /dev/null
+++ b/src/mds/Mantle.h
@@ -0,0 +1,40 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2016 Michael Sevilla <mikesevilla3@gmail.com>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+
+#ifndef CEPH_MANTLE_H
+#define CEPH_MANTLE_H
+
+#include <string_view>
+
+#include <lua.hpp>
+#include <vector>
+#include <map>
+#include <string>
+
+#include "mdstypes.h"
+
+class Mantle {  // holds the Lua state that balance() runs balancer scripts in
+  public:
+    Mantle();
+    ~Mantle() { if (L) lua_close(L); }
+    Mantle(const Mantle&) = delete;  // L is closed in the destructor: a copy would close it twice
+    Mantle& operator=(const Mantle&) = delete;
+    int balance(std::string_view script, mds_rank_t whoami,
+                const std::vector <std::map<std::string, double>> &metrics,
+                std::map<mds_rank_t,double> &my_targets);
+  protected:
+    lua_State *L;
+};
+
+#endif
diff --git a/src/mds/MetricAggregator.cc b/src/mds/MetricAggregator.cc
new file mode 100644
index 000000000..6487084fb
--- /dev/null
+++ b/src/mds/MetricAggregator.cc
@@ -0,0 +1,435 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include <boost/range/adaptor/map.hpp>
+#include <boost/range/algorithm/copy.hpp>
+
+#include "MDSRank.h"
+#include "MetricAggregator.h"
+#include "mgr/MgrClient.h"
+
+#define dout_context g_ceph_context
+#define dout_subsys ceph_subsys_mds
+#undef dout_prefix
+#define dout_prefix *_dout << "mds.metric.aggregator" << " " << __func__
+
+MetricAggregator::MetricAggregator(CephContext *cct, MDSRank *mds, MgrClient *mgrc)
+  : Dispatcher(cct),
+    mds(mds),  // member initialised from the same-named parameter
+    mgrc(mgrc),
+    mds_pinger(mds) {  // here mds names the constructor parameter
+}
+
+void MetricAggregator::ping_all_active_ranks() {
+  dout(10) << ": pinging " << active_rank_addrs.size() << " active mds(s)" << dendl;
+  // one ping per entry of active_rank_addrs (rank -> address vector)
+  for (const auto &entry : active_rank_addrs) {
+    dout(20) << ": pinging rank=" << entry.first << " addr=" << entry.second << dendl;
+    mds_pinger.send_ping(entry.first, entry.second);
+  }
+}
+
+int MetricAggregator::init() {  // starts the pinger thread and registers the mgr perf-query callbacks
+  dout(10) << dendl;
+
+  pinger = std::thread([this]() {  // loop: ping all active ranks, then sleep mds_ping_interval seconds
+      std::unique_lock locker(lock);
+      while (!stopping) {
+        ping_all_active_ranks();
+        locker.unlock();  // the lock is not held while sleeping
+        double timo = g_conf().get_val<std::chrono::seconds>("mds_ping_interval").count();
+        sleep(timo);  // stopping is only re-checked after the sleep returns
+        locker.lock();
+      }
+    });
+
+  mgrc->set_perf_metric_query_cb(
+    [this](const ConfigPayload &config_payload) {
+      set_perf_queries(config_payload);
+    },
+    [this]() {
+      return get_perf_reports();
+    });
+
+  return 0;  // the only return value of this function
+}
+
+void MetricAggregator::shutdown() {
+  dout(10) << dendl;
+  // flag the pinger loop to exit; the assert rejects a second shutdown()
+  std::unique_lock guard(lock);
+  ceph_assert(!stopping);
+  stopping = true;
+  guard.unlock();
+
+  // the lock is released before joining so the pinger thread can take it again
+  if (pinger.joinable()) {
+    pinger.join();
+  }
+}
+
+bool MetricAggregator::ms_can_fast_dispatch2(const cref_t<Message> &m) const {
+  return m->get_type() == MSG_MDS_METRICS;  // only MDS metrics messages are fast-dispatched
+}
+
+void MetricAggregator::ms_fast_dispatch2(const ref_t<Message> &m) {
+  bool handled = ms_dispatch2(m);  // fast dispatch reuses the regular dispatch path
+  ceph_assert(handled);  // a message that ms_dispatch2 rejects here is treated as fatal
+}
+
+bool MetricAggregator::ms_dispatch2(const ref_t<Message> &m) {
+  if (m->get_type() != MSG_MDS_METRICS ||
+      m->get_connection()->get_peer_type() != CEPH_ENTITY_TYPE_MDS)
+    return false;  // not an MDS metrics message coming from an MDS peer
+
+  const Message *raw = m.get();
+  const MMDSOp *op = dynamic_cast<const MMDSOp*>(raw);
+  if (!op)
+    dout(0) << typeid(*raw).name() << " is not an MMDSOp type" << dendl;
+  ceph_assert(op);
+  handle_mds_metrics(ref_cast<MMDSMetrics>(m));
+  return true;
+}
+
+void MetricAggregator::refresh_metrics_for_rank(const entity_inst_t &client,
+                                                mds_rank_t rank, const Metrics &metrics) {
+  dout(20) << ": client=" << client << ", rank=" << rank << ", metrics="
+           << metrics << dendl;
+  // Record client under rank, then refresh its counters in every query whose key matches.
+  auto &p = clients_by_rank.at(rank);  // at() throws std::out_of_range if rank has no entry
+  bool ins = p.insert(client).second;
+  if (ins) {
+    dout(20) << ": rank=" << rank << " has " << p.size() << " connected"
+             << " client(s)" << dendl;
+  }
+  // copies the part of metrics selected by descriptor d into the counter pair c
+  auto update_counter_func = [&metrics](const MDSPerformanceCounterDescriptor &d,
+                                        PerformanceCounter *c) {
+    ceph_assert(d.is_supported());
+
+    dout(20) << ": performance_counter_descriptor=" << d << dendl;
+
+    switch (d.type) {
+    case MDSPerformanceCounterType::CAP_HIT_METRIC:  // the only case not gated on an "updated" flag
+      c->first = metrics.cap_hit_metric.hits;
+      c->second = metrics.cap_hit_metric.misses;
+      break;
+    case MDSPerformanceCounterType::READ_LATENCY_METRIC:
+      if (metrics.read_latency_metric.updated) {
+        c->first = metrics.read_latency_metric.lat.tv.tv_sec;
+        c->second = metrics.read_latency_metric.lat.tv.tv_nsec;
+      }
+      break;
+    case MDSPerformanceCounterType::WRITE_LATENCY_METRIC:
+      if (metrics.write_latency_metric.updated) {
+        c->first = metrics.write_latency_metric.lat.tv.tv_sec;
+        c->second = metrics.write_latency_metric.lat.tv.tv_nsec;
+      }
+      break;
+    case MDSPerformanceCounterType::METADATA_LATENCY_METRIC:
+      if (metrics.metadata_latency_metric.updated) {
+        c->first = metrics.metadata_latency_metric.lat.tv.tv_sec;
+        c->second = metrics.metadata_latency_metric.lat.tv.tv_nsec;
+      }
+      break;
+    case MDSPerformanceCounterType::DENTRY_LEASE_METRIC:
+      if (metrics.dentry_lease_metric.updated) {
+        c->first = metrics.dentry_lease_metric.hits;
+        c->second = metrics.dentry_lease_metric.misses;
+      }
+      break;
+    case MDSPerformanceCounterType::OPENED_FILES_METRIC:
+      if (metrics.opened_files_metric.updated) {
+        c->first = metrics.opened_files_metric.opened_files;
+        c->second = metrics.opened_files_metric.total_inodes;
+      }
+      break;
+    case MDSPerformanceCounterType::PINNED_ICAPS_METRIC:
+      if (metrics.pinned_icaps_metric.updated) {
+        c->first = metrics.pinned_icaps_metric.pinned_icaps;
+        c->second = metrics.pinned_icaps_metric.total_inodes;
+      }
+      break;
+    case MDSPerformanceCounterType::OPENED_INODES_METRIC:
+      if (metrics.opened_inodes_metric.updated) {
+        c->first = metrics.opened_inodes_metric.opened_inodes;
+        c->second = metrics.opened_inodes_metric.total_inodes;
+      }
+      break;
+    case MDSPerformanceCounterType::READ_IO_SIZES_METRIC:
+      if (metrics.read_io_sizes_metric.updated) {
+        c->first = metrics.read_io_sizes_metric.total_ops;
+        c->second = metrics.read_io_sizes_metric.total_size;
+      }
+      break;
+    case MDSPerformanceCounterType::WRITE_IO_SIZES_METRIC:
+      if (metrics.write_io_sizes_metric.updated) {
+        c->first = metrics.write_io_sizes_metric.total_ops;
+        c->second = metrics.write_io_sizes_metric.total_size;
+      }
+      break;
+    case MDSPerformanceCounterType::AVG_READ_LATENCY_METRIC:
+      if (metrics.read_latency_metric.updated) {
+        c->first = metrics.read_latency_metric.mean.tv.tv_sec;
+        c->second = metrics.read_latency_metric.mean.tv.tv_nsec;
+      }
+      break;
+    case MDSPerformanceCounterType::STDEV_READ_LATENCY_METRIC:
+      if (metrics.read_latency_metric.updated) {
+        c->first = metrics.read_latency_metric.sq_sum;
+        c->second = metrics.read_latency_metric.count;
+      }
+      break;
+    case MDSPerformanceCounterType::AVG_WRITE_LATENCY_METRIC:
+      if (metrics.write_latency_metric.updated) {
+        c->first = metrics.write_latency_metric.mean.tv.tv_sec;
+        c->second = metrics.write_latency_metric.mean.tv.tv_nsec;
+      }
+      break;
+    case MDSPerformanceCounterType::STDEV_WRITE_LATENCY_METRIC:
+      if (metrics.write_latency_metric.updated) {
+        c->first = metrics.write_latency_metric.sq_sum;
+        c->second = metrics.write_latency_metric.count;
+      }
+      break;
+    case MDSPerformanceCounterType::AVG_METADATA_LATENCY_METRIC:
+      if (metrics.metadata_latency_metric.updated) {
+        c->first = metrics.metadata_latency_metric.mean.tv.tv_sec;
+        c->second = metrics.metadata_latency_metric.mean.tv.tv_nsec;
+      }
+      break;
+    case MDSPerformanceCounterType::STDEV_METADATA_LATENCY_METRIC:
+      if (metrics.metadata_latency_metric.updated) {
+        c->first = metrics.metadata_latency_metric.sq_sum;
+        c->second = metrics.metadata_latency_metric.count;
+      }
+      break;
+    default:
+      ceph_abort_msg("unknown counter type");
+    }
+  };
+  // builds one sub-key from the regex capture groups of the stringified rank or client
+  auto sub_key_func = [client, rank](const MDSPerfMetricSubKeyDescriptor &d,
+                                     MDSPerfMetricSubKey *sub_key) {
+    ceph_assert(d.is_supported());
+
+    dout(20) << ": sub_key_descriptor=" << d << dendl;
+
+    std::string match_string;
+    switch (d.type) {
+    case MDSPerfMetricSubKeyType::MDS_RANK:
+      match_string = stringify(rank);
+      break;
+    case MDSPerfMetricSubKeyType::CLIENT_ID:
+      match_string = stringify(client);
+      break;
+    default:
+      ceph_abort_msg("unknown counter type");
+    }
+
+    dout(20) << ": match_string=" << match_string << dendl;
+
+    std::smatch match;
+    if (!std::regex_search(match_string, match, d.regex)) {
+      return false;
+    }
+    if (match.size() <= 1) {  // a match without any capture group does not count
+      return false;
+    }
+    for (size_t i = 1; i < match.size(); ++i) {  // index 0 is the whole match; keep only the groups
+      sub_key->push_back(match[i].str());
+    }
+    return true;
+  };
+  // update the counters of every query for which a key could be built
+  for (auto& [query, perf_key_map] : query_metrics_map) {
+    MDSPerfMetricKey key;
+    if (query.get_key(sub_key_func, &key)) {
+      query.update_counters(update_counter_func, &perf_key_map[key]);
+    }
+  }
+}
+
+void MetricAggregator::remove_metrics_for_rank(const entity_inst_t &client,
+                                               mds_rank_t rank, bool remove) {
+  dout(20) << ": client=" << client << ", rank=" << rank << dendl;
+  // Erase the client's metrics from every query; remove=true also erases it from clients_by_rank.
+  if (remove) {
+    auto &p = clients_by_rank.at(rank);  // at() throws std::out_of_range if rank has no entry
+    bool rm = p.erase(client) != 0;
+    ceph_assert(rm);  // the client must have been recorded for this rank
+    dout(20) << ": rank=" << rank << " has " << p.size() << " connected"
+             << " client(s)" << dendl;
+  }
+  // builds one sub-key from the regex capture groups of the stringified rank or client
+  auto sub_key_func = [client, rank](const MDSPerfMetricSubKeyDescriptor &d,
+                                     MDSPerfMetricSubKey *sub_key) {
+    ceph_assert(d.is_supported());
+    dout(20) << ": sub_key_descriptor=" << d << dendl;
+
+    std::string match_string;
+    switch (d.type) {
+    case MDSPerfMetricSubKeyType::MDS_RANK:
+      match_string = stringify(rank);
+      break;
+    case MDSPerfMetricSubKeyType::CLIENT_ID:
+      match_string = stringify(client);
+      break;
+    default:
+      ceph_abort_msg("unknown counter type");
+    }
+
+    dout(20) << ": match_string=" << match_string << dendl;
+
+    std::smatch match;
+    if (!std::regex_search(match_string, match, d.regex)) {
+      return false;
+    }
+    if (match.size() <= 1) {  // a match without any capture group does not count
+      return false;
+    }
+    for (size_t i = 1; i < match.size(); ++i) {  // index 0 is the whole match; keep only the groups
+      sub_key->push_back(match[i].str());
+    }
+    return true;
+  };
+  // erase the key built for this client/rank from every query that yields one
+  for (auto& [query, perf_key_map] : query_metrics_map) {
+    MDSPerfMetricKey key;
+    if (query.get_key(sub_key_func, &key)) {
+      if (perf_key_map.erase(key)) {
+        dout(10) << ": removed metric for key=" << key << dendl;
+      }
+    }
+  }
+}
+
+void MetricAggregator::handle_mds_metrics(const cref_t<MMDSMetrics> &m) {
+  const metrics_message_t &msg = m->metrics_message;
+  auto seq = msg.seq;
+  auto rank = msg.rank;
+  const auto &updates = msg.client_metrics_map;
+
+  dout(20) << ": applying " << updates.size() << " updates for rank="
+           << rank << " with sequence number " << seq << dendl;
+
+  std::scoped_lock locker(lock);
+  // nothing is applied unless the pinger accepts (rank, seq) as a received pong
+  if (!mds_pinger.pong_received(rank, seq)) {
+    return;
+  }
+
+  // each entry either refreshes or removes one client's metrics for this rank
+  for (const auto &update : updates) {
+    const auto &client = update.first;
+    const auto &metrics = update.second;
+    if (metrics.update_type == UpdateType::UPDATE_TYPE_REFRESH) {
+      refresh_metrics_for_rank(client, rank, metrics);
+    } else if (metrics.update_type == UpdateType::UPDATE_TYPE_REMOVE) {
+      remove_metrics_for_rank(client, rank, true);
+    } else {
+      ceph_abort();
+    }
+  }
+}
+
+void MetricAggregator::cull_metrics_for_rank(mds_rank_t rank) {
+  dout(20) << ": rank=" << rank << dendl;
+
+  const auto &clients = clients_by_rank.at(rank);  // throws std::out_of_range for an unknown rank
+  // remove=false: the rank's whole client set is erased below instead
+  for (const auto &client : clients) {
+    remove_metrics_for_rank(client, rank, false);
+  }
+  dout(10) << ": culled " << clients.size() << " clients" << dendl;
+  clients_by_rank.erase(rank);
+}
+
+void MetricAggregator::notify_mdsmap(const MDSMap &mdsmap) {
+  dout(10) << dendl;
+
+  std::scoped_lock locker(lock);
+  std::set<mds_rank_t> now_active;
+  mdsmap.get_active_mds_set(now_active);
+
+  // snapshot of the ranks tracked so far (the keys of active_rank_addrs)
+  std::set<mds_rank_t> was_active;
+  for (const auto &entry : active_rank_addrs)
+    was_active.insert(was_active.end(), entry.first);
+  // tracked before but no longer active: forget address, metrics and ping state
+  std::set<mds_rank_t> gone;
+  std::set_difference(was_active.begin(), was_active.end(),
+                      now_active.begin(), now_active.end(),
+                      std::inserter(gone, gone.end()));
+  for (const auto &rank : gone) {
+    dout(10) << ": mds rank=" << rank << " removed from mdsmap" << dendl;
+    active_rank_addrs.erase(rank);
+    cull_metrics_for_rank(rank);
+    mds_pinger.reset_ping(rank);
+  }
+
+  // active now but not tracked yet: record the address and an empty client set
+  std::set<mds_rank_t> added;
+  std::set_difference(now_active.begin(), now_active.end(),
+                      was_active.begin(), was_active.end(),
+                      std::inserter(added, added.end()));
+  for (const auto &rank : added) {
+    auto rank_addr = mdsmap.get_addrs(rank);
+    dout(10) << ": active rank=" << rank << " (mds." << mdsmap.get_mds_info(rank).name
+             << ") has addr=" << rank_addr << dendl;
+    active_rank_addrs.emplace(rank, rank_addr);
+    clients_by_rank.emplace(rank, std::unordered_set<entity_inst_t>{});
+  }
+
+  dout(10) << ": active set=["  << active_rank_addrs << "]" << dendl;
+}
+
+void MetricAggregator::set_perf_queries(const ConfigPayload &config_payload) {
+  const auto &queries = boost::get<MDSConfigPayload>(config_payload).config;
+
+  dout(10) << ": setting " << queries.size() << " queries" << dendl;
+
+  std::scoped_lock locker(lock);
+  // carry over the counters of queries that stay configured; new queries start
+  // empty, and the data of queries that are no longer configured is discarded
+  decltype(query_metrics_map) kept;
+  for (const auto &query_and_limits : queries) {
+    kept[query_and_limits.first].swap(query_metrics_map[query_and_limits.first]);
+  }
+  query_metrics_map.swap(kept);
+}
+
+MetricPayload MetricAggregator::get_perf_reports() {
+  MDSMetricPayload payload;
+  MDSPerfMetricReport &metric_report = payload.metric_report;
+
+  std::scoped_lock locker(lock);
+
+  // one report per configured query: its counter descriptors plus the packed
+  // counters of every key collected for that query
+  for (auto &query_and_counters : query_metrics_map) {
+    const auto &query = query_and_counters.first;
+    auto &report = metric_report.reports[query];
+
+    auto &descriptors = report.performance_counter_descriptors;
+    query.get_performance_counter_descriptors(&descriptors);
+    dout(20) << ": descriptors=" << descriptors << dendl;
+
+    for (auto &key_and_counters : query_and_counters.second) {
+      dout(20) << ": packing perf_metric_key=" << key_and_counters.first << ", perf_counter="
+               << key_and_counters.second << dendl;
+      auto &packed = report.group_packed_performance_counters[key_and_counters.first];
+      query.pack_counters(key_and_counters.second, &packed);
+    }
+  }
+
+  // stash a copy of the delayed (lagging) ranks. mgr culls out metrics
+  // for failed ranks and tags metrics for delayed ranks as "stale".
+  for (const auto &entry : active_rank_addrs) {
+    auto rank = entry.first;
+    if (mds_pinger.is_rank_lagging(rank)) {
+      metric_report.rank_metrics_delayed.insert(rank);
+    }
+  }
+
+  return payload;
+}
diff --git a/src/mds/MetricAggregator.h b/src/mds/MetricAggregator.h
new file mode 100644
index 000000000..fe9aef2e3
--- /dev/null
+++ b/src/mds/MetricAggregator.h
@@ -0,0 +1,89 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_MDS_METRIC_AGGREGATOR_H
+#define CEPH_MDS_METRIC_AGGREGATOR_H
+
+#include <map>
+#include <set>
+#include <thread>
+
+#include "msg/msg_types.h"
+#include "msg/Dispatcher.h"
+#include "common/ceph_mutex.h"
+#include "include/common_fwd.h"
+#include "messages/MMDSMetrics.h"
+
+#include "mgr/MetricTypes.h"
+#include "mgr/MDSPerfMetricTypes.h"
+
+#include "mdstypes.h"
+#include "MDSMap.h"
+#include "MDSPinger.h"
+
+class MDSRank;
+class MgrClient;
+
+class MetricAggregator : public Dispatcher {  // Dispatcher holding per-query client metrics (query_metrics_map)
+public:
+  MetricAggregator(CephContext *cct, MDSRank *mds, MgrClient *mgrc);
+
+  int init();
+  void shutdown();
+
+  void notify_mdsmap(const MDSMap &mdsmap);
+  // Dispatcher interface: every message type is advertised as fast-dispatchable
+  bool ms_can_fast_dispatch_any() const override {
+    return true;
+  }
+  bool ms_can_fast_dispatch2(const cref_t<Message> &m) const override;
+  void ms_fast_dispatch2(const ref_t<Message> &m) override;
+  bool ms_dispatch2(const ref_t<Message> &m) override;
+  // connection event hooks: none of them takes any action
+  void ms_handle_connect(Connection *c) override {
+  }
+  bool ms_handle_reset(Connection *c) override {
+    return false;
+  }
+  void ms_handle_remote_reset(Connection *c) override {
+  }
+  bool ms_handle_refused(Connection *c) override {
+    return false;
+  }
+
+private:
+  // drop this lock when calling ->send_message_mds() else mds might
+  // deadlock
+  ceph::mutex lock = ceph::make_mutex("MetricAggregator::lock");
+  MDSRank *mds;
+  MgrClient *mgrc;
+
+  // maintain a map of rank to list of clients so that when a rank
+  // goes away we cull metrics of clients connected to that rank.
+  std::map<mds_rank_t, std::unordered_set<entity_inst_t>> clients_by_rank;  // NOTE(review): <unordered_set> is not directly included by this header
+
+  // user query to metrics map
+  std::map<MDSPerfMetricQuery, std::map<MDSPerfMetricKey, PerformanceCounters>> query_metrics_map;
+
+  MDSPinger mds_pinger;  // queried via is_rank_lagging() when building perf reports
+  std::thread pinger;
+  // active ranks and their addresses
+  std::map<mds_rank_t, entity_addrvec_t> active_rank_addrs;
+
+  bool stopping = false;
+
+  void handle_mds_metrics(const cref_t<MMDSMetrics> &m);
+
+  void refresh_metrics_for_rank(const entity_inst_t &client, mds_rank_t rank,
+                                const Metrics &metrics);
+  void remove_metrics_for_rank(const entity_inst_t &client, mds_rank_t rank, bool remove);
+
+  void cull_metrics_for_rank(mds_rank_t rank);
+
+  void ping_all_active_ranks();
+  // perf query configuration and report generation; both take `lock`
+  void set_perf_queries(const ConfigPayload &config_payload);
+  MetricPayload get_perf_reports();
+};
+
+#endif // CEPH_MDS_METRIC_AGGREGATOR_H
diff --git a/src/mds/MetricsHandler.cc b/src/mds/MetricsHandler.cc
new file mode 100644
index 000000000..b28b06b7a
--- /dev/null
+++ b/src/mds/MetricsHandler.cc
@@ -0,0 +1,422 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "common/debug.h"
+#include "common/errno.h"
+
+#include "messages/MMDSMetrics.h"
+
+#include "MDSRank.h"
+#include "SessionMap.h"
+#include "MetricsHandler.h"
+
+#define dout_context g_ceph_context
+#define dout_subsys ceph_subsys_mds
+#undef dout_prefix
+#define dout_prefix *_dout << __func__ << ": mds.metrics"
+
+// Construct a handler bound to `mds`; `cct` is forwarded to the Dispatcher
+// base. The updater thread is not started here (see init()).
+MetricsHandler::MetricsHandler(CephContext *cct, MDSRank *mds)
+  : Dispatcher(cct), mds(mds)
+{
+}
+
+// Only client metrics updates and MDS pings are fast-dispatched.
+bool MetricsHandler::ms_can_fast_dispatch2(const cref_t<Message> &m) const {
+  switch (m->get_type()) {
+  case CEPH_MSG_CLIENT_METRICS:
+  case MSG_MDS_PING:
+    return true;
+  default:
+    return false;
+  }
+}
+
+// Fast-dispatch entry point: hand the message to ms_dispatch2() and insist
+// that it was consumed.
+void MetricsHandler::ms_fast_dispatch2(const ref_t<Message> &m) {
+  const bool consumed = ms_dispatch2(m);
+  ceph_assert(consumed);
+}
+
+// Route a message to its handler. Client metrics are accepted only from
+// client peers and pings only from MDS peers; returns false for anything
+// else so that the message is left unhandled.
+bool MetricsHandler::ms_dispatch2(const ref_t<Message> &m) {
+  switch (m->get_type()) {
+  case CEPH_MSG_CLIENT_METRICS:
+    if (m->get_connection()->get_peer_type() != CEPH_ENTITY_TYPE_CLIENT) {
+      return false;
+    }
+    handle_client_metrics(ref_cast<MClientMetrics>(m));
+    return true;
+
+  case MSG_MDS_PING: {
+    if (m->get_connection()->get_peer_type() != CEPH_ENTITY_TYPE_MDS) {
+      return false;
+    }
+    // a ping must be an MMDSOp; log the actual type before asserting
+    const Message *raw = m.get();
+    const auto *mds_op = dynamic_cast<const MMDSOp*>(raw);
+    if (mds_op == nullptr) {
+      dout(0) << typeid(*raw).name() << " is not an MMDSOp type" << dendl;
+    }
+    ceph_assert(mds_op);
+    handle_mds_ping(ref_cast<MMDSPing>(m));
+    return true;
+  }
+
+  default:
+    return false;
+  }
+}
+
+void MetricsHandler::init() {  // start the updater thread that periodically calls update_rank0()
+  dout(10) << dendl;
+  // the thread captures `this` and loops until it observes `stopping`; shutdown() sets the flag and joins it
+  updater = std::thread([this]() {
+      std::unique_lock locker(lock);
+      while (!stopping) {
+        double after = g_conf().get_val<std::chrono::seconds>("mds_metrics_update_interval").count();  // interval in seconds, re-read on every iteration
+        locker.unlock();  // the lock is not held while sleeping
+        sleep(after);
+        locker.lock();  // update_rank0() and the `stopping` check both run with the lock held
+        update_rank0();  // NOTE(review): `stopping` is only re-checked after this call, so one update may still be sent after shutdown() sets it -- confirm intended
+      }
+    });
+}
+
+// Ask the updater thread to stop and wait for it to exit. Must be called at
+// most once (asserts that `stopping` was not already set).
+void MetricsHandler::shutdown() {
+  dout(10) << dendl;
+
+  // flip the flag under the lock, but release the lock before joining so
+  // that the updater thread can re-acquire it and notice the flag.
+  {
+    std::scoped_lock locker(lock);
+    ceph_assert(!stopping);
+    stopping = true;
+  }
+
+  if (!updater.joinable()) {
+    return;
+  }
+  updater.join();
+}
+
+
+// Start tracking metrics for the client behind `session`. A new entry is
+// stamped with the current last_updated_seq; an already existing entry is
+// kept. Either way the entry is marked for a refresh update.
+void MetricsHandler::add_session(Session *session) {
+  ceph_assert(session != nullptr);
+
+  const auto &client = session->info.inst;
+  dout(10) << ": session=" << session << ", client=" << client << dendl;
+
+  std::scoped_lock locker(lock);
+
+  auto inserted = client_metrics_map.emplace(client, std::pair(last_updated_seq, Metrics()));
+  Metrics &metrics = inserted.first->second.second;
+  metrics.update_type = UPDATE_TYPE_REFRESH;
+  dout(20) << ": metrics=" << metrics << dendl;
+}
+
+// Stop tracking metrics for the client behind `session`. The entry is either
+// dropped right away or zeroed and flagged so that the next update sent to
+// rank 0 carries a remove for it.
+void MetricsHandler::remove_session(Session *session) {
+  ceph_assert(session != nullptr);
+
+  const auto &client = session->info.inst;
+  dout(10) << ": session=" << session << ", client=" << client << dendl;
+
+  std::scoped_lock locker(lock);
+
+  auto entry = client_metrics_map.find(client);
+  if (entry == client_metrics_map.end()) {
+    return;
+  }
+
+  // the entry still carries the sequence it was stamped with (add_session()
+  // or reset_seq()) and that sequence has not advanced since, i.e. no
+  // sequenced update went out in between. rank 0 has therefore not
+  // witnessed this client session from this rank, so cut the update short
+  // and just forget the entry locally.
+  const version_t entry_seq = entry->second.first;
+  if (entry_seq == last_updated_seq) {
+    dout(10) << ": metric lus=" << entry_seq << ", last_updated_seq=" << last_updated_seq
+             << dendl;
+    client_metrics_map.erase(entry);
+    return;
+  }
+
+  // otherwise flag the entry for removal and zero out all metrics
+  Metrics &metrics = entry->second.second;
+  metrics.update_type = UPDATE_TYPE_REMOVE;
+  metrics.cap_hit_metric = {};
+  metrics.read_latency_metric = {};
+  metrics.write_latency_metric = {};
+  metrics.metadata_latency_metric = {};
+  metrics.dentry_lease_metric = {};
+  metrics.opened_files_metric = {};
+  metrics.pinned_icaps_metric = {};
+  metrics.opened_inodes_metric = {};
+  metrics.read_io_sizes_metric = {};
+  metrics.write_io_sizes_metric = {};
+}
+
+// Record the sequence number to put into the next update sent to rank 0.
+// Does no locking of its own.
+void MetricsHandler::set_next_seq(version_t seq) {
+  dout(20) << ": current sequence number " << next_seq
+           << ", setting next sequence number " << seq << dendl;
+
+  next_seq = seq;
+}
+
+// Forget the sequence number handed out by rank 0 (next_seq goes back to 0)
+// and re-stamp every tracked client with the current last_updated_seq.
+// Does no locking of its own.
+void MetricsHandler::reset_seq() {
+  dout(10) << ": last_updated_seq=" << last_updated_seq << dendl;
+
+  set_next_seq(0);
+
+  for (auto &entry : client_metrics_map) {
+    dout(10) << ": reset last updated seq for client addr=" << entry.first << dendl;
+    entry.second.first = last_updated_seq;
+  }
+}
+
+// Store the cap hit/miss counters reported by a client in its metrics entry
+// and mark the entry as refreshed. Payloads from clients without an entry in
+// client_metrics_map are dropped.
+void MetricsHandler::handle_payload(Session *session, const CapInfoPayload &payload) {
+  dout(20) << ": type=" << payload.get_type()
+           << ", session=" << session
+           << ", hits=" << payload.cap_hits
+           << ", misses=" << payload.cap_misses << dendl;
+
+  if (auto entry = client_metrics_map.find(session->info.inst);
+      entry != client_metrics_map.end()) {
+    Metrics &client_metrics = entry->second.second;
+    client_metrics.update_type = UPDATE_TYPE_REFRESH;
+
+    auto &cap_hit = client_metrics.cap_hit_metric;
+    cap_hit.hits = payload.cap_hits;
+    cap_hit.misses = payload.cap_misses;
+  }
+}
+
+// Store the read latency figures reported by a client in its metrics entry
+// and mark the entry as refreshed. Payloads from clients without an entry in
+// client_metrics_map are dropped.
+void MetricsHandler::handle_payload(Session *session, const ReadLatencyPayload &payload) {
+  dout(20) << ": type=" << payload.get_type()
+           << ", session=" << session
+           << ", latency=" << payload.lat
+           << ", avg=" << payload.mean
+           << ", sq_sum=" << payload.sq_sum
+           << ", count=" << payload.count << dendl;
+
+  if (auto entry = client_metrics_map.find(session->info.inst);
+      entry != client_metrics_map.end()) {
+    Metrics &client_metrics = entry->second.second;
+    client_metrics.update_type = UPDATE_TYPE_REFRESH;
+
+    auto &read_latency = client_metrics.read_latency_metric;
+    read_latency.lat = payload.lat;
+    read_latency.mean = payload.mean;
+    read_latency.sq_sum = payload.sq_sum;
+    read_latency.count = payload.count;
+    read_latency.updated = true;
+  }
+}
+
+// Store the write latency figures reported by a client in its metrics entry
+// and mark the entry as refreshed. Payloads from clients without an entry in
+// client_metrics_map are dropped.
+void MetricsHandler::handle_payload(Session *session, const WriteLatencyPayload &payload) {
+  dout(20) << ": type=" << payload.get_type()
+           << ", session=" << session
+           << ", latency=" << payload.lat
+           << ", avg=" << payload.mean
+           << ", sq_sum=" << payload.sq_sum
+           << ", count=" << payload.count << dendl;
+
+  if (auto entry = client_metrics_map.find(session->info.inst);
+      entry != client_metrics_map.end()) {
+    Metrics &client_metrics = entry->second.second;
+    client_metrics.update_type = UPDATE_TYPE_REFRESH;
+
+    auto &write_latency = client_metrics.write_latency_metric;
+    write_latency.lat = payload.lat;
+    write_latency.mean = payload.mean;
+    write_latency.sq_sum = payload.sq_sum;
+    write_latency.count = payload.count;
+    write_latency.updated = true;
+  }
+}
+
+// Store the metadata latency figures reported by a client in its metrics
+// entry and mark the entry as refreshed. Payloads from clients without an
+// entry in client_metrics_map are dropped.
+void MetricsHandler::handle_payload(Session *session, const MetadataLatencyPayload &payload) {
+  dout(20) << ": type=" << payload.get_type()
+           << ", session=" << session
+           << ", latency=" << payload.lat
+           << ", avg=" << payload.mean
+           << ", sq_sum=" << payload.sq_sum
+           << ", count=" << payload.count << dendl;
+
+  if (auto entry = client_metrics_map.find(session->info.inst);
+      entry != client_metrics_map.end()) {
+    Metrics &client_metrics = entry->second.second;
+    client_metrics.update_type = UPDATE_TYPE_REFRESH;
+
+    auto &metadata_latency = client_metrics.metadata_latency_metric;
+    metadata_latency.lat = payload.lat;
+    metadata_latency.mean = payload.mean;
+    metadata_latency.sq_sum = payload.sq_sum;
+    metadata_latency.count = payload.count;
+    metadata_latency.updated = true;
+  }
+}
+
+// Store the dentry lease hit/miss counters reported by a client in its
+// metrics entry and mark the entry as refreshed. Payloads from clients
+// without an entry in client_metrics_map are dropped.
+void MetricsHandler::handle_payload(Session *session, const DentryLeasePayload &payload) {
+  dout(20) << ": type=" << payload.get_type()
+           << ", session=" << session
+           << ", hits=" << payload.dlease_hits
+           << ", misses=" << payload.dlease_misses << dendl;
+
+  if (auto entry = client_metrics_map.find(session->info.inst);
+      entry != client_metrics_map.end()) {
+    Metrics &client_metrics = entry->second.second;
+    client_metrics.update_type = UPDATE_TYPE_REFRESH;
+
+    auto &dentry_lease = client_metrics.dentry_lease_metric;
+    dentry_lease.hits = payload.dlease_hits;
+    dentry_lease.misses = payload.dlease_misses;
+    dentry_lease.updated = true;
+  }
+}
+
+// Store the opened-files counters reported by a client in its metrics entry
+// and mark the entry as refreshed. Payloads from clients without an entry in
+// client_metrics_map are dropped.
+void MetricsHandler::handle_payload(Session *session, const OpenedFilesPayload &payload) {
+  dout(20) << ": type=" << payload.get_type()
+           << ", session=" << session
+           << ", opened_files=" << payload.opened_files
+           << ", total_inodes=" << payload.total_inodes << dendl;
+
+  if (auto entry = client_metrics_map.find(session->info.inst);
+      entry != client_metrics_map.end()) {
+    Metrics &client_metrics = entry->second.second;
+    client_metrics.update_type = UPDATE_TYPE_REFRESH;
+
+    auto &opened_files = client_metrics.opened_files_metric;
+    opened_files.opened_files = payload.opened_files;
+    opened_files.total_inodes = payload.total_inodes;
+    opened_files.updated = true;
+  }
+}
+
+// Store the pinned-icaps counters reported by a client in its metrics entry
+// and mark the entry as refreshed. Payloads from clients without an entry in
+// client_metrics_map are dropped.
+void MetricsHandler::handle_payload(Session *session, const PinnedIcapsPayload &payload) {
+  dout(20) << ": type=" << payload.get_type()
+           << ", session=" << session
+           << ", pinned_icaps=" << payload.pinned_icaps
+           << ", total_inodes=" << payload.total_inodes << dendl;
+
+  if (auto entry = client_metrics_map.find(session->info.inst);
+      entry != client_metrics_map.end()) {
+    Metrics &client_metrics = entry->second.second;
+    client_metrics.update_type = UPDATE_TYPE_REFRESH;
+
+    auto &pinned_icaps = client_metrics.pinned_icaps_metric;
+    pinned_icaps.pinned_icaps = payload.pinned_icaps;
+    pinned_icaps.total_inodes = payload.total_inodes;
+    pinned_icaps.updated = true;
+  }
+}
+
+// Store the opened-inodes counters reported by a client in its metrics entry
+// and mark the entry as refreshed. Payloads from clients without an entry in
+// client_metrics_map are dropped.
+void MetricsHandler::handle_payload(Session *session, const OpenedInodesPayload &payload) {
+  dout(20) << ": type=" << payload.get_type()
+           << ", session=" << session
+           << ", opened_inodes=" << payload.opened_inodes
+           << ", total_inodes=" << payload.total_inodes << dendl;
+
+  if (auto entry = client_metrics_map.find(session->info.inst);
+      entry != client_metrics_map.end()) {
+    Metrics &client_metrics = entry->second.second;
+    client_metrics.update_type = UPDATE_TYPE_REFRESH;
+
+    auto &opened_inodes = client_metrics.opened_inodes_metric;
+    opened_inodes.opened_inodes = payload.opened_inodes;
+    opened_inodes.total_inodes = payload.total_inodes;
+    opened_inodes.updated = true;
+  }
+}
+
+// Store the read I/O size totals reported by a client in its metrics entry
+// and mark the entry as refreshed. Payloads from clients without an entry in
+// client_metrics_map are dropped.
+void MetricsHandler::handle_payload(Session *session, const ReadIoSizesPayload &payload) {
+  dout(20) << ": type=" << payload.get_type()
+           << ", session=" << session
+           << ", total_ops=" << payload.total_ops
+           << ", total_size=" << payload.total_size << dendl;
+
+  if (auto entry = client_metrics_map.find(session->info.inst);
+      entry != client_metrics_map.end()) {
+    Metrics &client_metrics = entry->second.second;
+    client_metrics.update_type = UPDATE_TYPE_REFRESH;
+
+    auto &read_io_sizes = client_metrics.read_io_sizes_metric;
+    read_io_sizes.total_ops = payload.total_ops;
+    read_io_sizes.total_size = payload.total_size;
+    read_io_sizes.updated = true;
+  }
+}
+
+// Store the write I/O size totals reported by a client in its metrics entry
+// and mark the entry as refreshed. Payloads from clients without an entry in
+// client_metrics_map are dropped.
+void MetricsHandler::handle_payload(Session *session, const WriteIoSizesPayload &payload) {
+  dout(20) << ": type=" << payload.get_type()
+           << ", session=" << session
+           << ", total_ops=" << payload.total_ops
+           << ", total_size=" << payload.total_size << dendl;
+
+  if (auto entry = client_metrics_map.find(session->info.inst);
+      entry != client_metrics_map.end()) {
+    Metrics &client_metrics = entry->second.second;
+    client_metrics.update_type = UPDATE_TYPE_REFRESH;
+
+    auto &write_io_sizes = client_metrics.write_io_sizes_metric;
+    write_io_sizes.total_ops = payload.total_ops;
+    write_io_sizes.total_size = payload.total_size;
+    write_io_sizes.updated = true;
+  }
+}
+
+// Payload types this handler does not know about are logged and otherwise
+// ignored; no metrics are touched.
+void MetricsHandler::handle_payload(Session *session, const UnknownPayload &payload) {
+  dout(5) << ": type=Unknown, session=" << session
+          << ", ignoring unknown payload" << dendl;
+}
+
+// Apply every metric update carried by a client message to that client's
+// metrics entry. Messages that cannot be mapped to a session are ignored.
+void MetricsHandler::handle_client_metrics(const cref_t<MClientMetrics> &m) {
+  std::scoped_lock locker(lock);
+
+  Session *session = mds->get_session(m);
+  dout(20) << ": session=" << session << dendl;
+
+  if (!session) {
+    dout(10) << ": ignoring session less message" << dendl;
+    return;
+  }
+
+  // the visitor selects the handle_payload() overload matching the type
+  // held by each update's payload variant
+  const HandlePayloadVisitor visitor(this, session);
+  for (const auto &update : m->updates) {
+    boost::apply_visitor(visitor, update.payload);
+  }
+}
+
+// A ping carries the sequence number to use for the next update; record it
+// under the lock.
+void MetricsHandler::handle_mds_ping(const cref_t<MMDSPing> &m) {
+  std::scoped_lock locker(lock);
+
+  const version_t seq = m->seq;
+  set_next_seq(seq);
+}
+
+// Track the address of rank 0 across MDS map changes.
+//
+// The sequence number is reset when rank0 is unavailable or a new rank0 mds
+// is chosen -- the new rank0 will assign a starting sequence number when it
+// is ready to process metric updates. This also allows metric remove
+// operations to be cut short and satisfied locally in many cases.
+void MetricsHandler::notify_mdsmap(const MDSMap &mdsmap) {
+  dout(10) << dendl;
+
+  const mds_rank_t rank0 = 0;
+  std::set<mds_rank_t> active_ranks;
+
+  std::scoped_lock locker(lock);
+
+  mdsmap.get_active_mds_set(active_ranks);
+  if (active_ranks.count(rank0) == 0) {
+    dout(10) << ": rank0 is unavailable" << dendl;
+    addr_rank0 = boost::none;
+    reset_seq();
+    return;
+  }
+
+  dout(10) << ": rank0 is mds." << mdsmap.get_mds_info(rank0).name << dendl;
+
+  // update new rank0 address, but only when it actually changed
+  const auto current_addr = mdsmap.get_addrs(rank0);
+  if (addr_rank0 != current_addr) {
+    dout(10) << ": rank0 addr is now " << current_addr << dendl;
+    addr_rank0 = current_addr;
+    reset_seq();
+  }
+}
+
+void MetricsHandler::update_rank0() {  // send the accumulated client metrics to rank 0
+  dout(20) << dendl;
+  // nothing can be sent while no rank 0 address is recorded (see notify_mdsmap())
+  if (!addr_rank0) {
+    dout(20) << ": not yet notified with rank0 address, ignoring" << dendl;
+    return;
+  }
+  // the outgoing message carries next_seq, our rank and a copy of every client's metrics
+  metrics_message_t metrics_message;
+  auto &update_client_metrics_map = metrics_message.client_metrics_map;
+
+  metrics_message.seq = next_seq;
+  metrics_message.rank = mds->get_nodeid();
+
+  for (auto p = client_metrics_map.begin(); p != client_metrics_map.end();) {
+    // copy metrics and update local metrics map as required
+    auto &metrics = p->second.second;
+    update_client_metrics_map.emplace(p->first, metrics);
+    if (metrics.update_type == UPDATE_TYPE_REFRESH) {
+      metrics = {};  // refresh entries stay tracked; the local copy is reset to a default Metrics
+      ++p;
+    } else {
+      p = client_metrics_map.erase(p);  // any other update type: the entry is dropped once copied
+    }
+  }
+
+  // only start incrementing when it's kicked via set_next_seq()
+  if (next_seq != 0) {
+    ++last_updated_seq;
+  }
+
+  dout(20) << ": sending metric updates for " << update_client_metrics_map.size()
+           << " clients to rank 0 (address: " << *addr_rank0 << ") with sequence number "
+           << next_seq << ", last updated sequence number " << last_updated_seq << dendl;
+  // NOTE(review): init()'s updater thread calls this with `lock` held, while the comment on `lock` in MetricsHandler.h says to drop it around send_message_mds() -- confirm
+  mds->send_message_mds(make_message<MMDSMetrics>(std::move(metrics_message)), *addr_rank0);
+}
diff --git a/src/mds/MetricsHandler.h b/src/mds/MetricsHandler.h
new file mode 100644
index 000000000..0b75b0248
--- /dev/null
+++ b/src/mds/MetricsHandler.h
@@ -0,0 +1,114 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_MDS_METRICS_HANDLER_H
+#define CEPH_MDS_METRICS_HANDLER_H
+
+#include <thread>
+#include <utility>
+#include <boost/variant.hpp>
+
+#include "msg/Dispatcher.h"
+#include "common/ceph_mutex.h"
+#include "include/common_fwd.h"
+#include "include/cephfs/metrics/Types.h"
+
+#include "messages/MMDSPing.h"
+#include "messages/MClientMetrics.h"
+
+#include "MDSPerfMetricTypes.h"
+
+class MDSRank;
+class Session;
+
+class MetricsHandler : public Dispatcher {  // Dispatcher collecting per-client metrics and forwarding them to rank 0
+public:
+  MetricsHandler(CephContext *cct, MDSRank *mds);
+  // Dispatcher interface: every message type is advertised as fast-dispatchable
+  bool ms_can_fast_dispatch_any() const override {
+    return true;
+  }
+  bool ms_can_fast_dispatch2(const cref_t<Message> &m) const override;
+  void ms_fast_dispatch2(const ref_t<Message> &m) override;
+  bool ms_dispatch2(const ref_t<Message> &m) override;
+  // connection event hooks: none of them takes any action
+  void ms_handle_connect(Connection *c) override {
+  }
+  bool ms_handle_reset(Connection *c) override {
+    return false;
+  }
+  void ms_handle_remote_reset(Connection *c) override {
+  }
+  bool ms_handle_refused(Connection *c) override {
+    return false;
+  }
+  // start / stop tracking metrics for the client behind a session
+  void add_session(Session *session);
+  void remove_session(Session *session);
+  // init() starts the updater thread; shutdown() stops and joins it
+  void init();
+  void shutdown();
+
+  void notify_mdsmap(const MDSMap &mdsmap);
+
+private:
+  struct HandlePayloadVisitor : public boost::static_visitor<void> {  // forwards each payload alternative to the matching handle_payload() overload
+    MetricsHandler *metrics_handler;
+    Session *session;
+
+    HandlePayloadVisitor(MetricsHandler *metrics_handler, Session *session)
+      : metrics_handler(metrics_handler), session(session) {
+    }
+
+    template <typename ClientMetricPayload>
+    inline void operator()(const ClientMetricPayload &payload) const {
+      metrics_handler->handle_payload(session, payload);
+    }
+  };
+
+  MDSRank *mds;
+  // drop this lock when calling ->send_message_mds() else mds might
+  // deadlock
+  ceph::mutex lock = ceph::make_mutex("MetricsHandler::lock");
+
+  // ISN sent by rank0 pinger is 1
+  version_t next_seq = 0;  // 0 means no sequence number has been assigned (see reset_seq())
+
+  // sequence number incremented on each update sent to rank 0.
+  // this is unrelated to next_seq and is used purely locally
+  // to figure out if a session got added and removed
+  // within an update to rank 0.
+  version_t last_updated_seq = 0;
+
+  std::thread updater;
+  std::map<entity_inst_t, std::pair<version_t, Metrics>> client_metrics_map;  // client -> (last_updated_seq when stamped, metrics)
+
+  // address of rank 0 mds, so that the message can be sent without
+  // acquiring mds_lock. misdirected messages to rank 0 are taken
+  // care of by rank 0.
+  boost::optional<entity_addrvec_t> addr_rank0;
+
+  bool stopping = false;  // set by shutdown(); ends the updater thread's loop
+  // one overload per client metric payload type
+  void handle_payload(Session *session, const CapInfoPayload &payload);
+  void handle_payload(Session *session, const ReadLatencyPayload &payload);
+  void handle_payload(Session *session, const WriteLatencyPayload &payload);
+  void handle_payload(Session *session, const MetadataLatencyPayload &payload);
+  void handle_payload(Session *session, const DentryLeasePayload &payload);
+  void handle_payload(Session *session, const OpenedFilesPayload &payload);
+  void handle_payload(Session *session, const PinnedIcapsPayload &payload);
+  void handle_payload(Session *session, const OpenedInodesPayload &payload);
+  void handle_payload(Session *session, const ReadIoSizesPayload &payload);
+  void handle_payload(Session *session, const WriteIoSizesPayload &payload);
+  void handle_payload(Session *session, const UnknownPayload &payload);
+
+  void set_next_seq(version_t seq);
+  void reset_seq();
+
+  void handle_client_metrics(const cref_t<MClientMetrics> &m);
+  void handle_mds_ping(const cref_t<MMDSPing> &m);
+
+  void update_rank0();
+};
+
+#endif // CEPH_MDS_METRICS_HANDLER_H
diff --git a/src/mds/Migrator.cc b/src/mds/Migrator.cc
new file mode 100644
index 000000000..13bd2652a
--- /dev/null
+++ b/src/mds/Migrator.cc
@@ -0,0 +1,3709 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software 
+ * Foundation.  See file COPYING.
+ * 
+ */
+
+#include "MDSRank.h"
+#include "MDCache.h"
+#include "CInode.h"
+#include "CDir.h"
+#include "CDentry.h"
+#include "Migrator.h"
+#include "Locker.h"
+#include "Server.h"
+
+#include "MDBalancer.h"
+#include "MDLog.h"
+#include "MDSMap.h"
+#include "Mutation.h"
+
+#include "include/filepath.h"
+#include "common/likely.h"
+
+#include "events/EExport.h"
+#include "events/EImportStart.h"
+#include "events/EImportFinish.h"
+#include "events/ESessions.h"
+
+#include "msg/Messenger.h"
+
+#include "messages/MClientCaps.h"
+
+/*
+ * this is what the dir->dir_auth values look like
+ *
+ *   dir_auth  authbits  
+ * export:
+ *   me         me      - before
+ *   me, me     me      - still me, but preparing for export
+ *   me, them   me      - send MExportDir (peer is preparing)
+ *   them, me   me      - journaled EExport
+ *   them       them    - done
+ *
+ * import:
+ *   them       them    - before
+ *   me, them   me      - journaled EImportStart
+ *   me         me      - done
+ *
+ * which implies:
+ *  - auth bit is set if i am listed as first _or_ second dir_auth.
+ */
+
+#include "common/config.h"
+
+
+#define dout_context g_ceph_context
+#define dout_subsys ceph_subsys_mds
+#undef dout_prefix
+#define dout_prefix *_dout << "mds." << mds->get_nodeid() << ".mig " << __func__ << " "
+
+
+// MDSContext bound to a Migrator; the owning MDSRank is reached through it.
+class MigratorContext : public MDSContext {
+public:
+  explicit MigratorContext(Migrator *mig_) : mig(mig_) {
+    ceph_assert(mig != nullptr);
+  }
+
+protected:
+  Migrator *mig;
+
+  MDSRank *get_mds() override {
+    return mig->mds;
+  }
+};
+
+// MDSLogContextBase bound to a Migrator; the owning MDSRank is reached
+// through it.
+class MigratorLogContext : public MDSLogContextBase {
+public:
+  explicit MigratorLogContext(Migrator *mig_) : mig(mig_) {
+    ceph_assert(mig != nullptr);
+  }
+
+protected:
+  Migrator *mig;
+
+  MDSRank *get_mds() override {
+    return mig->mds;
+  }
+};
+
+void Migrator::dispatch(const cref_t<Message> &m)  // route a migration message to its handler by message type
+{
+  switch (m->get_type()) {
+    // import
+  case MSG_MDS_EXPORTDIRDISCOVER:
+    handle_export_discover(ref_cast<MExportDirDiscover>(m));
+    break;
+  case MSG_MDS_EXPORTDIRPREP:
+    handle_export_prep(ref_cast<MExportDirPrep>(m));
+    break;
+  case MSG_MDS_EXPORTDIR:
+    if (unlikely(inject_session_race)) {  // when set, the message is queued for retry instead of being handled now
+      dout(0) << "waiting for inject_session_race" << dendl;
+      mds->wait_for_any_client_connection(new C_MDS_RetryMessage(mds, m));
+    } else {
+      handle_export_dir(ref_cast<MExportDir>(m));
+    }
+    break;
+  case MSG_MDS_EXPORTDIRFINISH:
+    handle_export_finish(ref_cast<MExportDirFinish>(m));
+    break;
+  case MSG_MDS_EXPORTDIRCANCEL:
+    handle_export_cancel(ref_cast<MExportDirCancel>(m));
+    break;
+
+    // export
+  case MSG_MDS_EXPORTDIRDISCOVERACK:
+    handle_export_discover_ack(ref_cast<MExportDirDiscoverAck>(m));
+    break;
+  case MSG_MDS_EXPORTDIRPREPACK:
+    handle_export_prep_ack(ref_cast<MExportDirPrepAck>(m));
+    break;
+  case MSG_MDS_EXPORTDIRACK:
+    handle_export_ack(ref_cast<MExportDirAck>(m));
+    break;
+  case MSG_MDS_EXPORTDIRNOTIFYACK:
+    handle_export_notify_ack(ref_cast<MExportDirNotifyAck>(m));
+    break;
+
+    // export 3rd party (dir_auth adjustments)
+  case MSG_MDS_EXPORTDIRNOTIFY:
+    handle_export_notify(ref_cast<MExportDirNotify>(m));
+    break;
+
+    // caps
+  case MSG_MDS_EXPORTCAPS:
+    handle_export_caps(ref_cast<MExportCaps>(m));
+    break;
+  case MSG_MDS_EXPORTCAPSACK:
+    handle_export_caps_ack(ref_cast<MExportCapsAck>(m));
+    break;
+  case MSG_MDS_GATHERCAPS:
+    handle_gather_caps(ref_cast<MGatherCaps>(m));
+    break;
+  // any other message type is treated as fatal: log it and abort
+  default:
+    derr << "migrator unknown message " << m->get_type() << dendl;
+    ceph_abort_msg("migrator unknown message");
+  }
+}
+
+
+// Deferred call to Migrator::export_empty_import() for `dir`. The dir is
+// pinned (PIN_PTRWAITER) from construction until finish() has run.
+class C_MDC_EmptyImport : public MigratorContext {
+public:
+  C_MDC_EmptyImport(Migrator *m, CDir *d)
+    : MigratorContext(m), dir(d)
+  {
+    dir->get(CDir::PIN_PTRWAITER);
+  }
+
+  void finish(int r) override {
+    mig->export_empty_import(dir);
+    dir->put(CDir::PIN_PTRWAITER);
+  }
+
+private:
+  CDir *dir;
+};
+
+
+// Export the subtree root `dir` to the rank that is authoritative for its
+// inode, provided that we are auth for the dir but not for its inode, the
+// dir is neither freezing nor frozen, holds no head items and is not the
+// root. In every other case this only logs the reason and returns.
+void Migrator::export_empty_import(CDir *dir)
+{
+  dout(7) << *dir << dendl;
+  ceph_assert(dir->is_subtree_root());
+
+  if (dir->inode->is_auth()) {
+    dout(7) << " inode is auth" << dendl;
+    return;
+  }
+  if (!dir->is_auth()) {
+    dout(7) << " not auth" << dendl;
+    return;
+  }
+  if (dir->is_freezing() || dir->is_frozen()) {
+    dout(7) << " freezing or frozen" << dendl;
+    return;
+  }
+  if (dir->get_num_head_items() > 0) {
+    dout(7) << " not actually empty" << dendl;
+    return;
+  }
+  if (dir->inode->is_root()) {
+    dout(7) << " root" << dendl;
+    return;
+  }
+
+  mds_rank_t dest = dir->inode->authority().first;
+  //if (mds->is_shutting_down()) dest = 0;  // this is more efficient.
+
+  dout(7) << " really empty, exporting to " << dest << dendl;
+  // use ceph_assert like the check at the top of this function: a plain
+  // assert() is compiled out when NDEBUG is defined, which would silently
+  // drop this sanity check.
+  ceph_assert(dest != mds->get_nodeid());
+
+  dout(7) << "exporting to mds." << dest
+          << " empty import " << *dir << dendl;
+  export_dir(dir, dest);
+}
+
+// Cancel exports that are stuck in the discovering/freezing state: the
+// cumulative auth pin count of the freezing tree has not changed for longer
+// than mds_freeze_tree_timeout, and either remote waiters exist or the
+// parent dir is itself freezing.
+void Migrator::find_stale_export_freeze()
+{
+  const utime_t now = ceph_clock_now();
+  utime_t cutoff = now;
+  cutoff -= g_conf()->mds_freeze_tree_timeout;
+
+  /*
+   * We could have situations like:
+   *
+   * - mds.0 authpins an item in subtree A
+   * - mds.0 sends request to mds.1 to authpin an item in subtree B
+   * - mds.0 freezes subtree A
+   * - mds.1 authpins an item in subtree B
+   * - mds.1 sends request to mds.0 to authpin an item in subtree A
+   * - mds.1 freezes subtree B
+   * - mds.1 receives the remote authpin request from mds.0
+   *   (wait because subtree B is freezing)
+   * - mds.0 receives the remote authpin request from mds.1
+   *   (wait because subtree A is freezing)
+   *
+   *
+   * - client request authpins items in subtree B
+   * - freeze subtree B
+   * - import subtree A which is parent of subtree B
+   *   (authpins parent inode of subtree B, see CDir::set_dir_auth())
+   * - freeze subtree A
+   * - client request tries authpinning items in subtree A
+   *   (wait because subtree A is freezing)
+   */
+  auto next = export_state.begin();
+  while (next != export_state.end()) {
+    // step past the current entry before examining it, so that the walk
+    // does not depend on the entry once export_try_cancel() has run.
+    auto current = next++;
+    CDir *dir = current->first;
+    export_state_t &es = current->second;
+
+    const bool freeze_pending =
+      es.state == EXPORT_DISCOVERING || es.state == EXPORT_FREEZING;
+    if (!freeze_pending)
+      continue;
+
+    ceph_assert(dir->freeze_tree_state);
+
+    // any change of the pin count restarts the timeout
+    if (es.last_cum_auth_pins != dir->freeze_tree_state->auth_pins) {
+      es.last_cum_auth_pins = dir->freeze_tree_state->auth_pins;
+      es.last_cum_auth_pins_change = now;
+      continue;
+    }
+    if (es.last_cum_auth_pins_change >= cutoff)
+      continue;
+
+    const bool blocked =
+      es.num_remote_waiters > 0 ||
+      (!dir->inode->is_root() && dir->get_parent_dir()->is_freezing());
+    if (blocked) {
+      export_try_cancel(dir);
+    }
+  }
+}
+
+void Migrator::export_try_cancel(CDir *dir, bool notify_peer)  // abort the in-progress export of dir, undoing what its current state set up
+{
+  dout(10) << *dir << dendl;
+  // dir must have an entry in export_state
+  map<CDir*,export_state_t>::iterator it = export_state.find(dir);
+  ceph_assert(it != export_state.end());
+  // remember the state on entry: the cases below overwrite it->second.state
+  int state = it->second.state;
+  switch (state) {
+  case EXPORT_LOCKING:
+    dout(10) << "export state=locking : dropping locks and removing auth_pin" << dendl;
+    num_locking_exports--;
+    it->second.state = EXPORT_CANCELLED;
+    dir->auth_unpin(this);
+    break;
+  case EXPORT_DISCOVERING:
+    dout(10) << "export state=discovering : canceling freeze and removing auth_pin" << dendl;
+    it->second.state = EXPORT_CANCELLED;
+    dir->unfreeze_tree();  // cancel the freeze
+    dir->auth_unpin(this);
+    if (notify_peer &&
+	(!mds->is_cluster_degraded() ||
+	 mds->mdsmap->is_clientreplay_or_active_or_stopping(it->second.peer))) // tell them.
+      mds->send_message_mds(make_message<MExportDirCancel>(dir->dirfrag(),
+							   it->second.tid),
+			    it->second.peer);
+    break;
+
+  case EXPORT_FREEZING:
+    dout(10) << "export state=freezing : canceling freeze" << dendl;
+    it->second.state = EXPORT_CANCELLED;
+    dir->unfreeze_tree();  // cancel the freeze
+    if (dir->is_subtree_root())
+      mdcache->try_subtree_merge(dir);
+    if (notify_peer &&
+	(!mds->is_cluster_degraded() ||
+	 mds->mdsmap->is_clientreplay_or_active_or_stopping(it->second.peer))) // tell them.
+      mds->send_message_mds(make_message<MExportDirCancel>(dir->dirfrag(),
+							   it->second.tid),
+			    it->second.peer);
+    break;
+
+    // NOTE: state order reversal, warning comes after prepping
+  case EXPORT_WARNING:
+    dout(10) << "export state=warning : unpinning bounds, unfreezing, notifying" << dendl;
+    it->second.state = EXPORT_CANCELLING;  // unlike prepping, a warning export only becomes CANCELLING here
+    // fall-thru
+
+  case EXPORT_PREPPING:
+    if (state != EXPORT_WARNING) {  // reached directly, not via the fall-through above
+      dout(10) << "export state=prepping : unpinning bounds, unfreezing" << dendl;
+      it->second.state = EXPORT_CANCELLED;
+    }
+
+    {
+      // unpin bounds
+      set<CDir*> bounds;
+      mdcache->get_subtree_bounds(dir, bounds);
+      for (set<CDir*>::iterator q = bounds.begin();
+          q != bounds.end();
+          ++q) {
+        CDir *bd = *q;
+        bd->put(CDir::PIN_EXPORTBOUND);
+        bd->state_clear(CDir::STATE_EXPORTBOUND);
+      }
+      if (state == EXPORT_WARNING) {
+	// notify bystanders
+	export_notify_abort(dir, it->second, bounds);
+	// process delayed expires
+	mdcache->process_delayed_expire(dir);
+      }
+    }
+    dir->unfreeze_tree();
+    mdcache->try_subtree_merge(dir);
+    if (notify_peer &&
+	(!mds->is_cluster_degraded() ||
+	 mds->mdsmap->is_clientreplay_or_active_or_stopping(it->second.peer))) // tell them.
+      mds->send_message_mds(make_message<MExportDirCancel>(dir->dirfrag(),
+							   it->second.tid),
+			    it->second.peer);
+    break;
+
+  case EXPORT_EXPORTING:
+    dout(10) << "export state=exporting : reversing, and unfreezing" << dendl;
+    it->second.state = EXPORT_CANCELLING;
+    export_reverse(dir, it->second);
+    break;
+
+  case EXPORT_LOGGINGFINISH:
+  case EXPORT_NOTIFYING:
+    dout(10) << "export state=loggingfinish|notifying : ignoring dest failure, we were successful." << dendl;
+    // leave export_state, don't clean up now.
+    break;
+  case EXPORT_CANCELLING:  // already being cancelled: nothing more to undo in the switch
+    break;
+
+  default:
+    ceph_abort();
+  }
+
+  // finish clean-up?
+  if (it->second.state == EXPORT_CANCELLING ||
+      it->second.state == EXPORT_CANCELLED) {
+    MutationRef mut;
+    mut.swap(it->second.mut);  // take the mutation out of the entry before the entry may be erased
+    // only CANCELLED entries are finished (and erased) here; CANCELLING entries stay in export_state
+    if (it->second.state == EXPORT_CANCELLED) {
+      export_cancel_finish(it);
+    }
+
+    // drop locks
+    if (state == EXPORT_LOCKING || state == EXPORT_DISCOVERING) {
+      MDRequestRef mdr = static_cast<MDRequestImpl*>(mut.get());  // in these two states mut is treated as an MDRequestImpl
+      ceph_assert(mdr);
+      mdcache->request_kill(mdr);
+    } else if (mut) {
+      mds->locker->drop_locks(mut.get());
+      mut->cleanup();
+    }
+
+    mdcache->show_subtrees();
+
+    maybe_do_queued_export();
+  }
+}
+
+/*
+ * Last step of cancelling an export: forget its export_state entry,
+ * clear the "exporting" mark on the dirfrag and, if this export was a
+ * child of a split export, tell the parent that this child is done
+ * (unsuccessfully).
+ *
+ * `it` is erased from export_state and must not be used afterwards.
+ */
+void Migrator::export_cancel_finish(export_state_iterator& it)
+{
+  CDir *const dir = it->first;
+  auto& stat = it->second;
+
+  // pull out everything we still need before the entry goes away
+  const bool was_cancelling = (stat.state == EXPORT_CANCELLING);
+  auto parent = std::move(stat.parent);
+  total_exporting_size -= stat.approx_size;
+
+  export_state.erase(it);
+
+  ceph_assert(dir->state_test(CDir::STATE_EXPORTING));
+  dir->clear_exporting();
+
+  // pinned by Migrator::export_notify_abort()
+  if (was_cancelling)
+    dir->auth_unpin(this);
+
+  // send pending import_maps?  (these need to go out when all exports have finished.)
+  mdcache->maybe_send_pending_resolves();
+
+  if (parent)
+    child_export_finish(parent, false);
+}
+
+// ==========================================================
+// mds failure handling
+
+/**
+ * React to the failure (or stop) of MDS rank `who`.
+ *
+ * Walks every in-progress export and import:
+ *  - exports whose target is `who` (unless already cancelling), and
+ *    exports still in the locking/discovering/freezing/prepping states
+ *    regardless of target, are cancelled via export_try_cancel();
+ *  - for the remaining exports where `who` is not the target, any
+ *    warning/notify ack we were waiting on from `who` is faked;
+ *  - imports whose exporter is `who` are reversed or otherwise resolved
+ *    according to their current state;
+ *  - for the remaining imports, `who` is dropped from the bystander set
+ *    and, for an aborting import, its notify ack is faked.
+ */
+void Migrator::handle_mds_failure_or_stop(mds_rank_t who)
+{
+  dout(5) << who << dendl;
+
+  // check my exports
+
+  // first add an extra auth_pin on any freezes, so that canceling a
+  // nested freeze doesn't complete one further up the hierarchy and
+  // confuse us.  we'll remove it after canceling the
+  // freeze.  this way no freeze completions run before we want them
+  // to.
+  std::vector<CDir*> pinned_dirs;
+  for (map<CDir*,export_state_t>::iterator p = export_state.begin();
+       p != export_state.end();
+       ++p) {
+    if (p->second.state == EXPORT_FREEZING) {
+      CDir *dir = p->first;
+      dout(10) << "adding temp auth_pin on freezing " << *dir << dendl;
+      dir->auth_pin(this);
+      pinned_dirs.push_back(dir);
+    }
+  }
+
+  map<CDir*,export_state_t>::iterator p = export_state.begin();
+  while (p != export_state.end()) {
+    // remember the successor now: the handlers below may erase *p
+    // (e.g. export_cancel_finish() erases its iterator from export_state)
+    map<CDir*,export_state_t>::iterator next = p;
+    ++next;
+    CDir *dir = p->first;
+    
+    // abort exports:
+    //  - that are going to the failed node
+    //  - that aren't frozen yet (to avoid auth_pin deadlock)
+    //  - they haven't prepped yet (they may need to discover bounds to do that)
+    if ((p->second.peer == who &&
+	 p->second.state != EXPORT_CANCELLING) ||
+	p->second.state == EXPORT_LOCKING ||
+	p->second.state == EXPORT_DISCOVERING ||
+	p->second.state == EXPORT_FREEZING ||
+	p->second.state == EXPORT_PREPPING) {
+      // the guy i'm exporting to failed, or we're just freezing.
+      dout(10) << "cleaning up export state (" << p->second.state << ")"
+	       << get_export_statename(p->second.state) << " of " << *dir << dendl;
+      export_try_cancel(dir);
+    } else if (p->second.peer != who) {
+      // bystander failed.
+      // set::erase() returns the number of elements removed, so each
+      // branch below only runs if we really were waiting on `who`.
+      if (p->second.warning_ack_waiting.erase(who)) {
+	if (p->second.state == EXPORT_WARNING) {
+	  p->second.notify_ack_waiting.erase(who);   // they won't get a notify either.
+	  // exporter waiting for warning acks, let's fake theirs.
+	  dout(10) << "faking export_warning_ack from mds." << who
+		   << " on " << *dir << " to mds." << p->second.peer
+		   << dendl;
+	  if (p->second.warning_ack_waiting.empty())
+	    export_go(dir);
+	}
+      }
+      if (p->second.notify_ack_waiting.erase(who)) {
+	// exporter is waiting for notify acks, fake it
+	dout(10) << "faking export_notify_ack from mds." << who
+		 << " on " << *dir << " to mds." << p->second.peer
+		 << dendl;
+	if (p->second.state == EXPORT_NOTIFYING) {
+	  if (p->second.notify_ack_waiting.empty())
+	    export_finish(dir);
+	} else if (p->second.state == EXPORT_CANCELLING) {
+	  if (p->second.notify_ack_waiting.empty()) {
+	    export_cancel_finish(p);
+	  }
+	}
+      }
+    }
+    
+    // next!
+    p = next;
+  }
+
+
+  // check my imports
+  map<dirfrag_t,import_state_t>::iterator q = import_state.begin();
+  while (q != import_state.end()) {
+    // as above, save the successor before touching the entry
+    // (presumably the import_reverse_*() helpers can remove *q -- confirm)
+    map<dirfrag_t,import_state_t>::iterator next = q;
+    ++next;
+    dirfrag_t df = q->first;
+    // either lookup may come back null; the states that need them assert below
+    CInode *diri = mdcache->get_inode(df.ino);
+    CDir *dir = mdcache->get_dirfrag(df);
+
+    if (q->second.peer == who) {
+      // the exporter itself went away
+      if (dir)
+	dout(10) << "cleaning up import state (" << q->second.state << ")"
+		 << get_import_statename(q->second.state) << " of " << *dir << dendl;
+      else
+	dout(10) << "cleaning up import state (" << q->second.state << ")"
+		 << get_import_statename(q->second.state) << " of " << df << dendl;
+
+      switch (q->second.state) {
+      case IMPORT_DISCOVERING:
+	dout(10) << "import state=discovering : clearing state" << dendl;
+	import_reverse_discovering(df);
+	break;
+
+      case IMPORT_DISCOVERED:
+	ceph_assert(diri);
+	dout(10) << "import state=discovered : unpinning inode " << *diri << dendl;
+	import_reverse_discovered(df, diri);
+	break;
+
+      case IMPORT_PREPPING:
+	ceph_assert(dir);
+	dout(10) << "import state=prepping : unpinning base+bounds " << *dir << dendl;
+	import_reverse_prepping(dir, q->second);
+	break;
+
+      case IMPORT_PREPPED:
+	ceph_assert(dir);
+	dout(10) << "import state=prepped : unpinning base+bounds, unfreezing " << *dir << dendl;
+	{
+	  set<CDir*> bounds;
+	  mdcache->get_subtree_bounds(dir, bounds);
+	  import_remove_pins(dir, bounds);
+	  
+	  // adjust auth back to the exporter
+	  mdcache->adjust_subtree_auth(dir, q->second.peer);
+
+	  // notify bystanders ; wait in aborting state
+	  q->second.state = IMPORT_ABORTING;
+	  import_notify_abort(dir, bounds);
+	  // test hook: abort the daemon here when mds_kill_import_at == 10
+	  ceph_assert(g_conf()->mds_kill_import_at != 10);
+	}
+	break;
+
+      case IMPORT_LOGGINGSTART:
+	ceph_assert(dir);
+	dout(10) << "import state=loggingstart : reversing import on " << *dir << dendl;
+	import_reverse(dir);
+	break;
+
+      case IMPORT_ACKING:
+	ceph_assert(dir);
+	// hrm.  make this an ambiguous import, and wait for exporter recovery to disambiguate
+	dout(10) << "import state=acking : noting ambiguous import " << *dir << dendl;
+	{
+	  set<CDir*> bounds;
+	  mdcache->get_subtree_bounds(dir, bounds);
+	  mdcache->add_ambiguous_import(dir, bounds);
+	}
+	break;
+	
+      case IMPORT_FINISHING:
+	ceph_assert(dir);
+	dout(10) << "import state=finishing : finishing import on " << *dir << dendl;
+	import_finish(dir, true);
+	break;
+
+      case IMPORT_ABORTING:
+	ceph_assert(dir);
+	dout(10) << "import state=aborting : ignoring repeat failure " << *dir << dendl;
+	break;
+      }
+    } else {
+      // a bystander of this import went away: stop waiting for it
+      auto bystanders_entry = q->second.bystanders.find(who);
+      if (bystanders_entry != q->second.bystanders.end()) {
+	q->second.bystanders.erase(bystanders_entry);
+	if (q->second.state == IMPORT_ABORTING) {
+	  ceph_assert(dir);
+	  dout(10) << "faking export_notify_ack from mds." << who
+		   << " on aborting import " << *dir << " from mds." << q->second.peer
+		   << dendl;
+	  if (q->second.bystanders.empty())
+	    import_reverse_unfreeze(dir);
+	}
+      }
+    }
+
+    // next!
+    q = next;
+  }
+
+  // drop the temporary pins taken on freezing exports at the top
+  for (const auto& dir : pinned_dirs) {
+    dout(10) << "removing temp auth_pin on " << *dir << dendl;
+    dir->auth_unpin(this);
+  }  
+}
+
+
+
+/* Log (debug level 10) one line for every entry in import_state. */
+void Migrator::show_importing()
+{
+  dout(10) << dendl;
+  for (const auto& [df, stat] : import_state) {
+    // the dirfrag is not necessarily in cache; print it only when it is
+    if (CDir *dir = mdcache->get_dirfrag(df); dir) {
+      dout(10) << " importing from " << stat.peer
+	       << ": (" << stat.state << ") " << get_import_statename(stat.state)
+	       << " " << df << " " << *dir << dendl;
+    } else {
+      dout(10) << " importing from " << stat.peer
+	       << ": (" << stat.state << ") " << get_import_statename(stat.state)
+	       << " " << df << dendl;
+    }
+  }
+}
+
+/* Log (debug level 10) one line for every entry in export_state. */
+void Migrator::show_exporting()
+{
+  dout(10) << dendl;
+  for (auto p = export_state.begin(); p != export_state.end(); ++p) {
+    CDir *exporting = p->first;
+    const auto& stat = p->second;
+    dout(10) << " exporting to " << stat.peer
+	     << ": (" << stat.state << ") " << get_export_statename(stat.state)
+	     << " " << exporting->dirfrag() << " " << *exporting << dendl;
+  }
+}
+
+
+
+/*
+ * Consistency check of import_state / export_state against the cache.
+ * Only runs when mds debug level 5 is being gathered; violations trip
+ * ceph_assert.
+ */
+void Migrator::audit()
+{
+  if (!g_conf()->subsys.should_gather<ceph_subsys_mds, 5>())
+    return;  // hrm.
+
+  // import_state
+  show_importing();
+  for (const auto& [df, stat] : import_state) {
+    // nothing to verify while still discovering
+    if (stat.state == IMPORT_DISCOVERING)
+      continue;
+
+    // once discovered, the base inode must be in cache
+    if (stat.state == IMPORT_DISCOVERED) {
+      CInode *in = mdcache->get_inode(df.ino);
+      ceph_assert(in);
+      continue;
+    }
+
+    // every later state requires the dirfrag itself
+    CDir *dir = mdcache->get_dirfrag(df);
+    ceph_assert(dir);
+
+    if (stat.state == IMPORT_PREPPING)
+      continue;
+
+    if (stat.state == IMPORT_ABORTING) {
+      ceph_assert(!dir->is_ambiguous_dir_auth());
+      ceph_assert(dir->get_dir_auth().first != mds->get_nodeid());
+      continue;
+    }
+
+    ceph_assert(dir->is_ambiguous_dir_auth());
+    ceph_assert(dir->authority().first  == mds->get_nodeid() ||
+	   dir->authority().second == mds->get_nodeid());
+  }
+
+  // export_state
+  show_exporting();
+  for (const auto& [dir, stat] : export_state) {
+    switch (stat.state) {
+    case EXPORT_LOCKING:
+    case EXPORT_DISCOVERING:
+    case EXPORT_FREEZING:
+    case EXPORT_CANCELLING:
+      // auth is not checked for these states
+      continue;
+    default:
+      break;
+    }
+    ceph_assert(dir->is_ambiguous_dir_auth());
+    ceph_assert(dir->authority().first  == mds->get_nodeid() ||
+	   dir->authority().second == mds->get_nodeid());
+  }
+
+  // ambiguous+me subtrees should be importing|exporting
+
+  // write me
+}
+
+
+
+
+
+// ==========================================================
+// EXPORT
+
+/*
+ * Queue an export of dir to rank dest instead of starting it right away,
+ * then give the queue a chance to run.
+ */
+void Migrator::export_dir_nicely(CDir *dir, mds_rank_t dest)
+{
+  dout(7) << *dir << " to " << dest << dendl;
+
+  // enqueue
+  export_queue.emplace_back(dir->dirfrag(), dest);
+  maybe_do_queued_export();
+}
+
+/*
+ * Start queued exports for as long as the size budget allows:
+ * total in-flight size must stay below 2 * max_export_size, with
+ * max_export_size of headroom reserved for every export still in the
+ * locking state plus the one about to be started.
+ */
+void Migrator::maybe_do_queued_export()
+{
+  // re-entrancy guard: nested calls made while the loop below is
+  // running return immediately
+  static bool running;
+  if (running)
+    return;
+  running = true;
+
+  const uint64_t max_total_size = max_export_size * 2;
+
+  for (;;) {
+    if (export_queue.empty())
+      break;
+    if (max_total_size <= total_exporting_size)
+      break;
+    if (max_total_size - total_exporting_size <
+	max_export_size * (num_locking_exports + 1))
+      break;
+
+    const auto [df, dest] = export_queue.front();
+    export_queue.pop_front();
+
+    // the dirfrag may be gone from cache, or no longer ours
+    CDir *dir = mdcache->get_dirfrag(df);
+    if (!dir || !dir->is_auth())
+      continue;
+
+    dout(7) << "nicely exporting to mds." << dest << " " << *dir << dendl;
+
+    export_dir(dir, dest);
+  }
+
+  running = false;
+}
+
+
+
+
+/*
+ * Completion that calls Migrator::export_frozen() for (dir, tid) when it
+ * finishes with a non-negative result.  The dirfrag is held with
+ * PIN_PTRWAITER from construction until finish() has run.
+ */
+class C_MDC_ExportFreeze : public MigratorContext {
+  CDir *base;           // dir i'm exporting
+  uint64_t export_tid;  // tid of the export this waiter belongs to
+public:
+  C_MDC_ExportFreeze(Migrator *m, CDir *e, uint64_t t)
+    : MigratorContext(m), base(e), export_tid(t)
+  {
+    base->get(CDir::PIN_PTRWAITER);
+  }
+
+  void finish(int r) override
+  {
+    const bool ok = (r >= 0);
+    if (ok)
+      mig->export_frozen(base, export_tid);
+    base->put(CDir::PIN_PTRWAITER);
+  }
+};
+
+
+/*
+ * Try, without blocking, to take every lock the export of dir needs and
+ * record them in mut:
+ *   - rdlock on the dirfragtreelock of each would-be bound inode and of
+ *     dir's own inode,
+ *   - rdlock on the snaplock of dir's inode and all projected ancestors,
+ *   - forced wrlock on dir's inode filelock and nestlock.
+ *
+ * Returns false (taking no wrlocks) if any of them is unavailable.
+ */
+bool Migrator::export_try_grab_locks(CDir *dir, MutationRef& mut)
+{
+  CInode *diri = dir->get_inode();
+
+  const bool wrlockable =
+    diri->filelock.can_wrlock(diri->get_loner()) &&
+    diri->nestlock.can_wrlock(diri->get_loner());
+  if (!wrlockable)
+    return false;
+
+  // distinct inodes of all would-be subtree bounds
+  set<CDir*> wouldbe_bounds;
+  mdcache->get_wouldbe_subtree_bounds(dir, wouldbe_bounds);
+  set<CInode*> bound_inodes;
+  for (CDir *bound : wouldbe_bounds)
+    bound_inodes.insert(bound->get_inode());
+
+  MutationImpl::LockOpVec lov;
+  for (CInode *bi : bound_inodes)
+    lov.add_rdlock(&bi->dirfragtreelock);
+
+  lov.add_rdlock(&diri->dirfragtreelock);
+
+  // walk up through the projected parents, rdlocking each snaplock
+  CInode *cur = diri;
+  CDentry *pdn = nullptr;
+  do {
+    lov.add_rdlock(&cur->snaplock);
+    pdn = cur->get_projected_parent_dn();
+    if (pdn)
+      cur = pdn->get_dir()->get_inode();
+  } while (pdn);
+
+  if (!mds->locker->rdlock_try_set(lov, mut))
+    return false;
+
+  mds->locker->wrlock_force(&diri->filelock, mut);
+  mds->locker->wrlock_force(&diri->nestlock, mut);
+
+  return true;
+}
+
+
+/** export_dir(dir, dest)
+ * public method to initiate an export of the subtree rooted at dir to
+ * rank dest.  dir must be auth here and dest must not be this rank.
+ *
+ * The request is dropped, with a level-7 log line giving the reason,
+ * if any of the precondition checks below fails.  Otherwise the dirfrag
+ * is auth-pinned and marked exporting, an internal EXPORTDIR request is
+ * created and dispatched, and the export enters EXPORT_LOCKING.
+ */
+void Migrator::export_dir(CDir *dir, mds_rank_t dest)
+{
+  ceph_assert(dir->is_auth());
+  ceph_assert(dest != mds->get_nodeid());
+
+  CDir* parent = dir->inode->get_projected_parent_dir();
+
+  // ---- preconditions: first failing check logs why and bails out ----
+  if (!mds->is_stopping() && !dir->is_exportable(dest) && dir->get_num_head_items() > 0) {
+    dout(7) << "Cannot export to mds." << dest << " " << *dir << ": dir is export pinned" << dendl;
+    return;
+  }
+  if (!mds->is_active() && !mds->is_stopping()) {
+    dout(7) << "Cannot export to mds." << dest << " " << *dir << ": not active" << dendl;
+    return;
+  }
+  if (mdcache->is_readonly()) {
+    dout(7) << "Cannot export to mds." << dest << " " << *dir << ": read-only FS, no exports for now" << dendl;
+    return;
+  }
+  if (!mds->mdsmap->is_active(dest)) {
+    dout(7) << "Cannot export to mds." << dest << " " << *dir << ": destination not active" << dendl;
+    return;
+  }
+  if (mds->is_cluster_degraded()) {
+    dout(7) << "Cannot export to mds." << dest << " " << *dir << ": cluster degraded" << dendl;
+    return;
+  }
+  if (dir->inode->is_system()) {
+    dout(7) << "Cannot export to mds." << dest << " " << *dir << ": is a system directory" << dendl;
+    return;
+  }
+  if (dir->is_frozen() || dir->is_freezing()) {
+    dout(7) << "Cannot export to mds." << dest << " " << *dir << ": is frozen" << dendl;
+    return;
+  }
+  if (dir->state_test(CDir::STATE_EXPORTING)) {
+    dout(7) << "Cannot export to mds." << dest << " " << *dir << ": already exporting" << dendl;
+    return;
+  }
+  // a dir under a stray directory may only go to the rank owning that mdsdir
+  if (parent && parent->inode->is_stray()
+      && parent->get_parent_dir()->ino() != MDS_INO_MDSDIR(dest)) {
+    dout(7) << "Cannot export to mds." << dest << " " << *dir << ": in stray directory" << dendl;
+    return;
+  }
+
+  if (unlikely(g_conf()->mds_thrash_exports)) {
+    // create random subtree bound (which will not be exported)
+    std::vector<CDir*> candidates;
+    for (auto& p : *dir) {
+      CDentry *dn = p.second;
+      CDentry::linkage_t *dnl = dn->get_linkage();
+      if (!dnl->is_primary())
+	continue;
+      CInode *in = dnl->get_inode();
+      if (!in->is_dir())
+	continue;
+      auto&& dirs = in->get_nested_dirfrags();
+      candidates.insert(std::end(candidates), std::begin(dirs), std::end(dirs));
+    }
+    if (!candidates.empty()) {
+      CDir *bd = candidates[rand() % candidates.size()];
+      if (!(bd->is_frozen() || bd->is_freezing())) {
+	ceph_assert(bd->is_auth());
+	// NOTE(review): the state/auth adjustment is applied to dir, while
+	// the log line names bd as the aux subtree -- confirm this is intended.
+	dir->state_set(CDir::STATE_AUXSUBTREE);
+	mdcache->adjust_subtree_auth(dir, mds->get_nodeid());
+	dout(7) << "create aux subtree " << *bd << " under " << *dir << dendl;
+      }
+    }
+  }
+
+  dout(4) << "Starting export to mds." << dest << " " << *dir << dendl;
+
+  mds->hit_export_target(dest, -1);
+
+  dir->auth_pin(this);
+  dir->mark_exporting();
+
+  // internal request that will carry the locks for this export
+  MDRequestRef mdr = mdcache->request_start_internal(CEPH_MDS_OP_EXPORTDIR);
+  mdr->more()->export_dir = dir;
+  mdr->pin(dir);
+
+  ceph_assert(export_state.count(dir) == 0);
+  auto& stat = export_state[dir];
+  num_locking_exports++;
+  stat.state = EXPORT_LOCKING;
+  stat.peer = dest;
+  stat.tid = mdr->reqid.tid;
+  stat.mut = mdr;
+
+  mdcache->dispatch_request(mdr);
+}
+
+/*
+ * Check whether a directory is too large to be exported as a whole.  If
+ * it is, choose some nested dirfrags whose total size is suitable.
+ *
+ * dir        root dirfrag of the intended export
+ * max_size   size budget, in the same estimated units as the constants below
+ * null_okay  if true, `results` may be left empty when part of the tree
+ *            had to be skipped; if false, dir itself is returned as a
+ *            fallback whenever nothing else was selected
+ * results    output: (dirfrag, estimated size) pairs to export
+ */
+void Migrator::maybe_split_export(CDir* dir, uint64_t max_size, bool null_okay,
+				  vector<pair<CDir*, size_t> >& results)
+{
+  // rough per-item size estimates used to weigh a dirfrag
+  // (units are not stated here -- presumably bytes; confirm)
+  static const unsigned frag_size = 800;
+  static const unsigned inode_size = 1000;
+  static const unsigned cap_size = 80;
+  static const unsigned remote_size = 10;
+  static const unsigned null_size = 1;
+
+  // state for depth-first search
+  struct LevelData {
+    CDir *dir;                              // dirfrag being scanned at this level
+    CDir::dentry_key_map::iterator iter;    // next dentry of `dir` to look at
+    size_t dirfrag_size = frag_size;        // estimated size of `dir` itself
+    size_t subdirs_size = 0;                // sum of sizes recorded in `subdirs`
+    bool complete = true;                   // false once something below was skipped
+    vector<CDir*> siblings;                 // dirfrags still to visit after this one
+    vector<pair<CDir*, size_t> > subdirs;   // fully scanned children and their sizes
+    LevelData(const LevelData&) = default;
+    LevelData(CDir *d) :
+      dir(d), iter(d->begin()) {}
+  };
+
+  vector<LevelData> stack;
+  stack.emplace_back(dir);
+
+  size_t found_size = 0;    // size of dirfrags that can be exported
+  size_t skipped_size = 0;  // size of dirfrags with a skipped descendant
+
+  for (;;) {
+    // NOTE: these references point into `stack`; they are re-bound on
+    // every iteration because pushing onto the vector may reallocate it.
+    auto& data = stack.back();
+    CDir *cur = data.dir;
+    auto& it = data.iter;
+    auto& dirfrag_size = data.dirfrag_size;
+
+    // resume scanning cur's dentries where we left off
+    while(it != cur->end()) {
+      CDentry *dn = it->second;
+      ++it;
+
+      dirfrag_size += dn->name.size();
+      if (dn->get_linkage()->is_null()) {
+	dirfrag_size += null_size;
+	continue;
+      }
+      if (dn->get_linkage()->is_remote()) {
+	dirfrag_size += remote_size;
+	continue;
+      }
+
+      // primary link: account for the inode and its client caps
+      CInode *in = dn->get_linkage()->get_inode();
+      dirfrag_size += inode_size;
+      dirfrag_size += in->get_client_caps().size() * cap_size;
+
+      if (in->is_dir()) {
+	auto ls = in->get_nested_dirfrags();
+	// reversed so that taking from the back visits them in original order
+	std::reverse(ls.begin(), ls.end());
+
+	// drop dirfrags that are already exporting, freezing or frozen
+	bool complete = true;
+	for (auto p = ls.begin(); p != ls.end(); ) {
+	  if ((*p)->state_test(CDir::STATE_EXPORTING) ||
+	      (*p)->is_freezing_dir() || (*p)->is_frozen_dir()) {
+	    complete = false;
+	    p = ls.erase(p);
+	  } else {
+	    ++p;
+	  }
+	}
+	if (!complete) {
+	  // skip exporting dir's ancestors. because they can't get
+	  // frozen (exporting dir's parent inode is auth pinned).
+	  // (stops at the first level already marked incomplete: everything
+	  // above it was marked when that level was)
+	  for (auto p = stack.rbegin(); p < stack.rend(); ++p) {
+	    if (!p->complete)
+	      break;
+	    p->complete = false;
+	  }
+	}
+	if (!ls.empty()) {
+	  // descend into the first nested dirfrag; the rest wait as siblings.
+	  // `data`, `it` and `dirfrag_size` may dangle after this emplace_back
+	  // and must not be used until re-bound at the top of the outer loop.
+	  stack.emplace_back(ls.back());
+	  ls.pop_back();
+	  stack.back().siblings.swap(ls);
+	  break;
+	}
+      }
+    }
+    // did above loop push new dirfrag into the stack?
+    if (stack.back().dir != cur)
+      continue;
+
+    // cur has been scanned completely
+    if (data.complete) {
+      auto cur_size = data.subdirs_size + dirfrag_size;
+      // we can do nothing with large dirfrag
+      if (cur_size >= max_size && found_size * 2 > max_size)
+	break;
+
+      found_size += dirfrag_size;
+
+      // fold cur into its parent so it can be exported as part of it
+      if (stack.size() > 1) {
+	auto& parent = stack[stack.size() - 2];
+	parent.subdirs.emplace_back(cur, cur_size);
+	parent.subdirs_size += cur_size;
+      }
+    } else {
+      // can't merge current dirfrag to its parent if there is skipped subdir
+      results.insert(results.end(), data.subdirs.begin(), data.subdirs.end());
+      skipped_size += dirfrag_size;
+    }
+
+    // take the pending siblings before the level is popped
+    vector<CDir*> ls;
+    ls.swap(data.siblings);
+
+    stack.pop_back();
+    if (stack.empty())
+      break;
+
+    if (found_size >= max_size)
+      break;
+
+    // next dirfrag
+    if (!ls.empty()) {
+      stack.emplace_back(ls.back());
+      ls.pop_back();
+      stack.back().siblings.swap(ls);
+    }
+  }
+
+  // levels still on the stack (search ended early) contribute what
+  // they have collected so far
+  for (auto& p : stack)
+    results.insert(results.end(), p.subdirs.begin(), p.subdirs.end());
+
+  // nothing selected: fall back to the whole dir, unless part of the tree
+  // was skipped and the caller accepts an empty result
+  if (results.empty() && (!skipped_size || !null_okay))
+    results.emplace_back(dir, found_size + skipped_size);
+}
+
+/*
+ * Completion that re-enters Migrator::dispatch_export_dir() with the
+ * stored request and retry count.  The result code is ignored.
+ */
+class C_M_ExportDirWait : public MigratorContext {
+  MDRequestRef req;   // request driving the export
+  int attempt;        // retry counter handed back to dispatch_export_dir()
+public:
+  C_M_ExportDirWait(Migrator *m, MDRequestRef mdr, int count)
+    : MigratorContext(m), req(mdr), attempt(count)
+  {}
+
+  void finish(int r) override
+  {
+    mig->dispatch_export_dir(req, attempt);
+  }
+};
+
+/**
+ * Drive an export that is in the EXPORT_LOCKING state.
+ *
+ * @param mdr    internal request created for this export; the dirfrag is
+ *               taken from mdr->more()->export_dir
+ * @param count  retry counter; passed back incremented each time we wait
+ *               for a new MDSMap because dest is not yet an export target.
+ *               The export is cancelled once count exceeds 3.
+ *
+ * Once all locks are held the subtree is sized with maybe_split_export().
+ * If it can go in one piece we move to EXPORT_DISCOVERING, send
+ * MExportDirDiscover to the target and start freezing the tree.
+ * Otherwise a child export is started for each chosen sub-dirfrag and
+ * this (original) export is cancelled.
+ */
+void Migrator::dispatch_export_dir(MDRequestRef& mdr, int count)
+{
+  CDir *dir = mdr->more()->export_dir;
+  dout(7) << *mdr << " " << *dir << dendl;
+
+  // the request is stale if the export is gone or belongs to another tid
+  map<CDir*,export_state_t>::iterator it = export_state.find(dir);
+  if (it == export_state.end() || it->second.tid != mdr->reqid.tid) {
+    // export must have aborted.
+    dout(7) << "export must have aborted " << *mdr << dendl;
+    ceph_assert(mdr->killed || mdr->aborted);
+    if (mdr->aborted) {
+      mdr->aborted = false;
+      mdcache->request_kill(mdr);
+    }
+    return;
+  }
+  ceph_assert(it->second.state == EXPORT_LOCKING);
+
+  if (mdr->more()->peer_error || dir->is_frozen() || dir->is_freezing()) {
+    dout(7) << "wouldblock|freezing|frozen, canceling export" << dendl;
+    export_try_cancel(dir);
+    return;
+  }
+
+  mds_rank_t dest = it->second.peer;
+  if (!mds->is_export_target(dest)) {
+    dout(7) << "dest is not yet an export target" << dendl;
+    if (count > 3) {
+      dout(7) << "dest has not been added as export target after three MDSMap epochs, canceling export" << dendl;
+      export_try_cancel(dir);
+      return;
+    }
+
+    // don't sit on locks/pins while waiting for the next map
+    mds->locker->drop_locks(mdr.get());
+    mdr->drop_local_auth_pins();
+
+    mds->wait_for_mdsmap(mds->mdsmap->get_epoch(), new C_M_ExportDirWait(this, mdr, count+1));
+    return;
+  }
+
+  if (!dir->inode->get_parent_dn()) {
+    dout(7) << "waiting for dir to become stable before export: " << *dir << dendl;
+    // note: the retry counter restarts at 1 on this path
+    dir->add_waiter(CDir::WAIT_CREATED, new C_M_ExportDirWait(this, mdr, 1));
+    return;
+  }
+
+  // locks?
+  if (!(mdr->locking_state & MutationImpl::ALL_LOCKED)) {
+    MutationImpl::LockOpVec lov;
+    // If auth MDS of the subtree root inode is neither the exporter MDS
+    // nor the importer MDS and it gathers subtree root's fragstat/neststat
+    // while the subtree is exporting. It's possible that the exporter MDS
+    // and the importer MDS both are auth MDS of the subtree root or both
+    // are not auth MDS of the subtree root at the time they receive the
+    // lock messages. So the auth MDS of the subtree root inode may get no
+    // or duplicated fragstat/neststat for the subtree root dirfrag.
+    lov.lock_scatter_gather(&dir->get_inode()->filelock);
+    lov.lock_scatter_gather(&dir->get_inode()->nestlock);
+    if (dir->get_inode()->is_auth()) {
+      dir->get_inode()->filelock.set_scatter_wanted();
+      dir->get_inode()->nestlock.set_scatter_wanted();
+    }
+    lov.add_rdlock(&dir->get_inode()->dirfragtreelock);
+
+    // on failure, just return unless the request was aborted meanwhile
+    // (presumably acquire_locks() arranges for a re-dispatch -- confirm)
+    if (!mds->locker->acquire_locks(mdr, lov, nullptr, true)) {
+      if (mdr->aborted)
+	export_try_cancel(dir);
+      return;
+    }
+
+    lov.clear();
+    // bound dftlocks:
+    // NOTE: We need to take an rdlock on bounding dirfrags during
+    //  migration for a rather irritating reason: when we export the
+    //  bound inode, we need to send scatterlock state for the dirfrags
+    //  as well, so that the new auth also gets the correct info.  If we
+    //  race with a refragment, this info is useless, as we can't
+    //  redivvy it up.  And it's needed for the scatterlocks to work
+    //  properly: when the auth is in a sync/lock state it keeps each
+    //  dirfrag's portion in the local (auth OR replica) dirfrag.
+    set<CDir*> wouldbe_bounds;
+    set<CInode*> bound_inodes;
+    mdcache->get_wouldbe_subtree_bounds(dir, wouldbe_bounds);
+    for (auto& bound : wouldbe_bounds)
+      bound_inodes.insert(bound->get_inode());
+    for (auto& in : bound_inodes)
+      lov.add_rdlock(&in->dirfragtreelock);
+
+    if (!mds->locker->rdlock_try_set(lov, mdr))
+      return;
+
+    if (!mds->locker->try_rdlock_snap_layout(dir->get_inode(), mdr))
+      return;
+
+    mdr->locking_state |= MutationImpl::ALL_LOCKED;
+  }
+
+  // test hook: abort the daemon here when mds_kill_export_at == 1
+  ceph_assert(g_conf()->mds_kill_export_at != 1);
+
+  // copy of the shared_ptr: non-null if this export is itself a child
+  // of an earlier split
+  auto parent = it->second.parent;
+
+  vector<pair<CDir*, size_t> > results;
+  maybe_split_export(dir, max_export_size, (bool)parent, results);
+
+  // single result naming dir itself: export it in one piece
+  if (results.size() == 1 && results.front().first == dir) {
+    num_locking_exports--;
+    it->second.state = EXPORT_DISCOVERING;
+    // send ExportDirDiscover (ask target)
+    filepath path;
+    dir->inode->make_path(path);
+    auto discover = make_message<MExportDirDiscover>(dir->dirfrag(), path,
+						     mds->get_nodeid(),
+						     it->second.tid);
+    mds->send_message_mds(discover, dest);
+    ceph_assert(g_conf()->mds_kill_export_at != 2);
+
+    it->second.last_cum_auth_pins_change = ceph_clock_now();
+    it->second.approx_size = results.front().second;
+    total_exporting_size += it->second.approx_size;
+
+    // start the freeze, but hold it up with an auth_pin.
+    dir->freeze_tree();
+    ceph_assert(dir->is_freezing_tree());
+    dir->add_waiter(CDir::WAIT_FROZEN, new C_MDC_ExportFreeze(this, dir, it->second.tid));
+    return;
+  }
+
+  // otherwise: split into child exports that share one export_base_t
+  if (parent) {
+    parent->pending_children += results.size();
+  } else {
+    parent = std::make_shared<export_base_t>(dir->dirfrag(), dest,
+					     results.size(), export_queue_gen);
+  }
+
+  if (results.empty()) {
+    dout(7) << "subtree's children all are under exporting, retry rest parts of parent export "
+	    << parent->dirfrag << dendl;
+    parent->restart = true;
+  } else {
+    dout(7) << "subtree is too large, splitting it into: " <<  dendl;
+  }
+
+  // start one export per selected sub-dirfrag (same setup as export_dir())
+  for (auto& p : results) {
+    CDir *sub = p.first;
+    ceph_assert(sub != dir);
+    dout(7) << " sub " << *sub << dendl;
+
+    sub->auth_pin(this);
+    sub->mark_exporting();
+
+    MDRequestRef _mdr = mdcache->request_start_internal(CEPH_MDS_OP_EXPORTDIR);
+    _mdr->more()->export_dir = sub;
+    _mdr->pin(sub);
+
+    ceph_assert(export_state.count(sub) == 0);
+    auto& stat = export_state[sub];
+    num_locking_exports++;
+    stat.state = EXPORT_LOCKING;
+    stat.peer = dest;
+    stat.tid = _mdr->reqid.tid;
+    stat.mut = _mdr;
+    stat.parent = parent;
+    mdcache->dispatch_request(_mdr);
+  }
+
+  // cancel the original one
+  export_try_cancel(dir);
+}
+
+/*
+ * Account for one finished child of a split export.  When the last
+ * pending child is done, the original dirfrag is put back at the front
+ * of the export queue, provided a restart was requested (any child
+ * succeeded, or restart was already set), the queue generation is
+ * unchanged, and the dirfrag is still in cache and auth here.
+ */
+void Migrator::child_export_finish(std::shared_ptr<export_base_t>& parent, bool success)
+{
+  if (success)
+    parent->restart = true;
+
+  // more children still outstanding?
+  if (--parent->pending_children != 0)
+    return;
+
+  if (!parent->restart)
+    return;
+  if (parent->export_queue_gen != export_queue_gen)
+    return;
+
+  CDir *origin = mdcache->get_dirfrag(parent->dirfrag);
+  if (!origin || !origin->is_auth())
+    return;
+
+  dout(7) << "child_export_finish requeue " << *origin << dendl;
+  export_queue.emplace_front(origin->dirfrag(), parent->dest);
+}
+
+/*
+ * called on receipt of MExportDirDiscoverAck
+ * the importer now has the directory's _inode_ in memory, and pinned.
+ *
+ * On success the export request is finished (releasing its locks), the
+ * export moves to EXPORT_FREEZING and our auth_pin on dir is dropped.
+ * On failure the export is cancelled without notifying the peer.
+ * Acks that do not match a current export (tid/peer) are ignored.
+ */
+void Migrator::handle_export_discover_ack(const cref_t<MExportDirDiscoverAck> &m)
+{
+  CDir *dir = mdcache->get_dirfrag(m->get_dirfrag());
+  mds_rank_t dest(m->get_source().num());
+  ceph_assert(dir);
+
+  dout(7) << "from " << m->get_source()
+	  << " on " << *dir << dendl;
+
+  mds->hit_export_target(dest, -1);
+
+  auto it = export_state.find(dir);
+  const bool stale = (it == export_state.end() ||
+		      it->second.tid != m->get_tid() ||
+		      it->second.peer != dest);
+  if (stale) {
+    dout(7) << "must have aborted" << dendl;
+    return;
+  }
+
+  ceph_assert(it->second.state == EXPORT_DISCOVERING);
+
+  if (!m->is_success()) {
+    dout(7) << "peer failed to discover (not active?), canceling" << dendl;
+    export_try_cancel(dir, false);
+    return;
+  }
+
+  // release locks to avoid deadlock
+  MDRequestRef mdr = static_cast<MDRequestImpl*>(it->second.mut.get());
+  ceph_assert(mdr);
+  mdcache->request_finish(mdr);
+  it->second.mut.reset();
+
+  // freeze the subtree
+  it->second.state = EXPORT_FREEZING;
+  dir->auth_unpin(this);
+  ceph_assert(g_conf()->mds_kill_export_at != 3);
+}
+
+/*
+ * Completion that calls Migrator::export_sessions_flushed() for
+ * (dir, tid), whatever the result code.  The dirfrag is held with
+ * PIN_PTRWAITER from construction until finish() has run.
+ */
+class C_M_ExportSessionsFlushed : public MigratorContext {
+  CDir *base;           // dirfrag being exported
+  uint64_t export_tid;  // tid of the export this completion belongs to
+public:
+  C_M_ExportSessionsFlushed(Migrator *m, CDir *d, uint64_t t)
+    : MigratorContext(m), base(d), export_tid(t)
+  {
+    base->get(CDir::PIN_PTRWAITER);
+  }
+
+  void finish(int r) override
+  {
+    mig->export_sessions_flushed(base, export_tid);
+    base->put(CDir::PIN_PTRWAITER);
+  }
+};
+
+/*
+ * Called when the client-session flush started for the export of dir
+ * (identified by tid) has completed.
+ *
+ * The flush is tracked as a pseudo entry, MDS_RANK_NONE, in
+ * warning_ack_waiting.  Removing it here may be the last thing the
+ * export was waiting for in the WARNING state, in which case the export
+ * proceeds.  Does nothing if the export is gone, is being cancelled, or
+ * has a different tid.
+ */
+void Migrator::export_sessions_flushed(CDir *dir, uint64_t tid)
+{
+  dout(7) << *dir << dendl;
+
+  map<CDir*,export_state_t>::iterator it = export_state.find(dir);
+  if (it == export_state.end() ||
+      it->second.state == EXPORT_CANCELLING ||
+      it->second.tid != tid) {
+    // export must have aborted.
+    // (log the dirfrag itself, not the pointer value, like every other
+    // message in this file)
+    dout(7) << "export must have aborted on " << *dir << dendl;
+    return;
+  }
+
+  ceph_assert(it->second.state == EXPORT_PREPPING || it->second.state == EXPORT_WARNING);
+  ceph_assert(it->second.warning_ack_waiting.count(MDS_RANK_NONE) > 0);
+  it->second.warning_ack_waiting.erase(MDS_RANK_NONE);
+  if (it->second.state == EXPORT_WARNING && it->second.warning_ack_waiting.empty())
+    export_go(dir);     // start export.
+}
+
+/*
+ * Encode into final_bl the trace leading from the export base `dir`
+ * (or from the nearest dirfrag/inode already sent) down to `bound`.
+ *
+ * The walk goes upwards from bound; each step is prepended to the
+ * trace so the encoded order is top-down.  inodes_added and
+ * dirfrags_added record what previous traces already contain so that
+ * nothing is encoded twice.
+ *
+ * Encoded payload: dirfrag the trace starts at, a start tag
+ * ('-' nothing new, 'd' begins with a dentry, 'f' begins with a
+ * dirfrag), then the trace itself.
+ */
+void Migrator::encode_export_prep_trace(bufferlist &final_bl, CDir *bound,
+                                        CDir *dir, export_state_t &es,
+                                        set<inodeno_t> &inodes_added,
+                                        set<dirfrag_t> &dirfrags_added)
+{
+  ENCODE_START(1, 1, final_bl);
+
+  dout(7) << " started to encode dir " << *bound << dendl;
+
+  bufferlist tracebl;
+  char start = '-';
+  CDir *cur = bound;
+
+  for (;;) {
+    CInode *in = cur->inode;
+
+    // don't repeat inodes
+    if (!inodes_added.insert(in->ino()).second)
+      break;
+
+    // prepend dentry + inode
+    ceph_assert(cur->inode->is_auth());
+    bufferlist bl;
+    mdcache->encode_replica_dentry(in->parent, es.peer, bl);
+    dout(7) << "  added " << *in->parent << dendl;
+    mdcache->encode_replica_inode(in, es.peer, bl, mds->mdsmap->get_up_features());
+    dout(7) << "  added " << *in << dendl;
+    bl.claim_append(tracebl);
+    tracebl = std::move(bl);
+
+    // step up to the dirfrag containing that dentry
+    cur = cur->get_parent_dir();
+
+    // don't repeat dirfrags
+    if (cur == dir || dirfrags_added.count(cur->dirfrag())) {
+      start = 'd';  // start with dentry
+      break;
+    }
+    dirfrags_added.insert(cur->dirfrag());
+
+    // prepend dir
+    mdcache->encode_replica_dir(cur, es.peer, bl);
+    dout(7) << "  added " << *cur << dendl;
+    bl.claim_append(tracebl);
+    tracebl = std::move(bl);
+    start = 'f';  // start with dirfrag
+  }
+
+  dirfrag_t df = cur->dirfrag();
+  encode(df, final_bl);
+  encode(start, final_bl);
+  final_bl.claim_append(tracebl);
+
+  ENCODE_FINISH(final_bl);
+}
+
+/*
+ * Called (via C_MDC_ExportFreeze) once the subtree rooted at dir is
+ * frozen.  tid identifies the export the freeze was started for; a
+ * mismatch means that export was aborted in the meantime.
+ *
+ * Takes the locks needed for the export, pins the subtree bounds, sends
+ * MExportDirPrep (base dirfrag, bystanders, bounds and a trace to each
+ * bound) to the importer and moves to EXPORT_PREPPING.  Finally kicks
+ * off a flush of the affected client sessions; its completion is tracked
+ * as MDS_RANK_NONE in warning_ack_waiting.
+ */
+void Migrator::export_frozen(CDir *dir, uint64_t tid)
+{
+  dout(7) << *dir << dendl;
+
+  map<CDir*,export_state_t>::iterator it = export_state.find(dir);
+  if (it == export_state.end() || it->second.tid != tid) {
+    dout(7) << "export must have aborted" << dendl;
+    return;
+  }
+
+  ceph_assert(it->second.state == EXPORT_FREEZING);
+  ceph_assert(dir->is_frozen_tree_root());
+
+  it->second.mut = new MutationImpl();
+
+  // ok, try to grab all my locks.
+  CInode *diri = dir->get_inode();
+  if ((diri->is_auth() && diri->is_frozen()) ||
+      !export_try_grab_locks(dir, it->second.mut)) {
+    dout(7) << "export_dir couldn't acquire all needed locks, failing. "
+	    << *dir << dendl;
+    export_try_cancel(dir);
+    return;
+  }
+
+  if (diri->is_auth())
+    it->second.mut->auth_pin(diri);
+
+  mdcache->show_subtrees();
+
+  // CDir::_freeze_tree() should have forced it into subtree.
+  ceph_assert(dir->get_dir_auth() == mds_authority_t(mds->get_nodeid(), mds->get_nodeid()));
+  // note the bounds.
+  set<CDir*> bounds;
+  mdcache->get_subtree_bounds(dir, bounds);
+
+  // generate prep message, log entry.
+  auto prep = make_message<MExportDirPrep>(dir->dirfrag(), it->second.tid);
+
+  // include list of bystanders
+  for (const auto &p : dir->get_replicas()) {
+    if (p.first != it->second.peer) {
+      dout(10) << "bystander mds." << p.first << dendl;
+      prep->add_bystander(p.first);
+    }
+  }
+
+  // include base dirfrag
+  mdcache->encode_replica_dir(dir, it->second.peer, prep->basedir);
+  
+  /*
+   * include spanning tree for all nested exports.
+   * these need to be on the destination _before_ the final export so that
+   * dir_auth updates on any nested exports are properly absorbed.
+   * this includes inodes and dirfrags included in the subtree, but
+   * only the inodes at the bounds.
+   *
+   * each trace is: df ('-' | ('f' dir | 'd') dentry inode (dir dentry inode)*)
+   */
+  set<inodeno_t> inodes_added;
+  set<dirfrag_t> dirfrags_added;
+
+  // check bounds
+  for (auto &bound : bounds){
+    // pin it.
+    bound->get(CDir::PIN_EXPORTBOUND);
+    bound->state_set(CDir::STATE_EXPORTBOUND);
+
+    dout(7) << "  export bound " << *bound << dendl;
+    prep->add_bound( bound->dirfrag() );
+    
+    bufferlist final_bl;
+    encode_export_prep_trace(final_bl, bound, dir, it->second, inodes_added, dirfrags_added);
+    prep->add_trace(final_bl);
+  }
+
+  // send.
+  it->second.state = EXPORT_PREPPING;
+  mds->send_message_mds(prep, it->second.peer);
+  // test hook: abort the daemon here when mds_kill_export_at == 4.
+  // ceph_assert, like the other kill points, so it is not compiled out
+  // when NDEBUG is defined.
+  ceph_assert(g_conf()->mds_kill_export_at != 4);
+
+  // make sure any new instantiations of caps are flushed out
+  ceph_assert(it->second.warning_ack_waiting.empty());
+
+  set<client_t> export_client_set;
+  get_export_client_set(dir, export_client_set);
+
+  MDSGatherBuilder gather(g_ceph_context);
+  mds->server->flush_client_sessions(export_client_set, gather);
+  if (gather.has_subs()) {
+    // MDS_RANK_NONE stands in for "client session flush still pending";
+    // export_sessions_flushed() removes it again.
+    it->second.warning_ack_waiting.insert(MDS_RANK_NONE);
+    gather.set_finisher(new C_M_ExportSessionsFlushed(this, dir, it->second.tid));
+    gather.activate();
+  }
+}
+
+/**
+ * Collect every client that holds caps on an inode inside the subtree
+ * rooted at @p dir.
+ *
+ * The subtree is walked breadth-first through primary dentries only;
+ * nested dirfrags flagged STATE_EXPORTBOUND are not descended into.
+ *
+ * @param dir         root dirfrag of the walk
+ * @param client_set  receives the ids of the clients found (only added to)
+ */
+void Migrator::get_export_client_set(CDir *dir, set<client_t>& client_set)
+{
+  deque<CDir*> pending;
+  pending.push_back(dir);
+
+  while (!pending.empty()) {
+    CDir *cur = pending.front();
+    pending.pop_front();
+
+    for (auto& entry : *cur) {
+      CDentry *dn = entry.second;
+      auto *linkage = dn->get_linkage();
+      if (!linkage->is_primary())
+        continue;
+
+      CInode *in = linkage->get_inode();
+      if (in->is_dir()) {
+        // directory: queue the nested dirfrags that are not export bounds
+        for (auto& frag : in->get_dirfrags()) {
+          if (frag->state_test(CDir::STATE_EXPORTBOUND))
+            continue;
+          ceph_assert(frag->get_dir_auth().first == CDIR_AUTH_PARENT);
+          pending.push_back(frag);  // it's ours, visit it later
+        }
+      }
+
+      for (auto& cap : in->get_client_caps())
+        client_set.insert(cap.first);
+    }
+  }
+}
+
+/**
+ * Add every client holding a cap on @p in to @p client_set.
+ *
+ * @param in          inode whose client caps are inspected
+ * @param client_set  receives the client ids (only added to)
+ */
+void Migrator::get_export_client_set(CInode *in, set<client_t>& client_set)
+{
+  const auto& caps = in->get_client_caps();
+  for (auto cap = caps.begin(); cap != caps.end(); ++cap)
+    client_set.insert(cap->first);
+}
+
+/**
+ * Handle the importer's reply to our MExportDirPrep.
+ *
+ * Stale replies (unknown dirfrag state, wrong tid or wrong sender) are
+ * dropped.  On failure the export is cancelled.  On success an
+ * MExportDirNotify warning is sent to every bystander replica, the export
+ * moves to EXPORT_WARNING, and export_go() is called right away when there
+ * is nobody to wait for.
+ *
+ * @param m  the prep ack received from the importing rank
+ */
+void Migrator::handle_export_prep_ack(const cref_t<MExportDirPrepAck> &m)
+{
+  CDir *dir = mdcache->get_dirfrag(m->get_dirfrag());
+  mds_rank_t dest(m->get_source().num());
+  ceph_assert(dir);
+
+  dout(7) << *dir << dendl;
+
+  mds->hit_export_target(dest, -1);
+
+  map<CDir*,export_state_t>::iterator it = export_state.find(dir);
+  if (it == export_state.end() ||
+      it->second.tid != m->get_tid() ||
+      it->second.peer != dest) {
+    // export must have aborted.
+    dout(7) << "export must have aborted" << dendl;
+    return;
+  }
+  ceph_assert(it->second.state == EXPORT_PREPPING);
+
+  if (!m->is_success()) {
+    dout(7) << "peer couldn't acquire all needed locks or wasn't active, canceling" << dendl;
+    export_try_cancel(dir, false);
+    return;
+  }
+
+  // Kill point for failure injection.  Use ceph_assert like the other kill
+  // points in this file: a plain assert() may be compiled out under NDEBUG.
+  ceph_assert(g_conf()->mds_kill_export_at != 5);
+  // send warnings
+  set<CDir*> bounds;
+  mdcache->get_subtree_bounds(dir, bounds);
+
+  // The only entry allowed at this point is the MDS_RANK_NONE placeholder
+  // that export_frozen() inserts while client sessions are being flushed.
+  ceph_assert(it->second.warning_ack_waiting.empty() ||
+         (it->second.warning_ack_waiting.size() == 1 &&
+          it->second.warning_ack_waiting.count(MDS_RANK_NONE) > 0));
+  ceph_assert(it->second.notify_ack_waiting.empty());
+
+  for (const auto &p : dir->get_replicas()) {
+    if (p.first == it->second.peer) continue;
+    if (mds->is_cluster_degraded() &&
+        !mds->mdsmap->is_clientreplay_or_active_or_stopping(p.first))
+      continue;  // only if active
+    it->second.warning_ack_waiting.insert(p.first);
+    it->second.notify_ack_waiting.insert(p.first);  // we'll eventually get a notifyack, too!
+
+    auto notify = make_message<MExportDirNotify>(dir->dirfrag(), it->second.tid, true,
+        mds_authority_t(mds->get_nodeid(),CDIR_AUTH_UNKNOWN),
+        mds_authority_t(mds->get_nodeid(),it->second.peer));
+    for (auto &cdir : bounds) {
+      notify->get_bounds().push_back(cdir->dirfrag());
+    }
+    mds->send_message_mds(notify, p.first);
+  }
+
+  it->second.state = EXPORT_WARNING;
+
+  ceph_assert(g_conf()->mds_kill_export_at != 6);
+  // nobody to warn?
+  if (it->second.warning_ack_waiting.empty())
+    export_go(dir);  // start export.
+}
+
+
+/**
+ * Completion used by Migrator::export_go(): when it fires it resumes the
+ * export through Migrator::export_go_synced().  The dirfrag is pinned with
+ * PIN_PTRWAITER from construction until finish() has run.
+ */
+class C_M_ExportGo : public MigratorContext {
+public:
+  C_M_ExportGo(Migrator *m, CDir *d, uint64_t t)
+    : MigratorContext(m), dir(d), tid(t)
+  {
+    dir->get(CDir::PIN_PTRWAITER);
+  }
+
+  void finish(int r) override {
+    mig->export_go_synced(dir, tid);
+    dir->put(CDir::PIN_PTRWAITER);
+  }
+
+private:
+  CDir *dir;     // dirfrag being exported
+  uint64_t tid;  // export id captured when the callback was created
+};
+
+/**
+ * Start the actual export of @p dir: wait for the journal to become safe
+ * (flushing it), then continue in export_go_synced().
+ *
+ * @param dir  root dirfrag of the export; must have an export_state entry
+ */
+void Migrator::export_go(CDir *dir)
+{
+  auto entry = export_state.find(dir);
+  ceph_assert(entry != export_state.end());
+
+  const export_state_t& stat = entry->second;
+  dout(7) << *dir << " to " << stat.peer << dendl;
+
+  // first sync log to flush out e.g. any cap imports
+  auto *on_safe = new C_M_ExportGo(this, dir, stat.tid);
+  mds->mdlog->wait_for_safe(on_safe);
+  mds->mdlog->flush();
+}
+
+/**
+ * Second half of export_go(), run once the journal is safe.
+ *
+ * Marks the subtree as ambiguous auth (us, dest), encodes the whole subtree
+ * plus the client maps into an MExportDir message, adds the export bounds
+ * and sends it to the importer.  The export moves to EXPORT_EXPORTING.
+ *
+ * @param dir  root dirfrag of the export
+ * @param tid  id of the export the callback was created for; a mismatch (or
+ *             a cancelled/missing export) turns the call into a no-op
+ */
+void Migrator::export_go_synced(CDir *dir, uint64_t tid)
+{
+  map<CDir*,export_state_t>::iterator it = export_state.find(dir);
+  if (it == export_state.end() ||
+      it->second.state == EXPORT_CANCELLING ||
+      it->second.tid != tid) {
+    // export must have aborted.
+    // Log the dirfrag itself (as every other message here does) rather
+    // than the raw pointer value.
+    dout(7) << "export must have aborted on " << *dir << dendl;
+    return;
+  }
+  ceph_assert(it->second.state == EXPORT_WARNING);
+  mds_rank_t dest = it->second.peer;
+
+  dout(7) << *dir << " to " << dest << dendl;
+
+  mdcache->show_subtrees();
+
+  it->second.state = EXPORT_EXPORTING;
+  ceph_assert(g_conf()->mds_kill_export_at != 7);
+
+  ceph_assert(dir->is_frozen_tree_root());
+
+  // set ambiguous auth
+  mdcache->adjust_subtree_auth(dir, mds->get_nodeid(), dest);
+
+  // take away the popularity we're sending.
+  mds->balancer->subtract_export(dir);
+
+  // fill export message with cache data
+  auto req = make_message<MExportDir>(dir->dirfrag(), it->second.tid);
+  map<client_t,entity_inst_t> exported_client_map;
+  map<client_t,client_metadata_t> exported_client_metadata_map;
+  uint64_t num_exported_inodes = 0;
+  encode_export_dir(req->export_data, dir, // recur start point
+                    exported_client_map, exported_client_metadata_map,
+                    num_exported_inodes);
+  encode(exported_client_map, req->client_map, mds->mdsmap->get_up_features());
+  encode(exported_client_metadata_map, req->client_map);
+
+  // add bounds to message
+  set<CDir*> bounds;
+  mdcache->get_subtree_bounds(dir, bounds);
+  for (const auto &bound : bounds)
+    req->add_export(bound->dirfrag());
+
+  // send
+  mds->send_message_mds(req, dest);
+  ceph_assert(g_conf()->mds_kill_export_at != 8);
+
+  mds->hit_export_target(dest, num_exported_inodes+1);
+
+  // stats
+  if (mds->logger) {
+    mds->logger->inc(l_mds_exported);
+    mds->logger->inc(l_mds_exported_inodes, num_exported_inodes);
+  }
+
+  mdcache->show_subtrees();
+}
+
+
+/** encode_export_inode
+ * update our local state for this inode to export.
+ * encode relevant state to be sent over the wire.
+ * used by: encode_export_dir, file_rename (if foreign)
+ *
+ * The payload is wrapped in a versioned section (ENCODE_START(1, 1, ...))
+ * and is written in this order: inode number, `last`, the inode's own
+ * export state (CInode::encode_export), then the client caps section
+ * produced by encode_export_inode_caps() with auth_cap == true.
+ *
+ * @param in                            inode to export; asserted not to be a
+ *                                      replica on this rank
+ * @param enc_state                     buffer the encoded state is appended to
+ * @param exported_client_map           passed through to encode_export_inode_caps()
+ * @param exported_client_metadata_map  passed through to encode_export_inode_caps()
+ *
+ * FIXME: the separation between CInode.encode_export and these methods 
+ * is pretty arbitrary and dumb.
+ */
+void Migrator::encode_export_inode(CInode *in, bufferlist& enc_state, 
+				   map<client_t,entity_inst_t>& exported_client_map,
+				   map<client_t,client_metadata_t>& exported_client_metadata_map)
+{
+  ENCODE_START(1, 1, enc_state);
+  dout(7) << *in << dendl;
+  ceph_assert(!in->is_replica(mds->get_nodeid()));
+
+  encode(in->ino(), enc_state);
+  encode(in->last, enc_state);
+  in->encode_export(enc_state);
+
+  // caps (auth_cap == true: also encodes the wanted-caps map and marks the inode STATE_EXPORTINGCAPS)
+  encode_export_inode_caps(in, true, enc_state, exported_client_map, exported_client_metadata_map);
+  ENCODE_FINISH(enc_state);
+}
+
+/**
+ * Encode the client caps of @p in for export and record the clients that
+ * hold them.
+ *
+ * Written inside a versioned section: the exported cap map and, when
+ * @p auth_cap is set, the map returned by get_mds_caps_wanted().  With
+ * @p auth_cap set the inode is also flagged STATE_EXPORTINGCAPS and pinned
+ * with PIN_EXPORTINGCAPS (released again in finish_export_inode_caps()).
+ *
+ * @param in                            inode whose caps are exported
+ * @param auth_cap                      whether the auth-only state is included
+ * @param bl                            buffer the encoding is appended to
+ * @param exported_client_map           gains an entry (session instance) for
+ *                                      every client holding a cap that is not
+ *                                      listed yet
+ * @param exported_client_metadata_map  gains the matching client metadata
+ */
+void Migrator::encode_export_inode_caps(CInode *in, bool auth_cap, bufferlist& bl,
+                                        map<client_t,entity_inst_t>& exported_client_map,
+                                        map<client_t,client_metadata_t>& exported_client_metadata_map)
+{
+  ENCODE_START(1, 1, bl);
+  dout(20) << *in << dendl;
+  // encode caps
+  map<client_t,Capability::Export> cap_map;
+  in->export_client_caps(cap_map);
+  encode(cap_map, bl);
+  if (auth_cap) {
+    encode(in->get_mds_caps_wanted(), bl);
+
+    in->state_set(CInode::STATE_EXPORTINGCAPS);
+    in->get(CInode::PIN_EXPORTINGCAPS);
+  }
+
+  // make note of clients named by exported capabilities
+  for (const auto &p : in->get_client_caps()) {
+    if (exported_client_map.count(p.first))
+      continue;
+    Session *session = mds->sessionmap.get_session(entity_name_t::CLIENT(p.first.v));
+    // The session is dereferenced right below; fail with a clear assertion
+    // instead of undefined behaviour if the lookup ever comes back empty.
+    ceph_assert(session);
+    exported_client_map[p.first] = session->info.inst;
+    exported_client_metadata_map[p.first] = session->info.client_metadata;
+  }
+  ENCODE_FINISH(bl);
+}
+
+/**
+ * Finish the cap export of @p in after the importer has acknowledged it.
+ *
+ * Clears STATE_EXPORTINGCAPS / PIN_EXPORTINGCAPS, sends a CEPH_CAP_OP_EXPORT
+ * message to every client holding a cap (naming the cap the importer
+ * created for it), drops the local caps and re-evaluates the cap locks.
+ *
+ * @param in             inode whose caps were exported
+ * @param peer           rank the caps were exported to
+ * @param peer_imported  per-client import records from the importer; every
+ *                       client holding a cap must have an entry
+ */
+void Migrator::finish_export_inode_caps(CInode *in, mds_rank_t peer,
+                                        map<client_t,Capability::Import>& peer_imported)
+{
+  dout(20) << *in << dendl;
+
+  in->state_clear(CInode::STATE_EXPORTINGCAPS);
+  in->put(CInode::PIN_EXPORTINGCAPS);
+
+  // tell (all) clients about migrating caps..
+  for (const auto &entry : in->get_client_caps()) {
+    const auto &client = entry.first;
+    const Capability &cap = entry.second;
+    dout(7) << client
+            << " exported caps on " << *in << dendl;
+
+    auto msg = make_message<MClientCaps>(CEPH_CAP_OP_EXPORT, in->ino(), 0,
+                                         cap.get_cap_id(), cap.get_mseq(),
+                                         mds->get_osd_epoch_barrier());
+
+    auto found = peer_imported.find(client);
+    ceph_assert(found != peer_imported.end());
+    const Capability::Import &imported = found->second;
+    // a cap_id of 0 (or less) is reported to the client with peer -1
+    msg->set_cap_peer(imported.cap_id, imported.issue_seq, imported.mseq,
+                      (imported.cap_id > 0 ? peer : -1), 0);
+    mds->send_message_client_counted(msg, client);
+  }
+
+  in->clear_client_caps_after_export();
+  mds->locker->eval(in, CEPH_CAP_LOCKS);
+}
+
+void Migrator::finish_export_inode(CInode *in, mds_rank_t peer,
+				   map<client_t,Capability::Import>& peer_imported,
+				   MDSContext::vec& finished)
+{
+  dout(12) << *in << dendl;
+  /*
+   * Turn the local copy of an exported inode from auth into a replica.
+   *
+   *  in             inode that has been exported (asserted to be auth below)
+   *  peer           rank the inode was exported to
+   *  peer_imported  per-client cap import records, forwarded to
+   *                 finish_export_inode_caps()
+   *  finished       receives all waiters taken off the inode
+   */
+  // clean
+  if (in->is_dirty())
+    in->mark_clean();
+  
+  // clear/unpin cached_by (we're no longer the authority)
+  in->clear_replica_map();
+  
+  // twiddle lock states for auth -> replica transition
+  in->authlock.export_twiddle();
+  in->linklock.export_twiddle();
+  in->dirfragtreelock.export_twiddle();
+  in->filelock.export_twiddle();
+  in->nestlock.export_twiddle();
+  in->xattrlock.export_twiddle();
+  in->snaplock.export_twiddle();
+  in->flocklock.export_twiddle();
+  in->policylock.export_twiddle();
+  
+  // no longer auth: drop the AUTH flag and take the export nonce
+  ceph_assert(in->is_auth());
+  in->state_clear(CInode::STATE_AUTH);
+  in->replica_nonce = CInode::EXPORT_NONCE;
+  
+  in->clear_dirty_rstat();
+
+  // no more auth subtree? clear scatter dirty
+  if (!in->has_subtree_root_dirfrag(mds->get_nodeid()))
+    in->clear_scatter_dirty();
+
+  in->clear_dirty_parent();
+
+  in->clear_clientwriteable();
+
+  in->clear_file_locks();
+
+  // waiters: hand every waiter on the inode to the caller
+  in->take_waiting(CInode::WAIT_ANY_MASK, finished);
+
+  in->finish_export();
+  
+  finish_export_inode_caps(in, peer, peer_imported);  // notify clients and drop the local caps
+}
+
+/**
+ * Encode the dirfrag @p dir and, recursively, every nested dirfrag that is
+ * not an export bound into @p exportbl.
+ *
+ * Each dirfrag gets its own versioned section holding the dirfrag id, the
+ * dir's export state, the dentry count and then one record per dentry
+ * (name, `last`, dentry state and either "N" for a null dentry, a remote
+ * link record, or "i" followed by a versioned inode section).  The nested
+ * dirfrags are appended after the section of their parent.
+ *
+ * @param exportbl                      buffer the encoding is appended to
+ * @param dir                           dirfrag to encode
+ * @param exported_client_map           filled by encode_export_inode()
+ * @param exported_client_metadata_map  filled by encode_export_inode()
+ * @param num_exported                  incremented once per dentry encoded
+ */
+void Migrator::encode_export_dir(bufferlist& exportbl,
+                                 CDir *dir,
+                                 map<client_t,entity_inst_t>& exported_client_map,
+                                 map<client_t,client_metadata_t>& exported_client_metadata_map,
+                                 uint64_t &num_exported)
+{
+  // Declared ahead of ENCODE_START because it is still needed after
+  // ENCODE_FINISH.
+  std::vector<CDir*> nested;
+
+  ENCODE_START(1, 1, exportbl);
+  dout(7) << *dir << " " << dir->get_num_head_items() << " head items" << dendl;
+
+  ceph_assert(dir->get_projected_version() == dir->get_version());
+
+#ifdef MDS_VERIFY_FRAGSTAT
+  if (dir->is_complete())
+    dir->verify_fragstat();
+#endif
+
+  // dir
+  dirfrag_t df = dir->dirfrag();
+  encode(df, exportbl);
+  dir->encode_export(exportbl);
+
+  __u32 nden = dir->items.size();
+  encode(nden, exportbl);
+
+  // dentries
+  for (auto &entry : *dir) {
+    CDentry *dn = entry.second;
+    CInode *in = dn->get_linkage()->get_inode();
+
+    ++num_exported;
+
+    // -- dentry: name, last, state
+    dout(7) << " exporting " << *dn << dendl;
+    encode(dn->get_name(), exportbl);
+    encode(dn->last, exportbl);
+    dn->encode_export(exportbl);
+
+    // -- what the dentry points to
+    if (dn->get_linkage()->is_null()) {
+      // null dentry
+      exportbl.append("N", 1);
+      continue;
+    }
+
+    if (dn->get_linkage()->is_remote()) {
+      // remote link
+      inodeno_t ino = dn->get_linkage()->get_remote_ino();
+      unsigned char d_type = dn->get_linkage()->get_remote_d_type();
+      auto& alternate_name = dn->alternate_name;
+      CDentry::encode_remote(ino, d_type, alternate_name, exportbl);
+      continue;
+    }
+
+    // primary link: inode dentry
+    exportbl.append("i", 1);
+
+    ENCODE_START(2, 1, exportbl);
+    // encode, and (update state for) export
+    encode_export_inode(in, exportbl, exported_client_map, exported_client_metadata_map);
+    encode(dn->alternate_name, exportbl);
+    ENCODE_FINISH(exportbl);
+
+    // directory?  remember the nested dirfrags that are not export bounds
+    for (const auto& frag : in->get_dirfrags()) {
+      if (frag->state_test(CDir::STATE_EXPORTBOUND))
+        continue;
+      ceph_assert(frag->get_dir_auth().first == CDIR_AUTH_PARENT);
+      nested.push_back(frag);  // it's ours, recurse (later)
+    }
+  }
+
+  ENCODE_FINISH(exportbl);
+
+  // subdirs
+  for (const auto &subdir : nested) {
+    encode_export_dir(exportbl, subdir, exported_client_map, exported_client_metadata_map, num_exported);
+  }
+}
+
+/**
+ * Turn the local copy of an exported dirfrag, and recursively everything
+ * nested below it, from auth into a replica.
+ *
+ * @param dir            dirfrag that has been exported (asserted to be auth)
+ * @param peer           rank the subtree was exported to
+ * @param peer_imported  cap import records per inode, as reported by the importer
+ * @param finished       receives all waiters taken off the dirfrags and inodes
+ * @param num_dentries   counter incremented once per dentry processed
+ */
+void Migrator::finish_export_dir(CDir *dir, mds_rank_t peer,
+                                 map<inodeno_t,map<client_t,Capability::Import> >& peer_imported,
+                                 MDSContext::vec& finished, int *num_dentries)
+{
+  dout(10) << *dir << dendl;
+
+  // release open_by
+  dir->clear_replica_map();
+
+  // mark
+  ceph_assert(dir->is_auth());
+  dir->state_clear(CDir::STATE_AUTH);
+  dir->remove_bloom();
+  dir->replica_nonce = CDir::EXPORT_NONCE;
+
+  if (dir->is_dirty()) {
+    dir->mark_clean();
+  }
+
+  // suck up all waiters
+  dir->take_waiting(CDir::WAIT_ANY_MASK, finished);
+
+  // pop
+  dir->finish_export();
+
+  // dentries
+  std::vector<CDir*> nested;
+  for (auto &entry : *dir) {
+    CDentry *dn = entry.second;
+    CInode *in = dn->get_linkage()->get_inode();
+
+    dn->finish_export();
+
+    // only primary dentries carry an inode (and possibly nested dirfrags)
+    if (dn->get_linkage()->is_primary()) {
+      finish_export_inode(in, peer, peer_imported[in->ino()], finished);
+
+      for (const auto& sub : in->get_nested_dirfrags())
+        nested.push_back(sub);
+    }
+
+    mdcache->touch_dentry_bottom(dn); // move dentry to tail of LRU
+    *num_dentries += 1;
+  }
+
+  // subdirs
+  for (const auto& sub : nested) {
+    finish_export_dir(sub, peer, peer_imported, finished, num_dentries);
+  }
+}
+
+/**
+ * Journal completion for the EExport entry submitted by
+ * Migrator::handle_export_ack(); continues with export_logged_finish().
+ */
+class C_MDS_ExportFinishLogged : public MigratorLogContext {
+public:
+  C_MDS_ExportFinishLogged(Migrator *m, CDir *d)
+    : MigratorLogContext(m), dir(d)
+  {}
+
+  void finish(int r) override {
+    mig->export_logged_finish(dir);
+  }
+
+private:
+  CDir *dir;  // root dirfrag of the export
+};
+
+
+/*
+ * i should get an export_ack from the export target.
+ *
+ * Decodes the caps the importer created, journals an EExport entry that
+ * covers the exported dirfrag and its bounds, flips the subtree authority
+ * to (peer, us) and moves the export to EXPORT_LOGGINGFINISH.  The export
+ * continues in export_logged_finish() once the entry is logged.
+ */
+void Migrator::handle_export_ack(const cref_t<MExportDirAck> &m)
+{
+  CDir *dir = mdcache->get_dirfrag(m->get_dirfrag());
+  mds_rank_t dest(m->get_source().num());
+  ceph_assert(dir);
+  ceph_assert(dir->is_frozen_tree_root());  // i'm exporting!
+
+  // yay!
+  dout(7) << *dir << dendl;
+
+  mds->hit_export_target(dest, -1);
+
+  map<CDir*,export_state_t>::iterator it = export_state.find(dir);
+  ceph_assert(it != export_state.end());
+  ceph_assert(it->second.state == EXPORT_EXPORTING);
+  ceph_assert(it->second.tid == m->get_tid());
+
+  auto bp = m->imported_caps.cbegin();
+  decode(it->second.peer_imported, bp);
+
+  it->second.state = EXPORT_LOGGINGFINISH;
+  // Kill point for failure injection.  Use ceph_assert like the other kill
+  // points in this file: a plain assert() may be compiled out under NDEBUG.
+  ceph_assert(g_conf()->mds_kill_export_at != 9);
+  set<CDir*> bounds;
+  mdcache->get_subtree_bounds(dir, bounds);
+
+  // log completion.
+  //  include export bounds, to ensure they're in the journal.
+  EExport *le = new EExport(mds->mdlog, dir, it->second.peer);
+  mds->mdlog->start_entry(le);
+
+  le->metablob.add_dir_context(dir, EMetaBlob::TO_ROOT);
+  le->metablob.add_dir(dir, false);
+  for (const auto &bound : bounds) {
+    le->get_bounds().insert(bound->dirfrag());
+    le->metablob.add_dir_context(bound);
+    le->metablob.add_dir(bound, false);
+  }
+
+  // list us second, them first.
+  // this keeps authority().first in sync with subtree auth state in the journal.
+  mdcache->adjust_subtree_auth(dir, it->second.peer, mds->get_nodeid());
+
+  // log export completion, then finish (unfreeze, trigger finish context, etc.)
+  mds->mdlog->submit_entry(le, new C_MDS_ExportFinishLogged(this, dir));
+  mds->mdlog->flush();
+  ceph_assert(g_conf()->mds_kill_export_at != 10);
+}
+
+/**
+ * Tell the bystanders of a cancelled export that authority stays with us.
+ *
+ * With nobody left in notify_ack_waiting the export goes straight to
+ * EXPORT_CANCELLED.  Otherwise @p dir is auth-pinned and every waiting rank
+ * gets an MExportDirNotify (old auth: us/peer, new auth: us/unknown)
+ * listing the bounds.
+ *
+ * @param dir     root dirfrag of the export
+ * @param stat    its export state; must be EXPORT_CANCELLING
+ * @param bounds  bounds of the exported subtree
+ */
+void Migrator::export_notify_abort(CDir *dir, export_state_t& stat, set<CDir*>& bounds)
+{
+  dout(7) << *dir << dendl;
+
+  ceph_assert(stat.state == EXPORT_CANCELLING);
+
+  const bool nobody_waiting = stat.notify_ack_waiting.empty();
+  if (nobody_waiting) {
+    stat.state = EXPORT_CANCELLED;
+    return;
+  }
+
+  dir->auth_pin(this);
+
+  for (const auto &rank : stat.notify_ack_waiting) {
+    auto notify = make_message<MExportDirNotify>(dir->dirfrag(), stat.tid, true,
+        pair<int,int>(mds->get_nodeid(), stat.peer),
+        pair<int,int>(mds->get_nodeid(), CDIR_AUTH_UNKNOWN));
+    for (const auto &bound : bounds)
+      notify->get_bounds().push_back(bound->dirfrag());
+    mds->send_message_mds(notify, rank);
+  }
+}
+
+/*
+ * this happens if the dest fails after i send the export data but before it is acked.
+ * that is, we don't know they safely received and logged it, so we reverse our changes
+ * and go on.
+ *
+ * Steps, in order: abort the export on every dirfrag, dentry and primary
+ * inode of the subtree; unpin the bounds; notify bystanders; restore
+ * ourselves as sole authority; process delayed expires; unfreeze the tree
+ * and try to merge the subtree; finally re-evaluate inodes that were
+ * flagged STATE_EVALSTALECAPS.
+ */
+void Migrator::export_reverse(CDir *dir, export_state_t& stat)
+{
+  dout(7) << *dir << dendl;
+
+  set<CInode*> to_eval;  // inodes that had STATE_EVALSTALECAPS set; handled at the end
+
+  set<CDir*> bounds;
+  mdcache->get_subtree_bounds(dir, bounds);
+
+  // remove exporting pins (breadth-first walk over the nested dirfrags)
+  std::deque<CDir*> rq;
+  rq.push_back(dir);
+  while (!rq.empty()) {
+    CDir *t = rq.front(); 
+    rq.pop_front();
+    t->abort_export();
+    for (auto &p : *t) {
+      CDentry *dn = p.second;
+      dn->abort_export();
+      if (!dn->get_linkage()->is_primary())
+	continue;
+      CInode *in = dn->get_linkage()->get_inode();
+      in->abort_export();
+      if (in->state_test(CInode::STATE_EVALSTALECAPS)) {
+	in->state_clear(CInode::STATE_EVALSTALECAPS);
+	to_eval.insert(in);
+      }
+      if (in->is_dir()) {
+        auto&& dirs = in->get_nested_dirfrags();
+        for (const auto& dir : dirs) {  // note: shadows the 'dir' parameter inside this loop only
+          rq.push_back(dir);
+        }
+      }
+    }
+  }
+  
+  // unpin bounds
+  for (auto bd : bounds) {
+    bd->put(CDir::PIN_EXPORTBOUND);
+    bd->state_clear(CDir::STATE_EXPORTBOUND);
+  }
+
+  // notify bystanders
+  export_notify_abort(dir, stat, bounds);
+
+  // make ourselves the sole authority of the subtree again
+  mdcache->adjust_subtree_auth(dir, mds->get_nodeid(), mds->get_nodeid());
+
+  // process delayed expires
+  mdcache->process_delayed_expire(dir);
+
+  // unfreeze tree, with possible subtree merge.
+  dir->unfreeze_tree();
+  mdcache->try_subtree_merge(dir);
+
+  // revoke/resume stale caps
+  for (auto in : to_eval) {
+    bool need_issue = false;  // set when at least one client cap is not stale
+    for (auto &p : in->client_caps) {
+      Capability *cap = &p.second;
+      if (!cap->is_stale()) {
+	need_issue = true;
+	break;
+      }
+    }
+    if (need_issue &&
+	(!in->is_auth() || !mds->locker->eval(in, CEPH_CAP_LOCKS)))  // eval() is only called when we are auth
+      mds->locker->issue_caps(in);
+  }
+
+  mdcache->show_cache();
+}
+
+
+/*
+ * once i get the ack, and logged the EExportFinish(true),
+ * send notifies (if any), otherwise go straight to finish.
+ *
+ * Every rank in notify_ack_waiting gets an MExportDirNotify (old auth:
+ * us/peer, new auth: peer/unknown) listing the bounds and the export moves
+ * to EXPORT_NOTIFYING.  While notify acks are outstanding the importer is
+ * sent a non-final MExportDirFinish, unless it is known to have failed.
+ */
+void Migrator::export_logged_finish(CDir *dir)
+{
+  dout(7) << *dir << dendl;
+
+  // NOTE(review): operator[] would default-insert a missing entry; callers
+  // presumably guarantee that the export state still exists -- confirm.
+  export_state_t& stat = export_state[dir];
+
+  // send notifies
+  set<CDir*> bounds;
+  mdcache->get_subtree_bounds(dir, bounds);
+
+  for (const auto &rank : stat.notify_ack_waiting) {
+    auto notify = make_message<MExportDirNotify>(dir->dirfrag(), stat.tid, true,
+        pair<int,int>(mds->get_nodeid(), stat.peer),
+        pair<int,int>(stat.peer, CDIR_AUTH_UNKNOWN));
+
+    for (const auto &bound : bounds)
+      notify->get_bounds().push_back(bound->dirfrag());
+
+    mds->send_message_mds(notify, rank);
+  }
+
+  // wait for notifyacks
+  stat.state = EXPORT_NOTIFYING;
+  // Kill point for failure injection.  Use ceph_assert like the other kill
+  // points in this file: a plain assert() may be compiled out under NDEBUG.
+  ceph_assert(g_conf()->mds_kill_export_at != 11);
+
+  // no notifies to wait for?
+  if (stat.notify_ack_waiting.empty()) {
+    export_finish(dir);  // skip notify/notify_ack stage.
+  } else {
+    // notify peer to send cap import messages to clients
+    if (!mds->is_cluster_degraded() ||
+        mds->mdsmap->is_clientreplay_or_active_or_stopping(stat.peer)) {
+      mds->send_message_mds(make_message<MExportDirFinish>(dir->dirfrag(), false, stat.tid), stat.peer);
+    } else {
+      dout(7) << "not sending MExportDirFinish, dest has failed" << dendl;
+    }
+  }
+}
+
+/*
+ * warning:
+ *  i'll get an ack from each bystander.
+ *  when i get them all, do the export.
+ * notify:
+ *  i'll get an ack from each bystander.
+ *  when i get them all, unfreeze and send the finish.
+ *
+ * the same message also acknowledges the notifies sent while cancelling an
+ * export, and those sent by an importer that is aborting an import.
+ */
+void Migrator::handle_export_notify_ack(const cref_t<MExportDirNotifyAck> &m)
+{
+  CDir *dir = mdcache->get_dirfrag(m->get_dirfrag());
+  const mds_rank_t from(m->get_source().num());
+  ceph_assert(dir);
+
+  mds->hit_export_target(from, -1);
+
+  auto exporting = export_state.find(dir);
+  if (exporting == export_state.end()) {
+    // not exporting this dirfrag: the ack may belong to an import being aborted
+    auto importing = import_state.find(dir->dirfrag());
+    if (importing == import_state.end())
+      return;
+
+    import_state_t& istat = importing->second;
+    if (istat.state != IMPORT_ABORTING)
+      return;
+
+    // reversing import
+    dout(7) << "from " << m->get_source()
+            << ": aborting import on " << *dir << dendl;
+    ceph_assert(istat.bystanders.count(from));
+    istat.bystanders.erase(from);
+    if (istat.bystanders.empty())
+      import_reverse_unfreeze(dir);
+    return;
+  }
+
+  // The erase() calls are part of the conditions on purpose: a branch is
+  // only taken when the sender was actually being waited for.
+  export_state_t& stat = exporting->second;
+  if (stat.state == EXPORT_WARNING &&
+      stat.warning_ack_waiting.erase(from)) {
+    // exporting. process warning.
+    dout(7) << "from " << m->get_source()
+            << ": exporting, processing warning on " << *dir << dendl;
+    if (stat.warning_ack_waiting.empty())
+      export_go(dir);     // start export.
+  } else if (stat.state == EXPORT_NOTIFYING &&
+             stat.notify_ack_waiting.erase(from)) {
+    // exporting. process notify.
+    dout(7) << "from " << m->get_source()
+            << ": exporting, processing notify on " << *dir << dendl;
+    if (stat.notify_ack_waiting.empty())
+      export_finish(dir);
+  } else if (stat.state == EXPORT_CANCELLING &&
+             m->get_new_auth().second == CDIR_AUTH_UNKNOWN && // not warning ack
+             stat.notify_ack_waiting.erase(from)) {
+    dout(7) << "from " << m->get_source()
+            << ": cancelling export, processing notify on " << *dir << dendl;
+    if (stat.notify_ack_waiting.empty()) {
+      export_cancel_finish(exporting);
+    }
+  }
+}
+
+/**
+ * Final step of an export on the exporting side.
+ *
+ * Sends the final MExportDirFinish to the importer (unless it is known to
+ * have failed), converts the local copy of the subtree from auth to
+ * replica, unpins the bounds, unfreezes the tree, queues the collected
+ * waiters, drops the export state together with its locks and finally
+ * starts the next queued export, if any.
+ *
+ * @param dir  root dirfrag of the export; if no export state exists any
+ *             more the call only logs and returns
+ */
+void Migrator::export_finish(CDir *dir)
+{
+  dout(3) << *dir << dendl;
+
+  // Kill point for failure injection.  Use ceph_assert like the other kill
+  // points in this file: a plain assert() may be compiled out under NDEBUG.
+  ceph_assert(g_conf()->mds_kill_export_at != 12);
+  map<CDir*,export_state_t>::iterator it = export_state.find(dir);
+  if (it == export_state.end()) {
+    dout(7) << "target must have failed, not sending final commit message.  export succeeded anyway." << dendl;
+    return;
+  }
+
+  // send finish/commit to new auth
+  if (!mds->is_cluster_degraded() ||
+      mds->mdsmap->is_clientreplay_or_active_or_stopping(it->second.peer)) {
+    mds->send_message_mds(make_message<MExportDirFinish>(dir->dirfrag(), true, it->second.tid), it->second.peer);
+  } else {
+    dout(7) << "not sending MExportDirFinish last, dest has failed" << dendl;
+  }
+  ceph_assert(g_conf()->mds_kill_export_at != 13);
+
+  // finish export (adjust local cache state)
+  int num_dentries = 0;
+  MDSContext::vec finished;
+  finish_export_dir(dir, it->second.peer,
+                    it->second.peer_imported, finished, &num_dentries);
+
+  ceph_assert(!dir->is_auth());
+  mdcache->adjust_subtree_auth(dir, it->second.peer);
+
+  // unpin bounds
+  set<CDir*> bounds;
+  mdcache->get_subtree_bounds(dir, bounds);
+  for (const auto &bd : bounds) {
+    bd->put(CDir::PIN_EXPORTBOUND);
+    bd->state_clear(CDir::STATE_EXPORTBOUND);
+  }
+
+  if (dir->state_test(CDir::STATE_AUXSUBTREE))
+    dir->state_clear(CDir::STATE_AUXSUBTREE);
+
+  // discard delayed expires
+  mdcache->discard_delayed_expire(dir);
+
+  dout(7) << "unfreezing" << dendl;
+
+  // unfreeze tree, with possible subtree merge.
+  //  (we do this _after_ removing EXPORTBOUND pins, to allow merges)
+  dir->unfreeze_tree();
+  mdcache->try_subtree_merge(dir);
+
+  // no more auth subtree? clear scatter dirty
+  if (!dir->get_inode()->is_auth() &&
+      !dir->get_inode()->has_subtree_root_dirfrag(mds->get_nodeid())) {
+    dir->get_inode()->clear_scatter_dirty();
+    // wake up scatter_nudge waiters
+    dir->get_inode()->take_waiting(CInode::WAIT_ANY_MASK, finished);
+  }
+
+  if (!finished.empty())
+    mds->queue_waiters(finished);
+
+  // Move what is still needed out of the state entry before erasing it;
+  // 'it' must not be used after the erase below.
+  MutationRef mut = std::move(it->second.mut);
+  auto parent = std::move(it->second.parent);
+  // remove from exporting list, clean up state
+  total_exporting_size -= it->second.approx_size;
+  export_state.erase(it);
+
+  ceph_assert(dir->state_test(CDir::STATE_EXPORTING));
+  dir->clear_exporting();
+
+  mdcache->show_subtrees();
+  audit();
+
+  mdcache->trim(num_dentries); // try trimming exported dentries
+
+  // send pending import_maps?
+  mdcache->maybe_send_pending_resolves();
+
+  // drop locks, unpin path
+  if (mut) {
+    mds->locker->drop_locks(mut.get());
+    mut->cleanup();
+  }
+
+  if (parent)
+    child_export_finish(parent, true);
+
+  maybe_do_queued_export();
+}
+
+
+
+/**
+ * Retry context: re-runs Migrator::handle_export_discover() for the stored
+ * message with started == true.
+ */
+class C_MDS_ExportDiscover : public MigratorContext {
+private:
+  cref_t<MExportDirDiscover> m;  // message to re-dispatch
+
+public:
+  C_MDS_ExportDiscover(Migrator *mig, const cref_t<MExportDirDiscover>& m)
+    : MigratorContext(mig), m(m)
+  {}
+
+  void finish(int r) override {
+    mig->handle_export_discover(m, true);
+  }
+};
+
+/**
+ * Factory producing a fresh C_MDS_ExportDiscover retry context for the
+ * stored message on every build() call.
+ */
+class C_MDS_ExportDiscoverFactory : public MDSContextFactory {
+private:
+  Migrator *mig;
+  cref_t<MExportDirDiscover> m;
+
+public:
+  C_MDS_ExportDiscoverFactory(Migrator *mig, cref_t<MExportDirDiscover> m)
+    : mig(mig), m(m)
+  {}
+
+  MDSContext *build() {
+    return new C_MDS_ExportDiscover(mig, m);
+  }
+};
+
+// ==========================================================
+// IMPORT
+
+/**
+ * Importer side: handle the exporter's MExportDirDiscover.
+ *
+ * Records the import (IMPORT_DISCOVERING), makes sure the base inode of the
+ * dirfrag is in cache (discovering its path if necessary), pins it with
+ * PIN_IMPORTING, moves to IMPORT_DISCOVERED and replies with an
+ * MExportDirDiscoverAck.  A NACK is sent when this rank is not active.
+ *
+ * @param m        the discover request
+ * @param started  false on first delivery; true when re-run from a retry
+ *                 context, in which case the import state must already
+ *                 exist and match (otherwise the message is dropped)
+ */
+void Migrator::handle_export_discover(const cref_t<MExportDirDiscover> &m, bool started)
+{
+  mds_rank_t from = m->get_source_mds();
+  ceph_assert(from != mds->get_nodeid());
+
+  dout(7) << m->get_path() << dendl;
+
+  // note import state
+  dirfrag_t df = m->get_dirfrag();
+
+  if (!mds->is_active()) {
+    dout(7) << " not active, send NACK " << dendl;
+    mds->send_message_mds(make_message<MExportDirDiscoverAck>(df, m->get_tid(), false), from);
+    return;
+  }
+
+  // only start discovering on this message once.
+  import_state_t *p_state;
+  map<dirfrag_t,import_state_t>::iterator it = import_state.find(df);
+  if (!started) {
+    ceph_assert(it == import_state.end());
+    p_state = &import_state[df];
+    p_state->state = IMPORT_DISCOVERING;
+    p_state->peer = from;
+    p_state->tid = m->get_tid();
+  } else {
+    // am i retrying after ancient path_traverse results?
+    if (it == import_state.end() ||
+        it->second.peer != from ||
+        it->second.tid != m->get_tid()) {
+      dout(7) << " dropping obsolete message" << dendl;
+      return;
+    }
+    ceph_assert(it->second.state == IMPORT_DISCOVERING);
+    p_state = &it->second;
+  }
+
+  C_MDS_ExportDiscoverFactory cf(this, m);
+  if (!mdcache->is_open()) {
+    dout(10) << " waiting for root" << dendl;
+    mds->mdcache->wait_for_open(cf.build());
+    return;
+  }
+
+  // Kill point for failure injection.  Use ceph_assert like the export-side
+  // kill points in this file: a plain assert() may be compiled out under
+  // NDEBUG.
+  ceph_assert(g_conf()->mds_kill_import_at != 1);
+
+  // do we have it?
+  CInode *in = mdcache->get_inode(m->get_dirfrag().ino);
+  if (!in) {
+    // must discover it!
+    filepath fpath(m->get_path());
+    vector<CDentry*> trace;
+    MDRequestRef null_ref;
+    int r = mdcache->path_traverse(null_ref, cf, fpath,
+                                   MDS_TRAVERSE_DISCOVER | MDS_TRAVERSE_PATH_LOCKED,
+                                   &trace);
+    if (r > 0) return;  // traversal in progress; a context built by cf re-runs us
+    if (r < 0) {
+      dout(7) << "failed to discover or not dir " << m->get_path() << ", NAK" << dendl;
+      ceph_abort();    // this shouldn't happen if the auth pins its path properly!!!!
+    }
+
+    ceph_abort(); // this shouldn't happen; the get_inode above would have succeeded.
+  }
+
+  // yay
+  dout(7) << "have " << df << " inode " << *in << dendl;
+
+  p_state->state = IMPORT_DISCOVERED;
+
+  // pin inode in the cache (for now)
+  ceph_assert(in->is_dir());
+  in->get(CInode::PIN_IMPORTING);
+
+  // reply
+  dout(7) << " sending export_discover_ack on " << *in << dendl;
+  mds->send_message_mds(make_message<MExportDirDiscoverAck>(df, m->get_tid()), p_state->peer);
+  ceph_assert(g_conf()->mds_kill_import_at != 2);
+}
+
+/**
+ * Undo an import that is still in IMPORT_DISCOVERING: only the import state
+ * entry for @p df has to be dropped.
+ */
+void Migrator::import_reverse_discovering(dirfrag_t df)
+{
+  auto entry = import_state.find(df);
+  if (entry != import_state.end())
+    import_state.erase(entry);
+}
+
+/**
+ * Undo an import that reached IMPORT_DISCOVERED.
+ *
+ * @param df    dirfrag whose import state is dropped
+ * @param diri  base inode that was pinned with PIN_IMPORTING
+ */
+void Migrator::import_reverse_discovered(dirfrag_t df, CInode *diri)
+{
+  // release the pin taken in handle_export_discover(), then forget the import
+  diri->put(CInode::PIN_IMPORTING);
+
+  import_state.erase(df);
+}
+
+/**
+ * Undo an import that is in IMPORT_PREPPING.
+ *
+ * @param dir   base dirfrag of the import
+ * @param stat  its import state; bound_ls names the bound dirfrags
+ */
+void Migrator::import_reverse_prepping(CDir *dir, import_state_t& stat)
+{
+  // resolve the recorded bound dirfrag ids to the dirfrags we have in cache
+  set<CDir*> bound_dirs;
+  mdcache->map_dirfrag_set(stat.bound_ls, bound_dirs);
+
+  import_remove_pins(dir, bound_dirs);
+  import_reverse_final(dir);
+}
+
+/**
+ * Importer side: the exporter cancelled the export.  Roll the import back
+ * according to the state it has reached; a missing import or any state
+ * other than DISCOVERING / DISCOVERED / PREPPING / PREPPED aborts.
+ *
+ * @param m  the cancel message naming the dirfrag
+ */
+void Migrator::handle_export_cancel(const cref_t<MExportDirCancel> &m)
+{
+  dout(7) << "on " << m->get_dirfrag() << dendl;
+  dirfrag_t df = m->get_dirfrag();
+
+  auto it = import_state.find(df);
+  if (it == import_state.end()) {
+    ceph_abort_msg("got export_cancel in weird state");
+  }
+
+  import_state_t& stat = it->second;
+
+  if (stat.state == IMPORT_DISCOVERING) {
+    import_reverse_discovering(df);
+    return;
+  }
+
+  if (stat.state == IMPORT_DISCOVERED) {
+    CInode *in = mdcache->get_inode(df.ino);
+    ceph_assert(in);
+    import_reverse_discovered(df, in);
+    return;
+  }
+
+  if (stat.state == IMPORT_PREPPING) {
+    CDir *dir = mdcache->get_dirfrag(df);
+    ceph_assert(dir);
+    import_reverse_prepping(dir, stat);
+    return;
+  }
+
+  if (stat.state == IMPORT_PREPPED) {
+    CDir *dir = mdcache->get_dirfrag(df);
+    ceph_assert(dir);
+    set<CDir*> bounds;
+    mdcache->get_subtree_bounds(dir, bounds);
+    import_remove_pins(dir, bounds);
+    // adjust auth back to the exportor
+    mdcache->adjust_subtree_auth(dir, stat.peer);
+    import_reverse_unfreeze(dir);
+    return;
+  }
+
+  ceph_abort_msg("got export_cancel in weird state");
+}
+
+/**
+ * Retry context: re-runs Migrator::handle_export_prep() for the stored
+ * message with did_assim == true.
+ */
+class C_MDS_ExportPrep : public MigratorContext {
+private:
+  cref_t<MExportDirPrep> m;  // message to re-dispatch
+
+public:
+  C_MDS_ExportPrep(Migrator *mig, const cref_t<MExportDirPrep>& m)
+    : MigratorContext(mig), m(m)
+  {}
+
+  void finish(int r) override {
+    mig->handle_export_prep(m, true);
+  }
+};
+
+/**
+ * Factory producing a fresh C_MDS_ExportPrep retry context for the stored
+ * message on every build() call.
+ */
+class C_MDS_ExportPrepFactory : public MDSContextFactory {
+private:
+  Migrator *mig;
+  cref_t<MExportDirPrep> m;
+
+public:
+  C_MDS_ExportPrepFactory(Migrator *mig, cref_t<MExportDirPrep> m)
+    : mig(mig), m(m)
+  {}
+
+  MDSContext *build() {
+    return new C_MDS_ExportPrep(mig, m);
+  }
+};
+
+/*
+ * Decode one trace from an MExportDirPrep into the cache.
+ *
+ * The trace begins with a dirfrag and a start character:
+ *   'd'  the dirfrag is already in our cache; continue from it
+ *   'f'  a replica of the dirfrag follows and is decoded under its inode
+ *   '-'  nothing precedes the dentry/inode/dir sequence
+ * After that comes a repeated (dentry, inode[, dir]) sequence which is
+ * decoded as replicas until the iterator is exhausted.  Contexts produced by
+ * the replica decoders are collected in 'finished'.
+ */
+void Migrator::decode_export_prep_trace(bufferlist::const_iterator& blp, mds_rank_t oldauth, MDSContext::vec& finished)
+{
+  DECODE_START(1, blp);
+  dirfrag_t df;
+  decode(df, blp);
+  char start;
+  decode(start, blp);
+  dout(10) << " trace from " << df << " start " << start << dendl;
+
+  CDir *cur = nullptr;
+  switch (start) {
+  case 'd':
+    cur = mdcache->get_dirfrag(df);
+    ceph_assert(cur);
+    dout(10) << "  had " << *cur << dendl;
+    break;
+
+  case 'f': {
+    CInode *base = mdcache->get_inode(df.ino);
+    ceph_assert(base);
+    dout(10) << "  had " << *base << dendl;
+    mdcache->decode_replica_dir(cur, blp, base, oldauth, finished);
+    dout(10) << "  added " << *cur << dendl;
+    break;
+  }
+
+  case '-':
+    // nothing
+    break;
+
+  default:
+    ceph_abort_msg("unrecognized start char");
+  }
+
+  while (!blp.end()) {
+    CDentry *dentry = nullptr;
+    mdcache->decode_replica_dentry(dentry, blp, cur, finished);
+    dout(10) << "  added " << *dentry << dendl;
+
+    CInode *child = nullptr;
+    mdcache->decode_replica_inode(child, blp, dentry, finished);
+    dout(10) << "  added " << *child << dendl;
+
+    // a trace may end on an inode, with no dir following it
+    if (blp.end())
+      break;
+
+    mdcache->decode_replica_dir(cur, blp, child, oldauth, finished);
+    dout(10) << "  added " << *cur << dendl;
+  }
+
+  DECODE_FINISH(blp);
+}
+
+/*
+ * Handle an MExportDirPrep from the exporting MDS.
+ *
+ * Runs with did_assim == false for a newly received message and is
+ * re-entered with did_assim == true (through C_MDS_ExportPrep) after
+ * bounding dirfrags that had to be opened remotely have been fetched.
+ *
+ * First pass: the base dir replica and the traces to the bounds are decoded
+ * into the cache, the PIN_IMPORTING pin moves from the base inode to the
+ * dir, and the import enters IMPORT_PREPPING.
+ *
+ * Once every bound is open (and we are active), the filelock/nestlock
+ * wrlocks are forced, the subtree is marked with ambiguous auth
+ * (oldauth, us), frozen, and the import enters IMPORT_PREPPED.  Otherwise
+ * the prep is reversed.  Either way an MExportDirPrepAck carrying the
+ * success flag is sent, unless we returned early to wait for bounds or
+ * dropped an obsolete message.
+ */
+void Migrator::handle_export_prep(const cref_t<MExportDirPrep> &m, bool did_assim)
+{
+  mds_rank_t oldauth = mds_rank_t(m->get_source().num());
+  ceph_assert(oldauth != mds->get_nodeid());
+
+  CDir *dir;
+  CInode *diri;
+  MDSContext::vec finished;
+
+  // assimilate root dir.
+  map<dirfrag_t,import_state_t>::iterator it = import_state.find(m->get_dirfrag());
+  if (!did_assim) {
+    ceph_assert(it != import_state.end());
+    ceph_assert(it->second.state == IMPORT_DISCOVERED);
+    ceph_assert(it->second.peer == oldauth);
+    diri = mdcache->get_inode(m->get_dirfrag().ino);
+    ceph_assert(diri);
+    auto p = m->basedir.cbegin();
+    mdcache->decode_replica_dir(dir, p, diri, oldauth, finished);
+    dout(7) << "on " << *dir << " (first pass)" << dendl;
+  } else {
+    // re-entry after waiting: the import may have been cancelled or replaced
+    // in the meantime, so validate against the current state before going on
+    if (it == import_state.end() ||
+	it->second.peer != oldauth ||
+	it->second.tid != m->get_tid()) {
+      dout(7) << "obsolete message, dropping" << dendl;
+      return;
+    }
+    ceph_assert(it->second.state == IMPORT_PREPPING);
+    ceph_assert(it->second.peer == oldauth);
+
+    dir = mdcache->get_dirfrag(m->get_dirfrag());
+    ceph_assert(dir);
+    dout(7) << "on " << *dir << " (subsequent pass)" << dendl;
+    diri = dir->get_inode();
+  }
+  ceph_assert(dir->is_auth() == false);
+
+  mdcache->show_subtrees();
+
+  // build import bound map
+  map<inodeno_t, fragset_t> import_bound_fragset;
+  for (const auto &bound : m->get_bounds()) {
+    dout(10) << " bound " << bound << dendl;
+    import_bound_fragset[bound.ino].insert_raw(bound.frag);
+  }
+  // assimilate contents?
+  if (!did_assim) {
+    dout(7) << "doing assim on " << *dir << dendl;
+
+    // change import state
+    it->second.state = IMPORT_PREPPING;
+    it->second.bound_ls = m->get_bounds();
+    it->second.bystanders = m->get_bystanders();
+    ceph_assert(g_conf()->mds_kill_import_at != 3);
+
+    // bystander list
+    dout(7) << "bystanders are " << it->second.bystanders << dendl;
+
+    // move pin to dir
+    diri->put(CInode::PIN_IMPORTING);
+    dir->get(CDir::PIN_IMPORTING);  
+    dir->state_set(CDir::STATE_IMPORTING);
+
+    // assimilate traces to exports
+    // each trace is: df ('-' | ('f' dir | 'd') dentry inode (dir dentry inode)*)
+    for (const auto &bl : m->traces) {
+      auto blp = bl.cbegin();
+      decode_export_prep_trace(blp, oldauth, finished);
+    }
+
+    // make bound sticky
+    for (map<inodeno_t,fragset_t>::iterator p = import_bound_fragset.begin();
+	 p != import_bound_fragset.end();
+	 ++p) {
+      p->second.simplify();
+      CInode *in = mdcache->get_inode(p->first);
+      ceph_assert(in);
+      in->get_stickydirs();
+      dout(7) << " set stickydirs on bound inode " << *in << dendl;
+    }
+
+  } else {
+    dout(7) << " not doing assim on " << *dir << dendl;
+  }
+
+  MDSGatherBuilder gather(g_ceph_context);
+
+  // contexts collected by the replica decoders above
+  if (!finished.empty())
+    mds->queue_waiters(finished);
+
+
+  bool success = true;
+  if (mds->is_active()) {
+    // open all bounds
+    set<CDir*> import_bounds;
+    for (map<inodeno_t,fragset_t>::iterator p = import_bound_fragset.begin();
+	 p != import_bound_fragset.end();
+	 ++p) {
+      CInode *in = mdcache->get_inode(p->first);
+      ceph_assert(in);
+
+      // map fragset into a frag_t list, based on the inode fragtree
+      frag_vec_t leaves;
+      for (const auto& frag : p->second) {
+	in->dirfragtree.get_leaves_under(frag, leaves);
+      }
+      dout(10) << " bound inode " << p->first << " fragset " << p->second << " maps to " << leaves << dendl;
+
+      for (const auto& leaf : leaves) {
+	CDir *bound = mdcache->get_dirfrag(dirfrag_t(p->first, leaf));
+	if (!bound) {
+	  // not in cache yet: open it remotely and come back when it arrives
+	  dout(7) << "  opening bounding dirfrag " << leaf << " on " << *in << dendl;
+	  mdcache->open_remote_dirfrag(in, leaf, gather.new_sub());
+	  continue;
+	}
+
+	if (!bound->state_test(CDir::STATE_IMPORTBOUND)) {
+	  dout(7) << "  pinning import bound " << *bound << dendl;
+	  bound->get(CDir::PIN_IMPORTBOUND);
+	  bound->state_set(CDir::STATE_IMPORTBOUND);
+	} else {
+	  dout(7) << "  already pinned import bound " << *bound << dendl;
+	}
+	import_bounds.insert(bound);
+      }
+    }
+
+    // some bounds are still being opened; this function is re-run with
+    // did_assim=true when the gather completes
+    if (gather.has_subs()) {
+      C_MDS_ExportPrepFactory cf(this, m);
+      gather.set_finisher(cf.build());
+      gather.activate();
+      return;
+    }
+
+    dout(7) << " all ready, noting auth and freezing import region" << dendl;
+
+    if (!mdcache->is_readonly() &&
+	// for pinning scatter gather. loner has a higher chance to get wrlock
+	diri->filelock.can_wrlock(diri->get_loner()) &&
+	diri->nestlock.can_wrlock(diri->get_loner())) {
+      it->second.mut = new MutationImpl();
+      // force some locks.  hacky.
+      mds->locker->wrlock_force(&dir->inode->filelock, it->second.mut);
+      mds->locker->wrlock_force(&dir->inode->nestlock, it->second.mut);
+
+      // note that i am an ambiguous auth for this subtree.
+      // specify bounds, since the exporter explicitly defines the region.
+      mdcache->adjust_bounded_subtree_auth(dir, import_bounds,
+					 pair<int,int>(oldauth, mds->get_nodeid()));
+      mdcache->verify_subtree_bounds(dir, import_bounds);
+      // freeze.
+      dir->_freeze_tree();
+      // note new state
+      it->second.state = IMPORT_PREPPED;
+    } else {
+      dout(7) << " couldn't acquire all needed locks, failing. " << *dir << dendl;
+      success = false;
+    }
+  } else {
+    dout(7) << " not active, failing. " << *dir << dendl;
+    success = false;
+  }
+
+  // NOTE: import_reverse_prepping() ends in import_reverse_final(), which
+  // erases the import_state entry; 'it' must not be used after this point.
+  if (!success)
+    import_reverse_prepping(dir, it->second);
+
+  // ok!
+  dout(7) << " sending export_prep_ack on " << *dir << dendl;
+  mds->send_message(make_message<MExportDirPrepAck>(dir->dirfrag(), success, m->get_tid()), m->get_connection());
+
+  ceph_assert(g_conf()->mds_kill_import_at != 4);
+}
+
+
+
+
+// Log completion context for the EImportStart entry: once it fires it calls
+// Migrator::import_logged_start() with the session map gathered while the
+// import was decoded.  The dir is held with PIN_PTRWAITER from construction
+// until finish() has run.
+class C_MDS_ImportDirLoggedStart : public MigratorLogContext {
+  dirfrag_t df;     // dirfrag id, captured at construction
+  CDir *dir;        // import root
+  mds_rank_t from;  // rank we are importing from
+
+public:
+  map<client_t,pair<Session*,uint64_t> > imported_session_map;
+
+  C_MDS_ImportDirLoggedStart(Migrator *m, CDir *d, mds_rank_t f)
+    : MigratorLogContext(m),
+      df(d->dirfrag()),
+      dir(d),
+      from(f)
+  {
+    dir->get(CDir::PIN_PTRWAITER);
+  }
+
+  void finish(int r) override
+  {
+    mig->import_logged_start(df, dir, from, imported_session_map);
+    dir->put(CDir::PIN_PTRWAITER);
+  }
+};
+
+/*
+ * Handle an MExportDir: the old auth is shipping us the contents of a
+ * subtree whose import we have already prepped.
+ *
+ * Requires an import_state entry in IMPORT_PREPPED whose tid and peer match
+ * the message.  Starts an EImportStart journal entry, adjusts subtree auth
+ * with ourselves listed first, prepares the exported client sessions,
+ * decodes every exported dirfrag into the cache and the journal entry,
+ * moves the import to IMPORT_LOGGINGSTART and submits the entry.
+ * C_MDS_ImportDirLoggedStart picks up from there once the entry is logged.
+ */
+void Migrator::handle_export_dir(const cref_t<MExportDir> &m)
+{
+  // Kill point.  Uses ceph_assert like the other mds_kill_import_at checks
+  // in this file; a plain assert() is compiled out when NDEBUG is defined,
+  // which would silently disable this kill point.
+  ceph_assert(g_conf()->mds_kill_import_at != 5);
+  CDir *dir = mdcache->get_dirfrag(m->dirfrag);
+  ceph_assert(dir);
+
+  mds_rank_t oldauth = mds_rank_t(m->get_source().num());
+  dout(7) << "importing " << *dir << " from " << oldauth << dendl;
+
+  ceph_assert(!dir->is_auth());
+  ceph_assert(dir->freeze_tree_state);
+
+  map<dirfrag_t,import_state_t>::iterator it = import_state.find(m->dirfrag);
+  ceph_assert(it != import_state.end());
+  ceph_assert(it->second.state == IMPORT_PREPPED);
+  ceph_assert(it->second.tid == m->get_tid());
+  ceph_assert(it->second.peer == oldauth);
+
+  // the imported frag must be a leaf in the inode's fragtree
+  if (!dir->get_inode()->dirfragtree.is_leaf(dir->get_frag()))
+    dir->get_inode()->dirfragtree.force_to_leaf(g_ceph_context, dir->get_frag());
+
+  mdcache->show_subtrees();
+
+  C_MDS_ImportDirLoggedStart *onlogged = new C_MDS_ImportDirLoggedStart(this, dir, oldauth);
+
+  // start the journal entry
+  EImportStart *le = new EImportStart(mds->mdlog, dir->dirfrag(), m->bounds, oldauth);
+  mds->mdlog->start_entry(le);
+
+  le->metablob.add_dir_context(dir);
+
+  // adjust auth (list us _first_)
+  mdcache->adjust_subtree_auth(dir, mds->get_nodeid(), oldauth);
+
+  // new client sessions, open these after we journal
+  // include imported sessions in EImportStart
+  auto cmp = m->client_map.cbegin();
+  map<client_t,entity_inst_t> client_map;
+  map<client_t,client_metadata_t> client_metadata_map;
+  decode(client_map, cmp);
+  decode(client_metadata_map, cmp);
+  ceph_assert(cmp.end());
+  le->cmapv = mds->server->prepare_force_open_sessions(client_map, client_metadata_map,
+                                                       onlogged->imported_session_map);
+  encode(client_map, le->client_map, mds->mdsmap->get_up_features());
+  encode(client_metadata_map, le->client_map);
+
+  // export_data is a sequence of encoded dirfrags; decode until exhausted
+  auto blp = m->export_data.cbegin();
+  int num_imported_inodes = 0;
+  while (!blp.end()) {
+    decode_import_dir(blp,
+                      oldauth,
+                      dir,                 // import root
+                      le,
+                      mds->mdlog->get_current_segment(),
+                      it->second.peer_exports,
+                      it->second.updated_scatterlocks,
+                      num_imported_inodes);
+  }
+  dout(10) << " " << m->bounds.size() << " imported bounds" << dendl;
+
+  // include bounds in EImportStart
+  set<CDir*> import_bounds;
+  for (const auto &bound : m->bounds) {
+    CDir *bd = mdcache->get_dirfrag(bound);
+    ceph_assert(bd);
+    le->metablob.add_dir(bd, false);  // note that parent metadata is already in the event
+    import_bounds.insert(bd);
+  }
+  mdcache->verify_subtree_bounds(dir, import_bounds);
+
+  // adjust popularity
+  mds->balancer->add_import(dir);
+
+  dout(7) << "did " << *dir << dendl;
+
+  // note state
+  it->second.state = IMPORT_LOGGINGSTART;
+  // kill point; ceph_assert for the same reason as at the top
+  ceph_assert(g_conf()->mds_kill_import_at != 6);
+
+  // log it
+  mds->mdlog->submit_entry(le, onlogged);
+  mds->mdlog->flush();
+
+  // some stats
+  if (mds->logger) {
+    mds->logger->inc(l_mds_imported);
+    mds->logger->inc(l_mds_imported_inodes, num_imported_inodes);
+  }
+}
+
+
+/*
+ * Import helper: release the pins an import has taken.
+ *  called by import_finish, and import_reverse and friends.
+ *
+ * Drops PIN_IMPORTING / STATE_IMPORTING on the root dir, the stickydirs
+ * reference on each distinct bounding inode listed in the import state's
+ * bound_ls, and PIN_IMPORTBOUND / STATE_IMPORTBOUND on the bounding
+ * dirfrags in 'bounds'.
+ */
+void Migrator::import_remove_pins(CDir *dir, set<CDir*>& bounds)
+{
+  import_state_t& stat = import_state[dir->dirfrag()];
+
+  // root
+  dir->put(CDir::PIN_IMPORTING);
+  dir->state_clear(CDir::STATE_IMPORTING);
+
+  // bounding inodes; several bounds may share an inode, handle each once
+  set<inodeno_t> seen;
+  for (const auto& bound_df : stat.bound_ls) {
+    if (!seen.insert(bound_df.ino).second)
+      continue;
+    CInode *in = mdcache->get_inode(bound_df.ino);
+    ceph_assert(in);
+    in->put_stickydirs();
+  }
+
+  // bounding dirfrags
+  if (stat.state == IMPORT_PREPPING) {
+    // while prepping, only the bounds flagged STATE_IMPORTBOUND are unpinned
+    for (CDir *bd : bounds) {
+      if (!bd->state_test(CDir::STATE_IMPORTBOUND))
+        continue;
+      bd->put(CDir::PIN_IMPORTBOUND);
+      bd->state_clear(CDir::STATE_IMPORTBOUND);
+    }
+  } else if (stat.state >= IMPORT_PREPPED) {
+    // from PREPPED onwards every bound is expected to be flagged
+    for (CDir *bd : bounds) {
+      ceph_assert(bd->state_test(CDir::STATE_IMPORTBOUND));
+      bd->put(CDir::PIN_IMPORTBOUND);
+      bd->state_clear(CDir::STATE_IMPORTBOUND);
+    }
+  }
+}
+
+// Context that carries a batch of contexts and, when finished, pushes them
+// to the front of the MDS waiter queue.
+class C_MDC_QueueContexts : public MigratorContext {
+public:
+  MDSContext::vec contexts;
+
+  C_MDC_QueueContexts(Migrator *m)
+    : MigratorContext(m)
+  {}
+
+  void finish(int r) override
+  {
+    // execute contexts immediately after 'this' context
+    get_mds()->queue_waiters_front(contexts);
+  }
+};
+
+/*
+ * note: this does the full work of reversing an import and cleaning up
+ *  state.
+ * called by both handle_mds_failure and by handle_resolve (if we are
+ *  a survivor coping with an exporter failure+recovery).
+ *
+ * Steps: mark the import IMPORT_ABORTING, drop the import pins, give
+ * authority back to the peer, strip auth/dirty/replica state from every
+ * dentry and inode inside the subtree (not descending past the bounds),
+ * undo cap imports if the import had reached IMPORT_ACKING, journal an
+ * EImportFinish(false), and notify bystanders.
+ */
+void Migrator::import_reverse(CDir *dir)
+{
+  dout(7) << *dir << dendl;
+
+  import_state_t& stat = import_state[dir->dirfrag()];
+  // Remember how far the import had progressed before overwriting the state.
+  // The cap cleanup further down must run only for imports that had reached
+  // IMPORT_ACKING; testing stat.state there could never match, because it is
+  // already IMPORT_ABORTING by then.
+  const auto prev_state = stat.state;
+  stat.state = IMPORT_ABORTING;
+
+  set<CDir*> bounds;
+  mdcache->get_subtree_bounds(dir, bounds);
+
+  // remove pins
+  import_remove_pins(dir, bounds);
+
+  // update auth, with possible subtree merge.
+  ceph_assert(dir->is_subtree_root());
+  if (mds->is_resolve())
+    mdcache->trim_non_auth_subtree(dir);
+
+  mdcache->adjust_subtree_auth(dir, stat.peer);
+
+  // waiters taken from inodes below are re-queued when the tree unfreezes
+  auto fin = new C_MDC_QueueContexts(this);
+  if (!dir->get_inode()->is_auth() &&
+      !dir->get_inode()->has_subtree_root_dirfrag(mds->get_nodeid())) {
+    dir->get_inode()->clear_scatter_dirty();
+    // wake up scatter_nudge waiters
+    dir->get_inode()->take_waiting(CInode::WAIT_ANY_MASK, fin->contexts);
+  }
+
+  int num_dentries = 0;
+  // adjust auth bits.
+  std::deque<CDir*> todo;
+  todo.push_back(dir);
+  while (!todo.empty()) {
+    CDir *cur = todo.front();
+    todo.pop_front();
+
+    // dir
+    cur->abort_import();
+
+    for (auto &p : *cur) {
+      CDentry *dn = p.second;
+
+      // dentry
+      dn->state_clear(CDentry::STATE_AUTH);
+      dn->clear_replica_map();
+      dn->set_replica_nonce(CDentry::EXPORT_NONCE);
+      if (dn->is_dirty())
+        dn->mark_clean();
+
+      // inode?
+      if (dn->get_linkage()->is_primary()) {
+        CInode *in = dn->get_linkage()->get_inode();
+        in->state_clear(CDentry::STATE_AUTH);
+        in->clear_replica_map();
+        in->set_replica_nonce(CInode::EXPORT_NONCE);
+        if (in->is_dirty())
+          in->mark_clean();
+        in->clear_dirty_rstat();
+        if (!in->has_subtree_root_dirfrag(mds->get_nodeid())) {
+          in->clear_scatter_dirty();
+          in->take_waiting(CInode::WAIT_ANY_MASK, fin->contexts);
+        }
+
+        in->clear_dirty_parent();
+
+        in->clear_clientwriteable();
+        in->state_clear(CInode::STATE_NEEDSRECOVER);
+
+        in->authlock.clear_gather();
+        in->linklock.clear_gather();
+        in->dirfragtreelock.clear_gather();
+        in->filelock.clear_gather();
+
+        in->clear_file_locks();
+
+        // non-bounding dir?  descend into it; bounds mark the subtree's edge
+        auto&& dfs = in->get_dirfrags();
+        for (const auto& subdir : dfs) {
+          if (bounds.count(subdir) == 0)
+            todo.push_back(subdir);
+        }
+      }
+
+      mdcache->touch_dentry_bottom(dn); // move dentry to tail of LRU
+      ++num_dentries;
+    }
+  }
+
+  dir->add_waiter(CDir::WAIT_UNFREEZE, fin);
+
+  if (prev_state == IMPORT_ACKING) {
+    // remove imported caps
+    for (auto& exported : stat.peer_exports) {
+      CInode *in = exported.first;
+      for (auto& client_cap : exported.second) {
+        Capability *cap = in->get_client_cap(client_cap.first);
+        if (!cap) {
+          ceph_assert(!stat.session_map.count(client_cap.first));
+          continue;
+        }
+        if (cap->is_importing())
+          in->remove_client_cap(client_cap.first);
+        else
+          cap->clear_clientwriteable();
+      }
+      in->put(CInode::PIN_IMPORTINGCAPS);
+    }
+    for (auto& p : stat.session_map) {
+      Session *session = p.second.first;
+      session->dec_importing();
+    }
+  }
+
+  // log our failure
+  mds->mdlog->start_submit_entry(new EImportFinish(dir, false));	// log failure
+
+  mdcache->trim(num_dentries); // try trimming dentries
+
+  // notify bystanders; wait in aborting state
+  import_notify_abort(dir, bounds);
+}
+
+/*
+ * Send every bystander of this import an MExportDirNotify describing the
+ * auth change (peer, us) -> (us, CDIR_AUTH_UNKNOWN), listing the dirfrags
+ * of 'bounds' as the subtree bounds.
+ */
+void Migrator::import_notify_finish(CDir *dir, set<CDir*>& bounds)
+{
+  dout(7) << *dir << dendl;
+
+  import_state_t& stat = import_state[dir->dirfrag()];
+  for (const auto& bystander : stat.bystanders) {
+    // one message per bystander
+    auto notify = make_message<MExportDirNotify>(dir->dirfrag(), stat.tid, false,
+        pair<int,int>(stat.peer, mds->get_nodeid()),
+        pair<int,int>(mds->get_nodeid(), CDIR_AUTH_UNKNOWN));
+    for (CDir *bd : bounds)
+      notify->get_bounds().push_back(bd->dirfrag());
+    mds->send_message_mds(notify, bystander);
+  }
+}
+
+/*
+ * Tell the bystanders of an import that it is being aborted.
+ *
+ * While the cluster is degraded, bystanders that are not
+ * clientreplay/active/stopping are removed from the bystander set instead
+ * of being notified.  Each remaining bystander gets an MExportDirNotify
+ * for the auth change (peer, us) -> (peer, CDIR_AUTH_UNKNOWN).  If no
+ * bystanders remain the reverse is completed immediately through
+ * import_reverse_unfreeze().
+ */
+void Migrator::import_notify_abort(CDir *dir, set<CDir*>& bounds)
+{
+  dout(7) << *dir << dendl;
+
+  import_state_t& stat = import_state[dir->dirfrag()];
+  for (auto p = stat.bystanders.begin(); p != stat.bystanders.end(); ) {
+    if (mds->is_cluster_degraded() &&
+        !mds->mdsmap->is_clientreplay_or_active_or_stopping(*p)) {
+      // this can happen if both exporter and bystander fail in the same mdsmap epoch
+      p = stat.bystanders.erase(p);
+      continue;
+    }
+    auto notify = make_message<MExportDirNotify>(dir->dirfrag(), stat.tid, true,
+        mds_authority_t(stat.peer, mds->get_nodeid()),
+        mds_authority_t(stat.peer, CDIR_AUTH_UNKNOWN));
+    for (CDir *bd : bounds)
+      notify->get_bounds().push_back(bd->dirfrag());
+    mds->send_message_mds(notify, *p);
+    ++p;
+  }
+  if (stat.bystanders.empty()) {
+    dout(7) << "no bystanders, finishing reverse now" << dendl;
+    import_reverse_unfreeze(dir);
+  } else {
+    // Kill point.  ceph_assert, consistent with the other
+    // mds_kill_import_at checks in this file; a plain assert() is compiled
+    // out when NDEBUG is defined.
+    ceph_assert(g_conf()->mds_kill_import_at != 10);
+  }
+}
+
+/*
+ * Last stage of reversing an import: discard delayed expires for the
+ * (non-auth) dir, unfreeze the tree, attempt a subtree merge if dir is
+ * still a subtree root, then clean up through import_reverse_final().
+ */
+void Migrator::import_reverse_unfreeze(CDir *dir)
+{
+  dout(7) << *dir << dendl;
+  ceph_assert(!dir->is_auth());
+
+  mdcache->discard_delayed_expire(dir);
+  dir->unfreeze_tree();
+
+  const bool still_root = dir->is_subtree_root();
+  if (still_root)
+    mdcache->try_subtree_merge(dir);
+
+  import_reverse_final(dir);
+}
+
+/*
+ * Erase the import state for dir (which must exist) and release the locks
+ * held by the mutation that was attached to it, if any.
+ */
+void Migrator::import_reverse_final(CDir *dir)
+{
+  dout(7) << *dir << dendl;
+
+  // clean up
+  auto state_it = import_state.find(dir->dirfrag());
+  ceph_assert(state_it != import_state.end());
+
+  // keep a reference to the mutation; the state entry goes away right here
+  MutationRef held = state_it->second.mut;
+  import_state.erase(state_it);
+
+  // send pending import_maps?
+  mdcache->maybe_send_pending_resolves();
+
+  if (held) {
+    mds->locker->drop_locks(held.get());
+    held->cleanup();
+  }
+
+  mdcache->show_subtrees();
+  //audit();  // this fails, bc we munge up the subtree map during handle_import_map (resolve phase)
+}
+
+
+
+
+/*
+ * Continuation run once the EImportStart entry has been logged
+ * (called from C_MDS_ImportDirLoggedStart::finish).
+ *
+ * If the import is no longer in IMPORT_LOGGINGSTART it is taken to have
+ * been aborted: the prepared sessions are finished and nothing else is
+ * done.  Otherwise the import moves to IMPORT_ACKING, the client sessions
+ * are force-opened, cap imports are set up with the client messages
+ * deferred (peer = MDS_RANK_NONE), and an MExportDirAck carrying the
+ * imported caps is sent to the old auth 'from'.
+ */
+void Migrator::import_logged_start(dirfrag_t df, CDir *dir, mds_rank_t from,
+                                   map<client_t,pair<Session*,uint64_t> >& imported_session_map)
+{
+  dout(7) << *dir << dendl;
+
+  map<dirfrag_t, import_state_t>::iterator it = import_state.find(dir->dirfrag());
+  if (it == import_state.end() ||
+      it->second.state != IMPORT_LOGGINGSTART) {
+    dout(7) << "import " << df << " must have aborted" << dendl;
+    mds->server->finish_force_open_sessions(imported_session_map);
+    return;
+  }
+
+  // note state
+  it->second.state = IMPORT_ACKING;
+
+  // Kill point.  ceph_assert, consistent with the other mds_kill_import_at
+  // checks in this file; a plain assert() is compiled out when NDEBUG is
+  // defined.
+  ceph_assert(g_conf()->mds_kill_import_at != 7);
+
+  // force open client sessions and finish cap import
+  mds->server->finish_force_open_sessions(imported_session_map, false);
+
+  map<inodeno_t,map<client_t,Capability::Import> > imported_caps;
+  for (auto& exported : it->second.peer_exports) {
+    // parameter 'peer' is NONE, delay sending cap import messages to client
+    finish_import_inode_caps(exported.first, MDS_RANK_NONE, true, imported_session_map,
+                             exported.second, imported_caps[exported.first->ino()]);
+  }
+
+  // the import state keeps the session map from here on
+  it->second.session_map.swap(imported_session_map);
+
+  // send notify's etc.
+  dout(7) << "sending ack for " << *dir << " to old auth mds." << from << dendl;
+
+  // test surviving observer of a failed migration that did not complete
+  //assert(dir->replica_map.size() < 2 || mds->get_nodeid() != 0);
+
+  auto ack = make_message<MExportDirAck>(dir->dirfrag(), it->second.tid);
+  encode(imported_caps, ack->imported_caps);
+
+  mds->send_message_mds(ack, from);
+  // kill point; ceph_assert for the same reason as above
+  ceph_assert(g_conf()->mds_kill_import_at != 8);
+
+  mdcache->show_subtrees();
+}
+
+/*
+ * Handle an MExportDirFinish from the exporter: check that we have import
+ * state for the dirfrag with a matching tid, then run import_finish()
+ * without bystander notification, passing on the message's is_last() flag.
+ */
+void Migrator::handle_export_finish(const cref_t<MExportDirFinish> &m)
+{
+  const dirfrag_t df = m->get_dirfrag();
+
+  CDir *dir = mdcache->get_dirfrag(df);
+  ceph_assert(dir);
+  dout(7) << *dir << (m->is_last() ? " last" : "") << dendl;
+
+  auto state_it = import_state.find(df);
+  ceph_assert(state_it != import_state.end());
+  ceph_assert(state_it->second.tid == m->get_tid());
+
+  import_finish(dir, false, m->is_last());
+}
+
+/*
+ * Complete an import.
+ *
+ * Valid in IMPORT_ACKING or IMPORT_FINISHING.  In ACKING we take sole
+ * authority of the subtree and finish the cap imports recorded in
+ * peer_exports for clients that have a session in session_map.
+ *
+ * If 'last' is false the import only advances to IMPORT_FINISHING and
+ * returns.  Otherwise the pins are removed, the import state is erased, an
+ * EImportFinish(true) is journaled, the tree is unfrozen, locks held by the
+ * import's mutation are dropped and imported caps are re-evaluated.
+ *
+ * @param dir     root of the imported subtree
+ * @param notify  send MExportDirNotify to bystanders before removing pins
+ * @param last    whether this is the final finish for the import
+ */
+void Migrator::import_finish(CDir *dir, bool notify, bool last)
+{
+  dout(7) << *dir << dendl;
+
+  map<dirfrag_t,import_state_t>::iterator it = import_state.find(dir->dirfrag());
+  ceph_assert(it != import_state.end());
+  ceph_assert(it->second.state == IMPORT_ACKING || it->second.state == IMPORT_FINISHING);
+
+  if (it->second.state == IMPORT_ACKING) {
+    ceph_assert(dir->is_auth());
+    mdcache->adjust_subtree_auth(dir, mds->get_nodeid(), mds->get_nodeid());
+  }
+
+  // log finish
+  ceph_assert(g_conf()->mds_kill_import_at != 9);
+
+  if (it->second.state == IMPORT_ACKING) {
+    for (map<CInode*, map<client_t,Capability::Export> >::iterator p = it->second.peer_exports.begin();
+	p != it->second.peer_exports.end();
+	++p) {
+      CInode *in = p->first;
+      ceph_assert(in->is_auth());
+      for (map<client_t,Capability::Export>::iterator q = p->second.begin();
+	  q != p->second.end();
+	  ++q) {
+	// clients without an entry in session_map are skipped
+	auto r = it->second.session_map.find(q->first);
+	if (r == it->second.session_map.end())
+	  continue;
+
+	Session *session = r->second.first;
+	Capability *cap = in->get_client_cap(q->first);
+	ceph_assert(cap);
+	cap->merge(q->second, true);
+	cap->clear_importing();
+	mdcache->do_cap_import(session, in, cap, q->second.cap_id, q->second.seq,
+				    q->second.mseq - 1, it->second.peer, CEPH_CAP_FLAG_AUTH);
+      }
+      // the per-client export records are consumed; the inode key stays so
+      // the PIN_IMPORTINGCAPS pin can be dropped further down
+      p->second.clear();
+      in->replica_caps_wanted = 0;
+    }
+    for (auto& p : it->second.session_map) {
+      Session *session = p.second.first;
+      session->dec_importing();
+    }
+  }
+
+  if (!last) {
+    ceph_assert(it->second.state == IMPORT_ACKING);
+    it->second.state = IMPORT_FINISHING;
+    return;
+  }
+
+  // remove pins
+  set<CDir*> bounds;
+  mdcache->get_subtree_bounds(dir, bounds);
+
+  if (notify)
+    import_notify_finish(dir, bounds);
+
+  import_remove_pins(dir, bounds);
+
+  // take the export map out of the import state before the entry is erased
+  map<CInode*, map<client_t,Capability::Export> > peer_exports;
+  it->second.peer_exports.swap(peer_exports);
+
+  // clear import state (we're done!)
+  MutationRef mut = it->second.mut;
+  import_state.erase(it);
+
+  mds->mdlog->start_submit_entry(new EImportFinish(dir, true));
+
+  // process delayed expires
+  mdcache->process_delayed_expire(dir);
+
+  // unfreeze tree, with possible subtree merge.
+  dir->unfreeze_tree();
+  mdcache->try_subtree_merge(dir);
+
+  mdcache->show_subtrees();
+  //audit();  // this fails, bc we munge up the subtree map during handle_import_map (resolve phase)
+
+  if (mut) {
+    mds->locker->drop_locks(mut.get());
+    mut->cleanup();
+  }
+
+  // re-eval imported caps
+  for (map<CInode*, map<client_t,Capability::Export> >::iterator p = peer_exports.begin();
+       p != peer_exports.end();
+       ++p) {
+    if (p->first->is_auth())
+      mds->locker->eval(p->first, CEPH_CAP_LOCKS, true);
+    p->first->put(CInode::PIN_IMPORTINGCAPS);
+  }
+
+  // send pending import_maps?
+  mdcache->maybe_send_pending_resolves();
+
+  // did i just import mydir?
+  if (dir->ino() == MDS_INO_MDSDIR(mds->get_nodeid()))
+    mdcache->populate_mydir();
+
+  // is it empty?
+  if (dir->get_num_head_items() == 0 &&
+      !dir->inode->is_auth()) {
+    // reexport!
+    export_empty_import(dir);
+  }
+}
+
+/*
+ * Decode one exported inode from blp and attach it to dentry dn.
+ *
+ * The inode is looked up by (ino, last) and created if it is not in the
+ * cache.  Its exported state and caps are decoded, it is linked as dn's
+ * primary inode if it is not already, and it is added to the cache if it
+ * was newly created.  Dirty filelock/dirfragtreelock are recorded in
+ * updated_scatterlocks, oldauth becomes a replica, and snaplock/policylock
+ * are re-evaluated when stable but not in LOCK_SYNC.
+ */
+void Migrator::decode_import_inode(CDentry *dn, bufferlist::const_iterator& blp,
+                                   mds_rank_t oldauth, LogSegment *ls,
+                                   map<CInode*, map<client_t,Capability::Export> >& peer_exports,
+                                   list<ScatterLock*>& updated_scatterlocks)
+{
+  // both are needed after the DECODE block closes
+  CInode *in;
+  bool added = false;
+
+  DECODE_START(1, blp);
+  dout(15) << " on " << *dn << dendl;
+
+  inodeno_t ino;
+  snapid_t last;
+  decode(ino, blp);
+  decode(last, blp);
+
+  in = mdcache->get_inode(ino, last);
+  added = (in == nullptr);
+  if (added)
+    in = new CInode(mds->mdcache, true, 2, last);
+
+  // state after link  -- or not!  -sage
+  in->decode_import(blp, ls);  // cap imports are noted for later action
+
+  // caps
+  decode_import_inode_caps(in, true, blp, peer_exports);
+
+  DECODE_FINISH(blp);
+
+  // link before state  -- or not!  -sage
+  CInode *linked = dn->get_linkage()->get_inode();
+  if (linked != in) {
+    ceph_assert(!linked);
+    dn->dir->link_primary_inode(dn, in);
+  }
+
+  if (in->is_dir())
+    dn->dir->pop_lru_subdirs.push_back(&in->item_pop_lru);
+
+  // add inode?
+  if (!added) {
+    dout(10) << "  had " << *in << dendl;
+  } else {
+    mdcache->add_inode(in);
+    dout(10) << "added " << *in << dendl;
+  }
+
+  if (in->get_inode()->is_dirty_rstat())
+    in->mark_dirty_rstat();
+
+  if (!in->get_inode()->client_ranges.empty())
+    in->mark_clientwriteable();
+
+  // clear if dirtyscattered, since we're going to journal this
+  //  but not until we _actually_ finish the import...
+  auto note_if_dirty = [&](ScatterLock *lock) {
+    if (lock->is_dirty()) {
+      updated_scatterlocks.push_back(lock);
+      mds->locker->mark_updated_scatterlock(lock);
+    }
+  };
+  note_if_dirty(&in->filelock);
+  note_if_dirty(&in->dirfragtreelock);
+
+  // adjust replica list
+  //assert(!in->is_replica(oldauth));  // not true on failed export
+  in->add_replica(oldauth, CInode::EXPORT_NONCE);
+  if (in->is_replica(mds->get_nodeid()))
+    in->remove_replica(mds->get_nodeid());
+
+  if (in->snaplock.is_stable() &&
+      in->snaplock.get_state() != LOCK_SYNC) {
+    mds->locker->try_eval(&in->snaplock, NULL);
+  }
+
+  if (in->policylock.is_stable() &&
+      in->policylock.get_state() != LOCK_SYNC) {
+    mds->locker->try_eval(&in->policylock, NULL);
+  }
+}
+
+/*
+ * Decode the exported client caps for inode 'in'.
+ *
+ * With auth_cap set, the per-mds wanted map follows the cap map; our own
+ * rank is removed from it and the rest is stored on the inode.  If there
+ * are exported caps, or (for auth caps) the inode wants more than
+ * CEPH_CAP_PIN, the cap map is moved into peer_exports[in] and the inode is
+ * pinned with PIN_IMPORTINGCAPS.
+ */
+void Migrator::decode_import_inode_caps(CInode *in, bool auth_cap,
+                                        bufferlist::const_iterator &blp,
+                                        map<CInode*, map<client_t,Capability::Export> >& peer_exports)
+{
+  DECODE_START(1, blp);
+
+  map<client_t,Capability::Export> exported_caps;
+  decode(exported_caps, blp);
+
+  if (auth_cap) {
+    mempool::mds_co::compact_map<int32_t,int32_t> wanted_by_mds;
+    decode(wanted_by_mds, blp);
+    wanted_by_mds.erase(mds->get_nodeid());
+    in->set_mds_caps_wanted(wanted_by_mds);
+  }
+
+  const bool have_caps = !exported_caps.empty();
+  if (have_caps ||
+      (auth_cap && (in->get_caps_wanted() & ~CEPH_CAP_PIN))) {
+    peer_exports[in].swap(exported_caps);
+    in->get(CInode::PIN_IMPORTINGCAPS);
+  }
+
+  DECODE_FINISH(blp);
+}
+
+/*
+ * Set up the caps on 'in' for the clients listed in export_map.
+ *
+ * Clients without an entry in session_map only get an empty record in
+ * import_map.  For the others a cap is looked up or added; a newly added
+ * cap is marked importing when peer is negative.  With a non-negative peer
+ * the exported cap state is merged and the cap import is carried out
+ * immediately, and PIN_IMPORTINGCAPS is dropped at the end.
+ *
+ * For auth caps, export_map is walked side by side with the inode's
+ * projected client_ranges: a client with a range gets its cap marked
+ * clientwriteable, and any range whose client is not matched that way
+ * flags the inode STATE_NEEDSRECOVER.
+ */
+void Migrator::finish_import_inode_caps(CInode *in, mds_rank_t peer, bool auth_cap,
+                                        const map<client_t,pair<Session*,uint64_t> >& session_map,
+                                        const map<client_t,Capability::Export> &export_map,
+                                        map<client_t,Capability::Import> &import_map)
+{
+  const auto& client_ranges = in->get_projected_inode()->client_ranges;
+  auto range_it = client_ranges.cbegin();
+  const auto range_end = client_ranges.cend();
+  bool needs_recover = false;
+
+  for (auto& exported : export_map) {
+    const auto& client = exported.first;
+    const auto& cap_export = exported.second;
+    dout(10) << "for client." << client << " on " << *in << dendl;
+
+    auto sess_it = session_map.find(client);
+    if (sess_it == session_map.end()) {
+      dout(10) << " no session for client." << client << dendl;
+      (void)import_map[client];
+      continue;
+    }
+    Session *session = sess_it->second.first;
+
+    Capability *cap = in->get_client_cap(client);
+    if (!cap) {
+      cap = in->add_client_cap(client, session);
+      if (peer < 0)
+        cap->mark_importing();
+    }
+
+    if (auth_cap) {
+      // ranges belonging to clients ordered before this one have no
+      // counterpart in export_map
+      for (; range_it != range_end && range_it->first < client; ++range_it)
+        needs_recover = true;
+      if (range_it != range_end && range_it->first == client) {
+        cap->mark_clientwriteable();
+        ++range_it;
+      }
+    }
+
+    // Always ask exporter mds to send cap export messages for auth caps.
+    // For non-auth caps, ask exporter mds to send cap export messages to
+    // clients who haven't opened sessions. The cap export messages will
+    // make clients open sessions.
+    if (auth_cap || !session->get_connection()) {
+      Capability::Import& im = import_map[client];
+      im.cap_id = cap->get_cap_id();
+      if (auth_cap)
+        im.mseq = cap_export.mseq;
+      else
+        im.mseq = cap->get_mseq();
+      im.issue_seq = cap->get_last_seq() + 1;
+    }
+
+    if (peer >= 0) {
+      cap->merge(cap_export, auth_cap);
+      mdcache->do_cap_import(session, in, cap, cap_export.cap_id,
+                             cap_export.seq, cap_export.mseq - 1, peer,
+                             auth_cap ? CEPH_CAP_FLAG_AUTH : CEPH_CAP_FLAG_RELEASE);
+    }
+  }
+
+  if (auth_cap) {
+    // leftover ranges belong to clients that were not in export_map
+    if (range_it != range_end)
+      needs_recover = true;
+    if (needs_recover)
+      in->state_set(CInode::STATE_NEEDSRECOVER);
+  }
+
+  if (peer >= 0) {
+    in->replica_caps_wanted = 0;
+    in->put(CInode::PIN_IMPORTINGCAPS);
+  }
+}
+
+/*
+ * Decode one exported dirfrag, with all of its dentries, from blp.
+ *
+ * The dirfrag is opened under its inode (which must be in the cache), its
+ * exported state is assimilated, oldauth is recorded as a replica, and every
+ * contained dentry is decoded along with what it points to: nothing ('N'),
+ * a remote link ('L'/'l') or a primary inode ('I'/'i').  Waiters on the dir
+ * are re-queued to fire on unfreeze.
+ *
+ * @param blp                   iterator over the export payload
+ * @param oldauth               rank we are importing from
+ * @param import_root           root of the import; its freeze_tree_state is
+ *                              shared with dirfrags that have none yet
+ * @param le                    journal entry to add the dir/dentries to (may
+ *                              be null, but must be set when a primary inode
+ *                              is decoded)
+ * @param ls                    log segment passed to the decode_import calls
+ * @param peer_exports          collects exported caps per inode
+ * @param updated_scatterlocks  collects dirty scatterlocks of imported inodes
+ * @param num_imported          incremented once per dentry decoded here; the
+ *                              caller accumulates it across calls
+ */
+void Migrator::decode_import_dir(bufferlist::const_iterator& blp,
+                                 mds_rank_t oldauth,
+                                 CDir *import_root,
+                                 EImportStart *le,
+                                 LogSegment *ls,
+                                 map<CInode*,map<client_t,Capability::Export> >& peer_exports,
+                                 list<ScatterLock*>& updated_scatterlocks, int &num_imported)
+{
+  DECODE_START(1, blp);
+  // set up dir
+  dirfrag_t df;
+  decode(df, blp);
+
+  CInode *diri = mdcache->get_inode(df.ino);
+  ceph_assert(diri);
+  CDir *dir = diri->get_or_open_dirfrag(mds->mdcache, df.frag);
+  ceph_assert(dir);
+
+  dout(7) << *dir << dendl;
+
+  if (!dir->freeze_tree_state) {
+    ceph_assert(dir->get_version() == 0);
+    dir->freeze_tree_state = import_root->freeze_tree_state;
+  }
+
+  // assimilate state
+  dir->decode_import(blp, ls);
+
+  // adjust replica list
+  //assert(!dir->is_replica(oldauth));    // not true on failed export
+  dir->add_replica(oldauth, CDir::EXPORT_NONCE);
+  if (dir->is_replica(mds->get_nodeid()))
+    dir->remove_replica(mds->get_nodeid());
+
+  // add to journal entry
+  if (le)
+    le->metablob.add_import_dir(dir);
+
+  // NOTE: no local counter is declared here.  A local named num_imported
+  // inside this DECODE block would shadow the reference parameter, and the
+  // caller's count would never be updated.
+
+  // take all waiters on this dir
+  // NOTE: a pass of imported data is guaranteed to get all of my waiters because
+  // a replica's presence in my cache implies/forces its presence in authority's.
+  MDSContext::vec waiters;
+  dir->take_waiting(CDir::WAIT_ANY_MASK, waiters);
+  for (auto c : waiters)
+    dir->add_waiter(CDir::WAIT_UNFREEZE, c);  // UNFREEZE will get kicked both on success or failure
+
+  dout(15) << "doing contents" << dendl;
+
+  // contents
+  __u32 nden;
+  decode(nden, blp);
+
+  for (; nden>0; nden--) {
+    num_imported++;
+
+    // dentry
+    string dname;
+    snapid_t last;
+    decode(dname, blp);
+    decode(last, blp);
+
+    CDentry *dn = dir->lookup_exact_snap(dname, last);
+    if (!dn)
+      dn = dir->add_null_dentry(dname, 1, last);
+
+    dn->decode_import(blp, ls);
+
+    dn->add_replica(oldauth, CDentry::EXPORT_NONCE);
+    if (dn->is_replica(mds->get_nodeid()))
+      dn->remove_replica(mds->get_nodeid());
+
+    // dentry lock in unreadable state can block path traverse
+    if (dn->lock.get_state() != LOCK_SYNC)
+      mds->locker->try_eval(&dn->lock, NULL);
+
+    dout(15) << " got " << *dn << dendl;
+
+    // points to...
+    char icode;
+    decode(icode, blp);
+
+    if (icode == 'N') {
+      // null dentry
+      ceph_assert(dn->get_linkage()->is_null());
+
+      // fall thru
+    }
+    else if (icode == 'L' || icode == 'l') {
+      // remote link
+      inodeno_t ino;
+      unsigned char d_type;
+      mempool::mds_co::string alternate_name;
+
+      CDentry::decode_remote(icode, ino, d_type, alternate_name, blp);
+
+      if (dn->get_linkage()->is_remote()) {
+        // already linked: what we have must agree with what was exported
+        ceph_assert(dn->get_linkage()->get_remote_ino() == ino);
+        ceph_assert(dn->get_alternate_name() == alternate_name);
+      } else {
+        dir->link_remote_inode(dn, ino, d_type);
+        dn->set_alternate_name(std::move(alternate_name));
+      }
+    }
+    else if (icode == 'I' || icode == 'i') {
+      // inode
+      ceph_assert(le);
+      if (icode == 'i') {
+        // 'i': the inode is wrapped in its own versioned section, and the
+        // dentry's alternate_name follows it
+        DECODE_START(2, blp);
+        decode_import_inode(dn, blp, oldauth, ls,
+                            peer_exports, updated_scatterlocks);
+        ceph_assert(!dn->is_projected());
+        decode(dn->alternate_name, blp);
+        DECODE_FINISH(blp);
+      } else {
+        decode_import_inode(dn, blp, oldauth, ls,
+                            peer_exports, updated_scatterlocks);
+      }
+    }
+
+    // add dentry to journal entry
+    if (le)
+      le->metablob.add_import_dentry(dn);
+  }
+
+#ifdef MDS_VERIFY_FRAGSTAT
+  if (dir->is_complete())
+    dir->verify_fragstat();
+#endif
+
+  dir->inode->maybe_export_pin();
+
+  dout(7) << " done " << *dir << dendl;
+  DECODE_FINISH(blp);
+}
+
+
+
+
+
+// authority bystander
+
+/**
+ * Bystander handler for an MExportDirNotify message.
+ *
+ * The message is dropped unless this MDS is in clientreplay, active or
+ * stopping state.  If the named dirfrag is in our cache and its current
+ * authority equals the message's old_auth, the bounded subtree authority
+ * is switched to new_auth and a subtree merge is attempted.  In every
+ * other case only a log line is emitted.  An MExportDirNotifyAck is sent
+ * back to the sender only when the message asks for one.
+ *
+ * @param m the notify message
+ */
+void Migrator::handle_export_notify(const cref_t<MExportDirNotify> &m)
+{
+  if (!(mds->is_clientreplay() || mds->is_active() || mds->is_stopping())) {
+    return;
+  }
+
+  // may be null: the dirfrag is not necessarily in our cache
+  CDir *dir = mdcache->get_dirfrag(m->get_dirfrag());
+
+  mds_rank_t from = mds_rank_t(m->get_source().num());
+  mds_authority_t old_auth = m->get_old_auth();
+  mds_authority_t new_auth = m->get_new_auth();
+  
+  if (!dir) {
+    dout(7) << old_auth << " -> " << new_auth
+	    << " on missing dir " << m->get_dirfrag() << dendl;
+  } else if (dir->authority() != old_auth) {
+    // our view of the authority does not match the sender's; leave it alone
+    dout(7) << "old_auth was " << dir->authority() 
+	    << " != " << old_auth << " -> " << new_auth
+	    << " on " << *dir << dendl;
+  } else {
+    dout(7) << old_auth << " -> " << new_auth
+	    << " on " << *dir << dendl;
+    // adjust auth
+    set<CDir*> have;
+    mdcache->map_dirfrag_set(m->get_bounds(), have);
+    mdcache->adjust_bounded_subtree_auth(dir, have, new_auth);
+    
+    // induce a merge?
+    mdcache->try_subtree_merge(dir);
+  }
+  
+  // send ack
+  if (m->wants_ack()) {
+    mds->send_message_mds(make_message<MExportDirNotifyAck>(m->get_dirfrag(), m->get_tid(), m->get_new_auth()), from);
+  } else {
+    // aborted.  no ack.
+    dout(7) << "no ack requested" << dendl;
+  }
+}
+
+/** cap exports **/
+/**
+ * Send the client caps held on a non-auth inode to that inode's authority.
+ *
+ * The inode must have caps, must not be auth here, must not have ambiguous
+ * auth and must not already be in STATE_EXPORTINGCAPS (all asserted).  The
+ * caps are encoded into an MExportCaps message which is sent to the first
+ * rank of the inode's authority pair.
+ *
+ * @param in inode whose caps are exported
+ */
+void Migrator::export_caps(CInode *in)
+{
+  const mds_rank_t auth_rank = in->authority().first;
+  dout(7) << "to mds." << auth_rank << " " << *in << dendl;
+
+  ceph_assert(in->is_any_caps());
+  ceph_assert(!in->is_auth());
+  ceph_assert(!in->is_ambiguous_auth());
+  ceph_assert(!in->state_test(CInode::STATE_EXPORTINGCAPS));
+
+  auto msg = make_message<MExportCaps>();
+  msg->ino = in->ino();
+
+  // second argument 'false': these are not auth caps
+  encode_export_inode_caps(in, false,
+                           msg->cap_bl,
+                           msg->client_map,
+                           msg->client_metadata_map);
+
+  mds->send_message_mds(msg, auth_rank);
+}
+
+/**
+ * Handle the importer's acknowledgement of an earlier MExportCaps.
+ *
+ * Nothing is done if the inode is no longer cached.  Otherwise, for every
+ * client listed in the ack whose local cap still carries the cap id
+ * recorded in the ack, the client is sent a CEPH_CAP_OP_EXPORT message
+ * pointing at the peer's cap and the local cap is removed.  Finally file
+ * caps are re-requested and the inode's cap locks are re-evaluated.
+ *
+ * @param ack the acknowledgement message
+ */
+void Migrator::handle_export_caps_ack(const cref_t<MExportCapsAck> &ack)
+{
+  mds_rank_t from = ack->get_source().num();
+  CInode *in = mdcache->get_inode(ack->ino);
+  if (!in)
+    return;
+
+  ceph_assert(!in->is_auth());
+
+  dout(10) << *ack << " from "
+	   << ack->get_source() << " on " << *in << dendl;
+
+  map<client_t,Capability::Import> imported_caps;
+  map<client_t,uint64_t> caps_ids;
+  auto p = ack->cap_bl.cbegin();
+  decode(imported_caps, p);
+  decode(caps_ids, p);
+
+  for (auto& [client, imported] : imported_caps) {
+    Capability *cap = in->get_client_cap(client);
+    // skip clients whose cap is gone or has a different cap id than acked
+    if (!cap || cap->get_cap_id() != caps_ids.at(client))
+      continue;
+
+    dout(7) << " telling client." << client
+	    << " exported caps on " << *in << dendl;
+    auto m = make_message<MClientCaps>(CEPH_CAP_OP_EXPORT, in->ino(), 0,
+				       cap->get_cap_id(), cap->get_mseq(),
+				       mds->get_osd_epoch_barrier());
+    m->set_cap_peer(imported.cap_id, imported.issue_seq, imported.mseq, from, 0);
+    mds->send_message_client_counted(m, client);
+
+    in->remove_client_cap(client);
+  }
+
+  mds->locker->request_inode_file_caps(in);
+  mds->locker->try_eval(in, CEPH_CAP_LOCKS);
+}
+
+/**
+ * Handle an MGatherCaps message: export our caps on the named inode if we
+ * hold any, are not its auth, its auth is not ambiguous and a cap export
+ * is not already flagged on it.  Unknown inodes are ignored.
+ *
+ * @param m the gather request
+ */
+void Migrator::handle_gather_caps(const cref_t<MGatherCaps> &m)
+{
+  CInode *in = mdcache->get_inode(m->ino);
+  if (in == nullptr)
+    return;
+
+  dout(10) << *m << " from " << m->get_source()
+           << " on " << *in << dendl;
+
+  // any of these means there is nothing (or nothing safe) to export
+  if (!in->is_any_caps() ||
+      in->is_auth() ||
+      in->is_ambiguous_auth() ||
+      in->state_test(CInode::STATE_EXPORTINGCAPS))
+    return;
+
+  export_caps(in);
+}
+
+/**
+ * Log-completion context used by the cap import path.
+ *
+ * It carries the inode, the source rank, and the two maps that
+ * Migrator::logged_import_caps() needs once the journal entry it is
+ * attached to has been logged.
+ */
+class C_M_LoggedImportCaps : public MigratorLogContext {
+  CInode *in;
+  mds_rank_t from;
+
+public:
+  // public so the creator can fill them in before submitting the context
+  map<client_t,pair<Session*,uint64_t> > imported_session_map;
+  map<CInode*, map<client_t,Capability::Export> > peer_exports;
+
+  C_M_LoggedImportCaps(Migrator *migrator, CInode *inode, mds_rank_t source)
+    : MigratorLogContext(migrator),
+      in(inode),
+      from(source)
+  {}
+
+  void finish(int r) override
+  {
+    mig->logged_import_caps(in, from, imported_session_map, peer_exports);
+  }
+};
+
+/**
+ * Handle an MExportCaps message: import caps a peer holds on an inode we
+ * are auth for.
+ *
+ * The inode must be cached and auth (asserted).  It is auth-pinned, the
+ * client sessions named in the message are prepared for force-open, the
+ * caps are decoded into the completion context, and an ESessions entry is
+ * journaled.  The import is completed by logged_import_caps() via
+ * C_M_LoggedImportCaps once the entry is logged.
+ *
+ * @param ex the cap export message
+ */
+void Migrator::handle_export_caps(const cref_t<MExportCaps> &ex)
+{
+  dout(10) << *ex << " from " << ex->get_source() << dendl;
+  CInode *in = mdcache->get_inode(ex->ino);
+  
+  ceph_assert(in);
+  ceph_assert(in->is_auth());
+
+  // FIXME
+  // NOTE(review): the message is silently dropped when the inode cannot be
+  // auth-pinned; no reply is sent to the exporter on this path.
+  if (!in->can_auth_pin()) {
+    return;
+  }
+
+  // released by logged_import_caps()
+  in->auth_pin(this);
+
+  // local copies: the message is const, and the maps are moved into the
+  // ESessions entry below
+  map<client_t,entity_inst_t> client_map{ex->client_map};
+  map<client_t,client_metadata_t> client_metadata_map{ex->client_metadata_map};
+
+  C_M_LoggedImportCaps *finish = new C_M_LoggedImportCaps(
+      this, in, mds_rank_t(ex->get_source().num()));
+
+  version_t pv = mds->server->prepare_force_open_sessions(client_map, client_metadata_map,
+							  finish->imported_session_map);
+  // decode new caps
+  auto blp = ex->cap_bl.cbegin();
+  decode_import_inode_caps(in, false, blp, finish->peer_exports);
+  ceph_assert(!finish->peer_exports.empty());   // thus, inode is pinned.
+
+  // journal open client sessions
+  ESessions *le = new ESessions(pv, std::move(client_map),
+				std::move(client_metadata_map));
+  mds->mdlog->start_submit_entry(le, finish);
+  mds->mdlog->flush();
+}
+
+
+/**
+ * Second half of a cap import, called from C_M_LoggedImportCaps::finish().
+ *
+ * Finishes force-opening the client sessions, finishes the cap import for
+ * the inode, re-evaluates its cap locks and, if any caps were imported,
+ * sends an MExportCapsAck back to the exporter.  The auth pin taken with
+ * this Migrator as the pinner is dropped at the end.
+ *
+ * @param in                   inode the caps were imported for (must be auth)
+ * @param from                 rank the caps came from
+ * @param imported_session_map sessions prepared for force-open
+ * @param peer_exports         per-inode exported cap state; must contain an
+ *                             entry for @p in (asserted)
+ */
+void Migrator::logged_import_caps(CInode *in, 
+				  mds_rank_t from,
+				  map<client_t,pair<Session*,uint64_t> >& imported_session_map,
+				  map<CInode*, map<client_t,Capability::Export> >& peer_exports)
+{
+  dout(10) << *in << dendl;
+  // see export_go() vs export_go_synced()
+  ceph_assert(in->is_auth());
+
+  // force open client sessions and finish cap import
+  mds->server->finish_force_open_sessions(imported_session_map);
+
+  auto it = peer_exports.find(in);
+  ceph_assert(it != peer_exports.end());
+
+  // clients will release caps from the exporter when they receive the cap import message.
+  map<client_t,Capability::Import> imported_caps;
+  finish_import_inode_caps(in, from, false, imported_session_map, it->second, imported_caps);
+  mds->locker->eval(in, CEPH_CAP_LOCKS, true);
+
+  // no ack is sent when nothing was imported
+  if (!imported_caps.empty()) {
+    auto ack = make_message<MExportCapsAck>(in->ino());
+    // for each imported client, the cap id recorded in the exported state
+    map<client_t,uint64_t> peer_caps_ids;
+    for (auto &p : imported_caps )
+      peer_caps_ids[p.first] = it->second.at(p.first).cap_id;
+
+    // encoded in the order: imported_caps, then peer_caps_ids
+    encode(imported_caps, ack->cap_bl);
+    encode(peer_caps_ids, ack->cap_bl);
+    mds->send_message_mds(ack, from);
+  }
+
+  in->auth_unpin(this);
+}
+
+/**
+ * Construct a Migrator bound to the given rank and cache, reading the
+ * initial values of mds_max_export_size and
+ * mds_inject_migrator_session_race from the configuration.
+ */
+Migrator::Migrator(MDSRank *m, MDCache *c)
+  : mds(m),
+    mdcache(c),
+    max_export_size(g_conf().get_val<Option::size_t>("mds_max_export_size")),
+    inject_session_race(g_conf().get_val<bool>("mds_inject_migrator_session_race"))
+{
+}
+
+/**
+ * Re-read the configuration options this class caches.
+ *
+ * @param changed names of the options that changed
+ * @param mds_map unused here
+ */
+void Migrator::handle_conf_change(const std::set<std::string>& changed, const MDSMap& mds_map)
+{
+  const bool size_changed =
+    changed.find("mds_max_export_size") != changed.end();
+  if (size_changed) {
+    max_export_size = g_conf().get_val<Option::size_t>("mds_max_export_size");
+  }
+
+  const bool race_changed =
+    changed.find("mds_inject_migrator_session_race") != changed.end();
+  if (race_changed) {
+    inject_session_race = g_conf().get_val<bool>("mds_inject_migrator_session_race");
+    dout(0) << "mds_inject_migrator_session_race is " << inject_session_race << dendl;
+  }
+}
diff --git a/src/mds/Migrator.h b/src/mds/Migrator.h
new file mode 100644
index 000000000..f7c094294
--- /dev/null
+++ b/src/mds/Migrator.h
@@ -0,0 +1,378 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software 
+ * Foundation.  See file COPYING.
+ * 
+ * Handles the import and export of  mds authorities and actual cache data.
+ * See src/doc/exports.txt for a description.
+ */
+
+#ifndef CEPH_MDS_MIGRATOR_H
+#define CEPH_MDS_MIGRATOR_H
+
+#include "include/types.h"
+
+#include "MDSContext.h"
+
+#include <map>
+#include <list>
+#include <set>
+#include <string_view>
+
+#include "messages/MExportCaps.h"
+#include "messages/MExportCapsAck.h"
+#include "messages/MExportDir.h"
+#include "messages/MExportDirAck.h"
+#include "messages/MExportDirCancel.h"
+#include "messages/MExportDirDiscover.h"
+#include "messages/MExportDirDiscoverAck.h"
+#include "messages/MExportDirFinish.h"
+#include "messages/MExportDirNotify.h"
+#include "messages/MExportDirNotifyAck.h"
+#include "messages/MExportDirPrep.h"
+#include "messages/MExportDirPrepAck.h"
+#include "messages/MGatherCaps.h"
+
+class MDSRank;
+class CDir;
+class CInode;
+class CDentry;
+class Session;
+class EImportStart;
+
+/**
+ * Migrator
+ *
+ * Drives the import and export of directory subtrees (and of client caps)
+ * between MDS ranks.  Per-export state is tracked in export_state, keyed
+ * by CDir*, and per-import state in import_state, keyed by dirfrag_t.
+ * The EXPORT_* and IMPORT_* constants below name the stages an export or
+ * import moves through.
+ */
+class Migrator {
+public:
+  // export stages.  used to clean up intelligently if there's a failure.
+  const static int EXPORT_CANCELLED	= 0;  // cancelled
+  const static int EXPORT_CANCELLING	= 1;  // waiting for cancel notifyacks
+  const static int EXPORT_LOCKING	= 2;  // acquiring locks
+  const static int EXPORT_DISCOVERING	= 3;  // dest is discovering export dir
+  const static int EXPORT_FREEZING	= 4;  // we're freezing the dir tree
+  const static int EXPORT_PREPPING	= 5;  // sending dest spanning tree to export bounds
+  const static int EXPORT_WARNING	= 6;  // warning bystanders of dir_auth_pending
+  const static int EXPORT_EXPORTING	= 7;  // sent actual export, waiting for ack
+  const static int EXPORT_LOGGINGFINISH	= 8;  // logging EExportFinish
+  const static int EXPORT_NOTIFYING	= 9;  // waiting for notifyacks
+
+  // -- imports --
+  const static int IMPORT_DISCOVERING   = 1; // waiting for prep
+  const static int IMPORT_DISCOVERED    = 2; // waiting for prep
+  const static int IMPORT_PREPPING      = 3; // opening dirs on bounds
+  const static int IMPORT_PREPPED       = 4; // opened bounds, waiting for import
+  const static int IMPORT_LOGGINGSTART  = 5; // got import, logging EImportStart
+  const static int IMPORT_ACKING        = 6; // logged EImportStart, sent ack, waiting for finish
+  const static int IMPORT_FINISHING     = 7; // sent cap imports, waiting for finish
+  const static int IMPORT_ABORTING      = 8; // notifying bystanders of an abort before unfreezing
+
+  // -- cons --
+  Migrator(MDSRank *m, MDCache *c);
+
+  // Human-readable name of an export stage.  Aborts on any value without
+  // a case below (note that EXPORT_CANCELLED has none).
+  static std::string_view get_export_statename(int s) {
+    switch (s) {
+    case EXPORT_CANCELLING: return "cancelling";
+    case EXPORT_LOCKING: return "locking";
+    case EXPORT_DISCOVERING: return "discovering";
+    case EXPORT_FREEZING: return "freezing";
+    case EXPORT_PREPPING: return "prepping";
+    case EXPORT_WARNING: return "warning";
+    case EXPORT_EXPORTING: return "exporting";
+    case EXPORT_LOGGINGFINISH: return "loggingfinish";
+    case EXPORT_NOTIFYING: return "notifying";
+    default: ceph_abort(); return std::string_view();
+    }
+  }
+
+  // Human-readable name of an import stage.  Aborts on any other value.
+  static std::string_view get_import_statename(int s) {
+    switch (s) {
+    case IMPORT_DISCOVERING: return "discovering";
+    case IMPORT_DISCOVERED: return "discovered";
+    case IMPORT_PREPPING: return "prepping";
+    case IMPORT_PREPPED: return "prepped";
+    case IMPORT_LOGGINGSTART: return "loggingstart";
+    case IMPORT_ACKING: return "acking";
+    case IMPORT_FINISHING: return "finishing";
+    case IMPORT_ABORTING: return "aborting";
+    default: ceph_abort(); return std::string_view();
+    }
+  }
+
+  void handle_conf_change(const std::set<std::string>& changed, const MDSMap& mds_map);
+
+  void dispatch(const cref_t<Message> &);
+
+  void show_importing();
+  void show_exporting();
+
+  int get_num_exporting() const { return export_state.size(); }
+  int get_export_queue_size() const { return export_queue.size(); }
+
+  // -- status --
+  // export stage of @dir, or 0 if @dir is not being exported
+  int is_exporting(CDir *dir) const {
+    auto it = export_state.find(dir);
+    if (it != export_state.end()) return it->second.state;
+    return 0;
+  }
+  bool is_exporting() const { return !export_state.empty(); }
+  // import stage of @df, or 0 if @df is not being imported
+  int is_importing(dirfrag_t df) const {
+    auto it = import_state.find(df);
+    if (it != import_state.end()) return it->second.state;
+    return 0;
+  }
+  bool is_importing() const { return !import_state.empty(); }
+
+  // true while the import of @df is in a stage from IMPORT_LOGGINGSTART up
+  // to (but not including) IMPORT_ABORTING
+  bool is_ambiguous_import(dirfrag_t df) const {
+    auto it = import_state.find(df);
+    if (it == import_state.end())
+      return false;
+    if (it->second.state >= IMPORT_LOGGINGSTART &&
+	it->second.state < IMPORT_ABORTING)
+      return true;
+    return false;
+  }
+
+  // the get_* accessors below assert that the dir/dirfrag is being tracked
+  int get_import_state(dirfrag_t df) const {
+    auto it = import_state.find(df);
+    ceph_assert(it != import_state.end());
+    return it->second.state;
+  }
+  int get_import_peer(dirfrag_t df) const {
+    auto it = import_state.find(df);
+    ceph_assert(it != import_state.end());
+    return it->second.peer;
+  }
+
+  int get_export_state(CDir *dir) const {
+    auto it = export_state.find(dir);
+    ceph_assert(it != export_state.end());
+    return it->second.state;
+  }
+  // this returns true if we are exporting @dir,
+  // and are not waiting for @who to be
+  // warned of ambiguous auth.
+  // only returns meaningful results during EXPORT_WARNING state.
+  bool export_has_warned(CDir *dir, mds_rank_t who) const {
+    auto it = export_state.find(dir);
+    ceph_assert(it != export_state.end());
+    ceph_assert(it->second.state == EXPORT_WARNING);
+    return (it->second.warning_ack_waiting.count(who) == 0);
+  }
+
+  // true if we are not waiting for a notify ack from @who for the export
+  // of @dir.  asserts that the export is in EXPORT_NOTIFYING state.
+  bool export_has_notified(CDir *dir, mds_rank_t who) const {
+    auto it = export_state.find(dir);
+    ceph_assert(it != export_state.end());
+    ceph_assert(it->second.state == EXPORT_NOTIFYING);
+    return (it->second.notify_ack_waiting.count(who) == 0);
+  }
+
+  // bump the remote authpin waiter count of the export of @dir
+  void export_freeze_inc_num_waiters(CDir *dir) {
+    auto it = export_state.find(dir);
+    ceph_assert(it != export_state.end());
+    it->second.num_remote_waiters++;
+  }
+  void find_stale_export_freeze();
+
+  // -- misc --
+  void handle_mds_failure_or_stop(mds_rank_t who);
+
+  void audit();
+
+  // -- import/export --
+  // exporter
+  void dispatch_export_dir(MDRequestRef& mdr, int count);
+  void export_dir(CDir *dir, mds_rank_t dest);
+  void export_empty_import(CDir *dir);
+
+  void export_dir_nicely(CDir *dir, mds_rank_t dest);
+  void maybe_do_queued_export();
+  // drop all queued exports and advance the queue generation
+  void clear_export_queue() {
+    export_queue.clear();
+    export_queue_gen++;
+  }
+
+  void maybe_split_export(CDir* dir, uint64_t max_size, bool null_okay,
+			  vector<pair<CDir*, size_t> >& results);
+
+  bool export_try_grab_locks(CDir *dir, MutationRef& mut);
+  void get_export_client_set(CDir *dir, std::set<client_t> &client_set);
+  void get_export_client_set(CInode *in, std::set<client_t> &client_set);
+
+  void encode_export_inode(CInode *in, bufferlist& bl,
+			   std::map<client_t,entity_inst_t>& exported_client_map,
+			   std::map<client_t,client_metadata_t>& exported_client_metadata_map);
+  void encode_export_inode_caps(CInode *in, bool auth_cap, bufferlist& bl,
+				std::map<client_t,entity_inst_t>& exported_client_map,
+				std::map<client_t,client_metadata_t>& exported_client_metadata_map);
+  void finish_export_inode(CInode *in, mds_rank_t target,
+			   std::map<client_t,Capability::Import>& peer_imported,
+			   MDSContext::vec& finished);
+  void finish_export_inode_caps(CInode *in, mds_rank_t target,
+			        std::map<client_t,Capability::Import>& peer_imported);
+
+
+  void encode_export_dir(bufferlist& exportbl,
+			CDir *dir,
+			std::map<client_t,entity_inst_t>& exported_client_map,
+			std::map<client_t,client_metadata_t>& exported_client_metadata_map,
+                        uint64_t &num_exported);
+  void finish_export_dir(CDir *dir, mds_rank_t target,
+			 std::map<inodeno_t,std::map<client_t,Capability::Import> >& peer_imported,
+			 MDSContext::vec& finished, int *num_dentries);
+
+  void clear_export_proxy_pins(CDir *dir);
+
+  void export_caps(CInode *in);
+
+  void decode_import_inode(CDentry *dn, bufferlist::const_iterator& blp,
+			   mds_rank_t oldauth, LogSegment *ls,
+			   std::map<CInode*, std::map<client_t,Capability::Export> >& cap_imports,
+			   std::list<ScatterLock*>& updated_scatterlocks);
+  void decode_import_inode_caps(CInode *in, bool auth_cap, bufferlist::const_iterator &blp,
+				std::map<CInode*, std::map<client_t,Capability::Export> >& cap_imports);
+  void finish_import_inode_caps(CInode *in, mds_rank_t from, bool auth_cap,
+				const std::map<client_t,pair<Session*,uint64_t> >& smap,
+				const std::map<client_t,Capability::Export> &export_map,
+				std::map<client_t,Capability::Import> &import_map);
+  void decode_import_dir(bufferlist::const_iterator& blp,
+			mds_rank_t oldauth,
+			CDir *import_root,
+			EImportStart *le,
+			LogSegment *ls,
+			std::map<CInode*, std::map<client_t,Capability::Export> >& cap_imports,
+			std::list<ScatterLock*>& updated_scatterlocks, int &num_imported);
+
+  void import_reverse(CDir *dir);
+
+  void import_finish(CDir *dir, bool notify, bool last=true);
+
+protected:
+  // bookkeeping shared (via shared_ptr) by the exports that reference it
+  // as their parent
+  struct export_base_t {
+    export_base_t(dirfrag_t df, mds_rank_t d, unsigned c, uint64_t g) :
+      dirfrag(df), dest(d), pending_children(c), export_queue_gen(g) {}
+    dirfrag_t dirfrag;
+    mds_rank_t dest;
+    unsigned pending_children;
+    uint64_t export_queue_gen;
+    bool restart = false;
+  };
+
+  // export fun
+  struct export_state_t {
+    export_state_t() {}
+
+    int state = 0;                          // one of the EXPORT_* stages
+    mds_rank_t peer = MDS_RANK_NONE;
+    uint64_t tid = 0;
+    std::set<mds_rank_t> warning_ack_waiting;
+    std::set<mds_rank_t> notify_ack_waiting;
+    std::map<inodeno_t,std::map<client_t,Capability::Import> > peer_imported;
+    MutationRef mut;
+    size_t approx_size = 0;
+    // for freeze tree deadlock detection
+    utime_t last_cum_auth_pins_change;
+    int last_cum_auth_pins = 0;
+    int num_remote_waiters = 0; // number of remote authpin waiters
+    std::shared_ptr<export_base_t> parent;
+  };
+
+  // import fun
+  struct import_state_t {
+    import_state_t() : mut() {}
+    int state = 0;                          // one of the IMPORT_* stages
+    mds_rank_t peer = 0;
+    uint64_t tid = 0;
+    std::set<mds_rank_t> bystanders;
+    std::list<dirfrag_t> bound_ls;
+    std::list<ScatterLock*> updated_scatterlocks;
+    std::map<client_t,pair<Session*,uint64_t> > session_map;
+    std::map<CInode*, std::map<client_t,Capability::Export> > peer_exports;
+    MutationRef mut;
+  };
+
+  typedef map<CDir*, export_state_t>::iterator export_state_iterator;
+
+  friend class C_MDC_ExportFreeze;
+  friend class C_MDS_ExportFinishLogged;
+  friend class C_M_ExportGo;
+  friend class C_M_ExportSessionsFlushed;
+  friend class C_MDS_ExportDiscover;
+  friend class C_MDS_ExportPrep;
+  friend class MigratorContext;
+  friend class MigratorLogContext;
+  friend class C_MDS_ImportDirLoggedStart;
+  friend class C_MDS_ImportDirLoggedFinish;
+  friend class C_M_LoggedImportCaps;
+
+  void handle_export_discover_ack(const cref_t<MExportDirDiscoverAck> &m);
+  void export_frozen(CDir *dir, uint64_t tid);
+  void handle_export_prep_ack(const cref_t<MExportDirPrepAck> &m);
+  void export_sessions_flushed(CDir *dir, uint64_t tid);
+  void export_go(CDir *dir);
+  void export_go_synced(CDir *dir, uint64_t tid);
+  void export_try_cancel(CDir *dir, bool notify_peer=true);
+  void export_cancel_finish(export_state_iterator& it);
+  void export_reverse(CDir *dir, export_state_t& stat);
+  void export_notify_abort(CDir *dir, export_state_t& stat, std::set<CDir*>& bounds);
+  void handle_export_ack(const cref_t<MExportDirAck> &m);
+  void export_logged_finish(CDir *dir);
+  void handle_export_notify_ack(const cref_t<MExportDirNotifyAck> &m);
+  void export_finish(CDir *dir);
+  void child_export_finish(std::shared_ptr<export_base_t>& parent, bool success);
+  void encode_export_prep_trace(bufferlist& bl, CDir *bound, CDir *dir, export_state_t &es,
+                               set<inodeno_t> &inodes_added, set<dirfrag_t> &dirfrags_added);
+  void decode_export_prep_trace(bufferlist::const_iterator& blp, mds_rank_t oldauth, MDSContext::vec &finished);
+
+  void handle_gather_caps(const cref_t<MGatherCaps> &m);
+
+  // importer
+  void handle_export_discover(const cref_t<MExportDirDiscover> &m, bool started=false);
+  void handle_export_cancel(const cref_t<MExportDirCancel> &m);
+  void handle_export_prep(const cref_t<MExportDirPrep> &m, bool did_assim=false);
+  void handle_export_dir(const cref_t<MExportDir> &m);
+
+  void import_reverse_discovering(dirfrag_t df);
+  void import_reverse_discovered(dirfrag_t df, CInode *diri);
+  void import_reverse_prepping(CDir *dir, import_state_t& stat);
+  void import_remove_pins(CDir *dir, std::set<CDir*>& bounds);
+  void import_reverse_unfreeze(CDir *dir);
+  void import_reverse_final(CDir *dir);
+  void import_notify_abort(CDir *dir, std::set<CDir*>& bounds);
+  void import_notify_finish(CDir *dir, std::set<CDir*>& bounds);
+  void import_logged_start(dirfrag_t df, CDir *dir, mds_rank_t from,
+			   std::map<client_t,pair<Session*,uint64_t> >& imported_session_map);
+  void handle_export_finish(const cref_t<MExportDirFinish> &m);
+
+  void handle_export_caps(const cref_t<MExportCaps> &m);
+  void handle_export_caps_ack(const cref_t<MExportCapsAck> &m);
+  void logged_import_caps(CInode *in,
+			  mds_rank_t from,
+			  std::map<client_t,pair<Session*,uint64_t> >& imported_session_map,
+			  std::map<CInode*, std::map<client_t,Capability::Export> >& cap_imports);
+
+  // bystander
+  void handle_export_notify(const cref_t<MExportDirNotify> &m);
+
+  std::map<CDir*, export_state_t>  export_state;
+
+  uint64_t total_exporting_size = 0;
+  unsigned num_locking_exports = 0; // exports in locking state (approx_size == 0)
+
+  std::list<pair<dirfrag_t,mds_rank_t> >  export_queue;
+  uint64_t export_queue_gen = 1;
+
+  std::map<dirfrag_t, import_state_t>  import_state;
+
+private:
+  MDSRank *mds;
+  MDCache *mdcache;
+  // cached copies of the mds_max_export_size and
+  // mds_inject_migrator_session_race options
+  uint64_t max_export_size = 0;
+  bool inject_session_race = false;
+};
+
+#endif
diff --git a/src/mds/Mutation.cc b/src/mds/Mutation.cc
new file mode 100644
index 000000000..7e3fb22bd
--- /dev/null
+++ b/src/mds/Mutation.cc
@@ -0,0 +1,611 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software 
+ * Foundation.  See file COPYING.
+ * 
+ */
+
+#include "Mutation.h"
+#include "ScatterLock.h"
+#include "CInode.h"
+#include "CDir.h"
+
+// MutationImpl
+
+// Take a PIN_REQUEST reference on the object on behalf of this mutation.
+// Does nothing if this mutation has already pinned it.
+void MutationImpl::pin(MDSCacheObject *o)
+{
+  auto& state = object_states[o];
+  if (state.pinned)
+    return;
+
+  o->get(MDSCacheObject::PIN_REQUEST);
+  state.pinned = true;
+  ++num_pins;
+}
+
+// Drop the PIN_REQUEST reference this mutation holds on the object.
+// Asserts that the object is currently pinned by this mutation.
+void MutationImpl::unpin(MDSCacheObject *o)
+{
+  auto& state = object_states[o];
+  ceph_assert(state.pinned);
+
+  state.pinned = false;
+  --num_pins;
+  o->put(MDSCacheObject::PIN_REQUEST);
+}
+
+// Make 'in' the inode whose stickydirs this mutation holds.  The new inode
+// is acquired before the previous one (if any) is released.
+void MutationImpl::set_stickydirs(CInode *in)
+{
+  if (stickydiri && stickydiri == in)
+    return;   // already holding this one
+
+  in->get_stickydirs();
+  if (stickydiri) {
+    stickydiri->put_stickydirs();
+  }
+  stickydiri = in;
+}
+
+// Release the stickydirs held via set_stickydirs(), if any.
+void MutationImpl::put_stickydirs()
+{
+  if (!stickydiri)
+    return;
+
+  stickydiri->put_stickydirs();
+  stickydiri = nullptr;
+}
+
+// Release every PIN_REQUEST reference this mutation still holds.
+void MutationImpl::drop_pins()
+{
+  for (auto& [obj, state] : object_states) {
+    if (!state.pinned)
+      continue;
+
+    obj->put(MDSCacheObject::PIN_REQUEST);
+    state.pinned = false;
+    --num_pins;
+  }
+}
+
+// Record that this mutation is in the middle of acquiring 'lock' (with
+// 'target' as the target mds) and pin the lock's parent object.  Only one
+// lock may be in this state at a time (asserted).
+void MutationImpl::start_locking(SimpleLock *lock, int target)
+{
+  ceph_assert(locking == NULL);
+
+  MDSCacheObject *parent = lock->get_parent();
+  pin(parent);
+
+  locking_target_mds = target;
+  locking = lock;
+}
+
+// Clear the in-progress lock recorded by start_locking().  Asserts that
+// 'lock' is the lock currently recorded.
+void MutationImpl::finish_locking(SimpleLock *lock)
+{
+  ceph_assert(locking == lock);
+  locking = NULL;
+  locking_target_mds = -1;
+}
+
+// True if this mutation holds a rdlock on 'lock', either directly or
+// through its lock cache.
+bool MutationImpl::is_rdlocked(SimpleLock *lock) const {
+  if (auto it = locks.find(lock); it != locks.end() && it->is_rdlock()) {
+    return true;
+  }
+  if (!lock_cache) {
+    return false;
+  }
+  return static_cast<const MutationImpl*>(lock_cache)->is_rdlocked(lock);
+}
+
+// True if this mutation holds a wrlock on 'lock', either directly or
+// through its lock cache.
+bool MutationImpl::is_wrlocked(SimpleLock *lock) const {
+  if (auto it = locks.find(lock); it != locks.end() && it->is_wrlock()) {
+    return true;
+  }
+  if (!lock_cache) {
+    return false;
+  }
+  return static_cast<const MutationImpl*>(lock_cache)->is_wrlocked(lock);
+}
+
+// Remove the last rdlock op on 'lock' from the vector, if there is one.
+void MutationImpl::LockOpVec::erase_rdlock(SimpleLock* lock)
+{
+  // scan from the back so that the last matching op is the one removed
+  for (auto rit = rbegin(); rit != rend(); ++rit) {
+    if (rit->lock != lock || !rit->is_rdlock())
+      continue;
+    // (rit + 1).base() is the forward iterator to the element *rit names
+    erase((rit + 1).base());
+    return;
+  }
+}
+// Normalize the op vector in two passes:
+//  1. within each run of consecutive ops whose locks share a parent
+//     object, sort the ops by lock type;
+//  2. collapse each run of consecutive ops on the same lock into one op
+//     whose flags are the union of the run's flags (an xlock overrides
+//     everything else; a remote wrlock's target is carried over).
+//
+// The merge pass walks the vector by signed index rather than by iterator
+// so that it never forms an iterator before begin(), which would be
+// undefined behaviour (in particular for an empty vector).
+void MutationImpl::LockOpVec::sort_and_merge()
+{
+  // sort locks on the same object
+  auto cmp = [](const LockOp &l, const LockOp &r) {
+    ceph_assert(l.lock->get_parent() == r.lock->get_parent());
+    return l.lock->type->type < r.lock->type->type;
+  };
+  for (auto i = begin(), j = i; ; ++i) {
+    if (i == end()) {
+      std::sort(j, i, cmp);
+      break;
+    }
+    if (j->lock->get_parent() != i->lock->get_parent()) {
+      std::sort(j, i, cmp);
+      j = i;
+    }
+  }
+  // merge ops on the same lock, scanning from the back
+  for (int i = size() - 1; i > 0; ) {
+    // [first, i] is the run of ops sharing (*this)[i].lock
+    int first = i;
+    while (first > 0 && (*this)[first - 1].lock == (*this)[i].lock)
+      --first;
+    if (first == i) {
+      // run of one: nothing to merge
+      --i;
+      continue;
+    }
+    // merge everything in (first, i] into the op at 'first'
+    auto& merged = (*this)[first];
+    for (int k = i; k > first; --k) {
+      auto& op = (*this)[k];
+      if (op.is_remote_wrlock()) {
+	ceph_assert(!merged.is_remote_wrlock());
+	merged.wrlock_target = op.wrlock_target;
+      }
+      merged.flags |= op.flags;
+    }
+    if (merged.is_xlock()) {
+      // xlock overwrites other types
+      ceph_assert(!merged.is_remote_wrlock());
+      merged.flags = LockOp::XLOCK;
+    }
+    // erasing after 'first' leaves elements at and before it untouched
+    erase(begin() + first + 1, begin() + i + 1);
+    i = first - 1;
+  }
+}
+
+// auth pins
+// True if this mutation holds a local auth pin on the object or has a
+// remote auth pin recorded for it.
+bool MutationImpl::is_auth_pinned(MDSCacheObject *object) const
+{
+  auto state = find_object_state(object);
+  if (state) {
+    return state->auth_pinned || state->remote_auth_pinned != MDS_RANK_NONE;
+  }
+  return false;
+}
+
+// Auth-pin the object on behalf of this mutation.  Does nothing if this
+// mutation already holds a local auth pin on it.
+void MutationImpl::auth_pin(MDSCacheObject *object)
+{
+  auto &state = object_states[object];
+  if (state.auth_pinned)
+    return;
+
+  object->auth_pin(this);
+  state.auth_pinned = true;
+  ++num_auth_pins;
+}
+
+// Drop this mutation's local auth pin on the object.  Asserts that such a
+// pin is currently held.
+void MutationImpl::auth_unpin(MDSCacheObject *object)
+{
+  auto &state = object_states[object];
+  ceph_assert(state.auth_pinned);
+
+  object->auth_unpin(this);
+  --num_auth_pins;
+  state.auth_pinned = false;
+}
+
+// Release every local auth pin held by this mutation.  Each pinned object
+// is asserted to be auth.
+void MutationImpl::drop_local_auth_pins()
+{
+  for (auto& [obj, state] : object_states) {
+    if (!state.auth_pinned)
+      continue;
+
+    ceph_assert(obj->is_auth());
+    obj->auth_unpin(this);
+    state.auth_pinned = false;
+    --num_auth_pins;
+  }
+}
+
+// Record that 'from' holds a remote auth pin on the object for this
+// mutation.  If one is already recorded it must be from the same rank
+// (asserted).
+void MutationImpl::set_remote_auth_pinned(MDSCacheObject *object, mds_rank_t from)
+{
+  auto &state = object_states[object];
+  if (state.remote_auth_pinned != MDS_RANK_NONE) {
+    ceph_assert(state.remote_auth_pinned == from);
+    return;
+  }
+
+  state.remote_auth_pinned = from;
+  ++num_remote_auth_pins;
+}
+
+// Forget the remote auth pin recorded in 'stat'.  Asserts that one is
+// recorded.
+void MutationImpl::_clear_remote_auth_pinned(ObjectState &stat)
+{
+  ceph_assert(stat.remote_auth_pinned != MDS_RANK_NONE);
+
+  --num_remote_auth_pins;
+  stat.remote_auth_pinned = MDS_RANK_NONE;
+}
+
+// Remember a scatterlock updated by this mutation; apply() marks every
+// lock recorded here dirty.
+void MutationImpl::add_updated_lock(ScatterLock *lock)
+{
+  updated_locks.push_back(lock);
+}
+
+// Pin the inode and queue it in dirty_cow_inodes; apply() marks every
+// inode queued here dirty.
+void MutationImpl::add_cow_inode(CInode *in)
+{
+  pin(in);
+  dirty_cow_inodes.push_back(in);
+}
+
+// Pin the dentry and queue it, together with its current projected
+// version, in dirty_cow_dentries; apply() marks it dirty at that version.
+void MutationImpl::add_cow_dentry(CDentry *dn)
+{
+  pin(dn);
+
+  const auto projected_version = dn->get_projected_version();
+  dirty_cow_dentries.emplace_back(dn, projected_version);
+}
+
+// Apply the changes recorded in this mutation, in this order:
+//  - pop and dirty the projected inode of every projected CInode,
+//  - mark the queued cow inodes and cow dentries dirty,
+//  - pop and dirty the projected fnode of every projected CDir,
+//  - mark the updated scatterlocks dirty.
+// The list of projected nodes is cleared afterwards.
+void MutationImpl::apply()
+{
+  // inodes first
+  for (auto& node : projected_nodes) {
+    auto *inode = dynamic_cast<CInode*>(node);
+    if (inode) {
+      inode->pop_and_dirty_projected_inode(ls, nullptr);
+    }
+  }
+
+  for (const auto& cow_in : dirty_cow_inodes) {
+    cow_in->_mark_dirty(ls);
+  }
+
+  for (const auto& [cow_dn, version] : dirty_cow_dentries) {
+    cow_dn->mark_dirty(version, ls);
+  }
+
+  // then dirfrags
+  for (auto& node : projected_nodes) {
+    auto *dirfrag = dynamic_cast<CDir*>(node);
+    if (dirfrag) {
+      dirfrag->pop_and_dirty_projected_fnode(ls, nullptr);
+    }
+  }
+
+  for (const auto& slock : updated_locks) {
+    slock->mark_dirty();
+  }
+
+  projected_nodes.clear();
+}
+
+// Release what this mutation still holds on cache objects: first the
+// local auth pins, then the request pins.
+void MutationImpl::cleanup()
+{
+  drop_local_auth_pins();
+  drop_pins();
+}
+
+// Write this op's descriptor to 'stream'; for a plain mutation that is the
+// fixed string "Mutation".
+void MutationImpl::_dump_op_descriptor_unlocked(ostream& stream) const
+{
+  stream << "Mutation";
+}
+
+// MDRequestImpl
+
+// Frees the lazily allocated More state (see more()); deleting a null
+// pointer is a no-op, so this is safe when more() was never called.
+MDRequestImpl::~MDRequestImpl()
+{
+  delete _more;
+}
+
+// Return the request's More state, allocating it on first use.
+MDRequestImpl::More* MDRequestImpl::more()
+{
+  if (_more == nullptr) {
+    _more = new More();
+  }
+  return _more;
+}
+
+// True once the More state has been allocated by more().  Unlike more(),
+// this never allocates.
+bool MDRequestImpl::has_more() const
+{
+  return _more != nullptr;
+}
+
+// True if the More state exists and its 'witnessed' set is non-empty.
+bool MDRequestImpl::has_witnesses()
+{
+  if (_more == nullptr)
+    return false;
+  return !_more->witnessed.empty();
+}
+
+// True if the More state exists and its peer_commit member is set.
+bool MDRequestImpl::peer_did_prepare()
+{
+  if (!has_more())
+    return false;
+  return !!more()->peer_commit;
+}
+
+// True if the More state exists and its peer_rolling_back member is set.
+bool MDRequestImpl::peer_rolling_back()
+{
+  if (!has_more())
+    return false;
+  return !!more()->peer_rolling_back;
+}
+
+// Auth-pin 'inode', record it as this request's rename_inode and try to
+// put it into the frozen-auth-pin state.
+//
+// Returns false if freeze_inode(1) did not succeed; in that case the
+// inode has been auth-pinned and is_freeze_authpin is set, but
+// freeze_auth_pin() has not been called on the inode.  Returns true once
+// the inode has been switched from frozen to frozen-auth-pin.
+bool MDRequestImpl::freeze_auth_pin(CInode *inode)
+{
+  // a request tracks at most one rename_inode
+  ceph_assert(!more()->rename_inode || more()->rename_inode == inode);
+  more()->rename_inode = inode;
+  more()->is_freeze_authpin = true;
+  auth_pin(inode);
+  if (!inode->freeze_inode(1)) {
+    return false;
+  }
+  inode->freeze_auth_pin();
+  inode->unfreeze_inode();
+  return true;
+}
+
+// Undo freeze_auth_pin() on the recorded rename_inode.  Asserts that
+// is_freeze_authpin is set.
+//
+// clear_inode: also forget rename_inode afterwards.
+void MDRequestImpl::unfreeze_auth_pin(bool clear_inode)
+{
+  ceph_assert(more()->is_freeze_authpin);
+  CInode *inode = more()->rename_inode;
+  // if the inode never reached the frozen-auth-pin state, undo the
+  // freeze_inode() attempt instead
+  if (inode->is_frozen_auth_pin())
+    inode->unfreeze_auth_pin();
+  else
+    inode->unfreeze_inode();
+  more()->is_freeze_authpin = false;
+  if (clear_inode)
+    more()->rename_inode = NULL;
+}
+
+// Record 'inode' as this request's rename_inode and flag it as a remote
+// frozen auth pin.
+void MDRequestImpl::set_remote_frozen_auth_pin(CInode *inode)
+{
+  auto *extra = more();
+  extra->rename_inode = inode;
+  extra->is_remote_frozen_authpin = true;
+}
+
+// Mark 'inode' as having ambiguous auth and record it as this request's
+// rename_inode.  Asserts that no different rename_inode is recorded and
+// that ambiguous auth is not already set by this request.
+void MDRequestImpl::set_ambiguous_auth(CInode *inode)
+{
+  ceph_assert(!more()->rename_inode || more()->rename_inode == inode);
+  ceph_assert(!more()->is_ambiguous_auth);
+
+  inode->set_ambiguous_auth();
+  more()->rename_inode = inode;
+  more()->is_ambiguous_auth = true;
+}
+
+// Clear the ambiguous auth set by set_ambiguous_auth() on the recorded
+// rename_inode.  Asserts that an inode is recorded and that the flag is
+// set.  rename_inode itself is left in place.
+void MDRequestImpl::clear_ambiguous_auth()
+{
+  CInode *inode = more()->rename_inode;
+  ceph_assert(inode && more()->is_ambiguous_auth);
+  inode->clear_ambiguous_auth();
+  more()->is_ambiguous_auth = false;
+}
+
+// True if the object can be auth-pinned, or if it is the inode this
+// request itself froze via freeze_auth_pin() and already holds an auth
+// pin on.
+bool MDRequestImpl::can_auth_pin(MDSCacheObject *object)
+{
+  if (object->can_auth_pin())
+    return true;
+
+  return is_auth_pinned(object) &&
+         has_more() &&
+         more()->is_freeze_authpin &&
+         more()->rename_inode == object;
+}
+
+// Like MutationImpl::drop_local_auth_pins(), but first undoes any
+// freeze_auth_pin() this request performed (also clearing rename_inode).
+void MDRequestImpl::drop_local_auth_pins()
+{
+  if (has_more() && more()->is_freeze_authpin)
+    unfreeze_auth_pin(true);
+  MutationImpl::drop_local_auth_pins();
+}
+
+// First path of the request: taken from the client request when there is
+// one, otherwise from the More state (allocating it if necessary).
+const filepath& MDRequestImpl::get_filepath()
+{
+  if (!client_request) {
+    return more()->filepath1;
+  }
+  return client_request->get_filepath();
+}
+
+// Second path of the request: taken from the client request when there is
+// one, otherwise from the More state (allocating it if necessary).
+const filepath& MDRequestImpl::get_filepath2()
+{
+  if (!client_request) {
+    return more()->filepath2;
+  }
+  return client_request->get_filepath2();
+}
+
+// Set the first path of a request that has no client request (asserted);
+// get_filepath() returns it in that case.
+void MDRequestImpl::set_filepath(const filepath& fp)
+{
+  ceph_assert(!client_request);
+  more()->filepath1 = fp;
+}
+
+void MDRequestImpl::set_filepath2(const filepath& fp) // store path2 for a request that has no client message
+{
+  ceph_assert(!client_request); // get_filepath2() would return the client request's path instead
+  more()->filepath2 = fp;
+}
+
+bool MDRequestImpl::is_queued_for_replay() const
+{
+  return client_request && client_request->is_queued_for_replay(); // false when there is no client request
+}
+
+bool MDRequestImpl::can_batch()
+{
+  // a request already holding auth pins, a lock cache or locks is not batched
+  const bool holds_state =
+    num_auth_pins || num_remote_auth_pins || lock_cache || !locks.empty();
+  if (holds_state)
+    return false;
+  const auto& path = client_request->get_filepath();
+  switch (client_request->get_op()) {
+  case CEPH_MDS_OP_GETATTR:
+    return path.depth() == 0;
+  case CEPH_MDS_OP_LOOKUP:
+    return path.depth() == 1 && !path.is_last_snap();
+  default:
+    return false;
+  }
+}
+
+std::unique_ptr<BatchOp> MDRequestImpl::release_batch_op()
+{
+  const int mask = client_request->head.args.getattr.mask; // batch ops are keyed by getattr mask
+  auto entry = batch_op_map->find(mask);
+  auto bop = std::move(entry->second);
+  batch_op_map->erase(entry);
+  return bop;
+}
+
+int MDRequestImpl::compare_paths()
+{
+  // Three-way comparison of path 0 against path 1: the root inode decides
+  // first, the depth breaks ties. Returns -1, 0 or 1.
+  const int by_root =
+    (dir_root[0] > dir_root[1]) - (dir_root[0] < dir_root[1]);
+  if (by_root != 0)
+    return by_root;
+
+  // same root: the shallower path sorts first
+  return (dir_depth[0] > dir_depth[1]) - (dir_depth[0] < dir_depth[1]);
+}
+
+cref_t<MClientRequest> MDRequestImpl::release_client_request() // hand out a reference to the client request, taken under msg_lock
+{
+  msg_lock.lock();
+  cref_t<MClientRequest> req;
+  req.swap(client_request); // client_request is momentarily empty here...
+  client_request = req; // ...and is immediately pointed back at the same message
+  msg_lock.unlock();
+  return req; // NOTE(review): despite the name, client_request still holds the message afterwards - confirm intended
+}
+
+void MDRequestImpl::reset_peer_request(const cref_t<MMDSPeerRequest>& req)
+{
+  cref_t<MMDSPeerRequest> previous; // released on return, after msg_lock is dropped
+  {
+    std::lock_guard guard(msg_lock);
+    previous.swap(peer_request);
+    peer_request = req;
+  }
+}
+
+void MDRequestImpl::print(ostream &out) const // one-line summary: request(<reqid> nref=<n> ...)
+{
+  out << "request(" << reqid << " nref=" << nref;
+  //if (request) out << " " << *request;
+  if (is_peer()) out << " peer_to mds." << peer_to_mds; // only for peer requests
+  if (client_request) out << " cr=" << client_request; // streams the reference itself, not the message contents
+  if (peer_request) out << " sr=" << peer_request;
+  out << ")";
+}
+
+void MDRequestImpl::dump(Formatter *f) const // public entry point; forwards to _dump()
+{
+  _dump(f);
+}
+
+void MDRequestImpl::_dump(Formatter *f) const // write state, reqid, op details and events into f
+{
+  f->dump_string("flag_point", state_string());
+  f->dump_stream("reqid") << reqid;
+  {
+    msg_lock.lock(); // copy both message refs under msg_lock, then format without holding it
+    auto _client_request = client_request;
+    auto _peer_request =peer_request;
+    msg_lock.unlock();
+
+    if (_client_request) { // client request takes precedence over the other op kinds
+      f->dump_string("op_type", "client_request");
+      f->open_object_section("client_info");
+      f->dump_stream("client") << _client_request->get_orig_source();
+      f->dump_int("tid", _client_request->get_tid());
+      f->close_section(); // client_info
+    } else if (is_peer()) { // replies go to an existing mdr
+      f->dump_string("op_type", "peer_request");
+      f->open_object_section("leader_info");
+      f->dump_stream("leader") << peer_to_mds;
+      f->close_section(); // leader_info
+
+      if (_peer_request) { // details are only available while a peer message is attached
+        f->open_object_section("request_info");
+        f->dump_int("attempt", _peer_request->get_attempt());
+        f->dump_string("op_type",
+           MMDSPeerRequest::get_opname(_peer_request->get_op()));
+        f->dump_int("lock_type", _peer_request->get_lock_type());
+        f->dump_stream("object_info") << _peer_request->get_object_info();
+        f->dump_stream("srcdnpath") << _peer_request->srcdnpath;
+        f->dump_stream("destdnpath") << _peer_request->destdnpath;
+        f->dump_stream("witnesses") << _peer_request->witnesses;
+        f->dump_bool("has_inode_export",
+           _peer_request->inode_export_v != 0);
+        f->dump_int("inode_export_v", _peer_request->inode_export_v);
+        f->dump_stream("op_stamp") << _peer_request->op_stamp;
+        f->close_section(); // request_info
+      }
+    }
+    else if (internal_op != -1) { // internal request; NOTE(review): _dump_op_descriptor_unlocked tests internal_op >= 0 instead
+      f->dump_string("op_type", "internal_op");
+      f->dump_int("internal_op", internal_op);
+      f->dump_string("op_name", ceph_mds_op_name(internal_op));
+    }
+    else {
+      f->dump_string("op_type", "no_available_op_found");
+    }
+  }
+  {
+    f->open_array_section("events");
+    std::lock_guard l(lock); // `lock` is held while iterating `events`
+    for (auto& i : events) {
+      f->dump_object("event", i);
+    }
+    f->close_section(); // events
+  }
+}
+
+void MDRequestImpl::_dump_op_descriptor_unlocked(ostream& stream) const
+{
+  cref_t<MClientRequest> creq; // snapshots taken under msg_lock, printed without it
+  cref_t<MMDSPeerRequest> preq;
+  {
+    std::lock_guard guard(msg_lock);
+    creq = client_request;
+    preq = peer_request;
+  }
+  if (creq)
+    return creq->print(stream);
+  if (preq)
+    return preq->print(stream);
+  if (is_peer()) {
+    stream << "peer_request:" << reqid;
+  } else if (internal_op >= 0) {
+    stream << "internal op " << ceph_mds_op_name(internal_op) << ":" << reqid;
+  } else { // triggered by a peer request for which no message is available (FIXME)
+    stream << "rejoin:" << reqid;
+  }
+}
+
+void MDLockCache::attach_locks()
+{
+  ceph_assert(!items_lock);
+  // one LockItem per held lock, filled in the iteration order of `locks`
+  items_lock.reset(new LockItem[locks.size()]);
+  LockItem *slot = items_lock.get();
+  for (auto it = locks.begin(); it != locks.end(); ++it, ++slot) {
+    slot->parent = this;
+    it->lock->add_cache(*slot);
+  }
+}
+
+void MDLockCache::attach_dirfrags(std::vector<CDir*>&& dfv)
+{
+  // sort and drop duplicate dirfrags before taking over the vector
+  std::sort(dfv.begin(), dfv.end());
+  dfv.erase(std::unique(dfv.begin(), dfv.end()), dfv.end());
+  auth_pinned_dirfrags = std::move(dfv);
+
+  ceph_assert(!items_dir);
+  const size_t count = auth_pinned_dirfrags.size();
+  items_dir.reset(new DirItem[count]);
+  for (size_t idx = 0; idx < count; ++idx) {
+    DirItem& slot = items_dir[idx];
+    slot.parent = this;
+    auth_pinned_dirfrags[idx]->lock_caches_with_auth_pins.push_back(&slot.item_dir);
+  }
+}
+
+void MDLockCache::detach_locks()
+{
+  ceph_assert(items_lock);
+  // items_lock[k] belongs to the k-th entry of `locks`; walk both in step
+  LockItem *slot = items_lock.get();
+  for (const auto& held : locks) {
+    held.lock->remove_cache(*slot);
+    ++slot;
+  }
+  items_lock.reset();
+}
+
+void MDLockCache::detach_dirfrags()
+{
+  ceph_assert(items_dir);
+  // one DirItem exists per entry of auth_pinned_dirfrags; take each of
+  // them off the list it is linked on
+  const size_t count = auth_pinned_dirfrags.size();
+  for (size_t idx = 0; idx < count; ++idx)
+    items_dir[idx].item_dir.remove_myself();
+
+  items_dir.reset();
+}
diff --git a/src/mds/Mutation.h b/src/mds/Mutation.h
new file mode 100644
index 000000000..6d4073aaf
--- /dev/null
+++ b/src/mds/Mutation.h
@@ -0,0 +1,534 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software 
+ * Foundation.  See file COPYING.
+ * 
+ */
+
+#ifndef CEPH_MDS_MUTATION_H
+#define CEPH_MDS_MUTATION_H
+
+#include "include/interval_set.h"
+#include "include/elist.h"
+#include "include/filepath.h"
+
+#include "MDSCacheObject.h"
+#include "MDSContext.h"
+
+#include "SimpleLock.h"
+#include "Capability.h"
+#include "BatchOp.h"
+
+#include "common/TrackedOp.h"
+#include "messages/MClientRequest.h"
+#include "messages/MMDSPeerRequest.h"
+#include "messages/MClientReply.h"
+
+class LogSegment;
+class CInode;
+class CDir;
+class CDentry;
+class Session;
+class ScatterLock;
+struct sr_t;
+struct MDLockCache;
+
+struct MutationImpl : public TrackedOp { // tracked op carrying the pins, auth pins and locks of one mutation
+public:
+  // -- my pins and auth_pins --
+  struct ObjectState { // per-object entry of object_states
+    bool pinned = false;
+    bool auth_pinned = false;
+    mds_rank_t remote_auth_pinned = MDS_RANK_NONE;
+  };
+
+  // held locks
+  struct LockOp {
+    enum { // bit flags, combined in `flags`
+      RDLOCK		= 1,
+      WRLOCK		= 2,
+      XLOCK		= 4,
+      REMOTE_WRLOCK	= 8,
+      STATE_PIN		= 16, // no RW after locked, just pin lock state
+    };
+
+    LockOp(SimpleLock *l, unsigned f=0, mds_rank_t t=MDS_RANK_NONE) :
+      lock(l), flags(f), wrlock_target(t) {}
+
+    bool is_rdlock() const { return !!(flags & RDLOCK); }
+    bool is_xlock() const { return !!(flags & XLOCK); }
+    bool is_wrlock() const { return !!(flags & WRLOCK); }
+    void clear_wrlock() const { flags &= ~WRLOCK; } // const because flags is mutable
+    bool is_remote_wrlock() const { return !!(flags & REMOTE_WRLOCK); }
+    void clear_remote_wrlock() const {
+      flags &= ~REMOTE_WRLOCK;
+      wrlock_target = MDS_RANK_NONE;
+    }
+    bool is_state_pin() const { return !!(flags & STATE_PIN); }
+    bool operator<(const LockOp& r) const { // ordering uses the lock pointer only, never the flags
+      return lock < r.lock;
+    }
+
+    SimpleLock* lock;
+    mutable unsigned flags; // mutable so entries of the std::set lock_set can be updated in place
+    mutable mds_rank_t wrlock_target;
+  };
+
+  struct LockOpVec : public std::vector<LockOp> { // list of lock operations built via the add_* helpers
+    LockOpVec() {
+      reserve(32);
+    }
+
+    void add_rdlock(SimpleLock *lock) {
+      emplace_back(lock, LockOp::RDLOCK);
+    }
+    void erase_rdlock(SimpleLock *lock);
+    void add_xlock(SimpleLock *lock, int idx=-1) { // idx >= 0 inserts at that position, otherwise appends
+      if (idx >= 0)
+	emplace(cbegin() + idx, lock, LockOp::XLOCK);
+      else
+	emplace_back(lock, LockOp::XLOCK);
+    }
+    void add_wrlock(SimpleLock *lock, int idx=-1) { // idx >= 0 inserts at that position, otherwise appends
+      if (idx >= 0)
+	emplace(cbegin() + idx, lock, LockOp::WRLOCK);
+      else
+	emplace_back(lock, LockOp::WRLOCK);
+    }
+    void add_remote_wrlock(SimpleLock *lock, mds_rank_t rank) {
+      ceph_assert(rank != MDS_RANK_NONE);
+      emplace_back(lock, LockOp::REMOTE_WRLOCK, rank);
+    }
+    void lock_scatter_gather(SimpleLock *lock) {
+      emplace_back(lock, LockOp::WRLOCK | LockOp::STATE_PIN);
+    }
+    void sort_and_merge();
+  };
+
+  using lock_set = std::set<LockOp>;
+  using lock_iterator = lock_set::iterator;
+
+  // keep our default values synced with MDRequestParam's
+  MutationImpl() : TrackedOp(nullptr, utime_t()) {}
+  MutationImpl(OpTracker *tracker, utime_t initiated,
+	       const metareqid_t &ri, __u32 att=0, mds_rank_t peer_to=MDS_RANK_NONE)
+    : TrackedOp(tracker, initiated),
+      reqid(ri), attempt(att),
+      peer_to_mds(peer_to) {}
+  ~MutationImpl() override { // all locking state and pins must be gone before destruction
+    ceph_assert(!locking);
+    ceph_assert(!lock_cache);
+    ceph_assert(num_pins == 0);
+    ceph_assert(num_auth_pins == 0);
+  }
+
+  const ObjectState* find_object_state(MDSCacheObject *obj) const { // nullptr when obj has no entry
+    auto it = object_states.find(obj);
+    return it != object_states.end() ? &it->second : nullptr;
+  }
+
+  bool is_any_remote_auth_pin() const { return num_remote_auth_pins > 0; }
+
+  void disable_lock_cache() {
+    lock_cache_disabled = true;
+  }
+
+  lock_iterator emplace_lock(SimpleLock *l, unsigned f=0, mds_rank_t t=MDS_RANK_NONE) {
+    last_locked = l; // updated whether or not the set already had an entry for l
+    return locks.emplace(l, f, t).first;
+  }
+
+  bool is_rdlocked(SimpleLock *lock) const;
+  bool is_wrlocked(SimpleLock *lock) const;
+  bool is_xlocked(SimpleLock *lock) const {
+    auto it = locks.find(lock);
+    return it != locks.end() && it->is_xlock();
+  }
+  bool is_remote_wrlocked(SimpleLock *lock) const {
+    auto it = locks.find(lock);
+    return it != locks.end() && it->is_remote_wrlock();
+  }
+  bool is_last_locked(SimpleLock *lock) const {
+    return lock == last_locked;
+  }
+
+  bool is_leader() const { return peer_to_mds == MDS_RANK_NONE; }
+  bool is_peer() const { return peer_to_mds != MDS_RANK_NONE; }
+
+  client_t get_client() const { // -1 when reqid does not name a client
+    if (reqid.name.is_client())
+      return client_t(reqid.name.num());
+    return -1;
+  }
+
+  void set_mds_stamp(utime_t t) {
+    mds_stamp = t;
+  }
+  utime_t get_mds_stamp() const {
+    return mds_stamp;
+  }
+  void set_op_stamp(utime_t t) {
+    op_stamp = t;
+  }
+  utime_t get_op_stamp() const { // falls back to the mds stamp while op_stamp is default-constructed
+    if (op_stamp != utime_t())
+      return op_stamp;
+    return get_mds_stamp();
+  }
+
+  // pin items in cache
+  void pin(MDSCacheObject *object);
+  void unpin(MDSCacheObject *object);
+  void set_stickydirs(CInode *in);
+  void put_stickydirs();
+  void drop_pins();
+
+  void start_locking(SimpleLock *lock, int target=-1);
+  void finish_locking(SimpleLock *lock);
+
+  // auth pins
+  bool is_auth_pinned(MDSCacheObject *object) const;
+  void auth_pin(MDSCacheObject *object);
+  void auth_unpin(MDSCacheObject *object);
+  void drop_local_auth_pins();
+  void set_remote_auth_pinned(MDSCacheObject* object, mds_rank_t from);
+  void _clear_remote_auth_pinned(ObjectState& stat);
+
+  void add_projected_node(MDSCacheObject* obj) {
+    projected_nodes.insert(obj);
+  }
+  void remove_projected_node(MDSCacheObject* obj) {
+    projected_nodes.erase(obj);
+  }
+  bool is_projected(MDSCacheObject *obj) const {
+    return projected_nodes.count(obj);
+  }
+  void add_updated_lock(ScatterLock *lock);
+  void add_cow_inode(CInode *in);
+  void add_cow_dentry(CDentry *dn);
+  void apply();
+  void cleanup();
+
+  virtual void print(std::ostream &out) const {
+    out << "mutation(" << this << ")";
+  }
+
+  virtual void dump(ceph::Formatter *f) const {} // no-op by default
+  void _dump_op_descriptor_unlocked(std::ostream& stream) const override;
+
+  metareqid_t reqid;
+  __u32 attempt = 0;      // which attempt for this request
+  LogSegment *ls = nullptr;  // the log segment i'm committing to
+
+  // flag mutation as peer
+  mds_rank_t peer_to_mds = MDS_RANK_NONE;  // this is a peer request if >= 0.
+
+  ceph::unordered_map<MDSCacheObject*, ObjectState> object_states;
+  int num_pins = 0;
+  int num_auth_pins = 0;
+  int num_remote_auth_pins = 0;
+  // cache pins (so things don't expire)
+  CInode* stickydiri = nullptr;
+
+  lock_set locks;  // full ordering
+  MDLockCache* lock_cache = nullptr;
+  bool lock_cache_disabled = false;
+  SimpleLock *last_locked = nullptr;
+  // Lock we are currently trying to acquire. If we give up for some reason,
+  // be sure to eval() this.
+  SimpleLock *locking = nullptr;
+  mds_rank_t locking_target_mds = -1;
+
+  // if this flag is set, do not attempt to acquire further locks.
+  //  (useful for wrlock, which may be a moving auth target)
+  enum { // bit flags for locking_state
+    SNAP_LOCKED		= 1,
+    SNAP2_LOCKED	= 2,
+    PATH_LOCKED		= 4,
+    ALL_LOCKED		= 8,
+  };
+  int locking_state = 0;
+
+  bool committing = false;
+  bool aborted = false;
+  bool killed = false;
+
+  // for applying projected inode changes
+  std::set<MDSCacheObject*> projected_nodes;
+  std::list<ScatterLock*> updated_locks;
+
+  std::list<CInode*> dirty_cow_inodes;
+  std::list<std::pair<CDentry*,version_t> > dirty_cow_dentries;
+
+private:
+  utime_t mds_stamp; ///< mds-local timestamp (real time)
+  utime_t op_stamp;  ///< op timestamp (client provided)
+};
+
+/**
+ * MDRequestImpl: state we track for requests we are currently processing.
+ * mostly information about locks held, so that we can drop them all when
+ * the request is finished or forwarded. see request_*().
+ */
+struct MDRequestImpl : public MutationImpl {
+  // TrackedOp stuff
+  typedef boost::intrusive_ptr<MDRequestImpl> Ref;
+
+  // break rarely-used fields into a separately allocated structure 
+  // to save memory for most ops
+  struct More {
+    More() {}
+
+    int peer_error = 0;
+    std::set<mds_rank_t> peers;           // mds nodes that have peer requests to me (implies client_request)
+    std::set<mds_rank_t> waiting_on_peer; // peers i'm waiting for peerreq replies from.
+
+    // for rename/link/unlink
+    std::set<mds_rank_t> witnessed;       // nodes who have journaled a RenamePrepare
+    std::map<MDSCacheObject*,version_t> pvmap;
+
+    bool has_journaled_peers = false;
+    bool peer_update_journaled = false;
+    bool peer_rolling_back = false;
+    
+    // for rename
+    std::set<mds_rank_t> extra_witnesses; // replica list from srcdn auth (rename)
+    mds_rank_t srcdn_auth_mds = MDS_RANK_NONE;
+    ceph::buffer::list inode_import;
+    version_t inode_import_v = 0;
+    CInode* rename_inode = nullptr; // inode the freeze / ambiguous-auth flags below refer to
+    bool is_freeze_authpin = false;
+    bool is_ambiguous_auth = false;
+    bool is_remote_frozen_authpin = false;
+    bool is_inode_exporter = false;
+    bool rdonly_checks = false;
+
+    std::map<client_t, std::pair<Session*, uint64_t> > imported_session_map;
+    std::map<CInode*, std::map<client_t,Capability::Export> > cap_imports;
+    
+    // for lock/flock
+    bool flock_was_waiting = false;
+
+    // for snaps
+    version_t stid = 0;
+    ceph::buffer::list snapidbl;
+
+    sr_t *srci_srnode = nullptr;
+    sr_t *desti_srnode = nullptr;
+
+    // called when peer commits or aborts
+    Context *peer_commit = nullptr;
+    ceph::buffer::list rollback_bl;
+
+    MDSContext::vec waiting_for_finish;
+
+    // export & fragment
+    CDir* export_dir = nullptr;
+    dirfrag_t fragment_base;
+
+    // for internal ops doing lookup
+    filepath filepath1; // returned by get_filepath() when there is no client request
+    filepath filepath2; // returned by get_filepath2() when there is no client request
+  } *_more = nullptr; // null until allocated; access through more() / has_more()
+
+  // ---------------------------------------------------
+  struct Params { // constructor arguments bundled for MDRequestImpl
+    // keep these default values synced to MutationImpl's
+    Params() {}
+    const utime_t& get_recv_stamp() const {
+      return initiated;
+    }
+    const utime_t& get_throttle_stamp() const {
+      return throttled;
+    }
+    const utime_t& get_recv_complete_stamp() const {
+      return all_read;
+    }
+    const utime_t& get_dispatch_stamp() const {
+      return dispatched;
+    }
+    metareqid_t reqid;
+    __u32 attempt = 0;
+    ceph::cref_t<MClientRequest> client_req;
+    ceph::cref_t<Message> triggering_peer_req;
+    mds_rank_t peer_to = MDS_RANK_NONE;
+    utime_t initiated;
+    utime_t throttled, all_read, dispatched;
+    int internal_op = -1;
+  };
+  MDRequestImpl(const Params* params, OpTracker *tracker) : // params must be non-null; it is dereferenced unconditionally
+    MutationImpl(tracker, params->initiated,
+		 params->reqid, params->attempt, params->peer_to),
+    item_session_request(this), client_request(params->client_req),
+    internal_op(params->internal_op) {}
+  ~MDRequestImpl() override;
+  
+  More* more();
+  bool has_more() const;
+  bool has_witnesses();
+  bool peer_did_prepare();
+  bool peer_rolling_back();
+  bool freeze_auth_pin(CInode *inode);
+  void unfreeze_auth_pin(bool clear_inode=false);
+  void set_remote_frozen_auth_pin(CInode *inode);
+  bool can_auth_pin(MDSCacheObject *object);
+  void drop_local_auth_pins();
+  void set_ambiguous_auth(CInode *inode);
+  void clear_ambiguous_auth();
+  const filepath& get_filepath();
+  const filepath& get_filepath2();
+  void set_filepath(const filepath& fp);
+  void set_filepath2(const filepath& fp);
+  bool is_queued_for_replay() const;
+  int compare_paths();
+
+  bool can_batch();
+  bool is_batch_head() { // true once batch_op_map has been set
+    return batch_op_map != nullptr;
+  }
+  std::unique_ptr<BatchOp> release_batch_op();
+
+  void print(std::ostream &out) const override;
+  void dump(ceph::Formatter *f) const override;
+
+  ceph::cref_t<MClientRequest> release_client_request();
+  void reset_peer_request(const ceph::cref_t<MMDSPeerRequest>& req=nullptr);
+
+  Session *session = nullptr;
+  elist<MDRequestImpl*>::item item_session_request;  // if not on list, op is aborted.
+
+  // -- i am a client (leader) request
+  ceph::cref_t<MClientRequest> client_request; // client request (if any)
+
+  // tree and depth info of path1 and path2
+  inodeno_t dir_root[2] = {0, 0};
+  int dir_depth[2] = {-1, -1};
+  file_layout_t dir_layout;
+  // store up to two sets of dn vectors, inode pointers, for request path1 and path2.
+  std::vector<CDentry*> dn[2];
+  CInode *in[2] = {};
+  CDentry *straydn = nullptr;
+  snapid_t snapid = CEPH_NOSNAP;
+
+  CInode *tracei = nullptr;
+  CDentry *tracedn = nullptr;
+
+  inodeno_t alloc_ino = 0, used_prealloc_ino = 0;
+  interval_set<inodeno_t> prealloc_inos;
+
+  int snap_caps = 0;
+  int getattr_caps = 0;		///< caps requested by getattr
+  bool no_early_reply = false;
+  bool did_early_reply = false;
+  bool o_trunc = false;		///< request is an O_TRUNC mutation
+  bool has_completed = false;	///< request has already completed
+
+  ceph::buffer::list reply_extra_bl;
+
+  // inos we did a embedded cap release on, and may need to eval if we haven't since reissued
+  std::map<vinodeno_t, ceph_seq_t> cap_releases;
+
+  // -- i am a peer request
+  ceph::cref_t<MMDSPeerRequest> peer_request; // peer request (if one is pending; implies peer == true)
+
+  // -- i am an internal op
+  int internal_op; // initialised from Params::internal_op, whose default is -1
+  Context *internal_op_finish = nullptr;
+  void *internal_op_private = nullptr;
+
+  // indicates how many retries of request have been made
+  int retry = 0;
+
+  std::map<int, std::unique_ptr<BatchOp> > *batch_op_map = nullptr; // keyed by getattr mask (see release_batch_op)
+
+  // indicator for vxattr osdmap update
+  bool waited_for_osdmap = false;
+
+protected:
+  void _dump(ceph::Formatter *f) const override;
+  void _dump_op_descriptor_unlocked(std::ostream& stream) const override;
+private:
+  mutable ceph::spinlock msg_lock; // held while client_request / peer_request are read or swapped in Mutation.cc
+};
+
+struct MDPeerUpdate {
+  // The rollback payload is moved out of rbl; the caller's list is
+  // not copied.
+  MDPeerUpdate(int oo, ceph::buffer::list &rbl) :
+    origop(oo), rollback(std::move(rbl)) {}
+  ~MDPeerUpdate() {
+    if (waiter != nullptr) // a registered waiter is completed on destruction
+      waiter->complete(0);
+  }
+  int origop;
+  ceph::buffer::list rollback;
+  Context *waiter = nullptr;
+  std::set<CInode*> olddirs;
+  std::set<CInode*> unlinked;
+};
+
+struct MDLockCacheItem { // per-lock link record; MDLockCache keeps one per held lock (LockItem)
+  MDLockCache *parent = nullptr; // set to the owning cache in MDLockCache::attach_locks()
+  elist<MDLockCacheItem*>::item item_lock;
+};
+
+struct MDLockCache : public MutationImpl { // mutation that links itself to a client cap, its locks and auth-pinned dirfrags
+  using LockItem = MDLockCacheItem;
+
+  struct DirItem { // per-dirfrag link record, one per entry of auth_pinned_dirfrags
+    MDLockCache *parent = nullptr;
+    elist<DirItem*>::item item_dir;
+  };
+
+  MDLockCache(Capability *cap, int op) : // registers itself on cap->lock_caches
+    MutationImpl(), diri(cap->get_inode()), client_cap(cap), opcode(op) {
+    client_cap->lock_caches.push_back(&item_cap_lock_cache);
+  }
+
+  CInode *get_dir_inode() { return diri; }
+  void set_dir_layout(const file_layout_t& layout) { // layout is only copied, so take it by const reference
+    dir_layout = layout;
+  }
+  const file_layout_t& get_dir_layout() const {
+    return dir_layout;
+  }
+
+  void attach_locks();
+  void attach_dirfrags(std::vector<CDir*>&& dfv);
+  void detach_locks();
+  void detach_dirfrags();
+
+  CInode *diri; // inode of the cap passed to the constructor
+  Capability *client_cap;
+  int opcode;
+  file_layout_t dir_layout;
+
+  elist<MDLockCache*>::item item_cap_lock_cache;
+
+  // link myself to locked locks
+  std::unique_ptr<LockItem[]> items_lock;
+
+  // link myself to auth-pinned dirfrags
+  std::unique_ptr<DirItem[]> items_dir;
+  std::vector<CDir*> auth_pinned_dirfrags;
+
+  int ref = 1;
+  bool invalidating = false;
+};
+
+using MutationRef = boost::intrusive_ptr<MutationImpl>; // ref-counted handle to a mutation
+using MDRequestRef = boost::intrusive_ptr<MDRequestImpl>; // ref-counted handle to a request
+
+inline std::ostream& operator<<(std::ostream &out, const MutationImpl &mut) // stream a mutation via its virtual print()
+{
+  mut.print(out); // virtual, so MDRequestImpl prints its own form
+  return out;
+}
+#endif
diff --git a/src/mds/OpenFileTable.cc b/src/mds/OpenFileTable.cc
new file mode 100644
index 000000000..687ddc3e9
--- /dev/null
+++ b/src/mds/OpenFileTable.cc
@@ -0,0 +1,1228 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2018 Red Hat
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+
+#include "acconfig.h"
+#include "mds/CInode.h"
+#include "mds/CDir.h"
+#include "mds/MDSRank.h"
+#include "mds/MDCache.h"
+#include "osdc/Objecter.h"
+#include "OpenFileTable.h"
+
+#include "common/config.h"
+#include "common/errno.h"
+
+enum { // perf counter indices for the builder created in the OpenFileTable constructor
+  l_oft_first = 1000000, // lower bound passed to PerfCountersBuilder, not a counter itself
+  l_oft_omap_total_objs,
+  l_oft_omap_total_kv_pairs,
+  l_oft_omap_total_updates,
+  l_oft_omap_total_removes,
+  l_oft_last // upper bound passed to PerfCountersBuilder, not a counter itself
+};
+
+#define dout_context g_ceph_context
+#define dout_subsys ceph_subsys_mds
+#undef dout_prefix
+#define dout_prefix _prefix(_dout, mds)
+static ostream& _prefix(std::ostream *_dout, MDSRank *mds) { // log prefix used through the dout_prefix macro above
+  return *_dout << "mds." << mds->get_nodeid() << ".openfiles ";
+}
+
+OpenFileTable::OpenFileTable(MDSRank *m) : mds(m) {
+  // build the omap perf counters and add them to the context's collection
+  PerfCountersBuilder b(mds->cct, "oft", l_oft_first, l_oft_last);
+  b.add_u64(l_oft_omap_total_objs, "omap_total_objs");
+  b.add_u64(l_oft_omap_total_kv_pairs, "omap_total_kv_pairs");
+  b.add_u64(l_oft_omap_total_updates, "omap_total_updates");
+  b.add_u64(l_oft_omap_total_removes, "omap_total_removes");
+  logger.reset(b.create_perf_counters());
+  mds->cct->get_perfcounters_collection()->add(logger.get());
+
+  // every counter strictly between l_oft_first and l_oft_last starts at zero
+  for (int idx = l_oft_first + 1; idx < l_oft_last; ++idx)
+    logger->set(idx, 0);
+}
+
+OpenFileTable::~OpenFileTable() {
+  if (!logger)
+    return; // no counters to take out of the collection
+  mds->cct->get_perfcounters_collection()->remove(logger.get());
+}
+
+void OpenFileTable::get_ref(CInode *in, frag_t fg) // reference in's anchor, creating anchors up the parent chain as needed
+{
+  do {
+    auto p = anchor_map.find(in->ino());
+    if (!in->is_dir()) { // a non-directory carries no frag and must not be anchored yet
+      ceph_assert(fg == -1U);
+      ceph_assert(p == anchor_map.end());
+    }
+
+    if (p != anchor_map.end()) { // already anchored: bump the count and stop walking up
+      ceph_assert(in->state_test(CInode::STATE_TRACKEDBYOFT));
+      ceph_assert(p->second.nref > 0);
+      p->second.nref++;
+
+      if (fg != -1U) { // a frag was given: record it (must be new) and mark the anchor dirty
+	auto ret = p->second.frags.insert(fg);
+	ceph_assert(ret.second);
+	dirty_items.emplace(in->ino(), (int)DIRTY_UNDEF);
+      }
+      break;
+    }
+
+    CDentry *dn = in->get_parent_dn();
+    CInode *pin = dn ? dn->get_dir()->get_inode() : nullptr; // parent inode, or nullptr without a parent dentry
+
+    auto ret = anchor_map.emplace(std::piecewise_construct, std::forward_as_tuple(in->ino()),
+				  std::forward_as_tuple(in->ino(), (pin ? pin->ino() : inodeno_t(0)),
+				  (dn ? dn->get_name() : string()), in->d_type(), 1)); // trailing 1 is presumably the initial nref - confirm against the anchor type
+    ceph_assert(ret.second == true);
+    in->state_set(CInode::STATE_TRACKEDBYOFT);
+
+    if (fg != -1U)
+      ret.first->second.frags.insert(fg);
+
+    auto ret1 = dirty_items.emplace(in->ino(), (int)DIRTY_NEW);
+    if (!ret1.second) { // a dirty entry already existed: carry its omap index over to the new anchor
+      int omap_idx = ret1.first->second;
+      ceph_assert(omap_idx >= 0);
+      ret.first->second.omap_idx = omap_idx;
+    }
+
+    in = pin; // the new anchor in turn references its parent
+    fg = -1U; // the frag applies to the first inode only
+  } while (in);
+}
+
+void OpenFileTable::put_ref(CInode *in, frag_t fg) // drop a reference on in's anchor, removing anchors up the parent chain as they hit zero
+{
+  do {
+    ceph_assert(in->state_test(CInode::STATE_TRACKEDBYOFT));
+    auto p = anchor_map.find(in->ino());
+    ceph_assert(p != anchor_map.end());
+    ceph_assert(p->second.nref > 0);
+
+    if (!in->is_dir()) { // a non-directory carries no frag and holds exactly one reference
+      ceph_assert(fg == -1U);
+      ceph_assert(p->second.nref == 1);
+    }
+
+    if (p->second.nref > 1) { // other references remain: decrement and stop walking up
+      p->second.nref--;
+      if (fg != -1U) { // a frag was given: it must be recorded; remove it and mark the anchor dirty
+	auto ret = p->second.frags.erase(fg);
+	ceph_assert(ret);
+	dirty_items.emplace(in->ino(), (int)DIRTY_UNDEF);
+      }
+      break;
+    }
+
+    CDentry *dn = in->get_parent_dn();
+    CInode *pin = dn ? dn->get_dir()->get_inode() : nullptr; // parent inode, or nullptr without a parent dentry
+    if (dn) { // the anchor must match the current parent linkage
+      ceph_assert(p->second.dirino == pin->ino());
+      ceph_assert(p->second.d_name == dn->get_name());
+    } else {
+      ceph_assert(p->second.dirino == inodeno_t(0));
+      ceph_assert(p->second.d_name == "");
+    }
+
+    if (fg != -1U) { // last reference: fg must be the only recorded frag
+      ceph_assert(p->second.frags.size() == 1);
+      ceph_assert(*p->second.frags.begin() == fg);
+    }
+
+    int omap_idx = p->second.omap_idx; // saved before the anchor is erased
+    anchor_map.erase(p);
+    in->state_clear(CInode::STATE_TRACKEDBYOFT);
+
+    auto ret = dirty_items.emplace(in->ino(), omap_idx);
+    if (!ret.second) { // a dirty entry already existed
+      if (ret.first->second == DIRTY_NEW) { // added and removed while still DIRTY_NEW: just drop the dirty entry
+	ceph_assert(omap_idx < 0);
+	dirty_items.erase(ret.first);
+      } else {
+	ceph_assert(omap_idx >= 0);
+	ret.first->second = omap_idx;
+      }
+    }
+
+    in = pin; // the removed anchor releases its reference on the parent
+    fg = -1U; // the frag applies to the first inode only
+  } while (in);
+}
+
+void OpenFileTable::add_inode(CInode *in) // log, then take an anchor reference for in
+{
+  dout(10) << __func__ << " " << *in << dendl;
+  get_ref(in); // called without a frag argument
+}
+
+void OpenFileTable::remove_inode(CInode *in) // log, then drop the anchor reference for in
+{
+  dout(10) << __func__ << " " << *in << dendl;
+  put_ref(in); // called without a frag argument
+}
+
+void OpenFileTable::add_dirfrag(CDir *dir) // start tracking dir: flag it and reference its inode's anchor with the frag
+{
+  dout(10) << __func__ << " " << *dir << dendl;
+  ceph_assert(!dir->state_test(CDir::STATE_TRACKEDBYOFT)); // a dirfrag may be added only once
+  dir->state_set(CDir::STATE_TRACKEDBYOFT);
+  get_ref(dir->get_inode(), dir->get_frag());
+}
+
+void OpenFileTable::remove_dirfrag(CDir *dir) // stop tracking dir: clear the flag and drop the frag's anchor reference
+{
+  dout(10) << __func__ << " " << *dir << dendl;
+  ceph_assert(dir->state_test(CDir::STATE_TRACKEDBYOFT)); // must have been added before
+  dir->state_clear(CDir::STATE_TRACKEDBYOFT);
+  put_ref(dir->get_inode(), dir->get_frag());
+}
+
+void OpenFileTable::notify_link(CInode *in)
+{
+  dout(10) << __func__ << " " << *in << dendl;
+  // the anchor must exist, be referenced and have no parent recorded yet
+  auto p = anchor_map.find(in->ino());
+  ceph_assert(p != anchor_map.end());
+  ceph_assert(p->second.nref > 0);
+  ceph_assert(p->second.dirino == inodeno_t(0));
+  ceph_assert(p->second.d_name == "");
+
+  auto& anchor = p->second;
+  CDentry *parent_dn = in->get_parent_dn();
+  CInode *parent_in = parent_dn->get_dir()->get_inode();
+  anchor.dirino = parent_in->ino();
+  anchor.d_name = parent_dn->get_name();
+  dirty_items.emplace(in->ino(), (int)DIRTY_UNDEF);
+  get_ref(parent_in); // the newly recorded parent gains a reference
+}
+
+void OpenFileTable::notify_unlink(CInode *in)
+{
+  dout(10) << __func__ << " " << *in << dendl;
+  auto p = anchor_map.find(in->ino());
+  ceph_assert(p != anchor_map.end());
+  ceph_assert(p->second.nref > 0);
+
+  // the recorded parent must still match the dentry being unlinked
+  CDentry *dn = in->get_parent_dn();
+  CInode *pin = dn->get_dir()->get_inode();
+  ceph_assert(p->second.dirino == pin->ino());
+  ceph_assert(p->second.d_name == dn->get_name());
+
+  auto& anchor = p->second;
+  anchor.dirino = inodeno_t(0);
+  anchor.d_name = "";
+  dirty_items.emplace(in->ino(), (int)DIRTY_UNDEF); put_ref(pin); // the former parent loses its reference
+}
+
+object_t OpenFileTable::get_object_name(unsigned idx) const // name of omap object idx: mds<rank>_openfiles.<idx in hex>
+{
+  char s[40]; // worst case with 32-bit int/unsigned is 34 bytes: "mds" + 11 + "_openfiles." + 8 + NUL
+  snprintf(s, sizeof(s), "mds%d_openfiles.%x", int(mds->get_nodeid()), idx);
+  return object_t(s);
+}
+
+void OpenFileTable::_encode_header(bufferlist &bl, int j_state) // append the omap header to bl
+{
+  std::string_view magic = CEPH_FS_ONDISK_MAGIC;
+  encode(magic, bl); // the magic string goes in before the versioned section
+  ENCODE_START(1, 1, bl);
+  encode(omap_version, bl);
+  encode(omap_num_objs, bl);
+  encode((__u8)j_state, bl); // journal state is stored as a single byte
+  ENCODE_FINISH(bl);
+}
+
+class C_IO_OFT_Save : public MDSIOContextBase { // I/O completion that forwards to OpenFileTable::_commit_finish()
+protected:
+  OpenFileTable *oft;
+  uint64_t log_seq; // passed through to _commit_finish()
+  MDSContext *fin; // passed through to _commit_finish(); may be null
+  MDSRank *get_mds() override { return oft->mds; }
+public:
+  C_IO_OFT_Save(OpenFileTable *t, uint64_t s, MDSContext *c) :
+    oft(t), log_seq(s), fin(c) {}
+  void finish(int r) override { // marked override like the sibling virtuals
+    oft->_commit_finish(r, log_seq, fin);
+  }
+  void print(ostream& out) const override {
+    out << "openfiles_save";
+  }
+};
+
+void OpenFileTable::_commit_finish(int r, uint64_t log_seq, MDSContext *fin) // completion of a commit: record log_seq as committed and complete fin
+{
+  dout(10) << __func__ << " log_seq " << log_seq << " committed_log_seq " << committed_log_seq
+           << " committing_log_seq " << committing_log_seq << dendl;
+  if (r < 0) {
+    mds->handle_write_error(r);
+    return; // NOTE(review): fin is not completed and num_pending_commit is not decremented on this path - confirm intended
+  }
+
+  ceph_assert(log_seq == committing_log_seq); // must be the commit currently in flight
+  ceph_assert(log_seq >= committed_log_seq);
+  committed_log_seq = log_seq;
+  num_pending_commit--;
+
+  if (fin) // fin is optional
+    fin->complete(r);
+}
+
+class C_IO_OFT_Journal : public MDSIOContextBase { // I/O completion that forwards to OpenFileTable::_journal_finish()
+protected:
+  OpenFileTable *oft;
+  uint64_t log_seq;
+  MDSContext *fin;
+  std::map<unsigned, std::vector<ObjectOperation> > ops_map; // per-object-index operations, taken from the caller
+  MDSRank *get_mds() override { return oft->mds; }
+public:
+  C_IO_OFT_Journal(OpenFileTable *t, uint64_t s, MDSContext *c,
+		   std::map<unsigned, std::vector<ObjectOperation> >& ops) :
+    oft(t), log_seq(s), fin(c) {
+    ops_map.swap(ops); // leaves the caller's map holding the (empty) initial contents
+  }
+  void finish(int r) override { // marked override like the sibling virtuals
+    oft->_journal_finish(r, log_seq, fin, ops_map);
+  }
+  void print(ostream& out) const override {
+    out << "openfiles_journal";
+  }
+};
+
+void OpenFileTable::_journal_finish(int r, uint64_t log_seq, MDSContext *c,
+				    std::map<unsigned, std::vector<ObjectOperation> >& ops_map)
+{
+  dout(10) << __func__ << " log_seq " << log_seq << dendl;
+  if (r < 0) {
+    mds->handle_write_error(r);
+    return;
+  }
+
+  // the gather's finisher hands a C_IO_OFT_Save to mds->finisher
+  auto *on_saved = new C_OnFinisher(new C_IO_OFT_Save(this, log_seq, c),
+				    mds->finisher);
+  C_GatherBuilder gather(g_ceph_context, on_saved);
+  SnapContext snapc;
+  object_locator_t oloc(mds->get_metadata_pool());
+  for (auto& entry : ops_map) {
+    const object_t oid = get_object_name(entry.first);
+    for (auto& op : entry.second) {
+      mds->objecter->mutate(oid, oloc, op, snapc, ceph::real_clock::now(),
+			    0, gather.new_sub());
+    }
+  }
+  gather.activate();
+
+  journal_state = JOURNAL_NONE;
+}
+
+void OpenFileTable::commit(MDSContext *c, uint64_t log_seq, int op_prio)
+{ // Write the pending changes in dirty_items to the omap objects; c and log_seq are handed to C_IO_OFT_Save, op_prio is set on every op built here.
+  dout(10) << __func__ << " log_seq " << log_seq << " committing_log_seq:"
+          << committing_log_seq << dendl;
+  // a commit must not be started while another one is still pending
+  ceph_assert(num_pending_commit == 0);
+  num_pending_commit++;
+  ceph_assert(log_seq >= committing_log_seq);
+  committing_log_seq = log_seq;
+
+  omap_version++; // encoded into every journal entry written below
+
+  C_GatherBuilder gather(g_ceph_context); // gathers the completions of the writes issued from this function
+
+  SnapContext snapc;
+  object_locator_t oloc(mds->get_metadata_pool());
+
+  const unsigned max_write_size = mds->mdcache->max_dir_commit_size; // staged size at which changes are flushed into their own op
+  // per omap object bookkeeping for this commit
+  struct omap_update_ctl {
+    unsigned write_size = 0; // approximate size of the changes staged since the last flush
+    unsigned journal_idx = 0; // number of "_journal.<n>" keys written for this object so far
+    bool clear = false; // issue omap_clear with the next op (set for objects added by this commit)
+    std::map<string, bufferlist> to_update, journaled_update; // key -> value: staged / already journaled
+    std::set<string> to_remove, journaled_remove; // keys to delete: staged / already journaled
+  };
+  std::vector<omap_update_ctl> omap_updates(omap_num_objs);
+
+  using ceph::encode;
+  auto journal_func = [&](unsigned idx) { // write the staged changes of object idx right away as one "_journal.<n>" key
+    auto& ctl = omap_updates.at(idx);
+
+    ObjectOperation op;
+    op.priority = op_prio;
+
+    if (ctl.clear) {
+      ctl.clear = false;
+      op.omap_clear();
+      op.set_last_op_flags(CEPH_OSD_OP_FLAG_FAILOK);
+    }
+    // the first journal entry of an object also writes the header with the JOURNAL_START state
+    if (ctl.journal_idx == 0) {
+      if (journal_state == JOURNAL_NONE)
+	journal_state = JOURNAL_START;
+      else
+	ceph_assert(journal_state == JOURNAL_START);
+
+      bufferlist header;
+      _encode_header(header, journal_state);
+      op.omap_set_header(header);
+    }
+    // journal payload: version, then the staged updates and removals
+    bufferlist bl;
+    encode(omap_version, bl);
+    encode(ctl.to_update, bl);
+    encode(ctl.to_remove, bl);
+
+    char key[32];
+    snprintf(key, sizeof(key), "_journal.%x", ctl.journal_idx++);
+    std::map<string, bufferlist> tmp_map;
+    tmp_map[key].swap(bl);
+    op.omap_set(tmp_map);
+
+    object_t oid = get_object_name(idx);
+    mds->objecter->mutate(oid, oloc, op, snapc, ceph::real_clock::now(), 0,
+			  gather.new_sub());
+    // keep what has been journaled; it is turned into deferred ops further down
+#ifdef HAVE_STDLIB_MAP_SPLICING
+    ctl.journaled_update.merge(ctl.to_update);
+    ctl.journaled_remove.merge(ctl.to_remove);
+#else
+    ctl.journaled_update.insert(make_move_iterator(begin(ctl.to_update)),
+				make_move_iterator(end(ctl.to_update)));
+    ctl.journaled_remove.insert(make_move_iterator(begin(ctl.to_remove)),
+				make_move_iterator(end(ctl.to_remove)));
+#endif
+    ctl.to_update.clear();
+    ctl.to_remove.clear();
+  };
+
+  std::map<unsigned, std::vector<ObjectOperation> > ops_map; // deferred (non-journal) operations per object index
+  // create_op_func: append one deferred op for object idx carrying its staged changes (plus clear/header when requested)
+  auto create_op_func = [&](unsigned idx, bool update_header) {
+    auto& ctl = omap_updates.at(idx);
+
+    auto& op_vec = ops_map[idx];
+    op_vec.resize(op_vec.size() + 1);
+    ObjectOperation& op = op_vec.back();
+    op.priority = op_prio;
+
+    if (ctl.clear) {
+      ctl.clear = false;
+      op.omap_clear();
+      op.set_last_op_flags(CEPH_OSD_OP_FLAG_FAILOK);
+    }
+
+    if (update_header) {
+      bufferlist header;
+      _encode_header(header, journal_state);
+      op.omap_set_header(header);
+    }
+
+    if (!ctl.to_update.empty()) {
+      op.omap_set(ctl.to_update);
+      ctl.to_update.clear();
+    }
+    if (!ctl.to_remove.empty()) {
+      op.omap_rm_keys(ctl.to_remove);
+      ctl.to_remove.clear();
+    }
+  };
+  // submit_ops_func: send every deferred op, with C_IO_OFT_Save as the finisher of the gather
+  auto submit_ops_func = [&]() {
+    gather.set_finisher(new C_OnFinisher(new C_IO_OFT_Save(this, log_seq, c),
+					 mds->finisher));
+    for (auto& [idx, vops] : ops_map) {
+      object_t oid = get_object_name(idx);
+      for (auto& op : vops) {
+	mds->objecter->mutate(oid, oloc, op, snapc, ceph::real_clock::now(),
+			      0, gather.new_sub());
+      }
+    }
+    gather.activate();
+  };
+
+  bool first_commit = !loaded_anchor_map.empty(); // loaded on-disk state still has to be reconciled (the map is cleared below)
+
+  unsigned first_free_idx = 0; // where the search for an object with room starts
+  unsigned old_num_objs = omap_num_objs;
+  if (omap_num_objs == 0) { // no object yet: start with one, cleared by its first op
+    omap_num_objs = 1;
+    omap_num_items.resize(omap_num_objs);
+    omap_updates.resize(omap_num_objs);
+    omap_updates.back().clear = true;
+  }
+  // stage an update or a removal for every dirty item
+  for (auto& [ino, state] : dirty_items) {
+    auto p = anchor_map.find(ino);
+
+    if (first_commit) { // adopt the loaded object index; skip the item if the loaded copy is equal
+      auto q = loaded_anchor_map.find(ino);
+      if (q != loaded_anchor_map.end()) {
+	ceph_assert(p != anchor_map.end());
+	p->second.omap_idx = q->second.omap_idx;
+	bool same = (p->second == q->second);
+	loaded_anchor_map.erase(q);
+	if (same)
+	  continue;
+      }
+    }
+
+    char key[32];
+    int len = snprintf(key, sizeof(key), "%llx", (unsigned long long)ino.val); // omap key is the inode number in hex
+
+    int omap_idx;
+    if (p != anchor_map.end()) { // item exists: it will be written
+      omap_idx = p->second.omap_idx;
+      if (omap_idx < 0) { // no object assigned yet
+	ceph_assert(state == DIRTY_NEW);
+	// find omap object to store the key
+	for (unsigned i = first_free_idx; i < omap_num_objs; i++) {
+	  if (omap_num_items[i] < MAX_ITEMS_PER_OBJ) {
+	    omap_idx = i;
+	    break;
+	  }
+	}
+	if (omap_idx < 0) { // every object searched is full: append a new one
+	  ++omap_num_objs;
+	  ceph_assert(omap_num_objs <= MAX_OBJECTS);
+	  omap_num_items.resize(omap_num_objs);
+	  omap_updates.resize(omap_num_objs);
+	  omap_updates.back().clear = true;
+	  omap_idx = omap_num_objs - 1;
+	}
+	first_free_idx = omap_idx;
+
+	p->second.omap_idx = omap_idx;
+	++omap_num_items[omap_idx];
+      }
+    } else { // item is gone: its dirty state is used as the object index of the key to remove
+      omap_idx = state;
+      unsigned& count = omap_num_items.at(omap_idx);
+      ceph_assert(count > 0);
+      --count;
+      if ((unsigned)omap_idx < first_free_idx && count < MAX_ITEMS_PER_OBJ)
+	first_free_idx = omap_idx;
+    }
+    auto& ctl = omap_updates.at(omap_idx);
+    if (ctl.write_size >= max_write_size) { // enough staged already: journal it before staging more
+      journal_func(omap_idx);
+      ctl.write_size = 0;
+    }
+    if (p != anchor_map.end()) {
+      bufferlist bl;
+      encode(p->second, bl);
+      encode((__u32)0, bl); // frags set was encoded here
+
+      ctl.write_size += bl.length() + len + 2 * sizeof(__u32);
+      ctl.to_update[key].swap(bl);
+    } else {
+      ctl.write_size += len + sizeof(__u32);
+      ctl.to_remove.emplace(key);
+    }
+  }
+
+  dirty_items.clear();
+  // whatever is left in loaded_anchor_map was not matched by a dirty item: remove those keys
+  if (first_commit) {
+    for (auto& [ino, anchor] : loaded_anchor_map) {
+      char key[32];
+      int len = snprintf(key, sizeof(key), "%llx", (unsigned long long)ino.val);
+
+      int omap_idx = anchor.omap_idx;
+      unsigned& count = omap_num_items.at(omap_idx);
+      ceph_assert(count > 0);
+      --count;
+
+      auto& ctl = omap_updates.at(omap_idx);
+      if (ctl.write_size >= max_write_size) {
+        journal_func(omap_idx);
+        ctl.write_size = 0;
+      }
+      ctl.write_size += len + sizeof(__u32);
+      ctl.to_remove.emplace(key);
+    }
+    loaded_anchor_map.clear();
+  }
+
+  size_t total_items = 0;
+  {
+    unsigned used_objs = 1; // index of the last non-empty object plus one, never below 1
+    std::vector<unsigned> objs_to_write; // objects with staged changes and no journal entry
+    bool journaled = false; // at least one object already has a journal entry
+    for (unsigned i = 0; i < omap_num_objs; i++) {
+      total_items += omap_num_items[i];
+      if (omap_updates[i].journal_idx)
+	journaled = true;
+      else if (omap_updates[i].write_size)
+	objs_to_write.push_back(i);
+
+      if (omap_num_items[i] > 0)
+	used_objs = i + 1;
+    }
+    ceph_assert(total_items == anchor_map.size());
+    // adjust omap object count
+    if (used_objs < omap_num_objs) {
+      omap_num_objs = used_objs;
+      omap_num_items.resize(omap_num_objs);
+    }
+    // skip journal if only one osd request is required and object count
+    // does not change.
+    if (!journaled && old_num_objs == omap_num_objs &&
+	objs_to_write.size() <= 1) {
+      ceph_assert(journal_state == JOURNAL_NONE);
+      ceph_assert(!gather.has_subs());
+
+      unsigned omap_idx = objs_to_write.empty() ? 0 : objs_to_write.front();
+      create_op_func(omap_idx, true);
+      submit_ops_func();
+      return; // NOTE(review): the perf counters set at the end of this function are not updated on this path
+    }
+  }
+  // journaled path: flush everything that is still staged into journal entries
+  for (unsigned omap_idx = 0; omap_idx < omap_updates.size(); omap_idx++) {
+    auto& ctl = omap_updates[omap_idx];
+    if (ctl.write_size > 0) {
+      journal_func(omap_idx);
+      ctl.write_size = 0;
+    }
+  }
+
+  if (journal_state == JOURNAL_START) {
+    ceph_assert(gather.has_subs());
+    journal_state = JOURNAL_FINISH;
+  } else {
+    // only object count changes
+    ceph_assert(journal_state == JOURNAL_NONE);
+    ceph_assert(!gather.has_subs());
+  }
+
+  uint64_t total_updates = 0;
+  uint64_t total_removes = 0;
+  // turn the journaled changes into deferred ops that write the real keys and delete the journal keys
+  for (unsigned omap_idx = 0; omap_idx < omap_updates.size(); omap_idx++) {
+    auto& ctl = omap_updates[omap_idx];
+    ceph_assert(ctl.to_update.empty() && ctl.to_remove.empty());
+    if (ctl.journal_idx == 0)
+      ceph_assert(ctl.journaled_update.empty() && ctl.journaled_remove.empty());
+
+    bool first = true; // only the first deferred op of an object rewrites its header
+    for (auto& it : ctl.journaled_update) {
+      if (ctl.write_size >= max_write_size) {
+        create_op_func(omap_idx, first);
+        ctl.write_size = 0;
+        first = false;
+      }
+      ctl.write_size += it.first.length() + it.second.length() + 2 * sizeof(__u32);
+      ctl.to_update[it.first].swap(it.second);
+      total_updates++;
+    }
+
+    for (auto& key : ctl.journaled_remove) {
+      if (ctl.write_size >= max_write_size) {
+        create_op_func(omap_idx, first);
+        ctl.write_size = 0;
+        first = false;
+      }
+
+      ctl.write_size += key.length() + sizeof(__u32);
+      ctl.to_remove.emplace(key);
+      total_removes++;
+    }
+
+    for (unsigned i = 0; i < ctl.journal_idx; ++i) { // the journal keys themselves are removed as well
+      char key[32];
+      snprintf(key, sizeof(key), "_journal.%x", i);
+      ctl.to_remove.emplace(key);
+    }
+
+    // update first object's omap header if object count changes
+    if (ctl.clear ||
+	ctl.journal_idx > 0 ||
+	(omap_idx == 0 && old_num_objs != omap_num_objs))
+      create_op_func(omap_idx, first);
+  }
+
+  ceph_assert(!ops_map.empty());
+  if (journal_state == JOURNAL_FINISH) { // the deferred ops are submitted by _journal_finish() via C_IO_OFT_Journal
+    gather.set_finisher(new C_OnFinisher(new C_IO_OFT_Journal(this, log_seq, c, ops_map),
+					 mds->finisher));
+    gather.activate();
+  } else {
+    submit_ops_func();
+  }
+  logger->set(l_oft_omap_total_objs, omap_num_objs);
+  logger->set(l_oft_omap_total_kv_pairs, total_items);
+  logger->inc(l_oft_omap_total_updates, total_updates);
+  logger->inc(l_oft_omap_total_removes, total_removes);
+}
+
+// Completion for one omap read issued by _read_omap_values(). The read op is
+// given pointers to the public result fields below; finish() forwards them
+// to OpenFileTable::_load_finish().
+class C_IO_OFT_Load : public MDSIOContextBase {
+protected:
+  OpenFileTable *oft;
+  MDSRank *get_mds() override { return oft->mds; }
+public:
+  int header_r = 0; // result code of the omap header read
+  int values_r = 0; // result code of the omap values read
+  bufferlist header_bl; // omap header as read
+  std::map<std::string, bufferlist> values; // omap key/value pairs as read
+  unsigned index; // forwarded to _load_finish() as 'idx'
+  bool first; // forwarded to _load_finish() as 'first'
+  bool more = false; // forwarded to _load_finish() as 'more'
+  C_IO_OFT_Load(OpenFileTable *table, unsigned obj_idx, bool is_first)
+    : oft(table), index(obj_idx), first(is_first) {}
+  void finish(int r) override {
+    oft->_load_finish(r, header_r, values_r, index, first, more,
+		      header_bl, values);
+  }
+  void print(ostream& out) const override { out << "openfiles_load"; }
+};
+
+// Completion for the writes issued while replaying a loaded journal; hands
+// the result to OpenFileTable::_recover_finish().
+class C_IO_OFT_Recover : public MDSIOContextBase {
+protected:
+  OpenFileTable *oft;
+  MDSRank *get_mds() override { return oft->mds; }
+public:
+  C_IO_OFT_Recover(OpenFileTable *table) : oft(table) {}
+  void finish(int r) override { oft->_recover_finish(r); }
+  void print(ostream& out) const override {
+    out << "openfiles_recover";
+  }
+};
+
+// Journal replay finished with result r: reset the loaded state on failure, then mark the table loaded and wake the waiters.
+void OpenFileTable::_recover_finish(int r)
+{
+  if (r >= 0) {
+    dout(10) << __func__ << ": load complete" << dendl;
+  } else {
+    derr << __func__ << " got " << cpp_strerror(r) << dendl;
+    _reset_states();
+  }
+  journal_state = JOURNAL_NONE;
+  load_done = true;
+  finish_contexts(g_ceph_context, waiting_for_load);
+  waiting_for_load.clear();
+}
+
+// Issue an omap read on object idx, passing key as the start position; with first set the omap header is requested in the same op.
+void OpenFileTable::_read_omap_values(const std::string& key, unsigned idx,
+                                      bool first)
+{
+  object_t oid = get_object_name(idx);
+  dout(10) << __func__ << ": load from '" << oid << ":" << key << "'" << dendl;
+  C_IO_OFT_Load *load_ctx = new C_IO_OFT_Load(this, idx, first);
+  ObjectOperation rd;
+  if (first) rd.omap_get_header(&load_ctx->header_bl, &load_ctx->header_r);
+  rd.omap_get_vals(key, "", uint64_t(-1), &load_ctx->values, &load_ctx->more,
+		   &load_ctx->values_r);
+  object_locator_t oloc(mds->get_metadata_pool());
+  mds->objecter->read(oid, oloc, rd, CEPH_NOSNAP, nullptr, 0,
+		      new C_OnFinisher(load_ctx, mds->finisher));
+}
+
+void OpenFileTable::_load_finish(int op_r, int header_r, int values_r,
+				 unsigned idx, bool first, bool more,
+				 bufferlist &header_bl,
+				 std::map<std::string, bufferlist> &values)
+{ // Completion of one omap read of object idx: parse header/values, continue reading, replay a journal if one was found, finally wake the load waiters. NOTE(review): header_r and values_r are not examined.
+  using ceph::decode;
+  int err = -CEPHFS_EINVAL; // result used by the decode-failure paths
+  // decode_func: decode one anchor value into loaded_anchor_map and count it for omap object idx
+  auto decode_func = [this](unsigned idx, inodeno_t ino, bufferlist &bl) {
+    auto p = bl.cbegin();
+
+    size_t count = loaded_anchor_map.size();
+    auto it = loaded_anchor_map.emplace_hint(loaded_anchor_map.end(),
+					    std::piecewise_construct,
+					    std::make_tuple(ino),
+					    std::make_tuple());
+    RecoveredAnchor& anchor = it->second;
+    decode(anchor, p);
+    frag_vec_t frags; // unused
+    decode(frags, p);
+    ceph_assert(ino == anchor.ino);
+    anchor.omap_idx = idx;
+    anchor.auth = MDS_RANK_NONE;
+
+    // count the key only when the map actually grew, i.e. the ino was not present before
+    if (loaded_anchor_map.size() > count)
+      ++omap_num_items[idx];
+  };
+
+  if (op_r < 0) {
+    derr << __func__ << " got " << cpp_strerror(op_r) << dendl;
+    err = op_r;
+    goto out;
+  }
+
+  try {
+    if (first) { // the header was requested together with this read (see _read_omap_values)
+      auto p = header_bl.cbegin();
+
+      string magic;
+      version_t version;
+      unsigned num_objs;
+      __u8 jstate;
+
+      if (header_bl.length() == 13) {
+	// obsolete format.
+	decode(version, p);
+	decode(num_objs, p);
+	decode(jstate, p);
+      } else {
+	decode(magic, p);
+	if (magic != CEPH_FS_ONDISK_MAGIC) {
+	  CachedStackStringStream css;
+	  *css << "invalid magic '" << magic << "'";
+	  throw buffer::malformed_input(css->str());
+	}
+
+	DECODE_START(1, p);
+	decode(version, p);
+	decode(num_objs, p);
+	decode(jstate, p);
+	DECODE_FINISH(p);
+      }
+
+      if (num_objs > MAX_OBJECTS) {
+	  CachedStackStringStream css;
+	  *css << "invalid object count '" << num_objs << "'";
+	  throw buffer::malformed_input(css->str());
+      }
+      if (jstate > JOURNAL_FINISH) {
+	  CachedStackStringStream css;
+	  *css << "invalid journal state '" << jstate << "'";
+	  throw buffer::malformed_input(css->str());
+      }
+      // a newer header version replaces the state; an equal one can only raise the journal state
+      if (version > omap_version) {
+	omap_version = version;
+	omap_num_objs = num_objs;
+	omap_num_items.resize(omap_num_objs);
+	journal_state = jstate;
+      } else if (version == omap_version) {
+	ceph_assert(omap_num_objs == num_objs);
+	if (jstate > journal_state)
+	  journal_state = jstate;
+      }
+    }
+
+    for (auto& it : values) {
+      if (it.first.compare(0, 9, "_journal.") == 0) { // journal entry, not an anchor
+	if (idx >= loaded_journals.size())
+	  loaded_journals.resize(idx + 1);
+
+	if (journal_state == JOURNAL_FINISH) {
+	  loaded_journals[idx][it.first].swap(it.second);
+	} else { // incomplete journal
+	  loaded_journals[idx][it.first].length(); // operator[] creates an empty entry so the key is removed during replay
+	}
+	continue;
+      }
+
+      inodeno_t ino;
+      sscanf(it.first.c_str(), "%llx", (unsigned long long*)&ino.val); // key is parsed as hex; NOTE(review): the sscanf result is not checked
+      decode_func(idx, ino, it.second);
+    }
+  } catch (buffer::error &e) {
+    derr << __func__ << ": corrupted header/values: " << e.what() << dendl;
+    goto out;
+  }
+
+  if (more || idx + 1 < omap_num_objs) {
+    // Issue another read if we're not at the end of the omap
+    std::string last_key;
+    if (more)
+      last_key = values.rbegin()->first;
+    else
+      idx++;
+
+    _read_omap_values(last_key, idx, !more); // the header is only requested when moving on to the next object
+    return;
+  }
+
+  // replay journal
+  if (loaded_journals.size() > 0) {
+    dout(10) << __func__ << ": recover journal" << dendl;
+
+    C_GatherBuilder gather(g_ceph_context,
+			   new C_OnFinisher(new C_IO_OFT_Recover(this),
+					    mds->finisher));
+    object_locator_t oloc(mds->get_metadata_pool());
+    SnapContext snapc;
+
+    for (unsigned omap_idx = 0; omap_idx < loaded_journals.size(); omap_idx++) {
+      auto& loaded_journal = loaded_journals[omap_idx];
+
+      std::vector<ObjectOperation> op_vec;
+      try {
+	for (auto& it : loaded_journal) {
+	  if (journal_state != JOURNAL_FINISH) // incomplete journal: nothing is applied, the keys are only removed below
+	    continue;
+	  auto p = it.second.cbegin();
+	  version_t version;
+	  std::map<string, bufferlist> to_update;
+	  std::set<string> to_remove;
+	  decode(version, p);
+	  if (version != omap_version) // entry was written with a different omap_version
+	    continue;
+	  decode(to_update, p);
+	  decode(to_remove, p);
+	  it.second.clear();
+
+	  for (auto& q : to_update) {
+	    inodeno_t ino;
+	    sscanf(q.first.c_str(), "%llx", (unsigned long long*)&ino.val);
+	    decode_func(omap_idx, ino, q.second);
+	  }
+	  for (auto& q : to_remove) {
+	    inodeno_t ino;
+	    sscanf(q.c_str(), "%llx",(unsigned long long*)&ino.val);
+	    ceph_assert(ino.val > 0);
+	    if (loaded_anchor_map.erase(ino)) {
+	      unsigned& count = omap_num_items[omap_idx];
+	      ceph_assert(count > 0);
+	      --count;
+	    }
+	  }
+
+	  op_vec.resize(op_vec.size() + 1);
+	  ObjectOperation& op = op_vec.back();
+	  op.priority = CEPH_MSG_PRIO_HIGH;
+	  if (!to_update.empty())
+	    op.omap_set(to_update);
+	  if (!to_remove.empty())
+	    op.omap_rm_keys(to_remove);
+	}
+      } catch (buffer::error &e) {
+	derr << __func__ << ": corrupted journal: " << e.what() << dendl;
+	goto out; // NOTE(review): ops submitted for earlier objects stay in flight and 'gather' is never activated on this path - confirm this is safe
+      }
+      // last op for this object: rewrite the header and delete the journal keys
+      op_vec.resize(op_vec.size() + 1);
+      ObjectOperation& op = op_vec.back();
+      {
+	bufferlist header;
+	if (journal_state == JOURNAL_FINISH)
+	  _encode_header(header, JOURNAL_FINISH);
+	else
+	  _encode_header(header, JOURNAL_NONE);
+	op.omap_set_header(header);
+      }
+      {
+	// remove journal
+	std::set<string> to_remove;
+	for (auto &it : loaded_journal)
+	  to_remove.emplace(it.first);
+	op.omap_rm_keys(to_remove);
+      }
+      loaded_journal.clear();
+
+      object_t oid = get_object_name(omap_idx);
+      for (auto& op : op_vec) {
+	mds->objecter->mutate(oid, oloc, op, snapc, ceph::real_clock::now(),
+			      0, gather.new_sub());
+      }
+    }
+    gather.activate();
+    return; // the load is completed by _recover_finish() through C_IO_OFT_Recover
+  }
+
+  journal_state = JOURNAL_NONE;
+  err = 0;
+  dout(10) << __func__ << ": load complete" << dendl;
+out:
+  // every failure path arrives here with err < 0: reset the loaded state
+  if (err < 0)
+    _reset_states();
+
+  load_done = true;
+  finish_contexts(g_ceph_context, waiting_for_load);
+  waiting_for_load.clear();
+}
+
+// Start loading the table with omap object 0; a non-null onload is queued in waiting_for_load.
+void OpenFileTable::load(MDSContext *onload)
+{
+  dout(10) << __func__ << dendl;
+  ceph_assert(!load_done);
+  if (onload != nullptr)
+    waiting_for_load.push_back(onload);
+  _read_omap_values(std::string(), 0, true);
+}
+
+// Build the backpointer chain of 'parent' by following dirino links through
+// loaded_anchor_map. auth_hint is taken from the first ancestor found in the
+// map and is left untouched when there is none.
+void OpenFileTable::_get_ancestors(const Anchor& parent,
+				   vector<inode_backpointer_t>& ancestors,
+				   mds_rank_t& auth_hint)
+{
+  ancestors.clear();
+  inodeno_t cur_ino = parent.dirino;
+  std::string_view cur_name = parent.d_name;
+  for (bool at_parent = true; ; at_parent = false) {
+    ancestors.push_back(inode_backpointer_t(cur_ino, string{cur_name}, 0));
+
+    auto anc = loaded_anchor_map.find(cur_ino);
+    if (anc == loaded_anchor_map.end())
+      break; // this ancestor is not in the loaded map; the chain ends here
+
+    const auto& rec = anc->second;
+    if (at_parent)
+      auth_hint = rec.auth;
+
+    cur_ino = rec.dirino;
+    cur_name = rec.d_name;
+    if (cur_ino == inodeno_t(0))
+      break; // no further parent recorded
+  }
+}
+
+// Completion passed to MDCache::open_ino() by _prefetch_inodes(); reports
+// the result for 'ino' to OpenFileTable::_open_ino_finish().
+class C_OFT_OpenInoFinish: public MDSContext {
+  OpenFileTable *oft;
+  inodeno_t ino;
+  MDSRank *get_mds() override { return oft->mds; }
+public:
+  C_OFT_OpenInoFinish(OpenFileTable *table, inodeno_t i) : oft(table), ino(i) {}
+  void finish(int r) override { oft->_open_ino_finish(ino, r); }
+};
+
+// Accounts for one finished open (also called once by _prefetch_inodes()
+// itself with ino 0) and moves to the next prefetch stage after the last one.
+void OpenFileTable::_open_ino_finish(inodeno_t ino, int r)
+{
+  if (prefetch_state == DIR_INODES && r >= 0 && ino != inodeno_t(0)) {
+    auto it = loaded_anchor_map.find(ino);
+    ceph_assert(it != loaded_anchor_map.end());
+    it->second.auth = mds_rank_t(r);
+  }
+  if (r != mds->get_nodeid())
+    mds->mdcache->rejoin_prefetch_ino_finish(ino, r);
+
+  if (--num_opening_inodes != 0)
+    return; // other opens are still outstanding
+
+  // nothing outstanding any more: advance the prefetch state
+  if (prefetch_state == FILE_INODES) {
+    prefetch_state = DONE;
+    logseg_destroyed_inos.clear();
+    destroyed_inos_set.clear();
+    finish_contexts(g_ceph_context, waiting_for_prefetch);
+    waiting_for_prefetch.clear();
+  } else if (prefetch_state != DIR_INODES) {
+    ceph_abort();
+  } else if (g_conf().get_val<bool>("mds_oft_prefetch_dirfrags")) {
+    prefetch_state = DIRFRAGS;
+    _prefetch_dirfrags();
+  } else {
+    prefetch_state = FILE_INODES;
+    _prefetch_inodes();
+  }
+}
+
+// Prefetch stage DIRFRAGS: fetch the incomplete auth dirfrags listed in the
+// loaded anchors, then continue with the FILE_INODES stage.
+void OpenFileTable::_prefetch_dirfrags()
+{
+  dout(10) << __func__ << dendl;
+  ceph_assert(prefetch_state == DIRFRAGS);
+
+  MDCache *mdcache = mds->mdcache;
+  std::vector<CDir*> fetch_queue;
+  // queue a dirfrag if it exists, is auth and is not complete
+  auto maybe_queue = [&fetch_queue](CDir *dir) {
+    if (dir && dir->is_auth() && !dir->is_complete())
+      fetch_queue.push_back(dir);
+  };
+
+  for (auto& [ino, anchor] : loaded_anchor_map) {
+    if (anchor.frags.empty())
+      continue;
+    CInode *diri = mdcache->get_inode(ino);
+    if (!diri)
+      continue;
+    if (!diri->is_dir()) {
+      dout(10) << " " << *diri << " is not dir" << dendl;
+      continue;
+    }
+    if (diri->state_test(CInode::STATE_REJOINUNDEF))
+      continue;
+
+    for (auto& fg: anchor.frags) {
+      if (CDir *dir = diri->get_dirfrag(fg)) {
+	maybe_queue(dir);
+	continue;
+      }
+      // no dirfrag for fg itself: go through the leaves under it instead
+      frag_vec_t leaves;
+      diri->dirfragtree.get_leaves_under(fg, leaves);
+      for (auto& leaf : leaves) {
+	maybe_queue(diri->is_auth() ? diri->get_or_open_dirfrag(mdcache, leaf)
+				    : diri->get_dirfrag(leaf));
+      }
+    }
+  }
+
+  MDSGatherBuilder gather(g_ceph_context);
+  int num_opening_dirfrags = 0;
+  for (CDir *dir : fetch_queue) {
+    if (dir->state_test(CDir::STATE_REJOINUNDEF))
+      ceph_assert(dir->get_inode()->dirfragtree.is_leaf(dir->get_frag()));
+    dir->fetch(gather.new_sub());
+
+    ++num_opening_dirfrags;
+    if (num_opening_dirfrags % mds->heartbeat_reset_grace() == 0)
+      mds->heartbeat_reset();
+  }
+
+  auto finish_func = [this](int r) {
+    prefetch_state = FILE_INODES;
+    _prefetch_inodes();
+  };
+  if (!gather.has_subs()) {
+    finish_func(0);
+    return;
+  }
+  gather.set_finisher(
+      new MDSInternalContextWrapper(mds,
+	new LambdaContext(std::move(finish_func))));
+  gather.activate();
+}
+
+// Prefetch stage DIR_INODES / FILE_INODES: open every loaded anchor of the
+// matching kind that is not in cache; completions arrive in _open_ino_finish().
+void OpenFileTable::_prefetch_inodes()
+{
+  dout(10) << __func__ << " state " << prefetch_state << dendl;
+  ceph_assert(!num_opening_inodes);
+  num_opening_inodes = 1; // accounted for by the _open_ino_finish() call at the end
+
+  int64_t pool;
+  switch (prefetch_state) {
+  case DIR_INODES:
+    pool = mds->get_metadata_pool();
+    break;
+  case FILE_INODES:
+    pool = mds->mdsmap->get_first_data_pool();
+    break;
+  default:
+    ceph_abort();
+  }
+
+  MDCache *mdcache = mds->mdcache;
+  if (destroyed_inos_set.empty()) {
+    for (auto& per_seg : logseg_destroyed_inos)
+      destroyed_inos_set.insert(per_seg.second.begin(), per_seg.second.end());
+  }
+
+  for (auto& [ino, anchor] : loaded_anchor_map) {
+    if (destroyed_inos_set.count(ino))
+      continue;
+    if (anchor.d_type != DT_DIR) {
+      // load all file inodes for MDCache::identify_files_to_recover()
+      if (prefetch_state != FILE_INODES)
+	continue;
+    } else if (prefetch_state != DIR_INODES) {
+      continue;
+    } else if (MDS_INO_IS_MDSDIR(ino)) {
+      anchor.auth = MDS_INO_MDSDIR_OWNER(ino);
+      continue;
+    } else if (MDS_INO_IS_STRAY(ino)) {
+      anchor.auth = MDS_INO_STRAY_OWNER(ino);
+      continue;
+    }
+    if (mdcache->get_inode(ino))
+      continue; // already in cache
+
+    num_opening_inodes++;
+    auto fin = new C_OFT_OpenInoFinish(this, ino);
+    if (anchor.dirino == inodeno_t(0)) {
+      mdcache->open_ino(ino, pool, fin, false);
+    } else {
+      vector<inode_backpointer_t> ancestors;
+      mds_rank_t auth_hint = MDS_RANK_NONE;
+      _get_ancestors(anchor, ancestors, auth_hint);
+      mdcache->open_ino(ino, pool, fin, false, false, &ancestors, auth_hint);
+    }
+
+    if (num_opening_inodes % mds->heartbeat_reset_grace() == 0)
+      mds->heartbeat_reset();
+  }
+
+  _open_ino_finish(inodeno_t(0), 0);
+}
+
+// Start prefetching with the DIR_INODES stage. When the table has not been
+// loaded yet, the work is queued behind the load and true is returned;
+// otherwise the result tells whether prefetching is still unfinished.
+bool OpenFileTable::prefetch_inodes()
+{
+  dout(10) << __func__ << dendl;
+  ceph_assert(!prefetch_state);
+  prefetch_state = DIR_INODES;
+
+  if (load_done) {
+    _prefetch_inodes();
+    return !is_prefetched();
+  }
+
+  // not loaded yet: run _prefetch_inodes() from the load waiters
+  auto on_loaded = new LambdaContext([this](int r) { _prefetch_inodes(); });
+  wait_for_load(new MDSInternalContextWrapper(mds, on_loaded));
+
+  return true;
+}
+
+// Returns false for an inode in state TRACKEDBYOFT that was journaled at or
+// after committing_log_seq or has no dirty item; true in every other case.
+bool OpenFileTable::should_log_open(CInode *in)
+{
+  if (!in->state_test(CInode::STATE_TRACKEDBYOFT))
+    return true;
+  // inode just journaled
+  if (in->last_journaled >= committing_log_seq)
+    return false;
+  // item not dirty. it means the item has already been saved
+  const bool is_dirty = dirty_items.count(in->ino()) > 0;
+  return is_dirty;
+}
+
+void OpenFileTable::note_destroyed_inos(uint64_t seq, const vector<inodeno_t>& inos)
+{ // append inos to the list kept for log sequence seq (created on first use)
+  auto& destroyed = logseg_destroyed_inos[seq];
+  destroyed.insert(destroyed.end(), inos.begin(), inos.end());
+}
+
+// Drop the destroyed-inode lists of every log sequence smaller than seq.
+void OpenFileTable::trim_destroyed_inos(uint64_t seq)
+{
+  // the map is ordered by sequence, so the entries to drop are exactly those
+  // in front of the first key that is >= seq
+  auto keep_from = logseg_destroyed_inos.lower_bound(seq);
+
+  logseg_destroyed_inos.erase(logseg_destroyed_inos.begin(), keep_from);
+}
diff --git a/src/mds/OpenFileTable.h b/src/mds/OpenFileTable.h
new file mode 100644
index 000000000..bfe7e098f
--- /dev/null
+++ b/src/mds/OpenFileTable.h
@@ -0,0 +1,152 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2018 Red Hat
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+
+#ifndef OPEN_FILE_TABLE_H
+#define OPEN_FILE_TABLE_H
+
+#include "mdstypes.h"
+#include "Anchor.h"
+
+#include "MDSContext.h"
+
+class CDir;
+class CInode;
+class MDSRank;
+
+class OpenFileTable
+{ // Table of anchors (anchor_map) that is written to, and loaded back from, a set of omap objects in the metadata pool.
+public:
+  explicit OpenFileTable(MDSRank *m);
+  ~OpenFileTable();
+
+  void add_inode(CInode *in);
+  void remove_inode(CInode *in);
+  void add_dirfrag(CDir *dir);
+  void remove_dirfrag(CDir *dir);
+  void notify_link(CInode *in);
+  void notify_unlink(CInode *in);
+  bool is_any_dirty() const { return !dirty_items.empty(); }
+  // write the dirty items to the omap objects; c is handed to the save completion
+  void commit(MDSContext *c, uint64_t log_seq, int op_prio);
+  uint64_t get_committed_log_seq() const { return committed_log_seq; }
+  bool is_any_committing() const { return num_pending_commit > 0; }
+  // start reading the omap objects; c (may be null) is queued until loading is done
+  void load(MDSContext *c);
+  bool is_loaded() const { return load_done; }
+  void wait_for_load(MDSContext *c) {
+    ceph_assert(!load_done);
+    waiting_for_load.push_back(c);
+  }
+  // returns true while prefetching is still in progress
+  bool prefetch_inodes();
+  bool is_prefetched() const { return prefetch_state == DONE; }
+  void wait_for_prefetch(MDSContext *c) {
+    ceph_assert(!is_prefetched());
+    waiting_for_prefetch.push_back(c);
+  }
+
+  bool should_log_open(CInode *in);
+  // per log sequence bookkeeping of destroyed inodes (skipped by prefetch)
+  void note_destroyed_inos(uint64_t seq, const std::vector<inodeno_t>& inos);
+  void trim_destroyed_inos(uint64_t seq);
+
+protected:
+  friend class C_IO_OFT_Recover;
+  friend class C_IO_OFT_Load;
+  friend class C_IO_OFT_Save;
+  friend class C_IO_OFT_Journal;
+  friend class C_OFT_OpenInoFinish;
+
+  uint64_t MAX_ITEMS_PER_OBJ = g_conf().get_val<uint64_t>("osd_deep_scrub_large_omap_object_key_threshold");
+  static const unsigned MAX_OBJECTS = 1024; // (1024 * osd_deep_scrub_large_omap_object_key_threshold) items at most
+  // negative dirty_items states; for removed items commit() uses the state as the omap object index
+  static const int DIRTY_NEW	= -1;
+  static const int DIRTY_UNDEF	= -2;
+
+  unsigned num_pending_commit = 0;
+  void _encode_header(bufferlist& bl, int j_state);
+  void _commit_finish(int r, uint64_t log_seq, MDSContext *fin);
+  void _journal_finish(int r, uint64_t log_seq, MDSContext *fin,
+		       std::map<unsigned, std::vector<ObjectOperation> >& ops);
+
+  void get_ref(CInode *in, frag_t fg=-1U);
+  void put_ref(CInode *in, frag_t fg=-1U);
+
+  object_t get_object_name(unsigned idx) const;
+  // forget everything read from disk; omap_version is left unchanged
+  void _reset_states() {
+    omap_num_objs = 0;
+    omap_num_items.resize(0);
+    journal_state = JOURNAL_NONE;
+    loaded_journals.clear();
+    loaded_anchor_map.clear();
+  }
+  void _read_omap_values(const std::string& key, unsigned idx, bool first);
+  void _load_finish(int op_r, int header_r, int values_r,
+		    unsigned idx, bool first, bool more,
+                    bufferlist &header_bl,
+		    std::map<std::string, bufferlist> &values);
+  void _recover_finish(int r);
+
+  void _open_ino_finish(inodeno_t ino, int r);
+  void _prefetch_inodes();
+  void _prefetch_dirfrags();
+
+  void _get_ancestors(const Anchor& parent,
+		      std::vector<inode_backpointer_t>& ancestors,
+		      mds_rank_t& auth_hint);
+
+  MDSRank *mds;
+
+  version_t omap_version = 0; // incremented by every commit()
+
+  unsigned omap_num_objs = 0; // number of omap objects in use
+  std::vector<unsigned> omap_num_items; // number of items per omap object
+
+  std::map<inodeno_t, OpenedAnchor> anchor_map;
+
+  std::map<inodeno_t, int> dirty_items; // ino -> dirty state
+
+  uint64_t committed_log_seq = 0;
+  uint64_t committing_log_seq = 0;
+
+  enum {
+    JOURNAL_NONE = 0,
+    JOURNAL_START = 1,
+    JOURNAL_FINISH = 2,
+  };
+  int journal_state = 0;
+
+  std::vector<std::map<std::string, bufferlist> > loaded_journals; // per omap object: "_journal.*" keys read from disk
+  std::map<inodeno_t, RecoveredAnchor> loaded_anchor_map; // anchors decoded by the load
+  MDSContext::vec waiting_for_load;
+  bool load_done = false;
+
+  enum {
+    DIR_INODES = 1,
+    DIRFRAGS = 2,
+    FILE_INODES = 3,
+    DONE = 4,
+  };
+  unsigned prefetch_state = 0; // 0 until prefetch_inodes() is called
+  unsigned num_opening_inodes = 0;
+  MDSContext::vec waiting_for_prefetch;
+
+  std::map<uint64_t, std::vector<inodeno_t> > logseg_destroyed_inos;
+  std::set<inodeno_t> destroyed_inos_set;
+
+  std::unique_ptr<PerfCounters> logger;
+};
+
+#endif
diff --git a/src/mds/PurgeQueue.cc b/src/mds/PurgeQueue.cc
new file mode 100644
index 000000000..56c962d19
--- /dev/null
+++ b/src/mds/PurgeQueue.cc
@@ -0,0 +1,850 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2015 Red Hat
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software 
+ * Foundation.  See file COPYING.
+ * 
+ */
+
+#include "common/debug.h"
+#include "mds/mdstypes.h"
+#include "mds/CInode.h"
+#include "mds/MDCache.h"
+
+#include "PurgeQueue.h"
+
+#define dout_context cct
+#define dout_subsys ceph_subsys_mds
+#undef dout_prefix
+#define dout_prefix _prefix(_dout, rank) << __func__ << ": "
+static ostream& _prefix(std::ostream *_dout, mds_rank_t rank) {  // backs the dout_prefix macro defined just above
+  return *_dout << "mds." << rank << ".purge_queue ";  // e.g. "mds.0.purge_queue "
+}
+
+const std::map<std::string, PurgeItem::Action> PurgeItem::actions = {  // action name -> enum value, looked up by str_to_type()
+  {"NONE", PurgeItem::NONE},
+  {"PURGE_FILE", PurgeItem::PURGE_FILE},
+  {"TRUNCATE_FILE", PurgeItem::TRUNCATE_FILE},
+  {"PURGE_DIR", PurgeItem::PURGE_DIR}
+};
+
+void PurgeItem::encode(bufferlist &bl) const  // serialise this item; ENCODE_START is given version 2, compat 1
+{
+  ENCODE_START(2, 1, bl);
+  encode((uint8_t)action, bl);  // the action travels as a single byte
+  encode(ino, bl);
+  encode(size, bl);
+  encode(layout, bl, CEPH_FEATURE_FS_FILE_LAYOUT_V2);
+  encode(old_pools, bl);
+  encode(snapc, bl);
+  encode(fragtree, bl);
+  encode(stamp, bl);  // decode() reads the trailing stamp only when struct_v >= 2
+  uint8_t static const pad = 0xff;
+  for (unsigned int i = 0; i<pad_size; i++) {  // append pad_size filler bytes of 0xff
+    encode(pad, bl);
+  }
+  ENCODE_FINISH(bl);
+}
+
+void PurgeItem::decode(bufferlist::const_iterator &p)  // handles two struct_v 1 layouts plus the current one
+{
+  DECODE_START(2, p);
+  bool done = false;
+  if (struct_v == 1) {  // first try the v1 layout that leads with stamp + padding
+    auto p_start = p;  // saved so a failed attempt can be rewound
+    try {
+      // bad encoding introduced by v13.2.2
+      decode(stamp, p);
+      decode(pad_size, p);
+      p += pad_size;  // skip over the padding bytes themselves
+      uint8_t raw_action;
+      decode(raw_action, p);
+      action = (Action)raw_action;
+      decode(ino, p);
+      decode(size, p);
+      decode(layout, p);
+      decode(old_pools, p);
+      decode(snapc, p);
+      decode(fragtree, p);
+      if (p.get_off() > struct_end)  // ran past this struct's end: the layout guess was wrong
+	throw buffer::end_of_buffer();
+      done = true;
+    } catch (const buffer::error &e) {
+      p = p_start;  // rewind and fall back to the regular layout below
+    }
+  }
+  if (!done) {  // regular layout: fields first, stamp last and only from v2 on
+    uint8_t raw_action;
+    decode(raw_action, p);
+    action = (Action)raw_action;
+    decode(ino, p);
+    decode(size, p);
+    decode(layout, p);
+    decode(old_pools, p);
+    decode(snapc, p);
+    decode(fragtree, p);
+    if (struct_v >= 2) {
+      decode(stamp, p);
+    }
+  }
+  DECODE_FINISH(p);
+}
+
+// if Objecter has any slow requests, take that as a hint and
+// slow down our rate of purging
+PurgeQueue::PurgeQueue(
+      CephContext *cct_,
+      mds_rank_t rank_,
+      const int64_t metadata_pool_,
+      Objecter *objecter_,
+      Context *on_error_)  // must be non-null; queued to the finisher on failure, otherwise deleted in ~PurgeQueue()
+  :
+    cct(cct_),
+    rank(rank_),
+    metadata_pool(metadata_pool_),
+    finisher(cct, "PurgeQueue", "PQ_Finisher"),
+    timer(cct, lock),
+    filer(objecter_, &finisher),
+    objecter(objecter_),
+    journaler("pq", MDS_INO_PURGE_QUEUE + rank, metadata_pool,  // journal inode number is offset by our rank
+      CEPH_FS_ONDISK_MAGIC, objecter_, nullptr, 0,
+      &finisher),
+    on_error(on_error_)
+{
+  ceph_assert(cct != nullptr);
+  ceph_assert(on_error != nullptr);
+  ceph_assert(objecter != nullptr);
+  journaler.set_write_error_handler(on_error);  // the journaler is handed the same error context
+}
+
+PurgeQueue::~PurgeQueue()
+{
+  if (logger) {  // only set once create_logger() has run
+    g_ceph_context->get_perfcounters_collection()->remove(logger.get());
+  }
+  delete on_error;  // nullptr (a no-op) if it was already handed to the finisher
+}
+
+void PurgeQueue::create_logger()  // build the "purge_queue" perf counters and register them with g_ceph_context
+{
+  PerfCountersBuilder pcb(g_ceph_context, "purge_queue", l_pq_first, l_pq_last);
+  // pq_executed is the only counter registered as PRIO_INTERESTING
+  pcb.add_u64_counter(l_pq_executed, "pq_executed", "Purge queue tasks executed",
+                      "purg", PerfCountersBuilder::PRIO_INTERESTING);
+
+  pcb.set_prio_default(PerfCountersBuilder::PRIO_USEFUL);
+  pcb.add_u64(l_pq_executing_ops, "pq_executing_ops", "Purge queue ops in flight");
+  pcb.add_u64(l_pq_executing_ops_high_water, "pq_executing_ops_high_water", "Maximum number of executing file purge ops");
+  pcb.add_u64(l_pq_executing, "pq_executing", "Purge queue tasks in flight");
+  pcb.add_u64(l_pq_executing_high_water, "pq_executing_high_water", "Maximum number of executing file purges");
+  pcb.add_u64(l_pq_item_in_journal, "pq_item_in_journal", "Purge item left in journal");
+
+  logger.reset(pcb.create_perf_counters());
+  g_ceph_context->get_perfcounters_collection()->add(logger.get());
+}
+
+void PurgeQueue::init()
+{
+  std::lock_guard l(lock);
+  // create_logger() must have been called before init()
+  ceph_assert(logger != nullptr);
+  // bring up our own finisher and timer (stopped again in shutdown())
+  finisher.start();
+  timer.init();
+}
+
+void PurgeQueue::activate()
+{
+  std::lock_guard l(lock);
+  // Measure one item's journal footprint, then start consuming if the journal holds a backlog.
+  {
+    PurgeItem item;
+    bufferlist bl;
+
+    // calculate purge item serialized size stored in journal
+    // used to count how many items still left in journal later
+    ::encode(item, bl);
+    purge_item_journal_size = bl.length() + journaler.get_journal_envelope_size();  // measured on a default-constructed item
+  }
+
+  if (readonly) {
+    dout(10) << "skipping activate: PurgeQueue is readonly" << dendl;
+    return;
+  }
+
+  if (journaler.get_read_pos() == journaler.get_write_pos())
+    return;  // journal fully consumed: nothing to start
+
+  if (in_flight.empty()) {
+    dout(4) << "start work (by drain)" << dendl;
+    finisher.queue(new LambdaContext([this](int r) {  // _consume() runs later, from the finisher
+	  std::lock_guard l(lock);
+	  _consume();
+	  }));
+  }
+}
+
+void PurgeQueue::shutdown()
+{
+  std::lock_guard l(lock);
+  // stop in this order: journaler, then timer, then finisher
+  journaler.shutdown();
+  timer.shutdown();
+  finisher.stop();
+}
+
+void PurgeQueue::open(Context *completion)
+{
+  dout(4) << "opening" << dendl;
+  // `completion` (may be null) fires once recovery ends or the queue goes readonly.
+  std::lock_guard l(lock);
+  if (completion)
+    waiting_for_recovery.push_back(completion);
+
+  journaler.recover(new LambdaContext([this](int r){
+    if (r == -CEPHFS_ENOENT) {
+      dout(1) << "Purge Queue not found, assuming this is an upgrade and "
+                 "creating it." << dendl;
+      create(NULL);
+    } else if (r == 0) {
+      std::lock_guard l(lock);
+      dout(4) << "open complete" << dendl;
+      // Journaler only guarantees entries before head write_pos have been
+      // fully flushed. Before appending new entries, we need to find and
+      // drop any partial written entry.
+      if (journaler.last_committed.write_pos < journaler.get_write_pos()) {
+	dout(4) << "recovering write_pos" << dendl;
+	journaler.set_read_pos(journaler.last_committed.write_pos);
+	_recover();
+	return;
+      }
+
+      journaler.set_writeable();
+      recovered = true;
+      finish_contexts(g_ceph_context, waiting_for_recovery);
+    } else {
+      // _go_readonly() mutates readonly/on_error/waiting_for_recovery: hold the lock.
+      std::lock_guard l(lock);
+      derr << "Error " << r << " loading Journaler" << dendl;
+      _go_readonly(r);
+    }
+  }));
+}
+
+void PurgeQueue::wait_for_recovery(Context* c)
+{
+  std::lock_guard l(lock);  // c is either parked or completed with the lock held
+  if (!recovered && !readonly) {
+    waiting_for_recovery.push_back(c);  // outcome not known yet: park the waiter
+    return;
+  }
+  if (!recovered) {
+    dout(10) << "cannot wait for recovery: PurgeQueue is readonly" << dendl;
+  }
+  c->complete(recovered ? 0 : -CEPHFS_EROFS);
+}
+
+void PurgeQueue::_recover()
+{
+  ceph_assert(ceph_mutex_is_locked_by_me(lock));
+  // Entered from open() after read_pos was rewound to the last committed write_pos; reads until read_pos == write_pos.
+  // Journaler::is_readable() adjusts write_pos if partial entry is encountered
+  while (1) {
+    if (!journaler.is_readable() &&
+	!journaler.get_error() &&
+	journaler.get_read_pos() < journaler.get_write_pos()) {
+      journaler.wait_for_readable(new LambdaContext([this](int r) {  // resume this loop once more data is readable
+        std::lock_guard l(lock);
+	_recover();
+      }));
+      return;
+    }
+
+    if (journaler.get_error()) {
+      int r = journaler.get_error();
+      derr << "Error " << r << " recovering write_pos" << dendl;
+      _go_readonly(r);
+      return;
+    }
+
+    if (journaler.get_read_pos() == journaler.get_write_pos()) {
+      dout(4) << "write_pos recovered" << dendl;
+      // restore original read_pos
+      journaler.set_read_pos(journaler.last_committed.expire_pos);
+      journaler.set_writeable();
+      recovered = true;
+      finish_contexts(g_ceph_context, waiting_for_recovery);
+      return;
+    }
+    // read one entry and drop it: the payload is unused, only the advancing read_pos matters
+    bufferlist bl;
+    bool readable = journaler.try_read_entry(bl);
+    ceph_assert(readable);  // we checked earlier
+  }
+}
+
+void PurgeQueue::create(Context *fin)
+{
+  dout(4) << "creating" << dendl;
+  std::lock_guard l(lock);
+  // fin (may be null) is completed once the new head is written, or with the error if that write fails
+  if (fin)
+    waiting_for_recovery.push_back(fin);
+
+  file_layout_t layout = file_layout_t::get_default();
+  layout.pool_id = metadata_pool;  // the queue's journal lives in the metadata pool
+  journaler.set_writeable();
+  journaler.create(&layout, JOURNAL_FORMAT_RESILIENT);
+  journaler.write_head(new LambdaContext([this](int r) {
+    std::lock_guard l(lock);
+    if (r) {
+      _go_readonly(r);  // also completes waiting_for_recovery with r
+    } else {
+      recovered = true;
+      finish_contexts(g_ceph_context, waiting_for_recovery);
+    }
+  }));
+}
+
+/**
+ * The `completion` context will always be called back via a Finisher
+ */
+void PurgeQueue::push(const PurgeItem &pi, Context *completion)
+{
+  dout(4) << "pushing inode " << pi.ino << dendl;
+  std::lock_guard l(lock);
+
+  if (readonly) {
+    dout(10) << "cannot push inode: PurgeQueue is readonly" << dendl;
+    completion->complete(-CEPHFS_EROFS);  // NOTE(review): completed inline under the lock, not via a Finisher as stated above
+    return;
+  }
+
+  // Callers should have waited for open() before using us
+  ceph_assert(!journaler.is_readonly());
+
+  bufferlist bl;
+
+  encode(pi, bl);
+  journaler.append_entry(bl);
+  journaler.wait_for_flush(completion);  // completion is handed to the journaler's flush waiters
+
+  // Maybe go ahead and do something with it right away
+  bool could_consume = _consume();
+  if (!could_consume) {
+    // Usually, it is not necessary to explicitly flush here, because the reader
+    // will get flushes generated inside Journaler::is_readable.  However,
+    // if we remain in a _can_consume()==false state for a long period then
+    // we should flush in order to allow MDCache to drop its strays rather
+    // than having them wait for purgequeue to progress.
+    if (!delayed_flush) {  // at most one flush timer is pending at a time
+      delayed_flush = new LambdaContext([this](int r){
+            delayed_flush = nullptr;
+            journaler.flush();
+          });
+
+      timer.add_event_after(
+	  g_conf()->mds_purge_queue_busy_flush_period,
+          delayed_flush);
+    }
+  }
+}
+
+uint32_t PurgeQueue::_calculate_ops(const PurgeItem &item) const  // op count added to / removed from ops_in_flight for this item
+{
+  uint32_t ops_required = 0;
+  if (item.action == PurgeItem::PURGE_DIR) {
+    // Directory, count dirfrags to be deleted
+    frag_vec_t leaves;
+    if (!item.fragtree.is_leaf(frag_t())) {
+      item.fragtree.get_leaves(leaves);
+    }
+    // One for the root, plus any leaves
+    ops_required = 1 + leaves.size();
+  } else {
+    // File, work out concurrent Filer::purge deletes
+    // Account for removing (or zeroing) backtrace
+    const uint64_t num = (item.size > 0) ?
+      Striper::get_num_objects(item.layout, item.size) : 1;  // a zero-size file still counts as one op
+
+    ops_required = std::min(num, g_conf()->filer_max_purge_ops);  // capped by filer_max_purge_ops
+
+    // Account for deletions for old pools
+    if (item.action != PurgeItem::TRUNCATE_FILE) {
+      ops_required += item.old_pools.size();
+    }
+  }
+
+  return ops_required;
+}
+
+bool PurgeQueue::_can_consume()
+{
+  // Throttle check: may another journal entry be started right now?
+  if (readonly) {
+    dout(10) << "can't consume: PurgeQueue is readonly" << dendl;
+    return false;
+  }
+
+  dout(20) << ops_in_flight << "/" << max_purge_ops << " ops, "
+           << in_flight.size() << "/" << g_conf()->mds_max_purge_files
+           << " files" << dendl;
+
+  // With nothing in flight we always let one item through, so that a very
+  // low ops limit cannot stall the queue; a files limit of zero is the
+  // administrator's deliberate "pause purging" switch and is respected.
+  const bool idle = in_flight.empty();
+  if (idle && cct->_conf->mds_max_purge_files > 0)
+    return true;
+
+  const bool ops_exhausted = ops_in_flight >= max_purge_ops;
+  if (ops_exhausted) {
+    dout(20) << "Throttling on op limit " << ops_in_flight << "/"
+             << max_purge_ops << dendl;
+    return false;
+  }
+
+  if (in_flight.size() < cct->_conf->mds_max_purge_files)
+    return true;
+
+  dout(20) << "Throttling on item limit " << in_flight.size()
+           << "/" << cct->_conf->mds_max_purge_files << dendl;
+  return false;
+}
+
+void PurgeQueue::_go_readonly(int r)  // r: negative errno that forced the switch to readonly
+{
+  if (readonly) return;  // already readonly: on_error and the waiters were dealt with then
+  dout(1) << "going readonly because internal IO failed: " << strerror(-r) << dendl;
+  readonly = true;
+  // on_error may already be gone (blocklisted path in _commit_ops); never queue a null Context
+  if (on_error) { finisher.queue(on_error, r); on_error = nullptr; }
+  journaler.set_readonly();
+  finish_contexts(g_ceph_context, waiting_for_recovery, r);
+}
+
+bool PurgeQueue::_consume()
+{
+  ceph_assert(ceph_mutex_is_locked_by_me(lock));
+  // Read and dispatch journal entries for as long as _can_consume() allows.
+  bool could_consume = false;
+  while(_can_consume()) {
+    if (delayed_flush) {
+      // We are now going to read from the journal, so any proactive
+      // flush is no longer necessary.  This is not functionally necessary
+      // but it can avoid generating extra fragmented flush IOs.
+      timer.cancel_event(delayed_flush);
+      delayed_flush = nullptr;
+    }
+
+    if (int r = journaler.get_error()) {
+      derr << "Error " << r << " recovering write_pos" << dendl;
+      _go_readonly(r);
+      return could_consume;
+    }
+
+    if (!journaler.is_readable()) {
+      dout(10) << " not readable right now" << dendl;
+      // Because we are the writer and the reader of the journal
+      // via the same Journaler instance, we never need to reread_head
+      if (!journaler.have_waiter()) {
+        journaler.wait_for_readable(new LambdaContext([this](int r) {
+          std::lock_guard l(lock);
+          if (r == 0) {
+            _consume();
+          } else if (r != -CEPHFS_EAGAIN) {
+            _go_readonly(r);
+          }
+        }));
+      }
+
+      return could_consume;
+    }
+
+    could_consume = true;
+    // The journaler is readable: consume an entry
+    bufferlist bl;
+    bool readable = journaler.try_read_entry(bl);
+    ceph_assert(readable);  // we checked earlier
+
+    dout(20) << " decoding entry" << dendl;
+    PurgeItem item;
+    auto q = bl.cbegin();
+    try {
+      decode(item, q);
+    } catch (const buffer::error &err) {
+      derr << "Decode error at read_pos=0x" << std::hex
+           << journaler.get_read_pos() << dendl;
+      // Never execute a partially decoded item; _go_readonly() expects a negative errno.
+      _go_readonly(-CEPHFS_EIO);
+      return could_consume;
+    }
+    dout(20) << " executing item (" << item.ino << ")" << dendl;
+    _execute_item(item, journaler.get_read_pos());
+  }
+
+  dout(10) << " cannot consume right now" << dendl;
+  return could_consume;
+}
+
+class C_IO_PurgeItem_Commit : public Context {  // queued by _execute_item(); runs PurgeQueue::_commit_ops() for one item
+public:
+  C_IO_PurgeItem_Commit(PurgeQueue *pq, std::vector<PurgeItemCommitOp> ops, uint64_t expire_to)
+    : purge_queue(pq), ops_vec(std::move(ops)), expire_to(expire_to) {
+  }
+
+  void finish(int r) override {
+    purge_queue->_commit_ops(r, ops_vec, expire_to);
+  }
+
+private:
+  PurgeQueue *purge_queue;  // never deleted by this class
+  std::vector<PurgeItemCommitOp> ops_vec;  // the operations to issue for this item
+  uint64_t expire_to;  // journal position passed through to _commit_ops()
+};
+
+void PurgeQueue::_commit_ops(int r, const std::vector<PurgeItemCommitOp>& ops_vec, uint64_t expire_to)
+{
+  if (r < 0) {
+    derr << " r = " << r << dendl;
+    return;
+  }
+  // Issue every op of one purge item under a single gather; its finisher retires the item.
+  SnapContext nullsnapc;
+  C_GatherBuilder gather(cct);
+
+  for (auto &op : ops_vec) {
+    dout(10) << op.item.get_type_str() << dendl;
+    if (op.type == PurgeItemCommitOp::PURGE_OP_RANGE) {
+      uint64_t first_obj = 0, num_obj = 0;
+      uint64_t num = Striper::get_num_objects(op.item.layout, op.item.size);
+      num_obj = num;
+
+      if (op.item.action == PurgeItem::TRUNCATE_FILE) {
+        first_obj = 1;  // object 0 is kept; _execute_item() adds a PURGE_OP_ZERO for it
+        if (num > 1)
+          num_obj = num - 1;
+        else
+          continue;
+      }
+
+      filer.purge_range(op.item.ino, &op.item.layout, op.item.snapc,
+                        first_obj, num_obj, ceph::real_clock::now(), op.flags,
+                        gather.new_sub());
+    } else if (op.type == PurgeItemCommitOp::PURGE_OP_REMOVE) {
+      if (op.item.action == PurgeItem::PURGE_DIR) {
+        objecter->remove(op.oid, op.oloc, nullsnapc,
+                         ceph::real_clock::now(), op.flags,
+                         gather.new_sub());
+      } else {
+        objecter->remove(op.oid, op.oloc, op.item.snapc,
+                         ceph::real_clock::now(), op.flags,
+                         gather.new_sub());
+      }
+    } else if (op.type == PurgeItemCommitOp::PURGE_OP_ZERO) {
+      filer.zero(op.item.ino, &op.item.layout, op.item.snapc,
+                 0, op.item.layout.object_size, ceph::real_clock::now(), 0, true,
+                 gather.new_sub());
+    } else {
+      derr << "Invalid purge op: " << op.type << dendl;
+      ceph_abort();
+    }
+  }
+
+  ceph_assert(gather.has_subs());
+
+  gather.set_finisher(new C_OnFinisher(
+	              new LambdaContext([this, expire_to](int r) {
+    std::lock_guard l(lock);
+
+    if (r == -CEPHFS_EBLOCKLISTED) {
+      // Every in-flight item can fail this way; only the first still holds on_error.
+      if (on_error) { finisher.queue(on_error, r); on_error = nullptr; }
+      return;
+    }
+
+    _execute_item_complete(expire_to);
+    _consume();
+
+    // Have we gone idle?  If so, do an extra write_head now instead of
+    // waiting for next flush after journaler_write_head_interval.
+    // Also do this periodically even if not idle, so that the persisted
+    // expire_pos doesn't fall too far behind our progress when consuming
+    // a very long queue.
+    if (!readonly &&
+        (in_flight.empty() || journaler.write_head_needed())) {
+      journaler.write_head(nullptr);
+    }
+  }), &finisher));
+
+  gather.activate();
+}
+
+void PurgeQueue::_execute_item(
+    const PurgeItem &item,
+    uint64_t expire_to)
+{
+  ceph_assert(ceph_mutex_is_locked_by_me(lock));
+  // Track the item as in flight (keyed by expire_to) and refresh the gauges and high-water marks.
+  in_flight[expire_to] = item;
+  logger->set(l_pq_executing, in_flight.size());
+  files_high_water = std::max<uint64_t>(files_high_water,
+                              in_flight.size());
+  logger->set(l_pq_executing_high_water, files_high_water);
+  auto ops = _calculate_ops(item);
+  ops_in_flight += ops;
+  logger->set(l_pq_executing_ops, ops_in_flight);
+  ops_high_water = std::max(ops_high_water, ops_in_flight);
+  logger->set(l_pq_executing_ops_high_water, ops_high_water);
+  // Only the op list is built here; the I/O is issued from the finisher via C_IO_PurgeItem_Commit.
+  std::vector<PurgeItemCommitOp> ops_vec;
+  auto submit_ops = [&]() {
+    finisher.queue(new C_IO_PurgeItem_Commit(this, std::move(ops_vec), expire_to));
+  };
+
+  if (item.action == PurgeItem::PURGE_FILE) {
+    if (item.size > 0) {
+      uint64_t num = Striper::get_num_objects(item.layout, item.size);
+      dout(10) << " 0~" << item.size << " objects 0~" << num
+               << " snapc " << item.snapc << " on " << item.ino << dendl;
+      ops_vec.emplace_back(item, PurgeItemCommitOp::PURGE_OP_RANGE, 0);
+    }
+
+    // remove the backtrace object if it was not purged
+    object_t oid = CInode::get_object_name(item.ino, frag_t(), "");
+    if (ops_vec.empty() || !item.layout.pool_ns.empty()) {  // no range op queued, or the layout uses a pool namespace
+      object_locator_t oloc(item.layout.pool_id);
+      dout(10) << " remove backtrace object " << oid
+               << " pool " << oloc.pool << " snapc " << item.snapc << dendl;
+      ops_vec.emplace_back(item, PurgeItemCommitOp::PURGE_OP_REMOVE, 0, oid, oloc);
+    }
+
+    // remove old backtrace objects
+    for (const auto &p : item.old_pools) {
+      object_locator_t oloc(p);
+      dout(10) << " remove backtrace object " << oid
+               << " old pool " << p << " snapc " << item.snapc << dendl;
+      ops_vec.emplace_back(item, PurgeItemCommitOp::PURGE_OP_REMOVE, 0, oid, oloc);
+    }
+  } else if (item.action == PurgeItem::PURGE_DIR) {
+    object_locator_t oloc(metadata_pool);
+    frag_vec_t leaves;
+    if (!item.fragtree.is_leaf(frag_t()))
+      item.fragtree.get_leaves(leaves);
+    leaves.push_back(frag_t());  // the root frag is always removed as well
+    for (const auto &leaf : leaves) {
+      object_t oid = CInode::get_object_name(item.ino, leaf, "");
+      dout(10) << " remove dirfrag " << oid << dendl;
+      ops_vec.emplace_back(item, PurgeItemCommitOp::PURGE_OP_REMOVE, 0, oid, oloc);
+    }
+  } else if (item.action == PurgeItem::TRUNCATE_FILE) {
+    const uint64_t num = Striper::get_num_objects(item.layout, item.size);
+    dout(10) << " 0~" << item.size << " objects 0~" << num
+	     << " snapc " << item.snapc << " on " << item.ino << dendl;
+
+    // keep backtrace object
+    if (num > 1) {
+      ops_vec.emplace_back(item, PurgeItemCommitOp::PURGE_OP_RANGE, 0);
+    }
+    ops_vec.emplace_back(item, PurgeItemCommitOp::PURGE_OP_ZERO, 0);
+  } else {
+    derr << "Invalid item (action=" << item.action << ") in purge queue, "
+            "dropping it" << dendl;
+    ops_in_flight -= ops;  // undo the accounting done at the top of this function
+    logger->set(l_pq_executing_ops, ops_in_flight);
+    ops_high_water = std::max(ops_high_water, ops_in_flight);
+    logger->set(l_pq_executing_ops_high_water, ops_high_water);
+    in_flight.erase(expire_to);
+    logger->set(l_pq_executing, in_flight.size());
+    files_high_water = std::max<uint64_t>(files_high_water,
+                                in_flight.size());
+    logger->set(l_pq_executing_high_water, files_high_water);
+    return;  // nothing is submitted for the dropped item
+  }
+
+  submit_ops();
+}
+
+void PurgeQueue::_execute_item_complete(
+    uint64_t expire_to)
+{
+  ceph_assert(ceph_mutex_is_locked_by_me(lock));
+  dout(10) << "complete at 0x" << std::hex << expire_to << std::dec << dendl;
+  ceph_assert(in_flight.count(expire_to) == 1);
+  // Only completion of the oldest in-flight item moves expire_pos; later completions are parked in pending_expire.
+  auto iter = in_flight.find(expire_to);
+  ceph_assert(iter != in_flight.end());
+  if (iter == in_flight.begin()) {
+    uint64_t pos = expire_to;
+    if (!pending_expire.empty()) {
+      auto n = iter;
+      ++n;  // n: the next item still in flight, if any
+      if (n == in_flight.end()) {
+	pos = *pending_expire.rbegin();  // nothing else in flight: jump to the largest parked position
+	pending_expire.clear();
+      } else {
+	auto p = pending_expire.begin();
+	do {  // absorb parked positions that lie before the next in-flight item
+	  if (*p >= n->first)
+	    break;
+	  pos = *p;
+	  pending_expire.erase(p++);
+	} while (p != pending_expire.end());
+      }
+    }
+    dout(10) << "expiring to 0x" << std::hex << pos << std::dec << dendl;
+    journaler.set_expire_pos(pos);
+  } else {
+    // This is completely fine, we're not supposed to purge files in
+    // order when doing them in parallel.
+    dout(10) << "non-sequential completion, not expiring anything" << dendl;
+    pending_expire.insert(expire_to);
+  }
+
+  ops_in_flight -= _calculate_ops(iter->second);
+  logger->set(l_pq_executing_ops, ops_in_flight);
+  ops_high_water = std::max(ops_high_water, ops_in_flight);
+  logger->set(l_pq_executing_ops_high_water, ops_high_water);
+
+  dout(10) << "completed item for ino " << iter->second.ino << dendl;
+
+  in_flight.erase(iter);
+  logger->set(l_pq_executing, in_flight.size());
+  files_high_water = std::max<uint64_t>(files_high_water,
+                              in_flight.size());
+  logger->set(l_pq_executing_high_water, files_high_water);
+  dout(10) << "in_flight.size() now " << in_flight.size() << dendl;
+  // Estimate the remaining item count from byte offsets and the per-item size recorded in activate().
+  uint64_t write_pos = journaler.get_write_pos();
+  uint64_t read_pos = journaler.get_read_pos();
+  uint64_t expire_pos = journaler.get_expire_pos();
+  uint64_t item_num = (write_pos - (in_flight.size() ? expire_pos : read_pos))
+		      / purge_item_journal_size;
+  dout(10) << "left purge items in journal: " << item_num
+    << " (purge_item_journal_size/write_pos/read_pos/expire_pos) now at "
+    << "(" << purge_item_journal_size << "/" << write_pos << "/" << read_pos
+    << "/" << expire_pos << ")" << dendl;
+
+  logger->set(l_pq_item_in_journal, item_num);
+  logger->inc(l_pq_executed);
+}
+
+void PurgeQueue::update_op_limit(const MDSMap &mds_map)  // recompute max_purge_ops from the PG count, max_mds and config
+{
+  std::lock_guard l(lock);
+
+  if (readonly) {
+    dout(10) << "skipping; PurgeQueue is readonly" << dendl;
+    return;
+  }
+
+  uint64_t pg_count = 0;
+  objecter->with_osdmap([&](const OSDMap& o) {
+    // Number of PGs across all data pools
+    const std::vector<int64_t> &data_pools = mds_map.get_data_pools();
+    for (const auto dp : data_pools) {
+      if (o.get_pg_pool(dp) == NULL) {
+        // It is possible that we have an older OSDMap than MDSMap,
+        // because we don't start watching every OSDMap until after
+        // MDSRank is initialized
+        dout(4) << " data pool " << dp << " not found in OSDMap" << dendl;
+        continue;
+      }
+      pg_count += o.get_pg_num(dp);
+    }
+  });
+
+  // Work out a limit based on n_pgs / n_mdss, multiplied by the user's
+  // preference for how many ops per PG
+  max_purge_ops = uint64_t(((double)pg_count / (double)mds_map.get_max_mds()) *
+			   cct->_conf->mds_max_purge_ops_per_pg);
+
+  // User may also specify a hard limit, apply this if so.
+  if (cct->_conf->mds_max_purge_ops) {  // a value of zero leaves the computed limit as is
+    max_purge_ops = std::min(max_purge_ops, cct->_conf->mds_max_purge_ops);
+  }
+}
+
+void PurgeQueue::handle_conf_change(const std::set<std::string>& changed, const MDSMap& mds_map)
+{
+  // `changed` may name several options at once, so each group is tested on its own.
+  if (changed.count("mds_max_purge_ops") || changed.count("mds_max_purge_ops_per_pg")) {
+    update_op_limit(mds_map);  // takes and releases `lock` itself
+  }
+  if (changed.count("mds_max_purge_files")) {
+    std::lock_guard l(lock);
+    if (in_flight.empty()) {
+      // We might have gone from zero to a finite limit, so kick off consume.
+      dout(4) << "maybe start work again (max_purge_files="
+              << g_conf()->mds_max_purge_files << dendl;
+      finisher.queue(new LambdaContext([this](int r){
+        std::lock_guard l(lock);
+        _consume();
+      }));
+    }
+  }
+}
+
+bool PurgeQueue::drain(
+    uint64_t *progress,
+    uint64_t *progress_total,
+    size_t *in_flight_count
+    )
+{
+  std::lock_guard l(lock);
+
+  if (readonly) {
+    dout(10) << "skipping drain; PurgeQueue is readonly" << dendl;
+    return true;  // reported as complete; the out-parameters are left untouched
+  }
+
+  ceph_assert(progress != nullptr);
+  ceph_assert(progress_total != nullptr);
+  ceph_assert(in_flight_count != nullptr);
+
+  const bool done = in_flight.empty() && (
+      journaler.get_read_pos() == journaler.get_write_pos());
+  if (done) {
+    return true;  // the out-parameters are not written in this case either
+  }
+
+  const uint64_t bytes_remaining = journaler.get_write_pos()
+                                   - journaler.get_read_pos();
+
+  if (!draining) {
+    // Start of draining: remember how much there was outstanding at
+    // this point so that we can give a progress percentage later
+    draining = true;
+
+    // Lift the op throttle as this daemon now has nothing to do but
+    // drain the purge queue, so do it as fast as we can.
+    max_purge_ops = 0xffff;  // NOTE(review): update_op_limit() does not check `draining` and would overwrite this
+  }
+
+  drain_initial = std::max(bytes_remaining, drain_initial);  // grows if more was queued since the last call
+
+  *progress = drain_initial - bytes_remaining;
+  *progress_total = drain_initial;
+  *in_flight_count = in_flight.size();
+
+  return false;
+}
+
+std::string_view PurgeItem::get_type_str() const
+{
+  // Printable name of this item's action; any value outside the known
+  // actions yields "UNKNOWN".
+  if (action == PurgeItem::NONE) return "NONE";
+  if (action == PurgeItem::PURGE_FILE) return "PURGE_FILE";
+  if (action == PurgeItem::PURGE_DIR) return "PURGE_DIR";
+  if (action == PurgeItem::TRUNCATE_FILE) return "TRUNCATE_FILE";
+
+  return "UNKNOWN";
+}
+
diff --git a/src/mds/PurgeQueue.h b/src/mds/PurgeQueue.h
new file mode 100644
index 000000000..6e953d3e6
--- /dev/null
+++ b/src/mds/PurgeQueue.h
@@ -0,0 +1,249 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2015 Red Hat
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software 
+ * Foundation.  See file COPYING.
+ * 
+ */
+
+#ifndef PURGE_QUEUE_H_
+#define PURGE_QUEUE_H_
+
+#include "include/compact_set.h"
+#include "common/Finisher.h"
+#include "mds/MDSMap.h"
+#include "osdc/Journaler.h"
+
+
+/**
+ * Descriptor of the work associated with purging a file.  We record
+ * the minimal amount of information from the inode such as the size
+ * and layout: all other un-needed inode metadata (times, permissions, etc)
+ * has been discarded.
+ */
+class PurgeItem
+{
+public:
+  enum Action : uint8_t {
+    NONE = 0,
+    PURGE_FILE = 1,
+    TRUNCATE_FILE,
+    PURGE_DIR
+  };
+
+  PurgeItem() {}
+  // encode()/decode() are defined in PurgeQueue.cc
+  void encode(bufferlist &bl) const;
+  void decode(bufferlist::const_iterator &p);
+  // Look an Action up by name; std::map::at throws std::out_of_range for an unknown name.
+  static Action str_to_type(std::string_view str) {
+    return PurgeItem::actions.at(std::string(str));
+  }
+  // Dumps action, ino, size, layout, snapc and fragtree; stamp, pad_size and old_pools are not dumped.
+  void dump(Formatter *f) const
+  {
+    f->dump_int("action", action);
+    f->dump_int("ino", ino);
+    f->dump_int("size", size);
+    f->open_object_section("layout");
+    layout.dump(f);
+    f->close_section();
+    f->open_object_section("SnapContext");
+    snapc.dump(f);
+    f->close_section();
+    f->open_object_section("fragtree");
+    fragtree.dump(f);
+    f->close_section();
+  }
+
+  std::string_view get_type_str() const;
+
+  utime_t stamp;
+  // A NONE PurgeItem serves as a no-op for splicing out journal entries,
+  // so there has to be a "pad_size" to specify the size of the journal
+  // space to be spliced.
+  uint32_t pad_size = 0;
+  Action action = NONE;
+  inodeno_t ino = 0;
+  uint64_t size = 0;
+  file_layout_t layout;
+  std::vector<int64_t> old_pools;
+  SnapContext snapc;
+  fragtree_t fragtree;
+private:
+  static const std::map<std::string, PurgeItem::Action> actions;  // name -> Action table, defined in PurgeQueue.cc
+};
+WRITE_CLASS_ENCODER(PurgeItem)
+
+enum {
+  l_pq_first = 3500,
+
+  // Perf counter indices; names and descriptions are registered in PurgeQueue::create_logger()
+  l_pq_executing_ops,
+  l_pq_executing_ops_high_water,
+  l_pq_executing,
+  l_pq_executing_high_water,
+  l_pq_executed,  // How many items have been finished by PurgeQueue
+  l_pq_item_in_journal,
+  l_pq_last
+};
+
+struct PurgeItemCommitOp {  // one operation derived from a PurgeItem: built in _execute_item(), issued in _commit_ops()
+public:
+  enum PurgeType : uint8_t {
+    PURGE_OP_RANGE = 0,  // issued via filer.purge_range()
+    PURGE_OP_REMOVE = 1,  // issued via objecter->remove() on oid/oloc
+    PURGE_OP_ZERO  // issued via filer.zero()
+  };
+
+  PurgeItemCommitOp(PurgeItem _item, PurgeType _type, int _flags)  // oid/oloc stay default-constructed
+    : item(_item), type(_type), flags(_flags) {}
+
+  PurgeItemCommitOp(PurgeItem _item, PurgeType _type, int _flags,
+                    object_t _oid, object_locator_t _oloc)
+    : item(_item), type(_type), flags(_flags), oid(_oid), oloc(_oloc) {}
+
+  PurgeItem item;  // full copy of the item this op belongs to
+  PurgeType type;
+  int flags;
+  object_t oid;  // only read by _commit_ops() for PURGE_OP_REMOVE
+  object_locator_t oloc;
+};
+
+/**
+ * A persistent queue of PurgeItems.  This class both writes and reads
+ * to the queue.  There is one of these per MDS rank.
+ *
+ * Note that this class does not take a reference to MDSRank: we are
+ * independent of all the metadata structures and do not need to
+ * take mds_lock for anything.
+ */
+class PurgeQueue
+{
+public:
+  PurgeQueue(
+      CephContext *cct_,
+      mds_rank_t rank_,
+      const int64_t metadata_pool_,
+      Objecter *objecter_,
+      Context *on_error);  // must be non-null; deleted by the destructor unless already queued
+  ~PurgeQueue();
+
+  void init();  // asserts that create_logger() was called first
+  void activate();
+  void shutdown();
+
+  void create_logger();
+
+  // Write an empty queue, use this during MDS rank creation
+  void create(Context *completion);
+
+  // Read the Journaler header for an existing queue and start consuming
+  void open(Context *completion);
+
+  void wait_for_recovery(Context *c);
+
+  // Submit one entry to the work queue.  Call back when it is persisted
+  // to the queue (there is no callback for when it is executed)
+  void push(const PurgeItem &pi, Context *completion);
+
+  void _commit_ops(int r, const std::vector<PurgeItemCommitOp>& ops_vec, uint64_t expire_to);  // called from C_IO_PurgeItem_Commit::finish()
+
+  // If the on-disk queue is empty and we are not currently processing
+  // anything.
+  bool is_idle() const;  // NOTE(review): no definition appears in PurgeQueue.cc -- confirm it is unused
+
+  /**
+   * Signal to the PurgeQueue that you would like it to hurry up and
+   * finish consuming everything in the queue.  Provides progress
+   * feedback.
+   *
+   * @param progress: bytes consumed since we started draining
+   * @param progress_total: max bytes that were outstanding during purge
+   * @param in_flight_count: number of file purges currently in flight
+   *
+   * @returns true if drain is complete
+   */
+  bool drain(
+    uint64_t *progress,
+    uint64_t *progress_total,
+    size_t *in_flight_count);
+
+  void update_op_limit(const MDSMap &mds_map);
+
+  void handle_conf_change(const std::set<std::string>& changed, const MDSMap& mds_map);
+
+private:
+  uint32_t _calculate_ops(const PurgeItem &item) const;
+
+  bool _can_consume();
+
+  // recover the journal write_pos (drop any partial written entry)
+  void _recover();
+
+  /**
+   * @return true if we were in a position to try and consume something:
+   *         does not mean we necessarily did.
+   */
+  bool _consume();
+
+  void _execute_item(const PurgeItem &item, uint64_t expire_to);
+  void _execute_item_complete(uint64_t expire_to);
+
+  void _go_readonly(int r);
+
+  CephContext *cct;
+  const mds_rank_t rank;
+  ceph::mutex lock = ceph::make_mutex("PurgeQueue");
+  bool readonly = false;  // set once by _go_readonly(); never cleared in PurgeQueue.cc
+
+  int64_t metadata_pool;
+
+  // Don't use the MDSDaemon's Finisher and Timer, because this class
+  // operates outside of MDSDaemon::mds_lock
+  Finisher finisher;
+  SafeTimer timer;
+  Filer filer;
+  Objecter *objecter;
+  std::unique_ptr<PerfCounters> logger;
+
+  Journaler journaler;
+
+  Context *on_error;  // set to nullptr once it has been queued to the finisher
+
+  // Map of Journaler offset to PurgeItem
+  std::map<uint64_t, PurgeItem> in_flight;
+
+  std::set<uint64_t> pending_expire;  // positions of items that completed out of order
+
+  // Throttled allowances
+  uint64_t ops_in_flight = 0;
+
+  // Dynamic op limit per MDS based on PG count
+  uint64_t max_purge_ops = 0;
+
+  // How many bytes were remaining when drain() was first called,
+  // used for indicating progress.
+  uint64_t drain_initial = 0;
+
+  // Has drain() ever been called on this instance?
+  bool draining = false;
+
+  // Do we currently have a flush timer event waiting?
+  Context *delayed_flush = nullptr;
+
+  bool recovered = false;
+  std::vector<Context*> waiting_for_recovery;
+
+  size_t purge_item_journal_size;  // assigned in activate(); has no initializer before that
+
+  uint64_t ops_high_water = 0;
+  uint64_t files_high_water = 0;
+};
+#endif
diff --git a/src/mds/RecoveryQueue.cc b/src/mds/RecoveryQueue.cc
new file mode 100644
index 000000000..6d215ccdb
--- /dev/null
+++ b/src/mds/RecoveryQueue.cc
@@ -0,0 +1,236 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software 
+ * Foundation.  See file COPYING.
+ * 
+ */
+
+#include "CInode.h"
+#include "MDCache.h"
+#include "MDSRank.h"
+#include "Locker.h"
+#include "osdc/Filer.h"
+
+#include "RecoveryQueue.h"
+
+#define dout_context g_ceph_context
+#define dout_subsys ceph_subsys_mds
+#undef dout_prefix
+#define dout_prefix *_dout << "mds." << mds->get_nodeid() << " RecoveryQueue::" << __func__ << " "
+
+/**
+ * I/O completion context for one file-size probe issued by RecoveryQueue.
+ *
+ * RecoveryQueue::_start() passes the addresses of `size` and `mtime` to
+ * Filer::probe(); once the probe completes, finish() forwards both values
+ * together with the result code to RecoveryQueue::_recovered().
+ */
+class C_MDC_Recover : public MDSIOContextBase {
+public:
+  C_MDC_Recover(RecoveryQueue *rq_, CInode *i)
+    : MDSIOContextBase(false),
+      rq(rq_),
+      in(i)
+  {
+    ceph_assert(rq != nullptr);
+  }
+
+  /// Short description of this context: "file_recover(<ino>)".
+  void print(ostream& out) const override {
+    out << "file_recover(" << in->ino() << ")";
+  }
+
+  // Probe outputs; written through the pointers given to Filer::probe().
+  uint64_t size = 0;
+  utime_t mtime;
+
+protected:
+  /// Hand the probe result back to the owning queue.
+  void finish(int r) override {
+    rq->_recovered(in, r, size, mtime);
+  }
+
+  /// The MDS rank is taken from the owning queue.
+  MDSRank *get_mds() override {
+    return rq->mds;
+  }
+
+  RecoveryQueue *rq;  // owning queue, asserted non-null in the constructor
+  CInode *in;         // inode being recovered
+};
+
+/**
+ * Build an empty recovery queue bound to the given MDS rank; the Filer is
+ * constructed from that rank's objecter and finisher.
+ *
+ * NOTE(review): both elists are created with the member offsets of
+ * CInode::item_dirty_dirfrag_dir / item_dirty_dirfrag_nest, while the rest of
+ * this file links inodes through item_recover_queue / item_recover_queue_front.
+ * Presumably those names alias the same list items inside CInode -- confirm
+ * against CInode.h before changing either side.
+ */
+RecoveryQueue::RecoveryQueue(MDSRank *mds_) :
+  file_recover_queue(member_offset(CInode, item_dirty_dirfrag_dir)),
+  file_recover_queue_front(member_offset(CInode, item_dirty_dirfrag_nest)),
+  mds(mds_), filer(mds_->objecter, mds_->finisher)
+{ }
+
+/**
+ * Drive the queue forward.  While fewer than mds_max_file_recover inodes are
+ * being probed, take the next inode -- prioritized entries first, then the
+ * regular queue -- and pass it to _start().  Call this after enqueuing
+ * something or when a recovery completes.  The perf counters are refreshed
+ * before returning.
+ */
+void RecoveryQueue::advance()
+{
+  dout(10) << file_recover_queue_size << " queued, "
+	   << file_recover_queue_front_size << " prioritized, "
+	   << file_recovering.size() << " recovering" << dendl;
+
+  while (file_recovering.size() < g_conf()->mds_max_file_recover) {
+    CInode *next = nullptr;
+
+    if (!file_recover_queue_front.empty()) {
+      // elevated-priority entries are always served first
+      next = file_recover_queue_front.front();
+      next->item_recover_queue_front.remove_myself();
+      --file_recover_queue_front_size;
+    } else if (!file_recover_queue.empty()) {
+      next = file_recover_queue.front();
+      next->item_recover_queue.remove_myself();
+      --file_recover_queue_size;
+    }
+
+    if (!next)
+      break;  // both queues are drained
+
+    _start(next);
+  }
+
+  logger->set(l_mdc_num_recovering_processing, file_recovering.size());
+  logger->set(l_mdc_num_recovering_enqueued, file_recover_queue_size + file_recover_queue_front_size);
+  logger->set(l_mdc_num_recovering_prioritized, file_recover_queue_front_size);
+}
+
+/**
+ * Start recovering one inode that was just taken off a queue.
+ *
+ * If the projected inode has client ranges and a non-zero max size, a
+ * Filer probe is issued (or, when a probe for this inode is already running,
+ * its need_restart flag is set).  Otherwise there is nothing to probe: unless
+ * a probe is still in flight, the RECOVERING state is cleared, the file lock
+ * is re-evaluated and the auth pin taken by enqueue() is dropped.
+ */
+void RecoveryQueue::_start(CInode *in)
+{
+  const auto& pi = in->get_projected_inode();
+
+  const bool has_ranges = pi->client_ranges.size() != 0;
+  const bool need_probe = has_ranges && pi->get_max_size();
+
+  // blech
+  if (has_ranges && !need_probe) {
+    mds->clog->warn() << "bad client_range " << pi->client_ranges
+		      << " on ino " << pi->ino;
+  }
+
+  auto p = file_recovering.find(in);
+  const bool in_flight = (p != file_recovering.end());
+
+  if (!need_probe) {
+    dout(10) << "skipping " << pi->size << " " << *in << dendl;
+    if (!in_flight) {
+      in->state_clear(CInode::STATE_RECOVERING);
+      mds->locker->eval(in, CEPH_LOCK_IFILE);
+      in->auth_unpin(this);
+    }
+    return;
+  }
+
+  dout(10) << "starting " << pi->size << " " << pi->client_ranges
+	   << " " << *in << dendl;
+
+  if (in_flight) {
+    // a probe is already running; have _recovered() start another one
+    p->second = true;
+    dout(10) << "already working on " << *in << ", set need_restart flag" << dendl;
+    return;
+  }
+
+  file_recovering.emplace(in, false);
+
+  C_MDC_Recover *fin = new C_MDC_Recover(this, in);
+  auto layout = pi->layout;
+  filer.probe(in->ino(), &layout, in->last,
+	      pi->get_max_size(), &fin->size, &fin->mtime, false,
+	      0, fin);
+}
+
+/**
+ * Move a queued inode from the regular queue to the prioritized queue.
+ *
+ * Nothing happens if the inode is already being probed or is already on the
+ * prioritized list.  Otherwise the inode must be on the regular queue
+ * (asserted).
+ */
+void RecoveryQueue::prioritize(CInode *in)
+{
+  if (file_recovering.count(in)) {
+    dout(10) << "already working on " << *in << dendl;
+    return;
+  }
+
+  if (in->item_recover_queue_front.is_on_list()) {
+    // already on the prioritized list, nothing to move
+    dout(10) << "not queued " << *in << dendl;
+    return;
+  }
+
+  dout(20) << *in << dendl;
+
+  ceph_assert(in->item_recover_queue.is_on_list());
+  in->item_recover_queue.remove_myself();
+  --file_recover_queue_size;
+
+  file_recover_queue_front.push_back(&in->item_recover_queue_front);
+  ++file_recover_queue_front_size;
+
+  logger->set(l_mdc_num_recovering_prioritized, file_recover_queue_front_size);
+}
+
+/// True when the inode is linked on the regular or the prioritized queue.
+static bool _is_in_any_recover_queue(CInode *in)
+{
+  if (in->item_recover_queue.is_on_list())
+    return true;
+  return in->item_recover_queue_front.is_on_list();
+}
+
+/**
+ * Queue an authoritative, cached inode for size recovery.
+ *
+ * Clears STATE_NEEDSRECOVER.  On the first transition into STATE_RECOVERING
+ * the inode is auth-pinned and the "recovery started" counter is bumped.
+ * The inode is appended to the regular queue unless it already sits on one
+ * of the two queues.  set_logger() must have been called beforehand.
+ */
+void RecoveryQueue::enqueue(CInode *in)
+{
+  dout(15) << "RecoveryQueue::enqueue " << *in << dendl;
+  ceph_assert(logger);  // Caller should have done set_logger before using me
+  ceph_assert(in->is_auth());
+
+  in->state_clear(CInode::STATE_NEEDSRECOVER);
+
+  const bool already_recovering = in->state_test(CInode::STATE_RECOVERING);
+  if (!already_recovering) {
+    in->state_set(CInode::STATE_RECOVERING);
+    in->auth_pin(this);
+    logger->inc(l_mdc_recovery_started);
+  }
+
+  if (_is_in_any_recover_queue(in))
+    return;
+
+  file_recover_queue.push_back(&in->item_recover_queue);
+  ++file_recover_queue_size;
+  logger->set(l_mdc_num_recovering_enqueued, file_recover_queue_size + file_recover_queue_front_size);
+}
+
+
+/**
+ * Completion callback for the Filer probe started by _start().
+ *
+ * @param in    inode that was probed
+ * @param r     probe result; -CEPHFS_EBLOCKLISTED respawns the MDS, any other
+ *              non-zero value is logged to the cluster log and the rank is
+ *              marked damaged
+ * @param size  probed file size
+ * @param mtime probed modification time
+ *
+ * The inode leaves file_recovering.  If a restart was requested while the
+ * probe was running, the inode is unlinked from both queues and _start() is
+ * called again; otherwise, unless it has been re-queued, the probed size is
+ * applied and the auth pin is dropped.  Finally the queue is advanced.
+ */
+void RecoveryQueue::_recovered(CInode *in, int r, uint64_t size, utime_t mtime)
+{
+  dout(10) << "_recovered r=" << r << " size=" << size << " mtime=" << mtime
+	   << " for " << *in << dendl;
+
+  if (r != 0) {
+    dout(0) << "recovery error! " << r << dendl;
+    if (r == -CEPHFS_EBLOCKLISTED) {
+      mds->respawn();
+      return;
+    }
+
+    // Something wrong on the OSD side trying to recover the size
+    // of this inode.  In principle we could record this as a piece
+    // of per-inode damage, but it's actually more likely that
+    // this indicates something wrong with the MDS (like maybe
+    // it has the wrong auth caps?)
+    mds->clog->error() << " OSD read error while recovering size"
+        " for inode " << in->ino();
+    mds->damaged();
+  }
+
+  auto p = file_recovering.find(in);
+  ceph_assert(p != file_recovering.end());
+  const bool restart = p->second;
+  file_recovering.erase(p);
+
+  logger->set(l_mdc_num_recovering_processing, file_recovering.size());
+  logger->inc(l_mdc_recovery_completed);
+  in->state_clear(CInode::STATE_RECOVERING);
+
+  if (restart) {
+    // take the inode off whichever queue it was re-added to, then probe again
+    if (in->item_recover_queue.is_on_list()) {
+      in->item_recover_queue.remove_myself();
+      --file_recover_queue_size;
+    }
+    if (in->item_recover_queue_front.is_on_list()) {
+      in->item_recover_queue_front.remove_myself();
+      --file_recover_queue_front_size;
+    }
+    logger->set(l_mdc_num_recovering_enqueued, file_recover_queue_size + file_recover_queue_front_size);
+    logger->set(l_mdc_num_recovering_prioritized, file_recover_queue_front_size);
+    _start(in);
+  } else if (!_is_in_any_recover_queue(in)) {
+    // journal
+    mds->locker->check_inode_max_size(in, true, 0,  size, mtime);
+    mds->locker->eval(in, CEPH_LOCK_IFILE);
+    in->auth_unpin(this);
+  }
+
+  advance();
+}
+
diff --git a/src/mds/RecoveryQueue.h b/src/mds/RecoveryQueue.h
new file mode 100644
index 000000000..699be08c3
--- /dev/null
+++ b/src/mds/RecoveryQueue.h
@@ -0,0 +1,56 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software 
+ * Foundation.  See file COPYING.
+ * 
+ */
+
+//class C_MDC_Recover;
+//
+#ifndef RECOVERY_QUEUE_H
+#define RECOVERY_QUEUE_H
+
+#include <set>
+
+#include "include/common_fwd.h"
+#include "osdc/Filer.h"
+
+class CInode;
+class MDSRank;
+
+/**
+ * Queue of inodes whose file size and mtime are recovered by probing with a
+ * Filer.  Inodes wait on a regular or a prioritized list; advance() starts
+ * probes for them and C_MDC_Recover delivers the results to _recovered().
+ */
+class RecoveryQueue {
+public:
+  explicit RecoveryQueue(MDSRank *mds_);
+
+  void enqueue(CInode *in);      ///< queue an inode for recovery
+  void advance();                ///< start queued recoveries, prioritized ones first
+  void prioritize(CInode *in);   ///< do this inode now/soon
+
+  /// Must be called before enqueue(); enqueue() asserts that a logger is set.
+  void set_logger(PerfCounters *p) {logger=p;}
+
+private:
+  friend class C_MDC_Recover;
+
+  void _start(CInode *in);  ///< start recovering this file
+  /// Probe completion handler, invoked by C_MDC_Recover::finish().
+  void _recovered(CInode *in, int r, uint64_t size, utime_t mtime);
+
+  // Element counts for the two lists below, maintained by hand in the .cc.
+  size_t file_recover_queue_size = 0;
+  size_t file_recover_queue_front_size = 0;
+
+  elist<CInode*> file_recover_queue;   ///< the queue
+  elist<CInode*> file_recover_queue_front;  ///< elevated priority items
+  std::map<CInode*, bool> file_recovering; // inode -> need_restart
+
+  MDSRank *mds;
+  PerfCounters *logger = nullptr;  // not owned; supplied through set_logger()
+  Filer filer;
+};
+
+#endif // RECOVERY_QUEUE_H
diff --git a/src/mds/ScatterLock.h b/src/mds/ScatterLock.h
new file mode 100644
index 000000000..f654fd2ff
--- /dev/null
+++ b/src/mds/ScatterLock.h
@@ -0,0 +1,254 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software 
+ * Foundation.  See file COPYING.
+ * 
+ */
+
+
+#ifndef CEPH_SCATTERLOCK_H
+#define CEPH_SCATTERLOCK_H
+
+#include "SimpleLock.h"
+
+#include "MDSContext.h"
+
+/**
+ * A SimpleLock that additionally tracks scatter-specific flags in state_flags
+ * (scatter/unscatter wanted, dirty, flushing, flushed) and lazily allocates a
+ * small block of extra state (the "updated" list item and an update stamp).
+ */
+class ScatterLock : public SimpleLock {
+public:
+  ScatterLock(MDSCacheObject *o, LockType *lt) :
+    SimpleLock(o, lt) {}
+  // The extra state must already have been released (see clear_dirty()).
+  ~ScatterLock() override {
+    ceph_assert(!_more);
+  }
+
+  bool is_scatterlock() const override {
+    return true;
+  }
+
+  // Stricter than the SimpleLock check: also requires neither dirty nor flushing.
+  bool is_sync_and_unlocked() const {
+    return
+      SimpleLock::is_sync_and_unlocked() && 
+      !is_dirty() &&
+      !is_flushing();
+  }
+
+  // Only the SYNC and MIX states allow a scatter pin; `loner` is not consulted.
+  bool can_scatter_pin(client_t loner) {
+    /*
+      LOCK : NOT okay because it can MIX and force replicas to journal something
+      TSYN : also not okay for same reason
+      EXCL : also not okay
+
+      MIX  : okay, replica can stall before sending AC_SYNCACK
+      SYNC : okay, replica can stall before sending AC_MIXACK or AC_LOCKACK
+    */
+    return
+      get_state() == LOCK_SYNC ||
+      get_state() == LOCK_MIX;
+  }
+
+  // Switch an xlocked IFILE lock to LOCK_XLOCKSNAP and queue `c` to be woken
+  // on WAIT_STABLE.
+  void set_xlock_snap_sync(MDSContext *c)
+  {
+    ceph_assert(get_type() == CEPH_LOCK_IFILE);
+    ceph_assert(state == LOCK_XLOCK || state == LOCK_XLOCKDONE);
+    state = LOCK_XLOCKSNAP;
+    add_waiter(WAIT_STABLE, c);
+  }
+
+  // Allocates the extra state on first use.
+  xlist<ScatterLock*>::item *get_updated_item() { return &more()->item_updated; }
+
+  // Returns a default-constructed utime_t when no extra state is allocated.
+  utime_t get_update_stamp() {
+    return _more ? _more->update_stamp : utime_t();
+  }
+
+  // Allocates the extra state on first use.
+  void set_update_stamp(utime_t t) { more()->update_stamp = t; }
+
+  void set_scatter_wanted() {
+    state_flags |= SCATTER_WANTED;
+  }
+  void set_unscatter_wanted() {
+    state_flags |= UNSCATTER_WANTED;
+  }
+  void clear_scatter_wanted() {
+    state_flags &= ~SCATTER_WANTED;
+  }
+  void clear_unscatter_wanted() {
+    state_flags &= ~UNSCATTER_WANTED;
+  }
+  bool get_scatter_wanted() const {
+    return state_flags & SCATTER_WANTED;
+  }
+  bool get_unscatter_wanted() const {
+    return state_flags & UNSCATTER_WANTED;
+  }
+
+  bool is_dirty() const override {
+    return state_flags & DIRTY;
+  }
+  bool is_flushing() const override {
+    return state_flags & FLUSHING;
+  }
+  bool is_flushed() const override {
+    return state_flags & FLUSHED;
+  }
+  bool is_dirty_or_flushing() const {
+    return is_dirty() || is_flushing();
+  }
+
+  // Set DIRTY.  The parent takes PIN_DIRTYSCATTERED only when the lock was
+  // neither dirty nor flushing, i.e. one pin covers the dirty -> flushing cycle.
+  void mark_dirty() { 
+    if (!is_dirty()) {
+      if (!is_flushing())
+	parent->get(MDSCacheObject::PIN_DIRTYSCATTERED);
+      set_dirty();
+    }
+  }
+  // DIRTY -> FLUSHING; does nothing when the lock is not dirty.
+  void start_flush() {
+    if (is_dirty()) {
+      set_flushing();
+      clear_dirty();
+    }
+  }
+  // FLUSHING -> FLUSHED.  The pin is dropped unless the lock was dirtied again
+  // while the flush was in progress.
+  void finish_flush() {
+    if (is_flushing()) {
+      clear_flushing();
+      set_flushed();
+      if (!is_dirty()) {
+	parent->put(MDSCacheObject::PIN_DIRTYSCATTERED);
+	parent->clear_dirty_scattered(get_type());
+      }
+    }
+  }
+  void clear_flushed() override {
+    state_flags &= ~FLUSHED;
+  }
+  // Run a complete flush cycle and then clear the FLUSHED flag as well.
+  void remove_dirty() {
+    start_flush();
+    finish_flush();
+    clear_flushed();
+  }
+
+  // Derive our state from the state a replica reported in a strong rejoin:
+  // any MIX-family state maps to LOCK_MIX; LOCK_LOCK is adopted only if
+  // `locktoo` is set.  Other replica states leave our state untouched.
+  void infer_state_from_strong_rejoin(int rstate, bool locktoo) {
+    if (rstate == LOCK_MIX || 
+	rstate == LOCK_MIX_LOCK || // replica still has wrlocks?
+	rstate == LOCK_MIX_SYNC)
+      state = LOCK_MIX;
+    else if (locktoo && rstate == LOCK_LOCK)
+      state = LOCK_LOCK;
+  }
+
+  // Encode the state sent to replica `rep` during rejoin.
+  void encode_state_for_rejoin(ceph::buffer::list& bl, int rep) {
+    __s16 s = get_replica_state();
+    if (is_gathering(rep)) {
+      // the recovering mds may hold rejoined wrlocks
+      if (state == LOCK_MIX_SYNC)
+	s = LOCK_MIX_SYNC;
+      else
+	s = LOCK_MIX_LOCK;
+    }
+
+    // If there is a recovering mds that had replicated an object when it
+    // failed, and the scatterlock in that object was in MIX state, it's
+    // possible that the recovering mds needs to take a wrlock on the
+    // scatterlock when it replays unsafe requests. So this mds should delay
+    // taking an rdlock on the scatterlock until the recovering mds finishes
+    // replaying unsafe requests. Otherwise unsafe requests may get replayed
+    // after the current request.
+    //
+    // For example:
+    // The recovering mds is auth mds of a dirfrag, this mds is auth mds
+    // of the corresponding inode. On 'rm -rf' of the directory, this mds
+    // should delay the rmdir request until the recovering mds has replayed
+    // the unlink requests.
+    if (s == LOCK_MIX || s == LOCK_MIX_LOCK || s == LOCK_MIX_SYNC)
+      mark_need_recover();
+
+    using ceph::encode;
+    encode(s, bl);
+  }
+
+  // After the base-class decode, a lock still marked FLUSHING is turned back
+  // into DIRTY.
+  void decode_state_rejoin(ceph::buffer::list::const_iterator& p, MDSContext::vec& waiters, bool survivor) {
+    SimpleLock::decode_state_rejoin(p, waiters, survivor);
+    if (is_flushing()) {
+      set_dirty();
+      clear_flushing();
+    }
+  }
+
+  // During rejoin, returns false without touching the replica while the lock
+  // is in MIX or one of the listed MIX_* states; otherwise defers to SimpleLock.
+  bool remove_replica(int from, bool rejoin) {
+    if (rejoin &&
+	(state == LOCK_MIX ||
+	 state == LOCK_MIX_SYNC ||
+	 state == LOCK_MIX_LOCK2 ||
+	 state == LOCK_MIX_TSYN ||
+	 state == LOCK_MIX_EXCL))
+      return false;
+    return SimpleLock::remove_replica(from);
+  }
+
+  // Prints the base lock state followed by the scatter flags that are set
+  // (unscatter_wanted is not printed).
+  void print(std::ostream& out) const override {
+    out << "(";
+    _print(out);
+    if (is_dirty())
+      out << " dirty";
+    if (is_flushing())
+      out << " flushing";
+    if (is_flushed())
+      out << " flushed";
+    if (get_scatter_wanted())
+      out << " scatter_wanted";
+    out << ")";
+  }
+
+private:
+  // Rarely needed state, allocated on demand by more().
+  struct more_bits_t {
+    xlist<ScatterLock*>::item item_updated;
+    utime_t update_stamp;
+
+    explicit more_bits_t(ScatterLock *lock) :
+      item_updated(lock)
+    {}
+  };
+
+  // Return the extra state, allocating it on first use.
+  more_bits_t *more() {
+    if (!_more)
+      _more.reset(new more_bits_t(this));
+    return _more.get();
+  }
+
+  // Bits used within state_flags.
+  // NOTE(review): starting at bit 8 presumably leaves the low bits to
+  // SimpleLock -- confirm in SimpleLock.h.
+  enum {
+    SCATTER_WANTED   = 1 << 8,
+    UNSCATTER_WANTED = 1 << 9,
+    DIRTY            = 1 << 10,
+    FLUSHING         = 1 << 11,
+    FLUSHED          = 1 << 12,
+  };
+
+  void set_flushing() {
+    state_flags |= FLUSHING;
+  }
+  void clear_flushing() {
+    state_flags &= ~FLUSHING;
+  }
+  void set_flushed() {
+    state_flags |= FLUSHED;
+  }
+  void set_dirty() {
+    state_flags |= DIRTY;
+  }
+  // Clearing DIRTY also unlinks item_updated and frees the extra state, so the
+  // update stamp is discarded too.
+  void clear_dirty() {
+    state_flags &= ~DIRTY;
+    if (_more) {
+      _more->item_updated.remove_myself();
+      _more.reset();
+    }
+  }
+
+  mutable std::unique_ptr<more_bits_t> _more;
+};
+
+#endif
diff --git a/src/mds/ScrubHeader.h b/src/mds/ScrubHeader.h
new file mode 100644
index 000000000..eb79090b0
--- /dev/null
+++ b/src/mds/ScrubHeader.h
@@ -0,0 +1,82 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2016 Red Hat Inc
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+
+
+#ifndef SCRUB_HEADER_H_
+#define SCRUB_HEADER_H_
+
+#include <memory>
+#include <string>
+#include <string_view>
+
+#include "include/ceph_assert.h"
+
+namespace ceph {
+class Formatter;
+};
+
+class CInode;
+
+/**
+ * Externally input parameters for a scrub, associated with the root
+ * of where we are doing a recursive scrub.
+ *
+ * Besides the immutable request parameters (tag, force, recursive, repair)
+ * the header carries mutable bookkeeping: the origin inode number, whether
+ * any repair happened, the epoch in which the scrub was last forwarded and
+ * a count of pending items.
+ */
+class ScrubHeader {
+public:
+  ScrubHeader(std::string_view tag_, bool is_tag_internal_, bool force_,
+              bool recursive_, bool repair_)
+    : tag(tag_), is_tag_internal(is_tag_internal_), force(force_),
+      recursive(recursive_), repair(repair_) {}
+
+  // Set after construction because it won't be known until we've
+  // started resolving path and locking
+  void set_origin(inodeno_t ino) { origin = ino; }
+
+  // Read-only access to the request parameters.
+  bool get_recursive() const { return recursive; }
+  bool get_repair() const { return repair; }
+  bool get_force() const { return force; }
+  bool is_internal_tag() const { return is_tag_internal; }
+  inodeno_t get_origin() const { return origin; }
+  const std::string& get_tag() const { return tag; }
+
+  // `repaired` can only go from false to true.
+  bool get_repaired() const { return repaired; }
+  void set_repaired() { repaired = true; }
+
+  void set_epoch_last_forwarded(unsigned epoch) { epoch_last_forwarded = epoch; }
+  unsigned get_epoch_last_forwarded() const { return epoch_last_forwarded; }
+
+  // Pending counter; decrementing below zero is an assertion failure.
+  void inc_num_pending() { ++num_pending; }
+  void dec_num_pending() {
+    ceph_assert(num_pending > 0);
+    --num_pending;
+  }
+  unsigned get_num_pending() const { return num_pending; }
+
+protected:
+  const std::string tag;
+  bool is_tag_internal;
+  const bool force;
+  const bool recursive;
+  const bool repair;
+  inodeno_t origin;  // not set by the constructor; see set_origin()
+
+  bool repaired = false;  // May be set during scrub if repairs happened
+  unsigned epoch_last_forwarded = 0;
+  unsigned num_pending = 0;
+};
+
+// Shared-ownership handles to a scrub header (mutable and read-only).
+using ScrubHeaderRef = std::shared_ptr<ScrubHeader>;
+using ScrubHeaderRefConst = std::shared_ptr<const ScrubHeader>;
+
+#endif // SCRUB_HEADER_H_
diff --git a/src/mds/ScrubStack.cc b/src/mds/ScrubStack.cc
new file mode 100644
index 000000000..84441fcf6
--- /dev/null
+++ b/src/mds/ScrubStack.cc
@@ -0,0 +1,1128 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2014 Red Hat
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+
+#include "ScrubStack.h"
+#include "common/Finisher.h"
+#include "mds/MDSRank.h"
+#include "mds/MDCache.h"
+#include "mds/MDSContinuation.h"
+
+#define dout_context g_ceph_context
+#define dout_subsys ceph_subsys_mds
+#undef dout_prefix
+#define dout_prefix _prefix(_dout, mdcache->mds)
+// Writes the "mds.<rank>.scrubstack " prefix used by dout in this file.
+static ostream& _prefix(std::ostream *_dout, MDSRank *mds) {
+  std::ostream &out = *_dout;
+  out << "mds." << mds->get_nodeid() << ".scrubstack ";
+  return out;
+}
+
+// Print a ScrubStack::State by name; an unknown value aborts.
+std::ostream &operator<<(std::ostream &os, const ScrubStack::State &state) {
+  const char *label = nullptr;
+
+  switch(state) {
+  case ScrubStack::STATE_RUNNING: label = "RUNNING"; break;
+  case ScrubStack::STATE_IDLE:    label = "IDLE";    break;
+  case ScrubStack::STATE_PAUSING: label = "PAUSING"; break;
+  case ScrubStack::STATE_PAUSED:  label = "PAUSED";  break;
+  default:
+    ceph_abort();
+  }
+
+  return os << label;
+}
+
+/**
+ * Take an object off the scrub list it is linked on, dropping the
+ * PIN_SCRUBQUEUE reference and the stack-size count that _enqueue() added.
+ * The object must currently be linked (asserted).
+ */
+void ScrubStack::dequeue(MDSCacheObject *obj)
+{
+  dout(20) << "dequeue " << *obj << " from ScrubStack" << dendl;
+
+  auto &link = obj->item_scrub;
+  ceph_assert(link.is_on_list());
+  obj->put(MDSCacheObject::PIN_SCRUBQUEUE);
+  link.remove_myself();
+  --stack_size;
+}
+
+/**
+ * Attach `header` to an inode or dirfrag and link it on the scrub stack.
+ *
+ * @param obj    a CInode or CDir; any other object type is an assertion failure
+ * @param header scrub parameters passed to the object's scrub_initialize()
+ * @param top    push to the front of the stack when true, to the back otherwise
+ * @return 0 on success, -CEPHFS_EBUSY if the object is already being scrubbed
+ *
+ * Must be called with mds_lock held by the current thread.  Dirfrags are
+ * auth-pinned here.
+ */
+int ScrubStack::_enqueue(MDSCacheObject *obj, ScrubHeaderRef& header, bool top)
+{
+  ceph_assert(ceph_mutex_is_locked_by_me(mdcache->mds->mds_lock));
+
+  CInode *in = dynamic_cast<CInode*>(obj);
+  CDir *dir = in ? nullptr : dynamic_cast<CDir*>(obj);
+
+  if (in) {
+    if (in->scrub_is_in_progress()) {
+      dout(10) << __func__ << " with {" << *in << "}" << ", already in scrubbing" << dendl;
+      return -CEPHFS_EBUSY;
+    }
+
+    dout(10) << __func__ << " with {" << *in << "}" << ", top=" << top << dendl;
+    in->scrub_initialize(header);
+  } else if (dir) {
+    if (dir->scrub_is_in_progress()) {
+      dout(10) << __func__ << " with {" << *dir << "}" << ", already in scrubbing" << dendl;
+      return -CEPHFS_EBUSY;
+    }
+
+    dout(10) << __func__ << " with {" << *dir << "}" << ", top=" << top << dendl;
+    // The edge directory must be in memory
+    dir->auth_pin(this);
+    dir->scrub_initialize(header);
+  } else {
+    ceph_assert(0 == "queue dentry to scrub stack");
+  }
+
+  dout(20) << "enqueue " << *obj << " to " << (top ? "top" : "bottom") << " of ScrubStack" << dendl;
+
+  auto &link = obj->item_scrub;
+  if (!link.is_on_list()) {
+    // first time on a scrub list: take the queue pin and count the entry
+    obj->get(MDSCacheObject::PIN_SCRUBQUEUE);
+    ++stack_size;
+  }
+
+  if (top) {
+    scrub_stack.push_front(&link);
+  } else {
+    scrub_stack.push_back(&link);
+  }
+  return 0;
+}
+
+/**
+ * Start a new scrub rooted at `in`.
+ *
+ * @param in     inode the scrub starts from; its number becomes the header's origin
+ * @param header scrub parameters; its tag must not already be registered
+ * @param top    passed through to _enqueue() (front vs. back of the stack)
+ * @return 0 on success,
+ *         -CEPHFS_EAGAIN if an abort is in progress,
+ *         -CEPHFS_EEXIST if a scrub with the same tag is already registered,
+ *         or the error returned by _enqueue()
+ */
+int ScrubStack::enqueue(CInode *in, ScrubHeaderRef& header, bool top)
+{
+  // abort in progress
+  if (clear_stack)
+    return -CEPHFS_EAGAIN;
+
+  header->set_origin(in->ino());
+  auto ret = scrubbing_map.emplace(header->get_tag(), header);
+  if (!ret.second) {
+    dout(10) << __func__ << " with {" << *in << "}"
+	     << ", conflicting tag " << header->get_tag() << dendl;
+    return -CEPHFS_EEXIST;
+  }
+
+  int r = _enqueue(in, header, top);
+  if (r < 0) {
+    // Nothing was queued, so withdraw the tag registered above.  Leaving it
+    // behind would make a retry with the same tag fail with EEXIST although
+    // no such scrub is running.
+    scrubbing_map.erase(ret.first);
+    return r;
+  }
+
+  clog_scrub_summary(in);
+
+  kick_off_scrubs();
+  return 0;
+}
+
+// Park an object on the waiting list; it counts as a scrub in progress
+// until remove_from_waiting() is called for it.
+void ScrubStack::add_to_waiting(MDSCacheObject *obj)
+{
+  ++scrubs_in_progress;
+
+  auto &link = obj->item_scrub;
+  link.remove_myself();
+  scrub_waiting.push_back(&link);
+}
+
+// Undo add_to_waiting(): drop the in-progress count and, if the object is
+// still linked on a list, move it to the front of the scrub stack.  With
+// `kick` set, scrubbing is resumed right away.
+void ScrubStack::remove_from_waiting(MDSCacheObject *obj, bool kick)
+{
+  --scrubs_in_progress;
+
+  auto &link = obj->item_scrub;
+  if (!link.is_on_list())
+    return;
+
+  link.remove_myself();
+  scrub_stack.push_front(&link);
+  if (kick)
+    kick_off_scrubs();
+}
+
+/**
+ * Retry hook for a queued object that cannot be scrubbed right now.
+ *
+ * Constructing the context parks the object on the waiting list via
+ * add_to_waiting(); when the awaited event completes the context, finish()
+ * calls remove_from_waiting(), which puts the object back at the front of
+ * the scrub stack.  The result code is ignored.
+ */
+class C_RetryScrub : public MDSInternalContext {
+public:
+  C_RetryScrub(ScrubStack *s, MDSCacheObject *o) :
+    MDSInternalContext(s->mdcache->mds), stack(s), obj(o) {
+    stack->add_to_waiting(obj);
+  }
+  void finish(int r) override {
+    // uses the default value of remove_from_waiting()'s `kick` argument
+    stack->remove_from_waiting(obj);
+  }
+private:
+  ScrubStack *stack;
+  MDSCacheObject *obj;
+};
+
+/**
+ * Main scrub loop: walk the scrub stack and start work on its entries until
+ * mds_max_scrub_ops_in_progress operations are in flight or the stack has
+ * been traversed.
+ *
+ * While an abort is pending (clear_stack) or the stack is PAUSING/PAUSED no
+ * new work is started.  Once the in-flight operations have drained, pending
+ * scrubs are aborted if requested, the final state is set and the control
+ * contexts are completed.
+ *
+ * Must be called with mds_lock held.
+ */
+void ScrubStack::kick_off_scrubs()
+{
+  ceph_assert(ceph_mutex_is_locked(mdcache->mds->mds_lock));
+  dout(20) << __func__ << ": state=" << state << dendl;
+
+  if (clear_stack || state == STATE_PAUSING || state == STATE_PAUSED) {
+    if (scrubs_in_progress == 0) {
+      dout(10) << __func__ << ": in progress scrub operations finished, "
+               << stack_size << " in the stack" << dendl;
+
+      State final_state = state;
+      if (clear_stack) {
+        abort_pending_scrubs();
+        final_state = STATE_IDLE;
+      }
+      // `state` is re-read here, so a pending pause wins over the IDLE
+      // chosen above unless abort_pending_scrubs() changed the state.
+      if (state == STATE_PAUSING) {
+        final_state = STATE_PAUSED;
+      }
+
+      set_state(final_state);
+      complete_control_contexts(0);
+    }
+
+    return;
+  }
+
+  dout(20) << __func__ << " entering with " << scrubs_in_progress << " in "
+              "progress and " << stack_size << " in the stack" << dendl;
+  elist<MDSCacheObject*>::iterator it = scrub_stack.begin();
+  while (g_conf()->mds_max_scrub_ops_in_progress > scrubs_in_progress) {
+    if (it.end()) {
+      if (scrubs_in_progress == 0) {
+        set_state(STATE_IDLE);
+      }
+
+      return;
+    }
+
+    // ceph_assert, like the rest of this file, so the check is not compiled
+    // out when NDEBUG is defined.  PAUSING/PAUSED returned above.
+    ceph_assert(state == STATE_RUNNING || state == STATE_IDLE);
+    set_state(STATE_RUNNING);
+
+    if (CInode *in = dynamic_cast<CInode*>(*it)) {
+      dout(20) << __func__ << " examining " << *in << dendl;
+      // advance first: the calls below may unlink `in` from the stack
+      ++it;
+
+      if (!validate_inode_auth(in))
+	continue;
+
+      if (!in->is_dir()) {
+	// it's a regular file, symlink, or hard link
+	dequeue(in); // we only touch it this once, so remove from stack
+
+	scrub_file_inode(in);
+      } else {
+	bool added_children = false;
+	bool done = false; // it's done, so pop it off the stack
+	scrub_dir_inode(in, &added_children, &done);
+	if (done) {
+	  dout(20) << __func__ << " dir inode, done" << dendl;
+	  dequeue(in);
+	}
+	if (added_children) {
+	  // dirfrags were queued at top of stack
+	  it = scrub_stack.begin();
+	}
+      }
+    } else if (CDir *dir = dynamic_cast<CDir*>(*it)) {
+      // remember the successor now: scrub_dirfrag() may move `dir` to the
+      // waiting list
+      auto next = it;
+      ++next;
+      bool done = false; // it's done, so pop it off the stack
+      scrub_dirfrag(dir, &done);
+      if (done) {
+	dout(20) << __func__ << " dirfrag, done" << dendl;
+	++it; // child inodes were queued at bottom of stack
+	dequeue(dir);
+      } else {
+	it = next;
+      }
+    } else {
+      ceph_assert(0 == "dentry in scrub stack");
+    }
+  }
+}
+
+/**
+ * Decide whether `in` can be scrubbed by this MDS right now.
+ *
+ * @return true only when the inode is auth here and can be auth-pinned.
+ *         In every other case false is returned after arranging follow-up:
+ *         a C_RetryScrub waiter (frozen/freezing inode, ambiguous auth,
+ *         degraded cluster) or an OP_QUEUEINO message forwarded to the auth
+ *         MDS, with the inode parked on the waiting list until it is ACKed.
+ */
+bool ScrubStack::validate_inode_auth(CInode *in)
+{
+  if (in->is_auth()) {
+    if (!in->can_auth_pin()) {
+      dout(10) << __func__ << " can't auth pin" << dendl;
+      in->add_waiter(CInode::WAIT_UNFREEZE, new C_RetryScrub(this, in));
+      return false;
+    }
+    return true;
+  } else {
+    MDSRank *mds = mdcache->mds;
+    if (in->is_ambiguous_auth()) {
+      dout(10) << __func__ << " ambiguous auth" << dendl;
+      in->add_waiter(MDSCacheObject::WAIT_SINGLEAUTH, new C_RetryScrub(this, in));
+    } else if (mds->is_cluster_degraded()) {
+      dout(20) << __func__ << " cluster degraded" << dendl;
+      mds->wait_for_cluster_recovered(new C_RetryScrub(this, in));
+    } else {
+      ScrubHeaderRef header = in->get_scrub_header();
+      ceph_assert(header);
+
+      // Track the forwarded scrub; only one remote scrub per inode may be
+      // outstanding at a time (asserted below).
+      auto ret = remote_scrubs.emplace(std::piecewise_construct,
+				       std::forward_as_tuple(in),
+				       std::forward_as_tuple());
+      ceph_assert(ret.second); // FIXME: parallel scrubs?
+      auto &scrub_r = ret.first->second;
+      scrub_r.tag = header->get_tag();
+
+      mds_rank_t auth = in->authority().first;
+      dout(10) << __func__ << " forward to mds." << auth << dendl;
+      // NOTE(review): std::move on the result of scrub_queued_frags() -- if
+      // that returns a reference, the inode's own frag set is moved from
+      // here; confirm that this is intended.
+      auto r = make_message<MMDSScrub>(MMDSScrub::OP_QUEUEINO, in->ino(),
+				       std::move(in->scrub_queued_frags()),
+				       header->get_tag(), header->get_origin(),
+				       header->is_internal_tag(), header->get_force(),
+				       header->get_recursive(), header->get_repair());
+      mdcache->mds->send_message_mds(r, auth);
+
+      scrub_r.gather_set.insert(auth);
+      // wait for ACK
+      add_to_waiting(in);
+    }
+    return false;
+  }
+}
+
+/**
+ * Queue the dirfrags of a directory inode for scrubbing and, once nothing is
+ * left to wait for, validate the inode itself.
+ *
+ * @param in             directory inode; must be auth on this MDS
+ * @param added_children set to true if a local dirfrag was pushed on top of
+ *                       the scrub stack
+ * @param done           set to true once the inode validation has been
+ *                       started and the inode can be taken off the stack
+ *
+ * For each leaf frag not queued yet: local, ready dirfrags are enqueued.
+ * Frags that are not ready (ambiguous auth, degraded cluster, frozen, not
+ * fetched yet) add a sub-context to a gather whose finisher retries this
+ * inode.  Frags owned by another rank are collected and forwarded with
+ * OP_QUEUEDIR, and the inode then waits for the ACKs.
+ */
+void ScrubStack::scrub_dir_inode(CInode *in, bool *added_children, bool *done)
+{
+  dout(10) << __func__ << " " << *in << dendl;
+  ceph_assert(in->is_auth());
+  MDSRank *mds = mdcache->mds;
+
+  ScrubHeaderRef header = in->get_scrub_header();
+  ceph_assert(header);
+
+  MDSGatherBuilder gather(g_ceph_context);
+
+  auto &queued = in->scrub_queued_frags();
+  std::map<mds_rank_t, fragset_t> scrub_remote;
+
+  frag_vec_t frags;
+  in->dirfragtree.get_leaves(frags);
+  // leading space added so the text is not glued to the function name
+  dout(20) << __func__ << " recursive mode, frags " << frags << dendl;
+  for (auto &fg : frags) {
+    if (queued.contains(fg))
+      continue;
+    CDir *dir = in->get_or_open_dirfrag(mdcache, fg);
+    if (!dir->is_auth()) {
+      if (dir->is_ambiguous_auth()) {
+	dout(20) << __func__ << " ambiguous auth " << *dir  << dendl;
+	dir->add_waiter(MDSCacheObject::WAIT_SINGLEAUTH, gather.new_sub());
+      } else if (mds->is_cluster_degraded()) {
+	dout(20) << __func__ << " cluster degraded" << dendl;
+	mds->wait_for_cluster_recovered(gather.new_sub());
+      } else {
+	mds_rank_t auth = dir->authority().first;
+	scrub_remote[auth].insert_raw(fg);
+      }
+    } else if (!dir->can_auth_pin()) {
+      dout(20) << __func__ << " freezing/frozen " << *dir  << dendl;
+      dir->add_waiter(CDir::WAIT_UNFREEZE, gather.new_sub());
+    } else if (dir->get_version() == 0) {
+      dout(20) << __func__ << " barebones " << *dir  << dendl;
+      dir->fetch(gather.new_sub());
+    } else {
+      _enqueue(dir, header, true);
+      queued.insert_raw(dir->get_frag());
+      *added_children = true;
+    }
+  }
+
+  queued.simplify();
+
+  // Some frags are not ready yet: come back to this inode once all of the
+  // registered waiters have fired.
+  if (gather.has_subs()) {
+    gather.set_finisher(new C_RetryScrub(this, in));
+    gather.activate();
+    return;
+  }
+
+  if (!scrub_remote.empty()) {
+    auto ret = remote_scrubs.emplace(std::piecewise_construct,
+				     std::forward_as_tuple(in),
+				     std::forward_as_tuple());
+    ceph_assert(ret.second); // FIXME: parallel scrubs?
+    auto &scrub_r = ret.first->second;
+    scrub_r.tag = header->get_tag();
+
+    for (auto& p : scrub_remote) {
+      p.second.simplify();
+      dout(20) << __func__ << " forward " << p.second  << " to mds." << p.first << dendl;
+      auto r = make_message<MMDSScrub>(MMDSScrub::OP_QUEUEDIR, in->ino(),
+				       std::move(p.second), header->get_tag(),
+				       header->get_origin(), header->is_internal_tag(),
+				       header->get_force(), header->get_recursive(),
+				       header->get_repair());
+      mds->send_message_mds(r, p.first);
+      scrub_r.gather_set.insert(p.first);
+    }
+    // wait for ACKs
+    add_to_waiting(in);
+    return;
+  }
+
+  scrub_dir_inode_final(in);
+
+  *done = true;
+  dout(10) << __func__ << " done" << dendl;
+}
+
+/**
+ * Completion context for CInode::validate_disk_state().
+ *
+ * The validation writes its findings into `result`.  The context counts as
+ * one scrub in progress from construction until finish(), which reports the
+ * outcome through ScrubStack::_validate_inode_done() and then resumes the
+ * scrub loop.
+ */
+class C_InodeValidated : public MDSInternalContext
+{
+public:
+  ScrubStack *stack;
+  CInode::validated_data result;
+  CInode *target;
+
+  C_InodeValidated(MDSRank *mds, ScrubStack *stack_, CInode *target_)
+    : MDSInternalContext(mds), stack(stack_), target(target_)
+  {
+    stack->scrubs_in_progress++;
+  }
+  void finish(int r) override {
+    stack->_validate_inode_done(target, r, result);
+    stack->scrubs_in_progress--;
+    stack->kick_off_scrubs();
+  }
+};
+
+// Last step for a directory inode: start the on-disk validation.  The
+// C_InodeValidated context receives the result and reports it.
+void ScrubStack::scrub_dir_inode_final(CInode *in)
+{
+  dout(20) << __func__ << " " << *in << dendl;
+
+  auto *fin = new C_InodeValidated(mdcache->mds, this, in);
+  in->validate_disk_state(&fin->result, fin);
+}
+
+/**
+ * Scrub one dirfrag.
+ *
+ * @param dir  dirfrag to scrub; must not be null
+ * @param done set to true when the dirfrag has been scrubbed.  Left untouched
+ *             when the dirfrag first has to be fetched, in which case a
+ *             C_RetryScrub brings it back later.
+ *
+ * For a recursive scrub, the inodes of primary head dentries are enqueued at
+ * the bottom of the scrub stack.  Dentries unchanged since the last recursive
+ * scrub are skipped unless the scrub is forced.
+ */
+void ScrubStack::scrub_dirfrag(CDir *dir, bool *done)
+{
+  ceph_assert(dir != NULL);
+
+  dout(10) << __func__ << " " << *dir << dendl;
+
+  if (!dir->is_complete()) {
+    dir->fetch(new C_RetryScrub(this, dir), true); // already auth pinned
+    dout(10) << __func__ << " incomplete, fetching" << dendl;
+    return;
+  }
+
+  ScrubHeaderRef header = dir->get_scrub_header();
+  version_t last_scrub = dir->scrub_info()->last_recursive.version;
+  if (header->get_recursive()) {
+    for (auto it = dir->begin(); it != dir->end(); ++it) {
+      // only head dentries are considered; snapshotted ones are skipped
+      if (it->first.snapid != CEPH_NOSNAP)
+	continue;
+      CDentry *dn = it->second;
+      CDentry::linkage_t *dnl = dn->get_linkage();
+      if (dn->get_version() <= last_scrub &&
+	  dnl->get_remote_d_type() != DT_DIR &&
+	  !header->get_force()) {
+	dout(15) << __func__ << " skip dentry " << it->first
+		 << ", no change since last scrub" << dendl;
+	continue;
+      }
+      if (dnl->is_primary()) {
+	// the return value is ignored; _enqueue() returns -CEPHFS_EBUSY for
+	// an inode that is already being scrubbed
+	_enqueue(dnl->get_inode(), header, false);
+      } else if (dnl->is_remote()) {
+	// TODO: check remote linkage
+      }
+    }
+  }
+
+  dir->scrub_local();
+
+  dir->scrub_finished();
+  // releases the auth pin taken when the dirfrag was enqueued
+  dir->auth_unpin(this);
+
+  *done = true;
+  dout(10) << __func__ << " done" << dendl;
+}
+
+// Start the on-disk validation of a non-directory inode.  The
+// C_InodeValidated context receives the result and reports it.
+void ScrubStack::scrub_file_inode(CInode *in)
+{
+  // At this stage the DN is already past scrub_initialize, so
+  // it's in the cache, it has PIN_SCRUBQUEUE and it is authpinned
+  // NOTE(review): kick_off_scrubs() calls dequeue(), which drops
+  // PIN_SCRUBQUEUE, right before calling this -- the remark about the pin
+  // looks stale; confirm.
+  auto *fin = new C_InodeValidated(mdcache->mds, this, in);
+  in->validate_disk_state(&fin->result, fin);
+}
+
+/**
+ * Process the outcome of an inode validation.
+ *
+ * @param in     inode that was validated
+ * @param r      completion code of the validation (not examined here)
+ * @param result detailed findings filled in by validate_disk_state()
+ *
+ * Unrepaired backtrace or inode damage is recorded in the damage table.  A
+ * failed validation is reported to the cluster log and dumped as JSON to the
+ * MDS log.  In all cases the inode's scrub is marked finished.
+ */
+void ScrubStack::_validate_inode_done(CInode *in, int r,
+				      const CInode::validated_data &result)
+{
+  LogChannelRef clog = mdcache->mds->clog;
+  const ScrubHeaderRefConst header = in->scrub_info()->header;
+
+  // `path` stays empty when validation passed.
+  std::string path;
+  if (!result.passed_validation) {
+    // Build path string for use in messages
+    in->make_path_string(path, true);
+  }
+
+  if (result.backtrace.checked && !result.backtrace.passed &&
+      !result.backtrace.repaired)
+  {
+    // Record backtrace fails as remote linkage damage, as
+    // we may not be able to resolve hard links to this inode
+    mdcache->mds->damage_table.notify_remote_damaged(in->ino(), path);
+  } else if (result.inode.checked && !result.inode.passed &&
+             !result.inode.repaired) {
+    // Record damaged inode structures as damaged dentries as
+    // that is where they are stored
+    auto parent = in->get_projected_parent_dn();
+    if (parent) {
+      auto dir = parent->get_dir();
+      mdcache->mds->damage_table.notify_dentry(
+          dir->inode->ino(), dir->frag, parent->last, parent->get_name(), path);
+    }
+  }
+
+  // Inform the cluster log if we found an error
+  if (!result.passed_validation) {
+    if (result.all_damage_repaired()) {
+      clog->info() << "Scrub repaired inode " << in->ino()
+                   << " (" << path << ")";
+    } else {
+      clog->warn() << "Scrub error on inode " << in->ino()
+                   << " (" << path << ") see " << g_conf()->name
+                   << " log and `damage ls` output for details";
+    }
+
+    // Put the verbose JSON output into the MDS log for later inspection
+    JSONFormatter f;
+    result.dump(&f);
+    CachedStackStringStream css;
+    f.flush(*css);
+    derr << __func__ << " scrub error on inode " << *in << ": " << css->strv()
+         << dendl;
+  } else {
+    dout(10) << __func__ << " scrub passed on inode " << *in << dendl;
+  }
+
+  in->scrub_finished();
+}
+
+void ScrubStack::complete_control_contexts(int r) {
+  ceph_assert(ceph_mutex_is_locked_by_me(mdcache->mds->mds_lock));
+
+  // complete every queued control context with the same result code
+  for (auto it = control_ctxs.begin(); it != control_ctxs.end(); ++it)
+    (*it)->complete(r);
+  control_ctxs.clear();
+}
+
+void ScrubStack::set_state(State next_state) {
+  if (state == next_state)
+    return; // no transition: nothing to log, no summary to send
+  dout(20) << __func__ << ", from state=" << state << ", to state="
+           << next_state << dendl;
+  state = next_state;
+  clog_scrub_summary();
+}
+
+bool ScrubStack::scrub_in_transition_state() {
+  ceph_assert(ceph_mutex_is_locked_by_me(mdcache->mds->mds_lock));
+  dout(20) << __func__ << ": state=" << state << dendl;
+  // RUNNING counts as a transition state too, to "delay" control operations
+  switch (state) {
+  case STATE_RUNNING:
+  case STATE_PAUSING:
+    return true;
+  default:
+    return false;
+  }
+}
+
+std::string_view ScrubStack::scrub_summary() {
+  ceph_assert(ceph_mutex_is_locked_by_me(mdcache->mds->mds_lock));
+
+  // A view into the local stream below would dangle once we return, so the
+  // text is copied here; it stays valid until the next call on this thread.
+  static thread_local std::string summary;
+  bool have_more = false;
+  CachedStackStringStream cs;
+
+  if (state == STATE_IDLE) {
+    if (scrubbing_map.empty())
+      return "idle";
+    *cs << "idle+waiting";
+  }
+
+  if (state == STATE_RUNNING) {
+    *cs << (clear_stack ? "aborting" : "active");
+  } else {
+    if (state == STATE_PAUSING) {
+      have_more = true;
+      *cs << "pausing";
+    } else if (state == STATE_PAUSED) {
+      have_more = true;
+      *cs << "paused";
+    }
+
+    if (clear_stack) {
+      if (have_more) {
+        *cs << "+";
+      }
+      *cs << "aborting";
+    }
+  }
+
+  if (!scrubbing_map.empty()) {
+    *cs << " paths [";
+    bool first = true;
+    for (auto &p : scrubbing_map) {
+      if (!first)
+	*cs << ",";
+      auto& header = p.second;
+      if (CInode *in = mdcache->get_inode(header->get_origin()))
+	*cs << scrub_inode_path(in);
+      else
+	*cs << "#" << header->get_origin();
+      first = false;
+    }
+    *cs << "]";
+  }
+
+  summary = cs->strv();
+  return summary;
+}
+
+/**
+ * Dump the scrub status into a formatter.
+ *
+ * Emits a "result" object holding a "status" string and a "scrubs" object
+ * with one section per entry of scrubbing_map.
+ *
+ * @param f formatter receiving the output
+ */
+void ScrubStack::scrub_status(Formatter *f) {
+  ceph_assert(ceph_mutex_is_locked_by_me(mdcache->mds->mds_lock));
+
+  f->open_object_section("result");
+
+  // "status": one line describing the state of the whole scrub stack
+  CachedStackStringStream css;
+  if (state == STATE_IDLE) {
+    if (scrubbing_map.empty())
+      *css << "no active scrubs running";
+    else
+      *css << state << " (waiting for more scrubs)";
+  } else {
+    if (state == STATE_RUNNING) {
+      *css << (clear_stack ? "ABORTING" : "scrub active");
+    } else {
+      const bool pause = (state == STATE_PAUSING || state == STATE_PAUSED);
+      if (pause)
+        *css << state;
+      if (clear_stack) {
+        // an abort requested while pausing/paused shows as "<state>+ABORTING"
+        if (pause)
+          *css << "+";
+        *css << "ABORTING";
+      }
+    }
+    // every non-idle form ends with the current stack size
+    *css << " (" << stack_size << " inodes in the stack)";
+  }
+  f->dump_string("status", css->strv());
+
+  // "scrubs": one section per scrub in scrubbing_map, named by its tag
+  f->open_object_section("scrubs");
+
+  for (auto& p : scrubbing_map) {
+    auto& header = p.second;
+
+    std::string tag(header->get_tag());
+    f->open_object_section(tag.c_str()); // scrub id
+
+    // path of the origin inode, or "#<ino>" if get_inode() does not find it
+    if (CInode *in = mdcache->get_inode(header->get_origin()))
+      f->dump_string("path", scrub_inode_path(in));
+    else
+      f->dump_stream("path") << "#" << header->get_origin();
+
+    f->dump_string("tag", header->get_tag());
+
+    // "options": comma separated names of the flags set in the header
+    CachedStackStringStream optcss;
+    bool need_sep = false;
+    auto add_opt = [&optcss, &need_sep](bool enabled, const char *name) {
+      if (!enabled)
+        return;
+      if (need_sep)
+        *optcss << ",";
+      *optcss << name;
+      need_sep = true;
+    };
+    add_opt(header->get_recursive(), "recursive");
+    add_opt(header->get_repair(), "repair");
+    add_opt(header->get_force(), "force");
+
+    f->dump_string("options", optcss->strv());
+    f->close_section(); // scrub id
+  }
+  f->close_section(); // scrubs
+  f->close_section(); // result
+}
+
+void ScrubStack::abort_pending_scrubs() {
+  ceph_assert(ceph_mutex_is_locked_by_me(mdcache->mds->mds_lock));
+  ceph_assert(clear_stack);
+  // per-object abort; anything other than an inode or a dirfrag is a bug
+  auto abort_one = [this](MDSCacheObject *obj) {
+    if (CInode *in = dynamic_cast<CInode*>(obj))  {
+      in->scrub_aborted();
+    } else if (CDir *dir = dynamic_cast<CDir*>(obj)) {
+      dir->scrub_aborted();
+      dir->auth_unpin(this);
+    } else {
+      ceph_abort_msg("dentry in scrub stack");
+    }
+  };
+  for (auto it = scrub_stack.begin(); !it.end(); ++it)
+    abort_one(*it);
+  for (auto it = scrub_waiting.begin(); !it.end(); ++it)
+    abort_one(*it);
+  // every queued object has been aborted above: empty both lists
+  stack_size = 0;
+  scrub_stack.clear();
+  scrub_waiting.clear();
+  // drop the remote scrubs still being waited on, without kicking scrubs
+  for (auto& p : remote_scrubs)
+    remove_from_waiting(p.first, false);
+  remote_scrubs.clear();
+
+  clear_stack = false;
+}
+
+void ScrubStack::send_state_message(int op) {
+  MDSRank *rank = mdcache->mds;
+  set<mds_rank_t> up_ranks;
+  rank->get_mds_map()->get_up_mds_set(up_ranks);
+
+  // one fresh MMDSScrub per up rank; rank 0 is never a recipient
+  for (const auto& peer : up_ranks) {
+    if (peer != 0)
+      rank->send_message_mds(make_message<MMDSScrub>(op), peer);
+  }
+}
+
+void ScrubStack::scrub_abort(Context *on_finish) {
+  ceph_assert(ceph_mutex_is_locked_by_me(mdcache->mds->mds_lock));
+
+  dout(10) << __func__ << ": aborting with " << scrubs_in_progress
+           << " scrubs in progress and " << stack_size << " in the"
+           << " stack" << dendl;
+
+  // rank 0 records the abort epoch and relays the abort to the other ranks
+  const bool is_rank0 = (mdcache->mds->get_nodeid() == 0);
+  if (is_rank0) {
+    scrub_epoch_last_abort = scrub_epoch;
+    scrub_any_peer_aborting = true;
+    send_state_message(MMDSScrub::OP_ABORT);
+  }
+
+  clear_stack = true;
+  if (!scrub_in_transition_state()) {
+    abort_pending_scrubs();
+    if (state != STATE_PAUSED)
+      set_state(STATE_IDLE);
+    if (on_finish)
+      on_finish->complete(0);
+  } else if (on_finish) {
+    // in a transition state: keep the completion in control_ctxs for later
+    control_ctxs.push_back(on_finish);
+  }
+}
+
+void ScrubStack::scrub_pause(Context *on_finish) {
+  ceph_assert(ceph_mutex_is_locked_by_me(mdcache->mds->mds_lock));
+
+  dout(10) << __func__ << ": pausing with " << scrubs_in_progress
+           << " scrubs in progress and " << stack_size << " in the"
+           << " stack" << dendl;
+
+  // rank 0 relays the pause request to the other up ranks
+  if (mdcache->mds->get_nodeid() == 0)
+    send_state_message(MMDSScrub::OP_PAUSE);
+
+  if (clear_stack) {
+    // an abort is in progress: the pause is refused with -CEPHFS_EINVAL
+    if (on_finish)
+      on_finish->complete(-CEPHFS_EINVAL);
+  } else if (scrub_in_transition_state()) {
+    // running or already pausing: enter PAUSING and park the completion
+    // in control_ctxs instead of completing it now
+    set_state(STATE_PAUSING);
+    if (on_finish)
+      control_ctxs.push_back(on_finish);
+  } else {
+    // neither running nor pausing: switch to PAUSED and complete the
+    // caller's context right away
+    set_state(STATE_PAUSED);
+    if (on_finish)
+      on_finish->complete(0);
+  }
+}
+
+bool ScrubStack::scrub_resume() {
+  ceph_assert(ceph_mutex_is_locked_by_me(mdcache->mds->mds_lock));
+  dout(20) << __func__ << ": state=" << state << dendl;
+
+  if (mdcache->mds->get_nodeid() == 0)
+    send_state_message(MMDSScrub::OP_RESUME);
+  // NOTE(review): the result is a bool, so the -CEPHFS_EINVAL below reaches
+  // the caller as true and success (0) as false, not as an errno value.
+  int r = 0;
+  if (clear_stack) {
+    r = -CEPHFS_EINVAL; // an abort is in progress
+  } else if (state == STATE_PAUSING || state == STATE_PAUSED) {
+    const bool was_pausing = (state == STATE_PAUSING);
+    set_state(STATE_RUNNING);
+    if (was_pausing)
+      complete_control_contexts(-CEPHFS_ECANCELED); // cancel pending pauses
+    else
+      kick_off_scrubs();
+  }
+  return r;
+}
+
+// Send the current scrub summary to the cluster log. When an inode is given,
+// first log whether its scrub was aborted, queued or completed.
+void ScrubStack::clog_scrub_summary(CInode *in) {
+  if (in) {
+    // clear_stack is checked first, so a pending abort wins over the
+    // inode's own scrub progress
+    const char *what = "completed";
+    if (clear_stack)
+      what = "aborted";
+    else if (in->scrub_is_in_progress())
+      what = "queued";
+    clog->info() << "scrub " << what << " for path: " << scrub_inode_path(in);
+  }
+
+  clog->info() << "scrub summary: " << scrub_summary();
+}
+
+void ScrubStack::dispatch(const cref_t<Message> &m)
+{
+  // route the two scrub message types; any other type aborts the daemon
+  const auto type = m->get_type();
+  if (type == MSG_MDS_SCRUB) {
+    handle_scrub(ref_cast<MMDSScrub>(m));
+    return;
+  }
+  if (type == MSG_MDS_SCRUB_STATS) {
+    handle_scrub_stats(ref_cast<MMDSScrubStats>(m));
+    return;
+  }
+
+  derr << " scrub stack unknown message " << m->get_type() << dendl_impl;
+  ceph_abort_msg("scrub stack unknown message");
+}
+
+void ScrubStack::handle_scrub(const cref_t<MMDSScrub> &m)
+{
+  // Handle an MMDSScrub message from a peer rank, dispatching on its op.
+  mds_rank_t from = mds_rank_t(m->get_source().num());
+  dout(10) << __func__ << " " << *m << " from mds." << from << dendl;
+
+  switch (m->get_op()) {
+  case MMDSScrub::OP_QUEUEDIR:
+    {
+      CInode *diri = mdcache->get_inode(m->get_ino());
+      ceph_assert(diri);
+      // pick the requested frags that exist here, are auth and can be pinned
+      std::vector<CDir*> dfs;
+      MDSGatherBuilder gather(g_ceph_context);
+      for (const auto& fg : m->get_frags()) {
+	CDir *dir = diri->get_dirfrag(fg);
+	if (!dir) {
+	  dout(10) << __func__ << " no frag " << fg << dendl;
+	  continue;
+	}
+	if (!dir->is_auth()) {
+	  dout(10) << __func__ << " not auth " << *dir << dendl;
+	  continue;
+	}
+	if (!dir->can_auth_pin()) {
+	  dout(10) << __func__ << " can't auth pin " << *dir <<  dendl;
+	  dir->add_waiter(CDir::WAIT_UNFREEZE, gather.new_sub());
+	  continue;
+	}
+	dfs.push_back(dir);
+      }
+      // frags that cannot be pinned yet: retry the message after WAIT_UNFREEZE
+      if (gather.has_subs()) {
+	gather.set_finisher(new C_MDS_RetryMessage(mdcache->mds, m));
+	gather.activate();
+	return;
+      }
+      // queue the usable frags, reusing the header already known for this tag
+      fragset_t queued;
+      if (!dfs.empty()) {
+	ScrubHeaderRef header;
+	if (auto it = scrubbing_map.find(m->get_tag()); it != scrubbing_map.end()) {
+	  header = it->second;
+	} else {
+	  header = std::make_shared<ScrubHeader>(m->get_tag(), m->is_internal_tag(),
+						 m->is_force(), m->is_recursive(),
+						 m->is_repair());
+	  header->set_origin(m->get_origin());
+	  scrubbing_map.emplace(header->get_tag(), header);
+	}
+	for (auto dir : dfs) {
+	  queued.insert_raw(dir->get_frag());
+	  _enqueue(dir, header, true);
+	}
+	queued.simplify();
+	kick_off_scrubs();
+      }
+      // always ack, listing the frags that were queued (possibly none)
+      auto r = make_message<MMDSScrub>(MMDSScrub::OP_QUEUEDIR_ACK, m->get_ino(),
+				       std::move(queued), m->get_tag());
+      mdcache->mds->send_message_mds(r, from);
+    }
+    break;
+  case MMDSScrub::OP_QUEUEDIR_ACK:
+    {
+      CInode *diri = mdcache->get_inode(m->get_ino());
+      ceph_assert(diri);
+      auto it = remote_scrubs.find(diri);
+      if (it != remote_scrubs.end() &&
+	  m->get_tag() == it->second.tag) {
+	if (it->second.gather_set.erase(from)) {
+	  auto &queued = diri->scrub_queued_frags();
+	  for (auto &fg : m->get_frags())
+	    queued.insert_raw(fg);
+	  queued.simplify();
+	  // last outstanding ack: the inode can leave the waiting list
+	  if (it->second.gather_set.empty()) {
+	    remote_scrubs.erase(it);
+
+	    const auto& header = diri->get_scrub_header();
+	    header->set_epoch_last_forwarded(scrub_epoch);
+	    remove_from_waiting(diri);
+	  }
+	}
+      }
+    }
+    break;
+  case MMDSScrub::OP_QUEUEINO:
+    {
+      CInode *in = mdcache->get_inode(m->get_ino());
+      ceph_assert(in);
+
+      ScrubHeaderRef header;
+      if (auto it = scrubbing_map.find(m->get_tag()); it != scrubbing_map.end()) {
+	header = it->second;
+      } else {
+	header = std::make_shared<ScrubHeader>(m->get_tag(), m->is_internal_tag(),
+					       m->is_force(), m->is_recursive(),
+					       m->is_repair());
+	header->set_origin(m->get_origin());
+	scrubbing_map.emplace(header->get_tag(), header);
+      }
+
+      _enqueue(in, header, true);
+      in->scrub_queued_frags() = m->get_frags();
+      kick_off_scrubs();
+      // the ack carries a default-constructed (empty) frag set
+      fragset_t queued;
+      auto r = make_message<MMDSScrub>(MMDSScrub::OP_QUEUEINO_ACK, m->get_ino(),
+				       std::move(queued), m->get_tag());
+      mdcache->mds->send_message_mds(r, from);
+    }
+    break;
+  case MMDSScrub::OP_QUEUEINO_ACK:
+    {
+      CInode *in = mdcache->get_inode(m->get_ino());
+      ceph_assert(in);
+      auto it = remote_scrubs.find(in);
+      if (it != remote_scrubs.end() &&
+	  m->get_tag() == it->second.tag &&
+	  it->second.gather_set.erase(from)) {
+	ceph_assert(it->second.gather_set.empty());
+	remote_scrubs.erase(it);
+	// acked by the remote rank: take the inode off our lists and finish it
+	remove_from_waiting(in, false);
+	dequeue(in);
+
+	const auto& header = in->get_scrub_header();
+	header->set_epoch_last_forwarded(scrub_epoch);
+	in->scrub_finished();
+
+	kick_off_scrubs();
+      }
+    }
+    break;
+  case MMDSScrub::OP_ABORT:
+    scrub_abort(nullptr);
+    break;
+  case MMDSScrub::OP_PAUSE:
+    scrub_pause(nullptr);
+    break;
+  case MMDSScrub::OP_RESUME:
+    scrub_resume();
+    break;
+  default:
+    derr << " scrub stack unknown scrub operation " << m->get_op() << dendl_impl;
+    ceph_abort_msg("scrub stack unknown scrub operation");
+  }
+}
+
+void ScrubStack::handle_scrub_stats(const cref_t<MMDSScrubStats> &m)
+{
+  mds_rank_t from = mds_rank_t(m->get_source().num());
+  dout(7) << __func__ << " " << *m << " from mds." << from << dendl;
+  if (from != 0) {
+    // peer ack: recorded only for the current epoch and a known stats slot
+    if (scrub_epoch == m->get_epoch() &&
+	(size_t)from < mds_scrub_stats.size()) {
+      auto& stat = mds_scrub_stats[from];
+      stat.epoch_acked = m->get_epoch();
+      stat.scrubbing_tags = m->get_scrubbing_tags();
+      stat.aborting = m->is_aborting();
+    }
+    return;
+  }
+
+  // from mds.0: resync if our epoch is not the one just before the message's
+  const auto prev_epoch = m->get_epoch() - 1;
+  if (scrub_epoch != prev_epoch) {
+    scrub_epoch = prev_epoch;
+    for (auto& p : scrubbing_map) {
+      if (p.second->get_epoch_last_forwarded())
+	p.second->set_epoch_last_forwarded(scrub_epoch);
+    }
+  }
+  bool any_finished = false;
+  bool any_repaired = false;
+  std::set<std::string> scrubbing_tags;
+  for (auto it = scrubbing_map.begin(); it != scrubbing_map.end(); ) {
+    auto& header = it->second;
+    const bool busy = header->get_num_pending() ||
+      header->get_epoch_last_forwarded() >= scrub_epoch;
+    if (busy) {
+      scrubbing_tags.insert(it->first);
+    } else if (m->is_finished(it->first)) {
+      any_finished = true;
+      if (header->get_repaired())
+	any_repaired = true;
+      it = scrubbing_map.erase(it);
+      continue;
+    }
+    ++it;
+  }
+  scrub_epoch = m->get_epoch();
+  auto ack = make_message<MMDSScrubStats>(scrub_epoch,
+					  std::move(scrubbing_tags), clear_stack);
+  mdcache->mds->send_message_mds(ack, 0);
+  if (any_finished)
+    clog_scrub_summary();
+  if (any_repaired)
+    mdcache->mds->mdlog->trim_all();
+}
+
+void ScrubStack::advance_scrub_status()
+{
+  if (!scrub_any_peer_aborting && scrubbing_map.empty())
+    return;
+
+  MDSRank *mds = mdcache->mds;
+
+  set<mds_rank_t> up_mds;
+  mds->get_mds_map()->get_up_mds_set(up_mds);
+  auto up_max = *up_mds.rbegin();
+  // up_max is the highest up rank; NOTE(review): assumes up_mds is non-empty
+  bool update_scrubbing = false;
+  std::set<std::string> scrubbing_tags;
+  // highest up rank is 0: there are no peer acks to wait for
+  if (up_max == 0) {
+    update_scrubbing = true;
+    scrub_any_peer_aborting = false;
+  } else if (mds_scrub_stats.size() > (size_t)(up_max)) {
+    bool any_aborting = false;
+    bool fully_acked = true;
+    for (const auto& stat : mds_scrub_stats) {
+      if (stat.aborting || stat.epoch_acked <= scrub_epoch_last_abort)
+	any_aborting = true;
+      if (stat.epoch_acked != scrub_epoch) {
+	fully_acked = false;
+	continue;
+      }
+      scrubbing_tags.insert(stat.scrubbing_tags.begin(),
+			    stat.scrubbing_tags.end());
+    }
+    if (!any_aborting)
+      scrub_any_peer_aborting = false;
+    if (fully_acked) {
+      // handle_scrub_stats() reports scrub is still in-progress if it has
+      // forwarded any object to other mds since previous epoch. Let's assume,
+      // at time 'A', we got scrub stats from all mds for previous epoch. If
+      // a scrub is not reported by any mds, we know there is no forward of
+      // the scrub since time 'A'. So we can consider the scrub is finished.
+      if (scrub_epoch_fully_acked + 1 == scrub_epoch)
+	update_scrubbing = true;
+      scrub_epoch_fully_acked = scrub_epoch;
+    }
+  }
+  // one stats slot per rank up to up_max; slot 0 is pre-acked for next epoch
+  if (mds_scrub_stats.size() != (size_t)up_max + 1)
+    mds_scrub_stats.resize((size_t)up_max + 1);
+  mds_scrub_stats.at(0).epoch_acked = scrub_epoch + 1;
+
+  bool any_finished = false;
+  bool any_repaired = false;
+
+  for (auto it = scrubbing_map.begin(); it != scrubbing_map.end(); ) {
+    auto& header = it->second;
+    if (header->get_num_pending() ||
+	header->get_epoch_last_forwarded() >= scrub_epoch) {
+      if (update_scrubbing && up_max != 0)
+	scrubbing_tags.insert(it->first);
+      ++it;
+    } else if (update_scrubbing && !scrubbing_tags.count(it->first)) {
+      // no longer being scrubbed globally
+      any_finished = true;
+      if (header->get_repaired())
+	any_repaired = true;
+      scrubbing_map.erase(it++);
+    } else {
+      ++it;
+    }
+  }
+
+  ++scrub_epoch;
+  // send the new epoch (with the tag set when updating) to every other up rank
+  for (auto& r : up_mds) {
+    if (r == 0)
+      continue;
+    auto m = update_scrubbing ?
+	make_message<MMDSScrubStats>(scrub_epoch, scrubbing_tags) :
+	make_message<MMDSScrubStats>(scrub_epoch);
+    mds->send_message_mds(m, r);
+  }
+
+  if (any_finished)
+    clog_scrub_summary();
+  if (any_repaired)
+    mdcache->mds->mdlog->trim_all();
+}
+
+void ScrubStack::handle_mds_failure(mds_rank_t mds)
+{
+  if (mds == 0) {
+    scrub_abort(nullptr);
+    return;
+  }
+  // stop expecting acks from the failed rank; release fully gathered inodes
+  bool released_any = false;
+  for (auto it = remote_scrubs.begin(); it != remote_scrubs.end(); ) {
+    auto& gather = it->second.gather_set;
+    if (!gather.erase(mds) || !gather.empty()) {
+      ++it;
+      continue;
+    }
+    CInode *in = it->first;
+    it = remote_scrubs.erase(it);
+    remove_from_waiting(in, false);
+    released_any = true;
+  }
+  if (released_any)
+    kick_off_scrubs();
+}
diff --git a/src/mds/ScrubStack.h b/src/mds/ScrubStack.h
new file mode 100644
index 000000000..62a4a5299
--- /dev/null
+++ b/src/mds/ScrubStack.h
@@ -0,0 +1,277 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2014 Red Hat
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+
+#ifndef SCRUBSTACK_H_
+#define SCRUBSTACK_H_
+
+#include "CDir.h"
+#include "CDentry.h"
+#include "CInode.h"
+#include "MDSContext.h"
+#include "ScrubHeader.h"
+
+#include "common/LogClient.h"
+#include "include/elist.h"
+#include "messages/MMDSScrub.h"
+#include "messages/MMDSScrubStats.h"
+
+class MDCache;
+class Finisher;
+
+class ScrubStack {
+public:
+  ScrubStack(MDCache *mdc, LogChannelRef &clog, Finisher *finisher_) :
+    mdcache(mdc),
+    clog(clog),
+    finisher(finisher_),
+    scrub_stack(member_offset(MDSCacheObject, item_scrub)),
+    scrub_waiting(member_offset(MDSCacheObject, item_scrub)) {}
+  ~ScrubStack() {
+    ceph_assert(scrub_stack.empty());
+    ceph_assert(!scrubs_in_progress);
+  }
+  /**
+   * Queue the inode at the top or bottom of the stack and kick off scrubbing.
+   * @param in The inode to scrub
+   * @param header The ScrubHeader carrying the parameters of this scrub
+   * @param top Chooses between the top and bottom of the stack
+   */
+  int enqueue(CInode *in, ScrubHeaderRef& header, bool top);
+  /**
+   * Abort an ongoing scrub operation. The abort operation could be
+   * delayed if there are in-progress scrub operations on going. The
+   * caller should provide a context which is completed after all
+   * in-progress scrub operations are completed and pending inodes
+   * are removed from the scrub stack (with the context callbacks for
+   * inodes completed with -CEPHFS_ECANCELED).
+   * @param on_finish Context callback to invoke after abort
+   */
+  void scrub_abort(Context *on_finish);
+
+  /**
+   * Pause scrub operations. Similar to abort, pause is delayed if
+   * there are in-progress scrub operations on going. The caller
+   * should provide a context which is completed after all in-progress
+   * scrub operations are completed. Subsequent scrub operations are
+   * queued until scrub is resumed.
+   * @param on_finish Context callback to invoke after pause
+   */
+  void scrub_pause(Context *on_finish);
+
+  /**
+   * Resume a paused scrub. Unlike abort or pause, this is instantaneous.
+   * Pending pause operations are cancelled (contexts get -CEPHFS_ECANCELED).
+   * @returns 0 (success) if resumed, -CEPHFS_EINVAL if an abort is in-progress.
+   *          NOTE(review): declared bool, so these collapse to false/true.
+   */
+  bool scrub_resume();
+
+  /**
+   * Get the current scrub status as human readable string. Some basic
+   * information is returned such as number of inodes pending abort/pause.
+   */
+  void scrub_status(Formatter *f);
+
+  /**
+   * Get a high level scrub status summary such as current scrub state
+   * and scrub paths.
+   */
+  std::string_view scrub_summary();
+  /// true if state_str is exactly "idle"
+  static bool is_idle(std::string_view state_str) {
+    return state_str == "idle";
+  }
+  /// true while the scrub stack is non-empty
+  bool is_scrubbing() const { return !scrub_stack.empty(); }
+
+  void advance_scrub_status();
+
+  void handle_mds_failure(mds_rank_t mds);
+
+  void dispatch(const cref_t<Message> &m);
+
+  MDCache *mdcache;
+
+protected:
+
+  // reference to global cluster log client
+  LogChannelRef &clog;
+
+  /// A finisher needed so that we don't re-enter kick_off_scrubs
+  Finisher *finisher;
+
+  /// The stack of inodes we want to scrub
+  elist<MDSCacheObject*> scrub_stack;
+  elist<MDSCacheObject*> scrub_waiting;
+  /// current number of dentries we're actually scrubbing
+  int scrubs_in_progress = 0;
+  int stack_size = 0;
+
+  struct scrub_remote_t {
+    std::string tag;
+    std::set<mds_rank_t> gather_set;
+  };
+  std::map<CInode*, scrub_remote_t> remote_scrubs;
+
+  unsigned scrub_epoch = 2;
+  unsigned scrub_epoch_fully_acked = 0;
+  unsigned scrub_epoch_last_abort = 2;
+  // check if any mds is aborting scrub after mds.0 starts
+  bool scrub_any_peer_aborting = true;
+
+  struct scrub_stat_t {
+    unsigned epoch_acked = 0;
+    std::set<std::string> scrubbing_tags;
+    bool aborting = false;
+  };
+  std::vector<scrub_stat_t> mds_scrub_stats;
+
+  std::map<std::string, ScrubHeaderRef> scrubbing_map;
+
+  friend class C_RetryScrub;
+private:
+  // scrub abort is _not_ a state, rather it's an operation that's
+  // performed after in-progress scrubs are finished.
+  enum State {
+    STATE_RUNNING = 0,
+    STATE_IDLE,
+    STATE_PAUSING,
+    STATE_PAUSED,
+  };
+  friend std::ostream &operator<<(std::ostream &os, const State &state);
+
+  friend class C_InodeValidated;
+
+  int _enqueue(MDSCacheObject *obj, ScrubHeaderRef& header, bool top);
+  /**
+   * Remove the inode/dirfrag from the stack.
+   */
+  inline void dequeue(MDSCacheObject *obj);
+
+  /**
+   * Kick off as many scrubs as are appropriate, based on the current
+   * state of the stack.
+   */
+  void kick_off_scrubs();
+
+  /**
+   * Move the inode/dirfrag that can't be scrubbed immediately
+   * from scrub queue to waiting list.
+   */
+  void add_to_waiting(MDSCacheObject *obj);
+  /**
+   * Move the inode/dirfrag back to scrub queue.
+   */
+  void remove_from_waiting(MDSCacheObject *obj, bool kick=true);
+  /**
+   * Validate authority of the inode. If current mds is not auth of the inode,
+   * forward scrub to auth mds.
+   */
+  bool validate_inode_auth(CInode *in);
+
+  /**
+   * Scrub a file inode.
+   * @param in The inode to scrub
+   */
+  void scrub_file_inode(CInode *in);
+
+  /**
+   * Callback from completion of CInode::validate_disk_state
+   * @param in The inode we were validating
+   * @param r The return status from validate_disk_state
+   * @param result Populated results from validate_disk_state
+   */
+  void _validate_inode_done(CInode *in, int r,
+			    const CInode::validated_data &result);
+
+  /**
+   * Scrub a directory inode. It queues child dirfrags, then does
+   * final scrub of the inode.
+   *
+   * @param in The directory inode to scrub
+   * @param added_children set to true if we pushed some of our children
+   * @param done set to true if we started to do final scrub
+   */
+  void scrub_dir_inode(CInode *in, bool *added_children, bool *done);
+  /**
+   * Scrub a dirfrag. It queues child dentries, then does final
+   * scrub of the dirfrag.
+   *
+   * @param dir The dirfrag to scrub (must be auth)
+   * @param done set to true if we started to do final scrub
+   */
+  void scrub_dirfrag(CDir *dir, bool *done);
+  /**
+   * Scrub a directory-representing dentry.
+   *
+   * @param in The directory inode we're doing final scrub on.
+   */
+  void scrub_dir_inode_final(CInode *in);
+  /**
+   * Set scrub state
+   * @param next_state State to move the scrub to.
+   */
+  void set_state(State next_state);
+
+  /**
+   * Is scrub in one of transition states (running, pausing)
+   */
+  bool scrub_in_transition_state();
+
+  /**
+   * complete queued up contexts
+   * @param r return value to complete contexts.
+   */
+  void complete_control_contexts(int r);
+
+  /**
+   * ask peer mds (rank > 0) to abort/pause/resume scrubs
+   */
+  void send_state_message(int op);
+
+  /**
+   * Abort pending scrubs for inodes waiting in the inode stack.
+   * Completion context is complete with -CEPHFS_ECANCELED.
+   */
+  void abort_pending_scrubs();
+
+  /**
+   * Return path for a given inode.
+   * @param in inode to make path entry.
+   */
+  std::string scrub_inode_path(CInode *in) {
+    std::string path;
+    in->make_path_string(path, true);
+    return (path.empty() ? "/" : path.c_str());
+  }
+
+  /**
+   * Send scrub information (queued/finished scrub path and summary)
+   * to cluster log.
+   * @param in inode for which scrub has been queued or finished.
+   */
+  void clog_scrub_summary(CInode *in=nullptr);
+
+  void handle_scrub(const cref_t<MMDSScrub> &m);
+  void handle_scrub_stats(const cref_t<MMDSScrubStats> &m);
+
+  State state = STATE_IDLE;
+  bool clear_stack = false;
+
+  // list of pending context completions for asynchronous scrub
+  // control operations.
+  std::vector<Context *> control_ctxs;
+};
+
+#endif /* SCRUBSTACK_H_ */
diff --git a/src/mds/Server.cc b/src/mds/Server.cc
new file mode 100644
index 000000000..4912b3bab
--- /dev/null
+++ b/src/mds/Server.cc
@@ -0,0 +1,11184 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+
+#include <boost/lexical_cast.hpp>
+#include "include/ceph_assert.h"  // lexical_cast includes system assert.h
+
+#include <boost/config/warning_disable.hpp>
+#include <boost/fusion/include/std_pair.hpp>
+#include <boost/range/adaptor/reversed.hpp>
+
+#include "MDSRank.h"
+#include "Server.h"
+#include "Locker.h"
+#include "MDCache.h"
+#include "MDLog.h"
+#include "Migrator.h"
+#include "MDBalancer.h"
+#include "InoTable.h"
+#include "SnapClient.h"
+#include "Mutation.h"
+#include "MetricsHandler.h"
+#include "cephfs_features.h"
+
+#include "msg/Messenger.h"
+
+#include "osdc/Objecter.h"
+
+#include "events/EUpdate.h"
+#include "events/EPeerUpdate.h"
+#include "events/ESession.h"
+#include "events/EOpen.h"
+#include "events/ECommitted.h"
+#include "events/EPurged.h"
+
+#include "include/stringify.h"
+#include "include/filepath.h"
+#include "common/errno.h"
+#include "common/Timer.h"
+#include "common/perf_counters.h"
+#include "include/compat.h"
+#include "osd/OSDMap.h"
+
+#include <errno.h>
+
+#include <list>
+#include <regex>
+#include <string_view>
+#include <functional>
+
+#include "common/config.h"
+
+#include "msg/Message.h"
+
+#define dout_context g_ceph_context
+#define dout_subsys ceph_subsys_mds
+#undef dout_prefix
+#define dout_prefix *_dout << "mds." << mds->get_nodeid() << ".server "
+
+/**
+ * MDSContext base for completions that act on behalf of a Server.
+ * The owning MDSRank is looked up through the Server the context was
+ * constructed with, which must not be null.
+ */
+class ServerContext : public MDSContext {
+public:
+  explicit ServerContext(Server *s) : server(s) {
+    ceph_assert(server != NULL);
+  }
+
+protected:
+  Server *server;
+
+  MDSRank *get_mds() override {
+    return server->mds;
+  }
+};
+
+/**
+ * BatchOp that gathers getattr/lookup requests behind one head request.
+ *
+ * On construction the head request's batch_op_map pointer is aimed at the
+ * batch_ops map of mdr->dn[0].back() (for CEPH_MDS_OP_LOOKUP) or of
+ * mdr->in[0] (for any other op). Follower requests are kept in batch_reqs
+ * until the batch is forwarded or answered.
+ */
+class Batch_Getattr_Lookup : public BatchOp {
+protected:
+  Server* server;
+  ceph::ref_t<MDRequestImpl> mdr;                      // current head of the batch
+  std::vector<ceph::ref_t<MDRequestImpl>> batch_reqs;  // followers waiting on the head
+  int res = 0;
+
+public:
+  Batch_Getattr_Lookup(Server* s, const ceph::ref_t<MDRequestImpl>& r)
+    : server(s), mdr(r) {
+    const bool is_lookup = (mdr->client_request->get_op() == CEPH_MDS_OP_LOOKUP);
+    if (!is_lookup) {
+      mdr->batch_op_map = &mdr->in[0]->batch_ops;
+    } else {
+      mdr->batch_op_map = &mdr->dn[0].back()->batch_ops;
+    }
+  }
+
+  /// Queue another request behind the current head.
+  void add_request(const ceph::ref_t<MDRequestImpl>& r) override {
+    batch_reqs.push_back(r);
+  }
+
+  /**
+   * Promote the most recently added follower that has not been killed to
+   * head, handing the old head's batch_op_map over to it.
+   *
+   * @return the new head, or nullptr when no live follower is left.
+   */
+  ceph::ref_t<MDRequestImpl> find_new_head() override {
+    while (!batch_reqs.empty()) {
+      auto candidate = std::move(batch_reqs.back());
+      batch_reqs.pop_back();
+      if (!candidate->killed) {
+        candidate->batch_op_map = mdr->batch_op_map;
+        mdr->batch_op_map = nullptr;
+        mdr = candidate;
+        return mdr;
+      }
+    }
+    return nullptr;
+  }
+
+  /// Forward the head's client request and every live follower to rank @p t.
+  void _forward(mds_rank_t t) override {
+    MDCache* const cache = server->mdcache;
+    cache->mds->forward_message_mds(mdr->release_client_request(), t);
+    mdr->set_mds_stamp(ceph_clock_now());
+    for (auto& follower : batch_reqs) {
+      if (follower->killed)
+        continue;
+      cache->request_forward(follower, t);
+    }
+    batch_reqs.clear();
+  }
+
+  /**
+   * Answer every live follower with result @p r, copying the head's
+   * tracei/tracedn onto each of them first, then reply to the head itself.
+   */
+  void _respond(int r) override {
+    mdr->set_mds_stamp(ceph_clock_now());
+    for (auto& follower : batch_reqs) {
+      if (follower->killed)
+        continue;
+      follower->tracei = mdr->tracei;
+      follower->tracedn = mdr->tracedn;
+      server->respond_to_request(follower, r);
+    }
+    batch_reqs.clear();
+    server->reply_client_request(mdr, make_message<MClientReply>(*mdr->client_request, r));
+  }
+
+  void print(std::ostream& o) {
+    o << "[batch front=" << *mdr << "]";
+  }
+};
+
+/**
+ * MDSLogContextBase for journal completions issued by a Server.
+ * When built with a request, that request gets a "journal_committed: "
+ * event marked on it from pre_finish().
+ */
+class ServerLogContext : public MDSLogContextBase {
+public:
+  explicit ServerLogContext(Server *s) : server(s) {
+    ceph_assert(server != NULL);
+  }
+  explicit ServerLogContext(Server *s, MDRequestRef& r) : server(s), mdr(r) {
+    ceph_assert(server != NULL);
+  }
+
+protected:
+  Server *server;
+  MDRequestRef mdr;  // left empty by the single-argument constructor
+
+  MDSRank *get_mds() override {
+    return server->mds;
+  }
+
+  void pre_finish(int r) override {
+    if (!mdr)
+      return;
+    mdr->mark_event("journal_committed: ");
+  }
+};
+
+// Build the "mds_server" perf counter set, store it in `logger` and add it
+// to the global perf counter collection.
+void Server::create_logger()
+{
+  PerfCountersBuilder plb(g_ceph_context, "mds_server", l_mdss_first, l_mdss_last);
+
+  // Headline counters: each carries a short nick and PRIO_INTERESTING.
+  const auto interesting = [&plb](auto idx, const char *name,
+                                  const char *desc, const char *nick) {
+    plb.add_u64_counter(idx, name, desc, nick,
+                        PerfCountersBuilder::PRIO_INTERESTING);
+  };
+  interesting(l_mdss_handle_client_request, "handle_client_request",
+              "Client requests", "hcr");
+  interesting(l_mdss_handle_peer_request, "handle_peer_request",
+              "Peer requests", "hsr");
+  interesting(l_mdss_handle_client_session, "handle_client_session",
+              "Client session messages", "hcs");
+  interesting(l_mdss_cap_revoke_eviction, "cap_revoke_eviction",
+              "Cap Revoke Client Eviction", "cre");
+  interesting(l_mdss_cap_acquisition_throttle, "cap_acquisition_throttle",
+              "Cap acquisition throttle counter", "cat");
+
+  // fop latencies are useful
+  plb.set_prio_default(PerfCountersBuilder::PRIO_USEFUL);
+  const auto latency = [&plb](auto idx, const char *name, const char *desc) {
+    plb.add_time_avg(idx, name, desc);
+  };
+  latency(l_mdss_req_lookuphash_latency, "req_lookuphash_latency",
+          "Request type lookup hash of inode latency");
+  latency(l_mdss_req_lookupino_latency, "req_lookupino_latency",
+          "Request type lookup inode latency");
+  latency(l_mdss_req_lookupparent_latency, "req_lookupparent_latency",
+          "Request type lookup parent latency");
+  latency(l_mdss_req_lookupname_latency, "req_lookupname_latency",
+          "Request type lookup name latency");
+  latency(l_mdss_req_lookup_latency, "req_lookup_latency",
+          "Request type lookup latency");
+  latency(l_mdss_req_lookupsnap_latency, "req_lookupsnap_latency",
+          "Request type lookup snapshot latency");
+  latency(l_mdss_req_getattr_latency, "req_getattr_latency",
+          "Request type get attribute latency");
+  latency(l_mdss_req_setattr_latency, "req_setattr_latency",
+          "Request type set attribute latency");
+  latency(l_mdss_req_setlayout_latency, "req_setlayout_latency",
+          "Request type set file layout latency");
+  latency(l_mdss_req_setdirlayout_latency, "req_setdirlayout_latency",
+          "Request type set directory layout latency");
+  latency(l_mdss_req_getvxattr_latency, "req_getvxattr_latency",
+          "Request type get virtual extended attribute latency");
+  latency(l_mdss_req_setxattr_latency, "req_setxattr_latency",
+          "Request type set extended attribute latency");
+  latency(l_mdss_req_rmxattr_latency, "req_rmxattr_latency",
+          "Request type remove extended attribute latency");
+  latency(l_mdss_req_readdir_latency, "req_readdir_latency",
+          "Request type read directory latency");
+  latency(l_mdss_req_setfilelock_latency, "req_setfilelock_latency",
+          "Request type set file lock latency");
+  latency(l_mdss_req_getfilelock_latency, "req_getfilelock_latency",
+          "Request type get file lock latency");
+  latency(l_mdss_req_create_latency, "req_create_latency",
+          "Request type create latency");
+  latency(l_mdss_req_open_latency, "req_open_latency",
+          "Request type open latency");
+  latency(l_mdss_req_mknod_latency, "req_mknod_latency",
+          "Request type make node latency");
+  latency(l_mdss_req_link_latency, "req_link_latency",
+          "Request type link latency");
+  latency(l_mdss_req_unlink_latency, "req_unlink_latency",
+          "Request type unlink latency");
+  latency(l_mdss_req_rmdir_latency, "req_rmdir_latency",
+          "Request type remove directory latency");
+  latency(l_mdss_req_rename_latency, "req_rename_latency",
+          "Request type rename latency");
+  latency(l_mdss_req_mkdir_latency, "req_mkdir_latency",
+          "Request type make directory latency");
+  latency(l_mdss_req_symlink_latency, "req_symlink_latency",
+          "Request type symbolic link latency");
+  latency(l_mdss_req_lssnap_latency, "req_lssnap_latency",
+          "Request type list snapshot latency");
+  latency(l_mdss_req_mksnap_latency, "req_mksnap_latency",
+          "Request type make snapshot latency");
+  latency(l_mdss_req_rmsnap_latency, "req_rmsnap_latency",
+          "Request type remove snapshot latency");
+  latency(l_mdss_req_renamesnap_latency, "req_renamesnap_latency",
+          "Request type rename snapshot latency");
+
+  // Dispatch counters are registered with a debug-only default priority.
+  plb.set_prio_default(PerfCountersBuilder::PRIO_DEBUGONLY);
+  plb.add_u64_counter(l_mdss_dispatch_client_request, "dispatch_client_request",
+                      "Client requests dispatched");
+  plb.add_u64_counter(l_mdss_dispatch_peer_request, "dispatch_server_request",
+                      "Server requests dispatched");
+
+  logger = plb.create_perf_counters();
+  g_ceph_context->get_perfcounters_collection()->add(logger);
+}
+
+// Wire the server to its rank's cache and log, then read the tunables it
+// consults from the current configuration into members.
+Server::Server(MDSRank *m, MetricsHandler *metrics_handler) :
+  mds(m),
+  mdcache(mds->mdcache),
+  mdlog(mds->mdlog),
+  recall_throttle(g_conf().get_val<double>("mds_recall_max_decay_rate")),
+  metrics_handler(metrics_handler)
+{
+  auto&& conf = g_conf();
+
+  forward_all_requests_to_auth =
+    conf.get_val<bool>("mds_forward_all_requests_to_auth");
+  replay_unsafe_with_closed_session =
+    conf.get_val<bool>("mds_replay_unsafe_with_closed_session");
+  cap_revoke_eviction_timeout =
+    conf.get_val<double>("mds_cap_revoke_eviction_timeout");
+  max_snaps_per_dir =
+    conf.get_val<uint64_t>("mds_max_snaps_per_dir");
+  delegate_inos_pct =
+    conf.get_val<uint64_t>("mds_client_delegate_inos_pct");
+  max_caps_per_client =
+    conf.get_val<uint64_t>("mds_max_caps_per_client");
+  cap_acquisition_throttle =
+    conf.get_val<uint64_t>("mds_session_cap_acquisition_throttle");
+  max_caps_throttle_ratio =
+    conf.get_val<double>("mds_session_max_caps_throttle_ratio");
+  caps_throttle_retry_request_timeout =
+    conf.get_val<double>("mds_cap_acquisition_throttle_retry_request_timeout");
+  dir_max_entries =
+    conf.get_val<uint64_t>("mds_dir_max_entries");
+
+  supported_features = feature_bitset_t(CEPHFS_FEATURES_MDS_SUPPORTED);
+  supported_metric_spec = feature_bitset_t(CEPHFS_METRIC_FEATURES_ALL);
+}
+
+// Route an incoming message to the matching handle_*() method.
+//
+// CEPH_MSG_CLIENT_RECONNECT is handled immediately. A CEPH_MSG_CLIENT_REQUEST
+// that arrives while the MDS is not active may instead be dropped, queued for
+// replay, or parked until the MDS becomes active (see below). Any message
+// type that has no case in the final switch aborts the daemon.
+void Server::dispatch(const cref_t<Message> &m)
+{
+  switch (m->get_type()) {
+  case CEPH_MSG_CLIENT_RECONNECT:
+    handle_client_reconnect(ref_cast<MClientReconnect>(m));
+    return;
+  }
+
+/*
+ * During the reconnect phase a client may send unsafe requests to the MDS
+ * before its reconnect message. Setting sessionclosed_isok handles a
+ * scenario like this:
+ *
+ * 1. In the reconnect phase, the client sends unsafe requests to the MDS.
+ * 2. The reconnect timeout is reached. All sessions that did not send a
+ *    reconnect message in time, some of which may have sent unsafe requests,
+ *    are marked as closed. (Another situation is #31668, which denies all
+ *    client reconnect messages to speed up reboot.)
+ * 3. The unsafe requests from sessions that did not reconnect in time, or
+ *    whose reconnect was denied, can then be handled in the clientreplay
+ *    phase.
+ */
+  bool sessionclosed_isok = replay_unsafe_with_closed_session;
+  // active?
+  // handle_peer_request()/handle_client_session() will wait if necessary
+  if (m->get_type() == CEPH_MSG_CLIENT_REQUEST && !mds->is_active()) {
+    const auto &req = ref_cast<MClientRequest>(m);
+    if (mds->is_reconnect() || mds->get_want_state() == CEPH_MDS_STATE_RECONNECT) {
+      Session *session = mds->get_session(req);
+      // No session, or a non-open session that we are not configured to
+      // tolerate: drop the request.
+      if (!session || (!session->is_open() && !sessionclosed_isok)) {
+	dout(5) << "session is closed, dropping " << req->get_reqid() << dendl;
+	return;
+      }
+      bool queue_replay = false;
+      if (req->is_replay() || req->is_async()) {
+	dout(3) << "queuing replayed op" << dendl;
+	queue_replay = true;
+	// The request names an inode number and has not completed yet:
+	// record that ino with the cache as a replay allocation.
+	if (req->head.ino &&
+	    !session->have_completed_request(req->get_reqid().tid, nullptr)) {
+	  inodeno_t ino(req->head.ino);
+	  mdcache->add_replay_ino_alloc(ino);
+	  if (replay_unsafe_with_closed_session &&
+	      session->free_prealloc_inos.contains(ino)) {
+	    // don't purge inodes that will be created by later replay
+	    session->free_prealloc_inos.erase(ino);
+	    session->delegated_inos.insert(ino);
+	  }
+	}
+      } else if (req->get_retry_attempt()) {
+	// Process completed requests in the clientreplay stage. A completed
+	// request might have created a new file/directory. This guarantees the
+	// MDS sends a reply to the client before another request modifies the
+	// new file/directory.
+	if (session->have_completed_request(req->get_reqid().tid, NULL)) {
+	  dout(3) << "queuing completed op" << dendl;
+	  queue_replay = true;
+	}
+	// this request was created before the cap reconnect message, drop any embedded
+	// cap releases.
+	req->releases.clear();
+      }
+      if (queue_replay) {
+	req->mark_queued_for_replay();
+	mds->enqueue_replay(new C_MDS_RetryMessage(mds, m));
+	return;
+      }
+    }
+
+    // Otherwise park the request until the MDS is active, unless the MDS is
+    // stopping or this is a replay-queued request being run in clientreplay.
+    bool wait_for_active = true;
+    if (mds->is_stopping()) {
+      wait_for_active = false;
+    } else if (mds->is_clientreplay()) {
+      if (req->is_queued_for_replay()) {
+	wait_for_active = false;
+      }
+    }
+    if (wait_for_active) {
+      dout(3) << "not active yet, waiting" << dendl;
+      mds->wait_for_active(new C_MDS_RetryMessage(mds, m));
+      return;
+    }
+  }
+
+  switch (m->get_type()) {
+  case CEPH_MSG_CLIENT_SESSION:
+    handle_client_session(ref_cast<MClientSession>(m));
+    return;
+  case CEPH_MSG_CLIENT_REQUEST:
+    handle_client_request(ref_cast<MClientRequest>(m));
+    return;
+  case CEPH_MSG_CLIENT_RECLAIM:
+    handle_client_reclaim(ref_cast<MClientReclaim>(m));
+    return;
+  case MSG_MDS_PEER_REQUEST:
+    handle_peer_request(ref_cast<MMDSPeerRequest>(m));
+    return;
+  default:
+    // Unknown message type: log it and abort.
+    derr << "Server unknown message " << m->get_type() << " from peer type " << m->get_connection()->get_peer_type() << dendl;
+    ceph_abort_msg("server unknown message  " + to_string(m->get_type()) + " from peer type " + to_string(m->get_connection()->get_peer_type()));  
+  }
+}
+
+
+
+// ----------------------------------------------------------
+// SESSION management
+
+/**
+ * Journal completion for a session open/close event.
+ *
+ * On finish (which must be given r == 0) it calls Server::_session_logged()
+ * with the values captured at construction, then completes the optional
+ * follow-up context `fin` with the same result.
+ */
+class C_MDS_session_finish : public ServerLogContext {
+  Session *session;
+  uint64_t state_seq;
+  bool open;
+  version_t cmapv;
+  interval_set<inodeno_t> inos_to_free;
+  version_t inotablev = 0;
+  interval_set<inodeno_t> inos_to_purge;
+  LogSegment *ls = nullptr;
+  Context *fin;
+
+public:
+  // Variant with no inode ranges: both interval sets stay empty, inotablev
+  // stays 0 and ls stays null.
+  C_MDS_session_finish(Server *srv, Session *se, uint64_t sseq, bool s, version_t mv,
+                       Context *fin_ = nullptr)
+    : ServerLogContext(srv),
+      session(se),
+      state_seq(sseq),
+      open(s),
+      cmapv(mv),
+      fin(fin_) {}
+
+  // Variant that also carries inode ranges to free/purge, the inotable
+  // version and the log segment.
+  C_MDS_session_finish(Server *srv, Session *se, uint64_t sseq, bool s, version_t mv,
+                       const interval_set<inodeno_t>& to_free, version_t iv,
+                       const interval_set<inodeno_t>& to_purge, LogSegment *_ls,
+                       Context *fin_ = nullptr)
+    : ServerLogContext(srv),
+      session(se),
+      state_seq(sseq),
+      open(s),
+      cmapv(mv),
+      inos_to_free(to_free),
+      inotablev(iv),
+      inos_to_purge(to_purge),
+      ls(_ls),
+      fin(fin_) {}
+
+  void finish(int r) override {
+    ceph_assert(r == 0);
+    server->_session_logged(session, state_seq, open, cmapv,
+                            inos_to_free, inotablev, inos_to_purge, ls);
+    if (fin != nullptr)
+      fin->complete(r);
+  }
+};
+
+// Scan the session map for a session whose client metadata carries the given
+// "uuid". When two sessions match, they must be linked through
+// reclaiming_from, and the one that has reclaiming_from set is returned.
+// Returns nullptr when no session matches.
+Session* Server::find_session_by_uuid(std::string_view uuid)
+{
+  Session* session = nullptr;
+  for (auto& it : mds->sessionmap.get_sessions()) {
+    auto& metadata = it.second->info.client_metadata;
+
+    auto p = metadata.find("uuid");
+    if (p == metadata.end())
+      continue;
+    if (p->second != uuid)
+      continue;
+
+    // First match: remember it and keep scanning.
+    if (session == nullptr) {
+      session = it.second;
+      continue;
+    }
+
+    // Second match while the remembered one is already the reclaimer.
+    if (session->reclaiming_from) {
+      assert(session->reclaiming_from == it.second);
+      continue;
+    }
+
+    // Otherwise the new match must be the reclaimer; prefer it.
+    assert(it.second->reclaiming_from == session);
+    session = it.second;
+  }
+  return session;
+}
+
+// Start reclaiming the session that advertises the uuid carried by @m on
+// behalf of @session.
+//
+// The request is dropped without a reply when @session is neither open nor
+// stale. Otherwise an MClientReclaimReply is sent to @session:
+//   -CEPHFS_EINVAL  the message has no uuid, or its flags are anything other
+//                   than CEPH_RECLAIM_RESET
+//   -CEPHFS_EPERM   a session with that uuid exists but its auth_name differs
+//                   from the requester's
+//   0               otherwise; the reply is sent by finish_reclaim_session().
+void Server::reclaim_session(Session *session, const cref_t<MClientReclaim> &m)
+{
+  if (!session->is_open() && !session->is_stale()) {
+    dout(10) << "session not open, dropping this req" << dendl;
+    return;
+  }
+
+  auto reply = make_message<MClientReclaimReply>(0);
+  if (m->get_uuid().empty()) {
+    dout(10) << __func__ << " invalid message (no uuid)" << dendl;
+    reply->set_result(-CEPHFS_EINVAL);
+    mds->send_message_client(reply, session);
+    return;
+  }
+
+  unsigned flags = m->get_flags();
+  if (flags != CEPH_RECLAIM_RESET) { // currently only support reset
+    dout(10) << __func__ << " unsupported flags" << dendl;
+    reply->set_result(-CEPHFS_EINVAL);
+    mds->send_message_client(reply, session);
+    return;
+  }
+
+  Session* target = find_session_by_uuid(m->get_uuid());
+  if (target) {
+    if (session->info.auth_name != target->info.auth_name) {
+      dout(10) << __func__ << " session auth_name " << session->info.auth_name
+	       << " != target auth_name " << target->info.auth_name << dendl;
+      reply->set_result(-CEPHFS_EPERM);
+      mds->send_message_client(reply, session);
+      // The request has been refused: stop here so that the target session
+      // is not reclaimed anyway and the reply is not sent a second time.
+      return;
+    }
+
+    assert(!target->reclaiming_from);
+    assert(!session->reclaiming_from);
+    session->reclaiming_from = target;
+    reply->set_addrs(entity_addrvec_t(target->info.inst.addr));
+  }
+
+  if (flags & CEPH_RECLAIM_RESET) {
+    finish_reclaim_session(session, reply);
+  } else ceph_assert(0); /* no other flags are handled at this time */
+}
+
+// Complete a reclaim started by reclaim_session().
+//
+// If @session has a reclaim target, the link is cleared and the target is
+// either killed or evicted; @reply (when given) is sent from a completion
+// context handed to that kill/evict call. If there is no target, @reply
+// (when given) is sent right away.
+void Server::finish_reclaim_session(Session *session, const ref_t<MClientReclaimReply> &reply)
+{
+  Session *target = session->reclaiming_from;
+  if (!target) {
+    if (reply) {
+      mds->send_message_client(reply, session);
+    }
+    return;
+  }
+
+  session->reclaiming_from = nullptr;
+
+  Context *send_reply = nullptr;
+  if (reply) {
+    // Capture the client id rather than the Session pointer; the session is
+    // looked up again when the context runs and the reply is skipped if it
+    // is no longer in the session map.
+    int64_t session_id = session->get_client().v;
+    send_reply = new LambdaContext([this, session_id, reply](int r) {
+      assert(ceph_mutex_is_locked_by_me(mds->mds_lock));
+      Session *requester = mds->sessionmap.get_session(entity_name_t::CLIENT(session_id));
+      if (!requester) {
+        return;
+      }
+      auto epoch = mds->objecter->with_osdmap([](const OSDMap &map){ return map.get_epoch(); });
+      reply->set_epoch(epoch);
+      mds->send_message_client(reply, requester);
+    });
+  }
+
+  bool blocklisted = mds->objecter->with_osdmap([target](const OSDMap &map) {
+    return map.is_blocklisted(target->info.inst.addr);
+  });
+
+  // Evict only when the target is not blocklisted yet and
+  // mds_session_blocklist_on_evict is set; otherwise just kill the session.
+  if (!blocklisted && g_conf()->mds_session_blocklist_on_evict) {
+    CachedStackStringStream css;
+    mds->evict_client(target->get_client().v, false, true, *css, send_reply);
+  } else {
+    kill_session(target, send_reply);
+  }
+}
+
+// Entry point for MClientReclaim messages: validate the sender, defer the
+// message until the MDS has reached clientreplay, then either finish or
+// start a reclaim depending on FLAG_FINISH.
+void Server::handle_client_reclaim(const cref_t<MClientReclaim> &m)
+{
+  Session *session = mds->get_session(m);
+  dout(3) << __func__ <<  " " << *m << " from " << m->get_source() << dendl;
+  ceph_assert(m->is_a_client()); // should _not_ come from an mds!
+
+  if (session == nullptr) {
+    dout(0) << " ignoring sessionless msg " << *m << dendl;
+    return;
+  }
+
+  // Drop the message when the session is not capable of reading this
+  // file system.
+  std::string_view fs_name = mds->get_fs_name();
+  const bool fs_allowed =
+    fs_name.empty() || session->fs_name_capable(fs_name, MAY_READ);
+  if (!fs_allowed) {
+    dout(0) << " dropping message not allowed for this fs_name: " << *m << dendl;
+    return;
+  }
+
+  // Not in clientreplay yet: queue the message for retry.
+  if (mds->get_state() < MDSMap::STATE_CLIENTREPLAY) {
+    mds->wait_for_replay(new C_MDS_RetryMessage(mds, m));
+    return;
+  }
+
+  const bool finishing = (m->get_flags() & MClientReclaim::FLAG_FINISH) != 0;
+  if (!finishing) {
+    reclaim_session(session, m);
+    return;
+  }
+  finish_reclaim_session(session);
+}
+
+// Handle an MClientSession message from a client.
+//
+// Messages without a session, or from a session that is not capable of
+// reading this file system, are answered with CEPH_SESSION_REJECT. Depending
+// on the op, the message may first be queued for retry until the MDS is far
+// enough along (active for close requests, clientreplay for everything
+// except renewcaps). The op is then processed:
+//   REQUEST_OPEN        validate the client and journal an ESession open
+//   REQUEST_RENEWCAPS   touch the session (reviving a stale one) and reply
+//   REQUEST_CLOSE       check the push seq and journal the close
+//   FLUSHMSG_ACK        finish the matching flush waiters
+//   REQUEST_FLUSH_MDLOG flush the MDS log when active
+//   anything else       reject and evict the client
+void Server::handle_client_session(const cref_t<MClientSession> &m)
+{
+  version_t pv;
+  Session *session = mds->get_session(m);
+
+  dout(3) << "handle_client_session " << *m << " from " << m->get_source() << dendl;
+  ceph_assert(m->is_a_client()); // should _not_ come from an mds!
+
+  if (!session) {
+    dout(0) << " ignoring sessionless msg " << *m << dendl;
+    auto reply = make_message<MClientSession>(CEPH_SESSION_REJECT);
+    reply->metadata["error_string"] = "sessionless";
+    mds->send_message(reply, m->get_connection());
+    return;
+  }
+
+  std::string_view fs_name = mds->get_fs_name();
+  if (!fs_name.empty() && !session->fs_name_capable(fs_name, MAY_READ)) {
+    dout(0) << " dropping message not allowed for this fs_name: " << *m << dendl;
+    auto reply = make_message<MClientSession>(CEPH_SESSION_REJECT);
+    reply->metadata["error_string"] = "client doesn't have caps for FS \"" +
+				      std::string(fs_name) + "\"";
+    mds->send_message(std::move(reply), m->get_connection());
+    return;
+  }
+
+  if (m->get_op() == CEPH_SESSION_REQUEST_RENEWCAPS) {
+    // always handle renewcaps (state >= MDSMap::STATE_RECONNECT)
+  } else if (m->get_op() == CEPH_SESSION_REQUEST_CLOSE) {
+    // close requests need to be handled when mds is active
+    if (mds->get_state() < MDSMap::STATE_ACTIVE) {
+      mds->wait_for_active(new C_MDS_RetryMessage(mds, m));
+      return;
+    }
+  } else {
+    // every other op waits until the MDS has reached clientreplay
+    if (mds->get_state() < MDSMap::STATE_CLIENTREPLAY) {
+      mds->wait_for_replay(new C_MDS_RetryMessage(mds, m));
+      return;
+    }
+  }
+
+  if (logger)
+    logger->inc(l_mdss_handle_client_session);
+
+  uint64_t sseq = 0;
+  switch (m->get_op()) {
+  case CEPH_SESSION_REQUEST_OPEN:
+    // An open request is only processed for a closed or closing session;
+    // in any other state (or while sessions are being terminated) it is
+    // dropped, after optionally re-sending the OPEN reply to a client that
+    // supports CEPHFS_FEATURE_NOTIFY_SESSION_STATE.
+    if (session->is_opening() ||
+	session->is_open() ||
+	session->is_stale() ||
+	session->is_killing() ||
+	terminating_sessions) {
+      if (m->supported_features.test(CEPHFS_FEATURE_NOTIFY_SESSION_STATE)) {
+	if (session->is_open() && !mds->is_stopping()) {
+          dout(10) << "currently already opened" << dendl;
+
+          auto reply = make_message<MClientSession>(CEPH_SESSION_OPEN,
+                                                    session->get_push_seq());
+          if (session->info.has_feature(CEPHFS_FEATURE_MIMIC))
+            reply->supported_features = supported_features;
+          mds->send_message_client(reply, session);
+          if (mdcache->is_readonly()) {
+            auto m = make_message<MClientSession>(CEPH_SESSION_FORCE_RO);
+            mds->send_message_client(m, session);
+          }
+	}
+      }
+      dout(10) << "currently " << session->get_state_name()
+               << ", dropping this req" << dendl;
+      return;
+    }
+    ceph_assert(session->is_closed() || session->is_closing());
+
+    if (mds->is_stopping()) {
+      dout(10) << "mds is stopping, dropping open req" << dendl;
+      return;
+    }
+
+    {
+      auto& addr = session->info.inst.addr;
+      session->set_client_metadata(client_metadata_t(m->metadata, m->supported_features, m->metric_spec));
+      auto& client_metadata = session->info.client_metadata;
+
+      // Emits one debug line describing the outcome of this open request
+      // (address, elapsed and throttled time, status, optional error and
+      // the client's claimed root).
+      auto log_session_status = [this, m, session](std::string_view status, std::string_view err) {
+        auto now = ceph_clock_now();
+        auto throttle_elapsed = m->get_recv_complete_stamp() - m->get_throttle_stamp();
+        auto elapsed = now - m->get_recv_stamp();
+        CachedStackStringStream css;
+        *css << "New client session:"
+             << " addr=\"" <<  session->info.inst.addr << "\""
+             << ",elapsed=" << elapsed
+             << ",throttled=" << throttle_elapsed
+             << ",status=\"" << status << "\"";
+        if (!err.empty()) {
+          *css << ",error=\"" << err << "\"";
+        }
+        const auto& metadata = session->info.client_metadata;
+        if (auto it = metadata.find("root"); it != metadata.end()) {
+          *css << ",root=\"" << it->second << "\"";
+        }
+        dout(2) << css->strv() << dendl;
+      };
+
+      // Sends CEPH_SESSION_REJECT to the client (with the error string only
+      // if the client has CEPHFS_FEATURE_MIMIC) and logs the rejection.
+      auto send_reject_message = [this, &session, &log_session_status](std::string_view err_str, unsigned flags=0) {
+	auto m = make_message<MClientSession>(CEPH_SESSION_REJECT, 0, flags);
+	if (session->info.has_feature(CEPHFS_FEATURE_MIMIC))
+	  m->metadata["error_string"] = err_str;
+	mds->send_message_client(m, session);
+        log_session_status("REJECTED", err_str);
+      };
+
+      bool blocklisted = mds->objecter->with_osdmap(
+	  [&addr](const OSDMap &osd_map) -> bool {
+	    return osd_map.is_blocklisted(addr);
+	  });
+
+      if (blocklisted) {
+	dout(10) << "rejecting blocklisted client " << addr << dendl;
+	// This goes on the wire and the "blacklisted" substring is
+	// depended upon by the kernel client for detecting whether it
+	// has been blocklisted.  If mounted with recover_session=clean
+	// (since 5.4), it tries to automatically recover itself from
+	// blocklisting.
+        unsigned flags = 0;
+	flags |= MClientSession::SESSION_BLOCKLISTED;
+	send_reject_message("blocklisted (blacklisted)", flags);
+	session->clear();
+	break;
+      }
+
+      if (client_metadata.features.empty())
+	infer_supported_features(session, client_metadata);
+
+      dout(20) << __func__ << " CEPH_SESSION_REQUEST_OPEN metadata entries:" << dendl;
+      dout(20) << " features: '" << client_metadata.features << "'" << dendl;
+      dout(20) << " metric specification: [" << client_metadata.metric_spec << "]" << dendl;
+      for (const auto& p : client_metadata) {
+	dout(20) << "  " << p.first << ": " << p.second << dendl;
+      }
+
+      // Reject clients that lack any of the required features.
+      feature_bitset_t missing_features = required_client_features;
+      missing_features -= client_metadata.features;
+      if (!missing_features.empty()) {
+	CachedStackStringStream css;
+	*css << "missing required features '" << missing_features << "'";
+	send_reject_message(css->strv());
+	mds->clog->warn() << "client session (" << session->info.inst
+                          << ") lacks required features " << missing_features
+                          << "; client supports " << client_metadata.features;
+	session->clear();
+	break;
+      }
+
+      // Special case for the 'root' metadata path; validate that the claimed
+      // root is actually within the caps of the session
+      if (auto it = client_metadata.find("root"); it != client_metadata.end()) {
+	auto claimed_root = it->second;
+	CachedStackStringStream css;
+	bool denied = false;
+	// claimed_root has a leading "/" which we strip before passing
+	// into caps check
+	if (claimed_root.empty() || claimed_root[0] != '/') {
+	  denied = true;
+	  // NOTE(review): "invalue" looks like a typo for "invalid"; the text
+	  // is sent to the client and the cluster log, so it is left as is.
+	  *css << "invalue root '" << claimed_root << "'";
+	} else if (!session->auth_caps.path_capable(claimed_root.substr(1))) {
+	  denied = true;
+	  *css << "non-allowable root '" << claimed_root << "'";
+	}
+
+	if (denied) {
+	  // Tell the client we're rejecting their open
+	  send_reject_message(css->strv());
+	  mds->clog->warn() << "client session with " << css->strv()
+			    << " denied (" << session->info.inst << ")";
+	  session->clear();
+	  break;
+	}
+      }
+
+      // Reject a uuid that another session already advertises.
+      if (auto it = client_metadata.find("uuid"); it != client_metadata.end()) {
+	if (find_session_by_uuid(it->second)) {
+	  send_reject_message("duplicated session uuid");
+	  mds->clog->warn() << "client session with duplicated session uuid '"
+			    << it->second << "' denied (" << session->info.inst << ")";
+	  session->clear();
+	  break;
+	}
+      }
+
+      if (session->is_closed()) {
+        mds->sessionmap.add_session(session);
+      }
+
+      // Move the session to OPENING and journal the open; the ACCEPTED log
+      // line is emitted from the journal completion.
+      pv = mds->sessionmap.mark_projected(session);
+      sseq = mds->sessionmap.set_state(session, Session::STATE_OPENING);
+      mds->sessionmap.touch_session(session);
+      auto fin = new LambdaContext([log_session_status = std::move(log_session_status)](int r){
+        ceph_assert(r == 0);
+        log_session_status("ACCEPTED", "");
+      });
+      mdlog->start_submit_entry(new ESession(m->get_source_inst(), true, pv, client_metadata),
+				new C_MDS_session_finish(this, session, sseq, true, pv, fin));
+      mdlog->flush();
+    }
+    break;
+
+  case CEPH_SESSION_REQUEST_RENEWCAPS:
+    if (session->is_open() || session->is_stale()) {
+      mds->sessionmap.touch_session(session);
+      // A stale session is moved back to OPEN and its stale caps resumed.
+      if (session->is_stale()) {
+	mds->sessionmap.set_state(session, Session::STATE_OPEN);
+	mds->locker->resume_stale_caps(session);
+	mds->sessionmap.touch_session(session);
+      }
+      auto reply = make_message<MClientSession>(CEPH_SESSION_RENEWCAPS, m->get_seq());
+      mds->send_message_client(reply, session);
+    } else {
+      dout(10) << "ignoring renewcaps on non open|stale session (" << session->get_state_name() << ")" << dendl;
+    }
+    break;
+    
+  case CEPH_SESSION_REQUEST_CLOSE:
+    {
+      if (session->is_closed() || 
+	  session->is_closing() ||
+	  session->is_killing()) {
+	dout(10) << "already closed|closing|killing, dropping this req" << dendl;
+	return;
+      }
+      if (session->is_importing()) {
+	dout(10) << "ignoring close req on importing session" << dendl;
+	return;
+      }
+      ceph_assert(session->is_open() || 
+	     session->is_stale() || 
+	     session->is_opening());
+      // A close carrying a push seq older than ours is dropped.
+      if (m->get_seq() < session->get_push_seq()) {
+	dout(10) << "old push seq " << m->get_seq() << " < " << session->get_push_seq() 
+		 << ", dropping" << dendl;
+	return;
+      }
+      // We are getting a seq that is higher than expected.
+      // Handle the same as any other seqn error.
+      //
+      if (m->get_seq() != session->get_push_seq()) {
+	dout(0) << "old push seq " << m->get_seq() << " != " << session->get_push_seq()
+		<< ", BUGGY!" << dendl;
+	mds->clog->warn() << "incorrect push seq " << m->get_seq() << " != "
+			  << session->get_push_seq() << ", dropping" << " from client : " << session->get_human_name();
+	return;
+      }
+      journal_close_session(session, Session::STATE_CLOSING, NULL);
+    }
+    break;
+
+  case CEPH_SESSION_FLUSHMSG_ACK:
+    finish_flush_session(session, m->get_seq());
+    break;
+
+  case CEPH_SESSION_REQUEST_FLUSH_MDLOG:
+    if (mds->is_active())
+      mdlog->flush();
+    break;
+
+  default:
+    // Unknown op: reject, then evict the client.
+    // NOTE(review): the local `m` declared below shadows the incoming
+    // message, so the log line prints the type of the REJECT reply rather
+    // than anything read from the received message (such as its op).
+    auto m = make_message<MClientSession>(CEPH_SESSION_REJECT);
+    mds->send_message_client(m, session);
+    derr << "Server received unknown message " << m->get_type() << ", closing session and blocklisting the client " << session->get_client() << dendl;
+    CachedStackStringStream css;
+    mds->evict_client(session->get_client().v, false, true, *css, nullptr);
+  }
+}
+
+// Send CEPH_SESSION_FLUSHMSG to one session and add a sub-context of
+// @gather as its flush waiter. Does nothing unless the session is open and
+// has a connection with CEPH_FEATURE_EXPORT_PEER.
+void Server::flush_session(Session *session, MDSGatherBuilder& gather) {
+  if (!session->is_open())
+    return;
+  if (!session->get_connection())
+    return;
+  if (!session->get_connection()->has_feature(CEPH_FEATURE_EXPORT_PEER))
+    return;
+
+  version_t flush_seq = session->wait_for_flush(gather.new_sub());
+  auto flushmsg = make_message<MClientSession>(CEPH_SESSION_FLUSHMSG, flush_seq);
+  mds->send_message_client(flushmsg, session);
+}
+
+// Run flush_session() for every client in @client_set; each client must
+// have a session in the session map.
+void Server::flush_client_sessions(set<client_t>& client_set, MDSGatherBuilder& gather)
+{
+  for (const auto& who : client_set) {
+    const auto name = entity_name_t::CLIENT(who.v);
+    Session *session = mds->sessionmap.get_session(name);
+    ceph_assert(session);
+    flush_session(session, gather);
+  }
+}
+
+// Collect the contexts the session hands back from finish_flush() for @seq
+// and queue them on the MDS.
+void Server::finish_flush_session(Session *session, version_t seq)
+{
+  MDSContext::vec flush_waiters;
+  session->finish_flush(seq, flush_waiters);
+  mds->queue_waiters(flush_waiters);
+}
+
+/**
+ * Apply a session open/close event that has been journaled.
+ *
+ * For a close (open == false) the inode ranges carried by the event are
+ * purged / released first.  The state transition itself is only applied
+ * when the session's current state_seq still equals the journaled one;
+ * otherwise it is logged as a no-op.
+ *
+ * @param session       session the event refers to
+ * @param state_seq     session state sequence recorded with the event
+ * @param open          true for an open event, false for a close/kill
+ * @param pv            sessionmap version of the event (only logged here)
+ * @param inos_to_free  inode numbers to release back to the inotable
+ * @param piv           inotable version expected after that release
+ * @param inos_to_purge inode numbers to hand to MDCache::purge_inodes()
+ * @param ls            log segment that records the purging inodes; must be
+ *                      non-null when inos_to_purge is non-empty
+ */
+void Server::_session_logged(Session *session, uint64_t state_seq, bool open, version_t pv,
+			     const interval_set<inodeno_t>& inos_to_free, version_t piv,
+			     const interval_set<inodeno_t>& inos_to_purge, LogSegment *ls)
+{
+  dout(10) << "_session_logged " << session->info.inst
+	   << " state_seq " << state_seq
+	   << " " << (open ? "open":"close") << " " << pv
+	   << " inos_to_free " << inos_to_free << " inotablev " << piv
+	   << " inos_to_purge " << inos_to_purge << dendl;
+
+  // close event: settle the preallocated inode ranges before touching state
+  if (!open) {
+    if (inos_to_purge.size()){
+      ceph_assert(ls);
+      session->info.prealloc_inos.subtract(inos_to_purge);
+      ls->purging_inodes.insert(inos_to_purge);
+      // the purge is only started in these MDS states; the range stays
+      // recorded in the log segment either way
+      if (mds->is_clientreplay() || mds->is_active() || mds->is_stopping())
+	mdcache->purge_inodes(inos_to_purge, ls);
+    }
+
+    if (inos_to_free.size()) {
+      ceph_assert(piv);
+      ceph_assert(session->is_closing() || session->is_killing() ||
+	  session->is_opening()); // re-open closing session
+      session->info.prealloc_inos.subtract(inos_to_free);
+      mds->inotable->apply_release_ids(inos_to_free);
+      ceph_assert(mds->inotable->get_version() == piv);
+    }
+    // whatever is left preallocated is free again; delegations are dropped
+    session->free_prealloc_inos = session->info.prealloc_inos;
+    session->delegated_inos.clear();
+  }
+
+  mds->sessionmap.mark_dirty(session);
+
+  // apply
+  if (session->get_state_seq() != state_seq) {
+    dout(10) << " journaled state_seq " << state_seq << " != current " << session->get_state_seq()
+	     << ", noop" << dendl;
+    // close must have been canceled (by an import?), or any number of other things..
+  } else if (open) {
+    // OPENING -> OPEN, then tell the client
+    ceph_assert(session->is_opening());
+    mds->sessionmap.set_state(session, Session::STATE_OPEN);
+    mds->sessionmap.touch_session(session);
+    metrics_handler->add_session(session);
+    ceph_assert(session->get_connection());
+    auto reply = make_message<MClientSession>(CEPH_SESSION_OPEN);
+    // feature/metric info is only attached for clients with the MIMIC feature
+    if (session->info.has_feature(CEPHFS_FEATURE_MIMIC)) {
+      reply->supported_features = supported_features;
+      reply->metric_spec = supported_metric_spec;
+    }
+    mds->send_message_client(reply, session);
+    if (mdcache->is_readonly()) {
+      auto m = make_message<MClientSession>(CEPH_SESSION_FORCE_RO);
+      mds->send_message_client(m, session);
+    }
+  } else if (session->is_closing() ||
+	     session->is_killing()) {
+    // kill any lingering capabilities, leases, requests
+    bool killing = session->is_killing();
+    // each loop relies on the remove call taking the front element off the list
+    while (!session->caps.empty()) {
+      Capability *cap = session->caps.front();
+      CInode *in = cap->get_inode();
+      dout(20) << " killing capability " << ccap_string(cap->issued()) << " on " << *in << dendl;
+      mds->locker->remove_client_cap(in, cap, killing);
+    }
+    while (!session->leases.empty()) {
+      ClientLease *r = session->leases.front();
+      CDentry *dn = static_cast<CDentry*>(r->parent);
+      dout(20) << " killing client lease of " << *dn << dendl;
+      dn->remove_client_lease(r, mds->locker);
+    }
+    // the client no longer needs to be waited for during reconnect ...
+    if (client_reconnect_gather.erase(session->info.get_client())) {
+      dout(20) << " removing client from reconnect set" << dendl;
+      if (client_reconnect_gather.empty()) {
+        dout(7) << " client " << session->info.inst << " was last reconnect, finishing" << dendl;
+        reconnect_gather_finish();
+      }
+    }
+    // ... nor during reclaim
+    if (client_reclaim_gather.erase(session->info.get_client())) {
+      dout(20) << " removing client from reclaim set" << dendl;
+      if (client_reclaim_gather.empty()) {
+        dout(7) << " client " << session->info.inst << " was last reclaimed, finishing" << dendl;
+	mds->maybe_clientreplay_done();
+      }
+    }
+    
+    if (session->is_closing()) {
+      // mark con disposable.  if there is a fault, we will get a
+      // reset and clean it up.  if the client hasn't received the
+      // CLOSE message yet, they will reconnect and get an
+      // ms_handle_remote_reset() and realize they had in fact closed.
+      // do this *before* sending the message to avoid a possible
+      // race.
+      if (session->get_connection()) {
+        // Conditional because terminate_sessions will indiscriminately
+        // put sessions in CLOSING whether they ever had a conn or not.
+        session->get_connection()->mark_disposable();
+      }
+
+      // reset session
+      mds->send_message_client(make_message<MClientSession>(CEPH_SESSION_CLOSE), session);
+      mds->sessionmap.set_state(session, Session::STATE_CLOSED);
+      session->clear();
+      metrics_handler->remove_session(session);
+      mds->sessionmap.remove_session(session);
+    } else if (session->is_killing()) {
+      // destroy session, close connection
+      // NOTE(review): the state is only set to CLOSED when a connection
+      // exists; confirm that is intended for connection-less sessions.
+      if (session->get_connection()) {
+        session->get_connection()->mark_down();
+        mds->sessionmap.set_state(session, Session::STATE_CLOSED);
+        session->set_connection(nullptr);
+      }
+      metrics_handler->remove_session(session);
+      mds->sessionmap.remove_session(session);
+    } else {
+      // the enclosing branch already established closing || killing
+      ceph_abort();
+    }
+  } else {
+    // state_seq matched but the session is in none of the expected states
+    ceph_abort();
+  }
+}
+
+/**
+ * Project sessions that originate from something other than a live
+ * client connection, for example:
+ *  - sessions inferred from journal replay
+ *  - sessions learned from other MDSs during rejoin
+ *  - sessions learned from other MDSs during dir/caps migration
+ *  - sessions learned from other MDSs during a cross-MDS rename
+ *
+ * Clients whose address is blocklisted in the OSDMap are removed from both
+ * @p cm and @p cmm first.  For each remaining client, @p smap receives the
+ * session together with the state_seq of its move to OPENING, or 0 when the
+ * session was already open/opening/stale.  Every listed session has its
+ * importing count incremented.
+ *
+ * @return the last projected sessionmap version
+ */
+version_t Server::prepare_force_open_sessions(map<client_t,entity_inst_t>& cm,
+                                              map<client_t,client_metadata_t>& cmm,
+                                              map<client_t, pair<Session*,uint64_t> >& smap)
+{
+  version_t pv = mds->sessionmap.get_projected();
+
+  dout(10) << "prepare_force_open_sessions " << pv
+           << " on " << cm.size() << " clients"
+           << dendl;
+
+  // filter out blocklisted clients
+  mds->objecter->with_osdmap(
+      [this, &cm, &cmm](const OSDMap &osd_map) {
+        auto it = cm.begin();
+        while (it != cm.end()) {
+          if (!osd_map.is_blocklisted(it->second.addr)) {
+            ++it;
+            continue;
+          }
+          dout(10) << " ignoring blocklisted client." << it->first
+                   << " (" <<  it->second.addr << ")" << dendl;
+          cmm.erase(it->first);
+          it = cm.erase(it);
+        }
+      });
+
+  for (const auto& entry : cm) {
+    const client_t& client = entry.first;
+    Session *session = mds->sessionmap.get_or_add_session(entry.second);
+    pv = mds->sessionmap.mark_projected(session);
+
+    // 0 means "no state change was needed for this session"
+    uint64_t sseq = 0;
+    if (session->is_closed() ||
+        session->is_closing() ||
+        session->is_killing()) {
+      sseq = mds->sessionmap.set_state(session, Session::STATE_OPENING);
+      auto md = cmm.find(client);
+      if (md != cmm.end())
+        session->info.client_metadata.merge(md->second);
+    } else {
+      ceph_assert(session->is_open() ||
+                  session->is_opening() ||
+                  session->is_stale());
+    }
+
+    smap[client] = make_pair(session, sseq);
+    session->inc_importing();
+  }
+  return pv;
+}
+
+/**
+ * Complete the force-open of the sessions listed in @p smap.
+ *
+ * Each entry pairs a session with a state_seq; a value of 0 marks a session
+ * that needed no state change.  A session is moved to OPEN (and sent
+ * CEPH_SESSION_OPEN) only while its state_seq still equals the recorded one.
+ * Every session is marked dirty, and its importing count is decremented when
+ * @p dec_import is set.
+ */
+void Server::finish_force_open_sessions(const map<client_t,pair<Session*,uint64_t> >& smap,
+                                        bool dec_import)
+{
+  /*
+   * FIXME: the races between a client trying to close a session and an
+   * MDS import trying to force that session open still need careful
+   * consideration.
+   */
+  dout(10) << "finish_force_open_sessions on " << smap.size() << " clients,"
+           << " initial v " << mds->sessionmap.get_version() << dendl;
+
+  for (const auto& entry : smap) {
+    Session *session = entry.second.first;
+    const uint64_t recorded_seq = entry.second.second;
+
+    if (recorded_seq == 0) {
+      dout(10) << "force_open_sessions skipping already-open " << session->info.inst << dendl;
+      ceph_assert(session->is_open() || session->is_stale());
+    } else if (session->get_state_seq() != recorded_seq) {
+      dout(10) << "force_open_sessions skipping changed " << session->info.inst << dendl;
+    } else {
+      dout(10) << "force_open_sessions opened " << session->info.inst << dendl;
+      mds->sessionmap.set_state(session, Session::STATE_OPEN);
+      mds->sessionmap.touch_session(session);
+      metrics_handler->add_session(session);
+
+      auto open_msg = make_message<MClientSession>(CEPH_SESSION_OPEN);
+      if (session->info.has_feature(CEPHFS_FEATURE_MIMIC)) {
+        open_msg->supported_features = supported_features;
+        open_msg->metric_spec = supported_metric_spec;
+      }
+      mds->send_message_client(open_msg, session);
+
+      if (mdcache->is_readonly()) {
+        mds->send_message_client(make_message<MClientSession>(CEPH_SESSION_FORCE_RO), session);
+      }
+    }
+
+    if (dec_import)
+      session->dec_importing();
+
+    mds->sessionmap.mark_dirty(session);
+  }
+
+  dout(10) << __func__ << ": final v " << mds->sessionmap.get_version() << dendl;
+}
+
+/// Completion that clears Server::terminating_sessions when it finishes.
+class C_MDS_TerminatedSessions : public ServerContext {
+public:
+  explicit C_MDS_TerminatedSessions(Server *s) : ServerContext(s) {}
+
+private:
+  void finish(int r) override {
+    server->terminating_sessions = false;
+  }
+};
+
+/**
+ * Journal a close for every client session that is not already closing,
+ * killing or closed.  terminating_sessions is set for the duration and is
+ * reset by a C_MDS_TerminatedSessions context handed to
+ * mdlog->wait_for_safe().
+ */
+void Server::terminate_sessions()
+{
+  dout(5) << "terminating all sessions..." << dendl;
+
+  terminating_sessions = true;
+
+  // kill them off.  clients will retry etc.
+  set<Session*> client_sessions;
+  mds->sessionmap.get_client_session_set(client_sessions);
+  for (Session *s : client_sessions) {
+    const bool already_going_away =
+      s->is_closing() || s->is_killing() || s->is_closed();
+    if (!already_going_away)
+      journal_close_session(s, Session::STATE_CLOSING, nullptr);
+  }
+
+  mdlog->wait_for_safe(new C_MDS_TerminatedSessions(this));
+}
+
+
+/**
+ * Scan for client sessions that stopped renewing their caps.
+ *
+ * Pass 1 walks the OPEN sessions: a session whose last cap renewal (and last
+ * message) is older than the session timeout is either marked STALE, or
+ * queued for eviction when it has exceeded the autoclose interval or carries
+ * an expired client-specified "timeout".  Pass 2 queues STALE sessions that
+ * exceeded the autoclose interval.  Queued sessions are then evicted
+ * (blocklisted when mds_session_blocklist_on_timeout is set, killed
+ * otherwise), skipping any that are being imported.
+ *
+ * Nothing is done when the MDS itself cleared its laggy state more recently
+ * than the stale cutoff.
+ */
+void Server::find_idle_sessions()
+{
+  auto now = clock::now();
+  auto last_cleared_laggy = mds->last_cleared_laggy();
+
+  dout(10) << "find_idle_sessions. last cleared laggy state " << last_cleared_laggy << "s ago" << dendl;
+
+  // timeout/stale
+  //  (caps go stale, lease die)
+  double queue_max_age = mds->get_dispatch_queue_max_age(ceph_clock_now());
+  double cutoff = queue_max_age + mds->mdsmap->get_session_timeout();
+
+  // don't kick clients if we've been laggy
+  if (last_cleared_laggy < cutoff) {
+    dout(10) << " last cleared laggy " << last_cleared_laggy << "s ago (< cutoff " << cutoff
+             << "), not marking any client stale" << dendl;
+    return;
+  }
+
+  std::vector<Session*> to_evict;
+
+  bool defer_session_stale = g_conf().get_val<bool>("mds_defer_session_stale");
+  const auto sessions_p1 = mds->sessionmap.by_state.find(Session::STATE_OPEN);
+  if (sessions_p1 != mds->sessionmap.by_state.end() && !sessions_p1->second->empty()) {
+    std::vector<Session*> new_stale;
+
+    for (auto session : *(sessions_p1->second)) {
+      auto last_cap_renew_span = std::chrono::duration<double>(now - session->last_cap_renew).count();
+      // stop at the first session that renewed recently
+      // NOTE(review): this break presumes the list is ordered oldest-renewal
+      // first -- confirm against SessionMap::touch_session()
+      if (last_cap_renew_span < cutoff) {
+        dout(20) << "laggiest active session is " << session->info.inst
+                 << " and renewed caps recently (" << last_cap_renew_span << "s ago)" << dendl;
+        break;
+      }
+
+      // a message seen after the last cap renewal also counts as activity
+      if (session->last_seen > session->last_cap_renew) {
+        last_cap_renew_span = std::chrono::duration<double>(now - session->last_seen).count();
+        if (last_cap_renew_span < cutoff) {
+          dout(20) << "laggiest active session is " << session->info.inst
+                   << " and renewed caps recently (" << last_cap_renew_span << "s ago)" << dendl;
+          continue;
+        }
+      }
+
+      if (last_cap_renew_span >= mds->mdsmap->get_session_autoclose()) {
+        dout(20) << "evicting session " << session->info.inst << " since autoclose "
+                    "has arrived" << dendl;
+        // evict session without marking it stale
+        to_evict.push_back(session);
+        continue;
+      }
+
+      if (defer_session_stale &&
+          !session->is_any_flush_waiter() &&
+          !mds->locker->is_revoking_any_caps_from(session->get_client())) {
+        dout(20) << "deferring marking session " << session->info.inst << " stale "
+                    "since it holds no caps" << dendl;
+        continue;
+      }
+
+      auto it = session->info.client_metadata.find("timeout");
+      if (it != session->info.client_metadata.end()) {
+        // client supplied its own timeout; a parsed value of 0 means "never"
+        unsigned timeout = strtoul(it->second.c_str(), nullptr, 0);
+        if (timeout == 0) {
+          dout(10) << "skipping session " << session->info.inst
+                   << ", infinite timeout specified" << dendl;
+          continue;
+        }
+        // named distinctly so it cannot be confused with the outer cutoff,
+        // which is reassigned for the autoclose pass below
+        const double client_cutoff = queue_max_age + timeout;
+        if (last_cap_renew_span < client_cutoff) {
+          dout(10) << "skipping session " << session->info.inst
+                   << ", timeout (" << timeout << ") specified"
+                   << " and renewed caps recently (" << last_cap_renew_span << "s ago)" << dendl;
+          continue;
+        }
+
+        // do not go through stale, evict it directly.
+        to_evict.push_back(session);
+      } else {
+        dout(10) << "new stale session " << session->info.inst
+                 << " last renewed caps " << last_cap_renew_span << "s ago" << dendl;
+        new_stale.push_back(session);
+      }
+    }
+
+    // state changes are applied after the walk, not while iterating by_state
+    for (auto session : new_stale) {
+      mds->sessionmap.set_state(session, Session::STATE_STALE);
+      if (mds->locker->revoke_stale_caps(session)) {
+        mds->locker->remove_stale_leases(session);
+        finish_flush_session(session, session->get_push_seq());
+        auto m = make_message<MClientSession>(CEPH_SESSION_STALE, session->get_push_seq());
+        mds->send_message_client(m, session);
+      } else {
+        // revoke_stale_caps() returned false: evict instead of keeping it stale
+        to_evict.push_back(session);
+      }
+    }
+  }
+
+  // autoclose
+  cutoff = queue_max_age + mds->mdsmap->get_session_autoclose();
+
+  // Collect a list of sessions exceeding the autoclose threshold
+  const auto sessions_p2 = mds->sessionmap.by_state.find(Session::STATE_STALE);
+  if (sessions_p2 != mds->sessionmap.by_state.end() && !sessions_p2->second->empty()) {
+    for (auto session : *(sessions_p2->second)) {
+      // ceph_assert, like every other invariant check in this file, so the
+      // check is not compiled out of NDEBUG builds
+      ceph_assert(session->is_stale());
+      auto last_cap_renew_span = std::chrono::duration<double>(now - session->last_cap_renew).count();
+      if (last_cap_renew_span < cutoff) {
+        dout(20) << "oldest stale session is " << session->info.inst
+                 << " and recently renewed caps " << last_cap_renew_span << "s ago" << dendl;
+        break;
+      }
+      to_evict.push_back(session);
+    }
+  }
+
+  for (auto session: to_evict) {
+    if (session->is_importing()) {
+      dout(10) << "skipping session " << session->info.inst << ", it's being imported" << dendl;
+      continue;
+    }
+
+    auto last_cap_renew_span = std::chrono::duration<double>(now - session->last_cap_renew).count();
+    mds->clog->warn() << "evicting unresponsive client " << *session
+                      << ", after " << last_cap_renew_span << " seconds";
+    dout(10) << "autoclosing stale session " << session->info.inst
+             << " last renewed caps " << last_cap_renew_span << "s ago" << dendl;
+
+    if (g_conf()->mds_session_blocklist_on_timeout) {
+      CachedStackStringStream css;
+      mds->evict_client(session->get_client().v, false, true, *css, nullptr);
+    } else {
+      kill_session(session, NULL);
+    }
+  }
+}
+
+/**
+ * Evict every client the Locker reports as late in responding to a cap
+ * revoke.  Does nothing while cap_revoke_eviction_timeout is zero.
+ * Successful evictions are counted in l_mdss_cap_revoke_eviction when a
+ * perf logger is present.
+ */
+void Server::evict_cap_revoke_non_responders() {
+  if (!cap_revoke_eviction_timeout) {
+    return;
+  }
+
+  auto&& late_clients = mds->locker->get_late_revoking_clients(cap_revoke_eviction_timeout);
+
+  for (auto const &who: late_clients) {
+    mds->clog->warn() << "client id " << who << " has not responded to"
+                      << " cap revoke by MDS for over " << cap_revoke_eviction_timeout
+                      << " seconds, evicting";
+    dout(1) << __func__ << ": evicting cap revoke non-responder client id "
+            << who << dendl;
+
+    CachedStackStringStream css;
+    const bool evicted = mds->evict_client(who.v, false,
+                                           g_conf()->mds_session_blocklist_on_evict,
+                                           *css, nullptr);
+    if (!evicted)
+      continue;
+    if (logger)
+      logger->inc(l_mdss_cap_revoke_eviction);
+  }
+}
+
+/**
+ * Refresh the cached copies of the configuration options named in
+ * @p changed.  Options this function does not know about are ignored.
+ */
+void Server::handle_conf_change(const std::set<std::string>& changed) {
+  const auto was_changed = [&changed](const char *option) {
+    return changed.count(option) != 0;
+  };
+
+  if (was_changed("mds_forward_all_requests_to_auth")) {
+    forward_all_requests_to_auth = g_conf().get_val<bool>("mds_forward_all_requests_to_auth");
+  }
+
+  if (was_changed("mds_cap_revoke_eviction_timeout")) {
+    cap_revoke_eviction_timeout = g_conf().get_val<double>("mds_cap_revoke_eviction_timeout");
+    dout(20) << __func__ << " cap revoke eviction timeout changed to "
+             << cap_revoke_eviction_timeout << dendl;
+  }
+
+  if (was_changed("mds_recall_max_decay_rate")) {
+    // the throttle is rebuilt from scratch with the new rate
+    recall_throttle = DecayCounter(g_conf().get_val<double>("mds_recall_max_decay_rate"));
+  }
+
+  if (was_changed("mds_max_snaps_per_dir")) {
+    max_snaps_per_dir = g_conf().get_val<uint64_t>("mds_max_snaps_per_dir");
+    dout(20) << __func__ << " max snapshots per directory changed to "
+             << max_snaps_per_dir << dendl;
+  }
+
+  if (was_changed("mds_client_delegate_inos_pct")) {
+    delegate_inos_pct = g_conf().get_val<uint64_t>("mds_client_delegate_inos_pct");
+  }
+
+  if (was_changed("mds_max_caps_per_client")) {
+    max_caps_per_client = g_conf().get_val<uint64_t>("mds_max_caps_per_client");
+  }
+
+  if (was_changed("mds_session_cap_acquisition_throttle")) {
+    cap_acquisition_throttle = g_conf().get_val<uint64_t>("mds_session_cap_acquisition_throttle");
+  }
+
+  if (was_changed("mds_session_max_caps_throttle_ratio")) {
+    max_caps_throttle_ratio = g_conf().get_val<double>("mds_session_max_caps_throttle_ratio");
+  }
+
+  if (was_changed("mds_cap_acquisition_throttle_retry_request_timeout")) {
+    caps_throttle_retry_request_timeout = g_conf().get_val<double>("mds_cap_acquisition_throttle_retry_request_timeout");
+  }
+
+  if (was_changed("mds_alternate_name_max")) {
+    alternate_name_max = g_conf().get_val<Option::size_t>("mds_alternate_name_max");
+  }
+
+  if (was_changed("mds_dir_max_entries")) {
+    dir_max_entries = g_conf().get_val<uint64_t>("mds_dir_max_entries");
+    dout(20) << __func__ << " max entries per directory changed to "
+             << dir_max_entries << dendl;
+  }
+}
+
+/**
+ * Kill a client session; the caller must hold mds_lock.
+ *
+ *  - opening/open/stale and not importing: journal the close with
+ *    STATE_KILLING, passing @p on_safe along.
+ *  - already closing/killing: @p on_safe (if any) is completed once the
+ *    MDS log is safe.
+ *  - closed or importing: @p on_safe (if any) is completed immediately
+ *    with 0.
+ *
+ * XXX interface wart: @p on_safe is a plain Context rather than an
+ * MDSContext because all current callers happen to pass a SaferCond.
+ */
+void Server::kill_session(Session *session, Context *on_safe)
+{
+  ceph_assert(ceph_mutex_is_locked_by_me(mds->mds_lock));
+
+  const bool live = session->is_opening() ||
+                    session->is_open() ||
+                    session->is_stale();
+  if (live && !session->is_importing()) {
+    dout(10) << "kill_session " << session << dendl;
+    journal_close_session(session, Session::STATE_KILLING, on_safe);
+    return;
+  }
+
+  dout(10) << "kill_session importing or already closing/killing " << session << dendl;
+
+  if (session->is_closing() || session->is_killing()) {
+    if (on_safe)
+      mdlog->wait_for_safe(new MDSInternalContextWrapper(mds, on_safe));
+    return;
+  }
+
+  ceph_assert(session->is_closed() ||
+              session->is_importing());
+  if (on_safe)
+    on_safe->complete(0);
+}
+
+/**
+ * Call kill_session() on every client session whose address is blocklisted
+ * in the current OSDMap.
+ *
+ * @return the number of sessions kill_session() was called on
+ */
+size_t Server::apply_blocklist()
+{
+  const auto& all_sessions = mds->sessionmap.get_sessions();
+  std::vector<Session*> blocklisted;
+
+  mds->objecter->with_osdmap(
+    [&all_sessions, &blocklisted](const OSDMap& osdmap) {
+      for (const auto& entry : all_sessions) {
+        // The OSDMap blocklist is not applied to MDS daemons; their death
+        // is learned through the MDSMap instead.
+        if (!entry.first.is_client())
+          continue;
+        if (osdmap.is_blocklisted(entry.second->info.inst.addr))
+          blocklisted.push_back(entry.second);
+      }
+    });
+
+  // sessions are killed only after the OSDMap callback has returned
+  for (Session *s : blocklisted)
+    kill_session(s, nullptr);
+
+  const size_t killed = blocklisted.size();
+  dout(10) << "apply_blocklist: killed " << killed << dendl;
+  return killed;
+}
+
+/**
+ * Move @p session to @p state, journal an ESession close event for it and
+ * kill the requests it still has queued.
+ *
+ * The session's pending and free preallocated inode ranges are projected
+ * for release in the inotable and carried in the event together with the
+ * delegated ranges.  @p on_safe is handed to the C_MDS_session_finish
+ * completion of the log entry.
+ */
+void Server::journal_close_session(Session *session, int state, Context *on_safe)
+{
+  dout(10) << __func__ << " : "
+           << session->info.inst
+           << " pending_prealloc_inos " << session->pending_prealloc_inos
+           << " free_prealloc_inos " << session->free_prealloc_inos
+           << " delegated_inos " << session->delegated_inos << dendl;
+
+  uint64_t sseq = mds->sessionmap.set_state(session, state);
+  version_t pv = mds->sessionmap.mark_projected(session);
+
+  // release alloc and pending-alloc inos for this session
+  // and wipe out session state, in case the session close aborts for some reason
+  interval_set<inodeno_t> inos_to_free;
+  inos_to_free.insert(session->pending_prealloc_inos);
+  inos_to_free.insert(session->free_prealloc_inos);
+
+  // piv stays 0 when there is nothing to release
+  version_t piv = 0;
+  if (inos_to_free.size()) {
+    mds->inotable->project_release_ids(inos_to_free);
+    piv = mds->inotable->get_projected_version();
+  }
+
+  auto le = new ESession(session->info.inst, false, pv, inos_to_free, piv, session->delegated_inos);
+  auto fin = new C_MDS_session_finish(this, session, sseq, false, pv, inos_to_free, piv,
+                                      session->delegated_inos, mdlog->get_current_segment(), on_safe);
+  mdlog->start_submit_entry(le, fin);
+  mdlog->flush();
+
+  // clean up requests, too; request_kill() is relied on to take the
+  // request off session->requests
+  while (!session->requests.empty()) {
+    MDRequestRef mdr(*session->requests.begin());
+    mdcache->request_kill(mdr);
+  }
+
+  finish_flush_session(session, session->get_push_seq());
+}
+
+/**
+ * Begin the reconnect phase: every open client session is added to
+ * client_reconnect_gather, flagged as reconnecting and has its
+ * last_cap_renew reset to now.  @p reconnect_done_ is stored and completed
+ * through reconnect_gather_finish(), immediately if there is no open
+ * session to wait for.
+ */
+void Server::reconnect_clients(MDSContext *reconnect_done_)
+{
+  reconnect_done = reconnect_done_;
+
+  const auto start = clock::now();
+
+  set<Session*> client_sessions;
+  mds->sessionmap.get_client_session_set(client_sessions);
+  for (Session *s : client_sessions) {
+    if (!s->is_open())
+      continue;
+    client_reconnect_gather.insert(s->get_client());
+    s->set_reconnecting(true);
+    s->last_cap_renew = start;
+  }
+
+  if (client_reconnect_gather.empty()) {
+    dout(7) << "reconnect_clients -- no sessions, doing nothing." << dendl;
+    reconnect_gather_finish();
+    return;
+  }
+
+  // clients will get the mdsmap and discover we're reconnecting via the monitor.
+
+  reconnect_start = start;
+  dout(1) << "reconnect_clients -- " << client_reconnect_gather.size() << " sessions" << dendl;
+  mds->sessionmap.dump();
+}
+
+/**
+ * Handle an MClientReconnect message from a client.
+ *
+ * The message is rejected (REJECT/CLOSE) when it has no session or the
+ * session is not open, and denied with a CLOSE when reconnects are being
+ * refused (mds_deny_all_reconnect, MDS not in reconnect state, evictions in
+ * progress), the cache is read-only, or the client lacks required features.
+ * Otherwise the snaprealms and caps listed in the message are recorded with
+ * the MDCache.  A reconnect may span several messages (has_more()); the
+ * OPEN reply and the removal from client_reconnect_gather only happen for
+ * a message without has_more().
+ */
+void Server::handle_client_reconnect(const cref_t<MClientReconnect> &m)
+{
+  dout(7) << "handle_client_reconnect " << m->get_source()
+	  << (m->has_more() ? " (more)" : "") << dendl;
+  client_t from = m->get_source().num();
+  Session *session = mds->get_session(m);
+  if (!session) {
+    dout(0) << " ignoring sessionless msg " << *m << dendl;
+    auto reply = make_message<MClientSession>(CEPH_SESSION_REJECT);
+    reply->metadata["error_string"] = "sessionless";
+    mds->send_message(reply, m->get_connection());
+    return;
+  }
+
+  if (!session->is_open()) {
+    dout(0) << " ignoring msg from not-open session" << *m << dendl;
+    auto reply = make_message<MClientSession>(CEPH_SESSION_CLOSE);
+    mds->send_message(reply, m->get_connection());
+    return;
+  }
+
+  bool reconnect_all_deny = g_conf().get_val<bool>("mds_deny_all_reconnect");
+
+  // not in reconnect yet but about to be: retry the message once we are
+  if (!mds->is_reconnect() && mds->get_want_state() == CEPH_MDS_STATE_RECONNECT) {
+    dout(10) << " we're almost in reconnect state (mdsmap delivery race?); waiting" << dendl;
+    mds->wait_for_reconnect(new C_MDS_RetryMessage(mds, m));
+    return;
+  }
+
+  // seconds since reconnect_clients() recorded reconnect_start; used for logging only
+  auto delay = std::chrono::duration<double>(clock::now() - reconnect_start).count();
+  dout(10) << " reconnect_start " << reconnect_start << " delay " << delay << dendl;
+
+  bool deny = false;
+  if (reconnect_all_deny || !mds->is_reconnect() || mds->get_want_state() != CEPH_MDS_STATE_RECONNECT || reconnect_evicting) {
+    // XXX maybe in the future we can do better than this?
+    if (reconnect_all_deny) {
+      dout(1) << "mds_deny_all_reconnect was set to speed up reboot phase, ignoring reconnect, sending close" << dendl;
+    } else {
+      dout(1) << "no longer in reconnect state, ignoring reconnect, sending close" << dendl;
+    }
+    mds->clog->info() << "denied reconnect attempt (mds is "
+       << ceph_mds_state_name(mds->get_state())
+       << ") from " << m->get_source_inst()
+       << " after " << delay << " (allowed interval " << g_conf()->mds_reconnect_timeout << ")";
+    deny = true;
+  } else {
+    // per-session checks; a non-empty error_str means the reconnect is denied
+    std::string error_str;
+    // NOTE(review): is_open() was already checked above with an early
+    // return, and nothing visible in between changes the session state, so
+    // this first branch looks unreachable -- confirm before removing.
+    if (!session->is_open()) {
+      error_str = "session is closed";
+    } else if (mdcache->is_readonly()) {
+      error_str = "mds is readonly";
+    } else {
+      // clients that sent no feature bits get them inferred from their metadata
+      if (session->info.client_metadata.features.empty())
+	infer_supported_features(session,  session->info.client_metadata);
+
+      feature_bitset_t missing_features = required_client_features;
+      missing_features -= session->info.client_metadata.features;
+      if (!missing_features.empty()) {
+	CachedStackStringStream css;
+	*css << "missing required features '" << missing_features << "'";
+	error_str = css->strv();
+      }
+    }
+
+    if (!error_str.empty()) {
+      deny = true;
+      dout(1) << " " << error_str << ", ignoring reconnect, sending close" << dendl;
+      mds->clog->info() << "denied reconnect attempt from "
+			<< m->get_source_inst() << " (" << error_str << ")";
+    }
+  }
+
+  if (deny) {
+    auto r = make_message<MClientSession>(CEPH_SESSION_CLOSE);
+    mds->send_message_client(r, session);
+    // remembered so reconnect_tick() can compare the denied set with the gather set
+    if (session->is_open()) {
+      client_reconnect_denied.insert(session->get_client());
+    }
+    return;
+  }
+
+  if (!m->has_more()) {
+    metrics_handler->add_session(session);
+    // notify client of success with an OPEN
+    auto reply = make_message<MClientSession>(CEPH_SESSION_OPEN);
+    if (session->info.has_feature(CEPHFS_FEATURE_MIMIC)) {
+      reply->supported_features = supported_features;
+      reply->metric_spec = supported_metric_spec;
+    }
+    mds->send_message_client(reply, session);
+    mds->clog->debug() << "reconnect by " << session->info.inst << " after " << delay;
+  }
+
+  session->last_cap_renew = clock::now();
+  
+  // snaprealms
+  for (const auto &r : m->realms) {
+    CInode *in = mdcache->get_inode(inodeno_t(r.realm.ino));
+    if (in && in->state_test(CInode::STATE_PURGING))
+      continue;
+    // both branches record the realm identically; they differ only in logging
+    if (in) {
+      if (in->snaprealm) {
+	dout(15) << "open snaprealm (w inode) on " << *in << dendl;
+      } else {
+	// this can happen if we are non-auth or we rollback snaprealm
+	dout(15) << "open snaprealm (null snaprealm) on " << *in << dendl;
+      }
+      mdcache->add_reconnected_snaprealm(from, inodeno_t(r.realm.ino), snapid_t(r.realm.seq));
+    } else {
+      dout(15) << "open snaprealm (w/o inode) on " << inodeno_t(r.realm.ino)
+	       << " seq " << r.realm.seq << dendl;
+      mdcache->add_reconnected_snaprealm(from, inodeno_t(r.realm.ino), snapid_t(r.realm.seq));
+    }
+  }
+
+  // caps
+  for (const auto &p : m->caps) {
+    // make sure our last_cap_id is MAX over all issued caps
+    if (p.second.capinfo.cap_id > mdcache->last_cap_id)
+      mdcache->last_cap_id = p.second.capinfo.cap_id;
+    
+    CInode *in = mdcache->get_inode(p.first);
+    if (in && in->state_test(CInode::STATE_PURGING))
+      continue;
+    if (in && in->is_auth()) {
+      // we recovered it, and it's ours.  take note.
+      dout(15) << "open cap realm " << inodeno_t(p.second.capinfo.snaprealm)
+	       << " on " << *in << dendl;
+      in->reconnect_cap(from, p.second, session);
+      mdcache->add_reconnected_cap(from, p.first, p.second);
+      recover_filelocks(in, p.second.flockbl, m->get_orig_source().num());
+      continue;
+    }
+      
+    if (in && !in->is_auth()) {
+      // not mine.
+      dout(10) << "non-auth " << *in << ", will pass off to authority" << dendl;
+      // add to cap export list.
+      mdcache->rejoin_export_caps(p.first, from, p.second,
+				  in->authority().first, true);
+    } else {
+      // don't know if the inode is mine
+      dout(10) << "missing ino " << p.first << ", will load later" << dendl;
+      mdcache->rejoin_recovered_caps(p.first, from, p.second, MDS_RANK_NONE);
+    }
+  }
+
+  // reconnect_tick() uses this to extend the reconnect window while clients are active
+  reconnect_last_seen = clock::now();
+
+  if (!m->has_more()) {
+    mdcache->rejoin_recovered_client(session->get_client(), session->info.inst);
+
+    // remove from gather set
+    client_reconnect_gather.erase(from);
+    session->set_reconnecting(false);
+    if (client_reconnect_gather.empty())
+      reconnect_gather_finish();
+  }
+}
+
+/**
+ * Guess the CephFS feature level of a client that reported no feature bits,
+ * using its "ceph_version" / "kernel_version" metadata and the feature bits
+ * of its connection.
+ *
+ * When a level is found, @p client_metadata.features is set to a bitset with
+ * bits 0..level set; otherwise it is left untouched.
+ */
+void Server::infer_supported_features(Session *session, client_metadata_t& client_metadata)
+{
+  // only dereferenced on the paths that actually consult connection features
+  const auto& con = session->get_connection();
+  int level = -1;
+
+  auto entry = client_metadata.find("ceph_version");
+  if (entry != client_metadata.end()) {
+    // user space client
+    if (entry->second.compare(0, 16, "ceph version 12.") == 0) {
+      level = CEPHFS_FEATURE_LUMINOUS;
+    } else if (con->has_feature(CEPH_FEATURE_FS_CHANGE_ATTR)) {
+      level = CEPHFS_FEATURE_KRAKEN;
+    }
+  } else {
+    entry = client_metadata.find("kernel_version");
+    if (entry != client_metadata.end() &&
+        con->has_feature(CEPH_FEATURE_NEW_OSDOP_ENCODING)) {
+      // kernel client
+      level = CEPHFS_FEATURE_LUMINOUS;
+    }
+  }
+
+  // last resort: fall back to the file-layout-v2 connection feature
+  if (level == -1 && con->has_feature(CEPH_FEATURE_FS_FILE_LAYOUT_V2))
+    level = CEPHFS_FEATURE_JEWEL;
+
+  if (level < 0)
+    return;
+
+  const unsigned long mask = (1UL << (level + 1)) - 1;
+  client_metadata.features = feature_bitset_t(mask);
+  dout(10) << __func__ << " got '" << client_metadata.features << "'" << dendl;
+}
+
+/**
+ * Reload required_client_features from the MDSMap.  Once the MDS state is
+ * at or past STATE_RECONNECT, every client session lacking one of the
+ * required features is evicted, unless its address is already blocklisted
+ * in the OSDMap.
+ */
+void Server::update_required_client_features()
+{
+  required_client_features = mds->mdsmap->get_required_client_features();
+  dout(7) << "required_client_features: " << required_client_features << dendl;
+
+  if (mds->get_state() < MDSMap::STATE_RECONNECT)
+    return;
+
+  set<Session*> client_sessions;
+  mds->sessionmap.get_client_session_set(client_sessions);
+  for (Session *s : client_sessions) {
+    feature_bitset_t missing = required_client_features;
+    missing -= s->info.client_metadata.features;
+    if (missing.empty())
+      continue;
+
+    const bool already_blocklisted = mds->objecter->with_osdmap(
+        [s](const OSDMap &osd_map) -> bool {
+          return osd_map.is_blocklisted(s->info.inst.addr);
+        });
+    if (already_blocklisted)
+      continue;
+
+    mds->clog->warn() << "evicting session " << *s << ", missing required features '"
+                      << missing << "'";
+    CachedStackStringStream css;
+    mds->evict_client(s->get_client().v, false,
+                      g_conf()->mds_session_blocklist_on_evict, *css);
+  }
+}
+
+/**
+ * Finish the reconnect phase by firing the stored reconnect_done context:
+ * right away when the snap client is synced, otherwise by handing it to
+ * snapclient->wait_for_sync().  reconnect_done is reset afterwards.
+ */
+void Server::reconnect_gather_finish()
+{
+  dout(7) << "reconnect_gather_finish.  failed on " << failed_reconnects << " clients" << dendl;
+  ceph_assert(reconnect_done);
+
+  if (mds->snapclient->is_synced()) {
+    reconnect_done->complete(0);
+  } else {
+    // make sure snaptable cache is populated. snaprealms will be
+    // extensively used in rejoin stage.
+    dout(7) << " snaptable cache isn't synced, delaying state transition" << dendl;
+    mds->snapclient->wait_for_sync(reconnect_done);
+  }
+  reconnect_done = nullptr;
+}
+
+/**
+ * Periodic check of the reconnect phase.
+ *
+ * Returns early while evictions are pending, while no client is being
+ * waited for, or while the reconnect window is still open.  The window is
+ * mds_reconnect_timeout since reconnect_start, and it is extended as long
+ * as some client was seen within the last half timeout.  Once it closes,
+ * every client still in client_reconnect_gather is either kept for reclaim
+ * (if it specified a "timeout" in its metadata) or evicted/killed, and the
+ * reconnect phase is finished -- after the evictions complete when any were
+ * started with a gather sub-context.
+ */
+void Server::reconnect_tick()
+{
+  bool reject_all_reconnect = false;
+  if (reconnect_evicting) {
+    dout(7) << "reconnect_tick: waiting for evictions" << dendl;
+    return;
+  }
+
+  /*
+   * Set mds_deny_all_reconnect to reject all reconnect requests and so
+   * load less metadata in the rejoin phase. This shortens reboot time.
+   * Moreover, loading less metadata increases the chance that a standby
+   * with less memory can take over.
+
+   * Why not shorten reconnect period?
+   * Clients may send unsafe or retry requests, which haven't been
+   * completed before old mds stop, to new mds. These requests may
+   * need to be processed during new mds's clientreplay phase,
+   * see: #https://github.com/ceph/ceph/pull/29059.
+   */
+  bool reconnect_all_deny = g_conf().get_val<bool>("mds_deny_all_reconnect");
+  if (client_reconnect_gather.empty())
+    return;
+
+  // every client still waited for has already been denied: no point in waiting
+  if (reconnect_all_deny && (client_reconnect_gather == client_reconnect_denied))
+    reject_all_reconnect = true;
+ 
+  auto now = clock::now();
+  // elapse1: seconds since the reconnect phase started
+  auto elapse1 = std::chrono::duration<double>(now - reconnect_start).count();
+  if (elapse1 < g_conf()->mds_reconnect_timeout && !reject_all_reconnect)
+    return;
+
+  vector<Session*> remaining_sessions;
+  remaining_sessions.reserve(client_reconnect_gather.size());
+  for (auto c : client_reconnect_gather) {
+    Session *session = mds->sessionmap.get_session(entity_name_t::CLIENT(c.v));
+    ceph_assert(session);
+    remaining_sessions.push_back(session);
+    // client re-sends cap flush messages before the reconnect message
+    if (session->last_seen > reconnect_last_seen)
+      reconnect_last_seen = session->last_seen;
+  }
+
+  // elapse2: seconds since any remaining client was last heard from
+  auto elapse2 = std::chrono::duration<double>(now - reconnect_last_seen).count();
+  if (elapse2 < g_conf()->mds_reconnect_timeout / 2 && !reject_all_reconnect) {
+    dout(7) << "reconnect_tick: last seen " << elapse2
+            << " seconds ago, extending reconnect interval" << dendl;
+    return;
+  }
+
+  dout(7) << "reconnect timed out, " << remaining_sessions.size()
+          << " clients have not reconnected in time" << dendl;
+
+  // If we're doing blocklist evictions, use this to wait for them before
+  // proceeding to reconnect_gather_finish
+  MDSGatherBuilder gather(g_ceph_context);
+
+  for (auto session : remaining_sessions) {
+    // Keep sessions that have specified timeout. These sessions will prevent
+    // mds from going to active. MDS goes to active after they all have been
+    // killed or reclaimed.
+    if (session->info.client_metadata.find("timeout") !=
+	session->info.client_metadata.end()) {
+      dout(1) << "reconnect keeps " << session->info.inst
+	      << ", need to be reclaimed" << dendl;
+      client_reclaim_gather.insert(session->get_client());
+      continue;
+    }
+
+    dout(1) << "reconnect gives up on " << session->info.inst << dendl;
+
+    mds->clog->warn() << "evicting unresponsive client " << *session
+		      << ", after waiting " << elapse1
+		      << " seconds during MDS startup";
+
+    // make _session_logged() purge orphan objects of lost async/unsafe requests
+    session->delegated_inos.swap(session->free_prealloc_inos);
+
+    if (g_conf()->mds_session_blocklist_on_timeout) {
+      CachedStackStringStream css;
+      mds->evict_client(session->get_client().v, false, true, *css,
+			gather.new_sub());
+    } else {
+      kill_session(session, NULL);
+    }
+
+    failed_reconnects++;
+  }
+  // nobody is waited for any more; sessions kept for reclaim are tracked in
+  // client_reclaim_gather instead
+  client_reconnect_gather.clear();
+  client_reconnect_denied.clear();
+
+  if (gather.has_subs()) {
+    dout(1) << "reconnect will complete once clients are evicted" << dendl;
+    gather.set_finisher(new MDSInternalContextWrapper(mds, new LambdaContext(
+	    [this](int r){reconnect_gather_finish();})));
+    gather.activate();
+    // makes subsequent ticks (and reconnect messages) back off until the gather fires
+    reconnect_evicting = true;
+  } else {
+    reconnect_gather_finish();
+  }
+}
+
+/**
+ * Rebuild the fcntl and flock lock state of an inode from an encoded buffer.
+ *
+ * The buffer holds two consecutive sections -- fcntl locks first, then flock
+ * locks -- each encoded as an int count followed by that many ceph_filelock
+ * records. Every decoded lock has its client field overwritten with
+ * @p client before it is recorded. An empty buffer is a no-op.
+ */
+void Server::recover_filelocks(CInode *in, bufferlist locks, int64_t client)
+{
+  if (locks.length() == 0)
+    return;
+
+  ceph_filelock entry;
+  auto cursor = locks.cbegin();
+
+  // Decode one "<count> <lock>*" section into the state returned by
+  // lock_state(). The accessor is only called per decoded lock, so an empty
+  // section never touches the lock state.
+  auto restore_section = [&](auto lock_state) {
+    int count;
+    decode(count, cursor);
+    for (int idx = 0; idx < count; ++idx) {
+      decode(entry, cursor);
+      entry.client = client;
+      lock_state()->held_locks.insert(pair<uint64_t, ceph_filelock>(entry.start, entry));
+      ++lock_state()->client_held_lock_counts[client];
+    }
+  };
+
+  restore_section([in] { return in->get_fcntl_lock_state(); });
+  restore_section([in] { return in->get_flock_lock_state(); });
+}
+
+/**
+ * Call this when the MDCache is oversized, to send requests to the clients
+ * to trim some caps, and consequently unpin some inodes in the MDCache so
+ * that it can trim too.
+ *
+ * Candidate sessions are visited from most caps to fewest. Each one is sent
+ * a CEPH_SESSION_RECALL_STATE message carrying a reduced cap limit, unless a
+ * per-session or the global recall throttle would be exceeded.
+ *
+ * @param gather if non-null, every session that is sent a recall is also
+ *               flushed into this gather
+ * @param flags  RecallFlags bitmask:
+ *               TRIM selects every client session;
+ *               ENFORCE_MAX selects sessions holding more than
+ *               mds_max_caps_per_client caps;
+ *               ENFORCE_LIVENESS selects sessions whose cache liveness is
+ *               below (num_caps >> mds_session_cache_liveness_magnitude);
+ *               STEADY additionally skips sessions that look unlikely to
+ *               release more caps.
+ * @return {throttled, caps_recalled}: whether any throttle was hit, and the
+ *         sum of Session::notify_recall_sent() over the recalled sessions
+ */
+std::pair<bool, uint64_t> Server::recall_client_state(MDSGatherBuilder* gather, RecallFlags flags)
+{
+  const auto now = clock::now();
+  const bool steady = !!(flags&RecallFlags::STEADY);
+  const bool enforce_max = !!(flags&RecallFlags::ENFORCE_MAX);
+  const bool enforce_liveness = !!(flags&RecallFlags::ENFORCE_LIVENESS);
+  const bool trim = !!(flags&RecallFlags::TRIM);
+
+  const auto max_caps_per_client = g_conf().get_val<uint64_t>("mds_max_caps_per_client");
+  const auto min_caps_per_client = g_conf().get_val<uint64_t>("mds_min_caps_per_client");
+  const auto recall_global_max_decay_threshold = g_conf().get_val<Option::size_t>("mds_recall_global_max_decay_threshold");
+  const auto recall_max_caps = g_conf().get_val<Option::size_t>("mds_recall_max_caps");
+  const auto recall_max_decay_threshold = g_conf().get_val<Option::size_t>("mds_recall_max_decay_threshold");
+  const auto cache_liveness_magnitude = g_conf().get_val<Option::size_t>("mds_session_cache_liveness_magnitude");
+
+  dout(7) << __func__ << ":"
+           << " min=" << min_caps_per_client
+           << " max=" << max_caps_per_client
+           << " total=" << Capability::count()
+           << " flags=" << flags
+           << dendl;
+
+  /* trim caps of sessions with the most caps first */
+  std::multimap<uint64_t, Session*> caps_session;
+  auto f = [&caps_session, enforce_max, enforce_liveness, trim, max_caps_per_client, cache_liveness_magnitude](auto& s) {
+    auto num_caps = s->caps.size();
+    auto cache_liveness = s->get_session_cache_liveness();
+    if (trim || (enforce_max && num_caps > max_caps_per_client) || (enforce_liveness && cache_liveness < (num_caps>>cache_liveness_magnitude))) {
+      caps_session.emplace(std::piecewise_construct, std::forward_as_tuple(num_caps), std::forward_as_tuple(s));
+    }
+  };
+  mds->sessionmap.get_client_sessions(std::move(f));
+
+  std::pair<bool, uint64_t> result = {false, 0};
+  auto& [throttled, caps_recalled] = result;
+  last_recall_state = now;
+  // the multimap is keyed by cap count, so reverse iteration visits the
+  // sessions holding the most caps first
+  for (const auto& [num_caps, session] : boost::adaptors::reverse(caps_session)) {
+    if (!session->is_open() ||
+        !session->get_connection() ||
+	!session->info.inst.name.is_client())
+      continue;
+
+    dout(10) << __func__ << ":"
+             << " session " << session->info.inst
+	     << " caps " << num_caps
+	     << ", leases " << session->leases.size()
+	     << dendl;
+
+    // target limit: recall at most recall_max_caps, never below
+    // min_caps_per_client
+    uint64_t newlim;
+    if (num_caps < recall_max_caps || (num_caps-recall_max_caps) < min_caps_per_client) {
+      newlim = min_caps_per_client;
+    } else {
+      newlim = num_caps-recall_max_caps;
+    }
+    if (num_caps > newlim) {
+      /* now limit the number of caps we recall at a time to prevent overloading ourselves */
+      uint64_t recall = std::min<uint64_t>(recall_max_caps, num_caps-newlim);
+      newlim = num_caps-recall;
+      const uint64_t session_recall_throttle = session->get_recall_caps_throttle();
+      const uint64_t session_recall_throttle2o = session->get_recall_caps_throttle2o();
+      const uint64_t global_recall_throttle = recall_throttle.get();
+      // per-session throttles only skip this session; the global throttle
+      // ends the whole pass
+      if (session_recall_throttle+recall > recall_max_decay_threshold) {
+        dout(15) << "  session recall threshold (" << recall_max_decay_threshold << ") hit at " << session_recall_throttle << "; skipping!" << dendl;
+        throttled = true;
+        continue;
+      } else if (session_recall_throttle2o+recall > recall_max_caps*2) {
+        dout(15) << "  session recall 2nd-order threshold (" << 2*recall_max_caps << ") hit at " << session_recall_throttle2o << "; skipping!" << dendl;
+        throttled = true;
+        continue;
+      } else if (global_recall_throttle+recall > recall_global_max_decay_threshold) {
+        dout(15) << "  global recall threshold (" << recall_global_max_decay_threshold << ") hit at " << global_recall_throttle << "; skipping!" << dendl;
+        throttled = true;
+        break;
+      }
+
+      // now check if we've recalled caps recently and the client is unlikely to satisfy a new recall
+      if (steady) {
+        const auto session_recall = session->get_recall_caps();
+        const auto session_release = session->get_release_caps();
+        if (2*session_release < session_recall && 2*session_recall > recall_max_decay_threshold) {
+          /* The session has been unable to keep up with the number of caps
+           * recalled (by half); additionally, to prevent marking sessions
+           * we've just begun to recall from, the session_recall counter
+           * (decayed count of caps recently recalled) is **greater** than the
+           * session threshold for the session's cap recall throttle.
+           */
+          dout(15) << "  2*session_release < session_recall"
+                      " (2*" << session_release << " < " << session_recall << ") &&"
+                      " 2*session_recall > recall_max_decay_threshold"
+                      " (2*" << session_recall << " > " << recall_max_decay_threshold << ")"
+                      " Skipping because we are unlikely to get more released." << dendl;
+          continue;
+        } else if (recall < recall_max_caps && 2*recall < session_recall) {
+          /* The number of caps recalled is less than the number we *could*
+           * recall (so there isn't much left to recall?) and the number of
+           * caps is less than the current recall_caps counter (decayed count
+           * of caps recently recalled).
+           */
+          dout(15) << "  2*recall < session_recall "
+                      " (2*" << recall << " < " << session_recall << ") &&"
+                      " recall < recall_max_caps (" << recall << " < " << recall_max_caps << ");"
+                      " Skipping because we are unlikely to get more released." << dendl;
+          continue;
+        }
+      }
+
+      dout(7) << "  recalling " << recall << " caps; session_recall_throttle = " << session_recall_throttle << "; global_recall_throttle = " << global_recall_throttle << dendl;
+
+      auto m = make_message<MClientSession>(CEPH_SESSION_RECALL_STATE);
+      m->head.max_caps = newlim;
+      mds->send_message_client(m, session);
+      if (gather) {
+        flush_session(session, *gather);
+      }
+      caps_recalled += session->notify_recall_sent(newlim);
+      recall_throttle.hit(recall);
+    }
+  }
+
+  dout(7) << "recalled" << (throttled ? " (throttled)" : "") << " " << caps_recalled << " client caps." << dendl;
+
+  return result;
+}
+
+/**
+ * Send CEPH_SESSION_FORCE_RO to every client session that is currently
+ * open or stale.
+ */
+void Server::force_clients_readonly()
+{
+  dout(10) << "force_clients_readonly" << dendl;
+
+  set<Session*> client_sessions;
+  mds->sessionmap.get_client_session_set(client_sessions);
+
+  for (Session *target : client_sessions) {
+    // only genuine client entities are notified
+    if (!target->info.inst.name.is_client())
+      continue;
+    // sessions in any state other than open/stale are skipped
+    if (!target->is_open() && !target->is_stale())
+      continue;
+
+    mds->send_message_client(make_message<MClientSession>(CEPH_SESSION_FORCE_RO), target);
+  }
+}
+
+/*******
+ * some generic stuff for finishing off requests
+ */
+
+/**
+ * Record the reply trace on the request, attempt an early reply, then submit
+ * the log event.
+ *
+ * After submission the log is flushed unless an early reply was sent; in
+ * that case rdlocks are dropped and the log is only flushed when something
+ * is waiting on the dentry's WAIT_UNLINK_FINISH. Requests queued for replay
+ * instead trigger the next queued replay op.
+ *
+ * @param mdr request being finished; must not already be marked has_completed
+ * @param in  inode to note as the reply trace target, may be null
+ * @param dn  dentry to note as the reply trace dentry, may be null
+ * @param le  log event to submit
+ * @param fin context handed to the log together with @p le
+ */
+void Server::journal_and_reply(MDRequestRef& mdr, CInode *in, CDentry *dn, LogEvent *le, MDSLogContextBase *fin)
+{
+  dout(10) << "journal_and_reply tracei " << in << " tracedn " << dn << dendl;
+  ceph_assert(!mdr->has_completed);
+
+  // remember (and pin) the trace items for the eventual reply
+  mdr->tracei = in;
+  if (in) {
+    mdr->pin(in);
+  }
+  mdr->tracedn = dn;
+  if (dn) {
+    mdr->pin(dn);
+  }
+
+  early_reply(mdr, in, dn);
+
+  mdr->committing = true;
+  submit_mdlog_entry(le, fin, mdr, __func__);
+
+  const bool replaying =
+    mdr->client_request && mdr->client_request->is_queued_for_replay();
+  if (replaying) {
+    const bool queued_more = mds->queue_one_replay();
+    dout(10) << (queued_more ? " queued next replay op" : " journaled last replay op") << dendl;
+    return;
+  }
+
+  if (!mdr->did_early_reply) {
+    // the client is still waiting for its reply, so push the journal out now
+    mdlog->flush();
+    return;
+  }
+
+  mds->locker->drop_rdlocks_for_early_reply(mdr.get());
+  if (dn && dn->is_waiter_for(CDentry::WAIT_UNLINK_FINISH)) {
+    mdlog->flush();
+  }
+}
+
+/**
+ * Submit a log event to the MDS log, first marking the request (if any)
+ * with a "submit entry: <event>" event.
+ *
+ * @param le    log event to submit
+ * @param fin   context passed to the log along with @p le
+ * @param mdr   request to annotate; may be a null reference
+ * @param event label appended to the marked event text
+ */
+void Server::submit_mdlog_entry(LogEvent *le, MDSLogContextBase *fin, MDRequestRef& mdr,
+                                std::string_view event)
+{
+  if (mdr) {
+    string note = "submit entry: ";
+    note.append(event);
+    mdr->mark_event(note);
+  }
+
+  mdlog->submit_entry(le, fin);
+}
+
+/*
+ * Answer a request with result code r and clean up the mdr.
+ *
+ * Client requests are answered through reply_client_request(), or through
+ * the batch op when the mdr is a batch head. Internal ops (internal_op > -1)
+ * complete their finisher with r and are then finished in the cache; a
+ * missing finisher aborts. Anything else is left untouched.
+ */
+void Server::respond_to_request(MDRequestRef& mdr, int r)
+{
+  if (mdr->client_request) {
+    if (!mdr->is_batch_head()) {
+      reply_client_request(mdr, make_message<MClientReply>(*mdr->client_request, r));
+      return;
+    }
+
+    dout(20) << __func__ << " batch head " << *mdr << dendl;
+    mdr->release_batch_op()->respond(r);
+    return;
+  }
+
+  if (!(mdr->internal_op > -1))
+    return;
+
+  dout(10) << "respond_to_request on internal request " << mdr << dendl;
+  if (!mdr->internal_op_finish) {
+    ceph_abort_msg("trying to respond to internal op without finisher");
+  }
+  mdr->internal_op_finish->complete(r);
+  mdcache->request_finish(mdr);
+}
+
+/**
+ * Account the latency of a client request against the per-op latency
+ * counter of this server's logger.
+ *
+ * Ops without a dedicated counter are logged at level 1 and not recorded.
+ *
+ * @param req client request whose op selects the counter
+ * @param lat latency to record
+ */
+void Server::perf_gather_op_latency(const cref_t<MClientRequest> &req, utime_t lat)
+{
+  // op -> latency counter
+  static const struct {
+    int op;
+    int counter;
+  } op_latency_counters[] = {
+    { CEPH_MDS_OP_LOOKUPHASH,   l_mdss_req_lookuphash_latency },
+    { CEPH_MDS_OP_LOOKUPINO,    l_mdss_req_lookupino_latency },
+    { CEPH_MDS_OP_LOOKUPPARENT, l_mdss_req_lookupparent_latency },
+    { CEPH_MDS_OP_LOOKUPNAME,   l_mdss_req_lookupname_latency },
+    { CEPH_MDS_OP_LOOKUP,       l_mdss_req_lookup_latency },
+    { CEPH_MDS_OP_LOOKUPSNAP,   l_mdss_req_lookupsnap_latency },
+    { CEPH_MDS_OP_GETATTR,      l_mdss_req_getattr_latency },
+    { CEPH_MDS_OP_SETATTR,      l_mdss_req_setattr_latency },
+    { CEPH_MDS_OP_SETLAYOUT,    l_mdss_req_setlayout_latency },
+    { CEPH_MDS_OP_SETDIRLAYOUT, l_mdss_req_setdirlayout_latency },
+    { CEPH_MDS_OP_GETVXATTR,    l_mdss_req_getvxattr_latency },
+    { CEPH_MDS_OP_SETXATTR,     l_mdss_req_setxattr_latency },
+    { CEPH_MDS_OP_RMXATTR,      l_mdss_req_rmxattr_latency },
+    { CEPH_MDS_OP_READDIR,      l_mdss_req_readdir_latency },
+    { CEPH_MDS_OP_SETFILELOCK,  l_mdss_req_setfilelock_latency },
+    { CEPH_MDS_OP_GETFILELOCK,  l_mdss_req_getfilelock_latency },
+    { CEPH_MDS_OP_CREATE,       l_mdss_req_create_latency },
+    { CEPH_MDS_OP_OPEN,         l_mdss_req_open_latency },
+    { CEPH_MDS_OP_MKNOD,        l_mdss_req_mknod_latency },
+    { CEPH_MDS_OP_LINK,         l_mdss_req_link_latency },
+    { CEPH_MDS_OP_UNLINK,       l_mdss_req_unlink_latency },
+    { CEPH_MDS_OP_RMDIR,        l_mdss_req_rmdir_latency },
+    { CEPH_MDS_OP_RENAME,       l_mdss_req_rename_latency },
+    { CEPH_MDS_OP_MKDIR,        l_mdss_req_mkdir_latency },
+    { CEPH_MDS_OP_SYMLINK,      l_mdss_req_symlink_latency },
+    { CEPH_MDS_OP_LSSNAP,       l_mdss_req_lssnap_latency },
+    { CEPH_MDS_OP_MKSNAP,       l_mdss_req_mksnap_latency },
+    { CEPH_MDS_OP_RMSNAP,       l_mdss_req_rmsnap_latency },
+    { CEPH_MDS_OP_RENAMESNAP,   l_mdss_req_renamesnap_latency },
+  };
+
+  const int op = req->get_op();
+  for (const auto &entry : op_latency_counters) {
+    if (entry.op == op) {
+      logger->tinc(entry.counter, lat);
+      return;
+    }
+  }
+
+  dout(1) << ": unknown client op" << dendl;
+}
+
+/**
+ * Send an unsafe ("early") reply for a client request.
+ *
+ * Nothing is sent when mds_early_reply is disabled, when the request is
+ * flagged no_early_reply, has journaled peers, allocated an ino, originates
+ * from an MDS, or is a replayed op. When a reply is sent, xlocks are marked
+ * done, mdr->did_early_reply is set, and the reply counters and per-op
+ * latency are updated.
+ *
+ * @param mdr     request being answered
+ * @param tracei  inode to include in the reply trace, may be null
+ * @param tracedn dentry to include in the reply trace, may be null
+ *
+ * NOTE(review): mdr->client_request is dereferenced unconditionally below;
+ * this presumably is never reached for requests without one -- confirm.
+ */
+void Server::early_reply(MDRequestRef& mdr, CInode *tracei, CDentry *tracedn)
+{
+  if (!g_conf()->mds_early_reply)
+    return;
+
+  if (mdr->no_early_reply) {
+    dout(10) << "early_reply - flag no_early_reply is set, not allowed." << dendl;
+    return;
+  }
+
+  if (mdr->has_more() && mdr->more()->has_journaled_peers) {
+    dout(10) << "early_reply - there are journaled peers, not allowed." << dendl;
+    return; 
+  }
+
+  if (mdr->alloc_ino) {
+    dout(10) << "early_reply - allocated ino, not allowed" << dendl;
+    return;
+  }
+
+  const cref_t<MClientRequest> &req = mdr->client_request;
+  entity_inst_t client_inst = req->get_source_inst();
+  if (client_inst.name.is_mds())
+    return;
+
+  if (req->is_replay()) {
+    dout(10) << " no early reply on replay op" << dendl;
+    return;
+  }
+
+
+  // result code 0, flagged unsafe
+  auto reply = make_message<MClientReply>(*req, 0);
+  reply->set_unsafe();
+
+  // mark xlocks "done", indicating that we are exposing uncommitted changes.
+  //
+  //_rename_finish() does not send dentry link/unlink message to replicas.
+  // so do not set xlocks on dentries "done", the xlocks prevent dentries
+  // that have projected linkages from getting new replica.
+  mds->locker->set_xlocks_done(mdr.get(), req->get_op() == CEPH_MDS_OP_RENAME);
+
+  dout(10) << "early_reply " << reply->get_result() 
+	   << " (" << cpp_strerror(reply->get_result())
+	   << ") " << *req << dendl;
+
+  if (tracei || tracedn) {
+    // drop the pending cap releases recorded for the inodes in the trace
+    if (tracei)
+      mdr->cap_releases.erase(tracei->vino());
+    if (tracedn)
+      mdr->cap_releases.erase(tracedn->get_dir()->get_inode()->vino());
+
+    set_trace_dist(reply, tracei, tracedn, mdr);
+  }
+
+  reply->set_extra_bl(mdr->reply_extra_bl);
+  mds->send_message_client(reply, mdr->session);
+
+  mdr->did_early_reply = true;
+
+  // reply accounting: count, latency, slow-reply threshold
+  mds->logger->inc(l_mds_reply);
+  utime_t lat = ceph_clock_now() - req->get_recv_stamp();
+  mds->logger->tinc(l_mds_reply_latency, lat);
+  if (lat >= g_conf()->mds_op_complaint_time) {
+    mds->logger->inc(l_mds_slow_reply);
+  }
+  if (client_inst.name.is_client()) {
+    mds->sessionmap.hit_session(mdr->session);
+  }
+  perf_gather_op_latency(req, lat);
+  dout(20) << "lat " << lat << dendl;
+
+  mdr->mark_event("early_replied");
+}
+
+/*
+ * send given reply
+ * include a trace to tracei
+ * Clean up mdr
+ *
+ * Steps, in order:
+ *  - record a successful write request (other than setfilelock) in the
+ *    session's completed-request list
+ *  - hand allocated/preallocated inos to the session
+ *  - update reply counters and latency, unless an early reply was already
+ *    sent or the request is a replay
+ *  - drop non-rdlocks, then send the reply (with trace) unless there is no
+ *    session or the source is an MDS
+ *  - for requests queued for replay that have completed or failed, queue
+ *    the next replay op
+ *  - finish the request in the cache
+ *
+ * mdr must be non-null. The session may be null.
+ */
+void Server::reply_client_request(MDRequestRef& mdr, const ref_t<MClientReply> &reply)
+{
+  ceph_assert(mdr.get());
+  const cref_t<MClientRequest> &req = mdr->client_request;
+  
+  dout(7) << "reply_client_request " << reply->get_result()
+	   << " (" << cpp_strerror(reply->get_result())
+	   << ") " << *req << dendl;
+
+  mdr->mark_event("replying");
+
+  Session *session = mdr->session;
+
+  // note successful request in session map?
+  //
+  // setfilelock requests are special, they only modify states in MDS memory.
+  // The states get lost when MDS fails. If Client re-send a completed
+  // setfilelock request, it means that client did not receive corresponding
+  // setfilelock reply.  So MDS should re-execute the setfilelock request.
+  if (req->may_write() && req->get_op() != CEPH_MDS_OP_SETFILELOCK &&
+      reply->get_result() == 0 && session) {
+    // remember which ino (if any) this request created
+    inodeno_t created = mdr->alloc_ino ? mdr->alloc_ino : mdr->used_prealloc_ino;
+    session->add_completed_request(mdr->reqid.tid, created);
+    if (mdr->ls) {
+      mdr->ls->touched_sessions.insert(session->info.inst.name);
+    }
+  }
+
+  // give any preallocated inos to the session
+  apply_allocated_inos(mdr, session);
+
+  // get tracei/tracedn from mdr?
+  CInode *tracei = mdr->tracei;
+  CDentry *tracedn = mdr->tracedn;
+
+  bool is_replay = mdr->client_request->is_replay();
+  bool did_early_reply = mdr->did_early_reply;
+  entity_inst_t client_inst = req->get_source_inst();
+
+  // same accounting as early_reply(); skipped when it already ran there
+  if (!did_early_reply && !is_replay) {
+
+    mds->logger->inc(l_mds_reply);
+    utime_t lat = ceph_clock_now() - mdr->client_request->get_recv_stamp();
+    mds->logger->tinc(l_mds_reply_latency, lat);
+    if (lat >= g_conf()->mds_op_complaint_time) {
+      mds->logger->inc(l_mds_slow_reply);
+    }
+    if (session && client_inst.name.is_client()) {
+      mds->sessionmap.hit_session(session);
+    }
+    perf_gather_op_latency(req, lat);
+    dout(20) << "lat " << lat << dendl;
+    
+    if (tracei)
+      mdr->cap_releases.erase(tracei->vino());
+    if (tracedn)
+      mdr->cap_releases.erase(tracedn->get_dir()->get_inode()->vino());
+  }
+
+  // drop non-rdlocks before replying, so that we can issue leases
+  mdcache->request_drop_non_rdlocks(mdr);
+
+  // reply at all?
+  if (session && !client_inst.name.is_mds()) {
+    // send reply.
+    if (!did_early_reply &&   // don't issue leases if we sent an earlier reply already
+	(tracei || tracedn)) {
+      if (is_replay) {
+	// replayed ops get no trace; only the cap on the target inode is
+	// reconnected
+	if (tracei)
+	  mdcache->try_reconnect_cap(tracei, session);
+      } else {
+	// include metadata in reply
+	set_trace_dist(reply, tracei, tracedn, mdr);
+      }
+    }
+
+    // We can set the extra bl unconditionally: if it's already been sent in the
+    // early_reply, set_extra_bl will have claimed it and reply_extra_bl is empty
+    reply->set_extra_bl(mdr->reply_extra_bl);
+
+    reply->set_mdsmap_epoch(mds->mdsmap->get_epoch());
+    mds->send_message_client(reply, session);
+  }
+
+  if (req->is_queued_for_replay() &&
+      (mdr->has_completed || reply->get_result() < 0)) {
+    if (reply->get_result() < 0) {
+      int r = reply->get_result();
+      derr << "reply_client_request: failed to replay " << *req
+	   << " error " << r << " (" << cpp_strerror(r)  << ")" << dendl;
+      mds->clog->warn() << "failed to replay " << req->get_reqid() << " error " << r;
+    }
+    mds->queue_one_replay();
+  }
+
+  // clean up request
+  mdcache->request_finish(mdr);
+
+  // take a closer look at tracei, if it happens to be a remote link
+  if (tracei && 
+      tracedn &&
+      tracedn->get_projected_linkage()->is_remote()) {
+    mdcache->eval_remote(tracedn);
+  }
+}
+
+/*
+ * pass inode OR dentry (not both, or we may get confused)
+ *
+ * trace is in reverse order (i.e. root inode comes last)
+ *
+ * Builds the reply trace and attaches it to the reply. When a dentry is
+ * given, the containing directory inode, the dirfrag stat, the dentry name
+ * and its lease are encoded first; the target inode, when given, is encoded
+ * last. head.is_dentry / head.is_target record which parts are present. The
+ * snap trace is attached only when mdr->snapid is CEPH_NOSNAP.
+ *
+ * At least one of in / dn must be non-null: the snaprealm lookup
+ * dereferences dn whenever in is null.
+ *
+ * NOTE(review): the body below handles in and dn both being set, and
+ * early_reply() / reply_client_request() pass both -- confirm whether the
+ * "not both" restriction above still applies.
+ */
+void Server::set_trace_dist(const ref_t<MClientReply> &reply,
+			    CInode *in, CDentry *dn,
+			    MDRequestRef& mdr)
+{
+  // skip doing this for debugging purposes?
+  // (only when mds_inject_traceless_reply_probability is non-zero, the
+  // request has a log segment and is not an O_TRUNC)
+  if (g_conf()->mds_inject_traceless_reply_probability &&
+      mdr->ls && !mdr->o_trunc &&
+      (rand() % 10000 < g_conf()->mds_inject_traceless_reply_probability * 10000.0)) {
+    dout(5) << "deliberately skipping trace for " << *reply << dendl;
+    return;
+  }
+
+  // inode, dentry, dir, ..., inode
+  bufferlist bl;
+  mds_rank_t whoami = mds->get_nodeid();
+  Session *session = mdr->session;
+  snapid_t snapid = mdr->snapid;
+  utime_t now = ceph_clock_now();
+
+  dout(20) << "set_trace_dist snapid " << snapid << dendl;
+
+  // realm
+  if (snapid == CEPH_NOSNAP) {
+    // take the realm from the target inode if there is one, otherwise from
+    // the inode of the dentry's directory
+    SnapRealm *realm;
+    if (in)
+      realm = in->find_snaprealm();
+    else
+      realm = dn->get_dir()->get_inode()->find_snaprealm();
+    reply->snapbl = realm->get_snap_trace();
+    dout(10) << "set_trace_dist snaprealm " << *realm << " len=" << reply->snapbl.length() << dendl;
+  }
+
+  // dir + dentry?
+  if (dn) {
+    reply->head.is_dentry = 1;
+    CDir *dir = dn->get_dir();
+    CInode *diri = dir->get_inode();
+
+    diri->encode_inodestat(bl, session, NULL, snapid);
+    dout(20) << "set_trace_dist added diri " << *diri << dendl;
+
+#ifdef MDS_VERIFY_FRAGSTAT
+    if (dir->is_complete())
+      dir->verify_fragstat();
+#endif
+    DirStat ds;
+    ds.frag = dir->get_frag();
+    ds.auth = dir->get_dir_auth().first;
+    // the dist spec is only filled in when we are auth for the dirfrag and
+    // forward_all_requests_to_auth is off
+    if (dir->is_auth() && !forward_all_requests_to_auth)
+      dir->get_dist_spec(ds.dist, whoami);
+
+    dir->encode_dirstat(bl, session->info, ds);
+    dout(20) << "set_trace_dist added dir  " << *dir << dendl;
+
+    encode(dn->get_name(), bl);
+    mds->locker->issue_client_lease(dn, in, mdr, now, bl);
+  } else
+    reply->head.is_dentry = 0;
+
+  // inode
+  if (in) {
+    in->encode_inodestat(bl, session, NULL, snapid, 0, mdr->getattr_caps);
+    dout(20) << "set_trace_dist added in   " << *in << dendl;
+    reply->head.is_target = 1;
+  } else
+    reply->head.is_target = 0;
+
+  reply->set_trace(bl);
+}
+
+/**
+ * Entry point for an incoming MClientRequest.
+ *
+ * Steps, in order:
+ *  - wait for the cache to be open (the message is retried later)
+ *  - look up the client's session; requests from clients without a usable
+ *    session are dropped
+ *  - answer replayed/retried requests that already completed, or convert
+ *    them to a lookup/getattr when they created an inode
+ *  - trim the session's completed-request list
+ *  - register the request with the cache, process embedded cap releases,
+ *    and hand it to dispatch_client_request()
+ */
+void Server::handle_client_request(const cref_t<MClientRequest> &req)
+{
+  dout(4) << "handle_client_request " << *req << dendl;
+
+  if (mds->logger)
+    mds->logger->inc(l_mds_request);
+  if (logger)
+    logger->inc(l_mdss_handle_client_request);
+
+  if (!mdcache->is_open()) {
+    dout(5) << "waiting for root" << dendl;
+    mdcache->wait_for_open(new C_MDS_RetryMessage(mds, req));
+    return;
+  }
+
+  bool sessionclosed_isok = replay_unsafe_with_closed_session;
+  // active session?
+  Session *session = 0;
+  if (req->is_a_client()) {
+    session = mds->get_session(req);
+    if (!session) {
+      dout(5) << "no session for " << req->get_source() << ", dropping" << dendl;
+    } else if ((session->is_closed() && (!mds->is_clientreplay() || !sessionclosed_isok)) ||
+	       session->is_closing() ||
+	       session->is_killing()) {
+      // a closed session is only tolerated during clientreplay, and only
+      // when replay_unsafe_with_closed_session is set
+      dout(5) << "session closed|closing|killing, dropping" << dendl;
+      session = NULL;
+    }
+    if (!session) {
+      // dropping the request; keep the replay queue moving
+      if (req->is_queued_for_replay())
+	mds->queue_one_replay();
+      return;
+    }
+  }
+
+  // old mdsmap?
+  // NOTE(review): this branch is empty -- no action is taken for a stale
+  // client mdsmap epoch.
+  if (req->get_mdsmap_epoch() < mds->mdsmap->get_epoch()) {
+    // send it?  hrm, this isn't ideal; they may get a lot of copies if
+    // they have a high request rate.
+  }
+
+  // completed request?
+  bool has_completed = false;
+  if (req->is_replay() || req->get_retry_attempt()) {
+    // NOTE(review): session is only looked up for client sources above, so
+    // this asserts if a non-client sends a replay/retry.
+    ceph_assert(session);
+    inodeno_t created;
+    if (session->have_completed_request(req->get_reqid().tid, &created)) {
+      has_completed = true;
+      if (!session->is_open())
+        return;
+      // Don't send traceless reply if the completed request has created
+      // new inode. Treat the request as lookup request instead.
+      if (req->is_replay() ||
+	  ((created == inodeno_t() || !mds->is_clientreplay()) &&
+	   req->get_op() != CEPH_MDS_OP_OPEN &&
+	   req->get_op() != CEPH_MDS_OP_CREATE)) {
+	dout(5) << "already completed " << req->get_reqid() << dendl;
+        auto reply = make_message<MClientReply>(*req, 0);
+	// tell the client which ino the original request created, if any
+	if (created != inodeno_t()) {
+	  bufferlist extra;
+	  encode(created, extra);
+	  reply->set_extra_bl(extra);
+	}
+        mds->send_message_client(reply, session);
+
+	if (req->is_queued_for_replay())
+	  mds->queue_one_replay();
+
+	return;
+      }
+      // open/create fall through unchanged; dispatch_client_request() turns
+      // a completed create into a plain open
+      if (req->get_op() != CEPH_MDS_OP_OPEN &&
+	  req->get_op() != CEPH_MDS_OP_CREATE) {
+	dout(10) << " completed request which created new inode " << created
+		 << ", convert it to lookup request" << dendl;
+	req->head.op = req->get_dentry_wanted() ? CEPH_MDS_OP_LOOKUP : CEPH_MDS_OP_GETATTR;
+	req->head.args.getattr.mask = CEPH_STAT_CAP_INODE_ALL;
+      }
+    }
+  }
+
+  // trim completed_request list
+  if (req->get_oldest_client_tid() > 0) {
+    dout(15) << " oldest_client_tid=" << req->get_oldest_client_tid() << dendl;
+    ceph_assert(session);
+    if (session->trim_completed_requests(req->get_oldest_client_tid())) {
+      // Sessions 'completed_requests' was dirtied, mark it to be
+      // potentially flushed at segment expiry.
+      mdlog->get_current_segment()->touched_sessions.insert(session->info.inst.name);
+
+      // the list has shrunk below half the limit: reset the warning counter
+      if (session->get_num_trim_requests_warnings() > 0 &&
+	  session->get_num_completed_requests() * 2 < g_conf()->mds_max_completed_requests)
+	session->reset_num_trim_requests_warnings();
+    } else {
+      // nothing trimmed; warn each time the list reaches the limit, which
+      // doubles with every warning already issued
+      if (session->get_num_completed_requests() >=
+	  (g_conf()->mds_max_completed_requests << session->get_num_trim_requests_warnings())) {
+	session->inc_num_trim_requests_warnings();
+	CachedStackStringStream css;
+	*css << "client." << session->get_client() << " does not advance its oldest_client_tid ("
+	   << req->get_oldest_client_tid() << "), "
+	   << session->get_num_completed_requests()
+	   << " completed requests recorded in session\n";
+	mds->clog->warn() << css->strv();
+	dout(20) << __func__ << " " << css->strv() << dendl;
+      }
+    }
+  }
+
+  // register + dispatch
+  MDRequestRef mdr = mdcache->request_start(req);
+  if (!mdr.get())
+    return;
+
+  if (session) {
+    mdr->session = session;
+    session->requests.push_back(&mdr->item_session_request);
+  }
+
+  if (has_completed)
+    mdr->has_completed = true;
+
+  // process embedded cap releases?
+  //  (only if NOT replay!)
+  if (!req->releases.empty() && req->is_a_client() && !req->is_replay()) {
+    client_t client = req->get_source().num();
+    for (const auto &r : req->releases) {
+      mds->locker->process_request_cap_release(mdr, client, r.item, r.dname);
+    }
+    req->releases.clear();
+  }
+
+  dispatch_client_request(mdr);
+  return;
+}
+
+/**
+ * Refresh is_full from the current OSD map: true only when the metadata
+ * pool is present in the map and carries pg_pool_t::FLAG_FULL.
+ */
+void Server::handle_osd_map()
+{
+  /* Note that we check the OSDMAP_FULL flag directly rather than
+   * using osdmap_full_flag(), because we want to know "is the flag set"
+   * rather than "does the flag apply to us?" */
+  mds->objecter->with_osdmap([this](const OSDMap& osdmap) {
+      const auto metadata_pool = osdmap.get_pg_pool(mds->get_metadata_pool());
+      if (metadata_pool) {
+	is_full = metadata_pool->has_flag(pg_pool_t::FLAG_FULL);
+      } else {
+	is_full = false;
+      }
+      dout(7) << __func__ << ": full = " << is_full << " epoch = "
+	      << osdmap.get_epoch() << dendl;
+    });
+}
+
+/**
+ * Route a registered client request to its per-op handler.
+ *
+ * Before dispatching:
+ *  - a killed request is dropped; if it was a batch head, a follower is
+ *    promoted to head and processed in its place (this reassigns @p mdr)
+ *  - an aborted request is killed in the cache
+ *  - write ops on a read-only FS get -CEPHFS_EROFS
+ *  - a recorded peer error is returned to the client
+ *  - while the metadata pool is full, the space-consuming ops listed below
+ *    get -CEPHFS_ENOSPC unless the client passes the MAY_FULL access check
+ *
+ * Unknown ops are answered with -CEPHFS_EOPNOTSUPP.
+ */
+void Server::dispatch_client_request(MDRequestRef& mdr)
+{
+  // we shouldn't be waiting on anyone.
+  ceph_assert(!mdr->has_more() || mdr->more()->waiting_on_peer.empty());
+
+  if (mdr->killed) {
+    dout(10) << "request " << *mdr << " was killed" << dendl;
+    //if the mdr is a "batch_op" and it has followers, pick a follower as
+    //the new "head of the batch ops" and go on processing the new one.
+    if (mdr->is_batch_head()) {
+      int mask = mdr->client_request->head.args.getattr.mask;
+      // NOTE(review): the find() result is dereferenced unchecked; this
+      // assumes a batch head always has an entry for its mask -- confirm.
+      auto it = mdr->batch_op_map->find(mask);
+      auto new_batch_head = it->second->find_new_head();
+      if (!new_batch_head) {
+	mdr->batch_op_map->erase(it);
+	return;
+      }
+      mdr = std::move(new_batch_head);
+    } else {
+      return;
+    }
+  } else if (mdr->aborted) {
+    mdr->aborted = false;
+    mdcache->request_kill(mdr);
+    return;
+  }
+
+  const cref_t<MClientRequest> &req = mdr->client_request;
+
+  if (logger) logger->inc(l_mdss_dispatch_client_request);
+
+  dout(7) << "dispatch_client_request " << *req << dendl;
+
+  if (req->may_write() && mdcache->is_readonly()) {
+    dout(10) << " read-only FS" << dendl;
+    respond_to_request(mdr, -CEPHFS_EROFS);
+    return;
+  }
+  if (mdr->has_more() && mdr->more()->peer_error) {
+    dout(10) << " got error from peers" << dendl;
+    respond_to_request(mdr, mdr->more()->peer_error);
+    return;
+  }
+  
+  if (is_full) {
+    CInode *cur = try_get_auth_inode(mdr, req->get_filepath().get_ino());
+    if (!cur) {
+      // the request is already responded to
+      return;
+    }
+    // Ops refused while full. link/rename are only refused before any peer
+    // has witnessed them.
+    // NOTE(review): CEPH_MDS_OP_SETLAYOUT used to be tested twice in this
+    // list; the redundant test was dropped. Confirm that a different op was
+    // not intended in its place.
+    if (req->get_op() == CEPH_MDS_OP_SETLAYOUT ||
+        req->get_op() == CEPH_MDS_OP_SETDIRLAYOUT ||
+        req->get_op() == CEPH_MDS_OP_RMXATTR ||
+        req->get_op() == CEPH_MDS_OP_SETXATTR ||
+        req->get_op() == CEPH_MDS_OP_CREATE ||
+        req->get_op() == CEPH_MDS_OP_SYMLINK ||
+        req->get_op() == CEPH_MDS_OP_MKSNAP ||
+	((req->get_op() == CEPH_MDS_OP_LINK ||
+	  req->get_op() == CEPH_MDS_OP_RENAME) &&
+	 (!mdr->has_more() || mdr->more()->witnessed.empty())) // haven't started peer request
+	) {
+
+      if (check_access(mdr, cur, MAY_FULL)) {
+        dout(20) << __func__ << ": full, has FULL caps, permitting op " << ceph_mds_op_name(req->get_op()) << dendl;
+      } else {
+        dout(20) << __func__ << ": full, responding CEPHFS_ENOSPC to op " << ceph_mds_op_name(req->get_op()) << dendl;
+        respond_to_request(mdr, -CEPHFS_ENOSPC);
+        return;
+      }
+    } else {
+      dout(20) << __func__ << ": full, permitting op " << ceph_mds_op_name(req->get_op()) << dendl;
+    }
+  }
+
+  switch (req->get_op()) {
+  case CEPH_MDS_OP_LOOKUPHASH:
+  case CEPH_MDS_OP_LOOKUPINO:
+    handle_client_lookup_ino(mdr, false, false);
+    break;
+  case CEPH_MDS_OP_LOOKUPPARENT:
+    handle_client_lookup_ino(mdr, true, false);
+    break;
+  case CEPH_MDS_OP_LOOKUPNAME:
+    handle_client_lookup_ino(mdr, false, true);
+    break;
+
+    // inodes ops.
+  case CEPH_MDS_OP_LOOKUP:
+    handle_client_getattr(mdr, true);
+    break;
+
+  case CEPH_MDS_OP_LOOKUPSNAP:
+    // lookupsnap does not reference a CDentry; treat it as a getattr
+  case CEPH_MDS_OP_GETATTR:
+    handle_client_getattr(mdr, false);
+    break;
+  case CEPH_MDS_OP_GETVXATTR:
+    handle_client_getvxattr(mdr);
+    break;
+
+  case CEPH_MDS_OP_SETATTR:
+    handle_client_setattr(mdr);
+    break;
+  case CEPH_MDS_OP_SETLAYOUT:
+    handle_client_setlayout(mdr);
+    break;
+  case CEPH_MDS_OP_SETDIRLAYOUT:
+    handle_client_setdirlayout(mdr);
+    break;
+  case CEPH_MDS_OP_SETXATTR:
+    handle_client_setxattr(mdr);
+    break;
+  case CEPH_MDS_OP_RMXATTR:
+    handle_client_removexattr(mdr);
+    break;
+
+  case CEPH_MDS_OP_READDIR:
+    handle_client_readdir(mdr);
+    break;
+
+  case CEPH_MDS_OP_SETFILELOCK:
+    handle_client_file_setlock(mdr);
+    break;
+
+  case CEPH_MDS_OP_GETFILELOCK:
+    handle_client_file_readlock(mdr);
+    break;
+
+    // funky.
+  case CEPH_MDS_OP_CREATE:
+    if (mdr->has_completed)
+      handle_client_open(mdr);  // already created.. just open
+    else
+      handle_client_openc(mdr);
+    break;
+
+  case CEPH_MDS_OP_OPEN:
+    handle_client_open(mdr);
+    break;
+
+    // namespace.
+    // no prior locks.
+  case CEPH_MDS_OP_MKNOD:
+    handle_client_mknod(mdr);
+    break;
+  case CEPH_MDS_OP_LINK:
+    handle_client_link(mdr);
+    break;
+  case CEPH_MDS_OP_UNLINK:
+  case CEPH_MDS_OP_RMDIR:
+    handle_client_unlink(mdr);
+    break;
+  case CEPH_MDS_OP_RENAME:
+    handle_client_rename(mdr);
+    break;
+  case CEPH_MDS_OP_MKDIR:
+    handle_client_mkdir(mdr);
+    break;
+  case CEPH_MDS_OP_SYMLINK:
+    handle_client_symlink(mdr);
+    break;
+
+
+    // snaps
+  case CEPH_MDS_OP_LSSNAP:
+    handle_client_lssnap(mdr);
+    break;
+  case CEPH_MDS_OP_MKSNAP:
+    handle_client_mksnap(mdr);
+    break;
+  case CEPH_MDS_OP_RMSNAP:
+    handle_client_rmsnap(mdr);
+    break;
+  case CEPH_MDS_OP_RENAMESNAP:
+    handle_client_renamesnap(mdr);
+    break;
+
+  default:
+    dout(1) << " unknown client op " << req->get_op() << dendl;
+    respond_to_request(mdr, -CEPHFS_EOPNOTSUPP);
+  }
+}
+
+
+// ---------------------------------------
+// PEER REQUESTS
+
+/**
+ * Handle an MMDSPeerRequest arriving from another MDS rank.
+ *
+ * Replies are passed straight to handle_peer_request_reply() and
+ * OP_RENAMENOTIFY is acked on the spot.  Everything else is matched against
+ * an existing request with the same reqid (a newer local attempt wins, an
+ * older one is closed out) or starts a new peer request, which is then run
+ * through dispatch_peer_request().
+ *
+ * @param m  the received peer request message
+ */
+void Server::handle_peer_request(const cref_t<MMDSPeerRequest> &m)
+{
+  dout(4) << "handle_peer_request " << m->get_reqid() << " from " << m->get_source() << dendl;
+  mds_rank_t from = mds_rank_t(m->get_source().num());
+
+  if (logger) logger->inc(l_mdss_handle_peer_request);
+
+  // reply?
+  if (m->is_reply())
+    return handle_peer_request_reply(m);
+
+  // the purpose of rename notify is enforcing causal message ordering. making sure
+  // bystanders have received all messages from rename srcdn's auth MDS.
+  if (m->get_op() == MMDSPeerRequest::OP_RENAMENOTIFY) {
+    auto reply = make_message<MMDSPeerRequest>(m->get_reqid(), m->get_attempt(), MMDSPeerRequest::OP_RENAMENOTIFYACK);
+    mds->send_message(reply, m->get_connection());
+    return;
+  }
+
+  // a stray dentry replica may ride along in the message; decode it and
+  // clear the buffer
+  CDentry *straydn = nullptr;
+  if (m->straybl.length() > 0) {
+    mdcache->decode_replica_stray(straydn, nullptr, m->straybl, from);
+    ceph_assert(straydn);
+    m->straybl.clear();
+  }
+
+  if (!mds->is_clientreplay() && !mds->is_active() && !mds->is_stopping()) {
+    dout(3) << "not clientreplay|active yet, waiting" << dendl;
+    mds->wait_for_replay(new C_MDS_RetryMessage(mds, m));
+    return;
+  }
+
+  // am i a new peer?
+  MDRequestRef mdr;
+  if (mdcache->have_request(m->get_reqid())) {
+    // existing?
+    mdr = mdcache->request_get(m->get_reqid());
+
+    // is my request newer?
+    if (mdr->attempt > m->get_attempt()) {
+      dout(10) << "local request " << *mdr << " attempt " << mdr->attempt << " > " << m->get_attempt()
+               << ", dropping " << *m << dendl;
+      return;
+    }
+
+    if (mdr->attempt < m->get_attempt()) {
+      // mine is old, close it out
+      dout(10) << "local request " << *mdr << " attempt " << mdr->attempt << " < " << m->get_attempt()
+               << ", closing out" << dendl;
+      mdcache->request_finish(mdr);
+      mdr.reset();
+    } else if (mdr->peer_to_mds != from) {
+      dout(10) << "local request " << *mdr << " not peer to mds." << from << dendl;
+      return;
+    }
+
+    // may get these while mdr->peer_request is non-null
+    if (m->get_op() == MMDSPeerRequest::OP_DROPLOCKS) {
+      // mdr is null here if the stale attempt was closed out just above; in
+      // that case there is no request left whose locks could be dropped, and
+      // passing a null mutation to the locker must be avoided.
+      if (mdr.get())
+        mds->locker->drop_locks(mdr.get());
+      return;
+    }
+    // with a null mdr, OP_FINISH falls through to the "missing peer request"
+    // handling below instead of dereferencing the reset reference
+    if (mdr.get() && m->get_op() == MMDSPeerRequest::OP_FINISH) {
+      if (m->is_abort()) {
+        mdr->aborted = true;
+        if (mdr->peer_request) {
+          // only abort on-going xlock, wrlock and auth pin
+          ceph_assert(!mdr->peer_did_prepare());
+        } else {
+          mdcache->request_finish(mdr);
+        }
+      } else {
+        if (m->inode_export.length() > 0)
+          mdr->more()->inode_import = m->inode_export;
+        // finish off request.
+        mdcache->request_finish(mdr);
+      }
+      return;
+    }
+  }
+  if (!mdr.get()) {
+    // new?
+    if (m->get_op() == MMDSPeerRequest::OP_FINISH) {
+      dout(10) << "missing peer request for " << m->get_reqid()
+               << " OP_FINISH, must have lost race with a forward" << dendl;
+      return;
+    }
+    mdr = mdcache->request_start_peer(m->get_reqid(), m->get_attempt(), m);
+    mdr->set_op_stamp(m->op_stamp);
+  }
+  ceph_assert(mdr->peer_request == 0);     // only one at a time, please!
+
+  // keep the decoded stray dentry pinned for the life of the request
+  if (straydn) {
+    mdr->pin(straydn);
+    mdr->straydn = straydn;
+  }
+
+  if (mds->is_clientreplay() && !mds->mdsmap->is_clientreplay(from) &&
+      mdr->locks.empty()) {
+    dout(3) << "not active yet, waiting" << dendl;
+    mds->wait_for_active(new C_MDS_RetryMessage(mds, m));
+    return;
+  }
+
+  mdr->reset_peer_request(m);
+
+  dispatch_peer_request(mdr);
+}
+
+/**
+ * Handle the reply half of the peer-request protocol.
+ *
+ * While this rank is not yet clientreplay/active/stopping, a reply is either
+ * ignored (the cache has no uncommitted leader entry for its reqid) or
+ * queued until replay is done.  OP_COMMITTED is recorded with the cache.
+ * Every other ack is matched to the local request by reqid and attempt and
+ * then routed by op code.
+ *
+ * @param m  reply message from the peer MDS
+ */
+void Server::handle_peer_request_reply(const cref_t<MMDSPeerRequest> &m)
+{
+  mds_rank_t from = mds_rank_t(m->get_source().num());
+
+  const bool serving =
+    mds->is_clientreplay() || mds->is_active() || mds->is_stopping();
+  if (!serving) {
+    metareqid_t reqid = m->get_reqid();
+    if (mdcache->have_uncommitted_leader(reqid, from)) {
+      dout(3) << "not clientreplay|active yet, waiting" << dendl;
+      mds->wait_for_replay(new C_MDS_RetryMessage(mds, m));
+    } else {
+      dout(10) << "handle_peer_request_reply ignoring peer reply from mds."
+               << from << " reqid " << reqid << dendl;
+    }
+    return;
+  }
+
+  const int op = m->get_op();
+  if (op == MMDSPeerRequest::OP_COMMITTED) {
+    metareqid_t reqid = m->get_reqid();
+    mdcache->committed_leader_peer(reqid, from);
+    return;
+  }
+
+  // replies belonging to a different attempt of the request are dropped
+  MDRequestRef mdr = mdcache->request_get(m->get_reqid());
+  if (mdr->attempt != m->get_attempt()) {
+    dout(10) << "handle_peer_request_reply " << *mdr << " ignoring reply from other attempt "
+             << m->get_attempt() << dendl;
+    return;
+  }
+
+  switch (op) {
+  case MMDSPeerRequest::OP_XLOCKACK: {
+    // identify lock, leader request
+    SimpleLock *lock = mds->locker->get_lock(m->get_lock_type(),
+                                             m->get_object_info());
+    mdr->more()->peers.insert(from);
+    lock->decode_locked_state(m->get_lock_data());
+    dout(10) << "got remote xlock on " << *lock << " on " << *lock->get_parent() << dendl;
+    mdr->emplace_lock(lock, MutationImpl::LockOp::XLOCK);
+    mdr->finish_locking(lock);
+    lock->get_xlock(mdr, mdr->get_client());
+
+    // this peer must be the only one we were waiting for
+    ceph_assert(mdr->more()->waiting_on_peer.count(from));
+    mdr->more()->waiting_on_peer.erase(from);
+    ceph_assert(mdr->more()->waiting_on_peer.empty());
+    mdcache->dispatch_request(mdr);
+    break;
+  }
+
+  case MMDSPeerRequest::OP_WRLOCKACK: {
+    // identify lock, leader request
+    SimpleLock *lock = mds->locker->get_lock(m->get_lock_type(),
+                                             m->get_object_info());
+    mdr->more()->peers.insert(from);
+    dout(10) << "got remote wrlock on " << *lock << " on " << *lock->get_parent() << dendl;
+    auto it = mdr->emplace_lock(lock, MutationImpl::LockOp::REMOTE_WRLOCK, from);
+    ceph_assert(it->is_remote_wrlock());
+    ceph_assert(it->wrlock_target == from);
+
+    mdr->finish_locking(lock);
+
+    // this peer must be the only one we were waiting for
+    ceph_assert(mdr->more()->waiting_on_peer.count(from));
+    mdr->more()->waiting_on_peer.erase(from);
+    ceph_assert(mdr->more()->waiting_on_peer.empty());
+    mdcache->dispatch_request(mdr);
+    break;
+  }
+
+  case MMDSPeerRequest::OP_AUTHPINACK:
+    handle_peer_auth_pin_ack(mdr, m);
+    break;
+
+  case MMDSPeerRequest::OP_LINKPREPACK:
+    handle_peer_link_prep_ack(mdr, m);
+    break;
+
+  case MMDSPeerRequest::OP_RMDIRPREPACK:
+    handle_peer_rmdir_prep_ack(mdr, m);
+    break;
+
+  case MMDSPeerRequest::OP_RENAMEPREPACK:
+    handle_peer_rename_prep_ack(mdr, m);
+    break;
+
+  case MMDSPeerRequest::OP_RENAMENOTIFYACK:
+    handle_peer_rename_notify_ack(mdr, m);
+    break;
+
+  default:
+    ceph_abort_msg("unknown op " + to_string(op) + " requested");
+  }
+}
+/**
+ * Run the peer operation currently attached to mdr (mdr->peer_request).
+ *
+ * If the request's abort flag is set it is simply finished.  Otherwise the
+ * op code selects the action:
+ *  - OP_XLOCK / OP_WRLOCK: acquire the named lock here and send the
+ *    matching OP_XLOCKACK / OP_WRLOCKACK back on the request's connection
+ *  - OP_UNXLOCK / OP_UNWRLOCK: release a lock this request holds (no ack)
+ *  - OP_AUTHPIN, OP_LINKPREP / OP_UNLINKPREP, OP_RMDIRPREP, OP_RENAMEPREP:
+ *    hand off to the corresponding handle_peer_*() function
+ * Any other op code ends in ceph_abort_msg().
+ *
+ * @param mdr  peer request to execute; mdr->peer_request is dereferenced
+ *             unconditionally, so it must be set
+ */
+void Server::dispatch_peer_request(MDRequestRef& mdr)
+{
+  dout(7) << "dispatch_peer_request " << *mdr << " " << *mdr->peer_request << dendl;
+
+  if (mdr->aborted) {
+    dout(7) << " abort flag set, finishing" << dendl;
+    mdcache->request_finish(mdr);
+    return;
+  }
+
+  if (logger) logger->inc(l_mdss_dispatch_peer_request);
+
+  int op = mdr->peer_request->get_op();
+  switch (op) {
+  case MMDSPeerRequest::OP_XLOCK:
+  case MMDSPeerRequest::OP_WRLOCK:
+    {
+      // identify object
+      SimpleLock *lock = mds->locker->get_lock(mdr->peer_request->get_lock_type(),
+					       mdr->peer_request->get_object_info());
+
+      if (!lock) {
+	dout(10) << "don't have object, dropping" << dendl;
+	ceph_abort_msg("don't have object"); // can this happen, if we auth pinned properly.
+      }
+      if (op == MMDSPeerRequest::OP_XLOCK && !lock->get_parent()->is_auth()) {
+	dout(10) << "not auth for remote xlock attempt, dropping on " 
+		 << *lock << " on " << *lock->get_parent() << dendl;  // no reply is sent in this branch
+      } else {
+	// use acquire_locks so that we get auth_pinning.
+	MutationImpl::LockOpVec lov;
+	for (const auto& p : mdr->locks) {  // re-list the xlocks/wrlocks this request already holds
+	  if (p.is_xlock())
+	    lov.add_xlock(p.lock);
+	  else if (p.is_wrlock())
+	    lov.add_wrlock(p.lock);
+	}
+	// then add the newly requested lock and pick the matching ack op
+	int replycode = 0;
+	switch (op) {
+	case MMDSPeerRequest::OP_XLOCK:
+	  lov.add_xlock(lock);
+	  replycode = MMDSPeerRequest::OP_XLOCKACK;
+	  break;
+	case MMDSPeerRequest::OP_WRLOCK:
+	  lov.add_wrlock(lock);
+	  replycode = MMDSPeerRequest::OP_WRLOCKACK;
+	  break;
+	}
+	// NOTE(review): on failure we return with peer_request still set; presumably acquire_locks arranged a retry - confirm
+	if (!mds->locker->acquire_locks(mdr, lov))
+	  return;
+	
+	// ack
+	auto r = make_message<MMDSPeerRequest>(mdr->reqid, mdr->attempt, replycode);
+	r->set_lock_type(lock->get_type());
+	lock->get_parent()->set_object_info(r->get_object_info());
+	if (replycode == MMDSPeerRequest::OP_XLOCKACK)  // only xlock acks carry the encoded lock state
+	  lock->encode_locked_state(r->get_lock_data());
+	mds->send_message(r, mdr->peer_request->get_connection());
+      }
+
+      // done.
+      mdr->reset_peer_request();
+    }
+    break;
+
+  case MMDSPeerRequest::OP_UNXLOCK:
+  case MMDSPeerRequest::OP_UNWRLOCK:
+    {  
+      SimpleLock *lock = mds->locker->get_lock(mdr->peer_request->get_lock_type(),
+					       mdr->peer_request->get_object_info());
+      ceph_assert(lock);
+      auto it = mdr->locks.find(lock);  // the lock being released must be held by this request
+      ceph_assert(it != mdr->locks.end());
+      bool need_issue = false;
+      switch (op) {
+      case MMDSPeerRequest::OP_UNXLOCK:
+	mds->locker->xlock_finish(it, mdr.get(), &need_issue);
+	break;
+      case MMDSPeerRequest::OP_UNWRLOCK:
+	mds->locker->wrlock_finish(it, mdr.get(), &need_issue);
+	break;
+      }
+      if (need_issue)  // the lock's parent is cast to CInode here without a type check
+	mds->locker->issue_caps(static_cast<CInode*>(lock->get_parent()));
+
+      // done.  no ack necessary.
+      mdr->reset_peer_request();
+    }
+    break;
+
+  case MMDSPeerRequest::OP_AUTHPIN:
+    handle_peer_auth_pin(mdr);
+    break;
+
+  case MMDSPeerRequest::OP_LINKPREP:
+  case MMDSPeerRequest::OP_UNLINKPREP:
+    handle_peer_link_prep(mdr);  // link and unlink prep share one handler
+    break;
+
+  case MMDSPeerRequest::OP_RMDIRPREP:
+    handle_peer_rmdir_prep(mdr);
+    break;
+
+  case MMDSPeerRequest::OP_RENAMEPREP:
+    handle_peer_rename_prep(mdr);
+    break;
+
+  default: 
+    ceph_abort_msg("unknown op "+ to_string(op)+ " received");
+  }
+}
+/**
+ * Serve an OP_AUTHPIN peer request: try to auth pin every object named in
+ * the request and answer the leader with OP_AUTHPINACK.
+ *
+ * The request fails (ack sent, local auth pins dropped) when the FS is
+ * read-only, an object is not in cache, this rank is not auth for an
+ * object, or - for nonblocking requests - an object cannot be pinned right
+ * now.  For blocking requests an unpinnable object makes us wait and retry;
+ * the same happens while the inode named by get_authpin_freeze() is still
+ * freezing.  While waiting, the leader is told once (if it asked to be
+ * notified) that the request is blocked.
+ *
+ * @param mdr  peer request whose peer_request message lists the objects
+ */
+void Server::handle_peer_auth_pin(MDRequestRef& mdr)
+{
+  dout(10) << "handle_peer_auth_pin " << *mdr << dendl;
+
+  // build list of objects
+  list<MDSCacheObject*> objects;
+  CInode *auth_pin_freeze = NULL;  // set if the request names an inode to freeze-authpin
+  bool nonblocking = mdr->peer_request->is_nonblocking();
+  bool fail = false, wouldblock = false, readonly = false;
+  ref_t<MMDSPeerRequest> reply;
+
+  if (mdcache->is_readonly()) {
+    dout(10) << " read-only FS" << dendl;
+    readonly = true;
+    fail = true;
+  }
+
+  if (!fail) {
+    for (const auto &oi : mdr->peer_request->get_authpins()) {
+      MDSCacheObject *object = mdcache->get_object(oi);
+      if (!object) {
+	dout(10) << " don't have " << oi << dendl;
+	fail = true;
+	break;
+      }
+
+      objects.push_back(object);
+      if (oi == mdr->peer_request->get_authpin_freeze())
+	auth_pin_freeze = static_cast<CInode*>(object);
+    }
+  }
+  
+  // can we auth pin them?
+  if (!fail) {
+    for (const auto& obj : objects) {
+      if (!obj->is_auth()) {
+	dout(10) << " not auth for " << *obj << dendl;
+	fail = true;
+	break;
+      }
+      if (mdr->is_auth_pinned(obj))  // already pinned by this request
+	continue;
+      if (!mdr->can_auth_pin(obj)) {
+	if (nonblocking) {
+	  dout(10) << " can't auth_pin (freezing?) " << *obj << " nonblocking" << dendl;
+	  fail = true;
+	  wouldblock = true;
+	  break;
+	}
+	// wait
+	dout(10) << " waiting for authpinnable on " << *obj << dendl;
+	obj->add_waiter(CDir::WAIT_UNFREEZE, new C_MDS_RetryRequest(mdcache, mdr));
+	mdr->drop_local_auth_pins();
+
+	mds->locker->notify_freeze_waiter(obj);
+	goto blocked;  // no ack yet; the waiter registered above retries the request
+      }
+    }
+  }
+
+  if (!fail) {
+    /* freeze authpin wrong inode */
+    if (mdr->has_more() && mdr->more()->is_freeze_authpin &&
+	mdr->more()->rename_inode != auth_pin_freeze)
+      mdr->unfreeze_auth_pin(true);
+
+    /* handle_peer_rename_prep() call freeze_inode() to wait for all other operations
+     * on the source inode to complete. This happens after all locks for the rename
+     * operation are acquired. But to acquire locks, we need auth pin locks' parent
+     * objects first. So there is an ABBA deadlock if someone auth pins the source inode
+     * after locks are acquired and before Server::handle_peer_rename_prep() is called.
+     * The solution is freeze the inode and prevent other MDRequests from getting new
+     * auth pins.
+     */
+    if (auth_pin_freeze) {
+      dout(10) << " freezing auth pin on " << *auth_pin_freeze << dendl;
+      if (!mdr->freeze_auth_pin(auth_pin_freeze)) {
+	auth_pin_freeze->add_waiter(CInode::WAIT_FROZEN, new C_MDS_RetryRequest(mdcache, mdr));
+	mds->mdlog->flush();
+	goto blocked;  // not frozen yet; retry once WAIT_FROZEN fires
+      }
+    }
+  }
+  // from here on an ack is always sent, whether we succeeded or failed
+  reply = make_message<MMDSPeerRequest>(mdr->reqid, mdr->attempt, MMDSPeerRequest::OP_AUTHPINACK);
+
+  if (fail) {
+    mdr->drop_local_auth_pins();  // just in case
+    if (readonly)
+      reply->mark_error_rofs();
+    if (wouldblock)
+      reply->mark_error_wouldblock();
+  } else {
+    // auth pin!
+    for (const auto& obj : objects) {
+      dout(10) << "auth_pinning " << *obj << dendl;
+      mdr->auth_pin(obj);
+    }
+    // return list of my auth_pins (if any)
+    for (const auto &p : mdr->object_states) {
+      if (!p.second.auth_pinned)
+	continue;
+      MDSCacheObjectInfo info;
+      p.first->set_object_info(info);
+      reply->get_authpins().push_back(info);
+      if (p.first == (MDSCacheObject*)auth_pin_freeze)  // also report which object is the frozen one
+	auth_pin_freeze->set_object_info(reply->get_authpin_freeze());
+    }
+  }
+
+  mds->send_message_mds(reply, mdr->peer_to_mds);
+  
+  // clean up this request
+  mdr->reset_peer_request();
+  return;
+
+blocked:  // reached only via goto: we are waiting, peer_request stays attached
+  if (mdr->peer_request->should_notify_blocking()) {
+    reply = make_message<MMDSPeerRequest>(mdr->reqid, mdr->attempt, MMDSPeerRequest::OP_AUTHPINACK);
+    reply->mark_req_blocked();
+    mds->send_message_mds(reply, mdr->peer_to_mds);
+    mdr->peer_request->clear_notify_blocking();  // so the blocked notice goes out only once
+  }
+  return;
+}
+
+/**
+ * Process an OP_AUTHPINACK from a peer (leader side).
+ *
+ * A "request blocked" ack makes us disable the lock cache and drop our
+ * locks.  Otherwise the set of objects the peer reports as pinned is
+ * reconciled with our per-object state, a reported error is remembered in
+ * peer_error, and the request is re-dispatched once no peer is outstanding.
+ *
+ * @param mdr  the leader request the ack belongs to
+ * @param ack  the OP_AUTHPINACK message
+ */
+void Server::handle_peer_auth_pin_ack(MDRequestRef& mdr, const cref_t<MMDSPeerRequest> &ack)
+{
+  dout(10) << "handle_peer_auth_pin_ack on " << *mdr << " " << *ack << dendl;
+  mds_rank_t from = mds_rank_t(ack->get_source().num());
+
+  if (ack->is_req_blocked()) {
+    // peer auth pin is blocked, drop locks to avoid deadlock
+    mdr->disable_lock_cache();
+    mds->locker->drop_locks(mdr.get(), nullptr);
+    return;
+  }
+
+  // added auth pins?
+  set<MDSCacheObject*> pinned;
+  for (const auto &info : ack->get_authpins()) {
+    MDSCacheObject *object = mdcache->get_object(info);
+    ceph_assert(object);  // we pinned it
+    dout(10) << " remote has pinned " << *object << dendl;
+    mdr->set_remote_auth_pinned(object, from);
+    if (info == ack->get_authpin_freeze()) {
+      mdr->set_remote_frozen_auth_pin(static_cast<CInode *>(object));
+    }
+    pinned.insert(object);
+  }
+
+  // removed frozen auth pin ?
+  if (mdr->more()->is_remote_frozen_authpin &&
+      ack->get_authpin_freeze() == MDSCacheObjectInfo()) {
+    auto stat_p = mdr->find_object_state(mdr->more()->rename_inode);
+    ceph_assert(stat_p);
+    if (stat_p->remote_auth_pinned == from)
+      mdr->more()->is_remote_frozen_authpin = false;
+  }
+
+  // removed auth pins?  anything we believed pinned by this peer that is
+  // absent from the ack is no longer pinned there.
+  for (auto& entry : mdr->object_states) {
+    auto& state = entry.second;
+    if (state.remote_auth_pinned == MDS_RANK_NONE ||
+        state.remote_auth_pinned != from)
+      continue;
+    MDSCacheObject* object = entry.first;
+    if (pinned.count(object) != 0)
+      continue;
+    dout(10) << " remote has unpinned " << *object << dendl;
+    mdr->_clear_remote_auth_pinned(state);
+  }
+
+  // note peer
+  mdr->more()->peers.insert(from);
+
+  // clear from waiting list
+  auto ret = mdr->more()->waiting_on_peer.erase(from);
+  ceph_assert(ret);
+
+  if (ack->is_error_rofs()) {
+    mdr->more()->peer_error = -CEPHFS_EROFS;
+  } else if (ack->is_error_wouldblock()) {
+    mdr->more()->peer_error = -CEPHFS_EWOULDBLOCK;
+  }
+
+  // go again?
+  if (!mdr->more()->waiting_on_peer.empty()) {
+    dout(10) << "still waiting on peers " << mdr->more()->waiting_on_peer << dendl;
+  } else {
+    mdcache->dispatch_request(mdr);
+  }
+}
+
+
+// ---------------------------------------
+// HELPERS
+
+
+/**
+ * Decide whether the request's session may perform an operation on an inode.
+ *
+ * The session's access check is consulted with the caller's uid, gid and
+ * gid list plus the setattr uid/gid carried in the request head.  On a
+ * negative result the request is answered with that error right here, so
+ * callers only need to bail out.  A request without a session is always
+ * permitted.
+ *
+ * @param mdr   request being checked
+ * @param in    inode the operation targets
+ * @param mask  access bits being requested (e.g. MAY_FULL)
+ * @return true if allowed; false if denied (the reply has been sent)
+ */
+bool Server::check_access(MDRequestRef& mdr, CInode *in, unsigned mask)
+{
+  if (!mdr->session)
+    return true;
+
+  const auto &req = mdr->client_request;
+  const int rc = mdr->session->check_access(
+    in, mask,
+    req->get_caller_uid(),
+    req->get_caller_gid(),
+    &req->get_caller_gid_list(),
+    req->head.args.setattr.uid,
+    req->head.args.setattr.gid);
+  if (rc >= 0)
+    return true;
+
+  respond_to_request(mdr, rc);
+  return false;
+}
+
+/**
+ * Check whether a directory fragment still has room for another entry.
+ *
+ * Compares the fragment's size with the mds_bal_fragment_size_max setting.
+ * When the limit is reached the request is answered with CEPHFS_ENOSPC.
+ *
+ * @param mdr  request to fail if the fragment is full
+ * @param dir  directory fragment to inspect
+ * @return true if below the limit; false if full (the reply has been sent)
+ */
+bool Server::check_fragment_space(MDRequestRef &mdr, CDir *dir)
+{
+  const auto size = dir->get_frag_size();
+  const auto max = g_conf()->mds_bal_fragment_size_max;
+
+  if (size < max) {
+    dout(20) << "fragment " << *dir << " size " << size << " < "  << max << dendl;
+    return true;
+  }
+
+  dout(10) << "fragment " << *dir << " size exceeds " << max << " (CEPHFS_ENOSPC)" << dendl;
+  respond_to_request(mdr, -CEPHFS_ENOSPC);
+  return false;
+}
+
+/**
+ * Check whether a directory has reached its maximum number of entries.
+ *
+ * The entry count is nfiles + nsubdirs from the projected dirstat of the
+ * directory's inode.  A dir_max_entries of 0 disables the check.  When the
+ * limit is reached the request is answered with CEPHFS_ENOSPC, the same
+ * error code constant check_fragment_space() uses, rather than the host's
+ * raw ENOSPC.
+ *
+ * @param mdr  request to fail if the directory is full
+ * @param in   directory fragment whose inode's dirstat is consulted
+ * @return true if another entry is allowed; false if the limit is reached
+ *         (the reply has been sent)
+ */
+bool Server::check_dir_max_entries(MDRequestRef &mdr, CDir *in)
+{
+  const uint64_t size = in->inode->get_projected_inode()->dirstat.nfiles +
+                   in->inode->get_projected_inode()->dirstat.nsubdirs;
+  if (dir_max_entries && size >= dir_max_entries) {
+    dout(10) << "entries per dir " << *in << " size exceeds " << dir_max_entries << " (ENOSPC)" << dendl;
+    respond_to_request(mdr, -CEPHFS_ENOSPC);
+    return false;
+  }
+  return true;
+}
+
+
+/**
+ * Find or create the stray dentry for inode `in` and attach it to mdr.
+ *
+ * If the request already carries a stray dentry it is returned as is.
+ * Otherwise the dentry is looked up (or added as a new null dentry) in the
+ * stray directory chosen by the cache, flagged STATE_STRAY, recorded in
+ * mdr->straydn and pinned.
+ *
+ * @param mdr  client request that needs the stray dentry
+ * @param in   inode that names the stray dentry
+ * @return the stray dentry, or nullptr if the request was answered (stray
+ *         fragment full) or parked until the stray dir unfreezes
+ */
+CDentry* Server::prepare_stray_dentry(MDRequestRef& mdr, CInode *in)
+{
+  string straydname;
+  in->name_stray_dentry(straydname);
+
+  // reuse the one already attached to this request, if any
+  if (mdr->straydn) {
+    CDentry *straydn = mdr->straydn;
+    ceph_assert(straydn->get_name() == straydname);
+    return straydn;
+  }
+
+  CDir *straydir = mdcache->get_stray_dir(in);
+
+  // replayed requests skip the fragment size check
+  if (!mdr->client_request->is_replay() &&
+      !check_fragment_space(mdr, straydir))
+    return nullptr;
+
+  CDentry *straydn = straydir->lookup(straydname);
+  if (straydn) {
+    ceph_assert(straydn->get_projected_linkage()->is_null());
+  } else if (straydir->is_frozen_dir()) {
+    dout(10) << __func__ << ": " << *straydir << " is frozen, waiting" << dendl;
+    mds->locker->drop_locks(mdr.get());
+    mdr->drop_local_auth_pins();
+    straydir->add_waiter(CInode::WAIT_UNFREEZE, new C_MDS_RetryRequest(mdcache, mdr));
+    return nullptr;
+  } else {
+    straydn = straydir->add_null_dentry(straydname);
+    straydn->mark_new();
+  }
+
+  straydn->state_set(CDentry::STATE_STRAY);
+  mdr->straydn = straydn;
+  mdr->pin(straydn);
+
+  return straydn;
+}
+
+/** prepare_new_inode
+ *
+ * Create a new CInode for a client request, fill in its initial metadata
+ * and add it to the cache.
+ *
+ * The inode number comes from the session's preallocated inos when the
+ * session is open and take_ino() yields one, otherwise from the inode
+ * table.  Both are only projected here (recorded in mdr->used_prealloc_ino
+ * / mdr->alloc_ino); the session's preallocation may be topped up as well
+ * (mdr->prealloc_inos).  b/c/m/atime are set to the request's op stamp,
+ * uid/gid come from the caller (gid from the parent if it has S_ISGID), and
+ * xattrs are decoded from the request data if any is present.
+ *
+ * @param mdr     client request; mdr->session and mdr->client_request are
+ *                dereferenced and must be set
+ * @param dir     directory fragment the inode is created in; only its
+ *                inode's projected mode/gid are read here
+ * @param useino  ino requested by the client, passed to the allocators; a
+ *                mismatch with the allocated ino is logged, not fatal
+ * @param mode    file mode for the new inode
+ * @param layout  optional layout for non-directories; when null the cache's
+ *                default_file_layout is used
+ * @return the new inode (already added to mdcache)
+ */
+CInode* Server::prepare_new_inode(MDRequestRef& mdr, CDir *dir, inodeno_t useino, unsigned mode,
+				  const file_layout_t *layout)
+{
+  CInode *in = new CInode(mdcache);
+  auto _inode = in->_get_inode();
+  
+  // Server::prepare_force_open_sessions() can re-open session in closing
+  // state. In that corner case, session's prealloc_inos are being freed.
+  // To simplify the code, we disallow using/refilling session's prealloc_ino
+  // while session is opening.
+  bool allow_prealloc_inos = mdr->session->is_open();
+
+  // assign ino
+  if (allow_prealloc_inos && (mdr->used_prealloc_ino = _inode->ino = mdr->session->take_ino(useino))) {  // assignment intended: nonzero means a prealloc ino was taken
+    mds->sessionmap.mark_projected(mdr->session);
+    dout(10) << "prepare_new_inode used_prealloc " << mdr->used_prealloc_ino
+	     << " (" << mdr->session->info.prealloc_inos.size() << " left)"
+	     << dendl;
+  } else {
+    mdr->alloc_ino = 
+      _inode->ino = mds->inotable->project_alloc_id(useino);
+    dout(10) << "prepare_new_inode alloc " << mdr->alloc_ino << dendl;
+  }
+
+  if (useino && useino != _inode->ino) {
+    dout(0) << "WARNING: client specified " << useino << " and i allocated " << _inode->ino << dendl;
+    mds->clog->error() << mdr->client_request->get_source()
+       << " specified ino " << useino
+       << " but mds." << mds->get_nodeid() << " allocated " << _inode->ino;
+    //ceph_abort(); // just for now.
+  }
+    // refill the session's preallocated inos once fewer than half the target remain projected
+  if (allow_prealloc_inos &&
+      mdr->session->get_num_projected_prealloc_inos() < g_conf()->mds_client_prealloc_inos / 2) {
+    int need = g_conf()->mds_client_prealloc_inos - mdr->session->get_num_projected_prealloc_inos();
+    mds->inotable->project_alloc_ids(mdr->prealloc_inos, need);
+    ceph_assert(mdr->prealloc_inos.size());  // or else fix projected increment semantics
+    mdr->session->pending_prealloc_inos.insert(mdr->prealloc_inos);
+    mds->sessionmap.mark_projected(mdr->session);
+    dout(10) << "prepare_new_inode prealloc " << mdr->prealloc_inos << dendl;
+  }
+
+  _inode->version = 1;
+  _inode->xattr_version = 1;
+  _inode->nlink = 1;   // FIXME
+
+  _inode->mode = mode;
+
+  // FIPS zeroization audit 20191117: this memset is not security related.
+  memset(&_inode->dir_layout, 0, sizeof(_inode->dir_layout));
+  if (_inode->is_dir()) {  // directories get a dir hash; files get the given or default layout
+    _inode->dir_layout.dl_dir_hash = g_conf()->mds_default_dir_hash;
+  } else if (layout) {
+    _inode->layout = *layout;
+  } else {
+    _inode->layout = mdcache->default_file_layout;
+  }
+
+  _inode->truncate_size = -1ull;  // not truncated, yet!
+  _inode->truncate_seq = 1; /* starting with 1, 0 is kept for no-truncation logic */
+
+  CInode *diri = dir->get_inode();
+  auto pip = diri->get_projected_inode();
+
+  dout(10) << oct << " dir mode 0" << pip->mode << " new mode 0" << mode << dec << dendl;
+  // the bit tested is S_ISGID (setgid), although the log text says "sticky"
+  if (pip->mode & S_ISGID) {
+    dout(10) << " dir is sticky" << dendl;
+    _inode->gid = pip->gid;  // inherit the parent's gid
+    if (S_ISDIR(mode)) {
+      dout(10) << " new dir also sticky" << dendl;
+      _inode->mode |= S_ISGID;  // new subdirectories inherit the bit too
+    }
+  } else {
+    _inode->gid = mdr->client_request->get_caller_gid();
+  }
+
+  _inode->uid = mdr->client_request->get_caller_uid();
+
+  _inode->btime = _inode->ctime = _inode->mtime = _inode->atime =
+    mdr->get_op_stamp();
+
+  _inode->change_attr = 0;
+
+  const cref_t<MClientRequest> &req = mdr->client_request;
+  if (req->get_data().length()) {  // a non-empty request payload is decoded as the xattr map
+    auto p = req->get_data().cbegin();
+
+    // xattrs on new inode?
+    auto _xattrs = CInode::allocate_xattr_map();
+    decode_noshare(*_xattrs, p);
+    dout(10) << "prepare_new_inode setting xattrs " << *_xattrs << dendl;
+    in->reset_xattrs(std::move(_xattrs));
+  }
+  // mark inline data as unused unless both the mdsmap and the client connection support it
+  if (!mds->mdsmap->get_inline_data_enabled() ||
+      !mdr->session->get_connection()->has_feature(CEPH_FEATURE_MDS_INLINE_DATA))
+    _inode->inline_data.version = CEPH_INLINE_NONE;
+
+  mdcache->add_inode(in);  // add
+  dout(10) << "prepare_new_inode " << *in << dendl;
+  return in;
+}
+
+/**
+ * Record the request's inode-number allocations in a journal metablob.
+ *
+ * Passes the allocated ino, the used preallocated ino and the newly
+ * preallocated range to the blob, together with the client's name and the
+ * projected sessionmap and inotable versions.
+ *
+ * @param mdr   request whose alloc_ino / used_prealloc_ino / prealloc_inos
+ *              are journaled
+ * @param blob  metablob to store them in
+ */
+void Server::journal_allocated_inos(MDRequestRef& mdr, EMetaBlob *blob)
+{
+  const auto sessionmapv = mds->sessionmap.get_projected();
+  const auto inotablev = mds->inotable->get_projected_version();
+
+  dout(20) << "journal_allocated_inos sessionmapv " << sessionmapv
+           << " inotablev " << inotablev
+           << dendl;
+
+  blob->set_ino_alloc(mdr->alloc_ino,
+                      mdr->used_prealloc_ino,
+                      mdr->prealloc_inos,
+                      mdr->client_request->get_source(),
+                      sessionmapv,
+                      inotablev);
+}
+
+/**
+ * Apply the inode-number allocations a request projected earlier.
+ *
+ * Three independent pieces may be present on the request:
+ *  - alloc_ino: an ino taken from the inode table
+ *  - prealloc_inos: a range preallocated for the session, moved from the
+ *    session's pending set into its free and recorded sets
+ *  - used_prealloc_ino: an ino consumed from the session's preallocation
+ *
+ * @param mdr      request carrying the allocations
+ * @param session  owning session; must be non-null if either prealloc
+ *                 field is set (asserted)
+ */
+void Server::apply_allocated_inos(MDRequestRef& mdr, Session *session)
+{
+  dout(10) << "apply_allocated_inos " << mdr->alloc_ino
+           << " / " << mdr->prealloc_inos
+           << " / " << mdr->used_prealloc_ino << dendl;
+
+  if (mdr->alloc_ino)
+    mds->inotable->apply_alloc_id(mdr->alloc_ino);
+
+  if (!mdr->prealloc_inos.empty()) {
+    ceph_assert(session);
+    auto& fresh = mdr->prealloc_inos;
+    session->pending_prealloc_inos.subtract(fresh);
+    session->free_prealloc_inos.insert(fresh);
+    session->info.prealloc_inos.insert(fresh);
+    // second argument is false when the used_prealloc_ino step below will
+    // mark the session dirty again
+    mds->sessionmap.mark_dirty(session, !mdr->used_prealloc_ino);
+    mds->inotable->apply_alloc_ids(fresh);
+  }
+
+  if (mdr->used_prealloc_ino) {
+    ceph_assert(session);
+    session->info.prealloc_inos.erase(mdr->used_prealloc_ino);
+    mds->sessionmap.mark_dirty(session);
+  }
+}
+
+// Completion context that forwards the result of an inode open attempt to
+// Server::_try_open_ino() together with the request and ino it was for.
+struct C_MDS_TryOpenInode : public ServerContext {
+  MDRequestRef mdr;
+  inodeno_t ino;
+
+  C_MDS_TryOpenInode(Server *srv, MDRequestRef& req, inodeno_t target)
+    : ServerContext(srv),
+      mdr(req),
+      ino(target)
+  {}
+
+  void finish(int r) override
+  {
+    server->_try_open_ino(mdr, r, ino);
+  }
+};
+
+/**
+ * Continue a client request after an attempt to open its inode.
+ *
+ * @param mdr  the client request
+ * @param r    a rank if >= 0 (dispatch locally when it is this rank,
+ *             otherwise forward the request there), else an error code
+ * @param ino  the inode that was being opened (used for logging only)
+ */
+void Server::_try_open_ino(MDRequestRef& mdr, int r, inodeno_t ino)
+{
+  dout(10) << "_try_open_ino " << mdr.get() << " ino " << ino << " r=" << r << dendl;
+
+  if (r < 0) {
+    // give up; "not found" style errors are reported to the client as stale
+    if (r == -CEPHFS_ENOENT || r == -CEPHFS_ENODATA) {
+      r = -CEPHFS_ESTALE;
+    }
+    respond_to_request(mdr, r);
+    return;
+  }
+
+  // `r` is a rank
+  mds_rank_t dest_rank(r);
+  if (dest_rank != mds->get_nodeid()) {
+    mdcache->request_forward(mdr, dest_rank);
+  } else {
+    dispatch_client_request(mdr);
+  }
+}
+
+// Completion context for MDCache::find_ino_peers().  Anything other than
+// -CEPHFS_ESTALE re-dispatches the client request; -CEPHFS_ESTALE triggers
+// one more attempt via open_ino() unless the inode is being purged.
+class C_MDS_TryFindInode : public ServerContext {
+  MDRequestRef mdr;
+  MDCache *mdcache;
+  inodeno_t ino;
+
+public:
+  C_MDS_TryFindInode(Server *srv, MDRequestRef& req, MDCache *cache, inodeno_t target)
+    : ServerContext(srv),
+      mdr(req),
+      mdcache(cache),
+      ino(target)
+  {}
+
+  void finish(int r) override
+  {
+    if (r != -CEPHFS_ESTALE) {
+      server->dispatch_client_request(mdr);
+      return;
+    }
+
+    // :( find_ino_peers failed
+    /*
+     * There is one case where, if the MDS crashes before the
+     * openfiletable journal could be flushed, the replacement
+     * MDS may not load some already opened CInodes into the
+     * MDCache.  If clients then retry requests after
+     * reconnecting, the MDS would return -ESTALE after failing
+     * to find the ino on all active peers.
+     *
+     * As a workaround users can run `ls -R ${mountpoint}`
+     * to list all the sub-files or sub-directories from the
+     * mountpoint.
+     *
+     * So try to open the ino here and then try the request again.
+     */
+    CInode *in = mdcache->get_inode(ino);
+    const bool purging = in && in->state_test(CInode::STATE_PURGING);
+    if (purging) {
+      server->respond_to_request(mdr, r);
+    } else {
+      mdcache->open_ino(ino, (int64_t)-1, new C_MDS_TryOpenInode(server, mdr, ino));
+    }
+  }
+};
+
+/**
+ * Traverse the request's filepath to its target inode and pin it.
+ *
+ * If this returns null, the request has been handled as appropriate:
+ * delayed for a retry, forwarded on, or the client's been replied to.
+ *
+ * @param mdr           client request; the result is cached in mdr->in[0]
+ * @param want_auth     require this rank to be auth for the inode (and
+ *                      auth pin it)
+ * @param no_want_auth  never upgrade to want_auth implicitly (snap paths
+ *                      and forward_all_requests_to_auth otherwise do)
+ * @return the pinned target inode, or nullptr as described above
+ */
+CInode* Server::rdlock_path_pin_ref(MDRequestRef& mdr,
+                                    bool want_auth,
+                                    bool no_want_auth)
+{
+  const filepath& refpath = mdr->get_filepath();
+  dout(10) << "rdlock_path_pin_ref " << *mdr << " " << refpath << dendl;
+
+  // path already locked: reuse the inode found earlier
+  if (mdr->locking_state & MutationImpl::PATH_LOCKED)
+    return mdr->in[0];
+
+  // traverse
+  CF_MDS_RetryRequestFactory cf(mdcache, mdr, true);
+
+  const bool last_snap = refpath.is_last_snap();
+  if (!no_want_auth && (last_snap || forward_all_requests_to_auth))
+    want_auth = true;
+
+  int flags = 0;
+  if (!last_snap)
+    flags |= MDS_TRAVERSE_RDLOCK_PATH | MDS_TRAVERSE_RDLOCK_SNAP;
+  if (want_auth)
+    flags |= MDS_TRAVERSE_WANT_AUTH;
+
+  int r = mdcache->path_traverse(mdr, cf, refpath, flags, &mdr->dn[0], &mdr->in[0]);
+  if (r > 0)
+    return nullptr; // delayed
+
+  if (r < 0) {  // error
+    if (r == -CEPHFS_ESTALE) {
+      dout(10) << "FAIL on CEPHFS_ESTALE but attempting recovery" << dendl;
+      inodeno_t ino = refpath.get_ino();
+      mdcache->find_ino_peers(ino, new C_MDS_TryFindInode(this, mdr, mdcache, ino));
+      return nullptr;
+    }
+
+    if (r == -CEPHFS_ENOENT && !mdr->dn[0].empty()) {
+      // let the reply carry the last traversed dentry if the client asked
+      if (mdr->client_request &&
+          mdr->client_request->get_dentry_wanted())
+        mdr->tracedn = mdr->dn[0].back();
+    } else {
+      dout(10) << "FAIL on error " << r << dendl;
+    }
+    respond_to_request(mdr, r);
+    return nullptr;
+  }
+
+  CInode *ref = mdr->in[0];
+  dout(10) << "ref is " << *ref << dendl;
+
+  if (want_auth) {
+    // auth_pin?
+    //   do NOT proceed if freezing, as cap release may defer in that case, and
+    //   we could deadlock when we try to lock @ref.
+    // if we're already auth_pinned, continue; the release has already been processed.
+    const bool must_wait =
+      ref->is_frozen() || ref->is_frozen_auth_pin() ||
+      (ref->is_freezing() && !mdr->is_auth_pinned(ref));
+    if (must_wait) {
+      dout(7) << "waiting for !frozen/authpinnable on " << *ref << dendl;
+      ref->add_waiter(CInode::WAIT_UNFREEZE, cf.build());
+      if (mdr->is_any_remote_auth_pin())
+        mds->locker->notify_freeze_waiter(ref);
+      return nullptr;
+    }
+    mdr->auth_pin(ref);
+  }
+
+  // set and pin ref
+  mdr->pin(ref);
+  return ref;
+}
+
+
+/** rdlock_path_xlock_dentry
+ * traverse path to the directory that could/would contain dentry.
+ * make sure i am auth for that dentry, forward as necessary.
+ * create null dentry in place (or use existing if okexist).
+ * get rdlocks on traversed dentries, xlock on new dentry.
+ *
+ * If the path is already locked (PATH_LOCKED) the last traversed dentry is
+ * returned right away.  On any failure the client has been replied to (or
+ * the request was delayed / a stale-ino recovery was started) and nullptr
+ * is returned.
+ *
+ * @param mdr          client request; traversal results land in mdr->dn[0]
+ * @param create       caller intends to create the dentry (also takes the
+ *                     rdlock requested by MDS_TRAVERSE_RDLOCK_AUTHLOCK)
+ * @param okexist      an already linked dentry is acceptable; if false a
+ *                     linked dentry yields CEPHFS_EEXIST
+ * @param want_layout  adds MDS_TRAVERSE_WANT_DIRLAYOUT to the traversal
+ * @return the dentry, or nullptr if the request was handled here
+ */
+CDentry* Server::rdlock_path_xlock_dentry(MDRequestRef& mdr,
+					  bool create, bool okexist, bool want_layout)
+{
+  const filepath& refpath = mdr->get_filepath();
+  dout(10) << "rdlock_path_xlock_dentry " << *mdr << " " << refpath << dendl;
+
+  if (mdr->locking_state & MutationImpl::PATH_LOCKED)
+    return mdr->dn[0].back();
+
+  // figure parent dir vs dname
+  if (refpath.depth() == 0) {
+    dout(7) << "invalid path (zero length)" << dendl;
+    respond_to_request(mdr, -CEPHFS_EINVAL);
+    return nullptr;
+  }
+  // a path ending in a snapshot component is refused as read-only
+  if (refpath.is_last_snap()) {
+    respond_to_request(mdr, -CEPHFS_EROFS);
+    return nullptr;
+  }
+
+  if (refpath.is_last_dot_or_dotdot()) {
+    dout(7) << "invalid path (last dot or dot_dot)" << dendl;
+    if (create)
+      respond_to_request(mdr, -CEPHFS_EEXIST);
+    else
+      respond_to_request(mdr, -CEPHFS_ENOTEMPTY);
+    return nullptr;
+  }
+
+  // traverse to parent dir
+  CF_MDS_RetryRequestFactory cf(mdcache, mdr, true);
+  int flags = MDS_TRAVERSE_RDLOCK_SNAP | MDS_TRAVERSE_RDLOCK_PATH |
+	      MDS_TRAVERSE_WANT_DENTRY | MDS_TRAVERSE_XLOCK_DENTRY |
+	      MDS_TRAVERSE_WANT_AUTH;
+  if (refpath.depth() == 1 && !mdr->lock_cache_disabled)  // lock cache is only consulted for single-component paths
+    flags |= MDS_TRAVERSE_CHECK_LOCKCACHE;
+  if (create)
+    flags |= MDS_TRAVERSE_RDLOCK_AUTHLOCK;
+  if (want_layout)
+    flags |= MDS_TRAVERSE_WANT_DIRLAYOUT;
+  int r = mdcache->path_traverse(mdr, cf, refpath, flags, &mdr->dn[0]);
+  if (r > 0)
+    return nullptr; // delayed
+  if (r < 0) {
+    if (r == -CEPHFS_ESTALE) {
+      dout(10) << "FAIL on CEPHFS_ESTALE but attempting recovery" << dendl;
+      inodeno_t ino = refpath.get_ino();
+      mdcache->find_ino_peers(ino, new C_MDS_TryFindInode(this, mdr, mdcache, ino));
+      return nullptr;
+    }
+    respond_to_request(mdr, r);
+    return nullptr;
+  }
+
+  CDentry *dn = mdr->dn[0].back();
+  CDir *dir = dn->get_dir();
+  CInode *diri = dir->get_inode();
+  // requests not originating from an MDS may not modify system directories other than root
+  if (!mdr->reqid.name.is_mds()) {
+    if (diri->is_system() && !diri->is_root()) {
+      respond_to_request(mdr, -CEPHFS_EROFS);
+      return nullptr;
+    }
+  }
+  // the parent directory itself sits in a stray dir: report ENOENT
+  if (!diri->is_base() && diri->get_projected_parent_dir()->inode->is_stray()) {
+    respond_to_request(mdr, -CEPHFS_ENOENT);
+    return nullptr;
+  }
+
+  CDentry::linkage_t *dnl = dn->get_projected_linkage();
+  if (dnl->is_null()) {
+    if (!create && okexist) {  // caller needed an existing entry but the dentry is null
+      respond_to_request(mdr, -CEPHFS_ENOENT);
+      return nullptr;
+    }
+    // raise dn->first to at least one past the global snaprealm's newest seq
+    snapid_t next_snap = mdcache->get_global_snaprealm()->get_newest_seq() + 1;
+    dn->first = std::max(dn->first, next_snap);
+  } else {
+    if (!okexist) {
+      respond_to_request(mdr, -CEPHFS_EEXIST);
+      return nullptr;
+    }
+    mdr->in[0] = dnl->get_inode();  // remember the inode the existing dentry links to
+  }
+
+  return dn;
+}
+
+/** rdlock_two_paths_xlock_destdn
+ * traverse two paths and lock the two paths in proper order.
+ * The order of taking locks is:
+ * 1. Lock directory inodes or dentries according to which trees they
+ *    are under. Lock objects under fs root before objects under mdsdir.
+ * 2. Lock directory inodes or dentries according to their depth, in
+ *    ascending order.
+ * 3. Lock directory inodes or dentries according to inode numbers or
+ *    dentries' parent inode numbers, in ascending order.
+ * 4. Lock dentries in the same directory in order of their keys.
+ * 5. Lock non-directory inodes according to inode numbers, in ascending
+ *    order.
+ *
+ * The request's primary filepath names the destination dentry (traversed
+ * into mdr->dn[0]); filepath2 names the source dentry (mdr->dn[1]).  Both
+ * paths must have depth 1, otherwise the request is answered with
+ * -CEPHFS_EINVAL.
+ *
+ * @param mdr request being processed
+ * @param xlock_srcdn if true the source dentry is xlocked (and the source
+ *        directory inode's filelock/nestlock are wrlocked, remotely when
+ *        another rank is auth for srcdir); if false srcdn is only rdlocked
+ * @returns (destdn, srcdn) on success; (nullptr, nullptr) if the request
+ *          was answered with an error here or could not finish in this pass
+ */
+std::pair<CDentry*, CDentry*>
+Server::rdlock_two_paths_xlock_destdn(MDRequestRef& mdr, bool xlock_srcdn)
+{
+  // refpath resolves to the destination dentry, refpath2 to the source dentry
+  const filepath& refpath = mdr->get_filepath();
+  const filepath& refpath2 = mdr->get_filepath2();
+
+  dout(10) << "rdlock_two_paths_xlock_destdn " << *mdr << " " << refpath << " " << refpath2 << dendl;
+  // an earlier pass already locked the paths: reuse the dentries it recorded
+  if (mdr->locking_state & MutationImpl::PATH_LOCKED)
+    return std::make_pair(mdr->dn[0].back(), mdr->dn[1].back());
+
+  if (refpath.depth() != 1 || refpath2.depth() != 1) {
+    respond_to_request(mdr, -CEPHFS_EINVAL);
+    return std::pair<CDentry*, CDentry*>(nullptr, nullptr);
+  }
+
+  if (refpath.is_last_snap() || refpath2.is_last_snap()) {
+    respond_to_request(mdr, -CEPHFS_EROFS);
+    return std::make_pair(nullptr, nullptr);
+  }
+
+  // traverse to parent dir
+  CF_MDS_RetryRequestFactory cf(mdcache, mdr, true);
+  int flags = MDS_TRAVERSE_RDLOCK_SNAP |  MDS_TRAVERSE_WANT_DENTRY | MDS_TRAVERSE_WANT_AUTH;
+  int r = mdcache->path_traverse(mdr, cf, refpath, flags, &mdr->dn[0]);
+  if (r != 0) { // a positive r sends no reply here; only negative values do
+    if (r == -CEPHFS_ESTALE) {
+      dout(10) << "CEPHFS_ESTALE on path, attempting recovery" << dendl;
+      inodeno_t ino = refpath.get_ino();
+      mdcache->find_ino_peers(ino, new C_MDS_TryFindInode(this, mdr, mdcache, ino));
+    } else if (r < 0) {
+      respond_to_request(mdr, r);
+    }
+    return std::make_pair(nullptr, nullptr);
+  }
+  // second path: traversed with RDLOCK_SNAP2 and DISCOVER instead of WANT_AUTH
+  flags = MDS_TRAVERSE_RDLOCK_SNAP2 | MDS_TRAVERSE_WANT_DENTRY | MDS_TRAVERSE_DISCOVER;
+  r = mdcache->path_traverse(mdr, cf, refpath2, flags, &mdr->dn[1]);
+  if (r != 0) {
+    if (r == -CEPHFS_ESTALE) {
+      dout(10) << "CEPHFS_ESTALE on path2, attempting recovery" << dendl;
+      inodeno_t ino = refpath2.get_ino();
+      mdcache->find_ino_peers(ino, new C_MDS_TryFindInode(this, mdr, mdcache, ino));
+    } else if (r < 0) {
+      respond_to_request(mdr, r);
+    }
+    return std::make_pair(nullptr, nullptr);
+  }
+
+  CDentry *srcdn = mdr->dn[1].back();
+  CDir *srcdir = srcdn->get_dir();
+  CDentry *destdn = mdr->dn[0].back();
+  CDir *destdir = destdn->get_dir();
+  // requests not originating from an MDS may not touch system dirs other than root
+  if (!mdr->reqid.name.is_mds()) {
+    if ((srcdir->get_inode()->is_system() && !srcdir->get_inode()->is_root()) ||
+	(destdir->get_inode()->is_system() && !destdir->get_inode()->is_root())) {
+      respond_to_request(mdr, -CEPHFS_EROFS);
+      return std::make_pair(nullptr, nullptr);
+    }
+  }
+  // destination directory's parent is a stray directory
+  if (!destdir->get_inode()->is_base() &&
+      destdir->get_inode()->get_projected_parent_dir()->inode->is_stray()) {
+    respond_to_request(mdr, -CEPHFS_ENOENT);
+    return std::make_pair(nullptr, nullptr);
+  }
+  // collect the locks to acquire
+  MutationImpl::LockOpVec lov;
+  if (srcdir->get_inode() == destdir->get_inode()) { // same directory inode (dirfrags may still differ)
+    lov.add_wrlock(&destdir->inode->filelock);
+    lov.add_wrlock(&destdir->inode->nestlock);
+    if (xlock_srcdn && srcdir != destdir) {
+      mds_rank_t srcdir_auth = srcdir->authority().first;
+      if (srcdir_auth != mds->get_nodeid()) {
+	lov.add_remote_wrlock(&srcdir->inode->filelock, srcdir_auth);
+	lov.add_remote_wrlock(&srcdir->inode->nestlock, srcdir_auth);
+      }
+    }
+    // dentries of one directory: the one with the smaller name is added first
+    if (srcdn->get_name() > destdn->get_name())
+      lov.add_xlock(&destdn->lock);
+
+    if (xlock_srcdn)
+      lov.add_xlock(&srcdn->lock);
+    else
+      lov.add_rdlock(&srcdn->lock);
+    // NOTE(review): if both names compare equal, no xlock on destdn is added in this branch -- confirm intended
+    if (srcdn->get_name() < destdn->get_name())
+      lov.add_xlock(&destdn->lock);
+  } else {
+    int cmp = mdr->compare_paths();
+    bool lock_destdir_first =
+      (cmp < 0 || (cmp == 0 && destdir->ino() < srcdir->ino()));
+    // destination side goes either before or after the source side, never both
+    if (lock_destdir_first) {
+      lov.add_wrlock(&destdir->inode->filelock);
+      lov.add_wrlock(&destdir->inode->nestlock);
+      lov.add_xlock(&destdn->lock);
+    }
+
+    if (xlock_srcdn) {
+      mds_rank_t srcdir_auth = srcdir->authority().first;
+      if (srcdir_auth == mds->get_nodeid()) {
+	lov.add_wrlock(&srcdir->inode->filelock);
+	lov.add_wrlock(&srcdir->inode->nestlock);
+      } else {
+	lov.add_remote_wrlock(&srcdir->inode->filelock, srcdir_auth);
+	lov.add_remote_wrlock(&srcdir->inode->nestlock, srcdir_auth);
+      }
+      lov.add_xlock(&srcdn->lock);
+    } else {
+      lov.add_rdlock(&srcdn->lock);
+    }
+
+    if (!lock_destdir_first) {
+      lov.add_wrlock(&destdir->inode->filelock);
+      lov.add_wrlock(&destdir->inode->nestlock);
+      lov.add_xlock(&destdn->lock);
+    }
+  }
+
+  CInode *auth_pin_freeze = nullptr; // passed to acquire_locks(); set only for a non-auth srcdn with a primary link
+  // XXX any better way to do this?
+  if (xlock_srcdn && !srcdn->is_auth()) {
+    CDentry::linkage_t *srcdnl = srcdn->get_projected_linkage();
+    auth_pin_freeze = srcdnl->is_primary() ? srcdnl->get_inode() : nullptr;
+  }
+  if (!mds->locker->acquire_locks(mdr, lov, auth_pin_freeze))
+    return std::make_pair(nullptr, nullptr);
+  // the source must exist; this is checked only once the locks are held
+  if (srcdn->get_projected_linkage()->is_null()) {
+    respond_to_request(mdr, -CEPHFS_ENOENT);
+    return std::make_pair(nullptr, nullptr);
+  }
+
+  if (destdn->get_projected_linkage()->is_null()) {
+    snapid_t next_snap = mdcache->get_global_snaprealm()->get_newest_seq() + 1;
+    destdn->first = std::max(destdn->first, next_snap);
+  }
+  // record completion so a later pass takes the PATH_LOCKED early return above
+  mdr->locking_state |= MutationImpl::PATH_LOCKED;
+
+  return std::make_pair(destdn, srcdn);
+}
+
+/**
+ * try_open_auth_dirfrag -- open dirfrag, or forward to dirfrag auth
+ *
+ * If the frag is already open it is returned when we are auth for it,
+ * otherwise the request is forwarded to the frag's authority.  If it is
+ * not open, the request is forwarded to the inode's authority when that
+ * is not us, parked until unfreeze when the inode is frozen, and the frag
+ * is opened otherwise.
+ *
+ * @param diri base inode
+ * @param fg the exact frag we want
+ * @param mdr request
+ * @returns the pointer, or NULL if it had to be delayed (but mdr is taken care of)
+ */
+CDir* Server::try_open_auth_dirfrag(CInode *diri, frag_t fg, MDRequestRef& mdr)
+{
+  if (CDir *open_dir = diri->get_dirfrag(fg)) {
+    // already open: am i auth for the dirfrag?
+    if (open_dir->is_auth())
+      return open_dir;
+
+    mds_rank_t auth = open_dir->authority().first;
+    dout(7) << "try_open_auth_dirfrag: not auth for " << *open_dir
+            << ", fw to mds." << auth << dendl;
+    mdcache->request_forward(mdr, auth);
+    return nullptr;
+  }
+
+  // not open and inode not mine?
+  if (!diri->is_auth()) {
+    mds_rank_t inauth = diri->authority().first;
+    dout(7) << "try_open_auth_dirfrag: not open, not inode auth, fw to mds." << inauth << dendl;
+    mdcache->request_forward(mdr, inauth);
+    return nullptr;
+  }
+
+  // not open and inode frozen?
+  if (diri->is_frozen()) {
+    dout(10) << "try_open_auth_dirfrag: dir inode is frozen, waiting " << *diri << dendl;
+    ceph_assert(diri->get_parent_dir());
+    diri->add_waiter(CInode::WAIT_UNFREEZE, new C_MDS_RetryRequest(mdcache, mdr));
+    return nullptr;
+  }
+
+  // invent?
+  return diri->get_or_open_dirfrag(mdcache, fg);
+}
+
+
+// ===============================================================================
+// STAT
+// handle_client_getattr: answer a getattr request or, when is_lookup is true, a lookup (mdr->tracedn is then set to the last traversed dentry).
+void Server::handle_client_getattr(MDRequestRef& mdr, bool is_lookup)
+{
+  const cref_t<MClientRequest> &req = mdr->client_request;
+
+  if (req->get_filepath().depth() == 0 && is_lookup) {
+    // refpath can't be empty for lookup but it can for
+    // getattr (we do getattr with empty refpath for mount of '/')
+    respond_to_request(mdr, -CEPHFS_EINVAL);
+    return;
+  }
+
+  bool want_auth = false;
+  int mask = req->head.args.getattr.mask;
+  if (mask & CEPH_STAT_RSTAT)
+    want_auth = true; // set want_auth for CEPH_STAT_RSTAT mask
+  // batching: requests with the same mask on the same dentry (lookup) or inode (getattr) share one Batch_Getattr_Lookup
+  if (!mdr->is_batch_head() && mdr->can_batch()) {
+    CF_MDS_RetryRequestFactory cf(mdcache, mdr, false);
+    int r = mdcache->path_traverse(mdr, cf, mdr->get_filepath(),
+				   (want_auth ? MDS_TRAVERSE_WANT_AUTH : 0),
+				   &mdr->dn[0], &mdr->in[0]);
+    if (r > 0)
+      return; // delayed
+
+    if (r < 0) {
+      // fall-thru. let rdlock_path_pin_ref() check again.
+    } else if (is_lookup) {
+      CDentry* dn = mdr->dn[0].back();
+      mdr->pin(dn);
+      auto em = dn->batch_ops.emplace(std::piecewise_construct, std::forward_as_tuple(mask), std::forward_as_tuple());
+      if (em.second) { // no batch for this mask yet: this request creates it and carries on
+	em.first->second = std::make_unique<Batch_Getattr_Lookup>(this, mdr);
+      } else { // a batch exists: join it and stop processing this request here
+	dout(20) << __func__ << ": LOOKUP op, wait for previous same getattr ops to respond. " << *mdr << dendl;
+	em.first->second->add_request(mdr);
+	return;
+      }
+    } else {
+      CInode *in = mdr->in[0];
+      mdr->pin(in);
+      auto em = in->batch_ops.emplace(std::piecewise_construct, std::forward_as_tuple(mask), std::forward_as_tuple());
+      if (em.second) { // same scheme as the lookup case, keyed on the inode's batch_ops
+	em.first->second = std::make_unique<Batch_Getattr_Lookup>(this, mdr);
+      } else {
+	dout(20) << __func__ << ": GETATTR op, wait for previous same getattr ops to respond. " << *mdr << dendl;
+	em.first->second->add_request(mdr);
+	return;
+      }
+    }
+  }
+  // resolve (and pin) the target inode; a null result ends processing of this request here
+  CInode *ref = rdlock_path_pin_ref(mdr, want_auth, false);
+  if (!ref)
+    return;
+
+  mdr->getattr_caps = mask;
+
+  /*
+   * if client currently holds the EXCL cap on a field, do not rdlock
+   * it; client's stat() will result in valid info if _either_ EXCL
+   * cap is held or MDS rdlocks and reads the value here.
+   *
+   * handling this case here is easier than weakening rdlock
+   * semantics... that would cause problems elsewhere.
+   */
+  client_t client = mdr->get_client();
+  int issued = 0;
+  Capability *cap = ref->get_client_cap(client);
+  if (cap && (mdr->snapid == CEPH_NOSNAP ||
+	      mdr->snapid <= cap->client_follows))
+    issued = cap->issued(); // issued caps count only for head requests or snapids <= cap->client_follows
+
+  // FIXME
+  MutationImpl::LockOpVec lov;
+  if ((mask & CEPH_CAP_LINK_SHARED) && !(issued & CEPH_CAP_LINK_EXCL))
+    lov.add_rdlock(&ref->linklock);
+  if ((mask & CEPH_CAP_AUTH_SHARED) && !(issued & CEPH_CAP_AUTH_EXCL))
+    lov.add_rdlock(&ref->authlock);
+  if ((mask & CEPH_CAP_XATTR_SHARED) && !(issued & CEPH_CAP_XATTR_EXCL))
+    lov.add_rdlock(&ref->xattrlock);
+  if ((mask & CEPH_CAP_FILE_SHARED) && !(issued & CEPH_CAP_FILE_EXCL)) {
+    // Don't wait on unstable filelock if client is allowed to read file size.
+    // This can reduce the response time of getattr in the case that multiple
+    // clients do stat(2) and there are writers.
+    // The downside of this optimization is that mds may not issue Fs caps along
+    // with getattr reply. Client may need to send more getattr requests.
+    if (mdr->is_rdlocked(&ref->filelock)) {
+      lov.add_rdlock(&ref->filelock);
+    } else if (ref->filelock.is_stable() ||
+	       ref->filelock.get_num_wrlocks() > 0 ||
+	       !ref->filelock.can_read(mdr->get_client())) {
+      lov.add_rdlock(&ref->filelock);
+      mdr->locking_state &= ~MutationImpl::ALL_LOCKED; // clear ALL_LOCKED since a filelock rdlock was just added to the list
+    }
+  }
+
+  if (!mds->locker->acquire_locks(mdr, lov))
+    return;
+
+  if (!check_access(mdr, ref, MAY_READ))
+    return;
+
+  utime_t now = ceph_clock_now();
+  mdr->set_mds_stamp(now);
+
+  // note which caps are requested, so we return at least a snapshot
+  // value for them.  (currently this matters for xattrs and inline data)
+  mdr->getattr_caps = mask; // same value as the assignment made right after rdlock_path_pin_ref()
+
+  mds->balancer->hit_inode(ref, META_POP_IRD, req->get_source().num());
+
+  // reply
+  dout(10) << "reply to stat on " << *req << dendl;
+  mdr->tracei = ref;
+  if (is_lookup)
+    mdr->tracedn = mdr->dn[0].back();
+  respond_to_request(mdr, 0);
+}
+
+/*
+ * Completion context handed to MDCache::open_ino() by the lookup-ino
+ * handlers; on completion it passes the request and the result code on to
+ * Server::_lookup_ino_2().
+ */
+struct C_MDS_LookupIno2 : public ServerContext {
+  MDRequestRef mdr;
+
+  C_MDS_LookupIno2(Server *srv, MDRequestRef& request)
+    : ServerContext(srv),
+      mdr(request)
+  {}
+
+  void finish(int r) override
+  {
+    server->_lookup_ino_2(mdr, r);
+  }
+};
+
+/*
+ * filepath:  ino
+ *
+ * Look an inode up by number.  With want_parent the reply carries the
+ * inode's parent directory inode; with want_dentry it carries the inode
+ * and its parent dentry (optionally checked against the directory ino in
+ * filepath2); with neither, just the inode.  Requests that carry a
+ * non-zero snapid are handed to _lookup_snap_ino().
+ */
+void Server::handle_client_lookup_ino(MDRequestRef& mdr,
+                                      bool want_parent, bool want_dentry)
+{
+  const cref_t<MClientRequest> &req = mdr->client_request;
+
+  if ((uint64_t)req->head.args.lookupino.snapid > 0) {
+    _lookup_snap_ino(mdr);
+    return;
+  }
+
+  inodeno_t ino = req->get_filepath().get_ino();
+  auto raw_ino = ino.val;
+
+  /* It's been observed [1] that a client may lookup a private ~mdsdir inode.
+   * I do not have an explanation for how that happened organically but this
+   * check will ensure that the client can no longer do that.
+   *
+   * [1] https://tracker.ceph.com/issues/49922
+   */
+  if (MDS_IS_PRIVATE_INO(raw_ino)) {
+    respond_to_request(mdr, -CEPHFS_ESTALE);
+    return;
+  }
+
+  CInode *in = mdcache->get_inode(ino);
+  if (!in) {
+    // not in cache: open it and continue in _lookup_ino_2()
+    mdcache->open_ino(ino, (int64_t)-1, new C_MDS_LookupIno2(this, mdr), false);
+    return;
+  }
+  if (in->state_test(CInode::STATE_PURGING)) {
+    respond_to_request(mdr, -CEPHFS_ESTALE);
+    return;
+  }
+
+  // check for nothing (not read or write); this still applies the
+  // path check.
+  if (!check_access(mdr, in, 0))
+    return;
+
+  CDentry *parent_dn = in->get_projected_parent_dn();
+  CInode *diri = parent_dn ? parent_dn->get_dir()->inode : nullptr;
+
+  MutationImpl::LockOpVec lov;
+  if (parent_dn && (want_parent || want_dentry)) {
+    mdr->pin(parent_dn);
+    lov.add_rdlock(&parent_dn->lock);
+  }
+
+  unsigned mask = req->head.args.lookupino.mask;
+  if (mask) {
+    Capability *cap = in->get_client_cap(mdr->get_client());
+    int issued = 0;
+    if (cap && (mdr->snapid == CEPH_NOSNAP || mdr->snapid <= cap->client_follows))
+      issued = cap->issued();
+    // FIXME
+    // permission bits, ACL/security xattrs
+    if ((mask & CEPH_CAP_AUTH_SHARED) && !(issued & CEPH_CAP_AUTH_EXCL))
+      lov.add_rdlock(&in->authlock);
+    if ((mask & CEPH_CAP_XATTR_SHARED) && !(issued & CEPH_CAP_XATTR_EXCL))
+      lov.add_rdlock(&in->xattrlock);
+
+    mdr->getattr_caps = mask;
+  }
+
+  if (!lov.empty()) {
+    if (!mds->locker->acquire_locks(mdr, lov))
+      return;
+
+    // need read access to directory inode
+    if (diri && !check_access(mdr, diri, MAY_READ))
+      return;
+  }
+
+  if (want_parent) {
+    if (in->is_base()) {
+      respond_to_request(mdr, -CEPHFS_EINVAL);
+      return;
+    }
+    if (!diri || diri->is_stray()) {
+      respond_to_request(mdr, -CEPHFS_ESTALE);
+      return;
+    }
+    dout(10) << "reply to lookup_parent " << *in << dendl;
+    mdr->tracei = diri;
+    respond_to_request(mdr, 0);
+    return;
+  }
+
+  if (!want_dentry) {
+    // plain lookup by ino
+    dout(10) << "reply to lookup_ino " << *in << dendl;
+    mdr->tracei = in;
+    respond_to_request(mdr, 0);
+    return;
+  }
+
+  // lookup_name: the parent must exist and, if the client named a
+  // directory ino, must be that directory
+  inodeno_t dirino = req->get_filepath2().get_ino();
+  if (!diri || (dirino != inodeno_t() && diri->ino() != dirino)) {
+    respond_to_request(mdr, -CEPHFS_ENOENT);
+    return;
+  }
+  dout(10) << "reply to lookup_name " << *in << dendl;
+  mdr->tracei = in;
+  mdr->tracedn = parent_dn;
+  respond_to_request(mdr, 0);
+}
+
+/*
+ * Snapshot flavour of lookup-by-ino: the request names an ino plus a
+ * snapid and, optionally, a parent directory ino and a dentry hash.
+ *
+ * If the snapped inode (or a cached inode holding data for that snap) is
+ * available it is returned right away.  Otherwise the parent directory's
+ * frag selected by the hash is opened and fetched and the request retried;
+ * once that frag is complete and the inode still was not found the reply
+ * is -CEPHFS_ESTALE.  Without a parent ino the inode itself is opened.
+ */
+void Server::_lookup_snap_ino(MDRequestRef& mdr)
+{
+  const cref_t<MClientRequest> &req = mdr->client_request;
+
+  vinodeno_t vino;
+  vino.ino = req->get_filepath().get_ino();
+  vino.snapid = (__u64)req->head.args.lookupino.snapid;
+  inodeno_t parent_ino = (__u64)req->head.args.lookupino.parent;
+  __u32 name_hash = req->head.args.lookupino.hash;
+
+  dout(7) << "lookup_snap_ino " << vino << " parent " << parent_ino << " hash " << name_hash << dendl;
+
+  CInode *in = mdcache->lookup_snap_inode(vino);
+  if (!in) {
+    // fall back to whatever inode is cached under this ino
+    CInode *cached = mdcache->get_inode(vino.ino);
+    if (cached &&
+        (cached->state_test(CInode::STATE_PURGING) ||
+         !cached->has_snap_data(vino.snapid))) {
+      if (cached->is_dir() || !parent_ino) {
+        respond_to_request(mdr, -CEPHFS_ESTALE);
+        return;
+      }
+      cached = nullptr;
+    }
+    in = cached;
+  }
+
+  if (in) {
+    dout(10) << "reply to lookup_snap_ino " << *in << dendl;
+    mdr->snapid = vino.snapid;
+    mdr->tracei = in;
+    respond_to_request(mdr, 0);
+    return;
+  }
+
+  if (!parent_ino) {
+    // no parent hint: open the inode itself and continue in _lookup_ino_2()
+    mdcache->open_ino(vino.ino, mds->get_metadata_pool(), new C_MDS_LookupIno2(this, mdr), false);
+    return;
+  }
+
+  CInode *diri = mdcache->get_inode(parent_ino);
+  if (!diri) {
+    mdcache->open_ino(parent_ino, mds->get_metadata_pool(), new C_MDS_LookupIno2(this, mdr));
+    return;
+  }
+
+  if (!diri->is_dir()) {
+    respond_to_request(mdr, -CEPHFS_EINVAL);
+    return;
+  }
+
+  MutationImpl::LockOpVec lov;
+  lov.add_rdlock(&diri->dirfragtreelock);
+  if (!mds->locker->acquire_locks(mdr, lov))
+    return;
+
+  frag_t frag = diri->dirfragtree[name_hash];
+  CDir *dir = try_open_auth_dirfrag(diri, frag, mdr);
+  if (!dir)
+    return;
+
+  if (dir->is_complete()) {
+    // the frag is fully loaded and the inode was still not found above
+    respond_to_request(mdr, -CEPHFS_ESTALE);
+    return;
+  }
+
+  if (dir->is_frozen()) {
+    mds->locker->drop_locks(mdr.get());
+    mdr->drop_local_auth_pins();
+    dir->add_waiter(CDir::WAIT_UNFREEZE, new C_MDS_RetryRequest(mdcache, mdr));
+    return;
+  }
+
+  dir->fetch(new C_MDS_RetryRequest(mdcache, mdr), true);
+}
+
+/*
+ * Continuation of an ino lookup after open_ino() completed.
+ *
+ * `r` is a rank if >= 0, else an error code.  A rank equal to ours means
+ * the request is dispatched again locally; any other rank gets the request
+ * forwarded.  On error the client is answered, with -CEPHFS_ENOENT and
+ * -CEPHFS_ENODATA reported as -CEPHFS_ESTALE.
+ */
+void Server::_lookup_ino_2(MDRequestRef& mdr, int r)
+{
+  inodeno_t ino = mdr->client_request->get_filepath().get_ino();
+  dout(10) << "_lookup_ino_2 " << mdr.get() << " ino " << ino << " r=" << r << dendl;
+
+  if (r < 0) {
+    // give up
+    const bool not_found = (r == -CEPHFS_ENOENT || r == -CEPHFS_ENODATA);
+    respond_to_request(mdr, not_found ? -CEPHFS_ESTALE : r);
+    return;
+  }
+
+  mds_rank_t dest_rank(r);
+  if (dest_rank != mds->get_nodeid()) {
+    mdcache->request_forward(mdr, dest_rank);
+    return;
+  }
+  dispatch_client_request(mdr);
+}
+
+
+/**
+ * handle_client_open -- open an existing inode on behalf of a client
+ *
+ * This function takes responsibility for the passed mdr.
+ *
+ * Resolves the request path, checks the open flags against the inode
+ * type, acquires the locks the request needs, issues caps (or records
+ * snap caps for a snapped inode) and replies with the inode, plus the
+ * dentry when the client asked for it.  An O_TRUNC open is handed to
+ * do_open_truncate() once the filelock is xlocked and no truncate is
+ * pending.
+ *
+ * @param mdr the request; on every early return below it has either been
+ *        replied to or been left with the callee that returned failure
+ */
+void Server::handle_client_open(MDRequestRef& mdr)
+{
+  const cref_t<MClientRequest> &req = mdr->client_request;
+  dout(7) << "open on " << req->get_filepath() << dendl;
+
+  int flags = req->head.args.open.flags;
+  int cmode = ceph_flags_to_mode(flags);
+  if (cmode < 0) {
+    respond_to_request(mdr, -CEPHFS_EINVAL);
+    return;
+  }
+
+  // anything but a plain read-only open is resolved on the auth MDS
+  bool need_auth = !file_mode_is_readonly(cmode) ||
+                   (flags & (CEPH_O_TRUNC | CEPH_O_DIRECTORY));
+
+  if ((cmode & CEPH_FILE_MODE_WR) && mdcache->is_readonly()) {
+    dout(7) << "read-only FS" << dendl;
+    respond_to_request(mdr, -CEPHFS_EROFS);
+    return;
+  }
+
+  CInode *cur = rdlock_path_pin_ref(mdr, need_auth);
+  if (!cur)
+    return;
+
+  if (cur->is_frozen() || cur->state_test(CInode::STATE_EXPORTINGCAPS)) {
+    ceph_assert(!need_auth);
+    mdr->locking_state &= ~(MutationImpl::PATH_LOCKED | MutationImpl::ALL_LOCKED);
+    // Resolve the path again, this time insisting on auth.  The result is
+    // assigned to the function-level 'cur' (a fresh declaration here would
+    // shadow it and the re-resolved inode would be thrown away), so the
+    // rest of the function operates on what this second resolution returned.
+    cur = rdlock_path_pin_ref(mdr, true);
+    if (!cur)
+      return;
+  }
+
+  if (!cur->is_file()) {
+    // can only open non-regular inode with mode FILE_MODE_PIN, at least for now.
+    cmode = CEPH_FILE_MODE_PIN;
+    // the inode is symlink and client wants to follow it, ignore the O_TRUNC flag.
+    if (cur->is_symlink() && !(flags & CEPH_O_NOFOLLOW))
+      flags &= ~CEPH_O_TRUNC;
+  }
+
+  dout(10) << "open flags = " << flags
+           << ", filemode = " << cmode
+           << ", need_auth = " << need_auth
+           << dendl;
+
+  // regular file?
+  /*if (!cur->inode.is_file() && !cur->inode.is_dir()) {
+    dout(7) << "not a file or dir " << *cur << dendl;
+    respond_to_request(mdr, -CEPHFS_ENXIO);                 // FIXME what error do we want?
+    return;
+    }*/
+  if ((flags & CEPH_O_DIRECTORY) && !cur->is_dir() && !cur->is_symlink()) {
+    dout(7) << "specified O_DIRECTORY on non-directory " << *cur << dendl;
+    respond_to_request(mdr, -CEPHFS_EINVAL);
+    return;
+  }
+
+  if ((flags & CEPH_O_TRUNC) && !cur->is_file()) {
+    dout(7) << "specified O_TRUNC on !(file|symlink) " << *cur << dendl;
+    // we should return -CEPHFS_EISDIR for directory, return -CEPHFS_EINVAL for other non-regular
+    respond_to_request(mdr, cur->is_dir() ? -CEPHFS_EISDIR : -CEPHFS_EINVAL);
+    return;
+  }
+
+  if (cur->get_inode()->inline_data.version != CEPH_INLINE_NONE &&
+      !mdr->session->get_connection()->has_feature(CEPH_FEATURE_MDS_INLINE_DATA)) {
+    dout(7) << "old client cannot open inline data file " << *cur << dendl;
+    respond_to_request(mdr, -CEPHFS_EPERM);
+    return;
+  }
+
+  // snapped data is read only
+  if (mdr->snapid != CEPH_NOSNAP &&
+      ((cmode & CEPH_FILE_MODE_WR) || req->may_write())) {
+    dout(7) << "snap " << mdr->snapid << " is read-only " << *cur << dendl;
+    respond_to_request(mdr, -CEPHFS_EROFS);
+    return;
+  }
+
+  MutationImpl::LockOpVec lov;
+
+  // the client may ask for attributes along with the open; rdlock the ones
+  // it does not already hold exclusively
+  unsigned mask = req->head.args.open.mask;
+  if (mask) {
+    Capability *cap = cur->get_client_cap(mdr->get_client());
+    int issued = 0;
+    if (cap && (mdr->snapid == CEPH_NOSNAP || mdr->snapid <= cap->client_follows))
+      issued = cap->issued();
+    // permission bits, ACL/security xattrs
+    if ((mask & CEPH_CAP_AUTH_SHARED) && (issued & CEPH_CAP_AUTH_EXCL) == 0)
+      lov.add_rdlock(&cur->authlock);
+    if ((mask & CEPH_CAP_XATTR_SHARED) && (issued & CEPH_CAP_XATTR_EXCL) == 0)
+      lov.add_rdlock(&cur->xattrlock);
+
+    mdr->getattr_caps = mask;
+  }
+
+  // O_TRUNC
+  if ((flags & CEPH_O_TRUNC) && !mdr->has_completed) {
+    ceph_assert(cur->is_auth());
+
+    lov.add_xlock(&cur->filelock);
+    if (!mds->locker->acquire_locks(mdr, lov))
+      return;
+
+    if (!check_access(mdr, cur, MAY_WRITE))
+      return;
+
+    // wait for pending truncate?
+    const auto& pi = cur->get_projected_inode();
+    if (pi->is_truncating()) {
+      dout(10) << " waiting for pending truncate from " << pi->truncate_from
+               << " to " << pi->truncate_size << " to complete on " << *cur << dendl;
+      mds->locker->drop_locks(mdr.get());
+      mdr->drop_local_auth_pins();
+      cur->add_waiter(CInode::WAIT_TRUNC, new C_MDS_RetryRequest(mdcache, mdr));
+      return;
+    }
+
+    do_open_truncate(mdr, cmode);
+    return;
+  }
+
+  // sync filelock if snapped.
+  //  this makes us wait for writers to flushsnaps, ensuring we get accurate metadata,
+  //  and that data itself is flushed so that we can read the snapped data off disk.
+  if (mdr->snapid != CEPH_NOSNAP && !cur->is_dir()) {
+    lov.add_rdlock(&cur->filelock);
+  }
+
+  if (!mds->locker->acquire_locks(mdr, lov))
+    return;
+
+  // 'mask' is reused from here on as the access mask for check_access()
+  mask = MAY_READ;
+  if (cmode & CEPH_FILE_MODE_WR)
+    mask |= MAY_WRITE;
+  if (!check_access(mdr, cur, mask))
+    return;
+
+  utime_t now = ceph_clock_now();
+  mdr->set_mds_stamp(now);
+
+  if (cur->is_file() || cur->is_dir()) {
+    if (mdr->snapid == CEPH_NOSNAP) {
+      // register new cap
+      Capability *cap = mds->locker->issue_new_caps(cur, cmode, mdr, nullptr);
+      if (cap)
+        dout(12) << "open issued caps " << ccap_string(cap->pending())
+                 << " for " << req->get_source()
+                 << " on " << *cur << dendl;
+    } else {
+      int caps = ceph_caps_for_mode(cmode);
+      dout(12) << "open issued IMMUTABLE SNAP caps " << ccap_string(caps)
+               << " for " << req->get_source()
+               << " snapid " << mdr->snapid
+               << " on " << *cur << dendl;
+      mdr->snap_caps = caps;
+    }
+  }
+
+  // increase max_size?
+  if (cmode & CEPH_FILE_MODE_WR)
+    mds->locker->check_inode_max_size(cur);
+
+  // make sure this inode gets into the journal
+  if (cur->is_auth() && cur->last == CEPH_NOSNAP &&
+      mdcache->open_file_table.should_log_open(cur)) {
+    EOpen *le = new EOpen(mds->mdlog);
+    mdlog->start_entry(le);
+    le->add_clean_inode(cur);
+    mdlog->submit_entry(le);
+  }
+
+  // hit pop
+  if (cmode & CEPH_FILE_MODE_WR)
+    mds->balancer->hit_inode(cur, META_POP_IWR);
+  else
+    mds->balancer->hit_inode(cur, META_POP_IRD,
+                             mdr->client_request->get_source().num());
+
+  // include the dentry in the reply only if the client wants it
+  CDentry *dn = nullptr;
+  if (req->get_dentry_wanted()) {
+    ceph_assert(mdr->dn[0].size());
+    dn = mdr->dn[0].back();
+  }
+
+  mdr->tracei = cur;
+  mdr->tracedn = dn;
+  respond_to_request(mdr, 0);
+}
+
+/*
+ * Log-completion context for handle_client_openc(): applies the projected
+ * state of the new dentry/inode, announces the new link and answers the
+ * client.
+ */
+class C_MDS_openc_finish : public ServerLogContext {
+  CDentry *dn;
+  CInode *newi;
+
+public:
+  C_MDS_openc_finish(Server *srv, MDRequestRef& request,
+                     CDentry *dentry, CInode *new_inode)
+    : ServerLogContext(srv, request),
+      dn(dentry),
+      newi(new_inode)
+  {}
+
+  void finish(int r) override
+  {
+    ceph_assert(r == 0);
+
+    // make the projected linkage the real one
+    dn->pop_projected_linkage();
+
+    // dirty inode, dn, dir
+    newi->mark_dirty(mdr->ls);
+    newi->mark_dirty_parent(mdr->ls, true);
+
+    mdr->apply();
+
+    get_mds()->locker->share_inode_max_size(newi);
+
+    // announce the new link; no request is associated with the message
+    MDRequestRef no_request;
+    get_mds()->mdcache->send_dentry_link(dn, no_request);
+
+    get_mds()->balancer->hit_inode(newi, META_POP_IWR);
+
+    server->respond_to_request(mdr, 0);
+
+    ceph_assert(g_conf()->mds_kill_openc_at != 1);
+  }
+};
+
+/* This function takes responsibility for the passed mdr
+ *
+ * handle_client_openc -- open with O_CREAT.
+ *
+ * xlocks the target dentry via rdlock_path_xlock_dentry().  If the dentry
+ * already links an inode and O_EXCL was not given, the request is passed
+ * to handle_client_open().  Otherwise a new regular-file inode is prepared
+ * with a layout derived from the directory layout (or the default file
+ * layout) plus any layout fields supplied by the client, caps are issued,
+ * and the creation is journaled as an EUpdate "openc" whose completion is
+ * handled by C_MDS_openc_finish.
+ */
+void Server::handle_client_openc(MDRequestRef& mdr)
+{
+  const cref_t<MClientRequest> &req = mdr->client_request;
+  client_t client = mdr->get_client();
+
+  dout(7) << "open w/ O_CREAT on " << req->get_filepath() << dendl;
+
+  int cmode = ceph_flags_to_mode(req->head.args.open.flags);
+  if (cmode < 0) {
+    respond_to_request(mdr, -CEPHFS_EINVAL);
+    return;
+  }
+
+  bool excl = req->head.args.open.flags & CEPH_O_EXCL;
+  CDentry *dn = rdlock_path_xlock_dentry(mdr, true, !excl, true); // create=true, okexist=!excl, want_layout=true
+  if (!dn)
+    return;
+
+  if (is_unlink_pending(dn)) {
+    wait_for_pending_unlink(dn, mdr);
+    return;
+  }
+
+  CDentry::linkage_t *dnl = dn->get_projected_linkage();
+  if (!excl && !dnl->is_null()) {
+    // it existed.
+    mds->locker->xlock_downgrade(&dn->lock, mdr.get());
+
+    MutationImpl::LockOpVec lov;
+    lov.add_rdlock(&dnl->get_inode()->snaplock);
+    if (!mds->locker->acquire_locks(mdr, lov))
+      return;
+    // from here on this is an ordinary open of an existing inode
+    handle_client_open(mdr);
+    return;
+  }
+
+  ceph_assert(dnl->is_null());
+
+  if (req->get_alternate_name().size() > alternate_name_max) {
+    dout(10) << " alternate_name longer than " << alternate_name_max << dendl;
+    respond_to_request(mdr, -CEPHFS_ENAMETOOLONG);
+    return;
+  }
+  dn->set_alternate_name(req->get_alternate_name());
+
+  // set layout
+  file_layout_t layout;
+  if (mdr->dir_layout != file_layout_t())
+    layout = mdr->dir_layout;
+  else
+    layout = mdcache->default_file_layout;
+
+  // What kind of client caps are required to complete this operation
+  uint64_t access = MAY_WRITE;
+
+  const auto default_layout = layout; // snapshot taken before client overrides, compared against below
+
+  // fill in any special params from client
+  if (req->head.args.open.stripe_unit)
+    layout.stripe_unit = req->head.args.open.stripe_unit;
+  if (req->head.args.open.stripe_count)
+    layout.stripe_count = req->head.args.open.stripe_count;
+  if (req->head.args.open.object_size)
+    layout.object_size = req->head.args.open.object_size;
+  if (req->get_connection()->has_feature(CEPH_FEATURE_CREATEPOOLID) &&
+      (__s32)req->head.args.open.pool >= 0) {
+    layout.pool_id = req->head.args.open.pool;
+
+    // make sure we have as new a map as the client
+    if (req->get_mdsmap_epoch() > mds->mdsmap->get_epoch()) {
+      mds->wait_for_mdsmap(req->get_mdsmap_epoch(), new C_MDS_RetryRequest(mdcache, mdr));
+      return;
+    }
+  }
+
+  // If client doesn't have capability to modify layout pools, then
+  // only permit this request if the requested pool matches what the
+  // file would have inherited anyway from its parent.
+  if (default_layout != layout) {
+    access |= MAY_SET_VXATTR;
+  }
+
+  if (!layout.is_valid()) {
+    dout(10) << " invalid initial file layout" << dendl;
+    respond_to_request(mdr, -CEPHFS_EINVAL);
+    return;
+  }
+  if (!mds->mdsmap->is_data_pool(layout.pool_id)) {
+    dout(10) << " invalid data pool " << layout.pool_id << dendl;
+    respond_to_request(mdr, -CEPHFS_EINVAL);
+    return;
+  }
+
+  // created null dn.
+  CDir *dir = dn->get_dir();
+  CInode *diri = dir->get_inode();
+  if (!check_access(mdr, diri, access))
+    return;
+  if (!check_fragment_space(mdr, dir))
+    return;
+  if (!check_dir_max_entries(mdr, dir))
+    return;
+  // only when exactly one dentry was traversed
+  if (mdr->dn[0].size() == 1)
+    mds->locker->create_lock_cache(mdr, diri, &mdr->dir_layout);
+
+  // create inode.
+  CInode *newi = prepare_new_inode(mdr, dn->get_dir(), inodeno_t(req->head.ino),
+				   req->head.args.open.mode | S_IFREG, &layout);
+  ceph_assert(newi);
+
+  // it's a file.
+  dn->push_projected_linkage(newi);
+
+  auto _inode = newi->_get_inode();
+  _inode->version = dn->pre_dirty();
+  if (layout.pool_id != mdcache->default_file_layout.pool_id)
+    _inode->add_old_pool(mdcache->default_file_layout.pool_id);
+  _inode->update_backtrace();
+  _inode->rstat.rfiles = 1;
+  _inode->accounted_rstat = _inode->rstat;
+
+  SnapRealm *realm = diri->find_snaprealm();
+  snapid_t follows = mdcache->get_global_snaprealm()->get_newest_seq();
+  ceph_assert(follows >= realm->get_newest_seq());
+  // dn->first is expected to have been set to newest global snap seq + 1 already
+  ceph_assert(dn->first == follows+1);
+  newi->first = dn->first;
+
+  // do the open
+  Capability *cap = mds->locker->issue_new_caps(newi, cmode, mdr, realm);
+  newi->authlock.set_state(LOCK_EXCL);
+  newi->xattrlock.set_state(LOCK_EXCL);
+
+  if (cap && (cmode & CEPH_FILE_MODE_WR)) { // writer: record an initial client range of [0, stripe_unit]
+    _inode->client_ranges[client].range.first = 0;
+    _inode->client_ranges[client].range.last = _inode->layout.stripe_unit;
+    _inode->client_ranges[client].follows = follows;
+    newi->mark_clientwriteable();
+    cap->mark_clientwriteable();
+  }
+  
+  // prepare finisher
+  mdr->ls = mdlog->get_current_segment();
+  EUpdate *le = new EUpdate(mdlog, "openc");
+  mdlog->start_entry(le);
+  le->metablob.add_client_req(req->get_reqid(), req->get_oldest_client_tid());
+  journal_allocated_inos(mdr, &le->metablob);
+  mdcache->predirty_journal_parents(mdr, &le->metablob, newi, dn->get_dir(), PREDIRTY_PRIMARY|PREDIRTY_DIR, 1);
+  le->metablob.add_primary_dentry(dn, newi, true, true, true);
+
+  // make sure this inode gets into the journal
+  le->metablob.add_opened_ino(newi->ino());
+
+  C_MDS_openc_finish *fin = new C_MDS_openc_finish(this, mdr, dn, newi);
+  // extra reply payload: openc_response_t if the session has DELEG_INO, else the bare ino if the connection has REPLY_CREATE_INODE
+  if (mdr->session->info.has_feature(CEPHFS_FEATURE_DELEG_INO)) {
+    openc_response_t	ocresp;
+
+    dout(10) << "adding created_ino and delegated_inos" << dendl;
+    ocresp.created_ino = _inode->ino;
+
+    if (delegate_inos_pct && !req->is_queued_for_replay()) {
+      // Try to delegate some prealloc_inos to the client, if it's down to half the max
+      unsigned frac = 100 / delegate_inos_pct; // NOTE(review): frac is 0 if delegate_inos_pct > 100 and the divisions below would divide by zero -- confirm the option is capped
+      if (mdr->session->delegated_inos.size() < (unsigned)g_conf()->mds_client_prealloc_inos / frac / 2)
+	mdr->session->delegate_inos(g_conf()->mds_client_prealloc_inos / frac, ocresp.delegated_inos);
+    }
+
+    encode(ocresp, mdr->reply_extra_bl);
+  } else if (mdr->client_request->get_connection()->has_feature(CEPH_FEATURE_REPLY_CREATE_INODE)) {
+    dout(10) << "adding ino to reply to indicate inode was created" << dendl;
+    // add the file created flag onto the reply if create_flags features is supported
+    encode(newi->ino(), mdr->reply_extra_bl);
+  }
+
+  journal_and_reply(mdr, newi, dn, le, fin);
+
+  // We hit_dir (via hit_inode) in our finish callback, but by then we might
+  // have overshot the split size (multiple opencs in flight), so here is
+  // an early chance to split the dir if this openc makes it oversized.
+  mds->balancer->maybe_fragment(dir, false);
+}
+
+
+
+/*
+ * Handle a client readdir request.
+ *
+ * Resolves the directory inode, applies the per-session cap-acquisition
+ * throttle, rdlocks the directory's filelock and dirfragtreelock, picks the
+ * dirfrag to read (remapping the requested frag if it is no longer a leaf
+ * of the fragtree) and encodes as many dentry + inodestat records as fit
+ * within the request's max_entries / max_bytes into mdr->reply_extra_bl.
+ *
+ * The request is re-queued through C_MDS_RetryRequest instead of being
+ * answered when it is throttled, when the dirfrag is frozen or incomplete,
+ * or when a remote dentry has to be opened before anything was encoded.
+ */
+void Server::handle_client_readdir(MDRequestRef& mdr)
+{
+  const cref_t<MClientRequest> &req = mdr->client_request;
+  Session *session = mds->get_session(req);
+  client_t client = req->get_source().num();
+  MutationImpl::LockOpVec lov;
+  CInode *diri = rdlock_path_pin_ref(mdr, false, true);
+  if (!diri) return;
+
+  // it's a directory, right?
+  if (!diri->is_dir()) {
+    // not a dir
+    dout(10) << "reply to " << *req << " readdir -CEPHFS_ENOTDIR" << dendl;
+    respond_to_request(mdr, -CEPHFS_ENOTDIR);
+    return;
+  }
+
+  auto num_caps = session->get_num_caps();
+  auto session_cap_acquisition = session->get_cap_acquisition();
+
+  // Throttle: a session that already holds more than
+  // max_caps_per_client * max_caps_throttle_ratio caps and whose cap
+  // acquisition value has reached cap_acquisition_throttle is not served
+  // now; the request is retried after caps_throttle_retry_request_timeout.
+  if (num_caps > static_cast<uint64_t>(max_caps_per_client * max_caps_throttle_ratio) && session_cap_acquisition >= cap_acquisition_throttle) {
+      dout(20) << "readdir throttled. max_caps_per_client: " << max_caps_per_client << " num_caps: " << num_caps
+	       << " session_cap_acquistion: " << session_cap_acquisition << " cap_acquisition_throttle: " << cap_acquisition_throttle << dendl;
+      if (logger)
+          logger->inc(l_mdss_cap_acquisition_throttle);
+
+      mds->timer.add_event_after(caps_throttle_retry_request_timeout, new C_MDS_RetryRequest(mdcache, mdr));
+      return;
+  }
+
+  lov.add_rdlock(&diri->filelock);
+  lov.add_rdlock(&diri->dirfragtreelock);
+
+  if (!mds->locker->acquire_locks(mdr, lov))
+    return;
+
+  if (!check_access(mdr, diri, MAY_READ))
+    return;
+
+  // which frag?
+  frag_t fg = (__u32)req->head.args.readdir.frag;
+  unsigned req_flags = (__u32)req->head.args.readdir.flags;
+  string offset_str = req->get_path2();
+
+  // a name offset (path2) takes precedence over the hash carried in the
+  // request head; the hash is derived from the name in that case
+  __u32 offset_hash = 0;
+  if (!offset_str.empty())
+    offset_hash = ceph_frag_value(diri->hash_dentry_name(offset_str));
+  else
+    offset_hash = (__u32)req->head.args.readdir.offset_hash;
+
+  dout(10) << " frag " << fg << " offset '" << offset_str << "'"
+	   << " offset_hash " << offset_hash << " flags " << req_flags << dendl;
+
+  // does the frag exist?
+  if (diri->dirfragtree[fg.value()] != fg) {
+    // The requested frag is not what the fragtree maps its value to, so
+    // pick a replacement.  With CEPH_READDIR_REPLY_BITFLAGS and an offset
+    // hash inside the requested frag, resume in the frag covering that
+    // hash; otherwise remap by the frag's own value.  Without the flag the
+    // name offset is dropped (offset_hash, computed above, is kept).
+    frag_t newfg;
+    if (req_flags & CEPH_READDIR_REPLY_BITFLAGS) {
+      if (fg.contains((unsigned)offset_hash)) {
+	newfg = diri->dirfragtree[offset_hash];
+      } else {
+	// client actually wants next frag
+	newfg = diri->dirfragtree[fg.value()];
+      }
+    } else {
+      offset_str.clear();
+      newfg = diri->dirfragtree[fg.value()];
+    }
+    dout(10) << " adjust frag " << fg << " -> " << newfg << " " << diri->dirfragtree << dendl;
+    fg = newfg;
+  }
+  
+  CDir *dir = try_open_auth_dirfrag(diri, fg, mdr);
+  if (!dir) return;
+
+  // ok!
+  dout(10) << "handle_client_readdir on " << *dir << dendl;
+  ceph_assert(dir->is_auth());
+
+  if (!dir->is_complete()) {
+    if (dir->is_frozen()) {
+      // cannot fetch a frozen dirfrag: release what we hold and retry
+      // once it is unfrozen
+      dout(7) << "dir is frozen " << *dir << dendl;
+      mds->locker->drop_locks(mdr.get());
+      mdr->drop_local_auth_pins();
+      dir->add_waiter(CDir::WAIT_UNFREEZE, new C_MDS_RetryRequest(mdcache, mdr));
+      return;
+    }
+    // fetch
+    dout(10) << " incomplete dir contents for readdir on " << *dir << ", fetching" << dendl;
+    dir->fetch(new C_MDS_RetryRequest(mdcache, mdr), true);
+    return;
+  }
+
+#ifdef MDS_VERIFY_FRAGSTAT
+  dir->verify_fragstat();
+#endif
+
+  utime_t now = ceph_clock_now();
+  mdr->set_mds_stamp(now);
+
+  snapid_t snapid = mdr->snapid;
+  dout(10) << "snapid " << snapid << dendl;
+
+  SnapRealm *realm = diri->find_snaprealm();
+
+  // a zero max_entries / max_bytes in the request means "no explicit
+  // limit"; substitute defaults
+  unsigned max = req->head.args.readdir.max_entries;
+  if (!max)
+    max = dir->get_num_any();  // whatever, something big.
+  unsigned max_bytes = req->head.args.readdir.max_bytes;
+  if (!max_bytes)
+    // make sure at least one item can be encoded
+    max_bytes = (512 << 10) + g_conf()->mds_max_xattr_pairs_size;
+
+  // start final blob
+  bufferlist dirbl;
+  DirStat ds;
+  ds.frag = dir->get_frag();
+  ds.auth = dir->get_dir_auth().first;
+  if (dir->is_auth() && !forward_all_requests_to_auth)
+    dir->get_dist_spec(ds.dist, mds->get_nodeid());
+
+  dir->encode_dirstat(dirbl, mdr->session->info, ds);
+
+  // count bytes available.
+  //  this isn't perfect, but we should capture the main variable/unbounded size items!
+  //  (front_bytes also reserves room for the numfiles/flags header encoded
+  //  after the loop)
+  int front_bytes = dirbl.length() + sizeof(__u32) + sizeof(__u8)*2;
+  int bytes_left = max_bytes - front_bytes;
+  bytes_left -= realm->get_snap_trace().length();
+
+  // build dir contents
+  bufferlist dnbl;
+  __u32 numfiles = 0;
+  // "start" == reading from the beginning of the frag (no offset at all)
+  bool start = !offset_hash && offset_str.empty();
+  // skip all dns < dentry_key_t(snapid, offset_str, offset_hash)
+  dentry_key_t skip_key(snapid, offset_str.c_str(), offset_hash);
+  auto it = start ? dir->begin() : dir->lower_bound(skip_key);
+  bool end = (it == dir->end());
+  // NOTE: `end` is refreshed only by the loop's update expression, so a
+  // `break` below leaves it false while `continue` re-evaluates it.
+  for (; !end && numfiles < max; end = (it == dir->end())) {
+    CDentry *dn = it->second;
+    // advance up front so every `continue` below moves to the next dentry
+    ++it;
+
+    if (dn->state_test(CDentry::STATE_PURGING))
+      continue;
+
+    // use the projected linkage when use_projected() says so for this
+    // client/request, the current linkage otherwise
+    bool dnp = dn->use_projected(client, mdr);
+    CDentry::linkage_t *dnl = dnp ? dn->get_projected_linkage() : dn->get_linkage();
+
+    if (dnl->is_null())
+      continue;
+
+    if (dn->last < snapid || dn->first > snapid) {
+      dout(20) << "skipping non-overlapping snap " << *dn << dendl;
+      continue;
+    }
+
+    if (!start) {
+      // only return dentries strictly after the client's offset
+      dentry_key_t offset_key(dn->last, offset_str.c_str(), offset_hash);
+      if (!(offset_key < dn->key()))
+	continue;
+    }
+
+    CInode *in = dnl->get_inode();
+
+    if (in && in->ino() == CEPH_INO_CEPH)
+      continue;
+
+    // remote link?
+    // better for the MDS to do the work, if we think the client will stat any of these files.
+    if (dnl->is_remote() && !in) {
+      in = mdcache->get_inode(dnl->get_remote_ino());
+      if (in) {
+	dn->link_remote(dnl, in);
+      } else if (dn->state_test(CDentry::STATE_BADREMOTEINO)) {
+	dout(10) << "skipping bad remote ino on " << *dn << dendl;
+	continue;
+      } else {
+	// touch everything i _do_ have
+	for (auto &p : *dir) {
+	  if (!p.second->get_linkage()->is_null())
+	    mdcache->lru.lru_touch(p.second);
+        }
+
+	// already issued caps and leases, reply immediately.
+	if (dnbl.length() > 0) {
+	  mdcache->open_remote_dentry(dn, dnp, new C_MDSInternalNoop);
+	  dout(10) << " open remote dentry after caps were issued, stopping at "
+		   << dnbl.length() << " < " << bytes_left << dendl;
+	  break;
+	}
+
+	// nothing encoded yet: drop locks/pins and retry the whole request
+	// once the remote inode has been opened
+	mds->locker->drop_locks(mdr.get());
+	mdr->drop_local_auth_pins();
+	mdcache->open_remote_dentry(dn, dnp, new C_MDS_RetryRequest(mdcache, mdr));
+	return;
+      }
+    }
+    ceph_assert(in);
+
+    // cheap pre-check: name + length prefix + lease must fit before we
+    // encode anything for this dentry
+    if ((int)(dnbl.length() + dn->get_name().length() + sizeof(__u32) + sizeof(LeaseStat)) > bytes_left) {
+      dout(10) << " ran out of room, stopping at " << dnbl.length() << " < " << bytes_left << dendl;
+      break;
+    }
+    
+    // remember where this entry begins so it can be rolled back
+    unsigned start_len = dnbl.length();
+
+    // dentry
+    dout(12) << "including    dn " << *dn << dendl;
+    encode(dn->get_name(), dnbl);
+    mds->locker->issue_client_lease(dn, in, mdr, now, dnbl);
+
+    // inode
+    dout(12) << "including inode " << *in << dendl;
+    int r = in->encode_inodestat(dnbl, mdr->session, realm, snapid, bytes_left - (int)dnbl.length());
+    if (r < 0) {
+      // chop off dn->name, lease
+      dout(10) << " ran out of room, stopping at " << start_len << " < " << bytes_left << dendl;
+      bufferlist keep;
+      keep.substr_of(dnbl, 0, start_len);
+      dnbl.swap(keep);
+      break;
+    }
+    ceph_assert(r >= 0);
+    numfiles++;
+
+    // touch dn
+    mdcache->lru.lru_touch(dn);
+  }
+  
+  session->touch_readdir_cap(numfiles);
+
+  __u16 flags = 0;
+  if (end) {
+    // iterator reached the end of the dirfrag
+    flags = CEPH_READDIR_FRAG_END;
+    if (start)
+      flags |= CEPH_READDIR_FRAG_COMPLETE; // FIXME: what purpose does this serve
+  }
+  // client only understand END and COMPLETE flags ?
+  if (req_flags & CEPH_READDIR_REPLY_BITFLAGS) {
+    flags |= CEPH_READDIR_HASH_ORDER | CEPH_READDIR_OFFSET_HASH;
+  }
+  
+  // finish final blob
+  encode(numfiles, dirbl);
+  encode(flags, dirbl);
+  dirbl.claim_append(dnbl);
+  
+  // yay, reply
+  dout(10) << "reply to " << *req << " readdir num=" << numfiles
+	   << " bytes=" << dirbl.length()
+	   << " start=" << (int)start
+	   << " end=" << (int)end
+	   << dendl;
+  mdr->reply_extra_bl = dirbl;
+
+  // bump popularity.  NOTE: this doesn't quite capture it.
+  mds->balancer->hit_dir(dir, META_POP_READDIR, -1, numfiles);
+  
+  // reply
+  mdr->tracei = diri;
+  respond_to_request(mdr, 0);
+}
+
+
+
+// ===============================================================================
+// INODE UPDATES
+
+
+/*
+ * Journal-completion callback shared by the simple inode update handlers
+ * (setattr, open-truncate, setlayout, ...).
+ *
+ * Once the log entry is safe it applies the projected change, optionally
+ * starts the truncate and/or sends snap realm notifications, bumps the
+ * inode's write popularity, replies to the client and, if requested,
+ * shares the inode's new max_size.
+ */
+class C_MDS_inode_update_finish : public ServerLogContext {
+  CInode *in;
+  bool truncating_smaller;
+  bool changed_ranges;
+  bool adjust_realm;
+
+public:
+  C_MDS_inode_update_finish(Server *srv, MDRequestRef& req, CInode *inode,
+                            bool smaller = false, bool ranges = false,
+                            bool realm = false)
+    : ServerLogContext(srv, req),
+      in(inode),
+      truncating_smaller(smaller),
+      changed_ranges(ranges),
+      adjust_realm(realm)
+  { }
+
+  void finish(int r) override {
+    ceph_assert(r == 0);
+
+    // Sampled before apply() on purpose: the op depends on whether the
+    // inode had a snaprealm prior to applying the projected state
+    // (presumably apply() can create one -- confirm).
+    int snap_op;
+    if (in->snaprealm)
+      snap_op = CEPH_SNAP_OP_UPDATE;
+    else
+      snap_op = CEPH_SNAP_OP_SPLIT;
+
+    // make the projected changes current
+    mdr->apply();
+
+    MDSRank *mds = get_mds();
+
+    // kick off the truncate and tell clients about it
+    const bool start_truncate =
+      truncating_smaller && in->get_inode()->is_truncating();
+    if (start_truncate) {
+      mds->locker->issue_truncate(in);
+      mds->mdcache->truncate_inode(in, mdr->ls);
+    }
+
+    // propagate snap realm changes
+    if (adjust_realm) {
+      mds->mdcache->send_snap_update(in, 0, snap_op);
+      mds->mdcache->do_realm_invalidate_and_update_notify(in, snap_op);
+    }
+
+    get_mds()->balancer->hit_inode(in, META_POP_IWR);
+
+    server->respond_to_request(mdr, 0);
+
+    // only after the reply went out: advertise the new client ranges
+    if (changed_ranges) {
+      get_mds()->locker->share_inode_max_size(in);
+    }
+  }
+};
+
+/*
+ * Handle a client request to set, release or cancel a file lock.
+ *
+ * xlocks the inode's flocklock, copies the request arguments into a
+ * ceph_filelock and applies it to the flock or fcntl lock state selected by
+ * filelock_change.rule (the *_INTR rules additionally mark the request as
+ * an interrupt).
+ *
+ * Replies:
+ *   0                    unlock handled, or lock acquired
+ *   -CEPHFS_EOPNOTSUPP   unknown rule
+ *   -CEPHFS_EINTR        request had been waiting but its waiting entry is
+ *                        gone (cancelled)
+ *   -CEPHFS_EDEADLK      add_lock() reported a deadlock
+ *   -CEPHFS_EWOULDBLOCK  lock unavailable and the client did not ask to wait
+ * If the lock is unavailable and the client asked to wait, no reply is sent;
+ * the request is parked on CInode::WAIT_FLOCK and retried later.
+ */
+void Server::handle_client_file_setlock(MDRequestRef& mdr)
+{
+  const cref_t<MClientRequest> &req = mdr->client_request;
+  MutationImpl::LockOpVec lov;
+
+  // get the inode to operate on, and set up any locks needed for that
+  CInode *cur = rdlock_path_pin_ref(mdr, true);
+  if (!cur)
+    return;
+
+  lov.add_xlock(&cur->flocklock);
+  /* acquire_locks will return true if it gets the locks. If it fails,
+     it will redeliver this request at a later date, so drop the request.
+   */
+  if (!mds->locker->acquire_locks(mdr, lov)) {
+    dout(10) << "handle_client_file_setlock could not get locks!" << dendl;
+    return;
+  }
+
+  // copy the lock change into a ceph_filelock so we can store/apply it
+  ceph_filelock set_lock;
+  set_lock.start = req->head.args.filelock_change.start;
+  set_lock.length = req->head.args.filelock_change.length;
+  set_lock.client = req->get_orig_source().num();
+  set_lock.owner = req->head.args.filelock_change.owner;
+  set_lock.pid = req->head.args.filelock_change.pid;
+  set_lock.type = req->head.args.filelock_change.type;
+  bool will_wait = req->head.args.filelock_change.wait;
+
+  dout(10) << "handle_client_file_setlock: " << set_lock << dendl;
+
+  ceph_lock_state_t *lock_state = nullptr;
+  bool interrupt = false;
+
+  // get the appropriate lock state
+  switch (req->head.args.filelock_change.rule) {
+  case CEPH_LOCK_FLOCK_INTR:
+    interrupt = true;
+    // fall-thru
+  case CEPH_LOCK_FLOCK:
+    lock_state = cur->get_flock_lock_state();
+    break;
+
+  case CEPH_LOCK_FCNTL_INTR:
+    interrupt = true;
+    // fall-thru
+  case CEPH_LOCK_FCNTL:
+    lock_state = cur->get_fcntl_lock_state();
+    break;
+
+  default:
+    // The value we failed to recognise is the rule (the switch operand),
+    // not the lock type; print it as a number rather than as a raw char.
+    dout(10) << "got unknown lock rule "
+	     << static_cast<int>(req->head.args.filelock_change.rule)
+	     << ", dropping request!" << dendl;
+    respond_to_request(mdr, -CEPHFS_EOPNOTSUPP);
+    return;
+  }
+
+  dout(10) << " state prior to lock change: " << *lock_state << dendl;
+  if (CEPH_LOCK_UNLOCK == set_lock.type) {
+    list<ceph_filelock> activated_locks;
+    MDSContext::vec waiters;
+    if (lock_state->is_waiting(set_lock)) {
+      // cancelling a lock request that is still queued
+      dout(10) << " unlock removing waiting lock " << set_lock << dendl;
+      lock_state->remove_waiting(set_lock);
+      cur->take_waiting(CInode::WAIT_FLOCK, waiters);
+    } else if (!interrupt) {
+      // a real unlock; an interrupt that finds nothing waiting is a no-op
+      dout(10) << " unlock attempt on " << set_lock << dendl;
+      lock_state->remove_lock(set_lock, activated_locks);
+      cur->take_waiting(CInode::WAIT_FLOCK, waiters);
+    }
+    // wake the requests parked on WAIT_FLOCK so they can retry
+    mds->queue_waiters(waiters);
+
+    respond_to_request(mdr, 0);
+  } else {
+    dout(10) << " lock attempt on " << set_lock << dendl;
+    bool deadlock = false;
+    if (mdr->more()->flock_was_waiting &&
+	!lock_state->is_waiting(set_lock)) {
+      dout(10) << " was waiting for lock but not anymore, must have been canceled " << set_lock << dendl;
+      respond_to_request(mdr, -CEPHFS_EINTR);
+    } else if (!lock_state->add_lock(set_lock, will_wait, mdr->more()->flock_was_waiting, &deadlock)) {
+      dout(10) << " it failed on this attempt" << dendl;
+      // couldn't set lock right now
+      if (deadlock) {
+	respond_to_request(mdr, -CEPHFS_EDEADLK);
+      } else if (!will_wait) {
+	respond_to_request(mdr, -CEPHFS_EWOULDBLOCK);
+      } else {
+	// park the request: release locks/pins and retry when the
+	// WAIT_FLOCK waiters are woken
+	dout(10) << " added to waiting list" << dendl;
+	ceph_assert(lock_state->is_waiting(set_lock));
+	mdr->more()->flock_was_waiting = true;
+	mds->locker->drop_locks(mdr.get());
+	mdr->drop_local_auth_pins();
+	mdr->mark_event("failed to add lock, waiting");
+	mdr->mark_nowarn();
+	cur->add_waiter(CInode::WAIT_FLOCK, new C_MDS_RetryRequest(mdcache, mdr));
+      }
+    } else
+      respond_to_request(mdr, 0);
+  }
+  dout(10) << " state after lock change: " << *lock_state << dendl;
+}
+
+/*
+ * Handle a client query about a file lock (the "get lock" operation).
+ *
+ * rdlocks the inode's flocklock, builds a ceph_filelock describing the
+ * lock the client asks about and passes it to look_for_lock() on the flock
+ * or fcntl lock state selected by filelock_change.rule.  The ceph_filelock
+ * (as left by look_for_lock(), which takes it by non-const argument and
+ * presumably fills in a conflicting lock -- confirm) is encoded into the
+ * reply's extra payload and the request is answered with 0.
+ *
+ * An unknown rule is answered with -CEPHFS_EINVAL.
+ */
+void Server::handle_client_file_readlock(MDRequestRef& mdr)
+{
+  const cref_t<MClientRequest> &req = mdr->client_request;
+  MutationImpl::LockOpVec lov;
+
+  // get the inode to operate on, and set up any locks needed for that
+  CInode *cur = rdlock_path_pin_ref(mdr, true);
+  if (!cur)
+    return;
+
+  /* acquire_locks will return true if it gets the locks. If it fails,
+     it will redeliver this request at a later date, so drop the request.
+  */
+  lov.add_rdlock(&cur->flocklock);
+  if (!mds->locker->acquire_locks(mdr, lov)) {
+    dout(10) << "handle_client_file_readlock could not get locks!" << dendl;
+    return;
+  }
+  
+  // copy the lock change into a ceph_filelock so we can store/apply it
+  ceph_filelock checking_lock;
+  checking_lock.start = req->head.args.filelock_change.start;
+  checking_lock.length = req->head.args.filelock_change.length;
+  checking_lock.client = req->get_orig_source().num();
+  checking_lock.owner = req->head.args.filelock_change.owner;
+  checking_lock.pid = req->head.args.filelock_change.pid;
+  checking_lock.type = req->head.args.filelock_change.type;
+
+  // get the appropriate lock state
+  ceph_lock_state_t *lock_state = nullptr;
+  switch (req->head.args.filelock_change.rule) {
+  case CEPH_LOCK_FLOCK:
+    lock_state = cur->get_flock_lock_state();
+    break;
+
+  case CEPH_LOCK_FCNTL:
+    lock_state = cur->get_fcntl_lock_state();
+    break;
+
+  default:
+    // The unrecognised value is the rule (the switch operand), not the
+    // lock type; print it as a number rather than as a raw char.
+    dout(10) << "got unknown lock rule "
+	     << static_cast<int>(req->head.args.filelock_change.rule) << dendl;
+    respond_to_request(mdr, -CEPHFS_EINVAL);
+    return;
+  }
+  lock_state->look_for_lock(checking_lock);
+
+  // hand the (possibly updated) lock description back to the client
+  bufferlist lock_bl;
+  encode(checking_lock, lock_bl);
+
+  mdr->reply_extra_bl = lock_bl;
+  respond_to_request(mdr, 0);
+}
+
+/*
+ * Handle a client setattr request (mode / uid / gid / times / size).
+ *
+ * Rejects snapshot inodes (-CEPHFS_EROFS) and non-base inodes below
+ * MDS_INO_SYSTEM_BASE (-CEPHFS_EPERM), takes the locks implied by the
+ * setattr mask, checks access (adding MAY_CHOWN / MAY_CHGRP when the
+ * owner or group actually changes), then projects the inode change,
+ * journals it as an EUpdate ("setattr") and replies through
+ * C_MDS_inode_update_finish.
+ *
+ * While is_full is set, a size change that grows the file is refused with
+ * -CEPHFS_ENOSPC; shrinks are still allowed.  A shrink that arrives while
+ * an earlier truncate is still in progress is parked on WAIT_TRUNC and
+ * retried.
+ */
+void Server::handle_client_setattr(MDRequestRef& mdr)
+{
+  const cref_t<MClientRequest> &req = mdr->client_request;
+  MutationImpl::LockOpVec lov;
+  CInode *cur = rdlock_path_pin_ref(mdr, true);
+  if (!cur) return;
+
+  if (mdr->snapid != CEPH_NOSNAP) {
+    respond_to_request(mdr, -CEPHFS_EROFS);
+    return;
+  }
+  if (cur->ino() < MDS_INO_SYSTEM_BASE && !cur->is_base()) {
+    respond_to_request(mdr, -CEPHFS_EPERM);
+    return;
+  }
+
+  __u32 mask = req->head.args.setattr.mask;
+  __u32 access_mask = MAY_WRITE;
+
+  // xlock inode
+  if (mask & (CEPH_SETATTR_MODE|CEPH_SETATTR_UID|CEPH_SETATTR_GID|CEPH_SETATTR_BTIME|CEPH_SETATTR_KILL_SGUID))
+    lov.add_xlock(&cur->authlock);
+  if (mask & (CEPH_SETATTR_MTIME|CEPH_SETATTR_ATIME|CEPH_SETATTR_SIZE))
+    lov.add_xlock(&cur->filelock);
+  if (mask & CEPH_SETATTR_CTIME)
+    lov.add_wrlock(&cur->versionlock);
+
+  if (!mds->locker->acquire_locks(mdr, lov))
+    return;
+
+  // changing owner/group to a different value needs extra permission
+  if ((mask & CEPH_SETATTR_UID) && (cur->get_inode()->uid != req->head.args.setattr.uid))
+    access_mask |= MAY_CHOWN;
+
+  if ((mask & CEPH_SETATTR_GID) && (cur->get_inode()->gid != req->head.args.setattr.gid))
+    access_mask |= MAY_CHGRP;
+
+  if (!check_access(mdr, cur, access_mask))
+    return;
+
+  // trunc from bigger -> smaller?
+  const auto& pip = cur->get_projected_inode();
+
+  // the larger of our projected size and the size the client believes
+  uint64_t old_size = std::max<uint64_t>(pip->size, req->head.args.setattr.old_size);
+
+  // CEPHFS_ENOSPC on growing file while full, but allow shrinks.
+  // Only a request that actually sets the size can grow the file, so the
+  // size field is ignored unless CEPH_SETATTR_SIZE is in the mask.
+  if (is_full && (mask & CEPH_SETATTR_SIZE) &&
+      req->head.args.setattr.size > old_size) {
+    dout(20) << __func__ << ": full, responding CEPHFS_ENOSPC to setattr with larger size" << dendl;
+    respond_to_request(mdr, -CEPHFS_ENOSPC);
+    return;
+  }
+
+  bool truncating_smaller = false;
+  if (mask & CEPH_SETATTR_SIZE) {
+    truncating_smaller = req->head.args.setattr.size < old_size;
+    if (truncating_smaller && pip->is_truncating()) {
+      // one truncate at a time: wait for the pending one to finish
+      dout(10) << " waiting for pending truncate from " << pip->truncate_from
+	       << " to " << pip->truncate_size << " to complete on " << *cur << dendl;
+      mds->locker->drop_locks(mdr.get());
+      mdr->drop_local_auth_pins();
+      cur->add_waiter(CInode::WAIT_TRUNC, new C_MDS_RetryRequest(mdcache, mdr));
+      return;
+    }
+  }
+
+  bool changed_ranges = false;
+
+  // project update
+  mdr->ls = mdlog->get_current_segment();
+  EUpdate *le = new EUpdate(mdlog, "setattr");
+  mdlog->start_entry(le);
+
+  auto pi = cur->project_inode(mdr);
+
+  if (mask & CEPH_SETATTR_UID)
+    pi.inode->uid = req->head.args.setattr.uid;
+  if (mask & CEPH_SETATTR_GID)
+    pi.inode->gid = req->head.args.setattr.gid;
+
+  // An explicit mode replaces the permission bits.  Otherwise an
+  // owner/group change (or an explicit KILL_SGUID) on an executable regular
+  // file clears the setuid/setgid bits.
+  if (mask & CEPH_SETATTR_MODE)
+    pi.inode->mode = (pi.inode->mode & ~07777) | (req->head.args.setattr.mode & 07777);
+  else if ((mask & (CEPH_SETATTR_UID|CEPH_SETATTR_GID|CEPH_SETATTR_KILL_SGUID)) &&
+	    S_ISREG(pi.inode->mode) &&
+            (pi.inode->mode & (S_IXUSR|S_IXGRP|S_IXOTH))) {
+    pi.inode->mode &= ~(S_ISUID|S_ISGID);
+  }
+
+  if (mask & CEPH_SETATTR_MTIME)
+    pi.inode->mtime = req->head.args.setattr.mtime;
+  if (mask & CEPH_SETATTR_ATIME)
+    pi.inode->atime = req->head.args.setattr.atime;
+  if (mask & CEPH_SETATTR_BTIME)
+    pi.inode->btime = req->head.args.setattr.btime;
+  if (mask & (CEPH_SETATTR_ATIME | CEPH_SETATTR_MTIME | CEPH_SETATTR_BTIME))
+    pi.inode->time_warp_seq++;   // maybe not a timewarp, but still a serialization point.
+  if (mask & CEPH_SETATTR_SIZE) {
+    if (truncating_smaller) {
+      pi.inode->truncate(old_size, req->head.args.setattr.size);
+      le->metablob.add_truncate_start(cur->ino());
+    } else {
+      pi.inode->size = req->head.args.setattr.size;
+      pi.inode->rstat.rbytes = pi.inode->size;
+    }
+    pi.inode->mtime = mdr->get_op_stamp();
+
+    // adjust client's max_size?
+    if (mds->locker->calc_new_client_ranges(cur, pi.inode->size)) {
+      dout(10) << " client_ranges "  << cur->get_previous_projected_inode()->client_ranges
+	       << " -> " << pi.inode->client_ranges << dendl;
+      changed_ranges = true;
+    }
+  }
+
+  pi.inode->version = cur->pre_dirty();
+  pi.inode->ctime = mdr->get_op_stamp();
+  if (mdr->get_op_stamp() > pi.inode->rstat.rctime)
+    pi.inode->rstat.rctime = mdr->get_op_stamp();
+  pi.inode->change_attr++;
+
+  // log + wait
+  le->metablob.add_client_req(req->get_reqid(), req->get_oldest_client_tid());
+  mdcache->predirty_journal_parents(mdr, &le->metablob, cur, 0, PREDIRTY_PRIMARY);
+  mdcache->journal_dirty_inode(mdr.get(), &le->metablob, cur);
+  
+  journal_and_reply(mdr, cur, 0, le, new C_MDS_inode_update_finish(this, mdr, cur,
+								   truncating_smaller, changed_ranges));
+
+  // flush immediately if there are readers/writers waiting
+  if (mdr->is_xlocked(&cur->filelock) &&
+      (cur->get_caps_wanted() & (CEPH_CAP_FILE_RD|CEPH_CAP_FILE_WR)))
+    mds->mdlog->flush();
+}
+
+/*
+ * Open-with-truncate: issue caps for the open and journal a truncate of
+ * the file to size 0 in one EUpdate ("open_truncate").
+ *
+ * Operates on mdr->in[0]; cmode is the file mode requested by the open.
+ * The log is flushed before returning so the truncate is not delayed.
+ * Takes responsibility for mdr (note carried over from the original
+ * comment).
+ */
+void Server::do_open_truncate(MDRequestRef& mdr, int cmode)
+{
+  CInode *in = mdr->in[0];
+  client_t client = mdr->get_client();
+  ceph_assert(in);
+
+  dout(10) << "do_open_truncate " << *in << dendl;
+
+  SnapRealm *realm = in->find_snaprealm();
+  Capability *cap = mds->locker->issue_new_caps(in, cmode, mdr, realm);
+
+  mdr->ls = mdlog->get_current_segment();
+  EUpdate *update = new EUpdate(mdlog, "open_truncate");
+  mdlog->start_entry(update);
+
+  // project the new inode state
+  auto proj = in->project_inode(mdr);
+  proj.inode->version = in->pre_dirty();
+  proj.inode->ctime = mdr->get_op_stamp();
+  proj.inode->mtime = proj.inode->ctime;
+  if (mdr->get_op_stamp() > proj.inode->rstat.rctime) {
+    proj.inode->rstat.rctime = mdr->get_op_stamp();
+  }
+  ++proj.inode->change_attr;
+
+  // the larger of our projected size and the size the client believes;
+  // nothing to truncate when both are zero
+  const uint64_t old_size =
+    std::max<uint64_t>(proj.inode->size, mdr->client_request->head.args.open.old_size);
+  const bool has_data = old_size > 0;
+  if (has_data) {
+    proj.inode->truncate(old_size, 0);
+    update->metablob.add_truncate_start(in->ino());
+  }
+
+  // a writer that got a cap starts with a fresh writeable range
+  const bool new_writer = cap && (cmode & CEPH_FILE_MODE_WR);
+  if (new_writer) {
+    auto &writeable = proj.inode->client_ranges[client];
+    writeable.range.first = 0;
+    writeable.range.last = proj.inode->get_layout_size_increment();
+    writeable.follows = realm->get_newest_seq();
+    in->mark_clientwriteable();
+    cap->mark_clientwriteable();
+  }
+
+  update->metablob.add_client_req(mdr->reqid, mdr->client_request->get_oldest_client_tid());
+
+  mdcache->predirty_journal_parents(mdr, &update->metablob, in, 0, PREDIRTY_PRIMARY);
+  mdcache->journal_dirty_inode(mdr.get(), &update->metablob, in);
+
+  // make sure ino gets into the journal
+  update->metablob.add_opened_ino(in->ino());
+
+  mdr->o_trunc = true;
+
+  // include the dentry in the reply only if the client asked for it
+  CDentry *dn = nullptr;
+  if (mdr->client_request->get_dentry_wanted()) {
+    ceph_assert(mdr->dn[0].size());
+    dn = mdr->dn[0].back();
+  }
+
+  journal_and_reply(mdr, in, dn, update,
+                    new C_MDS_inode_update_finish(this, mdr, in, has_data, new_writer));
+
+  // Although the `open` part can give an early reply, the truncation won't
+  // happen until our EUpdate is persistent, to give the client a prompt
+  // response we must also flush that event.
+  mdlog->flush();
+}
+
+
+/*
+ * Handle a client request to change the layout of a regular file.
+ *
+ * Only allowed on a head (non-snapshot) regular file that is empty and has
+ * not been truncated before (size == 0 and truncate_seq <= 1).  Non-zero
+ * fields of the requested layout override the file's current layout; the
+ * result must be valid and refer to a data pool of this file system.
+ * Needs MAY_WRITE, plus MAY_SET_VXATTR if the layout actually changes.
+ * The change is journaled as an EUpdate ("setlayout").
+ */
+void Server::handle_client_setlayout(MDRequestRef& mdr)
+{
+  const cref_t<MClientRequest> &req = mdr->client_request;
+  CInode *cur = rdlock_path_pin_ref(mdr, true);
+  if (!cur)
+    return;
+
+  if (mdr->snapid != CEPH_NOSNAP) {
+    respond_to_request(mdr, -CEPHFS_EROFS);
+    return;
+  }
+  if (!cur->is_file()) {
+    respond_to_request(mdr, -CEPHFS_EINVAL);
+    return;
+  }
+
+  const bool has_data = cur->get_projected_inode()->size;
+  if (has_data || cur->get_projected_inode()->truncate_seq > 1) {
+    respond_to_request(mdr, -CEPHFS_ENOTEMPTY);
+    return;
+  }
+
+  // start from the current layout and overlay whatever the client set
+  const file_layout_t old_layout = cur->get_projected_inode()->layout;
+  file_layout_t new_layout = old_layout;
+
+  if (req->head.args.setlayout.layout.fl_object_size > 0) {
+    new_layout.object_size = req->head.args.setlayout.layout.fl_object_size;
+  }
+  if (req->head.args.setlayout.layout.fl_stripe_unit > 0) {
+    new_layout.stripe_unit = req->head.args.setlayout.layout.fl_stripe_unit;
+  }
+  if (req->head.args.setlayout.layout.fl_stripe_count > 0) {
+    new_layout.stripe_count = req->head.args.setlayout.layout.fl_stripe_count;
+  }
+  if (req->head.args.setlayout.layout.fl_pg_pool > 0) {
+    new_layout.pool_id = req->head.args.setlayout.layout.fl_pg_pool;
+
+    // make sure we have as new a map as the client
+    if (req->get_mdsmap_epoch() > mds->mdsmap->get_epoch()) {
+      mds->wait_for_mdsmap(req->get_mdsmap_epoch(), new C_MDS_RetryRequest(mdcache, mdr));
+      return;
+    }
+  }
+
+  // Don't permit layout modifications without 'p' caps
+  const int required_access =
+    (new_layout != old_layout) ? (MAY_WRITE | MAY_SET_VXATTR) : MAY_WRITE;
+
+  // validate the resulting layout
+  if (!new_layout.is_valid()) {
+    dout(10) << "bad layout" << dendl;
+    respond_to_request(mdr, -CEPHFS_EINVAL);
+    return;
+  }
+  if (!mds->mdsmap->is_data_pool(new_layout.pool_id)) {
+    dout(10) << " invalid data pool " << new_layout.pool_id << dendl;
+    respond_to_request(mdr, -CEPHFS_EINVAL);
+    return;
+  }
+
+  MutationImpl::LockOpVec lov;
+  lov.add_xlock(&cur->filelock);
+  if (!mds->locker->acquire_locks(mdr, lov))
+    return;
+
+  if (!check_access(mdr, cur, required_access))
+    return;
+
+  // project update
+  auto proj = cur->project_inode(mdr);
+  proj.inode->layout = new_layout;
+  // add the old pool to the inode
+  proj.inode->add_old_pool(old_layout.pool_id);
+  proj.inode->version = cur->pre_dirty();
+  proj.inode->ctime = mdr->get_op_stamp();
+  if (mdr->get_op_stamp() > proj.inode->rstat.rctime) {
+    proj.inode->rstat.rctime = mdr->get_op_stamp();
+  }
+  ++proj.inode->change_attr;
+
+  // log + wait
+  mdr->ls = mdlog->get_current_segment();
+  EUpdate *update = new EUpdate(mdlog, "setlayout");
+  mdlog->start_entry(update);
+  update->metablob.add_client_req(req->get_reqid(), req->get_oldest_client_tid());
+  mdcache->predirty_journal_parents(mdr, &update->metablob, cur, 0, PREDIRTY_PRIMARY);
+  mdcache->journal_dirty_inode(mdr.get(), &update->metablob, cur);
+
+  journal_and_reply(mdr, cur, 0, update, new C_MDS_inode_update_finish(this, mdr, cur));
+}
+
+/*
+ * Take the locks needed to change policy-type attributes of `in`.
+ *
+ * xlocks in->policylock and either xlocks (xlock_snaplock) or rdlocks
+ * in->snaplock.  If want_layout is set and the inode has its own layout,
+ * that layout is stored in mdr->dir_layout; otherwise the lookup is left
+ * to try_rdlock_snap_layout() on the parent directory inode, when there is
+ * a projected parent dentry.
+ *
+ * Returns true once everything is locked (MutationImpl::ALL_LOCKED is then
+ * set on the request, which makes later calls return true immediately) and
+ * false if any lock step did not succeed.
+ */
+bool Server::xlock_policylock(MDRequestRef& mdr, CInode *in, bool want_layout, bool xlock_snaplock)
+{
+  const bool already_locked = mdr->locking_state & MutationImpl::ALL_LOCKED;
+  if (already_locked) {
+    return true;
+  }
+
+  MutationImpl::LockOpVec lov;
+  lov.add_xlock(&in->policylock);
+  if (!xlock_snaplock) {
+    lov.add_rdlock(&in->snaplock);
+  } else {
+    lov.add_xlock(&in->snaplock);
+  }
+  if (!mds->locker->acquire_locks(mdr, lov)) {
+    return false;
+  }
+
+  // the inode's own layout wins; no need to search the ancestors then
+  if (want_layout) {
+    if (in->get_projected_inode()->has_layout()) {
+      mdr->dir_layout = in->get_projected_inode()->layout;
+      want_layout = false;
+    }
+  }
+
+  CDentry *parent_dn = in->get_projected_parent_dn();
+  if (parent_dn &&
+      !mds->locker->try_rdlock_snap_layout(parent_dn->get_dir()->get_inode(), mdr, 0, want_layout)) {
+    return false;
+  }
+
+  mdr->locking_state |= MutationImpl::ALL_LOCKED;
+  return true;
+}
+
+/*
+ * Look up `ino` in the cache and return it only if this MDS is
+ * authoritative for it.
+ *
+ * Returns nullptr after having dealt with the request itself: it replies
+ * -CEPHFS_ESTALE when the inode is not in cache or is being purged, and
+ * forwards the request to the inode's authority when we are not auth.
+ */
+CInode* Server::try_get_auth_inode(MDRequestRef& mdr, inodeno_t ino)
+{
+  CInode *in = mdcache->get_inode(ino);
+
+  const bool usable = in && !in->state_test(CInode::STATE_PURGING);
+  if (!usable) {
+    respond_to_request(mdr, -CEPHFS_ESTALE);
+    return nullptr;
+  }
+
+  if (in->is_auth()) {
+    return in;
+  }
+
+  // not ours: hand the request over to the authoritative MDS
+  mdcache->request_forward(mdr, in->authority().first);
+  return nullptr;
+}
+
+/*
+ * Handle a client request to set the default file layout of a directory.
+ *
+ * The starting layout is the directory's own layout if it has one, else
+ * the layout recorded in mdr->dir_layout by xlock_policylock() (if it is
+ * not a default-constructed file_layout_t), else the cache-wide default.
+ * Non-zero fields of the requested layout override it; the result must be
+ * valid and refer to a data pool of this file system.  Needs MAY_WRITE,
+ * plus MAY_SET_VXATTR if the layout actually changes.  The change is
+ * journaled as an EUpdate ("setlayout") with early reply disabled.
+ */
+void Server::handle_client_setdirlayout(MDRequestRef& mdr)
+{
+  const cref_t<MClientRequest> &req = mdr->client_request;
+
+  // can't use rdlock_path_pin_ref because we need to xlock snaplock/policylock
+  CInode *cur = try_get_auth_inode(mdr, req->get_filepath().get_ino());
+  if (!cur) {
+    return;
+  }
+
+  if (!cur->is_dir()) {
+    respond_to_request(mdr, -CEPHFS_ENOTDIR);
+    return;
+  }
+
+  if (!xlock_policylock(mdr, cur, true)) {
+    return;
+  }
+
+  // pick the layout we start from
+  const auto& old_pi = cur->get_projected_inode();
+  file_layout_t base_layout;
+  if (old_pi->has_layout()) {
+    base_layout = old_pi->layout;
+  } else if (mdr->dir_layout != file_layout_t()) {
+    base_layout = mdr->dir_layout;
+  } else {
+    base_layout = mdcache->default_file_layout;
+  }
+  const file_layout_t old_layout = base_layout;
+
+  // overlay whatever the client set
+  file_layout_t new_layout = old_layout;
+  if (req->head.args.setlayout.layout.fl_object_size > 0) {
+    new_layout.object_size = req->head.args.setlayout.layout.fl_object_size;
+  }
+  if (req->head.args.setlayout.layout.fl_stripe_unit > 0) {
+    new_layout.stripe_unit = req->head.args.setlayout.layout.fl_stripe_unit;
+  }
+  if (req->head.args.setlayout.layout.fl_stripe_count > 0) {
+    new_layout.stripe_count = req->head.args.setlayout.layout.fl_stripe_count;
+  }
+  if (req->head.args.setlayout.layout.fl_pg_pool > 0) {
+    new_layout.pool_id = req->head.args.setlayout.layout.fl_pg_pool;
+    // make sure we have as new a map as the client
+    if (req->get_mdsmap_epoch() > mds->mdsmap->get_epoch()) {
+      mds->wait_for_mdsmap(req->get_mdsmap_epoch(), new C_MDS_RetryRequest(mdcache, mdr));
+      return;
+    }
+  }
+
+  // Level of access required to complete
+  const int required_access =
+    (new_layout != old_layout) ? (MAY_WRITE | MAY_SET_VXATTR) : MAY_WRITE;
+
+  // validate the resulting layout
+  if (!new_layout.is_valid()) {
+    dout(10) << "bad layout" << dendl;
+    respond_to_request(mdr, -CEPHFS_EINVAL);
+    return;
+  }
+  if (!mds->mdsmap->is_data_pool(new_layout.pool_id)) {
+    dout(10) << " invalid data pool " << new_layout.pool_id << dendl;
+    respond_to_request(mdr, -CEPHFS_EINVAL);
+    return;
+  }
+
+  if (!check_access(mdr, cur, required_access)) {
+    return;
+  }
+
+  // project update
+  auto proj = cur->project_inode(mdr);
+  proj.inode->layout = new_layout;
+  proj.inode->version = cur->pre_dirty();
+
+  // log + wait
+  mdr->ls = mdlog->get_current_segment();
+  EUpdate *update = new EUpdate(mdlog, "setlayout");
+  mdlog->start_entry(update);
+  update->metablob.add_client_req(req->get_reqid(), req->get_oldest_client_tid());
+  mdcache->predirty_journal_parents(mdr, &update->metablob, cur, 0, PREDIRTY_PRIMARY);
+  mdcache->journal_dirty_inode(mdr.get(), &update->metablob, cur);
+
+  // reply only once the update is journaled
+  mdr->no_early_reply = true;
+  journal_and_reply(mdr, cur, 0, update, new C_MDS_inode_update_finish(this, mdr, cur));
+}
+
+// XATTRS
+/*
+ * Parse the JSON form of a layout vxattr ("layout.json") into *layout.
+ *
+ * name    vxattr name with the "ceph.{file,dir}." prefix already removed
+ *         (presumably -- the caller is not visible here); only
+ *         "layout.json" is accepted
+ * value   JSON object with mandatory "object_size", "stripe_unit" and
+ *         "stripe_count", and optional "pool_namespace", "pool_id" and
+ *         "pool_name"
+ * osdmap  used to resolve / verify the pool
+ * layout  updated in place; fields decoded before a failure keep their new
+ *         values
+ *
+ * A non-empty "pool_name" takes precedence over "pool_id".
+ *
+ * Returns 0 on success, -CEPHFS_ENODATA if `name` is not "layout.json",
+ * and -CEPHFS_EINVAL for malformed JSON, a missing mandatory field or an
+ * unknown pool.
+ */
+int Server::parse_layout_vxattr_json(
+  string name, string value, const OSDMap& osdmap, file_layout_t *layout)
+{
+  // Resolve the pool from its name (preferred) or id; returns the pool id
+  // or a negative error code.
+  auto parse_pool = [&](const std::string& pool_name, int64_t pool_id) -> int64_t {
+    if (pool_name != "") {
+      int64_t _pool_id = osdmap.lookup_pg_pool_name(pool_name);
+      if (_pool_id < 0) {
+	dout(10) << __func__ << ": unknown pool name:" << pool_name << dendl;
+	return -CEPHFS_EINVAL;
+      }
+      return _pool_id;
+    } else if (pool_id >= 0) {
+      // bind by reference: copying the whole pool map for one lookup is
+      // wasteful
+      const auto& pools = osdmap.get_pools();
+      if (pools.find(pool_id) == pools.end()) {
+	dout(10) << __func__ << ": unknown pool id:" << pool_id << dendl;
+	return -CEPHFS_EINVAL;
+      }
+      return pool_id;
+    } else {
+      return -CEPHFS_EINVAL;
+    }
+  };
+
+  try {
+    if (name == "layout.json") {
+      JSONParser json_parser;
+      if (json_parser.parse(value.c_str(), value.length()) and json_parser.is_object()) {
+	// tracks the field being decoded so the error message can name it
+	std::string field;
+	try {
+	  field = "object_size";
+	  JSONDecoder::decode_json("object_size", layout->object_size, &json_parser, true);
+
+	  field = "stripe_unit";
+	  JSONDecoder::decode_json("stripe_unit", layout->stripe_unit, &json_parser, true);
+
+	  field = "stripe_count";
+	  JSONDecoder::decode_json("stripe_count", layout->stripe_count, &json_parser, true);
+
+	  field = "pool_namespace";
+	  JSONDecoder::decode_json("pool_namespace", layout->pool_ns, &json_parser, false);
+
+	  field = "pool_id";
+	  int64_t pool_id = 0;
+	  JSONDecoder::decode_json("pool_id", pool_id, &json_parser, false);
+
+	  field = "pool_name";
+	  std::string pool_name;
+	  JSONDecoder::decode_json("pool_name", pool_name, &json_parser, false);
+
+	  pool_id = parse_pool(pool_name, pool_id);
+	  if (pool_id < 0) {
+	    return (int)pool_id;
+	  }
+	  layout->pool_id = pool_id;
+	} catch (JSONDecoder::err&) {
+	  dout(10) << __func__ << ": json is missing a mandatory field named "
+		   << field << dendl;
+	  return -CEPHFS_EINVAL;
+	}
+      } else {
+	dout(10) << __func__ << ": bad json" << dendl;
+	return -CEPHFS_EINVAL;
+      }
+    } else {
+      dout(10) << __func__ << ": unknown layout vxattr " << name << dendl;
+      return -CEPHFS_ENODATA; // no such attribute
+    }
+  } catch (boost::bad_lexical_cast const&) {
+    dout(10) << __func__ << ": bad vxattr value:" << value
+	     << ", unable to parse for xattr:" << name << dendl;
+    return -CEPHFS_EINVAL;
+  }
+  return 0;
+}
+
+// Parse an old-style (non-JSON) layout vxattr into *layout.
+//
+// `name` is either the composite "layout", whose value is a key/value list
+// handled by the keys_and_values grammar and re-dispatched pair by pair as
+// "layout.<key>", or a single "layout.<field>" attribute.
+//
+// Returns 0 on success, -CEPHFS_ENOENT when "layout.pool" names a pool the
+// osdmap does not know, -CEPHFS_ENODATA for an unrecognised attribute name
+// and -CEPHFS_EINVAL for any other parse failure.
+int Server::parse_layout_vxattr_string(
+  string name, string value, const OSDMap& osdmap, file_layout_t *layout)
+{
+  try {
+    if (name == "layout") {
+      string::iterator cursor = value.begin();
+      string::iterator finish = value.end();
+      keys_and_values<string::iterator> grammar;
+      std::map<string, string> fields;
+      if (!qi::parse(cursor, finish, grammar, fields)) {
+        return -CEPHFS_EINVAL;
+      }
+      string left(cursor, finish);
+      dout(10) << __func__ << ": parsed " << fields << " left '" << left << "'" << dendl;
+      // anything the grammar did not consume makes the whole value invalid
+      if (cursor != finish) {
+        return -CEPHFS_EINVAL;
+      }
+      // Each pair is only parsed here; this function never checks the
+      // resulting layout as a whole, so intermediate states are not rejected.
+      for (const auto& [key, val] : fields) {
+        const int r = parse_layout_vxattr_string(string("layout.") + key, val,
+                                                 osdmap, layout);
+        if (r < 0) {
+          return r;
+        }
+      }
+      return 0;
+    }
+
+    if (name == "layout.object_size") {
+      layout->object_size = boost::lexical_cast<unsigned>(value);
+      return 0;
+    }
+
+    if (name == "layout.stripe_unit") {
+      layout->stripe_unit = boost::lexical_cast<unsigned>(value);
+      return 0;
+    }
+
+    if (name == "layout.stripe_count") {
+      layout->stripe_count = boost::lexical_cast<unsigned>(value);
+      return 0;
+    }
+
+    if (name == "layout.pool") {
+      // "layout.pool" accepts either a numeric id or a pool name
+      try {
+        layout->pool_id = boost::lexical_cast<unsigned>(value);
+      } catch (boost::bad_lexical_cast const&) {
+        const int64_t found = osdmap.lookup_pg_pool_name(value);
+        if (found < 0) {
+          dout(10) << __func__ << ": unknown pool " << value << dendl;
+          return -CEPHFS_ENOENT;
+        }
+        layout->pool_id = found;
+      }
+      return 0;
+    }
+
+    if (name == "layout.pool_id") {
+      layout->pool_id = boost::lexical_cast<int64_t>(value);
+      return 0;
+    }
+
+    if (name == "layout.pool_name") {
+      // the lookup result is stored even when it is negative
+      layout->pool_id = osdmap.lookup_pg_pool_name(value);
+      if (layout->pool_id < 0) {
+        dout(10) << __func__ << ": unknown pool " << value << dendl;
+        return -CEPHFS_EINVAL;
+      }
+      return 0;
+    }
+
+    if (name == "layout.pool_namespace") {
+      layout->pool_ns = value;
+      return 0;
+    }
+
+    dout(10) << __func__ << ": unknown layout vxattr " << name << dendl;
+    return -CEPHFS_ENODATA; // no such attribute
+  } catch (boost::bad_lexical_cast const&) {
+    dout(10) << __func__ << ": bad vxattr value, unable to parse int for "
+             << name << dendl;
+    return -CEPHFS_EINVAL;
+  }
+}
+
+// Parse a layout vxattr (JSON or old-style string form) into *layout and
+// sanity-check the result.
+//
+// "layout.json" goes to parse_layout_vxattr_json(); every other name goes to
+// parse_layout_vxattr_string().  A parser error is returned unchanged.
+// Otherwise -CEPHFS_EINVAL is returned when `validate` is set and the layout
+// is not valid, or when the pool is not one of the mdsmap's data pools.
+int Server::parse_layout_vxattr(string name, string value, const OSDMap& osdmap,
+                                file_layout_t *layout, bool validate)
+{
+  dout(20) << __func__ << ": name:" << name << " value:'" << value << "'" << dendl;
+
+  const int r = (name == "layout.json")
+    ? parse_layout_vxattr_json(name, value, osdmap, layout)
+    : parse_layout_vxattr_string(name, value, osdmap, layout);
+  if (r < 0) {
+    return r;
+  }
+
+  const bool layout_rejected = validate && !layout->is_valid();
+  if (layout_rejected) {
+    dout(10) << __func__ << ": bad layout" << dendl;
+    return -CEPHFS_EINVAL;
+  }
+
+  // the data-pool check is applied regardless of `validate`
+  const bool known_data_pool = mds->mdsmap->is_data_pool(layout->pool_id);
+  if (!known_data_pool) {
+    dout(10) << __func__ << ": invalid data pool " << layout->pool_id << dendl;
+    return -CEPHFS_EINVAL;
+  }
+  return 0;
+}
+
+// Parse a quota vxattr into *quota.
+//
+// `name` is "quota" (value is a key/value list, each pair re-dispatched as
+// "quota.<key>"; an empty value leaves the quota untouched), or one of
+// "quota.max_bytes" / "quota.max_files" with a non-negative integer value.
+// Returns 0 on success and -CEPHFS_EINVAL on any parse error, unknown name,
+// negative limit or when the resulting quota fails is_valid().
+int Server::parse_quota_vxattr(string name, string value, quota_info_t *quota)
+{
+  dout(20) << "parse_quota_vxattr name " << name << " value '" << value << "'" << dendl;
+  try {
+    if (name == "quota") {
+      string::iterator cursor = value.begin();
+      string::iterator finish = value.end();
+      if (cursor == finish) {
+        // keep quota unchanged. (for create_quota_realm())
+        return 0;
+      }
+      keys_and_values<string::iterator> grammar;
+      std::map<string, string> fields;
+      if (!qi::parse(cursor, finish, grammar, fields)) {
+        return -CEPHFS_EINVAL;
+      }
+      string left(cursor, finish);
+      dout(10) << " parsed " << fields << " left '" << left << "'" << dendl;
+      // unparsed trailing input is an error
+      if (cursor != finish) {
+        return -CEPHFS_EINVAL;
+      }
+      for (const auto& [key, val] : fields) {
+        const int r = parse_quota_vxattr(string("quota.") + key, val, quota);
+        if (r < 0) {
+          return r;
+        }
+      }
+    } else if (name == "quota.max_bytes" || name == "quota.max_files") {
+      const int64_t limit = boost::lexical_cast<int64_t>(value);
+      if (limit < 0) {
+        return -CEPHFS_EINVAL;
+      }
+      if (name == "quota.max_bytes") {
+        quota->max_bytes = limit;
+      } else {
+        quota->max_files = limit;
+      }
+    } else {
+      dout(10) << " unknown quota vxattr " << name << dendl;
+      return -CEPHFS_EINVAL;
+    }
+  } catch (boost::bad_lexical_cast const&) {
+    dout(10) << "bad vxattr value, unable to parse int for " << name << dendl;
+    return -CEPHFS_EINVAL;
+  }
+
+  // reached by every successful branch except the empty "quota" value
+  if (quota->is_valid()) {
+    return 0;
+  }
+  dout(10) << "bad quota" << dendl;
+  return -CEPHFS_EINVAL;
+}
+
+// Send a SETXATTR request for "ceph.quota" on `in` to the first rank of the
+// inode's authority pair.  No data payload is attached, so the value is
+// empty; parse_quota_vxattr() treats an empty "quota" value as "leave the
+// quota unchanged".
+void Server::create_quota_realm(CInode *in)
+{
+  dout(10) << __func__ << " " << *in << dendl;
+
+  auto setxattr_req = make_message<MClientRequest>(CEPH_MDS_OP_SETXATTR);
+  setxattr_req->set_filepath(filepath(in->ino()));
+  setxattr_req->set_string2("ceph.quota");
+  // deliberately no value set on the request
+  setxattr_req->set_tid(mds->issue_tid());
+
+  const auto auth_rank = in->authority().first;
+  mds->send_message_mds(setxattr_req, auth_rank);
+}
+
+/*
+ * Parse and check the layout vxattr sent by a client against the current
+ * osdmap, storing the result in *layout.
+ *
+ * Returns 0 when the layout parsed cleanly.  On any negative return this
+ * function has taken responsibility for mdr: it has either queued a retry
+ * of the request behind an osdmap wait (return value -CEPHFS_ENOENT) or
+ * already replied to the client with the returned error.
+ */
+int Server::check_layout_vxattr(MDRequestRef& mdr,
+                                string name,
+                                string value,
+                                file_layout_t *layout)
+{
+  const cref_t<MClientRequest> &req = mdr->client_request;
+  epoch_t epoch = 0;
+  int r = 0;
+
+  // parse under the osdmap and remember which epoch was consulted
+  mds->objecter->with_osdmap([&](const OSDMap& osdmap) {
+      r = parse_layout_vxattr(name, value, osdmap, layout);
+      epoch = osdmap.get_epoch();
+    });
+
+  if (r >= 0) {
+    // all is well
+    return 0;
+  }
+
+  if (r == -CEPHFS_ENOENT) {
+    // The named pool is unknown to our map; make sure our map is at least
+    // as new as the client's before giving up.
+    const epoch_t req_epoch = req->get_osdmap_epoch();
+
+    if (req_epoch > epoch) {
+      // our map is older: retry the request once that epoch has arrived
+      auto fin = new C_IO_Wrapper(mds, new C_MDS_RetryRequest(mdcache, mdr));
+
+      mds->objecter->wait_for_map(req_epoch, lambdafy(fin));
+      return r;
+    }
+
+    if (req_epoch == 0 && !mdr->waited_for_osdmap) {
+      // Compatibility path for clients that send no osdmap epoch: fetch
+      // the latest map once (guarded by waited_for_osdmap) and retry.
+      mdr->waited_for_osdmap = true;
+      mds->objecter->wait_for_latest_osdmap(std::ref(*new C_IO_Wrapper(
+        mds, new C_MDS_RetryRequest(mdcache, mdr))));
+      return r;
+    }
+
+    // nothing left to wait for: report the unknown pool as invalid input
+    r = -CEPHFS_EINVAL;
+  }
+
+  respond_to_request(mdr, r);
+  return r;
+}
+
+// Apply a setxattr request that targets one of the ceph.* virtual xattrs
+// (ceph.dir.layout*, ceph.file.layout*, ceph.quota*, ceph.dir.subvolume,
+// ceph.dir.pin, ceph.dir.pin.random, ceph.dir.pin.distributed) to `cur`.
+//
+// Each branch validates the value, takes the locks it needs and projects
+// the inode; the shared tail then bumps change_attr/ctime/rctime and
+// journals the update.  Validation failures are answered with
+// respond_to_request().  When a lock helper or check_layout_vxattr()
+// reports failure the function simply returns; those helpers are expected
+// to have taken over the request.
+void Server::handle_set_vxattr(MDRequestRef& mdr, CInode *cur)
+{
+  const cref_t<MClientRequest> &req = mdr->client_request;
+  string name(req->get_path2());
+  bufferlist bl = req->get_data();
+  string value (bl.c_str(), bl.length());
+  dout(10) << "handle_set_vxattr " << name
+           << " val " << value.length()
+           << " bytes on " << *cur
+           << dendl;
+
+  // projected inode selected by whichever branch below handles `name`
+  CInode::mempool_inode *pip = nullptr;
+  string rest;
+
+  if (!check_access(mdr, cur, MAY_SET_VXATTR)) {
+    return;
+  }
+
+  bool adjust_realm = false;
+  if (name.compare(0, 15, "ceph.dir.layout") == 0) {
+    if (!cur->is_dir()) {
+      respond_to_request(mdr, -CEPHFS_EINVAL);
+      return;
+    }
+
+    if (!xlock_policylock(mdr, cur, true))
+      return;
+
+    // starting layout: the inode's own, else the request's dir_layout,
+    // else the cache-wide default
+    file_layout_t layout;
+    if (cur->get_projected_inode()->has_layout())
+      layout = cur->get_projected_inode()->layout;
+    else if (mdr->dir_layout != file_layout_t())
+      layout = mdr->dir_layout;
+    else
+      layout = mdcache->default_file_layout;
+
+    // strip the "ceph.dir." prefix; the parsers expect "layout[.field]"
+    rest = name.substr(name.find("layout"));
+    if (check_layout_vxattr(mdr, rest, value, &layout) < 0)
+      return;
+
+    auto pi = cur->project_inode(mdr);
+    pi.inode->layout = layout;
+    mdr->no_early_reply = true;
+    pip = pi.inode.get();
+  } else if (name.compare(0, 16, "ceph.file.layout") == 0) {
+    if (!cur->is_file()) {
+      respond_to_request(mdr, -CEPHFS_EINVAL);
+      return;
+    }
+    // refuse once the file has a size or has been truncated more than once
+    if (cur->get_projected_inode()->size ||
+        cur->get_projected_inode()->truncate_seq > 1) {
+      respond_to_request(mdr, -CEPHFS_ENOTEMPTY);
+      return;
+    }
+    file_layout_t layout = cur->get_projected_inode()->layout;
+    rest = name.substr(name.find("layout"));
+    if (check_layout_vxattr(mdr, rest, value, &layout) < 0)
+      return;
+
+    MutationImpl::LockOpVec lov;
+    lov.add_xlock(&cur->filelock);
+    if (!mds->locker->acquire_locks(mdr, lov))
+      return;
+
+    auto pi = cur->project_inode(mdr);
+    // record the previous pool before overwriting the layout
+    int64_t old_pool = pi.inode->layout.pool_id;
+    pi.inode->add_old_pool(old_pool);
+    pi.inode->layout = layout;
+    pip = pi.inode.get();
+  } else if (name.compare(0, 10, "ceph.quota") == 0) {
+    if (!cur->is_dir()) {
+      respond_to_request(mdr, -CEPHFS_EINVAL);
+      return;
+    }
+
+    quota_info_t quota = cur->get_projected_inode()->quota;
+
+    rest = name.substr(name.find("quota"));
+    int r = parse_quota_vxattr(rest, value, &quota);
+    if (r < 0) {
+      respond_to_request(mdr, r);
+      return;
+    }
+
+    // enabling a quota on an inode without a projected srnode also
+    // requires projecting a snap node
+    if (quota.is_enable() && !cur->get_projected_srnode())
+      adjust_realm = true;
+
+    if (!xlock_policylock(mdr, cur, false, adjust_realm))
+      return;
+
+    // nothing to journal when the quota is unchanged
+    if (cur->get_projected_inode()->quota == quota) {
+      respond_to_request(mdr, 0);
+      return;
+    }
+
+    auto pi = cur->project_inode(mdr, false, adjust_realm);
+    pi.inode->quota = quota;
+
+    if (adjust_realm)
+      pi.snapnode->created = pi.snapnode->seq = cur->find_snaprealm()->get_newest_seq();
+
+    mdr->no_early_reply = true;
+    pip = pi.inode.get();
+
+    client_t exclude_ct = mdr->get_client();
+    mdcache->broadcast_quota_to_client(cur, exclude_ct, true);
+  } else if (name == "ceph.dir.subvolume"sv) {
+    if (!cur->is_dir()) {
+      respond_to_request(mdr, -CEPHFS_EINVAL);
+      return;
+    }
+
+    bool val;
+    try {
+      val = boost::lexical_cast<bool>(value);
+    } catch (boost::bad_lexical_cast const&) {
+      dout(10) << "bad vxattr value, unable to parse bool for " << name << dendl;
+      respond_to_request(mdr, -CEPHFS_EINVAL);
+      return;
+    }
+
+    /* Verify it's not already a subvolume with lighter weight
+     * rdlock.
+     */
+    if (!mdr->more()->rdonly_checks) {
+      if (!(mdr->locking_state & MutationImpl::ALL_LOCKED)) {
+        MutationImpl::LockOpVec lov;
+        lov.add_rdlock(&cur->snaplock);
+        if (!mds->locker->acquire_locks(mdr, lov))
+          return;
+        mdr->locking_state |= MutationImpl::ALL_LOCKED;
+      }
+      const auto srnode = cur->get_projected_srnode();
+      if (val == (srnode && srnode->is_subvolume())) {
+        dout(20) << "already marked subvolume" << dendl;
+        respond_to_request(mdr, 0);
+        return;
+      }
+      mdr->more()->rdonly_checks = true;
+    }
+
+    if ((mdr->locking_state & MutationImpl::ALL_LOCKED) && !mdr->is_xlocked(&cur->snaplock)) {
+      /* drop the rdlock and acquire xlocks */
+      dout(20) << "dropping rdlocks" << dendl;
+      mds->locker->drop_locks(mdr.get());
+      if (!xlock_policylock(mdr, cur, false, true))
+        return;
+    }
+
+    /* repeat rdonly checks in case changed between rdlock -> xlock */
+    SnapRealm *realm = cur->find_snaprealm();
+    if (val) {
+      inodeno_t subvol_ino = realm->get_subvolume_ino();
+      // can't create subvolume inside another subvolume
+      if (subvol_ino && subvol_ino != cur->ino()) {
+        respond_to_request(mdr, -CEPHFS_EINVAL);
+        return;
+      }
+    }
+
+    const auto srnode = cur->get_projected_srnode();
+    if (val == (srnode && srnode->is_subvolume())) {
+      respond_to_request(mdr, 0);
+      return;
+    }
+
+    auto pi = cur->project_inode(mdr, false, true);
+    // no projected srnode existed: stamp the new one with the realm's
+    // newest seq
+    if (!srnode)
+      pi.snapnode->created = pi.snapnode->seq = realm->get_newest_seq();
+    if (val)
+      pi.snapnode->mark_subvolume();
+    else
+      pi.snapnode->clear_subvolume();
+
+    mdr->no_early_reply = true;
+    pip = pi.inode.get();
+    adjust_realm = true;
+  } else if (name == "ceph.dir.pin"sv) {
+    if (!cur->is_dir() || cur->is_root()) {
+      respond_to_request(mdr, -CEPHFS_EINVAL);
+      return;
+    }
+
+    mds_rank_t rank;
+    try {
+      rank = boost::lexical_cast<mds_rank_t>(value);
+      // any negative value is normalised to MDS_RANK_NONE
+      if (rank < 0) rank = MDS_RANK_NONE;
+    } catch (boost::bad_lexical_cast const&) {
+      dout(10) << "bad vxattr value, unable to parse int for " << name << dendl;
+      respond_to_request(mdr, -CEPHFS_EINVAL);
+      return;
+    }
+
+    if (!xlock_policylock(mdr, cur))
+      return;
+
+    auto pi = cur->project_inode(mdr);
+    cur->set_export_pin(rank);
+    pip = pi.inode.get();
+  } else if (name == "ceph.dir.pin.random"sv) {
+    if (!cur->is_dir() || cur->is_root()) {
+      respond_to_request(mdr, -CEPHFS_EINVAL);
+      return;
+    }
+
+    double val;
+    try {
+      val = boost::lexical_cast<double>(value);
+    } catch (boost::bad_lexical_cast const&) {
+      dout(10) << "bad vxattr value, unable to parse float for " << name << dendl;
+      respond_to_request(mdr, -CEPHFS_EINVAL);
+      return;
+    }
+
+    // outside [0, 1] is a domain error; above the configured maximum is
+    // merely invalid
+    if (val < 0.0 || 1.0 < val) {
+      respond_to_request(mdr, -CEPHFS_EDOM);
+      return;
+    } else if (mdcache->export_ephemeral_random_max < val) {
+      respond_to_request(mdr, -CEPHFS_EINVAL);
+      return;
+    }
+
+    if (!xlock_policylock(mdr, cur))
+      return;
+
+    auto pi = cur->project_inode(mdr);
+    cur->setxattr_ephemeral_rand(val);
+    pip = pi.inode.get();
+  } else if (name == "ceph.dir.pin.distributed"sv) {
+    if (!cur->is_dir() || cur->is_root()) {
+      respond_to_request(mdr, -CEPHFS_EINVAL);
+      return;
+    }
+
+    bool val;
+    try {
+      val = boost::lexical_cast<bool>(value);
+    } catch (boost::bad_lexical_cast const&) {
+      dout(10) << "bad vxattr value, unable to parse bool for " << name << dendl;
+      respond_to_request(mdr, -CEPHFS_EINVAL);
+      return;
+    }
+
+    if (!xlock_policylock(mdr, cur))
+      return;
+
+    auto pi = cur->project_inode(mdr);
+    cur->setxattr_ephemeral_dist(val);
+    pip = pi.inode.get();
+  } else {
+    dout(10) << " unknown vxattr " << name << dendl;
+    respond_to_request(mdr, -CEPHFS_EINVAL);
+    return;
+  }
+
+  // common tail: every branch that falls through has set pip
+  pip->change_attr++;
+  pip->ctime = mdr->get_op_stamp();
+  if (mdr->get_op_stamp() > pip->rstat.rctime)
+    pip->rstat.rctime = mdr->get_op_stamp();
+  pip->version = cur->pre_dirty();
+  if (cur->is_file())
+    pip->update_backtrace();
+
+  // log + wait
+  mdr->ls = mdlog->get_current_segment();
+  EUpdate *le = new EUpdate(mdlog, "set vxattr layout");
+  mdlog->start_entry(le);
+  le->metablob.add_client_req(req->get_reqid(), req->get_oldest_client_tid());
+  mdcache->predirty_journal_parents(mdr, &le->metablob, cur, 0, PREDIRTY_PRIMARY);
+  mdcache->journal_dirty_inode(mdr.get(), &le->metablob, cur);
+
+  journal_and_reply(mdr, cur, 0, le, new C_MDS_inode_update_finish(this, mdr, cur,
+                                                                   false, false, adjust_realm));
+  return;
+}
+
+// Handle removexattr on a ceph.* virtual xattr.
+//
+// Two cases do real work: "ceph.dir.layout" clears the directory's layout
+// policy, and the two "*.layout.pool_namespace" names are forwarded to
+// handle_set_vxattr().  Every other name is answered with -CEPHFS_ENODATA.
+void Server::handle_remove_vxattr(MDRequestRef& mdr, CInode *cur)
+{
+  const cref_t<MClientRequest> &req = mdr->client_request;
+  string name(req->get_path2());
+
+  dout(10) << __func__ << " " << name << " on " << *cur << dendl;
+
+  const bool is_pool_ns = name == "ceph.dir.layout.pool_namespace"
+                       || name == "ceph.file.layout.pool_namespace";
+  if (is_pool_ns) {
+    // Namespace is the only layout field that has a meaningful
+    // null/none value (empty string, means default layout).  Removing it
+    // is equivalent to a setxattr with an empty string, so the empty
+    // payload of the rmxattr request is passed straight through.
+    handle_set_vxattr(mdr, cur);
+    return;
+  }
+
+  if (name != "ceph.dir.layout") {
+    respond_to_request(mdr, -CEPHFS_ENODATA);
+    return;
+  }
+
+  // --- "ceph.dir.layout": drop the layout policy of a directory ---
+  if (!cur->is_dir()) {
+    respond_to_request(mdr, -CEPHFS_ENODATA);
+    return;
+  }
+  if (cur->is_root()) {
+    dout(10) << "can't remove layout policy on the root directory" << dendl;
+    respond_to_request(mdr, -CEPHFS_EINVAL);
+    return;
+  }
+  if (!cur->get_projected_inode()->has_layout()) {
+    // nothing set on this inode, so there is nothing to remove
+    respond_to_request(mdr, -CEPHFS_ENODATA);
+    return;
+  }
+
+  MutationImpl::LockOpVec locks;
+  locks.add_xlock(&cur->policylock);
+  if (!mds->locker->acquire_locks(mdr, locks))
+    return;
+
+  auto projected = cur->project_inode(mdr);
+  projected.inode->clear_layout();
+  projected.inode->version = cur->pre_dirty();
+
+  // log + wait
+  mdr->ls = mdlog->get_current_segment();
+  EUpdate *entry = new EUpdate(mdlog, "remove dir layout vxattr");
+  mdlog->start_entry(entry);
+  entry->metablob.add_client_req(req->get_reqid(), req->get_oldest_client_tid());
+  mdcache->predirty_journal_parents(mdr, &entry->metablob, cur, 0, PREDIRTY_PRIMARY);
+  mdcache->journal_dirty_inode(mdr.get(), &entry->metablob, cur);
+
+  mdr->no_early_reply = true;
+  journal_and_reply(mdr, cur, 0, entry, new C_MDS_inode_update_finish(this, mdr, cur));
+}
+
+const Server::XattrHandler Server::xattr_handlers[] = {  // table scanned by get_xattr_or_default_handler()
+  {
+    xattr_name: Server::DEFAULT_HANDLER,  // marks the fallback entry used when no other name matches
+    description: "default xattr handler",  // description is what the lookup logs at dout(20)
+    validate:  &Server::default_xattr_validate,
+    setxattr: &Server::default_setxattr_handler,
+    removexattr: &Server::default_removexattr_handler,
+  },
+  {
+    xattr_name: "ceph.mirror.info",  // its handlers store/remove separate cluster_id and fs_id xattrs
+    description: "mirror info xattr handler",
+    validate: &Server::mirror_info_xattr_validate,
+    setxattr: &Server::mirror_info_setxattr_handler,
+    removexattr: &Server::mirror_info_removexattr_handler
+  },
+};
+
+// Look up the handler registered for `xattr_name` in xattr_handlers.
+// Returns the entry whose name matches exactly, otherwise the entry named
+// Server::DEFAULT_HANDLER.  Asserts that the default entry is not seen
+// twice during the scan and that one exists when it is needed.
+const Server::XattrHandler* Server::get_xattr_or_default_handler(std::string_view xattr_name) {
+  const XattrHandler *fallback = nullptr;
+
+  for (const auto &entry : xattr_handlers) {
+    const bool is_default = (entry.xattr_name == Server::DEFAULT_HANDLER);
+    if (is_default) {
+      // remember the fallback, insisting it is unique
+      ceph_assert(fallback == nullptr);
+      fallback = &entry;
+    }
+
+    const bool is_match = (entry.xattr_name == xattr_name);
+    if (is_match) {
+      dout(20) << "handler=" << entry.description << dendl;
+      return &entry;
+    }
+  }
+
+  // no exact match anywhere in the table
+  ceph_assert(fallback != nullptr);
+  dout(20) << "handler=" << fallback->description << dendl;
+  return fallback;
+}
+
+// Check whether a set/remove of `xattr_name` is allowed given the current
+// xattr map (`xattrs` may be null, meaning no xattrs at all).
+//
+//  - SETXATTR: -CEPHFS_EEXIST if CEPH_XATTR_CREATE is set and the name
+//    already exists; -CEPHFS_ENODATA if CEPH_XATTR_REPLACE is set and it
+//    does not; otherwise 0.
+//  - RMXATTR: -CEPHFS_ENODATA if the name does not exist; otherwise 0.
+//  - any other op: -CEPHFS_EINVAL.
+int Server::xattr_validate(CInode *cur, const InodeStoreBase::xattr_map_const_ptr xattrs,
+                           const std::string &xattr_name, int op, int flags) {
+  // a null map is treated the same as "name not present"
+  const bool present =
+    xattrs && xattrs->count(mempool::mds_co::string(xattr_name)) > 0;
+
+  if (op == CEPH_MDS_OP_SETXATTR) {
+    const bool want_create = (flags & CEPH_XATTR_CREATE) != 0;
+    const bool want_replace = (flags & CEPH_XATTR_REPLACE) != 0;
+
+    if (want_create && present) {
+      dout(10) << "setxattr '" << xattr_name << "' XATTR_CREATE and CEPHFS_EEXIST on " << *cur << dendl;
+      return -CEPHFS_EEXIST;
+    }
+    if (want_replace && !present) {
+      dout(10) << "setxattr '" << xattr_name << "' XATTR_REPLACE and CEPHFS_ENODATA on " << *cur << dendl;
+      return -CEPHFS_ENODATA;
+    }
+    return 0;
+  }
+
+  if (op == CEPH_MDS_OP_RMXATTR) {
+    if (present) {
+      return 0;
+    }
+    dout(10) << "removexattr '" << xattr_name << "' and CEPHFS_ENODATA on " << *cur << dendl;
+    return -CEPHFS_ENODATA;
+  }
+
+  derr << ": unhandled validation for: " << xattr_name << dendl;
+  return -CEPHFS_EINVAL;
+}
+
+// Store `xattr_value` under `xattr_name` in the map, replacing any value
+// already there.  The value is copied into a freshly created buffer of
+// exactly its length (possibly zero).
+void Server::xattr_set(InodeStoreBase::xattr_map_ptr xattrs, const std::string &xattr_name,
+                       const bufferlist &xattr_value) {
+  const size_t nbytes = xattr_value.length();
+  bufferptr copy = buffer::create(nbytes);
+  if (nbytes > 0) {
+    xattr_value.begin().copy(nbytes, copy.c_str());
+  }
+
+  // insert the key if missing, then (over)write its value
+  (*xattrs)[mempool::mds_co::string(xattr_name)] = copy;
+}
+
+// Erase `xattr_name` from the map; a name that is not present is a no-op.
+void Server::xattr_rm(InodeStoreBase::xattr_map_ptr xattrs, const std::string &xattr_name) {
+  const mempool::mds_co::string key(xattr_name);
+  xattrs->erase(key);
+}
+
+// Default validate hook: applies the generic xattr_validate() rules to the
+// name, op and flags carried by `xattr_op`.
+int Server::default_xattr_validate(CInode *cur, const InodeStoreBase::xattr_map_const_ptr xattrs,
+                                   XattrOp *xattr_op) {
+  const int rc = xattr_validate(cur, xattrs, xattr_op->xattr_name,
+                                xattr_op->op, xattr_op->flags);
+  return rc;
+}
+
+// Default setxattr hook: stores the op's value under the op's name.
+// `cur` is not used by this handler.
+void Server::default_setxattr_handler(CInode *cur, InodeStoreBase::xattr_map_ptr xattrs,
+                                      const XattrOp &xattr_op) {
+  const auto &key = xattr_op.xattr_name;
+  const auto &payload = xattr_op.xattr_value;
+  xattr_set(xattrs, key, payload);
+}
+
+// Default removexattr hook: erases the op's name from the map.
+// `cur` is not used by this handler.
+void Server::default_removexattr_handler(CInode *cur, InodeStoreBase::xattr_map_ptr xattrs,
+                                         const XattrOp &xattr_op) {
+  const auto &key = xattr_op.xattr_name;
+  xattr_rm(xattrs, key);
+}
+
+// mirror info xattr handlers
+//
+// Accepted value format: "cluster_id=<uuid> fs_id=<digits>", where <uuid> is
+// lowercase hex in 8-4-4-4-12 grouping.  Capture group 1 is the cluster id,
+// group 2 the filesystem id.
+const std::string Server::MirrorXattrInfo::MIRROR_INFO_REGEX =
+  "^cluster_id=("
+  "[a-f0-9]{8}-[a-f0-9]{4}-[a-f0-9]{4}-[a-f0-9]{4}-[a-f0-9]{12}"
+  ")"
+  " fs_id=(\\d+)$";
+// names under which the two parsed components are stored as xattrs
+const std::string Server::MirrorXattrInfo::CLUSTER_ID = "ceph.mirror.info.cluster_id";
+const std::string Server::MirrorXattrInfo::FS_ID = "ceph.mirror.info.fs_id";
+// Split a mirror info value of the form described by MIRROR_INFO_REGEX into
+// its cluster id and fs id.  `name` is only used for logging.  Returns 0 and
+// fills both outputs on success, or -CEPHFS_EINVAL (outputs untouched) when
+// the value does not match.
+int Server::parse_mirror_info_xattr(const std::string &name, const std::string &value,
+                                    std::string &cluster_id, std::string &fs_id) {
+  dout(20) << "parsing name=" << name << ", value=" << value << dendl;
+
+  // compiled once on first use
+  static const std::regex regex(Server::MirrorXattrInfo::MIRROR_INFO_REGEX);
+  std::smatch groups;
+
+  // a successful match yields the whole match plus two capture groups
+  const bool matched = std::regex_search(value, groups, regex);
+  if (!matched || groups.size() != 3) {
+    derr << "mirror info parse error" << dendl;
+    return -CEPHFS_EINVAL;
+  }
+
+  cluster_id = groups.str(1);
+  fs_id = groups.str(2);
+  dout(20) << " parsed cluster_id=" << cluster_id << ", fs_id=" << fs_id << dendl;
+  return 0;
+}
+
+// Validate hook for "ceph.mirror.info".
+//
+// Only allowed on the root inode.  The two backing xattrs (CLUSTER_ID and
+// FS_ID) are each run through xattr_validate() and must agree; a
+// disagreement is reported as -CEPHFS_EINVAL.  For a set operation the value
+// is then parsed and the result is attached to xattr_op->xinfo for the
+// setxattr handler to consume.
+int Server::mirror_info_xattr_validate(CInode *cur, const InodeStoreBase::xattr_map_const_ptr xattrs,
+                                       XattrOp *xattr_op) {
+  if (!cur->is_root()) {
+    return -CEPHFS_EINVAL;
+  }
+
+  const int cluster_rc = xattr_validate(cur, xattrs, Server::MirrorXattrInfo::CLUSTER_ID,
+                                        xattr_op->op, xattr_op->flags);
+  const int fs_rc = xattr_validate(cur, xattrs, Server::MirrorXattrInfo::FS_ID,
+                                   xattr_op->op, xattr_op->flags);
+  if (cluster_rc != fs_rc) {
+    derr << "inconsistent mirror info state (" << cluster_rc << "," << fs_rc << ")" << dendl;
+    return -CEPHFS_EINVAL;
+  }
+
+  // both results are equal from here on
+  if (cluster_rc < 0) {
+    return cluster_rc;
+  }
+
+  // removal carries no value, so there is nothing to parse
+  if (xattr_op->op == CEPH_MDS_OP_RMXATTR) {
+    return 0;
+  }
+
+  std::string parsed_cluster;
+  std::string parsed_fs;
+  const int rc = parse_mirror_info_xattr(xattr_op->xattr_name,
+                                         xattr_op->xattr_value.to_str(),
+                                         parsed_cluster, parsed_fs);
+  if (rc < 0) {
+    return rc;
+  }
+
+  xattr_op->xinfo = std::make_unique<MirrorXattrInfo>(parsed_cluster, parsed_fs);
+  return 0;
+}
+
+// setxattr hook for "ceph.mirror.info": writes the cluster id and fs id
+// carried in xattr_op.xinfo as two separate xattrs (CLUSTER_ID and FS_ID).
+// xinfo must refer to a MirrorXattrInfo (mirror_info_xattr_validate() sets
+// it); the reference dynamic_cast throws std::bad_cast otherwise.
+// `cur` is not used by this handler.
+void Server::mirror_info_setxattr_handler(CInode *cur, InodeStoreBase::xattr_map_ptr xattrs,
+                                          const XattrOp &xattr_op) {
+  // Bind by reference: plain `auto` would deduce a value type and copy the
+  // whole MirrorXattrInfo (both strings) for no benefit.
+  const auto &mirror_info = dynamic_cast<const MirrorXattrInfo&>(*(xattr_op.xinfo));
+
+  bufferlist bl;
+  bl.append(mirror_info.cluster_id.c_str(), mirror_info.cluster_id.length());
+  xattr_set(xattrs, Server::MirrorXattrInfo::CLUSTER_ID, bl);
+
+  // reuse the bufferlist for the second value
+  bl.clear();
+  bl.append(mirror_info.fs_id.c_str(), mirror_info.fs_id.length());
+  xattr_set(xattrs, Server::MirrorXattrInfo::FS_ID, bl);
+}
+
+// removexattr hook for "ceph.mirror.info": erases both backing xattrs,
+// cluster id first, then fs id.  `cur` and `xattr_op` are not used.
+void Server::mirror_info_removexattr_handler(CInode *cur, InodeStoreBase::xattr_map_ptr xattrs,
+                                             const XattrOp &xattr_op) {
+  for (const std::string *key : {&Server::MirrorXattrInfo::CLUSTER_ID,
+                                 &Server::MirrorXattrInfo::FS_ID}) {
+    xattr_rm(xattrs, *key);
+  }
+}
+
+// Handle a client SETXATTR request.
+//
+// ceph.* virtual xattrs are routed to handle_set_vxattr().  For ordinary
+// xattrs the inode is resolved and xattrlock is xlocked, write access is
+// checked, the total key+value size is held under
+// mds_max_xattr_pairs_size, and the per-name handler (see xattr_handlers)
+// validates and applies the change to the projected xattr map before the
+// update is journaled.
+void Server::handle_client_setxattr(MDRequestRef& mdr)
+{
+  const cref_t<MClientRequest> &req = mdr->client_request;
+  string name(req->get_path2());
+
+  // is a ceph virtual xattr?
+  if (is_ceph_vxattr(name)) {
+    // can't use rdlock_path_pin_ref because we need to xlock snaplock/policylock
+    CInode *cur = try_get_auth_inode(mdr, req->get_filepath().get_ino());
+    if (!cur)
+      return;
+
+    handle_set_vxattr(mdr, cur);
+    return;
+  }
+
+  if (!is_allowed_ceph_xattr(name)) {
+    respond_to_request(mdr, -CEPHFS_EINVAL);
+    return;
+  }
+
+  CInode *cur = rdlock_path_pin_ref(mdr, true);
+  if (!cur)
+    return;
+
+  // xattrs cannot be changed on a snapshot
+  if (mdr->snapid != CEPH_NOSNAP) {
+    respond_to_request(mdr, -CEPHFS_EROFS);
+    return;
+  }
+
+  int flags = req->head.args.setxattr.flags;
+
+  MutationImpl::LockOpVec lov;
+  lov.add_xlock(&cur->xattrlock);
+  if (!mds->locker->acquire_locks(mdr, lov))
+    return;
+
+  if (!check_access(mdr, cur, MAY_WRITE))
+    return;
+
+  size_t len = req->get_data().length();
+  size_t inc = len + name.length();
+
+  auto handler = Server::get_xattr_or_default_handler(name);
+  const auto& pxattrs = cur->get_projected_xattrs();
+
+  // Sum the size of the existing key/value pairs.  With XATTR_REPLACE the
+  // pair being replaced is skipped since its old value goes away.
+  size_t cur_xattrs_size = 0;
+  if (pxattrs) {
+    for (const auto& p : *pxattrs) {
+      if ((flags & CEPH_XATTR_REPLACE) && name.compare(p.first) == 0) {
+        continue;
+      }
+      cur_xattrs_size += p.first.length() + p.second.length();
+    }
+  }
+
+  // Enforce the limit even when the inode has no xattr map yet; otherwise
+  // the very first xattr on an inode could exceed the limit unchecked.
+  if ((cur_xattrs_size + inc) > g_conf()->mds_max_xattr_pairs_size) {
+    dout(10) << "xattr kv pairs size too big. cur_xattrs_size "
+             << cur_xattrs_size << ", inc " << inc << dendl;
+    respond_to_request(mdr, -CEPHFS_ENOSPC);
+    return;
+  }
+
+  XattrOp xattr_op(CEPH_MDS_OP_SETXATTR, name, req->get_data(), flags);
+  int r = std::invoke(handler->validate, this, cur, pxattrs, &xattr_op);
+  if (r < 0) {
+    respond_to_request(mdr, r);
+    return;
+  }
+
+  dout(10) << "setxattr '" << name << "' len " << len << " on " << *cur << dendl;
+
+  // project update
+  auto pi = cur->project_inode(mdr, true);
+  pi.inode->version = cur->pre_dirty();
+  pi.inode->ctime = mdr->get_op_stamp();
+  if (mdr->get_op_stamp() > pi.inode->rstat.rctime)
+    pi.inode->rstat.rctime = mdr->get_op_stamp();
+  if (name == "encryption.ctx"sv)
+    pi.inode->fscrypt = true;
+  pi.inode->change_attr++;
+  pi.inode->xattr_version++;
+
+  // a setxattr carrying CEPH_XATTR_REMOVE is applied as a removal
+  if ((flags & CEPH_XATTR_REMOVE)) {
+    std::invoke(handler->removexattr, this, cur, pi.xattrs, xattr_op);
+  } else {
+    std::invoke(handler->setxattr, this, cur, pi.xattrs, xattr_op);
+  }
+
+  // log + wait
+  mdr->ls = mdlog->get_current_segment();
+  EUpdate *le = new EUpdate(mdlog, "setxattr");
+  mdlog->start_entry(le);
+  le->metablob.add_client_req(req->get_reqid(), req->get_oldest_client_tid());
+  mdcache->predirty_journal_parents(mdr, &le->metablob, cur, 0, PREDIRTY_PRIMARY);
+  mdcache->journal_dirty_inode(mdr.get(), &le->metablob, cur);
+
+  journal_and_reply(mdr, cur, 0, le, new C_MDS_inode_update_finish(this, mdr, cur));
+}
+
+// Handle a client RMXATTR request.
+//
+// ceph.* virtual xattrs are routed to handle_remove_vxattr().  For ordinary
+// xattrs the inode is resolved and xattrlock is xlocked, then the per-name
+// handler (see xattr_handlers) validates the removal and applies it to the
+// projected xattr map before the update is journaled.
+void Server::handle_client_removexattr(MDRequestRef& mdr)
+{
+  const cref_t<MClientRequest> &req = mdr->client_request;
+  std::string name(req->get_path2());
+
+  // is a ceph virtual xattr?
+  if (is_ceph_vxattr(name)) {
+    // can't use rdlock_path_pin_ref because we need to xlock snaplock/policylock
+    CInode *vx_inode = try_get_auth_inode(mdr, req->get_filepath().get_ino());
+    if (vx_inode) {
+      handle_remove_vxattr(mdr, vx_inode);
+    }
+    return;
+  }
+
+  if (!is_allowed_ceph_xattr(name)) {
+    respond_to_request(mdr, -CEPHFS_EINVAL);
+    return;
+  }
+
+  CInode* cur = rdlock_path_pin_ref(mdr, true);
+  if (!cur) {
+    return;
+  }
+
+  // xattrs cannot be changed on a snapshot
+  if (mdr->snapid != CEPH_NOSNAP) {
+    respond_to_request(mdr, -CEPHFS_EROFS);
+    return;
+  }
+
+  MutationImpl::LockOpVec locks;
+  locks.add_xlock(&cur->xattrlock);
+  if (!mds->locker->acquire_locks(mdr, locks)) {
+    return;
+  }
+
+  auto handler = Server::get_xattr_or_default_handler(name);
+  // removal carries no value and no flags
+  bufferlist no_value;
+  XattrOp xattr_op(CEPH_MDS_OP_RMXATTR, name, no_value, 0);
+
+  const auto& pxattrs = cur->get_projected_xattrs();
+  const int rc = std::invoke(handler->validate, this, cur, pxattrs, &xattr_op);
+  if (rc < 0) {
+    respond_to_request(mdr, rc);
+    return;
+  }
+
+  dout(10) << "removexattr '" << name << "' on " << *cur << dendl;
+
+  // project update
+  auto projected = cur->project_inode(mdr, true);
+  projected.inode->version = cur->pre_dirty();
+  projected.inode->ctime = mdr->get_op_stamp();
+  if (mdr->get_op_stamp() > projected.inode->rstat.rctime) {
+    projected.inode->rstat.rctime = mdr->get_op_stamp();
+  }
+  projected.inode->change_attr++;
+  projected.inode->xattr_version++;
+  std::invoke(handler->removexattr, this, cur, projected.xattrs, xattr_op);
+
+  // log + wait
+  mdr->ls = mdlog->get_current_segment();
+  EUpdate *entry = new EUpdate(mdlog, "removexattr");
+  mdlog->start_entry(entry);
+  entry->metablob.add_client_req(req->get_reqid(), req->get_oldest_client_tid());
+  mdcache->predirty_journal_parents(mdr, &entry->metablob, cur, 0, PREDIRTY_PRIMARY);
+  mdcache->journal_dirty_inode(mdr.get(), &entry->metablob, cur);
+
+  journal_and_reply(mdr, cur, 0, entry, new C_MDS_inode_update_finish(this, mdr, cur));
+}
+
+void Server::handle_client_getvxattr(MDRequestRef& mdr)
+{
+  const auto& req = mdr->client_request;
+  string xattr_name{req->get_path2()};
+
+  // is a ceph virtual xattr?
+  if (!is_ceph_vxattr(xattr_name)) {
+    respond_to_request(mdr, -CEPHFS_ENODATA);
+    return;
+  }
+
+  CInode *cur = rdlock_path_pin_ref(mdr, true, false);
+  if (!cur) {
+    return;
+  }
+
+  if (is_ceph_dir_vxattr(xattr_name)) {
+    if (!cur->is_dir()) {
+      respond_to_request(mdr, -CEPHFS_ENODATA);
+      return;
+    }
+  } else if (is_ceph_file_vxattr(xattr_name)) {
+    if (cur->is_dir()) {
+      respond_to_request(mdr, -CEPHFS_ENODATA);
+      return;
+    }
+  }
+
+  CachedStackStringStream css;
+  int r = 0;
+  ceph::bufferlist bl;
+  // handle these vxattrs
+  if ((xattr_name.substr(0, 15) == "ceph.dir.layout"sv) ||
+      (xattr_name.substr(0, 16) == "ceph.file.layout"sv)) {
+    std::string layout_field;
+
+    struct layout_xattr_info_t {
+      enum class InheritanceStatus : uint32_t {
+	DEFAULT = 0,
+	SET = 1,
+	INHERITED = 2
+      };
+
+      const file_layout_t     layout;
+      const InheritanceStatus status;
+
+      layout_xattr_info_t(const file_layout_t& l, InheritanceStatus inh)
+        : layout(l), status(inh) { }
+
+      static std::string status_to_string(InheritanceStatus status) {
+	switch (status) {
+	  case InheritanceStatus::DEFAULT: return "default"s;
+	  case InheritanceStatus::SET: return "set"s;
+	  case InheritanceStatus::INHERITED: return "inherited"s;
+	  default: return "unknown"s;
+	}
+      }
+    };
+
+    auto is_default_layout = [&](const file_layout_t& layout) -> bool {
+      return (layout == mdcache->default_file_layout);
+    };
+    auto get_inherited_layout = [&](CInode *cur) -> layout_xattr_info_t {
+      auto orig_in = cur;
+
+      while (cur) {
+        if (cur->get_projected_inode()->has_layout()) {
+	  auto& curr_layout = cur->get_projected_inode()->layout;
+	  if (is_default_layout(curr_layout)) {
+	    return {curr_layout, layout_xattr_info_t::InheritanceStatus::DEFAULT};
+	  }
+          if (cur == orig_in) {
+	      // we've found a new layout at this inode
+	      return {curr_layout, layout_xattr_info_t::InheritanceStatus::SET};
+          } else {
+	      return {curr_layout, layout_xattr_info_t::InheritanceStatus::INHERITED};
+          }
+        }
+
+        if (cur->is_root()) {
+          break;
+	}
+
+        cur = cur->get_projected_parent_dir()->get_inode();
+      }
+      mds->clog->error() << "no layout found at root dir!";
+      ceph_abort("no layout found at root dir! something is really messed up with layouts!");
+    };
+
+    if (xattr_name == "ceph.dir.layout.json"sv ||
+	xattr_name == "ceph.file.layout.json"sv) {
+      // fetch layout only for valid xattr_name
+      const auto lxi = get_inherited_layout(cur);
+
+      *css << "{\"stripe_unit\": " << lxi.layout.stripe_unit
+	   << ", \"stripe_count\": " << lxi.layout.stripe_count
+	   << ", \"object_size\": " << lxi.layout.object_size
+	   << ", \"pool_name\": ";
+      mds->objecter->with_osdmap([lxi, &css](const OSDMap& o) {
+	  *css << "\"";
+          if (o.have_pg_pool(lxi.layout.pool_id)) {
+	    *css << o.get_pool_name(lxi.layout.pool_id);
+	  }
+	  *css << "\"";
+	});
+      *css << ", \"pool_id\": " << (uint64_t)lxi.layout.pool_id;
+      *css << ", \"pool_namespace\": \"" << lxi.layout.pool_ns << "\"";
+      *css << ", \"inheritance\": \"@"
+	   << layout_xattr_info_t::status_to_string(lxi.status) << "\"}";
+    } else if ((xattr_name == "ceph.dir.layout.pool_name"sv) ||
+	       (xattr_name == "ceph.file.layout.pool_name"sv)) {
+      // fetch layout only for valid xattr_name
+      const auto lxi = get_inherited_layout(cur);
+      mds->objecter->with_osdmap([lxi, &css](const OSDMap& o) {
+	  if (o.have_pg_pool(lxi.layout.pool_id)) {
+	  *css << o.get_pool_name(lxi.layout.pool_id);
+	  }
+	  });
+    } else if ((xattr_name == "ceph.dir.layout.pool_id"sv) ||
+               (xattr_name == "ceph.file.layout.pool_id"sv)) {
+      // fetch layout only for valid xattr_name
+      const auto lxi = get_inherited_layout(cur);
+      *css << (uint64_t)lxi.layout.pool_id;
+    } else {
+      r = -CEPHFS_ENODATA; // no such attribute
+    }
+  } else if (xattr_name.substr(0, 12) == "ceph.dir.pin"sv) {
+    if (xattr_name == "ceph.dir.pin"sv) {
+      *css << cur->get_projected_inode()->export_pin;
+    } else if (xattr_name == "ceph.dir.pin.random"sv) {
+      *css << cur->get_projected_inode()->export_ephemeral_random_pin;
+    } else if (xattr_name == "ceph.dir.pin.distributed"sv) {
+      *css << cur->get_projected_inode()->export_ephemeral_distributed_pin;
+    } else {
+      // otherwise respond as invalid request
+      // since we only handle ceph vxattrs here
+      r = -CEPHFS_ENODATA; // no such attribute
+    }
+  } else {
+    // otherwise respond as invalid request
+    // since we only handle ceph vxattrs here
+    r = -CEPHFS_ENODATA; // no such attribute
+  }
+
+  if (r == 0) {
+    ENCODE_START(1, 1, bl);
+    encode(css->strv(), bl);
+    ENCODE_FINISH(bl);
+    mdr->reply_extra_bl = bl;
+  }
+
+  respond_to_request(mdr, r);
+}
+
+// =================================================================
+// DIRECTORY and NAMESPACE OPS
+
+
+// ------------------------------------------------
+
+struct C_WaitUnlinkToFinish : public MDSContext {  // waiter that forwards to `fin`, then unpins `dn`
+protected:
+  MDCache *mdcache;  // only used by get_mds() to reach the MDSRank
+  CDentry *dn;       // dentry whose PIN_PURGING reference finish() releases
+  MDSContext *fin;   // wrapped context; completed with the result code given to finish()
+
+  MDSRank *get_mds() override
+  {
+    ceph_assert(mdcache != NULL);
+    return mdcache->mds;
+  }
+
+public:
+  C_WaitUnlinkToFinish(MDCache *m, CDentry *d, MDSContext *f) :
+    mdcache(m), dn(d), fin(f) {}
+  void finish(int r) override {
+    fin->complete(r);               // hand the result to the wrapped context first
+    dn->put(CDentry::PIN_PURGING);  // then drop the pin taken in Server::wait_for_pending_unlink()
+  }
+};
+
+// Return true when `dn` carries STATE_UNLINKING while its projected linkage
+// is still non-null; false in every other case.
+bool Server::is_unlink_pending(CDentry *dn)
+{
+  CDentry::linkage_t *projected = dn->get_projected_linkage();
+  const bool still_linked = !projected->is_null();
+  return still_linked && dn->state_test(CDentry::STATE_UNLINKING);
+}
+
+void Server::wait_for_pending_unlink(CDentry *dn, MDRequestRef& mdr)
+{ // Drop mdr's locks and queue a retry of mdr on dn's WAIT_UNLINK_FINISH waiter list.
+  dout(20) << __func__ << " dn " << *dn << dendl;
+  mds->locker->drop_locks(mdr.get());
+  auto fin = new C_MDS_RetryRequest(mdcache, mdr);  // completed by the waiter registered below
+  dn->get(CDentry::PIN_PURGING);                     // released by C_WaitUnlinkToFinish::finish()
+  dn->add_waiter(CDentry::WAIT_UNLINK_FINISH, new C_WaitUnlinkToFinish(mdcache, dn, fin));
+}
+
+// MKNOD
+// Journal-completion callback; handle_client_mknod, _mkdir and _symlink all pass it to journal_and_reply().
+class C_MDS_mknod_finish : public ServerLogContext {
+  CDentry *dn;   // dentry whose projected linkage is popped in finish()
+  CInode *newi;  // the newly created inode
+public:
+  C_MDS_mknod_finish(Server *s, MDRequestRef& r, CDentry *d, CInode *ni) :
+    ServerLogContext(s, r), dn(d), newi(ni) {}
+  void finish(int r) override {
+    ceph_assert(r == 0);
+
+    // link the inode
+    dn->pop_projected_linkage();
+    
+    // be a bit hacky with the inode version here: we didn't bother projecting
+    // a new version of the inode since it's just been created, so we only
+    // keep mark_dirty() happy.  NOTE(review): no version decrement is done here.
+    newi->mark_dirty(mdr->ls);
+    newi->mark_dirty_parent(mdr->ls, true);
+
+    // mkdir?
+    if (newi->is_dir()) {
+      CDir *dir = newi->get_dirfrag(frag_t());
+      ceph_assert(dir);
+      dir->mark_dirty(mdr->ls);
+      dir->mark_new(mdr->ls);
+    }
+
+    mdr->apply();
+
+    MDRequestRef null_ref;
+    get_mds()->mdcache->send_dentry_link(dn, null_ref);
+
+    if (newi->is_file()) {
+      get_mds()->locker->share_inode_max_size(newi);
+    } else if (newi->is_dir()) {
+      // We do this now so that the linkages on the new directory are stable.
+      newi->maybe_ephemeral_rand();
+    }
+
+    // hit pop
+    get_mds()->balancer->hit_inode(newi, META_POP_IWR);
+
+    // reply
+    server->respond_to_request(mdr, 0);
+  }
+};
+
+
+void Server::handle_client_mknod(MDRequestRef& mdr)
+{ // MKNOD handler: create a new inode under an xlocked null dentry, journal it, and reply.
+  const cref_t<MClientRequest> &req = mdr->client_request;
+  client_t client = mdr->get_client();
+
+  unsigned mode = req->head.args.mknod.mode;
+  if ((mode & S_IFMT) == 0)
+    mode |= S_IFREG;  // no file-type bits supplied: treat as a regular file
+
+  mdr->disable_lock_cache();
+  CDentry *dn = rdlock_path_xlock_dentry(mdr, true, false, S_ISREG(mode));
+  if (!dn)
+    return;
+
+  if (is_unlink_pending(dn)) {
+    wait_for_pending_unlink(dn, mdr);  // queues a retry of mdr; nothing more to do now
+    return;
+  }
+
+  CDir *dir = dn->get_dir();
+  CInode *diri = dir->get_inode();
+  if (!check_access(mdr, diri, MAY_WRITE))
+    return;
+  if (!check_fragment_space(mdr, dir))
+    return;
+  if (!check_dir_max_entries(mdr, dir))
+    return;
+
+  ceph_assert(dn->get_projected_linkage()->is_null());
+  if (req->get_alternate_name().size() > alternate_name_max) {
+    dout(10) << " alternate_name longer than " << alternate_name_max << dendl;
+    respond_to_request(mdr, -CEPHFS_ENAMETOOLONG);
+    return;
+  }
+  dn->set_alternate_name(req->get_alternate_name());
+
+  // set layout: mdr->dir_layout unless it equals a default-constructed layout, else the mdcache default
+  file_layout_t layout;
+  if (mdr->dir_layout != file_layout_t())
+    layout = mdr->dir_layout;
+  else
+    layout = mdcache->default_file_layout;
+
+  CInode *newi = prepare_new_inode(mdr, dn->get_dir(), inodeno_t(req->head.ino), mode, &layout);
+  ceph_assert(newi);
+
+  dn->push_projected_linkage(newi);
+
+  auto _inode = newi->_get_inode();
+  _inode->version = dn->pre_dirty();
+  _inode->rdev = req->head.args.mknod.rdev;
+  _inode->rstat.rfiles = 1;
+  _inode->accounted_rstat = _inode->rstat;
+  if (layout.pool_id != mdcache->default_file_layout.pool_id)
+    _inode->add_old_pool(mdcache->default_file_layout.pool_id);  // record the default pool as an old pool
+  _inode->update_backtrace();
+
+  snapid_t follows = mdcache->get_global_snaprealm()->get_newest_seq();
+  SnapRealm *realm = dn->get_dir()->inode->find_snaprealm();
+  ceph_assert(follows >= realm->get_newest_seq());
+
+  // if the client created a _regular_ file via MKNOD, it's highly likely they'll
+  // want to write to it (e.g., if they are reexporting NFS)
+  if (S_ISREG(_inode->mode)) {
+    // issue a cap on the file
+    int cmode = CEPH_FILE_MODE_RDWR;
+    Capability *cap = mds->locker->issue_new_caps(newi, cmode, mdr, realm);
+    if (cap) {
+      cap->set_wanted(0);
+
+      // put locks in excl mode
+      newi->filelock.set_state(LOCK_EXCL);
+      newi->authlock.set_state(LOCK_EXCL);
+      newi->xattrlock.set_state(LOCK_EXCL);
+
+      dout(15) << " setting a client_range too, since this is a regular file" << dendl;
+      _inode->client_ranges[client].range.first = 0;
+      _inode->client_ranges[client].range.last = _inode->layout.stripe_unit;  // range is [0, stripe_unit]
+      _inode->client_ranges[client].follows = follows;
+      newi->mark_clientwriteable();
+      cap->mark_clientwriteable();
+    }
+  }
+
+  ceph_assert(dn->first == follows + 1);
+  newi->first = dn->first;
+    
+  dout(10) << "mknod mode " << _inode->mode << " rdev " << _inode->rdev << dendl;
+
+  // prepare finisher
+  mdr->ls = mdlog->get_current_segment();
+  EUpdate *le = new EUpdate(mdlog, "mknod");
+  mdlog->start_entry(le);
+  le->metablob.add_client_req(req->get_reqid(), req->get_oldest_client_tid());
+  journal_allocated_inos(mdr, &le->metablob);
+  
+  mdcache->predirty_journal_parents(mdr, &le->metablob, newi, dn->get_dir(),
+				    PREDIRTY_PRIMARY|PREDIRTY_DIR, 1);
+  le->metablob.add_primary_dentry(dn, newi, true, true, true);
+
+  journal_and_reply(mdr, newi, dn, le, new C_MDS_mknod_finish(this, mdr, dn, newi));
+  mds->balancer->maybe_fragment(dn->get_dir(), false);  // same post-journal call as mkdir/symlink/link
+}
+
+
+
+// MKDIR: create a directory inode plus its first, empty dirfrag under an xlocked null dentry.
+/* This function takes responsibility for the passed mdr*/
+void Server::handle_client_mkdir(MDRequestRef& mdr)
+{
+  const cref_t<MClientRequest> &req = mdr->client_request;
+
+  mdr->disable_lock_cache();
+  CDentry *dn = rdlock_path_xlock_dentry(mdr, true);
+  if (!dn)
+    return;
+
+  if (is_unlink_pending(dn)) {
+    wait_for_pending_unlink(dn, mdr);  // queues a retry of mdr; nothing more to do now
+    return;
+  }
+
+  CDir *dir = dn->get_dir();
+  CInode *diri = dir->get_inode();
+
+  // mkdir check access
+  if (!check_access(mdr, diri, MAY_WRITE))
+    return;
+
+  if (!check_fragment_space(mdr, dir))
+    return;
+  if (!check_dir_max_entries(mdr, dir))
+    return;
+
+  ceph_assert(dn->get_projected_linkage()->is_null());
+  if (req->get_alternate_name().size() > alternate_name_max) {
+    dout(10) << " alternate_name longer than " << alternate_name_max << dendl;
+    respond_to_request(mdr, -CEPHFS_ENAMETOOLONG);
+    return;
+  }
+  dn->set_alternate_name(req->get_alternate_name());
+
+  // new inode: force the file-type bits to S_IFDIR regardless of what the client sent
+  unsigned mode = req->head.args.mkdir.mode;
+  mode &= ~S_IFMT;
+  mode |= S_IFDIR;
+  CInode *newi = prepare_new_inode(mdr, dir, inodeno_t(req->head.ino), mode);
+  ceph_assert(newi);
+
+  // it's a directory.
+  dn->push_projected_linkage(newi);
+
+  auto _inode = newi->_get_inode();
+  _inode->version = dn->pre_dirty();
+  _inode->rstat.rsubdirs = 1;
+  _inode->accounted_rstat = _inode->rstat;
+  _inode->update_backtrace();
+
+  snapid_t follows = mdcache->get_global_snaprealm()->get_newest_seq();
+  SnapRealm *realm = dn->get_dir()->inode->find_snaprealm();
+  ceph_assert(follows >= realm->get_newest_seq());
+
+  dout(12) << " follows " << follows << dendl;
+  ceph_assert(dn->first == follows + 1);
+  newi->first = dn->first;
+
+  // ...and that new dir is empty.
+  CDir *newdir = newi->get_or_open_dirfrag(mdcache, frag_t());
+  newdir->state_set(CDir::STATE_CREATING);
+  newdir->mark_complete();
+  newdir->_get_fnode()->version = newdir->pre_dirty();
+
+  // prepare finisher
+  mdr->ls = mdlog->get_current_segment();
+  EUpdate *le = new EUpdate(mdlog, "mkdir");
+  mdlog->start_entry(le);
+  le->metablob.add_client_req(req->get_reqid(), req->get_oldest_client_tid());
+  journal_allocated_inos(mdr, &le->metablob);
+  mdcache->predirty_journal_parents(mdr, &le->metablob, newi, dn->get_dir(), PREDIRTY_PRIMARY|PREDIRTY_DIR, 1);
+  le->metablob.add_primary_dentry(dn, newi, true, true);
+  le->metablob.add_new_dir(newdir); // dirty AND complete AND new
+  
+  // issue a cap on the directory
+  int cmode = CEPH_FILE_MODE_RDWR;
+  Capability *cap = mds->locker->issue_new_caps(newi, cmode, mdr, realm);
+  if (cap) {
+    cap->set_wanted(0);
+
+    // put locks in excl mode
+    newi->filelock.set_state(LOCK_EXCL);
+    newi->authlock.set_state(LOCK_EXCL);
+    newi->xattrlock.set_state(LOCK_EXCL);
+  }
+
+  // make sure this inode gets into the journal
+  le->metablob.add_opened_ino(newi->ino());
+
+  journal_and_reply(mdr, newi, dn, le, new C_MDS_mknod_finish(this, mdr, dn, newi));  // finisher class is shared with mknod
+
+  // We hit_dir (via hit_inode) in our finish callback, but by then we might
+  // have overshot the split size (multiple mkdir in flight), so here is
+  // an early chance to split the dir if this mkdir makes it oversized.
+  mds->balancer->maybe_fragment(dir, false);
+}
+
+
+// SYMLINK: create a symlink inode (target taken from the request's path2) under an xlocked null dentry.
+void Server::handle_client_symlink(MDRequestRef& mdr)
+{
+  const auto& req = mdr->client_request;
+
+  mdr->disable_lock_cache();
+  CDentry *dn = rdlock_path_xlock_dentry(mdr, true);
+  if (!dn)
+    return;
+
+  if (is_unlink_pending(dn)) {
+    wait_for_pending_unlink(dn, mdr);  // queues a retry of mdr; nothing more to do now
+    return;
+  }
+
+  CDir *dir = dn->get_dir();
+  CInode *diri = dir->get_inode();
+
+  if (!check_access(mdr, diri, MAY_WRITE))
+    return;
+  if (!check_fragment_space(mdr, dir))
+    return;
+  if (!check_dir_max_entries(mdr, dir))
+    return;
+
+  ceph_assert(dn->get_projected_linkage()->is_null());
+  if (req->get_alternate_name().size() > alternate_name_max) {
+    dout(10) << " alternate_name longer than " << alternate_name_max << dendl;
+    respond_to_request(mdr, -CEPHFS_ENAMETOOLONG);
+    return;  // already replied: must not fall through and create/journal the inode (matches mknod/mkdir/link)
+  }
+  dn->set_alternate_name(req->get_alternate_name());
+
+  unsigned mode = S_IFLNK | 0777;
+  CInode *newi = prepare_new_inode(mdr, dir, inodeno_t(req->head.ino), mode);
+  ceph_assert(newi);
+
+  // it's a symlink
+  dn->push_projected_linkage(newi);
+
+  newi->symlink = req->get_path2();
+  auto _inode = newi->_get_inode();
+  _inode->version = dn->pre_dirty();
+  _inode->size = newi->symlink.length();  // inode size is the length of the target string
+  _inode->rstat.rbytes = _inode->size;
+  _inode->rstat.rfiles = 1;
+  _inode->accounted_rstat = _inode->rstat;
+  _inode->update_backtrace();
+
+  newi->first = dn->first;
+
+  // prepare finisher
+  mdr->ls = mdlog->get_current_segment();
+  EUpdate *le = new EUpdate(mdlog, "symlink");
+  mdlog->start_entry(le);
+  le->metablob.add_client_req(req->get_reqid(), req->get_oldest_client_tid());
+  journal_allocated_inos(mdr, &le->metablob);
+  mdcache->predirty_journal_parents(mdr, &le->metablob, newi, dn->get_dir(), PREDIRTY_PRIMARY|PREDIRTY_DIR, 1);
+  le->metablob.add_primary_dentry(dn, newi, true, true);
+
+  journal_and_reply(mdr, newi, dn, le, new C_MDS_mknod_finish(this, mdr, dn, newi));  // finisher class is shared with mknod
+  mds->balancer->maybe_fragment(dir, false);
+}
+
+
+
+
+
+// LINK
+// Validate a hard-link request, take the needed locks, then dispatch to _link_local() or _link_remote().
+void Server::handle_client_link(MDRequestRef& mdr)
+{
+  const cref_t<MClientRequest> &req = mdr->client_request;
+
+  dout(7) << "handle_client_link " << req->get_filepath()
+	  << " to " << req->get_filepath2()
+	  << dendl;
+
+  mdr->disable_lock_cache();
+
+  CDentry *destdn;
+  CInode *targeti;
+
+  if (req->get_filepath2().depth() == 0) {  // path2 has no components: the target is named by inode number only
+    targeti = mdcache->get_inode(req->get_filepath2().get_ino());
+    if (!targeti) {
+      dout(10) << "CEPHFS_ESTALE on path2, attempting recovery" << dendl;
+      inodeno_t ino = req->get_filepath2().get_ino();
+      mdcache->find_ino_peers(ino, new C_MDS_TryFindInode(this, mdr, mdcache, ino));
+      return;
+    }
+    mdr->pin(targeti);
+
+    if (!(mdr->locking_state & MutationImpl::SNAP2_LOCKED)) {  // flag keeps this step from repeating once it succeeded
+      CDentry *pdn = targeti->get_projected_parent_dn();
+      if (!pdn) {
+	dout(7) << "target has no parent dn, failing..." << dendl;
+	respond_to_request(mdr, -CEPHFS_EINVAL);
+	return;
+      }
+      if (!mds->locker->try_rdlock_snap_layout(pdn->get_dir()->get_inode(), mdr, 1))
+	return;
+      mdr->locking_state |= MutationImpl::SNAP2_LOCKED;
+    }
+
+    destdn = rdlock_path_xlock_dentry(mdr, false);
+    if (!destdn)
+      return;
+  } else {  // path2 has components: lock both paths in one call
+    auto ret = rdlock_two_paths_xlock_destdn(mdr, false);
+    destdn = ret.first;
+    if (!destdn)
+      return;
+
+    if (!destdn->get_projected_linkage()->is_null()) {
+      respond_to_request(mdr, -CEPHFS_EEXIST);
+      return;
+    }
+
+    targeti = ret.second->get_projected_linkage()->get_inode();
+  }
+
+  if (is_unlink_pending(destdn)) {
+    wait_for_pending_unlink(destdn, mdr);  // queues a retry of mdr; nothing more to do now
+    return;
+  }
+
+  ceph_assert(destdn->get_projected_linkage()->is_null());
+  if (req->get_alternate_name().size() > alternate_name_max) {
+    dout(10) << " alternate_name longer than " << alternate_name_max << dendl;
+    respond_to_request(mdr, -CEPHFS_ENAMETOOLONG);
+    return;
+  }
+  destdn->set_alternate_name(req->get_alternate_name());
+
+  if (targeti->is_dir()) {
+    dout(7) << "target is a dir, failing..." << dendl;
+    respond_to_request(mdr, -CEPHFS_EINVAL);
+    return;
+  }
+
+  CDir *dir = destdn->get_dir();
+  dout(7) << "handle_client_link link " << destdn->get_name() << " in " << *dir << dendl;
+  dout(7) << "target is " << *targeti << dendl;
+
+  if (!(mdr->locking_state & MutationImpl::ALL_LOCKED)) {  // xlock the target's snaplock and linklock once
+    MutationImpl::LockOpVec lov;
+    lov.add_xlock(&targeti->snaplock);
+    lov.add_xlock(&targeti->linklock);
+
+    if (!mds->locker->acquire_locks(mdr, lov))
+      return;
+
+    mdr->locking_state |= MutationImpl::ALL_LOCKED;
+  }
+
+  if (targeti->get_projected_inode()->nlink == 0) {
+    dout(7) << "target has no link, failing..." << dendl;
+    respond_to_request(mdr, -CEPHFS_ENOENT);
+    return;
+  }
+
+  if ((!mdr->has_more() || mdr->more()->witnessed.empty())) {  // checks run only while no peer has witnessed the request
+    if (!check_access(mdr, targeti, MAY_WRITE))
+      return;
+
+    if (!check_access(mdr, dir->get_inode(), MAY_WRITE))
+      return;
+
+    if (!check_fragment_space(mdr, dir))
+      return;
+
+    if (!check_dir_max_entries(mdr, dir))
+      return;
+  }
+
+  CInode* target_pin = targeti->get_projected_parent_dir()->inode;
+  SnapRealm *target_realm = target_pin->find_snaprealm();
+  if (target_pin != dir->inode &&
+      target_realm->get_subvolume_ino() !=
+      dir->inode->find_snaprealm()->get_subvolume_ino()) {  // different parent dir AND different subvolume ino
+    dout(7) << "target is in different subvolume, failing..." << dendl;
+    respond_to_request(mdr, -CEPHFS_EXDEV);
+    return;
+  }
+
+  // go!
+  ceph_assert(g_conf()->mds_kill_link_at != 1);
+
+  // local or remote?
+  if (targeti->is_auth()) 
+    _link_local(mdr, destdn, targeti, target_realm);
+  else 
+    _link_remote(mdr, true, destdn, targeti);
+  mds->balancer->maybe_fragment(dir, false);  
+}
+
+
+class C_MDS_link_local_finish : public ServerLogContext {  // journal callback for Server::_link_local()
+  CDentry *dn;        // new remote dentry
+  CInode *targeti;    // inode being linked to
+  version_t dnpv;     // value returned by dn->pre_dirty() in _link_local()
+  version_t tipv;     // value returned by targeti->pre_dirty() in _link_local()
+  bool adjust_realm;  // forwarded unchanged to _link_local_finish()
+public:
+  C_MDS_link_local_finish(Server *s, MDRequestRef& r, CDentry *d, CInode *ti,
+			  version_t dnpv_, version_t tipv_, bool ar) :
+    ServerLogContext(s, r), dn(d), targeti(ti),
+    dnpv(dnpv_), tipv(tipv_), adjust_realm(ar) { }
+  void finish(int r) override {
+    ceph_assert(r == 0);
+    server->_link_local_finish(mdr, dn, targeti, dnpv, tipv, adjust_realm);
+  }
+};
+
+
+void Server::_link_local(MDRequestRef& mdr, CDentry *dn, CInode *targeti, SnapRealm *target_realm)
+{ // Project nlink+1 on targeti and journal a "link_local" EUpdate adding dn as a remote dentry.
+  dout(10) << "_link_local " << *dn << " to " << *targeti << dendl;
+
+  mdr->ls = mdlog->get_current_segment();
+
+  // predirty NEW dentry
+  version_t dnpv = dn->pre_dirty();
+  version_t tipv = targeti->pre_dirty();
+  
+  // project inode update
+  auto pi = targeti->project_inode(mdr);
+  pi.inode->nlink++;
+  pi.inode->ctime = mdr->get_op_stamp();
+  if (mdr->get_op_stamp() > pi.inode->rstat.rctime)
+    pi.inode->rstat.rctime = mdr->get_op_stamp();  // rctime only ever moves forward here
+  pi.inode->change_attr++;
+  pi.inode->version = tipv;
+
+  bool adjust_realm = false;
+  if (!target_realm->get_subvolume_ino() && !targeti->is_projected_snaprealm_global()) {
+    sr_t *newsnap = targeti->project_snaprealm();
+    targeti->mark_snaprealm_global(newsnap);
+    targeti->record_snaprealm_parent_dentry(newsnap, target_realm, targeti->get_projected_parent_dn(), true);
+    adjust_realm = true;  // makes _link_local_finish() send the snap split notifications
+  }
+
+  // log + wait
+  EUpdate *le = new EUpdate(mdlog, "link_local");
+  mdlog->start_entry(le);
+  le->metablob.add_client_req(mdr->reqid, mdr->client_request->get_oldest_client_tid());
+  mdcache->predirty_journal_parents(mdr, &le->metablob, targeti, dn->get_dir(), PREDIRTY_DIR, 1);      // new dn
+  mdcache->predirty_journal_parents(mdr, &le->metablob, targeti, 0, PREDIRTY_PRIMARY);           // targeti
+  le->metablob.add_remote_dentry(dn, true, targeti->ino(), targeti->d_type());  // new remote
+  mdcache->journal_dirty_inode(mdr.get(), &le->metablob, targeti);
+
+  // do this after predirty_*, to avoid funky extra dnl arg
+  dn->push_projected_linkage(targeti->ino(), targeti->d_type());
+
+  journal_and_reply(mdr, targeti, dn, le,
+		    new C_MDS_link_local_finish(this, mdr, dn, targeti, dnpv, tipv, adjust_realm));
+}
+
+void Server::_link_local_finish(MDRequestRef& mdr, CDentry *dn, CInode *targeti,
+				version_t dnpv, version_t tipv, bool adjust_realm)
+{ // Runs once the "link_local" entry is journaled; tipv is received but not read in this body.
+  dout(10) << "_link_local_finish " << *dn << " to " << *targeti << dendl;
+
+  // link and unlock the NEW dentry
+  CDentry::linkage_t *dnl = dn->pop_projected_linkage();
+  if (!dnl->get_inode())
+    dn->link_remote(dnl, targeti);
+  dn->mark_dirty(dnpv, mdr->ls);
+
+  // target inode
+  mdr->apply();
+
+  MDRequestRef null_ref;
+  mdcache->send_dentry_link(dn, null_ref);
+
+  if (adjust_realm) {  // set by _link_local() when it projected a new snaprealm on targeti
+    int op = CEPH_SNAP_OP_SPLIT;
+    mds->mdcache->send_snap_update(targeti, 0, op);
+    mds->mdcache->do_realm_invalidate_and_update_notify(targeti, op);
+  }
+
+  // bump target popularity
+  mds->balancer->hit_inode(targeti, META_POP_IWR);
+  mds->balancer->hit_dir(dn->get_dir(), META_POP_IWR);
+
+  // reply
+  respond_to_request(mdr, 0);
+}
+
+
+// link / unlink remote
+// Journal callback for Server::_link_remote(); captures the dentry's projected version at construction.
+class C_MDS_link_remote_finish : public ServerLogContext {
+  bool inc;         // true for link, false for unlink
+  CDentry *dn;      // dentry being linked or unlinked
+  CInode *targeti;  // inode the dentry refers to
+  version_t dpv;    // dn->get_projected_version() sampled in the constructor
+public:
+  C_MDS_link_remote_finish(Server *s, MDRequestRef& r, bool i, CDentry *d, CInode *ti) :
+    ServerLogContext(s, r), inc(i), dn(d), targeti(ti),
+    dpv(d->get_projected_version()) {}
+  void finish(int r) override {
+    ceph_assert(r == 0);
+    server->_link_remote_finish(mdr, inc, dn, targeti, dpv);
+  }
+};
+
+void Server::_link_remote(MDRequestRef& mdr, bool inc, CDentry *dn, CInode *targeti)
+{ // Link (inc) or unlink (!inc) dn when targeti's authority is another rank: prepare on that rank, then journal here.
+  dout(10) << "_link_remote " 
+	   << (inc ? "link ":"unlink ")
+	   << *dn << " to " << *targeti << dendl;
+
+  // 1. send LinkPrepare to dest (journal nlink++ prepare)
+  mds_rank_t linkauth = targeti->authority().first;
+  if (mdr->more()->witnessed.count(linkauth) == 0) {  // linkauth has not witnessed this request yet
+    if (mds->is_cluster_degraded() &&
+	!mds->mdsmap->is_clientreplay_or_active_or_stopping(linkauth)) {
+      dout(10) << " targeti auth mds." << linkauth << " is not active" << dendl;
+      if (mdr->more()->waiting_on_peer.empty())
+	mds->wait_for_active_peer(linkauth, new C_MDS_RetryRequest(mdcache, mdr));
+      return;
+    }
+
+    dout(10) << " targeti auth must prepare nlink++/--" << dendl;
+    int op;
+    if (inc)
+      op = MMDSPeerRequest::OP_LINKPREP;
+    else 
+      op = MMDSPeerRequest::OP_UNLINKPREP;
+    auto req = make_message<MMDSPeerRequest>(mdr->reqid, mdr->attempt, op);
+    targeti->set_object_info(req->get_object_info());
+    req->op_stamp = mdr->get_op_stamp();
+    if (auto& desti_srnode = mdr->more()->desti_srnode)
+      encode(*desti_srnode, req->desti_snapbl);  // ship the projected snaprealm node, if one is set
+    mds->send_message_mds(req, linkauth);
+
+    ceph_assert(mdr->more()->waiting_on_peer.count(linkauth) == 0);
+    mdr->more()->waiting_on_peer.insert(linkauth);
+    return;  // nothing more until linkauth is recorded in witnessed
+  }
+  dout(10) << " targeti auth has prepared nlink++/--" << dendl;
+
+  ceph_assert(g_conf()->mds_kill_link_at != 2);
+
+  if (auto& desti_srnode = mdr->more()->desti_srnode) {  // reference: the member itself is reset below
+    delete desti_srnode;
+    desti_srnode = NULL;
+  }
+
+  mdr->set_mds_stamp(ceph_clock_now());
+
+  // add to event
+  mdr->ls = mdlog->get_current_segment();
+  EUpdate *le = new EUpdate(mdlog, inc ? "link_remote":"unlink_remote");
+  mdlog->start_entry(le);
+  le->metablob.add_client_req(mdr->reqid, mdr->client_request->get_oldest_client_tid());
+  if (!mdr->more()->witnessed.empty()) {
+    dout(20) << " noting uncommitted_peers " << mdr->more()->witnessed << dendl;
+    le->reqid = mdr->reqid;
+    le->had_peers = true;
+    mdcache->add_uncommitted_leader(mdr->reqid, mdr->ls, mdr->more()->witnessed);
+  }
+
+  if (inc) {
+    dn->pre_dirty();
+    mdcache->predirty_journal_parents(mdr, &le->metablob, targeti, dn->get_dir(), PREDIRTY_DIR, 1);
+    le->metablob.add_remote_dentry(dn, true, targeti->ino(), targeti->d_type()); // new remote
+    dn->push_projected_linkage(targeti->ino(), targeti->d_type());
+  } else {
+    dn->pre_dirty();
+    mdcache->predirty_journal_parents(mdr, &le->metablob, targeti, dn->get_dir(), PREDIRTY_DIR, -1);
+    mdcache->journal_cow_dentry(mdr.get(), &le->metablob, dn);
+    le->metablob.add_null_dentry(dn, true);
+    dn->push_projected_linkage();  // no arguments: project a null linkage
+  }
+
+  journal_and_reply(mdr, (inc ? targeti : nullptr), dn, le,
+		    new C_MDS_link_remote_finish(this, mdr, inc, dn, targeti));
+}
+
+void Server::_link_remote_finish(MDRequestRef& mdr, bool inc,
+				 CDentry *dn, CInode *targeti,
+				 version_t dpv)
+{ // Runs once the link_remote/unlink_remote entry is journaled: apply it, notify, reply.
+  dout(10) << "_link_remote_finish "
+	   << (inc ? "link ":"unlink ")
+	   << *dn << " to " << *targeti << dendl;
+
+  ceph_assert(g_conf()->mds_kill_link_at != 3);
+
+  if (!mdr->more()->witnessed.empty())
+    mdcache->logged_leader_update(mdr->reqid);
+
+  if (inc) {
+    // link the new dentry
+    CDentry::linkage_t *dnl = dn->pop_projected_linkage();
+    if (!dnl->get_inode())
+      dn->link_remote(dnl, targeti);
+    dn->mark_dirty(dpv, mdr->ls);
+  } else {
+    // unlink main dentry
+    dn->get_dir()->unlink_inode(dn);
+    dn->pop_projected_linkage();
+    dn->mark_dirty(dn->get_projected_version(), mdr->ls);  // dirty old dentry
+  }
+
+  mdr->apply();
+
+  MDRequestRef null_ref;
+  if (inc) {
+    mdcache->send_dentry_link(dn, null_ref);
+  } else {
+    dn->state_clear(CDentry::STATE_UNLINKING);  // is_unlink_pending(dn) is false from here on
+    mdcache->send_dentry_unlink(dn, NULL, null_ref);
+
+    MDSContext::vec finished;
+    dn->take_waiting(CDentry::WAIT_UNLINK_FINISH, finished);  // waiters added by wait_for_pending_unlink()
+    mdcache->mds->queue_waiters(finished);
+  }
+
+  // bump target popularity
+  mds->balancer->hit_inode(targeti, META_POP_IWR);
+  mds->balancer->hit_dir(dn->get_dir(), META_POP_IWR);
+
+  // reply
+  respond_to_request(mdr, 0);
+
+  if (!inc)
+    // removing a new dn?
+    dn->get_dir()->try_remove_unlinked_dn(dn);
+}
+
+
+// remote linking/unlinking
+// Journal callback for the peer's prepare entry; continues in Server::_logged_peer_link().
+class C_MDS_PeerLinkPrep : public ServerLogContext {
+  CInode *targeti;    // inode whose nlink change was prepared
+  bool adjust_realm;  // forwarded unchanged to _logged_peer_link()
+public:
+  C_MDS_PeerLinkPrep(Server *s, MDRequestRef& r, CInode *t, bool ar) :
+    ServerLogContext(s, r), targeti(t), adjust_realm(ar) { }
+  void finish(int r) override {
+    ceph_assert(r == 0);
+    server->_logged_peer_link(mdr, targeti, adjust_realm);
+  }
+};
+
+class C_MDS_PeerLinkCommit : public ServerContext {  // stored as mdr->more()->peer_commit by handle_peer_link_prep()
+  MDRequestRef mdr;  // held by value, unlike the reference passed to the constructor
+  CInode *targeti;   // inode whose nlink change is being committed or rolled back
+public:
+  C_MDS_PeerLinkCommit(Server *s, MDRequestRef& r, CInode *t) :
+    ServerContext(s), mdr(r), targeti(t) { }
+  void finish(int r) override {
+    server->_commit_peer_link(mdr, r, targeti);  // r is passed through: 0 commits, anything else rolls back
+  }
+};
+
+void Server::handle_peer_link_prep(MDRequestRef& mdr)
+{ // Peer side of OP_LINKPREP / OP_UNLINKPREP: project the nlink change on the target and journal a PREPARE entry.
+  dout(10) << "handle_peer_link_prep " << *mdr
+	   << " on " << mdr->peer_request->get_object_info()
+	   << dendl;
+
+  ceph_assert(g_conf()->mds_kill_link_at != 4);
+
+  CInode *targeti = mdcache->get_inode(mdr->peer_request->get_object_info().ino);
+  ceph_assert(targeti);
+  dout(10) << "targeti " << *targeti << dendl;
+  CDentry *dn = targeti->get_parent_dn();
+  CDentry::linkage_t *dnl = dn->get_linkage();
+  ceph_assert(dnl->is_primary());
+
+  mdr->set_op_stamp(mdr->peer_request->op_stamp);  // adopt the stamp carried by the peer request
+
+  mdr->auth_pin(targeti);
+
+  //ceph_abort();  // test hack: make sure leader can handle a peer that fails to prepare...
+  ceph_assert(g_conf()->mds_kill_link_at != 5);
+
+  // journal it
+  mdr->ls = mdlog->get_current_segment();
+  EPeerUpdate *le = new EPeerUpdate(mdlog, "peer_link_prep", mdr->reqid, mdr->peer_to_mds,
+				      EPeerUpdate::OP_PREPARE, EPeerUpdate::LINK);
+  mdlog->start_entry(le);
+
+  auto pi = dnl->get_inode()->project_inode(mdr);
+
+  // update journaled target inode
+  bool inc;
+  bool adjust_realm = false;     // passed to C_MDS_PeerLinkPrep below
+  bool realm_projected = false;  // controls whether a snap blob goes into the rollback record
+  if (mdr->peer_request->get_op() == MMDSPeerRequest::OP_LINKPREP) {
+    inc = true;
+    pi.inode->nlink++;
+
+    CDentry *target_pdn = targeti->get_projected_parent_dn();
+    SnapRealm *target_realm = target_pdn->get_dir()->inode->find_snaprealm();
+    if (!target_realm->get_subvolume_ino() && !targeti->is_projected_snaprealm_global()) {
+      sr_t *newsnap = targeti->project_snaprealm();
+      targeti->mark_snaprealm_global(newsnap);
+      targeti->record_snaprealm_parent_dentry(newsnap, target_realm, target_pdn, true);
+      adjust_realm = true;
+      realm_projected = true;
+    }
+  } else {
+    inc = false;
+    pi.inode->nlink--;
+    if (targeti->is_projected_snaprealm_global()) {
+      ceph_assert(mdr->peer_request->desti_snapbl.length());
+      auto p = mdr->peer_request->desti_snapbl.cbegin();
+
+      sr_t *newsnap = targeti->project_snaprealm();
+      decode(*newsnap, p);  // take the snaprealm node encoded in the request
+
+      if (pi.inode->nlink == 0)
+	ceph_assert(!newsnap->is_parent_global());
+
+      realm_projected = true;
+    } else {
+      ceph_assert(mdr->peer_request->desti_snapbl.length() == 0);
+    }
+  }
+
+  link_rollback rollback;
+  rollback.reqid = mdr->reqid;
+  rollback.ino = targeti->ino();
+  rollback.old_ctime = targeti->get_inode()->ctime;   // we hold versionlock xlock; no concurrent projections
+  const auto& pf = targeti->get_parent_dn()->get_dir()->get_projected_fnode();
+  rollback.old_dir_mtime = pf->fragstat.mtime;
+  rollback.old_dir_rctime = pf->rstat.rctime;
+  rollback.was_inc = inc;
+  if (realm_projected) {
+    if (targeti->snaprealm) {
+      encode(true, rollback.snapbl);  // leading bool records whether a snap blob follows
+      targeti->encode_snap_blob(rollback.snapbl);
+    } else {
+      encode(false, rollback.snapbl);
+    }
+  }
+  encode(rollback, le->rollback);
+  mdr->more()->rollback_bl = le->rollback;  // same blob is handed to do_link_rollback() by _commit_peer_link()
+
+  pi.inode->ctime = mdr->get_op_stamp();
+  pi.inode->version = targeti->pre_dirty();
+
+  dout(10) << " projected inode " << pi.inode->ino << " v " << pi.inode->version << dendl;
+
+  // commit case
+  mdcache->predirty_journal_parents(mdr, &le->commit, dnl->get_inode(), 0, PREDIRTY_SHALLOW|PREDIRTY_PRIMARY);
+  mdcache->journal_dirty_inode(mdr.get(), &le->commit, targeti);
+  mdcache->add_uncommitted_peer(mdr->reqid, mdr->ls, mdr->peer_to_mds);
+
+  // set up commit waiter
+  mdr->more()->peer_commit = new C_MDS_PeerLinkCommit(this, mdr, targeti);
+
+  mdr->more()->peer_update_journaled = true;
+  submit_mdlog_entry(le, new C_MDS_PeerLinkPrep(this, mdr, targeti, adjust_realm),
+                     mdr, __func__);
+  mdlog->flush();
+}
+
+void Server::_logged_peer_link(MDRequestRef& mdr, CInode *targeti, bool adjust_realm)
+{ // Runs once the peer's prepare entry is journaled: apply the projection and ack, or finish if aborted.
+  dout(10) << "_logged_peer_link " << *mdr
+	   << " " << *targeti << dendl;
+
+  ceph_assert(g_conf()->mds_kill_link_at != 6);
+
+  // update the target
+  mdr->apply();
+
+  // hit pop
+  mds->balancer->hit_inode(targeti, META_POP_IWR);
+
+  // done.
+  mdr->reset_peer_request();
+
+  if (adjust_realm) {  // set by handle_peer_link_prep() when it projected a new snaprealm for a link
+    int op = CEPH_SNAP_OP_SPLIT;
+    mds->mdcache->send_snap_update(targeti, 0, op);
+    mds->mdcache->do_realm_invalidate_and_update_notify(targeti, op);
+  }
+
+  // ack
+  if (!mdr->aborted) {
+    auto reply = make_message<MMDSPeerRequest>(mdr->reqid, mdr->attempt, MMDSPeerRequest::OP_LINKPREPACK);
+    mds->send_message_mds(reply, mdr->peer_to_mds);
+  } else {
+    dout(10) << " abort flag set, finishing" << dendl;
+    mdcache->request_finish(mdr);
+  }
+}
+
+
+struct C_MDS_CommittedPeer : public ServerLogContext {  // journal callback for the peer commit entry
+  C_MDS_CommittedPeer(Server *s, MDRequestRef& m) : ServerLogContext(s, m) {}
+  void finish(int r) override {
+    server->_committed_peer(mdr);  // r is not inspected here
+  }
+};
+
+// Resolve a prepared peer link/unlink: r == 0 journals the commit entry,
+// any other r undoes the prepare using the saved rollback blob.
+void Server::_commit_peer_link(MDRequestRef& mdr, int r, CInode *targeti)
+{
+  dout(10) << "_commit_peer_link " << *mdr << " r=" << r << " " << *targeti << dendl;
+  ceph_assert(g_conf()->mds_kill_link_at != 7);
+
+  if (r != 0) {
+    // non-zero result: undo the prepared update
+    do_link_rollback(mdr->more()->rollback_bl, mdr->peer_to_mds, mdr);
+    return;
+  }
+
+  // drop our pins, etc. before journaling
+  mdr->cleanup();
+  // record the commit; _committed_peer() runs once it is logged
+  auto *commit_entry = new EPeerUpdate(mdlog, "peer_link_commit", mdr->reqid, mdr->peer_to_mds,
+				       EPeerUpdate::OP_COMMIT, EPeerUpdate::LINK);
+  mdlog->start_entry(commit_entry);
+  submit_mdlog_entry(commit_entry, new C_MDS_CommittedPeer(this, mdr), mdr, __func__);
+  mdlog->flush();
+}
+
+void Server::_committed_peer(MDRequestRef& mdr)
+{  // Final step of a peer commit: clear the uncommitted-peer record, send OP_COMMITTED, finish the request.
+  dout(10) << "_committed_peer " << *mdr << dendl;
+
+  ceph_assert(g_conf()->mds_kill_link_at != 8);  // trips when the mds_kill_link_at option is 8 (presumably a test kill point)
+
+  bool assert_exist = mdr->more()->peer_update_journaled;  // passed through as finish_uncommitted_peer()'s assert_exist argument
+  mdcache->finish_uncommitted_peer(mdr->reqid, assert_exist);
+  auto req = make_message<MMDSPeerRequest>(mdr->reqid, mdr->attempt, MMDSPeerRequest::OP_COMMITTED);
+  mds->send_message_mds(req, mdr->peer_to_mds);  // notify the MDS recorded in peer_to_mds
+  mdcache->request_finish(mdr);
+}
+
+struct C_MDS_LoggedLinkRollback : public ServerLogContext {  // completion context that forwards to Server::_link_rollback_finish()
+  MutationRef mut;  // mutation holding the rollback's projected changes
+  map<client_t,ref_t<MClientSnap>> splits;  // per-client snap messages, moved in from the caller
+  C_MDS_LoggedLinkRollback(Server *s, MutationRef& m, MDRequestRef& r,
+			   map<client_t,ref_t<MClientSnap>>&& _splits) :
+    ServerLogContext(s, r), mut(m), splits(std::move(_splits)) {
+  }
+  void finish(int r) override {  // r is not inspected
+    server->_link_rollback_finish(mut, mdr, splits);
+  }
+};
+
+void Server::do_link_rollback(bufferlist &rbl, mds_rank_t leader, MDRequestRef& mdr)
+{  // Undo a prepared peer link/unlink described by the link_rollback encoded in rbl, and journal the rollback.
+  link_rollback rollback;
+  auto p = rbl.cbegin();
+  decode(rollback, p);
+
+  dout(10) << "do_link_rollback on " << rollback.reqid 
+	   << (rollback.was_inc ? " inc":" dec") 
+	   << " ino " << rollback.ino
+	   << dendl;
+
+  ceph_assert(g_conf()->mds_kill_link_at != 9);  // trips when the mds_kill_link_at option is 9 (presumably a test kill point)
+
+  mdcache->add_rollback(rollback.reqid, leader); // need to finish this update before resolve finishes
+  ceph_assert(mdr || mds->is_resolve());  // a null mdr is only acceptable while this MDS is in resolve
+
+  MutationRef mut(new MutationImpl(nullptr, utime_t(), rollback.reqid));  // standalone mutation tagged with the rolled-back reqid
+  mut->ls = mds->mdlog->get_current_segment();
+
+  CInode *in = mdcache->get_inode(rollback.ino);
+  ceph_assert(in);
+  dout(10) << " target is " << *in << dendl;
+  ceph_assert(!in->is_projected());  // live peer request hold versionlock xlock.
+  
+  auto pi = in->project_inode(mut);
+  pi.inode->version = in->pre_dirty();
+
+  // parent dir rctime
+  CDir *parent = in->get_projected_parent_dn()->get_dir();
+  auto pf = parent->project_fnode(mut);
+  pf->version = parent->pre_dirty();
+  if (pf->fragstat.mtime == pi.inode->ctime) {  // dir mtime still equals the inode's ctime: put the saved dir mtime back
+    pf->fragstat.mtime = rollback.old_dir_mtime;
+    if (pf->rstat.rctime == pi.inode->ctime)  // likewise for the recursive ctime
+      pf->rstat.rctime = rollback.old_dir_rctime;
+    mut->add_updated_lock(&parent->get_inode()->filelock);
+    mut->add_updated_lock(&parent->get_inode()->nestlock);
+  }
+
+  // inode: restore the old ctime and reverse the nlink change
+  pi.inode->ctime = rollback.old_ctime;
+  if (rollback.was_inc)
+    pi.inode->nlink--;
+  else
+    pi.inode->nlink++;
+
+  map<client_t,ref_t<MClientSnap>> splits;
+  if (rollback.snapbl.length() && in->snaprealm) {  // a snap blob was saved and the inode currently has a snaprealm
+    bool hadrealm;
+    auto p = rollback.snapbl.cbegin();
+    decode(hadrealm, p);  // the blob starts with a flag: did the inode have a realm before the prepare?
+    if (hadrealm) {
+      if (!mds->is_resolve()) {
+	sr_t *new_srnode = new sr_t();
+	decode(*new_srnode, p);
+	in->project_snaprealm(new_srnode);  // restore the saved srnode as a projected snaprealm
+      } else {
+	decode(in->snaprealm->srnode, p);  // in resolve: decode the saved srnode straight over the current one
+      }
+    } else {
+      SnapRealm *realm = parent->get_inode()->find_snaprealm();
+      if (!mds->is_resolve())
+	mdcache->prepare_realm_merge(in->snaprealm, realm, splits);
+      in->project_snaprealm(NULL);  // no realm before the prepare: project a null snaprealm
+    }
+  }
+
+  // journal it
+  EPeerUpdate *le = new EPeerUpdate(mdlog, "peer_link_rollback", rollback.reqid, leader,
+				      EPeerUpdate::OP_ROLLBACK, EPeerUpdate::LINK);
+  mdlog->start_entry(le);
+  le->commit.add_dir_context(parent);
+  le->commit.add_dir(parent, true);
+  le->commit.add_primary_dentry(in->get_projected_parent_dn(), 0, true);
+  
+  submit_mdlog_entry(le, new C_MDS_LoggedLinkRollback(this, mut, mdr, std::move(splits)),
+                     mdr, __func__);  // _link_rollback_finish() runs from the completion context
+  mdlog->flush();
+}
+
+void Server::_link_rollback_finish(MutationRef& mut, MDRequestRef& mdr,
+				   map<client_t,ref_t<MClientSnap>>& splits)
+{  // Completion of do_link_rollback(): apply the rollback mutation and release the request and rollback records.
+  dout(10) << "_link_rollback_finish" << dendl;
+
+  ceph_assert(g_conf()->mds_kill_link_at != 10);  // trips when the mds_kill_link_at option is 10 (presumably a test kill point)
+
+  mut->apply();
+
+  if (!mds->is_resolve())  // snap messages are only sent outside of resolve
+    mdcache->send_snaps(splits);
+
+  if (mdr)  // mdr may be null (do_link_rollback allows that during resolve)
+    mdcache->request_finish(mdr);
+
+  mdcache->finish_rollback(mut->reqid, mdr);
+
+  mut->cleanup();
+}
+
+
+void Server::handle_peer_link_prep_ack(MDRequestRef& mdr, const cref_t<MMDSPeerRequest> &m)
+{  // Record the sender of a link-prepare ack as peer and witness, then re-dispatch the client request.
+  dout(10) << "handle_peer_link_prep_ack " << *mdr
+	   << " " << *m << dendl;
+  mds_rank_t from = mds_rank_t(m->get_source().num());  // rank of the acking MDS
+
+  ceph_assert(g_conf()->mds_kill_link_at != 11);  // trips when the mds_kill_link_at option is 11 (presumably a test kill point)
+
+  // note peer
+  mdr->more()->peers.insert(from);
+  
+  // witnessed! (each rank may only ack once)
+  ceph_assert(mdr->more()->witnessed.count(from) == 0);
+  mdr->more()->witnessed.insert(from);
+  ceph_assert(!m->is_not_journaled());  // a link prepare ack must not be flagged not-journaled
+  mdr->more()->has_journaled_peers = true;
+  
+  // remove from waiting list
+  ceph_assert(mdr->more()->waiting_on_peer.count(from));
+  mdr->more()->waiting_on_peer.erase(from);
+
+  ceph_assert(mdr->more()->waiting_on_peer.empty());  // no other peer may still be outstanding at this point
+
+  dispatch_client_request(mdr);  // go again!
+}
+
+
+
+
+
+// UNLINK
+
+void Server::handle_client_unlink(MDRequestRef& mdr)
+{  // Handle a client unlink or rmdir (CEPH_MDS_OP_RMDIR): validate, take locks, gather witnesses, then hand off.
+  const cref_t<MClientRequest> &req = mdr->client_request;
+  client_t client = mdr->get_client();
+
+  // rmdir or unlink?
+  bool rmdir = (req->get_op() == CEPH_MDS_OP_RMDIR);
+
+  if (rmdir)
+    mdr->disable_lock_cache();
+
+  CDentry *dn = rdlock_path_xlock_dentry(mdr, false, true);
+  if (!dn)  // nothing to work on yet; NOTE(review): presumably the helper replied or queued a retry - confirm
+    return;
+
+  // notify replica MDSes the dentry is under unlink
+  if (!dn->state_test(CDentry::STATE_UNLINKING)) {
+    dn->state_set(CDentry::STATE_UNLINKING);
+    mdcache->send_dentry_unlink(dn, nullptr, mdr, true);
+    if (dn->replica_unlinking_ref) {  // nonzero ref: stop here; NOTE(review): presumably resumed once replicas ack - confirm
+      return;
+    }
+  }
+
+  CDentry::linkage_t *dnl = dn->get_linkage(client, mdr);
+  ceph_assert(!dnl->is_null());
+  CInode *in = dnl->get_inode();
+
+  if (rmdir) {
+    dout(7) << "handle_client_rmdir on " << *dn << dendl;
+  } else {
+    dout(7) << "handle_client_unlink on " << *dn << dendl;
+  }
+  dout(7) << "dn links to " << *in << dendl;
+
+  // rmdir vs is_dir: every error exit below clears STATE_UNLINKING before replying
+  if (in->is_dir()) {
+    if (rmdir) {
+      // do empty directory checks
+      if (_dir_is_nonempty_unlocked(mdr, in)) {
+        dn->state_clear(CDentry::STATE_UNLINKING);
+        respond_to_request(mdr, -CEPHFS_ENOTEMPTY);
+	return;
+      }
+    } else {
+      dout(7) << "handle_client_unlink on dir " << *in << ", returning error" << dendl;
+      dn->state_clear(CDentry::STATE_UNLINKING);
+      respond_to_request(mdr, -CEPHFS_EISDIR);
+      return;
+    }
+  } else {
+    if (rmdir) {
+      // rmdir on a non-directory
+      dout(7) << "handle_client_rmdir on non-dir " << *in << ", returning error" << dendl;
+      dn->state_clear(CDentry::STATE_UNLINKING);
+      respond_to_request(mdr, -CEPHFS_ENOTDIR);
+      return;
+    }
+  }
+
+  CInode *diri = dn->get_dir()->get_inode();
+  if ((!mdr->has_more() || mdr->more()->witnessed.empty())) {  // access check only while no witness has acked yet
+    if (!check_access(mdr, diri, MAY_WRITE)) {
+      dn->state_clear(CDentry::STATE_UNLINKING);
+      return;
+    }
+  }
+
+  // -- create stray dentry? -- (only when removing the primary link)
+  CDentry *straydn = NULL;
+  if (dnl->is_primary()) {
+    straydn = prepare_stray_dentry(mdr, dnl->get_inode());
+    if (!straydn)
+      return;
+    dout(10) << " straydn is " << *straydn << dendl;
+  } else if (mdr->straydn) {  // not primary: drop any stray dentry still attached to the request
+    mdr->unpin(mdr->straydn);
+    mdr->straydn = NULL;
+  }
+
+  // lock (skipped once ALL_LOCKED has been recorded on the request)
+  if (!(mdr->locking_state & MutationImpl::ALL_LOCKED)) {
+    MutationImpl::LockOpVec lov;
+
+    lov.add_xlock(&in->linklock);
+    lov.add_xlock(&in->snaplock);
+    if (in->is_dir())
+      lov.add_rdlock(&in->filelock);   // to verify it's empty
+
+    if (straydn) {
+      lov.add_wrlock(&straydn->get_dir()->inode->filelock);
+      lov.add_wrlock(&straydn->get_dir()->inode->nestlock);
+      lov.add_xlock(&straydn->lock);
+    }
+
+    if (!mds->locker->acquire_locks(mdr, lov))
+      return;
+
+    mdr->locking_state |= MutationImpl::ALL_LOCKED;
+  }
+
+  if (in->is_dir() &&
+      _dir_is_nonempty(mdr, in)) {  // second emptiness check, now with the filelock rdlocked
+    respond_to_request(mdr, -CEPHFS_ENOTEMPTY);
+    dn->state_clear(CDentry::STATE_UNLINKING);
+    return;
+  }
+
+  if (straydn)
+    straydn->first = mdcache->get_global_snaprealm()->get_newest_seq() + 1;
+
+  if (!mdr->more()->desti_srnode) {  // build the inode's new srnode once and keep it on the request
+    if (in->is_projected_snaprealm_global()) {
+      sr_t *new_srnode = in->prepare_new_srnode(0);
+      in->record_snaprealm_parent_dentry(new_srnode, nullptr, dn, dnl->is_primary());
+      // dropping the last linkage or dropping the last remote linkage,
+      // detach the inode from global snaprealm
+      auto nlink = in->get_projected_inode()->nlink;
+      if (nlink == 1 ||
+	  (nlink == 2 && !dnl->is_primary() &&
+	   !in->get_projected_parent_dir()->inode->is_stray()))
+	in->clear_snaprealm_global(new_srnode);
+      mdr->more()->desti_srnode = new_srnode;
+    } else if (dnl->is_primary()) {
+      // prepare snaprealm blob for peer request
+      SnapRealm *realm = in->find_snaprealm();
+      snapid_t follows = realm->get_newest_seq();
+      if (in->snaprealm || follows + 1 > in->get_oldest_snap()) {
+	sr_t *new_srnode = in->prepare_new_srnode(follows);
+	in->record_snaprealm_past_parent(new_srnode, straydn->get_dir()->inode->find_snaprealm());
+	mdr->more()->desti_srnode = new_srnode;
+      }
+    }
+  }
+
+  // yay!
+  if (in->is_dir() && in->has_subtree_root_dirfrag()) {
+    // subtree root auths need to be witnesses
+    set<mds_rank_t> witnesses;
+    in->list_replicas(witnesses);
+    dout(10) << " witnesses " << witnesses << ", have " << mdr->more()->witnessed << dendl;
+
+    for (set<mds_rank_t>::iterator p = witnesses.begin();
+	 p != witnesses.end();
+	 ++p) {
+      if (mdr->more()->witnessed.count(*p)) {
+	dout(10) << " already witnessed by mds." << *p << dendl;
+      } else if (mdr->more()->waiting_on_peer.count(*p)) {
+	dout(10) << " already waiting on witness mds." << *p << dendl;      
+      } else {
+	if (!_rmdir_prepare_witness(mdr, *p, mdr->dn[0], straydn))  // false: that rank is not usable right now
+	  return;
+      }
+    }
+    if (!mdr->more()->waiting_on_peer.empty())
+      return;  // we're waiting for a witness.
+  }
+
+  if (!rmdir && dnl->is_primary() && mdr->dn[0].size() == 1)  // plain primary unlink with a single-dentry trace
+    mds->locker->create_lock_cache(mdr, diri);
+
+  // ok! remote link to an inode we are not auth for goes through _link_remote(), everything else is local
+  if (dnl->is_remote() && !dnl->get_inode()->is_auth()) 
+    _link_remote(mdr, false, dn, dnl->get_inode());
+  else
+    _unlink_local(mdr, dn, straydn);
+}
+
+class C_MDS_unlink_local_finish : public ServerLogContext {  // completion context that forwards to Server::_unlink_local_finish()
+  CDentry *dn;       // the dentry being unlinked
+  CDentry *straydn;  // stray dentry receiving the inode; may be null
+  version_t dnpv;  // deleted dentry's projected version, captured at construction
+public:
+  C_MDS_unlink_local_finish(Server *s, MDRequestRef& r, CDentry *d, CDentry *sd) :
+    ServerLogContext(s, r), dn(d), straydn(sd),
+    dnpv(d->get_projected_version()) {}
+  void finish(int r) override {
+    ceph_assert(r == 0);  // a non-zero result is treated as fatal
+    server->_unlink_local_finish(mdr, dn, straydn, dnpv);
+  }
+};
+
+void Server::_unlink_local(MDRequestRef& mdr, CDentry *dn, CDentry *straydn)
+{  // Project and journal the removal of dn; a primary link's inode is moved under straydn.
+  dout(10) << "_unlink_local " << *dn << dendl;
+
+  CDentry::linkage_t *dnl = dn->get_projected_linkage();
+  CInode *in = dnl->get_inode();
+
+
+  // ok, let's do it.
+  mdr->ls = mdlog->get_current_segment();
+
+  // prepare log entry
+  EUpdate *le = new EUpdate(mdlog, "unlink_local");
+  mdlog->start_entry(le);
+  le->metablob.add_client_req(mdr->reqid, mdr->client_request->get_oldest_client_tid());
+  if (!mdr->more()->witnessed.empty()) {  // peers witnessed this op: record it as an uncommitted leader update
+    dout(20) << " noting uncommitted_peers " << mdr->more()->witnessed << dendl;
+    le->reqid = mdr->reqid;
+    le->had_peers = true;
+    mdcache->add_uncommitted_leader(mdr->reqid, mdr->ls, mdr->more()->witnessed);
+  }
+
+  if (straydn) {  // a stray dentry is only passed for a primary link
+    ceph_assert(dnl->is_primary());
+    straydn->push_projected_linkage(in);
+  }
+
+  // the unlinked dentry
+  dn->pre_dirty();
+
+  auto pi = in->project_inode(mdr);
+  {
+    std::string t;
+    dn->make_path_string(t, true);
+    pi.inode->stray_prior_path = std::move(t);  // remember the path the dentry had before the unlink
+  }
+  pi.inode->version = in->pre_dirty();
+  pi.inode->ctime = mdr->get_op_stamp();
+  if (mdr->get_op_stamp() > pi.inode->rstat.rctime)  // rctime only ever moves forward
+    pi.inode->rstat.rctime = mdr->get_op_stamp();
+  pi.inode->change_attr++;
+  pi.inode->nlink--;
+  if (pi.inode->nlink == 0)  // last link gone
+    in->state_set(CInode::STATE_ORPHAN);
+
+  if (mdr->more()->desti_srnode) {  // srnode prepared earlier on the request: project it and clear the slot
+    auto& desti_srnode = mdr->more()->desti_srnode;
+    in->project_snaprealm(desti_srnode);
+    desti_srnode = NULL;
+  }
+
+  if (straydn) {
+    // will manually pop projected inode
+
+    // primary link.  add stray dentry.
+    mdcache->predirty_journal_parents(mdr, &le->metablob, in, dn->get_dir(), PREDIRTY_PRIMARY|PREDIRTY_DIR, -1);
+    mdcache->predirty_journal_parents(mdr, &le->metablob, in, straydn->get_dir(), PREDIRTY_PRIMARY|PREDIRTY_DIR, 1);
+
+    pi.inode->update_backtrace();
+    le->metablob.add_primary_dentry(straydn, in, true, true);
+  } else {
+    // remote link.  update remote inode.
+    mdcache->predirty_journal_parents(mdr, &le->metablob, in, dn->get_dir(), PREDIRTY_DIR, -1);
+    mdcache->predirty_journal_parents(mdr, &le->metablob, in, 0, PREDIRTY_PRIMARY);
+    mdcache->journal_dirty_inode(mdr.get(), &le->metablob, in);
+  }
+
+  mdcache->journal_cow_dentry(mdr.get(), &le->metablob, dn);
+  le->metablob.add_null_dentry(dn, true);  // dn is journaled as a null dentry
+
+  if (in->is_dir()) {
+    dout(10) << " noting renamed (unlinked) dir ino " << in->ino() << " in metablob" << dendl;
+    le->metablob.renamed_dirino = in->ino();
+  }
+
+  dn->push_projected_linkage();  // no-argument form; dn is journaled as null above
+
+  if (straydn) {
+    ceph_assert(in->first <= straydn->first);
+    in->first = straydn->first;
+  }
+
+  if (in->is_dir()) {  // a directory is always a primary link, so straydn must exist
+    ceph_assert(straydn);
+    mdcache->project_subtree_rename(in, dn->get_dir(), straydn->get_dir());
+  }
+
+  journal_and_reply(mdr, 0, dn, le, new C_MDS_unlink_local_finish(this, mdr, dn, straydn));
+}
+
+void Server::_unlink_local_finish(MDRequestRef& mdr,
+				  CDentry *dn, CDentry *straydn,
+				  version_t dnpv) 
+{  // Completion of _unlink_local(): apply the unlink to the cache, notify replicas and waiters, reply to the client.
+  dout(10) << "_unlink_local_finish " << *dn << dendl;
+
+  if (!mdr->more()->witnessed.empty())  // peers took part: report the leader update as logged
+    mdcache->logged_leader_update(mdr->reqid);
+
+  CInode *strayin = NULL;
+  bool hadrealm = false;
+  if (straydn) {
+    // if there is newly created snaprealm, need to split old snaprealm's
+    // inodes_with_caps. So pop snaprealm before linkage changes.
+    strayin = dn->get_linkage()->get_inode();
+    hadrealm = strayin->snaprealm ? true : false;
+    strayin->early_pop_projected_snaprealm();
+  }
+
+  // unlink main dentry
+  dn->get_dir()->unlink_inode(dn);
+  dn->pop_projected_linkage();
+  dn->mark_dirty(dnpv, mdr->ls);
+
+  // relink as stray?  (i.e. was primary link?)
+  if (straydn) {
+    dout(20) << " straydn is " << *straydn << dendl;
+    straydn->pop_projected_linkage();
+    mdcache->touch_dentry_bottom(straydn);
+  }
+
+  mdr->apply();
+
+  dn->state_clear(CDentry::STATE_UNLINKING);  // the unlink is no longer in progress
+  mdcache->send_dentry_unlink(dn, straydn, mdr);
+
+  MDSContext::vec finished;
+  dn->take_waiting(CDentry::WAIT_UNLINK_FINISH, finished);  // collect contexts waiting on WAIT_UNLINK_FINISH
+  mdcache->mds->queue_waiters(finished);
+
+  if (straydn) {
+    // update subtree map?
+    if (strayin->is_dir())
+      mdcache->adjust_subtree_after_rename(strayin, dn->get_dir(), true);
+
+    if (strayin->snaprealm && !hadrealm)  // the inode gained a snaprealm through this unlink
+      mdcache->do_realm_invalidate_and_update_notify(strayin, CEPH_SNAP_OP_SPLIT, false);
+  }
+
+  // bump pop
+  mds->balancer->hit_dir(dn->get_dir(), META_POP_IWR);
+
+  // reply
+  respond_to_request(mdr, 0);
+
+  // removing a new dn?
+  dn->get_dir()->try_remove_unlinked_dn(dn);
+
+  // clean up ?
+  // respond_to_request() drops locks. So stray reintegration can race with us.
+  if (straydn && !straydn->get_projected_linkage()->is_null()) {
+    // Tip off the MDCache that this dentry is a stray that
+    // might be eligible for purge.
+    mdcache->notify_stray(straydn);
+  }
+}
+
+bool Server::_rmdir_prepare_witness(MDRequestRef& mdr, mds_rank_t who, vector<CDentry*>& trace, CDentry *straydn)
+{  // Send OP_RMDIRPREP to rank `who` and mark it as awaited; returns false (nothing sent) if `who` is not usable yet.
+  if (mds->is_cluster_degraded() &&
+      !mds->mdsmap->is_clientreplay_or_active_or_stopping(who)) {
+    dout(10) << "_rmdir_prepare_witness mds." << who << " is not active" << dendl;
+    if (mdr->more()->waiting_on_peer.empty())  // register a retry only if no other peer is already pending
+      mds->wait_for_active_peer(who, new C_MDS_RetryRequest(mdcache, mdr));
+    return false;
+  }
+  
+  dout(10) << "_rmdir_prepare_witness mds." << who << dendl;
+  auto req = make_message<MMDSPeerRequest>(mdr->reqid, mdr->attempt, MMDSPeerRequest::OP_RMDIRPREP);
+  req->srcdnpath = filepath(trace.front()->get_dir()->ino());  // path starts at the ino of the first trace dentry's dir
+  for (auto dn : trace)
+    req->srcdnpath.push_dentry(dn->get_name());
+  mdcache->encode_replica_stray(straydn, who, req->straybl);
+  if (mdr->more()->desti_srnode)  // pass along the prepared srnode, if there is one
+    encode(*mdr->more()->desti_srnode, req->desti_snapbl);
+
+  req->op_stamp = mdr->get_op_stamp();
+  mds->send_message_mds(req, who);
+  
+  ceph_assert(mdr->more()->waiting_on_peer.count(who) == 0);
+  mdr->more()->waiting_on_peer.insert(who);
+  return true;
+}
+
+struct C_MDS_PeerRmdirPrep : public ServerLogContext {  // completion context that forwards to Server::_logged_peer_rmdir()
+  CDentry *dn, *straydn;  // the directory's dentry and the stray dentry it is moved to
+  C_MDS_PeerRmdirPrep(Server *s, MDRequestRef& r, CDentry *d, CDentry *st)
+    : ServerLogContext(s, r), dn(d), straydn(st) {}
+  void finish(int r) override {  // r is not inspected
+    server->_logged_peer_rmdir(mdr, dn, straydn);
+  }
+};
+
+struct C_MDS_PeerRmdirCommit : public ServerContext {  // commit waiter that forwards its result to Server::_commit_peer_rmdir()
+  MDRequestRef mdr;  // keeps a reference to the request
+  CDentry *straydn;
+  C_MDS_PeerRmdirCommit(Server *s, MDRequestRef& r, CDentry *sd)
+    : ServerContext(s), mdr(r), straydn(sd) { }
+  void finish(int r) override {  // r == 0 commits, anything else rolls back (see _commit_peer_rmdir)
+    server->_commit_peer_rmdir(mdr, r, straydn);
+  }
+};
+
+void Server::handle_peer_rmdir_prep(MDRequestRef& mdr)
+{  // Peer side of rmdir: build rollback data, project the move into the stray dir, journal it if we hold an auth subtree.
+  dout(10) << "handle_peer_rmdir_prep " << *mdr
+	   << " " << mdr->peer_request->srcdnpath
+	   << " to " << mdr->peer_request->destdnpath
+	   << dendl;
+
+  vector<CDentry*> trace;
+  filepath srcpath(mdr->peer_request->srcdnpath);
+  dout(10) << " src " << srcpath << dendl;
+  CInode *in;
+  CF_MDS_RetryRequestFactory cf(mdcache, mdr, false);
+  int r = mdcache->path_traverse(mdr, cf, srcpath,
+				 MDS_TRAVERSE_DISCOVER | MDS_TRAVERSE_PATH_LOCKED,
+				 &trace, &in);
+  if (r > 0) return;  // NOTE(review): positive presumably means the traverse is pending and will retry - confirm
+  if (r == -CEPHFS_ESTALE) {  // stale path: look the inode up via peers, then retry the request
+    mdcache->find_ino_peers(srcpath.get_ino(), new C_MDS_RetryRequest(mdcache, mdr),
+			    mdr->peer_to_mds, true);
+    return;
+  }
+  ceph_assert(r == 0);
+  CDentry *dn = trace.back();
+  dout(10) << " dn " << *dn << dendl;
+  mdr->pin(dn);
+
+  ceph_assert(mdr->straydn);  // the stray dentry must already be attached to the request
+  CDentry *straydn = mdr->straydn;
+  dout(10) << " straydn " << *straydn << dendl;
+  
+  mdr->set_op_stamp(mdr->peer_request->op_stamp);  // use the op stamp carried by the peer request
+
+  rmdir_rollback rollback;  // filled in to allow undoing this prepare
+  rollback.reqid = mdr->reqid;
+  rollback.src_dir = dn->get_dir()->dirfrag();
+  rollback.src_dname = dn->get_name();
+  rollback.dest_dir = straydn->get_dir()->dirfrag();
+  rollback.dest_dname = straydn->get_name();
+  if (mdr->peer_request->desti_snapbl.length()) {  // request carries a snap blob: save the current realm state for rollback
+    if (in->snaprealm) {
+      encode(true, rollback.snapbl);
+      in->encode_snap_blob(rollback.snapbl);
+    } else {
+      encode(false, rollback.snapbl);
+    }
+  }
+  encode(rollback, mdr->more()->rollback_bl);
+  // FIXME: rollback snaprealm
+  dout(20) << " rollback is " << mdr->more()->rollback_bl.length() << " bytes" << dendl;
+
+  // set up commit waiter
+  mdr->more()->peer_commit = new C_MDS_PeerRmdirCommit(this, mdr, straydn);
+
+  straydn->push_projected_linkage(in);
+  dn->push_projected_linkage();
+
+  ceph_assert(straydn->first >= in->first);
+  in->first = straydn->first;
+
+  if (!in->has_subtree_root_dirfrag(mds->get_nodeid())) {  // no subtree root dirfrag of ours: skip the journal, finish right away
+    dout(10) << " no auth subtree in " << *in << ", skipping journal" << dendl;
+    _logged_peer_rmdir(mdr, dn, straydn);
+    return;
+  }
+
+  mdr->ls = mdlog->get_current_segment();
+  EPeerUpdate *le =  new EPeerUpdate(mdlog, "peer_rmdir", mdr->reqid, mdr->peer_to_mds,
+				       EPeerUpdate::OP_PREPARE, EPeerUpdate::RMDIR);
+  mdlog->start_entry(le);
+  le->rollback = mdr->more()->rollback_bl;
+
+  le->commit.add_dir_context(straydn->get_dir());
+  le->commit.add_primary_dentry(straydn, in, true);
+  // peer: no need to journal original dentry
+
+  dout(10) << " noting renamed (unlinked) dir ino " << in->ino() << " in metablob" << dendl;
+  le->commit.renamed_dirino = in->ino();
+
+  mdcache->project_subtree_rename(in, dn->get_dir(), straydn->get_dir());
+  mdcache->add_uncommitted_peer(mdr->reqid, mdr->ls, mdr->peer_to_mds);
+
+  mdr->more()->peer_update_journaled = true;  // later steps branch on this flag
+  submit_mdlog_entry(le, new C_MDS_PeerRmdirPrep(this, mdr, dn, straydn),
+                     mdr, __func__);
+  mdlog->flush();
+}
+
+void Server::_logged_peer_rmdir(MDRequestRef& mdr, CDentry *dn, CDentry *straydn)
+{  // Apply the prepared rmdir to the local cache and ack with OP_RMDIRPREPACK (or finish the request if aborted).
+  dout(10) << "_logged_peer_rmdir " << *mdr << " on " << *dn << dendl;
+  CInode *in = dn->get_linkage()->get_inode();
+
+  bool new_realm;  // true when decoding the request's snap blob gives the inode its first snaprealm
+  if (mdr->peer_request->desti_snapbl.length()) {
+    new_realm = !in->snaprealm;
+    in->decode_snap_blob(mdr->peer_request->desti_snapbl);
+    ceph_assert(in->snaprealm);
+  } else {
+    new_realm = false;
+  }
+
+  // update our cache now, so we are consistent with what is in the journal
+  // when we journal a subtree map
+  dn->get_dir()->unlink_inode(dn);
+  straydn->pop_projected_linkage();
+  dn->pop_projected_linkage();
+
+  mdcache->adjust_subtree_after_rename(in, dn->get_dir(), mdr->more()->peer_update_journaled);
+
+  if (new_realm)
+      mdcache->do_realm_invalidate_and_update_notify(in, CEPH_SNAP_OP_SPLIT, false);
+
+  // done with the peer request message and the stray dentry reference
+  mdr->reset_peer_request();
+  mdr->straydn = 0;
+
+  if (!mdr->aborted) {
+    auto reply = make_message<MMDSPeerRequest>(mdr->reqid, mdr->attempt, MMDSPeerRequest::OP_RMDIRPREPACK);
+    if (!mdr->more()->peer_update_journaled)  // tell the requester that nothing was journaled here
+      reply->mark_not_journaled();
+    mds->send_message_mds(reply, mdr->peer_to_mds);
+  } else {
+    dout(10) << " abort flag set, finishing" << dendl;
+    mdcache->request_finish(mdr);
+  }
+}
+
+void Server::handle_peer_rmdir_prep_ack(MDRequestRef& mdr, const cref_t<MMDSPeerRequest> &ack)
+{  // Record an rmdir-prepare ack from a witness; re-dispatch the client request once no witness is outstanding.
+  dout(10) << "handle_peer_rmdir_prep_ack " << *mdr
+	   << " " << *ack << dendl;
+
+  mds_rank_t from = mds_rank_t(ack->get_source().num());  // rank of the acking MDS
+
+  mdr->more()->peers.insert(from);
+  mdr->more()->witnessed.insert(from);
+  if (!ack->is_not_journaled())  // the ack may say the peer journaled nothing
+    mdr->more()->has_journaled_peers = true;
+
+  // remove from waiting list
+  ceph_assert(mdr->more()->waiting_on_peer.count(from));
+  mdr->more()->waiting_on_peer.erase(from);
+
+  if (mdr->more()->waiting_on_peer.empty())
+    dispatch_client_request(mdr);  // go again!
+  else 
+    dout(10) << "still waiting on peers " << mdr->more()->waiting_on_peer << dendl;
+}
+
+void Server::_commit_peer_rmdir(MDRequestRef& mdr, int r, CDentry *straydn)
+{  // Finish a prepared peer rmdir: r == 0 commits (journaled only if the prepare was), any other r rolls back.
+  dout(10) << "_commit_peer_rmdir " << *mdr << " r=" << r << dendl;
+
+  if (r == 0) {
+    if (mdr->more()->peer_update_journaled) {
+      CInode *strayin = straydn->get_projected_linkage()->get_inode();
+      if (strayin && !strayin->snaprealm)  // only for a stray inode without a snaprealm
+	mdcache->clear_dirty_bits_for_stray(strayin);
+    }
+
+    mdr->cleanup();
+
+    if (mdr->more()->peer_update_journaled) {
+      // write a commit to the journal
+      EPeerUpdate *le = new EPeerUpdate(mdlog, "peer_rmdir_commit", mdr->reqid,
+					  mdr->peer_to_mds, EPeerUpdate::OP_COMMIT,
+					  EPeerUpdate::RMDIR);
+      mdlog->start_entry(le);
+      submit_mdlog_entry(le, new C_MDS_CommittedPeer(this, mdr), mdr, __func__);
+      mdlog->flush();
+    } else {
+      _committed_peer(mdr);  // nothing was journaled at prepare time, so finish directly
+    }
+  } else {
+    // abort
+    do_rmdir_rollback(mdr->more()->rollback_bl, mdr->peer_to_mds, mdr);
+  }
+}
+
+struct C_MDS_LoggedRmdirRollback : public ServerLogContext {  // completion context that forwards to Server::_rmdir_rollback_finish()
+  metareqid_t reqid;  // id of the request being rolled back
+  CDentry *dn;        // original dentry the inode returns to
+  CDentry *straydn;   // stray dentry the inode is removed from
+  C_MDS_LoggedRmdirRollback(Server *s, MDRequestRef& m, metareqid_t mr, CDentry *d, CDentry *st)
+    : ServerLogContext(s, m), reqid(mr), dn(d), straydn(st) {}
+  void finish(int r) override {  // r is not inspected
+    server->_rmdir_rollback_finish(mdr, reqid, dn, straydn);
+  }
+};
+
+void Server::do_rmdir_rollback(bufferlist &rbl, mds_rank_t leader, MDRequestRef& mdr)
+{
+  // unlike the other rollback methods, the rmdir rollback is only
+  // needed to record the subtree changes in the journal for inode
+  // replicas who are auth for empty dirfrags.  no actual changes to
+  // the file system are taking place here, so there is no Mutation.
+
+  rmdir_rollback rollback;
+  auto p = rbl.cbegin();
+  decode(rollback, p);
+  
+  dout(10) << "do_rmdir_rollback on " << rollback.reqid << dendl;
+  mdcache->add_rollback(rollback.reqid, leader); // need to finish this update before resolve finishes
+  ceph_assert(mdr || mds->is_resolve());  // a null mdr is only acceptable while this MDS is in resolve
+
+  CDir *dir = mdcache->get_dirfrag(rollback.src_dir);
+  if (!dir)  // exact dirfrag not in cache: look it up by inode number and dentry name instead
+    dir = mdcache->get_dirfrag(rollback.src_dir.ino, rollback.src_dname);
+  ceph_assert(dir);
+  CDentry *dn = dir->lookup(rollback.src_dname);
+  ceph_assert(dn);
+  dout(10) << " dn " << *dn << dendl;
+  CDir *straydir = mdcache->get_dirfrag(rollback.dest_dir);
+  ceph_assert(straydir);
+  CDentry *straydn = straydir->lookup(rollback.dest_dname);
+  ceph_assert(straydn);
+  dout(10) << " straydn " << *straydn << dendl;
+  CInode *in = straydn->get_linkage()->get_inode();
+
+  dn->push_projected_linkage(in);  // project the inode back under its original dentry
+  straydn->push_projected_linkage();
+
+  if (rollback.snapbl.length() && in->snaprealm) {  // a snap blob was saved and the inode currently has a snaprealm
+    bool hadrealm;
+    auto p = rollback.snapbl.cbegin();
+    decode(hadrealm, p);  // the blob starts with a flag: did the inode have a realm before the prepare?
+    if (hadrealm) {
+      decode(in->snaprealm->srnode, p);  // restore the saved srnode in place
+    } else {
+      in->snaprealm->merge_to(dir->get_inode()->find_snaprealm());  // no realm before: merge it into the source dir's realm
+    }
+  }
+
+  if (mdr && !mdr->more()->peer_update_journaled) {  // the prepare was not journaled, so the rollback is not either
+    ceph_assert(!in->has_subtree_root_dirfrag(mds->get_nodeid()));
+
+    _rmdir_rollback_finish(mdr, rollback.reqid, dn, straydn);
+    return;
+  }
+
+
+  EPeerUpdate *le = new EPeerUpdate(mdlog, "peer_rmdir_rollback", rollback.reqid, leader,
+				      EPeerUpdate::OP_ROLLBACK, EPeerUpdate::RMDIR);
+  mdlog->start_entry(le);
+  
+  le->commit.add_dir_context(dn->get_dir());
+  le->commit.add_primary_dentry(dn, in, true);
+  // peer: no need to journal straydn
+  
+  dout(10) << " noting renamed (unlinked) dir ino " << in->ino() << " in metablob" << dendl;
+  le->commit.renamed_dirino = in->ino();
+
+  mdcache->project_subtree_rename(in, straydn->get_dir(), dn->get_dir());  // reverse direction: stray dir back to source dir
+
+  submit_mdlog_entry(le,
+                     new C_MDS_LoggedRmdirRollback(this, mdr,rollback.reqid,
+                                                   dn, straydn),
+                     mdr, __func__);
+  mdlog->flush();
+}
+
+void Server::_rmdir_rollback_finish(MDRequestRef& mdr, metareqid_t reqid, CDentry *dn, CDentry *straydn)
+{  // Completion of do_rmdir_rollback(): move the inode back under dn in the cache and finish the rollback.
+  dout(10) << "_rmdir_rollback_finish " << reqid << dendl;
+
+  straydn->get_dir()->unlink_inode(straydn);
+  dn->pop_projected_linkage();
+  straydn->pop_projected_linkage();
+
+  CInode *in = dn->get_linkage()->get_inode();
+  mdcache->adjust_subtree_after_rename(in, straydn->get_dir(),
+				       !mdr || mdr->more()->peer_update_journaled);  // true when there is no mdr or the prepare was journaled
+
+  if (mds->is_resolve()) {  // during resolve, try trimming the stray dir's subtree
+    CDir *root = mdcache->get_subtree_root(straydn->get_dir());
+    mdcache->try_trim_non_auth_subtree(root);
+  }
+
+  if (mdr)  // mdr may be null (do_rmdir_rollback allows that during resolve)
+    mdcache->request_finish(mdr);
+
+  mdcache->finish_rollback(reqid, mdr);
+}
+
+
+/** _dir_is_nonempty[_unlocked]
+ *
+ * check if a directory is non-empty (i.e. we cannot rmdir it).
+ *
+ * the unlocked variant is a fastpath check.  we can't really be
+ * sure until we rdlock the filelock.
+ */
+bool Server::_dir_is_nonempty_unlocked(MDRequestRef& mdr, CInode *in)
+{
+  dout(10) << "dir_is_nonempty_unlocked " << *in << dendl;
+  ceph_assert(in->is_auth());
+
+  if (in->filelock.is_cached())
+    return false; // there can be pending async create/unlink. don't know.
+  if (in->snaprealm && in->snaprealm->srnode.snaps.size())
+    return true; // in a snapshot!
+
+  auto&& ls = in->get_dirfrags();
+  for (const auto& dir : ls) {
+    // is the frag obviously non-empty? (only dirfrags we are auth for are examined)
+    if (dir->is_auth()) {
+      if (dir->get_projected_fnode()->fragstat.size()) {
+	dout(10) << "dir_is_nonempty_unlocked dirstat has " 
+		 << dir->get_projected_fnode()->fragstat.size() << " items " << *dir << dendl;
+	return true;
+      }
+    }
+  }
+
+  return false;  // nothing obviously non-empty; callers re-check with _dir_is_nonempty() under locks
+}
+
+bool Server::_dir_is_nonempty(MDRequestRef& mdr, CInode *in)
+{  // Locked emptiness check: requires that the request's client can read in->filelock; true means non-empty.
+  dout(10) << "dir_is_nonempty " << *in << dendl;
+  ceph_assert(in->is_auth());
+  ceph_assert(in->filelock.can_read(mdr->get_client()));
+
+  frag_info_t dirstat;  // running sum over the dirfrags visited below
+  version_t dirstat_version = in->get_projected_inode()->dirstat.version;
+
+  auto&& ls = in->get_dirfrags();
+  for (const auto& dir : ls) {
+    const auto& pf = dir->get_projected_fnode();
+    if (pf->fragstat.size()) {  // this frag itself reports entries
+      dout(10) << "dir_is_nonempty dirstat has "
+	       << pf->fragstat.size() << " items " << *dir << dendl;
+      return true;
+    }
+
+    if (pf->accounted_fragstat.version == dirstat_version)  // accounted stats match the inode's dirstat version
+      dirstat.add(pf->accounted_fragstat);
+    else
+      dirstat.add(pf->fragstat);
+  }
+
+  return dirstat.size() != in->get_projected_inode()->dirstat.size();  // sum differs from the inode's dirstat: treat as non-empty
+}
+
+
+// ======================================================
+
+
+class C_MDS_rename_finish : public ServerLogContext {  // completion context that forwards to Server::_rename_finish()
+  CDentry *srcdn;    // dentry being renamed from
+  CDentry *destdn;   // dentry being renamed to
+  CDentry *straydn;  // stray dentry passed by the caller
+public:
+  C_MDS_rename_finish(Server *s, MDRequestRef& r,
+		      CDentry *sdn, CDentry *ddn, CDentry *stdn) :
+    ServerLogContext(s, r),
+    srcdn(sdn), destdn(ddn), straydn(stdn) { }
+  void finish(int r) override {
+    ceph_assert(r == 0);  // a non-zero result is treated as fatal
+    server->_rename_finish(mdr, srcdn, destdn, straydn);
+  }
+};
+
+
+/** handle_client_rename
+ *
+ * rename leader is the destdn auth.  this is because cached inodes
+ * must remain connected.  thus, any replica of srci, must also
+ * replicate destdn, and possibly straydn, so that srci (and
+ * destdn->inode) remain connected during the rename.
+ *
+ * to do this, we freeze srci, then leader (destdn auth) verifies that
+ * all other nodes have also replicated destdn and straydn.  note that
+ * destdn replicas need not also replicate srci.  this only works when 
+ * destdn is leader.
+ *
+ * This function takes responsibility for the passed mdr.
+ */
+// Handle a rename request from a client (or an MDS-internal stray rename):
+// validate source and destination, take the locks, prepare witness MDSs if
+// needed, then journal the update.  Returns early after a reply or a queued retry.
+void Server::handle_client_rename(MDRequestRef& mdr)
+{
+  const auto& req = mdr->client_request;
+  dout(7) << "handle_client_rename " << *req << dendl;
+
+  filepath destpath = req->get_filepath();
+  filepath srcpath = req->get_filepath2();
+  if (srcpath.is_last_dot_or_dotdot() || destpath.is_last_dot_or_dotdot()) {
+    respond_to_request(mdr, -CEPHFS_EBUSY);
+    return;
+  }
+
+  if (req->get_alternate_name().size() > alternate_name_max) {
+    dout(10) << " alternate_name longer than " << alternate_name_max << dendl;
+    respond_to_request(mdr, -CEPHFS_ENAMETOOLONG);
+    return;
+  }
+
+  auto [destdn, srcdn] = rdlock_two_paths_xlock_destdn(mdr, true);
+  if (!destdn)
+    return;
+
+  if (is_unlink_pending(destdn)) {
+    wait_for_pending_unlink(destdn, mdr);
+    return;
+  }
+
+  if (is_unlink_pending(srcdn)) {
+    wait_for_pending_unlink(srcdn, mdr);
+    return;
+  }
+
+  dout(10) << " destdn " << *destdn << dendl;
+  CDir *destdir = destdn->get_dir();
+  ceph_assert(destdir->is_auth());
+  CDentry::linkage_t *destdnl = destdn->get_projected_linkage();
+
+  dout(10) << " srcdn " << *srcdn << dendl;
+  CDir *srcdir = srcdn->get_dir();
+  CDentry::linkage_t *srcdnl = srcdn->get_projected_linkage();
+  CInode *srci = srcdnl->get_inode();
+  dout(10) << " srci " << *srci << dendl;
+
+  // -- some sanity checks --
+  if (destdn == srcdn) {
+    dout(7) << "rename src=dest, noop" << dendl;
+    respond_to_request(mdr, 0);
+    return;
+  }
+
+  // dest a child of src?
+  // e.g. mv /usr /usr/foo
+  if (srci->is_dir() && srci->is_projected_ancestor_of(destdir->get_inode())) {
+    dout(7) << "cannot rename item to be a child of itself" << dendl;
+    respond_to_request(mdr, -CEPHFS_EINVAL);
+    return;
+  }
+
+  // is this a stray migration, reintegration or merge? (sanity checks!)
+  if (mdr->reqid.name.is_mds() &&
+      !(MDS_INO_IS_STRAY(srcpath.get_ino()) &&
+	MDS_INO_IS_STRAY(destpath.get_ino())) &&
+      !(destdnl->is_remote() &&
+	destdnl->get_remote_ino() == srci->ino())) {
+    respond_to_request(mdr, -CEPHFS_EINVAL);  // actually, this won't reply, but whatev.
+    return;
+  }
+
+  // oldin is the inode currently linked at destdn (stays null if destdn is null)
+  CInode *oldin = nullptr;
+  if (!destdnl->is_null()) {
+    //dout(10) << "dest dn exists " << *destdn << dendl;
+    oldin = mdcache->get_dentry_inode(destdn, mdr, true);
+    if (!oldin) return;
+    dout(10) << " oldin " << *oldin << dendl;
+
+    // non-empty dir? do trivial fast unlocked check, do another check later with read locks
+    if (oldin->is_dir() && _dir_is_nonempty_unlocked(mdr, oldin)) {
+      respond_to_request(mdr, -CEPHFS_ENOTEMPTY);
+      return;
+    }
+
+    // mv /some/thing /to/some/existing_other_thing
+    if (oldin->is_dir() && !srci->is_dir()) {
+      respond_to_request(mdr, -CEPHFS_EISDIR);
+      return;
+    }
+    if (!oldin->is_dir() && srci->is_dir()) {
+      respond_to_request(mdr, -CEPHFS_ENOTDIR);
+      return;
+    }
+    if (srci == oldin && !srcdir->inode->is_stray()) {
+      respond_to_request(mdr, 0);  // no-op.  POSIX makes no sense.
+      return;
+    }
+    if (destdn->get_alternate_name() != req->get_alternate_name()) {
+      /* the dentry exists but the alternate_names do not match, fail... */
+      respond_to_request(mdr, -CEPHFS_EINVAL);
+      return;
+    }
+  }
+
+  vector<CDentry*>& srctrace = mdr->dn[1];
+  vector<CDentry*>& desttrace = mdr->dn[0];
+
+  // src+dest traces _must_ share a common ancestor for locking to prevent orphans
+  if (destpath.get_ino() != srcpath.get_ino() &&
+      !(req->get_source().is_mds() &&
+	MDS_INO_IS_STRAY(srcpath.get_ino()))) {  // <-- mds 'rename' out of stray dir is ok!
+    CInode *srcbase = srctrace[0]->get_dir()->get_inode();
+    CInode *destbase = desttrace[0]->get_dir()->get_inode();
+    // ok, extend srctrace toward root until it is an ancestor of desttrace.
+    while (srcbase != destbase &&
+	   !srcbase->is_projected_ancestor_of(destbase)) {
+      CDentry *pdn = srcbase->get_projected_parent_dn();
+      srctrace.insert(srctrace.begin(), pdn);
+      dout(10) << "rename prepending srctrace with " << *pdn << dendl;
+      srcbase = pdn->get_dir()->get_inode();
+    }
+
+    // then, extend destpath until it shares the same parent inode as srcpath.
+    while (destbase != srcbase) {
+      CDentry *pdn = destbase->get_projected_parent_dn();
+      desttrace.insert(desttrace.begin(), pdn);
+      dout(10) << "rename prepending desttrace with " << *pdn << dendl;
+      destbase = pdn->get_dir()->get_inode();
+    }
+    dout(10) << "rename src and dest traces now share common ancestor " << *destbase << dendl;
+  }
+
+  bool linkmerge = srcdnl->get_inode() == destdnl->get_inode();
+  if (linkmerge)
+    dout(10) << " this is a link merge" << dendl;
+
+  // -- create stray dentry? --
+  CDentry *straydn = nullptr;
+  if (destdnl->is_primary() && !linkmerge) {
+    straydn = prepare_stray_dentry(mdr, destdnl->get_inode());
+    if (!straydn)
+      return;
+    dout(10) << " straydn is " << *straydn << dendl;
+  } else if (mdr->straydn) {
+    mdr->unpin(mdr->straydn);
+    mdr->straydn = nullptr;
+  }
+
+
+  // -- locks --
+  if (!(mdr->locking_state & MutationImpl::ALL_LOCKED)) {
+    MutationImpl::LockOpVec lov;
+
+    // we need to update srci's ctime.  xlock its least contended lock to do that...
+    lov.add_xlock(&srci->linklock);
+    lov.add_xlock(&srci->snaplock);
+
+    if (oldin) {
+      // xlock oldin (for nlink--)
+      lov.add_xlock(&oldin->linklock);
+      lov.add_xlock(&oldin->snaplock);
+      if (oldin->is_dir()) {
+	ceph_assert(srci->is_dir());
+	lov.add_rdlock(&oldin->filelock);   // to verify it's empty
+
+	// adjust locking order?
+	int cmp = mdr->compare_paths();
+	if (cmp < 0 || (cmp == 0 && oldin->ino() < srci->ino()))
+	  std::reverse(lov.begin(), lov.end());
+      } else {
+	ceph_assert(!srci->is_dir());
+	// adjust locking order;
+	if (srci->ino() > oldin->ino())
+	  std::reverse(lov.begin(), lov.end());
+      }
+    }
+
+    // straydn?
+    if (straydn) {
+      lov.add_wrlock(&straydn->get_dir()->inode->filelock);
+      lov.add_wrlock(&straydn->get_dir()->inode->nestlock);
+      lov.add_xlock(&straydn->lock);
+    }
+
+    CInode *auth_pin_freeze = !srcdn->is_auth() && srcdnl->is_primary() ? srci : nullptr;
+    if (!mds->locker->acquire_locks(mdr, lov, auth_pin_freeze))
+      return;
+
+    mdr->locking_state |= MutationImpl::ALL_LOCKED;
+  }
+
+  if (linkmerge)
+    ceph_assert(srcdir->inode->is_stray() && srcdnl->is_primary() && destdnl->is_remote());
+
+  if ((!mdr->has_more() || mdr->more()->witnessed.empty())) {
+    if (!check_access(mdr, srcdir->get_inode(), MAY_WRITE))
+      return;
+
+    if (!check_access(mdr, destdn->get_dir()->get_inode(), MAY_WRITE))
+      return;
+
+    // directory size limits are not enforced for a link merge (stray reintegration)
+    if (!linkmerge && !check_fragment_space(mdr, destdn->get_dir()))
+      return;
+
+    if (!linkmerge && !check_dir_max_entries(mdr, destdn->get_dir()))
+      return;
+
+    if (!check_access(mdr, srci, MAY_WRITE))
+      return;
+  }
+
+  // with read lock, really verify oldin is empty
+  if (oldin &&
+      oldin->is_dir() &&
+      _dir_is_nonempty(mdr, oldin)) {
+    respond_to_request(mdr, -CEPHFS_ENOTEMPTY);
+    return;
+  }
+
+  /* project_snaprealm_past_parent() will do this job
+   *
+  // moving between snaprealms?
+  if (srcdnl->is_primary() && srci->is_multiversion() && !srci->snaprealm) {
+    SnapRealm *srcrealm = srci->find_snaprealm();
+    SnapRealm *destrealm = destdn->get_dir()->inode->find_snaprealm();
+    if (srcrealm != destrealm &&
+	(srcrealm->get_newest_seq() + 1 > srcdn->first ||
+	 destrealm->get_newest_seq() + 1 > srcdn->first)) {
+      dout(10) << " renaming between snaprealms, creating snaprealm for " << *srci << dendl;
+      mdcache->snaprealm_create(mdr, srci);
+      return;
+    }
+  }
+  */
+
+  // renames across different subvolumes are refused with EXDEV
+  SnapRealm *dest_realm = nullptr;
+  SnapRealm *src_realm = nullptr;
+  if (!linkmerge) {
+    dest_realm = destdir->inode->find_snaprealm();
+    if (srcdir->inode == destdir->inode)
+      src_realm = dest_realm;
+    else
+      src_realm = srcdir->inode->find_snaprealm();
+    if (src_realm != dest_realm &&
+	src_realm->get_subvolume_ino() != dest_realm->get_subvolume_ino()) {
+      respond_to_request(mdr, -CEPHFS_EXDEV);
+      return;
+    }
+  }
+
+  ceph_assert(g_conf()->mds_kill_rename_at != 1);
+
+  // -- open all srcdn inode frags, if any --
+  // we need these open so that auth can properly delegate from inode to dirfrags
+  // after the inode is _ours_.
+  if (srcdnl->is_primary() &&
+      !srcdn->is_auth() &&
+      srci->is_dir()) {
+    dout(10) << "srci is remote dir, setting stickydirs and opening all frags" << dendl;
+    mdr->set_stickydirs(srci);
+
+    frag_vec_t leaves;
+    srci->dirfragtree.get_leaves(leaves);
+    for (const auto& leaf : leaves) {
+      CDir *dir = srci->get_dirfrag(leaf);
+      if (!dir) {
+	dout(10) << " opening " << leaf << " under " << *srci << dendl;
+	mdcache->open_remote_dirfrag(srci, leaf, new C_MDS_RetryRequest(mdcache, mdr));
+	return;
+      }
+    }
+  }
+
+  // -- prepare snaprealm ---
+
+  if (linkmerge) {
+    if (!mdr->more()->srci_srnode &&
+	srci->get_projected_inode()->nlink == 1 &&
+	srci->is_projected_snaprealm_global()) {
+      sr_t *new_srnode = srci->prepare_new_srnode(0);
+      srci->record_snaprealm_parent_dentry(new_srnode, nullptr, destdn, false);
+
+      srci->clear_snaprealm_global(new_srnode);
+      mdr->more()->srci_srnode = new_srnode;
+    }
+  } else {
+    if (oldin && !mdr->more()->desti_srnode) {
+      if (oldin->is_projected_snaprealm_global()) {
+	sr_t *new_srnode = oldin->prepare_new_srnode(0);
+	oldin->record_snaprealm_parent_dentry(new_srnode, dest_realm, destdn, destdnl->is_primary());
+	// dropping the last linkage or dropping the last remote linkage,
+	// detch the inode from global snaprealm
+	auto nlink = oldin->get_projected_inode()->nlink;
+	if (nlink == 1 ||
+	    (nlink == 2 && !destdnl->is_primary() &&
+	     !oldin->get_projected_parent_dir()->inode->is_stray()))
+	  oldin->clear_snaprealm_global(new_srnode);
+	mdr->more()->desti_srnode = new_srnode;
+      } else if (destdnl->is_primary()) {
+	snapid_t follows = dest_realm->get_newest_seq();
+	if (oldin->snaprealm || follows + 1 > oldin->get_oldest_snap()) {
+	  sr_t *new_srnode = oldin->prepare_new_srnode(follows);
+	  oldin->record_snaprealm_past_parent(new_srnode, straydn->get_dir()->inode->find_snaprealm());
+	  mdr->more()->desti_srnode = new_srnode;
+	}
+      }
+    }
+    if (!mdr->more()->srci_srnode) {
+      if (srci->is_projected_snaprealm_global()) {
+	sr_t *new_srnode = srci->prepare_new_srnode(0);
+	srci->record_snaprealm_parent_dentry(new_srnode, src_realm, srcdn, srcdnl->is_primary());
+	mdr->more()->srci_srnode = new_srnode;
+      } else if (srcdnl->is_primary()) {
+	snapid_t follows = src_realm->get_newest_seq();
+	if (src_realm != dest_realm &&
+	    (srci->snaprealm || follows + 1 > srci->get_oldest_snap())) {
+	  sr_t *new_srnode = srci->prepare_new_srnode(follows);
+	  srci->record_snaprealm_past_parent(new_srnode, dest_realm);
+	  mdr->more()->srci_srnode = new_srnode;
+	}
+      }
+    }
+  }
+
+  // -- prepare witnesses --
+
+  /*
+   * NOTE: we use _all_ replicas as witnesses.
+   * this probably isn't totally necessary (esp for file renames),
+   * but if/when we change that, we have to make sure rejoin is
+   * sufficiently robust to handle strong rejoins from survivors
+   * with totally wrong dentry->inode linkage.
+   * (currently, it can ignore rename effects, because the resolve
+   * stage will sort them out.)
+   */
+  set<mds_rank_t> witnesses = mdr->more()->extra_witnesses;
+  if (srcdn->is_auth())
+    srcdn->list_replicas(witnesses);
+  else
+    witnesses.insert(srcdn->authority().first);
+  if (srcdnl->is_remote() && !srci->is_auth())
+    witnesses.insert(srci->authority().first);
+  destdn->list_replicas(witnesses);
+  if (destdnl->is_remote() && !oldin->is_auth())
+    witnesses.insert(oldin->authority().first);
+  dout(10) << " witnesses " << witnesses << ", have " << mdr->more()->witnessed << dendl;
+
+  if (!witnesses.empty()) {
+    // Replicas can't see projected dentry linkages and will get confused.
+    // We have taken snaplocks on ancestor inodes. Later rename/rmdir requests
+    // can't project these inodes' linkages.
+    bool need_flush = false;
+    for (auto& dn : srctrace) {
+      if (dn->is_projected()) {
+	need_flush = true;
+	break;
+      }
+    }
+    if (!need_flush) {
+      CDentry *dn = destdn;
+      do {
+	if (dn->is_projected()) {
+	  need_flush = true;
+	  break;
+	}
+	CInode *diri = dn->get_dir()->get_inode();
+	dn = diri->get_projected_parent_dn();
+      } while (dn);
+    }
+    if (need_flush) {
+      mdlog->wait_for_safe(
+	  new MDSInternalContextWrapper(mds,
+	    new C_MDS_RetryRequest(mdcache, mdr)));
+      mdlog->flush();
+      return;
+    }
+  }
+
+  // do srcdn auth last
+  mds_rank_t last = MDS_RANK_NONE;
+  if (!srcdn->is_auth()) {
+    last = srcdn->authority().first;
+    mdr->more()->srcdn_auth_mds = last;
+    // ask auth of srci to mark srci as ambiguous auth if more than two MDS
+    // are involved in the rename operation.
+    if (srcdnl->is_primary() && !mdr->more()->is_ambiguous_auth) {
+      dout(10) << " preparing ambiguous auth for srci" << dendl;
+      ceph_assert(mdr->more()->is_remote_frozen_authpin);
+      ceph_assert(mdr->more()->rename_inode == srci);
+      _rename_prepare_witness(mdr, last, witnesses, srctrace, desttrace, straydn);
+      return;
+    }
+  }
+
+  for (set<mds_rank_t>::iterator p = witnesses.begin();
+       p != witnesses.end();
+       ++p) {
+    if (*p == last) continue;  // do it last!
+    if (mdr->more()->witnessed.count(*p)) {
+      dout(10) << " already witnessed by mds." << *p << dendl;
+    } else if (mdr->more()->waiting_on_peer.count(*p)) {
+      dout(10) << " already waiting on witness mds." << *p << dendl;
+    } else {
+      if (!_rename_prepare_witness(mdr, *p, witnesses, srctrace, desttrace, straydn))
+	return;
+    }
+  }
+  if (!mdr->more()->waiting_on_peer.empty())
+    return;  // we're waiting for a witness.
+
+  if (last != MDS_RANK_NONE && mdr->more()->witnessed.count(last) == 0) {
+    dout(10) << " preparing last witness (srcdn auth)" << dendl;
+    ceph_assert(mdr->more()->waiting_on_peer.count(last) == 0);
+    _rename_prepare_witness(mdr, last, witnesses, srctrace, desttrace, straydn);
+    return;
+  }
+
+  // test hack: bail after peer does prepare, so we can verify it's _live_ rollback.
+  if (!mdr->more()->peers.empty() && !srci->is_dir())
+    ceph_assert(g_conf()->mds_kill_rename_at != 3);
+  if (!mdr->more()->peers.empty() && srci->is_dir())
+    ceph_assert(g_conf()->mds_kill_rename_at != 4);
+
+  // -- declare now --
+  mdr->set_mds_stamp(ceph_clock_now());
+
+  // -- prepare journal entry --
+  mdr->ls = mdlog->get_current_segment();
+  EUpdate *le = new EUpdate(mdlog, "rename");
+  mdlog->start_entry(le);
+  le->metablob.add_client_req(mdr->reqid, req->get_oldest_client_tid());
+  if (!mdr->more()->witnessed.empty()) {
+    dout(20) << " noting uncommitted_peers " << mdr->more()->witnessed << dendl;
+
+    le->reqid = mdr->reqid;
+    le->had_peers = true;
+
+    mdcache->add_uncommitted_leader(mdr->reqid, mdr->ls, mdr->more()->witnessed);
+    // no need to send frozen auth pin to recovring auth MDS of srci
+    mdr->more()->is_remote_frozen_authpin = false;
+  }
+
+  _rename_prepare(mdr, &le->metablob, &le->client_map, srcdn, destdn, req->get_alternate_name(), straydn);
+  if (le->client_map.length())
+    le->cmapv = mds->sessionmap.get_projected();
+
+  // -- commit locally --
+  C_MDS_rename_finish *fin = new C_MDS_rename_finish(this, mdr, srcdn, destdn, straydn);
+
+  journal_and_reply(mdr, srci, destdn, le, fin);
+  mds->balancer->maybe_fragment(destdn->get_dir(), false);
+}
+
+
+void Server::_rename_finish(MDRequestRef& mdr, CDentry *srcdn, CDentry *destdn, CDentry *straydn)
+{
+  // Completion of a journaled rename: apply it, send the dentry link for
+  // destdn, reply to the request, then run the follow-up work.
+  dout(10) << "_rename_finish " << *mdr << dendl;
+
+  const bool had_witnesses = !mdr->more()->witnessed.empty();
+  if (had_witnesses)
+    mdcache->logged_leader_update(mdr->reqid);
+
+  _rename_apply(mdr, srcdn, destdn, straydn);
+  mdcache->send_dentry_link(destdn, mdr);
+
+  // inspect destdn's current (post-apply) linkage
+  CDentry::linkage_t *applied_dnl = destdn->get_linkage();
+  CInode *renamed = applied_dnl->get_inode();
+  const bool eval_after_reply = mdr->more()->cap_imports.count(renamed);
+
+  // test hack: test peer commit
+  if (!mdr->more()->peers.empty()) {
+    if (renamed->is_dir()) {
+      ceph_assert(g_conf()->mds_kill_rename_at != 6);
+    } else {
+      ceph_assert(g_conf()->mds_kill_rename_at != 5);
+    }
+  }
+
+  // bump popularity
+  mds->balancer->hit_dir(srcdn->get_dir(), META_POP_IWR);
+  if (applied_dnl->is_remote() && renamed->is_auth())
+    mds->balancer->hit_inode(renamed, META_POP_IWR);
+
+  ceph_assert(g_conf()->mds_kill_rename_at != 7);
+  respond_to_request(mdr, 0);
+
+  if (eval_after_reply)
+    mds->locker->eval(renamed, CEPH_CAP_LOCKS, true);
+
+  // respond_to_request() drops locks, so stray reintegration can race with us.
+  if (straydn && !straydn->get_projected_linkage()->is_null()) {
+    mdcache->notify_stray(straydn);
+  }
+}
+
+
+
+// helpers
+
+bool Server::_rename_prepare_witness(MDRequestRef& mdr, mds_rank_t who, set<mds_rank_t> &witnesse,
+				     vector<CDentry*>& srctrace, vector<CDentry*>& dsttrace, CDentry *straydn)
+{
+  // Ask mds.<who> to witness this rename: build and send an OP_RENAMEPREP
+  // peer request, then mark the rank as awaited.  Returns false if not sent.
+  const auto& client_req = mdr->client_request;
+  ceph_assert(client_req);
+
+  const bool peer_not_ready = mds->is_cluster_degraded() &&
+    !mds->mdsmap->is_clientreplay_or_active_or_stopping(who);
+  if (peer_not_ready) {
+    dout(10) << "_rename_prepare_witness mds." << who << " is not active" << dendl;
+    // a retry is queued only when no other witness is still outstanding
+    if (mdr->more()->waiting_on_peer.empty())
+      mds->wait_for_active_peer(who, new C_MDS_RetryRequest(mdcache, mdr));
+    return false;
+  }
+
+  dout(10) << "_rename_prepare_witness mds." << who << dendl;
+  auto prep = make_message<MMDSPeerRequest>(mdr->reqid, mdr->attempt, MMDSPeerRequest::OP_RENAMEPREP);
+  // each path starts at the directory inode holding the trace's first dentry
+  prep->srcdnpath = filepath(srctrace.front()->get_dir()->ino());
+  for (CDentry *hop : srctrace)
+    prep->srcdnpath.push_dentry(hop->get_name());
+  prep->destdnpath = filepath(dsttrace.front()->get_dir()->ino());
+  for (CDentry *hop : dsttrace)
+    prep->destdnpath.push_dentry(hop->get_name());
+  prep->alternate_name = client_req->alternate_name;
+  if (straydn)
+    mdcache->encode_replica_stray(straydn, who, prep->straybl);
+  // pass along any new snaprealm nodes prepared for the two inodes
+  if (const auto& src_sr = mdr->more()->srci_srnode)
+    encode(*src_sr, prep->srci_snapbl);
+  if (const auto& dest_sr = mdr->more()->desti_srnode)
+    encode(*dest_sr, prep->desti_snapbl);
+  prep->srcdn_auth = mdr->more()->srcdn_auth_mds;
+  // srcdn auth will verify our current witness list is sufficient
+  prep->witnesses = witnesse;
+  prep->op_stamp = mdr->get_op_stamp();
+  mds->send_message_mds(prep, who);
+  ceph_assert(mdr->more()->waiting_on_peer.count(who) == 0);
+  mdr->more()->waiting_on_peer.insert(who);
+  return true;
+}
+
+version_t Server::_rename_prepare_import(MDRequestRef& mdr, CDentry *srcdn, bufferlist *client_map_bl)
+{
+  // Unpack the inode import held in mdr->more()->inode_import: pass the client
+  // maps to prepare_force_open_sessions(), copy them into *client_map_bl and
+  // decode the inode itself.  Returns mdr->more()->inode_import_v.
+  const version_t import_pv = mdr->more()->inode_import_v;
+  auto cursor = mdr->more()->inode_import.cbegin();
+
+  // imported caps
+  map<client_t,entity_inst_t> importing_clients;
+  map<client_t, client_metadata_t> importing_metadata;
+  decode(importing_clients, cursor);
+  decode(importing_metadata, cursor);
+  prepare_force_open_sessions(importing_clients, importing_metadata,
+			      mdr->more()->imported_session_map);
+  encode(importing_clients, *client_map_bl, mds->mdsmap->get_up_features());
+  encode(importing_metadata, *client_map_bl);
+
+  // the inode payload follows the two maps in the same buffer
+  list<ScatterLock*> updated_scatterlocks;
+  mdcache->migrator->decode_import_inode(srcdn, cursor, srcdn->authority().first, mdr->ls,
+					 mdr->more()->cap_imports, updated_scatterlocks);
+
+  // hack: force back to !auth and clean, temporarily
+  CInode *imported = srcdn->get_linkage()->get_inode();
+  imported->state_clear(CInode::STATE_AUTH);
+  imported->mark_clean();
+  return import_pv;
+}
+
+bool Server::_need_force_journal(CInode *diri, bool empty)
+{
+  // Returns true when this rank is auth for a subtree rooted at one of diri's
+  // dirfrags (empty case) or contained in one of them (non-empty case).
+  auto&& frags = diri->get_dirfrags();
+
+  if (empty) {
+    for (const auto& frag : frags) {
+      if (frag->is_subtree_root() && frag->get_dir_auth().first == mds->get_nodeid()) {
+	dout(10) << " frag " << frag->get_frag() << " is auth subtree dirfrag, will force journal" << dendl;
+	return true;
+      }
+      dout(20) << " frag " << frag->get_frag() << " is not auth subtree dirfrag" << dendl;
+    }
+    return false;
+  }
+
+  // see if any children of our frags are auth subtrees.
+  std::vector<CDir*> subtrees;
+  mdcache->get_subtrees(subtrees);
+  dout(10) << " subtrees " << subtrees << " frags " << frags << dendl;
+  for (const auto& frag : frags) {
+    for (const auto& subtree : subtrees) {
+      if (!frag->contains(subtree)) {
+	dout(20) << " frag " << frag->get_frag() << " does not contain " << *subtree << dendl;
+	continue;
+      }
+      if (subtree->get_dir_auth().first != mds->get_nodeid()) {
+	dout(20) << " frag " << frag->get_frag() << " contains but isn't auth for " << *subtree << dendl;
+	continue;
+      }
+      dout(10) << " frag " << frag->get_frag() << " contains (maybe) auth subtree, will force journal "
+	       << *subtree << dendl;
+      return true;
+    }
+  }
+  return false;
+}
+// Project the rename of srcdn onto destdn (inode versions, linkages, nlink/ctime) and add the affected dentries to *metablob.
+void Server::_rename_prepare(MDRequestRef& mdr,
+			     EMetaBlob *metablob, bufferlist *client_map_bl,  // client_map_bl is only handed to _rename_prepare_import()
+			     CDentry *srcdn, CDentry *destdn, std::string_view alternate_name,  // alternate_name is applied only if destdn is null
+                             CDentry *straydn)  // may be null; asserted non-null when destdn holds a primary link
+{
+  dout(10) << "_rename_prepare " << *mdr << " " << *srcdn << " " << *destdn << dendl;
+  if (straydn)
+    dout(10) << " straydn " << *straydn << dendl;
+
+  CDentry::linkage_t *srcdnl = srcdn->get_projected_linkage();
+  CDentry::linkage_t *destdnl = destdn->get_projected_linkage();
+  CInode *srci = srcdnl->get_inode();
+  CInode *oldin = destdnl->get_inode();
+
+  // primary+remote link merge?
+  bool linkmerge = (srci == oldin);
+  if (linkmerge)
+    ceph_assert(srcdnl->is_primary() && destdnl->is_remote());
+  bool silent = srcdn->get_dir()->inode->is_stray();  // source directory is a stray dir
+
+  bool force_journal_dest = false;
+  if (srci->is_dir() && !destdn->is_auth()) {
+    if (srci->is_auth()) {
+      // if we are auth for srci and exporting it, force journal because journal replay needs
+      // the source inode to create auth subtrees.
+      dout(10) << " we are exporting srci, will force journal destdn" << dendl;
+      force_journal_dest = true;
+    } else
+      force_journal_dest = _need_force_journal(srci, false);
+  }
+
+  bool force_journal_stray = false;
+  if (oldin && oldin->is_dir() && straydn && !straydn->is_auth())
+    force_journal_stray = _need_force_journal(oldin, true);
+
+  if (linkmerge)
+    dout(10) << " merging remote and primary links to the same inode" << dendl;
+  if (silent)
+    dout(10) << " reintegrating stray; will avoid changing nlink or dir mtime" << dendl;
+  if (force_journal_dest)
+    dout(10) << " forcing journal destdn because we (will) have auth subtrees nested beneath it" << dendl;
+  if (force_journal_stray)
+    dout(10) << " forcing journal straydn because we (will) have auth subtrees nested beneath it" << dendl;
+
+  if (srci->is_dir() && (destdn->is_auth() || force_journal_dest)) {
+    dout(10) << " noting renamed dir ino " << srci->ino() << " in metablob" << dendl;
+    metablob->renamed_dirino = srci->ino();
+  } else if (oldin && oldin->is_dir() && force_journal_stray) {
+    dout(10) << " noting rename target dir " << oldin->ino() << " in metablob" << dendl;
+    metablob->renamed_dirino = oldin->ino();
+  }
+
+  // prepare; spi/tpi stay null unless the corresponding inode is projected below
+  CInode::mempool_inode *spi = 0;    // renamed inode
+  CInode::mempool_inode *tpi = 0;  // target/overwritten inode
+  
+  // target inode
+  if (!linkmerge) {
+    if (destdnl->is_primary()) {
+      ceph_assert(straydn);  // moving to straydn.
+      // link--, and move.
+      if (destdn->is_auth()) {
+	auto pi= oldin->project_inode(mdr); //project_snaprealm
+	pi.inode->version = straydn->pre_dirty(pi.inode->version);
+	pi.inode->update_backtrace();
+        tpi = pi.inode.get();
+      }
+      straydn->push_projected_linkage(oldin);
+    } else if (destdnl->is_remote()) {
+      // nlink-- targeti
+      if (oldin->is_auth()) {
+	auto pi = oldin->project_inode(mdr);
+	pi.inode->version = oldin->pre_dirty();
+        tpi = pi.inode.get();
+      }
+    }
+  }
+
+  // dest
+  if (destdnl->is_null()) {
+    /* handle_client_rename checks that alternate_name matches for existing destdn */
+    destdn->set_alternate_name(alternate_name);
+  }
+  if (srcdnl->is_remote()) {
+    if (!linkmerge) {
+      // destdn
+      if (destdn->is_auth())
+	mdr->more()->pvmap[destdn] = destdn->pre_dirty();
+      destdn->push_projected_linkage(srcdnl->get_remote_ino(), srcdnl->get_remote_d_type());
+      // srci
+      if (srci->is_auth()) {
+	auto pi = srci->project_inode(mdr);
+	pi.inode->version = srci->pre_dirty();
+        spi = pi.inode.get();
+      }
+    } else {
+      dout(10) << " will merge remote onto primary link" << dendl;
+      if (destdn->is_auth()) {
+	auto pi = oldin->project_inode(mdr);
+	pi.inode->version = mdr->more()->pvmap[destdn] = destdn->pre_dirty(oldin->get_version());
+        spi = pi.inode.get();
+      }
+    }
+  } else { // primary
+    if (destdn->is_auth()) {
+      version_t oldpv;
+      if (srcdn->is_auth())
+	oldpv = srci->get_projected_version();
+      else {
+	oldpv = _rename_prepare_import(mdr, srcdn, client_map_bl);
+
+	// note which dirfrags have child subtrees in the journal
+	// event, so that we can open those (as bounds) during replay.
+	if (srci->is_dir()) {
+	  auto&& ls = srci->get_dirfrags();
+	  for (const auto& dir : ls) {
+	    if (!dir->is_auth())
+	      metablob->renamed_dir_frags.push_back(dir->get_frag());
+	  }
+	  dout(10) << " noting renamed dir open frags " << metablob->renamed_dir_frags << dendl;
+	}
+      }
+      auto pi = srci->project_inode(mdr); // project snaprealm if srcdnl->is_primary
+                                                 // & srcdnl->snaprealm
+      pi.inode->version = mdr->more()->pvmap[destdn] = destdn->pre_dirty(oldpv);
+      pi.inode->update_backtrace();
+      spi = pi.inode.get();
+    }
+    destdn->push_projected_linkage(srci);
+  }
+
+  // src
+  if (srcdn->is_auth())
+    mdr->more()->pvmap[srcdn] = srcdn->pre_dirty();
+  srcdn->push_projected_linkage();  // push null linkage
+
+  if (!silent) {
+    if (spi) {
+      spi->ctime = mdr->get_op_stamp();
+      if (mdr->get_op_stamp() > spi->rstat.rctime)
+	spi->rstat.rctime = mdr->get_op_stamp();
+      spi->change_attr++;
+      if (linkmerge)
+	spi->nlink--;
+    }
+    if (tpi) {
+      tpi->ctime = mdr->get_op_stamp();
+      if (mdr->get_op_stamp() > tpi->rstat.rctime)
+	tpi->rstat.rctime = mdr->get_op_stamp();
+      tpi->change_attr++;
+      {  // store destdn's path string as the target's stray_prior_path
+        std::string t;
+        destdn->make_path_string(t, true);
+        tpi->stray_prior_path = std::move(t);
+      }
+      tpi->nlink--;
+      if (tpi->nlink == 0)
+	oldin->state_set(CInode::STATE_ORPHAN);
+    }
+  }
+
+  // prepare nesting, mtime updates
+  int predirty_dir = silent ? 0:PREDIRTY_DIR;  // silent renames do not request PREDIRTY_DIR
+  
+  // guarantee stray dir is processed first during journal replay. unlink the old inode,
+  // then link the source inode to destdn
+  if (destdnl->is_primary()) {
+    ceph_assert(straydn);
+    if (straydn->is_auth()) {
+      metablob->add_dir_context(straydn->get_dir());
+      metablob->add_dir(straydn->get_dir(), true);
+    }
+  }
+
+  if (!linkmerge && destdnl->is_remote() && oldin->is_auth()) {
+    CDir *oldin_dir = oldin->get_projected_parent_dir();
+    if (oldin_dir != srcdn->get_dir() && oldin_dir != destdn->get_dir())
+      mdcache->predirty_journal_parents(mdr, metablob, oldin, oldin_dir, PREDIRTY_PRIMARY);
+  }
+
+  // sub off target
+  if (destdn->is_auth() && !destdnl->is_null()) {
+    mdcache->predirty_journal_parents(mdr, metablob, oldin, destdn->get_dir(),
+				      (destdnl->is_primary() ? PREDIRTY_PRIMARY:0)|predirty_dir, -1);
+    if (destdnl->is_primary()) {
+      ceph_assert(straydn);
+      mdcache->predirty_journal_parents(mdr, metablob, oldin, straydn->get_dir(),
+					PREDIRTY_PRIMARY|PREDIRTY_DIR, 1);
+    }
+  }
+
+  if (srcdnl->is_remote() && srci->is_auth()) {
+    CDir *srci_dir = srci->get_projected_parent_dir();
+    if (srci_dir != srcdn->get_dir() && srci_dir != destdn->get_dir())
+      mdcache->predirty_journal_parents(mdr, metablob, srci, srci_dir, PREDIRTY_PRIMARY);
+  }
+  
+  // move srcdn
+  int predirty_primary = (srcdnl->is_primary() && srcdn->get_dir() != destdn->get_dir()) ? PREDIRTY_PRIMARY:0;
+  int flags = predirty_dir | predirty_primary;
+  if (srcdn->is_auth())
+    mdcache->predirty_journal_parents(mdr, metablob, srci, srcdn->get_dir(), PREDIRTY_SHALLOW|flags, -1);
+  if (destdn->is_auth())
+    mdcache->predirty_journal_parents(mdr, metablob, srci, destdn->get_dir(), flags, 1);
+
+  // add it all to the metablob
+  // target inode
+  if (!linkmerge) {
+    if (destdnl->is_primary()) {
+      ceph_assert(straydn);
+      if (destdn->is_auth()) {
+	// project snaprealm, too
+	if (auto& desti_srnode = mdr->more()->desti_srnode) {
+	  oldin->project_snaprealm(desti_srnode);
+	  if (tpi->nlink == 0)
+	    ceph_assert(!desti_srnode->is_parent_global());
+	  desti_srnode = NULL;
+	}
+	straydn->first = mdcache->get_global_snaprealm()->get_newest_seq() + 1;
+	metablob->add_primary_dentry(straydn, oldin, true, true);
+      } else if (force_journal_stray) {
+	dout(10) << " forced journaling straydn " << *straydn << dendl;
+	metablob->add_dir_context(straydn->get_dir());
+	metablob->add_primary_dentry(straydn, oldin, true);
+      }
+    } else if (destdnl->is_remote()) {
+      if (oldin->is_auth()) {
+	sr_t *new_srnode = NULL;
+	if (mdr->peer_request) {
+	  if (mdr->peer_request->desti_snapbl.length() > 0) {
+	    new_srnode = new sr_t();
+	    auto p = mdr->peer_request->desti_snapbl.cbegin();
+	    decode(*new_srnode, p);
+	  }
+	} else if (auto& desti_srnode = mdr->more()->desti_srnode) {
+	  new_srnode = desti_srnode;
+	  desti_srnode = NULL;
+	}
+	if (new_srnode) {
+	  oldin->project_snaprealm(new_srnode);
+	  if (tpi->nlink == 0)
+	    ceph_assert(!new_srnode->is_parent_global());
+	}
+	// auth for targeti
+	CDentry *oldin_pdn = oldin->get_projected_parent_dn();
+	mdcache->journal_cow_dentry(mdr.get(), metablob, oldin_pdn);
+	metablob->add_primary_dentry(oldin_pdn, oldin, true);
+      }
+    }
+  }
+
+  // dest
+  if (srcdnl->is_remote()) {
+    ceph_assert(!linkmerge);
+    if (destdn->is_auth() && !destdnl->is_null())
+      mdcache->journal_cow_dentry(mdr.get(), metablob, destdn, CEPH_NOSNAP, 0, destdnl);
+    else
+      destdn->first = mdcache->get_global_snaprealm()->get_newest_seq() + 1;
+
+    if (destdn->is_auth())
+      metablob->add_remote_dentry(destdn, true, srcdnl->get_remote_ino(), srcdnl->get_remote_d_type());
+
+    if (srci->is_auth() ) { // it's remote
+      if (mdr->peer_request) {
+	if (mdr->peer_request->srci_snapbl.length() > 0) {
+	  sr_t *new_srnode = new sr_t();
+	  auto p = mdr->peer_request->srci_snapbl.cbegin();
+	  decode(*new_srnode, p);
+	  srci->project_snaprealm(new_srnode);
+	}
+      } else if (auto& srci_srnode = mdr->more()->srci_srnode) {
+	srci->project_snaprealm(srci_srnode);
+	srci_srnode = NULL;
+      }
+
+      CDentry *srci_pdn = srci->get_projected_parent_dn();
+      mdcache->journal_cow_dentry(mdr.get(), metablob, srci_pdn);
+      metablob->add_primary_dentry(srci_pdn, srci, true);
+    }
+  } else if (srcdnl->is_primary()) {
+    // project snap parent update?
+    if (destdn->is_auth()) {
+      if (auto& srci_srnode = mdr->more()->srci_srnode) {
+	srci->project_snaprealm(srci_srnode);
+	srci_srnode = NULL;
+      }
+    }
+    
+    if (destdn->is_auth() && !destdnl->is_null())
+      mdcache->journal_cow_dentry(mdr.get(), metablob, destdn, CEPH_NOSNAP, 0, destdnl);
+
+    destdn->first = mdcache->get_global_snaprealm()->get_newest_seq() + 1;
+
+    if (destdn->is_auth())
+      metablob->add_primary_dentry(destdn, srci, true, true);
+    else if (force_journal_dest) {
+      dout(10) << " forced journaling destdn " << *destdn << dendl;
+      metablob->add_dir_context(destdn->get_dir());
+      metablob->add_primary_dentry(destdn, srci, true);
+      if (srcdn->is_auth() && srci->is_dir()) {
+	// journal new subtrees root dirfrags
+	auto&& ls = srci->get_dirfrags();
+	for (const auto& dir : ls) {
+	  if (dir->is_auth())
+	    metablob->add_dir(dir, true);
+	}
+      }
+    }
+  }
+    
+  // src
+  if (srcdn->is_auth()) {
+    dout(10) << " journaling srcdn " << *srcdn << dendl;
+    mdcache->journal_cow_dentry(mdr.get(), metablob, srcdn, CEPH_NOSNAP, 0, srcdnl);
+    // also journal the inode in case we need do peer rename rollback. It is Ok to add
+    // both primary and NULL dentries. Because during journal replay, null dentry is
+    // processed after primary dentry.
+    if (srcdnl->is_primary() && !srci->is_dir() && !destdn->is_auth())
+      metablob->add_primary_dentry(srcdn, srci, true);
+    metablob->add_null_dentry(srcdn, true);
+  } else
+    dout(10) << " NOT journaling srcdn " << *srcdn << dendl;
+
+  // make renamed inode first track the dn
+  if (srcdnl->is_primary() && destdn->is_auth()) {
+    ceph_assert(srci->first <= destdn->first);
+    srci->first = destdn->first;
+  }
+  // make stray inode first track the straydn
+  if (straydn && straydn->is_auth()) {
+    ceph_assert(oldin->first <= straydn->first);
+    oldin->first = straydn->first;
+  }
+
+  if (oldin && oldin->is_dir()) {
+    ceph_assert(straydn);
+    mdcache->project_subtree_rename(oldin, destdn->get_dir(), straydn->get_dir());
+  }
+  if (srci->is_dir())
+    mdcache->project_subtree_rename(srci, srcdn->get_dir(), destdn->get_dir());
+
+}
+
+
+void Server::_rename_apply(MDRequestRef& mdr, CDentry *srcdn, CDentry *destdn, CDentry *straydn)
+{ // Makes a prepared rename live in the cache: relinks the dentries and pops their projected linkages/inodes.
+  dout(10) << "_rename_apply " << *mdr << " " << *srcdn << " " << *destdn << dendl;
+  dout(10) << " pvs " << mdr->more()->pvmap << dendl;
+
+  CDentry::linkage_t *srcdnl = srcdn->get_linkage();
+  CDentry::linkage_t *destdnl = destdn->get_linkage();
+
+  CInode *oldin = destdnl->get_inode();
+  // primary+remote link merge?  src and dest then name the same inode, so one
+  // of the two must be its primary link (both directions are handled below).
+  bool linkmerge = (srcdnl->get_inode() == oldin);
+  if (linkmerge)
+    ceph_assert(srcdnl->is_primary() || destdnl->is_primary());
+
+  bool new_in_snaprealm = false;     // the renamed inode gained a snaprealm it did not have before
+  bool new_oldin_snaprealm = false;  // the overwritten inode (oldin) gained a snaprealm it did not have before
+
+  // target inode
+  if (!linkmerge) {
+    if (destdnl->is_primary()) {
+      ceph_assert(straydn);
+      dout(10) << "straydn is " << *straydn << dendl;
+
+      // if there is newly created snaprealm, need to split old snaprealm's
+      // inodes_with_caps. So pop snaprealm before linkage changes.
+      if (destdn->is_auth()) {
+	bool hadrealm = (oldin->snaprealm ? true : false);
+	oldin->early_pop_projected_snaprealm();
+	new_oldin_snaprealm = (oldin->snaprealm && !hadrealm);
+      } else {
+	ceph_assert(mdr->peer_request);
+	if (mdr->peer_request->desti_snapbl.length()) {
+	  new_oldin_snaprealm = !oldin->snaprealm;
+	  oldin->decode_snap_blob(mdr->peer_request->desti_snapbl);
+	  ceph_assert(oldin->snaprealm);
+	}
+      }
+
+      destdn->get_dir()->unlink_inode(destdn, false);
+
+      straydn->pop_projected_linkage();
+      if (mdr->is_peer() && !mdr->more()->peer_update_journaled)
+	ceph_assert(!straydn->is_projected()); // no other projected
+
+      // nlink-- targeti
+      if (destdn->is_auth())
+	oldin->pop_and_dirty_projected_inode(mdr->ls, mdr);
+
+      mdcache->touch_dentry_bottom(straydn);  // drop dn as quickly as possible.
+    } else if (destdnl->is_remote()) {
+      destdn->get_dir()->unlink_inode(destdn, false);
+      if (oldin->is_auth()) {
+	oldin->pop_and_dirty_projected_inode(mdr->ls, mdr);
+      } else if (mdr->peer_request) {
+	if (mdr->peer_request->desti_snapbl.length() > 0) {
+	  ceph_assert(oldin->snaprealm);
+	  oldin->decode_snap_blob(mdr->peer_request->desti_snapbl);
+	}
+      } else if (auto& desti_srnode = mdr->more()->desti_srnode) {
+	delete desti_srnode;
+	desti_srnode = NULL;
+      }
+    }
+  }
+
+  // unlink src before we relink it at dest
+  CInode *in = srcdnl->get_inode();
+  ceph_assert(in);
+
+  bool srcdn_was_remote = srcdnl->is_remote();
+  if (!srcdn_was_remote) {
+    // if there is newly created snaprealm, need to split old snaprealm's
+    // inodes_with_caps. So pop snaprealm before linkage changes.
+    if (destdn->is_auth()) {
+      bool hadrealm = (in->snaprealm ? true : false);
+      in->early_pop_projected_snaprealm();
+      new_in_snaprealm = (in->snaprealm && !hadrealm);
+    } else {
+      ceph_assert(mdr->peer_request);
+      if (mdr->peer_request->srci_snapbl.length()) {
+	new_in_snaprealm = !in->snaprealm;
+	in->decode_snap_blob(mdr->peer_request->srci_snapbl);
+	ceph_assert(in->snaprealm);
+      }
+    }
+  }
+
+  srcdn->get_dir()->unlink_inode(srcdn);
+
+  // dest
+  if (srcdn_was_remote) {
+    if (!linkmerge) {
+      // destdn
+      destdnl = destdn->pop_projected_linkage();
+      if (mdr->is_peer() && !mdr->more()->peer_update_journaled)
+	ceph_assert(!destdn->is_projected()); // no other projected
+
+      destdn->link_remote(destdnl, in);
+      if (destdn->is_auth())
+	destdn->mark_dirty(mdr->more()->pvmap[destdn], mdr->ls);
+      // in
+      if (in->is_auth()) {
+	in->pop_and_dirty_projected_inode(mdr->ls, mdr);
+      } else if (mdr->peer_request) {
+	if (mdr->peer_request->srci_snapbl.length() > 0) {
+	  ceph_assert(in->snaprealm);
+	  in->decode_snap_blob(mdr->peer_request->srci_snapbl);
+	}
+      } else if (auto& srci_srnode = mdr->more()->srci_srnode) {
+	delete srci_srnode;
+	srci_srnode = NULL;
+      }
+    } else {
+      dout(10) << "merging remote onto primary link" << dendl;
+      oldin->pop_and_dirty_projected_inode(mdr->ls, mdr);
+    }
+  } else { // primary
+    if (linkmerge) {
+      dout(10) << "merging primary onto remote link" << dendl;
+      destdn->get_dir()->unlink_inode(destdn, false);
+    }
+    destdnl = destdn->pop_projected_linkage();
+    if (mdr->is_peer() && !mdr->more()->peer_update_journaled)
+      ceph_assert(!destdn->is_projected()); // no other projected
+
+    // srcdn inode import?
+    if (!srcdn->is_auth() && destdn->is_auth()) {
+      ceph_assert(mdr->more()->inode_import.length() > 0);
+
+      map<client_t,Capability::Import> imported_caps;
+      
+      // finish cap imports
+      finish_force_open_sessions(mdr->more()->imported_session_map);
+      if (mdr->more()->cap_imports.count(destdnl->get_inode())) {
+	mdcache->migrator->finish_import_inode_caps(destdnl->get_inode(),
+						    mdr->more()->srcdn_auth_mds, true,
+						    mdr->more()->imported_session_map,
+						    mdr->more()->cap_imports[destdnl->get_inode()],
+						    imported_caps);
+      }
+
+      mdr->more()->inode_import.clear();
+      encode(imported_caps, mdr->more()->inode_import);
+
+      /* hack: add an auth pin for each xlock we hold. These were
+       * remote xlocks previously but now they're local and
+       * we're going to try and unpin when we xlock_finish. */
+
+      for (auto i = mdr->locks.lower_bound(&destdnl->get_inode()->versionlock);
+	   i !=  mdr->locks.end();
+	   ++i) {
+	SimpleLock *lock = i->lock;
+	if (lock->get_parent() != destdnl->get_inode())
+	  break;
+	if (i->is_xlock() && !lock->is_locallock())
+	  mds->locker->xlock_import(lock);
+      }
+      
+      // hack: fix auth bit
+      in->state_set(CInode::STATE_AUTH);
+
+      mdr->clear_ambiguous_auth();
+    }
+
+    if (destdn->is_auth())
+      in->pop_and_dirty_projected_inode(mdr->ls, mdr);
+  }
+
+  // src
+  if (srcdn->is_auth())
+    srcdn->mark_dirty(mdr->more()->pvmap[srcdn], mdr->ls);
+  srcdn->pop_projected_linkage();
+  if (mdr->is_peer() && !mdr->more()->peer_update_journaled)
+    ceph_assert(!srcdn->is_projected()); // no other projected
+  
+  // apply remaining projected inodes (nested)
+  mdr->apply();
+
+  // update subtree map?
+  if (destdnl->is_primary() && in->is_dir())
+    mdcache->adjust_subtree_after_rename(in, srcdn->get_dir(), true);
+
+  if (straydn && oldin->is_dir())
+    mdcache->adjust_subtree_after_rename(oldin, destdn->get_dir(), true);
+
+  if (new_oldin_snaprealm)
+    mdcache->do_realm_invalidate_and_update_notify(oldin, CEPH_SNAP_OP_SPLIT, false);
+  if (new_in_snaprealm)
+    mdcache->do_realm_invalidate_and_update_notify(in, CEPH_SNAP_OP_SPLIT, true);
+
+  // removing a new dn?
+  if (srcdn->is_auth())
+    srcdn->get_dir()->try_remove_unlinked_dn(srcdn);
+}
+
+
+
+// ------------
+// PEER
+
+// Completion handed to submit_mdlog_entry() for a peer rename prepare; calls _logged_peer_rename().
+class C_MDS_PeerRenamePrep : public ServerLogContext {
+public:
+  C_MDS_PeerRenamePrep(Server *srv, MDRequestRef& req, CDentry *src, CDentry *dest, CDentry *stray)
+    : ServerLogContext(srv, req), srcdn(src), destdn(dest), straydn(stray) {}
+  void finish(int r) override { server->_logged_peer_rename(mdr, srcdn, destdn, straydn); }
+private:
+  CDentry *srcdn, *destdn, *straydn;
+};
+
+// Commit waiter installed by handle_peer_rename_prep(); passes the result code to _commit_peer_rename().
+class C_MDS_PeerRenameCommit : public ServerContext {
+public:
+  C_MDS_PeerRenameCommit(Server *srv, MDRequestRef& req, CDentry *src, CDentry *dest, CDentry *stray)
+    : ServerContext(srv), mdr(req), srcdn(src), destdn(dest), straydn(stray) {}
+  void finish(int r) override { server->_commit_peer_rename(mdr, r, srcdn, destdn, straydn); }
+private:
+  MDRequestRef mdr;
+  CDentry *srcdn, *destdn, *straydn;
+};
+
+// Gather finisher for the client session flush started in handle_peer_rename_prep().
+class C_MDS_PeerRenameSessionsFlushed : public ServerContext {
+public:
+  C_MDS_PeerRenameSessionsFlushed(Server *srv, MDRequestRef& req)
+    : ServerContext(srv), mdr(req) {}
+  void finish(int r) override { server->_peer_rename_sessions_flushed(mdr); }
+private:
+  MDRequestRef mdr;
+};
+
+void Server::handle_peer_rename_prep(MDRequestRef& mdr)
+{ // Peer side of a rename prepare: resolves both dentries, freezes/marks srci when we are srcdn auth, records rollback state and journals the prepare.
+  dout(10) << "handle_peer_rename_prep " << *mdr
+	   << " " << mdr->peer_request->srcdnpath
+	   << " to " << mdr->peer_request->destdnpath
+	   << dendl;
+
+  if (mdr->peer_request->is_interrupted()) {
+    dout(10) << " peer request interrupted, sending noop reply" << dendl;
+    auto reply = make_message<MMDSPeerRequest>(mdr->reqid, mdr->attempt, MMDSPeerRequest::OP_RENAMEPREPACK);
+    reply->mark_interrupted();
+    mds->send_message_mds(reply, mdr->peer_to_mds);
+    mdr->reset_peer_request();
+    return;
+  }
+
+  // discover destdn
+  filepath destpath(mdr->peer_request->destdnpath);
+  dout(10) << " dest " << destpath << dendl;
+  vector<CDentry*> trace;
+  CF_MDS_RetryRequestFactory cf(mdcache, mdr, false);
+  int r = mdcache->path_traverse(mdr, cf, destpath,
+				 MDS_TRAVERSE_DISCOVER | MDS_TRAVERSE_PATH_LOCKED | MDS_TRAVERSE_WANT_DENTRY,
+				 &trace);
+  if (r > 0) return;  // traversal did not complete; NOTE(review): presumably the request is retried via cf -- confirm
+  if (r == -CEPHFS_ESTALE) {
+    mdcache->find_ino_peers(destpath.get_ino(), new C_MDS_RetryRequest(mdcache, mdr),
+			    mdr->peer_to_mds, true);
+    return;
+  }
+  ceph_assert(r == 0);  // we shouldn't get an error here!
+      
+  CDentry *destdn = trace.back();
+  CDentry::linkage_t *destdnl = destdn->get_projected_linkage();
+  dout(10) << " destdn " << *destdn << dendl;
+  mdr->pin(destdn);
+  
+  // discover srcdn
+  filepath srcpath(mdr->peer_request->srcdnpath);
+  dout(10) << " src " << srcpath << dendl;
+  CInode *srci = nullptr;
+  r = mdcache->path_traverse(mdr, cf, srcpath,
+			     MDS_TRAVERSE_DISCOVER | MDS_TRAVERSE_PATH_LOCKED,
+			     &trace, &srci);
+  if (r > 0) return;
+  ceph_assert(r == 0);
+
+  CDentry *srcdn = trace.back();
+  CDentry::linkage_t *srcdnl = srcdn->get_projected_linkage();
+  dout(10) << " srcdn " << *srcdn << dendl;
+  mdr->pin(srcdn);
+  mdr->pin(srci);
+
+  // stray?
+  bool linkmerge = srcdnl->get_inode() == destdnl->get_inode();
+  if (linkmerge)
+    ceph_assert(srcdnl->is_primary() && destdnl->is_remote());
+  CDentry *straydn = mdr->straydn;
+  if (destdnl->is_primary() && !linkmerge)
+    ceph_assert(straydn);
+
+  mdr->set_op_stamp(mdr->peer_request->op_stamp);
+  mdr->more()->srcdn_auth_mds = srcdn->authority().first;
+
+  // set up commit waiter (early, to clean up any freezing etc we do)
+  if (!mdr->more()->peer_commit)
+    mdr->more()->peer_commit = new C_MDS_PeerRenameCommit(this, mdr, srcdn, destdn, straydn);
+
+  // am i srcdn auth?
+  if (srcdn->is_auth()) {
+    set<mds_rank_t> srcdnrep;
+    srcdn->list_replicas(srcdnrep);
+
+    bool reply_witness = false;
+    if (srcdnl->is_primary() && !srcdnl->get_inode()->state_test(CInode::STATE_AMBIGUOUSAUTH)) {
+      // freeze?
+      // we need this to
+      //  - avoid conflicting lock state changes
+      //  - avoid concurrent updates to the inode
+      //     (this could also be accomplished with the versionlock)
+      int allowance = 3; // 1 for the mdr auth_pin, 1 for the link lock, 1 for the snap lock
+      dout(10) << " freezing srci " << *srcdnl->get_inode() << " with allowance " << allowance << dendl;
+      bool frozen_inode = srcdnl->get_inode()->freeze_inode(allowance);
+
+      // unfreeze auth pin after freezing the inode to avoid queueing waiters
+      if (srcdnl->get_inode()->is_frozen_auth_pin())
+	mdr->unfreeze_auth_pin();
+
+      if (!frozen_inode) {
+	srcdnl->get_inode()->add_waiter(CInode::WAIT_FROZEN, new C_MDS_RetryRequest(mdcache, mdr));
+	return;
+      }
+
+      /*
+       * set ambiguous auth for srci
+       * NOTE: we don't worry about ambiguous cache expire as we do
+       * with subtree migrations because all peers will pin
+       * srcdn->get_inode() for duration of this rename.
+       */
+      mdr->set_ambiguous_auth(srcdnl->get_inode());
+
+      // just mark the source inode as ambiguous auth if more than two MDS are involved.
+      // the leader will send another OP_RENAMEPREP peer request later.
+      if (mdr->peer_request->witnesses.size() > 1) {
+	dout(10) << " set srci ambiguous auth; providing srcdn replica list" << dendl;
+	reply_witness = true;
+      }
+
+      // make sure bystanders have received all lock related messages
+      for (set<mds_rank_t>::iterator p = srcdnrep.begin(); p != srcdnrep.end(); ++p) {
+	if (*p == mdr->peer_to_mds ||
+	    (mds->is_cluster_degraded() &&
+	     !mds->mdsmap->is_clientreplay_or_active_or_stopping(*p)))
+	  continue;
+	auto notify = make_message<MMDSPeerRequest>(mdr->reqid, mdr->attempt, MMDSPeerRequest::OP_RENAMENOTIFY);
+	mds->send_message_mds(notify, *p);
+	mdr->more()->waiting_on_peer.insert(*p);
+      }
+
+      // make sure clients have received all cap related messages
+      set<client_t> export_client_set;
+      mdcache->migrator->get_export_client_set(srcdnl->get_inode(), export_client_set);
+
+      MDSGatherBuilder gather(g_ceph_context);
+      flush_client_sessions(export_client_set, gather);
+      if (gather.has_subs()) {
+	mdr->more()->waiting_on_peer.insert(MDS_RANK_NONE);  // NOTE(review): MDS_RANK_NONE appears to stand for the pending session flush -- confirm
+	gather.set_finisher(new C_MDS_PeerRenameSessionsFlushed(this, mdr));
+	gather.activate();
+      }
+    }
+
+    // is witness list sufficient?
+    for (set<mds_rank_t>::iterator p = srcdnrep.begin(); p != srcdnrep.end(); ++p) {
+      if (*p == mdr->peer_to_mds ||
+	  mdr->peer_request->witnesses.count(*p)) continue;
+      dout(10) << " witness list insufficient; providing srcdn replica list" << dendl;
+      reply_witness = true;
+      break;
+    }
+
+    if (reply_witness) {
+      ceph_assert(!srcdnrep.empty());
+      auto reply = make_message<MMDSPeerRequest>(mdr->reqid, mdr->attempt, MMDSPeerRequest::OP_RENAMEPREPACK);
+      reply->witnesses.swap(srcdnrep);
+      mds->send_message_mds(reply, mdr->peer_to_mds);
+      mdr->reset_peer_request();
+      return;	
+    }
+    dout(10) << " witness list sufficient: includes all srcdn replicas" << dendl;
+    if (!mdr->more()->waiting_on_peer.empty()) {
+      dout(10) << " still waiting for rename notify acks from "
+	       << mdr->more()->waiting_on_peer << dendl;
+      return;
+    }
+  } else if (srcdnl->is_primary() && srcdn->authority() != destdn->authority()) {
+    // set ambiguous auth for srci on witnesses
+    mdr->set_ambiguous_auth(srcdnl->get_inode());
+  }
+
+  // encode everything we'd need to roll this back... basically, just the original state.
+  rename_rollback rollback;
+  
+  rollback.reqid = mdr->reqid;
+  
+  rollback.orig_src.dirfrag = srcdn->get_dir()->dirfrag();
+  rollback.orig_src.dirfrag_old_mtime = srcdn->get_dir()->get_projected_fnode()->fragstat.mtime;
+  rollback.orig_src.dirfrag_old_rctime = srcdn->get_dir()->get_projected_fnode()->rstat.rctime;
+  rollback.orig_src.dname = srcdn->get_name();
+  if (srcdnl->is_primary())
+    rollback.orig_src.ino = srcdnl->get_inode()->ino();
+  else {
+    ceph_assert(srcdnl->is_remote());
+    rollback.orig_src.remote_ino = srcdnl->get_remote_ino();
+    rollback.orig_src.remote_d_type = srcdnl->get_remote_d_type();
+  }
+  
+  rollback.orig_dest.dirfrag = destdn->get_dir()->dirfrag();
+  rollback.orig_dest.dirfrag_old_mtime = destdn->get_dir()->get_projected_fnode()->fragstat.mtime;
+  rollback.orig_dest.dirfrag_old_rctime = destdn->get_dir()->get_projected_fnode()->rstat.rctime;
+  rollback.orig_dest.dname = destdn->get_name();
+  if (destdnl->is_primary())
+    rollback.orig_dest.ino = destdnl->get_inode()->ino();
+  else if (destdnl->is_remote()) {
+    rollback.orig_dest.remote_ino = destdnl->get_remote_ino();
+    rollback.orig_dest.remote_d_type = destdnl->get_remote_d_type();
+  }
+  
+  if (straydn) {
+    rollback.stray.dirfrag = straydn->get_dir()->dirfrag();
+    rollback.stray.dirfrag_old_mtime = straydn->get_dir()->get_projected_fnode()->fragstat.mtime;
+    rollback.stray.dirfrag_old_rctime = straydn->get_dir()->get_projected_fnode()->rstat.rctime;
+    rollback.stray.dname = straydn->get_name();
+  }
+  if (mdr->peer_request->desti_snapbl.length()) {
+    CInode *oldin = destdnl->get_inode();
+    if (oldin->snaprealm) {
+      encode(true, rollback.desti_snapbl);
+      oldin->encode_snap_blob(rollback.desti_snapbl);
+    } else {
+      encode(false, rollback.desti_snapbl);
+    }
+  }
+  if (mdr->peer_request->srci_snapbl.length()) {
+    if (srci->snaprealm) {
+      encode(true, rollback.srci_snapbl);
+      srci->encode_snap_blob(rollback.srci_snapbl);
+    } else {
+      encode(false, rollback.srci_snapbl);
+    }
+  }
+  encode(rollback, mdr->more()->rollback_bl);
+  // FIXME: rollback snaprealm
+  dout(20) << " rollback is " << mdr->more()->rollback_bl.length() << " bytes" << dendl;
+
+  // journal.
+  mdr->ls = mdlog->get_current_segment();
+  EPeerUpdate *le = new EPeerUpdate(mdlog, "peer_rename_prep", mdr->reqid, mdr->peer_to_mds,
+				      EPeerUpdate::OP_PREPARE, EPeerUpdate::RENAME);
+  mdlog->start_entry(le);
+  le->rollback = mdr->more()->rollback_bl;
+  
+  bufferlist blah;  // inode import data... obviously not used if we're the peer
+  _rename_prepare(mdr, &le->commit, &blah, srcdn, destdn, mdr->peer_request->alternate_name, straydn);
+
+  if (le->commit.empty()) {
+    dout(10) << " empty metablob, skipping journal" << dendl;
+    mdlog->cancel_entry(le);
+    mdr->ls = NULL;
+    _logged_peer_rename(mdr, srcdn, destdn, straydn);
+  } else {
+    mdcache->add_uncommitted_peer(mdr->reqid, mdr->ls, mdr->peer_to_mds);
+    mdr->more()->peer_update_journaled = true;
+    submit_mdlog_entry(le, new C_MDS_PeerRenamePrep(this, mdr, srcdn, destdn, straydn),
+		       mdr, __func__);
+    mdlog->flush();
+  }
+}
+
+void Server::_logged_peer_rename(MDRequestRef& mdr,
+				  CDentry *srcdn, CDentry *destdn, CDentry *straydn)
+{ // Runs after the peer's rename prepare is journaled (or journaling was skipped): exports srci if needed, applies the rename, then acks.
+  dout(10) << "_logged_peer_rename " << *mdr << dendl;
+
+  // prepare ack
+  ref_t<MMDSPeerRequest> reply;
+  if (!mdr->aborted) {  // no ack is built for an aborted request; it is finished at the end instead
+    reply = make_message<MMDSPeerRequest>(mdr->reqid, mdr->attempt, MMDSPeerRequest::OP_RENAMEPREPACK);
+    if (!mdr->more()->peer_update_journaled)
+      reply->mark_not_journaled();
+  }
+
+  CDentry::linkage_t *srcdnl = srcdn->get_linkage();
+  //CDentry::linkage_t *straydnl = straydn ? straydn->get_linkage() : 0;
+
+  // export srci?
+  if (srcdn->is_auth() && srcdnl->is_primary()) {
+    // set export bounds for CInode::encode_export()
+    if (reply) {
+      std::vector<CDir*> bounds;
+      if (srcdnl->get_inode()->is_dir()) {
+	srcdnl->get_inode()->get_dirfrags(bounds);
+	for (const auto& bound : bounds) {
+	  bound->state_set(CDir::STATE_EXPORTBOUND);
+        }
+      }
+
+      map<client_t,entity_inst_t> exported_client_map;
+      map<client_t, client_metadata_t> exported_client_metadata_map;
+      bufferlist inodebl;
+      mdcache->migrator->encode_export_inode(srcdnl->get_inode(), inodebl,
+					     exported_client_map,
+					     exported_client_metadata_map);
+
+      for (const auto& bound : bounds) {
+	bound->state_clear(CDir::STATE_EXPORTBOUND);
+      }
+
+      encode(exported_client_map, reply->inode_export, mds->mdsmap->get_up_features());
+      encode(exported_client_metadata_map, reply->inode_export);
+      reply->inode_export.claim_append(inodebl);
+      reply->inode_export_v = srcdnl->get_inode()->get_version();
+    }
+
+    // remove mdr auth pin
+    mdr->auth_unpin(srcdnl->get_inode());
+    mdr->more()->is_inode_exporter = true;
+
+    if (srcdnl->get_inode()->is_dirty())
+      srcdnl->get_inode()->mark_clean();
+
+    dout(10) << " exported srci " << *srcdnl->get_inode() << dendl;
+  }
+
+  // apply
+  _rename_apply(mdr, srcdn, destdn, straydn);   
+
+  CDentry::linkage_t *destdnl = destdn->get_linkage();
+
+  // bump popularity
+  mds->balancer->hit_dir(srcdn->get_dir(), META_POP_IWR);
+  if (destdnl->get_inode() && destdnl->get_inode()->is_auth())
+    mds->balancer->hit_inode(destdnl->get_inode(), META_POP_IWR);
+
+  // done.
+  mdr->reset_peer_request();
+  mdr->straydn = 0;
+
+  if (reply) {
+    mds->send_message_mds(reply, mdr->peer_to_mds);
+  } else {
+    ceph_assert(mdr->aborted);
+    dout(10) << " abort flag set, finishing" << dendl;
+    mdcache->request_finish(mdr);
+  }
+}
+
+void Server::_commit_peer_rename(MDRequestRef& mdr, int r,
+				  CDentry *srcdn, CDentry *destdn, CDentry *straydn)
+{ // Commit waiter body for a peer rename: r == 0 finalizes the rename; any other r aborts it, rolling back when rollback_bl was recorded.
+  dout(10) << "_commit_peer_rename " << *mdr << " r=" << r << dendl;
+
+  CInode *in = destdn->get_linkage()->get_inode();
+
+  inodeno_t migrated_stray;  // set only when we are srcdn auth and srcdn lives in a stray directory
+  if (srcdn->is_auth() && srcdn->get_dir()->inode->is_stray())
+    migrated_stray = in->ino();
+
+  MDSContext::vec finished;
+  if (r == 0) {
+    // unfreeze+singleauth inode
+    //  hmm, do i really need to delay this?
+    if (mdr->more()->is_inode_exporter) {
+      // drop our pins
+      // we exported, clear out any xlocks that we moved to another MDS
+
+      for (auto i = mdr->locks.lower_bound(&in->versionlock);
+	   i !=  mdr->locks.end(); ) {
+	SimpleLock *lock = i->lock;
+	if (lock->get_parent() != in)
+	  break;
+	// we only care about xlocks on the exported inode
+	if (i->is_xlock() && !lock->is_locallock())
+	  mds->locker->xlock_export(i++, mdr.get());
+	else
+	  ++i;
+      }
+
+      map<client_t,Capability::Import> peer_imported;
+      auto bp = mdr->more()->inode_import.cbegin();
+      decode(peer_imported, bp);
+
+      dout(10) << " finishing inode export on " << *in << dendl;
+      mdcache->migrator->finish_export_inode(in, mdr->peer_to_mds, peer_imported, finished);
+      mds->queue_waiters(finished);   // this includes SINGLEAUTH waiters.
+
+      // unfreeze
+      ceph_assert(in->is_frozen_inode());
+      in->unfreeze_inode(finished);
+    }
+
+    // singleauth
+    if (mdr->more()->is_ambiguous_auth) {
+      mdr->more()->rename_inode->clear_ambiguous_auth(finished);
+      mdr->more()->is_ambiguous_auth = false;
+    }
+
+    if (straydn && mdr->more()->peer_update_journaled) {
+      CInode *strayin = straydn->get_projected_linkage()->get_inode();
+      if (strayin && !strayin->snaprealm)
+	mdcache->clear_dirty_bits_for_stray(strayin);
+    }
+
+    mds->queue_waiters(finished);
+    mdr->cleanup();
+
+    if (mdr->more()->peer_update_journaled) {
+      // write a commit to the journal
+      EPeerUpdate *le = new EPeerUpdate(mdlog, "peer_rename_commit", mdr->reqid,
+					  mdr->peer_to_mds, EPeerUpdate::OP_COMMIT,
+					  EPeerUpdate::RENAME);
+      mdlog->start_entry(le);
+      submit_mdlog_entry(le, new C_MDS_CommittedPeer(this, mdr), mdr, __func__);
+      mdlog->flush();
+    } else {
+      _committed_peer(mdr);
+    }
+  } else {
+
+    // abort
+    //  rollback_bl may be empty if we froze the inode but had to provide an expanded
+    // witness list to the leader, and it failed before we tried prep again.
+    if (mdr->more()->rollback_bl.length()) {
+      if (mdr->more()->is_inode_exporter) {
+	dout(10) << " reversing inode export of " << *in << dendl;
+	in->abort_export();
+      }
+      if (mdcache->is_ambiguous_peer_update(mdr->reqid, mdr->peer_to_mds)) {
+	mdcache->remove_ambiguous_peer_update(mdr->reqid, mdr->peer_to_mds);
+	// rollback but preserve the peer request
+	do_rename_rollback(mdr->more()->rollback_bl, mdr->peer_to_mds, mdr, false);
+	mdr->more()->rollback_bl.clear();
+      } else
+	do_rename_rollback(mdr->more()->rollback_bl, mdr->peer_to_mds, mdr, true);
+    } else {
+      dout(10) << " rollback_bl empty, not rollback back rename (leader failed after getting extra witnesses?)" << dendl;
+      // singleauth
+      if (mdr->more()->is_ambiguous_auth) {
+	if (srcdn->is_auth())
+	  mdr->more()->rename_inode->unfreeze_inode(finished);
+
+	mdr->more()->rename_inode->clear_ambiguous_auth(finished);
+	mdr->more()->is_ambiguous_auth = false;
+      }
+      mds->queue_waiters(finished);
+      mdcache->request_finish(mdr);
+    }
+  }
+
+  if (migrated_stray && mds->is_stopping())
+    mdcache->shutdown_export_stray_finish(migrated_stray);
+}
+
+// Projects dir's fnode for a rename rollback: bumps the file/subdir count, re-adds rstat when
+// r.ino is set, and restores the saved mtime/rctime where they still equal the rename's ctime.
+static void _rollback_repair_dir(MutationRef& mut, CDir *dir,
+				 rename_rollback::drec &r, utime_t ctime,
+				 bool isdir, const nest_info_t &rstat)
+{
+  auto fnode = dir->project_fnode(mut);
+  fnode->version = dir->pre_dirty();
+  auto& frag = fnode->fragstat;
+  auto& nest = fnode->rstat;
+  (isdir ? frag.nsubdirs : frag.nfiles) += 1;
+  if (r.ino) {
+    nest.rbytes += rstat.rbytes;
+    nest.rfiles += rstat.rfiles;
+    nest.rsubdirs += rstat.rsubdirs;
+    nest.rsnaps += rstat.rsnaps;
+  }
+  // rctime is only looked at when mtime also still carries the rename's ctime
+  const bool mtime_matches = (frag.mtime == ctime);
+  if (mtime_matches && nest.rctime == ctime)
+    nest.rctime = r.dirfrag_old_rctime;
+  if (mtime_matches)
+    frag.mtime = r.dirfrag_old_mtime;
+  mut->add_updated_lock(&dir->get_inode()->filelock);
+  mut->add_updated_lock(&dir->get_inode()->nestlock);
+}
+
+// Completion context for a rename rollback: takes over the caller's two snap-split maps
+// (by swapping them out) and passes everything on to Server::_rename_rollback_finish().
+struct C_MDS_LoggedRenameRollback : public ServerLogContext {
+  MutationRef mut;
+  CDentry *srcdn;
+  version_t srcdnpv;
+  CDentry *destdn;
+  CDentry *straydn;
+  map<client_t,ref_t<MClientSnap>> splits[2];
+  bool finish_mdr;
+  C_MDS_LoggedRenameRollback(Server *srv, MutationRef& mutation, MDRequestRef& req,
+			     CDentry *src, version_t src_pv, CDentry *dest, CDentry *stray,
+			     map<client_t,ref_t<MClientSnap>> _splits[2], bool do_finish)
+    : ServerLogContext(srv, req), mut(mutation), srcdn(src), srcdnpv(src_pv),
+      destdn(dest), straydn(stray), finish_mdr(do_finish) {
+    for (int i = 0; i < 2; ++i) splits[i].swap(_splits[i]);
+  }
+  void finish(int r) override {
+    server->_rename_rollback_finish(mut, mdr, srcdn, srcdnpv, destdn, straydn, splits, finish_mdr);
+  }
+};
+
+void Server::do_rename_rollback(bufferlist &rbl, mds_rank_t leader, MDRequestRef& mdr,
+				bool finish_mdr)
+{
+  rename_rollback rollback;
+  auto p = rbl.cbegin();
+  decode(rollback, p);
+
+  dout(10) << "do_rename_rollback on " << rollback.reqid << dendl;
+  // need to finish this update before sending resolve to claim the subtree
+  mdcache->add_rollback(rollback.reqid, leader);
+
+  MutationRef mut(new MutationImpl(nullptr, utime_t(), rollback.reqid));
+  mut->ls = mds->mdlog->get_current_segment();
+
+  CDentry *srcdn = NULL;
+  CDir *srcdir = mdcache->get_dirfrag(rollback.orig_src.dirfrag);
+  if (!srcdir)
+    srcdir = mdcache->get_dirfrag(rollback.orig_src.dirfrag.ino, rollback.orig_src.dname);
+  if (srcdir) {
+    dout(10) << "  srcdir " << *srcdir << dendl;
+    srcdn = srcdir->lookup(rollback.orig_src.dname);
+    if (srcdn) {
+      dout(10) << "   srcdn " << *srcdn << dendl;
+      ceph_assert(srcdn->get_linkage()->is_null());
+    } else
+      dout(10) << "   srcdn not found" << dendl;
+  } else
+    dout(10) << "  srcdir not found" << dendl;
+
+  CDentry *destdn = NULL;
+  CDir *destdir = mdcache->get_dirfrag(rollback.orig_dest.dirfrag);
+  if (!destdir)
+    destdir = mdcache->get_dirfrag(rollback.orig_dest.dirfrag.ino, rollback.orig_dest.dname);
+  if (destdir) {
+    dout(10) << " destdir " << *destdir << dendl;
+    destdn = destdir->lookup(rollback.orig_dest.dname);
+    if (destdn)
+      dout(10) << "  destdn " << *destdn << dendl;
+    else
+      dout(10) << "  destdn not found" << dendl;
+  } else
+    dout(10) << " destdir not found" << dendl;
+
+  CInode *in = NULL;
+  if (rollback.orig_src.ino) {
+    in = mdcache->get_inode(rollback.orig_src.ino);
+    if (in && in->is_dir())
+      ceph_assert(srcdn && destdn);
+  } else
+    in = mdcache->get_inode(rollback.orig_src.remote_ino);
+
+  CDir *straydir = NULL;
+  CDentry *straydn = NULL;
+  if (rollback.stray.dirfrag.ino) {
+    straydir = mdcache->get_dirfrag(rollback.stray.dirfrag);
+    if (straydir) {
+      dout(10) << "straydir " << *straydir << dendl;
+      straydn = straydir->lookup(rollback.stray.dname);
+      if (straydn) {
+	dout(10) << " straydn " << *straydn << dendl;
+	ceph_assert(straydn->get_linkage()->is_primary());
+      } else
+	dout(10) << " straydn not found" << dendl;
+    } else
+      dout(10) << "straydir not found" << dendl;
+  }
+
+  CInode *target = NULL;
+  if (rollback.orig_dest.ino) {
+    target = mdcache->get_inode(rollback.orig_dest.ino);
+    if (target)
+      ceph_assert(destdn && straydn);
+  } else if (rollback.orig_dest.remote_ino)
+    target = mdcache->get_inode(rollback.orig_dest.remote_ino);
+
+  // can't use is_auth() in the resolve stage
+  mds_rank_t whoami = mds->get_nodeid();
+  // peer
+  ceph_assert(!destdn || destdn->authority().first != whoami);
+  ceph_assert(!straydn || straydn->authority().first != whoami);
+
+  bool force_journal_src = false;
+  bool force_journal_dest = false;
+  if (in && in->is_dir() && srcdn->authority().first != whoami)
+    force_journal_src = _need_force_journal(in, false);
+  if (in && target && target->is_dir())
+    force_journal_dest = _need_force_journal(in, true);
+  
+  version_t srcdnpv = 0;
+  // repair src
+  if (srcdn) {
+    if (srcdn->authority().first == whoami)
+      srcdnpv = srcdn->pre_dirty();
+    if (rollback.orig_src.ino) {
+      ceph_assert(in);
+      srcdn->push_projected_linkage(in);
+    } else
+      srcdn->push_projected_linkage(rollback.orig_src.remote_ino,
+				    rollback.orig_src.remote_d_type);
+  }
+
+  map<client_t,ref_t<MClientSnap>> splits[2];
+
+  const CInode::mempool_inode *pip = nullptr;
+  if (in) {
+    bool projected;
+    CDir *pdir = in->get_projected_parent_dir();
+    if (pdir->authority().first == whoami) {
+      auto pi = in->project_inode(mut);
+      pi.inode->version = in->pre_dirty();
+      if (pdir != srcdir) {
+	auto pf = pdir->project_fnode(mut);
+	pf->version = pdir->pre_dirty();
+      }
+      if (pi.inode->ctime == rollback.ctime)
+	pi.inode->ctime = rollback.orig_src.old_ctime;
+      projected = true;
+    } else {
+      if (in->get_inode()->ctime == rollback.ctime) {
+	auto _inode = CInode::allocate_inode(*in->get_inode());
+	_inode->ctime = rollback.orig_src.old_ctime;
+	in->reset_inode(_inode);
+      }
+      projected = false;
+    }
+    pip = in->get_projected_inode().get();
+
+    if (rollback.srci_snapbl.length() && in->snaprealm) {
+      bool hadrealm;
+      auto p = rollback.srci_snapbl.cbegin();
+      decode(hadrealm, p);
+      if (hadrealm) {
+	if (projected && !mds->is_resolve()) {
+	  sr_t *new_srnode = new sr_t();
+	  decode(*new_srnode, p);
+	  in->project_snaprealm(new_srnode);
+	} else
+	  decode(in->snaprealm->srnode, p);
+      } else {
+	SnapRealm *realm;
+	if (rollback.orig_src.ino) {
+	  ceph_assert(srcdir);
+	  realm = srcdir->get_inode()->find_snaprealm();
+	} else {
+	  realm = in->snaprealm->parent;
+	}
+	if (!mds->is_resolve())
+	  mdcache->prepare_realm_merge(in->snaprealm, realm, splits[0]);
+	if (projected)
+	  in->project_snaprealm(NULL);
+	else
+	  in->snaprealm->merge_to(realm);
+      }
+    }
+  }
+
+  // repair dest
+  if (destdn) {
+    if (rollback.orig_dest.ino && target) {
+      destdn->push_projected_linkage(target);
+    } else if (rollback.orig_dest.remote_ino) {
+      destdn->push_projected_linkage(rollback.orig_dest.remote_ino,
+				     rollback.orig_dest.remote_d_type);
+    } else {
+      // the dentry will be trimmed soon, it's ok to have wrong linkage
+      if (rollback.orig_dest.ino)
+	ceph_assert(mds->is_resolve());
+      destdn->push_projected_linkage();
+    }
+  }
+
+  if (straydn)
+    straydn->push_projected_linkage();
+
+  if (target) {
+    bool projected;
+    CInode::inode_ptr ti;
+    CDir *pdir = target->get_projected_parent_dir();
+    if (pdir->authority().first == whoami) {
+      auto pi = target->project_inode(mut);
+      pi.inode->version = target->pre_dirty();
+      if (pdir != srcdir) {
+	auto pf = pdir->project_fnode(mut);
+	pf->version = pdir->pre_dirty();
+      }
+      ti = pi.inode;
+      projected = true;
+    } else {
+      ti = CInode::allocate_inode(*target->get_inode());
+      projected = false;
+    }
+
+    if (ti->ctime == rollback.ctime)
+      ti->ctime = rollback.orig_dest.old_ctime;
+    if (MDS_INO_IS_STRAY(rollback.orig_src.dirfrag.ino)) {
+      if (MDS_INO_IS_STRAY(rollback.orig_dest.dirfrag.ino))
+	ceph_assert(!rollback.orig_dest.ino && !rollback.orig_dest.remote_ino);
+      else
+	ceph_assert(rollback.orig_dest.remote_ino &&
+	       rollback.orig_dest.remote_ino == rollback.orig_src.ino);
+    } else
+      ti->nlink++;
+
+    if (!projected)
+      target->reset_inode(ti);
+
+    if (rollback.desti_snapbl.length() && target->snaprealm) {
+      bool hadrealm;
+      auto p = rollback.desti_snapbl.cbegin();
+      decode(hadrealm, p);
+      if (hadrealm) {
+	if (projected && !mds->is_resolve()) {
+	  sr_t *new_srnode = new sr_t();
+	  decode(*new_srnode, p);
+	  target->project_snaprealm(new_srnode);
+	} else
+	  decode(target->snaprealm->srnode, p);
+      } else {
+	SnapRealm *realm;
+	if (rollback.orig_dest.ino) {
+	  ceph_assert(destdir);
+	  realm = destdir->get_inode()->find_snaprealm();
+	} else {
+	  realm = target->snaprealm->parent;
+	}
+	if (!mds->is_resolve())
+	  mdcache->prepare_realm_merge(target->snaprealm, realm, splits[1]);
+	if (projected)
+	  target->project_snaprealm(NULL);
+	else
+	  target->snaprealm->merge_to(realm);
+      }
+    }
+  }
+
+  if (srcdn && srcdn->authority().first == whoami) {
+    nest_info_t blah;
+    _rollback_repair_dir(mut, srcdir, rollback.orig_src, rollback.ctime,
+			 in && in->is_dir(), pip ? pip->accounted_rstat : blah);
+  }
+
+  if (srcdn)
+    dout(0) << " srcdn back to " << *srcdn << dendl;
+  if (in)
+    dout(0) << "  srci back to " << *in << dendl;
+  if (destdn)
+    dout(0) << " destdn back to " << *destdn << dendl;
+  if (target)
+    dout(0) << "  desti back to " << *target << dendl;
+  
+  // journal it
+  EPeerUpdate *le = new EPeerUpdate(mdlog, "peer_rename_rollback", rollback.reqid, leader,
+				      EPeerUpdate::OP_ROLLBACK, EPeerUpdate::RENAME);
+  mdlog->start_entry(le);
+
+  if (srcdn && (srcdn->authority().first == whoami || force_journal_src)) {
+    le->commit.add_dir_context(srcdir);
+    if (rollback.orig_src.ino)
+      le->commit.add_primary_dentry(srcdn, 0, true);
+    else
+      le->commit.add_remote_dentry(srcdn, true);
+  }
+
+  if (!rollback.orig_src.ino && // remote linkage
+      in && in->authority().first == whoami) {
+    le->commit.add_dir_context(in->get_projected_parent_dir());
+    le->commit.add_primary_dentry(in->get_projected_parent_dn(), in, true);
+  }
+
+  if (force_journal_dest) {
+    ceph_assert(rollback.orig_dest.ino);
+    le->commit.add_dir_context(destdir);
+    le->commit.add_primary_dentry(destdn, 0, true);
+  }
+
+  // peer: no need to journal straydn
+
+  if (target && target != in && target->authority().first == whoami) {
+    ceph_assert(rollback.orig_dest.remote_ino);
+    le->commit.add_dir_context(target->get_projected_parent_dir());
+    le->commit.add_primary_dentry(target->get_projected_parent_dn(), target, true);
+  }
+
+  if (in && in->is_dir() && (srcdn->authority().first == whoami || force_journal_src)) {
+    dout(10) << " noting renamed dir ino " << in->ino() << " in metablob" << dendl;
+    le->commit.renamed_dirino = in->ino();
+    if (srcdn->authority().first == whoami) {
+      auto&& ls = in->get_dirfrags();
+      for (const auto& dir : ls) {
+	if (!dir->is_auth())
+	  le->commit.renamed_dir_frags.push_back(dir->get_frag());
+      }
+      dout(10) << " noting renamed dir open frags " << le->commit.renamed_dir_frags << dendl;
+    }
+  } else if (force_journal_dest) {
+    dout(10) << " noting rename target ino " << target->ino() << " in metablob" << dendl;
+    le->commit.renamed_dirino = target->ino();
+  }
+  
+  if (target && target->is_dir()) {
+    ceph_assert(destdn);
+    mdcache->project_subtree_rename(target, straydir, destdir);
+  }
+
+  if (in && in->is_dir()) {
+    ceph_assert(srcdn);
+    mdcache->project_subtree_rename(in, destdir, srcdir);
+  }
+
+  if (mdr && !mdr->more()->peer_update_journaled) {
+    ceph_assert(le->commit.empty());
+    mdlog->cancel_entry(le);
+    mut->ls = NULL;
+    _rename_rollback_finish(mut, mdr, srcdn, srcdnpv, destdn, straydn, splits, finish_mdr);
+  } else {
+    ceph_assert(!le->commit.empty());
+    if (mdr)
+      mdr->more()->peer_update_journaled = false;
+    MDSLogContextBase *fin = new C_MDS_LoggedRenameRollback(this, mut, mdr,
+							    srcdn, srcdnpv, destdn, straydn,
+							    splits, finish_mdr);
+    submit_mdlog_entry(le, fin, mdr, __func__);
+    mdlog->flush();
+  }
+}
+
+/*
+ * Second half of a peer rename rollback: make the linkages projected by the
+ * rollback current, apply the mutation, fix up subtree bounds and release
+ * what the request still holds.
+ *
+ * @param mut        mutation carrying the projected rollback state
+ * @param mdr        originating request; may be null
+ * @param srcdn      source dentry, or null
+ * @param srcdnpv    version handed to srcdn->mark_dirty()
+ * @param destdn     destination dentry, or null
+ * @param straydn    stray dentry, or null
+ * @param splits     snap messages handed to MDCache::send_snaps() unless the
+ *                   MDS is in resolve
+ * @param finish_mdr finish the request rather than only clearing its
+ *                   peer_rolling_back flag
+ */
+void Server::_rename_rollback_finish(MutationRef& mut, MDRequestRef& mdr, CDentry *srcdn,
+				     version_t srcdnpv, CDentry *destdn, CDentry *straydn,
+				     map<client_t,ref_t<MClientSnap>> splits[2], bool finish_mdr)
+{
+  dout(10) << "_rename_rollback_finish " << mut->reqid << dendl;
+
+  // stray and dest: drop the current linkage, then make the projected
+  // linkage current
+  if (straydn) {
+    straydn->get_dir()->unlink_inode(straydn);
+    straydn->pop_projected_linkage();
+  }
+  if (destdn) {
+    destdn->get_dir()->unlink_inode(destdn);
+    destdn->pop_projected_linkage();
+  }
+  if (srcdn) {
+    srcdn->pop_projected_linkage();
+    // only the srcdn auth dirties the dentry and flags a primary inode as auth
+    if (srcdn->authority().first == mds->get_nodeid()) {
+      srcdn->mark_dirty(srcdnpv, mut->ls);
+      if (srcdn->get_linkage()->is_primary())
+	srcdn->get_linkage()->get_inode()->state_set(CInode::STATE_AUTH);
+    }
+  }
+
+  mut->apply();
+
+  // a directory that is back under srcdn: adjust the subtree map, passing
+  // destdn's dir as the other side of the rename
+  if (srcdn && srcdn->get_linkage()->is_primary()) {
+    CInode *in = srcdn->get_linkage()->get_inode();
+    if (in && in->is_dir()) {
+      ceph_assert(destdn);
+      mdcache->adjust_subtree_after_rename(in, destdn->get_dir(), true);
+    }
+  }
+
+  if (destdn) {
+    CInode *oldin = destdn->get_linkage()->get_inode();
+    // update subtree map?
+    if (oldin && oldin->is_dir()) {
+      ceph_assert(straydn);
+      mdcache->adjust_subtree_after_rename(oldin, straydn->get_dir(), true);
+    }
+  }
+
+  if (mds->is_resolve()) {
+    // in resolve: try to trim the non-auth subtree containing straydn's
+    // dir, or destdn's dir when there is no straydn
+    CDir *root = NULL;
+    if (straydn)
+      root = mdcache->get_subtree_root(straydn->get_dir());
+    else if (destdn)
+      root = mdcache->get_subtree_root(destdn->get_dir());
+    if (root)
+      mdcache->try_trim_non_auth_subtree(root);
+  } else {
+    mdcache->send_snaps(splits[1]);
+    mdcache->send_snaps(splits[0]);
+  }
+
+  if (mdr) {
+    MDSContext::vec finished;
+    if (mdr->more()->is_ambiguous_auth) {
+      // NOTE(review): srcdn is dereferenced without a null check here;
+      // presumably ambiguous auth implies srcdn is set -- confirm.
+      if (srcdn->is_auth())
+	mdr->more()->rename_inode->unfreeze_inode(finished);
+
+      mdr->more()->rename_inode->clear_ambiguous_auth(finished);
+      mdr->more()->is_ambiguous_auth = false;
+    }
+    // wake whatever was collected while unfreezing / clearing ambiguous auth
+    mds->queue_waiters(finished);
+    if (finish_mdr || mdr->aborted)
+      mdcache->request_finish(mdr);
+    else
+      mdr->more()->peer_rolling_back = false;
+  }
+
+  mdcache->finish_rollback(mut->reqid, mdr);
+
+  mut->cleanup();
+}
+
+/*
+ * Process a peer's ack of a rename-prep request.
+ *
+ * Records the peer, marks the renamed inode ambiguous-auth when the srcdn
+ * auth confirms the remote frozen authpin, notes either the witness or the
+ * extra witnesses it reported, keeps any exported inode state, and
+ * re-dispatches the client request once no peer is outstanding.
+ */
+void Server::handle_peer_rename_prep_ack(MDRequestRef& mdr, const cref_t<MMDSPeerRequest> &ack)
+{
+  dout(10) << "handle_peer_rename_prep_ack " << *mdr
+	   << " witnessed by " << ack->get_source()
+	   << " " << *ack << dendl;
+  const mds_rank_t peer = mds_rank_t(ack->get_source().num());
+  auto &state = *mdr->more();
+
+  // remember that this rank took part in the request
+  state.peers.insert(peer);
+  const bool srcdn_auth_froze =
+    state.srcdn_auth_mds == peer && state.is_remote_frozen_authpin;
+  if (srcdn_auth_froze && !state.is_ambiguous_auth)
+    mdr->set_ambiguous_auth(state.rename_inode);
+
+  // a rank may only witness once
+  ceph_assert(state.witnessed.count(peer) == 0);
+  if (ack->is_interrupted()) {
+    dout(10) << " peer request interrupted, noop" << dendl;
+  } else if (!ack->witnesses.empty()) {
+    // the peer asks for more witnesses instead of witnessing itself
+    dout(10) << " extra witnesses (srcdn replicas) are " << ack->witnesses << dendl;
+    state.extra_witnesses = ack->witnesses;
+    state.extra_witnesses.erase(mds->get_nodeid());  // not me!
+  } else {
+    state.witnessed.insert(peer);
+    if (!ack->is_not_journaled())
+      state.has_journaled_peers = true;
+  }
+
+  // keep the exported source inode, if the ack carries one
+  if (ack->inode_export.length()) {
+    dout(10) << " got srci import" << dendl;
+    state.inode_import.share(ack->inode_export);
+    state.inode_import_v = ack->inode_export_v;
+  }
+
+  // this peer is no longer outstanding
+  ceph_assert(state.waiting_on_peer.count(peer));
+  state.waiting_on_peer.erase(peer);
+
+  if (!state.waiting_on_peer.empty()) {
+    dout(10) << "still waiting on peers " << state.waiting_on_peer << dendl;
+    return;
+  }
+  dispatch_client_request(mdr);  // go again!
+}
+
+/*
+ * Process a rename-notify ack on a peer request: stop waiting on the sender
+ * and, once nobody is outstanding, re-dispatch the peer request (if the
+ * request still has one attached).  Acks from ranks we are not waiting on
+ * are ignored.
+ */
+void Server::handle_peer_rename_notify_ack(MDRequestRef& mdr, const cref_t<MMDSPeerRequest> &ack)
+{
+  dout(10) << "handle_peer_rename_notify_ack " << *mdr << " from mds."
+	   << ack->get_source() << dendl;
+  ceph_assert(mdr->is_peer());
+  const mds_rank_t sender = mds_rank_t(ack->get_source().num());
+
+  auto &pending = mdr->more()->waiting_on_peer;
+  if (pending.count(sender) == 0)
+    return;
+  pending.erase(sender);
+
+  if (!pending.empty()) {
+    dout(10) << " still waiting for rename notify acks from "
+	     << pending << dendl;
+    return;
+  }
+  if (mdr->peer_request)
+    dispatch_peer_request(mdr);
+}
+
+/*
+ * Clear the MDS_RANK_NONE entry from the request's waiting_on_peer set
+ * (presumably the placeholder for a pending session flush -- confirm
+ * against the code that inserts it) and re-dispatch the peer request once
+ * the set is empty.
+ */
+void Server::_peer_rename_sessions_flushed(MDRequestRef& mdr)
+{
+  dout(10) << "_peer_rename_sessions_flushed " << *mdr << dendl;
+
+  auto &pending = mdr->more()->waiting_on_peer;
+  if (pending.count(MDS_RANK_NONE) == 0)
+    return;
+  pending.erase(MDS_RANK_NONE);
+
+  if (!pending.empty()) {
+    dout(10) << " still waiting for rename notify acks from "
+	<< pending << dendl;
+    return;
+  }
+  if (mdr->peer_request)
+    dispatch_peer_request(mdr);
+}
+
+// snaps
+/*
+ * Handle a client lssnap request: build a readdir-style listing of the
+ * snapshots reported by the directory's snaprealm.
+ *
+ * The reply payload is an empty DirStat, the entry count (__u32), the
+ * readdir flags (__u16) and then, per snapshot, its name, a lease and the
+ * directory's inodestat at that snapid.  Listing resumes after the snapshot
+ * named by path2 (if any) and stops early once max_entries or max_bytes
+ * would be exceeded.
+ *
+ * Replies -CEPHFS_ENOTDIR if the target is not a directory.  Returns
+ * without replying when try_get_auth_inode(), try_rdlock_snap_layout() or
+ * check_access() do not succeed.
+ *
+ * This function takes responsibility for the passed mdr.
+ */
+void Server::handle_client_lssnap(MDRequestRef& mdr)
+{
+  const cref_t<MClientRequest> &req = mdr->client_request;
+
+  // traverse to path
+  CInode *diri = try_get_auth_inode(mdr, req->get_filepath().get_ino());
+  if (!diri)
+    return;
+
+  if (!diri->is_dir()) {
+    respond_to_request(mdr, -CEPHFS_ENOTDIR);
+    return;
+  }
+  dout(10) << "lssnap on " << *diri << dendl;
+
+  // lock snap
+  if (!mds->locker->try_rdlock_snap_layout(diri, mdr))
+    return;
+
+  if (!check_access(mdr, diri, MAY_READ))
+    return;
+
+  SnapRealm *realm = diri->find_snaprealm();
+  map<snapid_t,const SnapInfo*> infomap;
+  realm->get_snap_info(infomap, diri->get_oldest_snap());
+
+  // a zero limit from the client means "no explicit limit"
+  unsigned max_entries = req->head.args.readdir.max_entries;
+  if (!max_entries)
+    max_entries = infomap.size();
+  int max_bytes = req->head.args.readdir.max_bytes;
+  if (!max_bytes)
+    // make sure at least one item can be encoded
+    max_bytes = (512 << 10) + g_conf()->mds_max_xattr_pairs_size;
+
+  // resume point: list only snapshots with an id above last_snapid
+  __u64 last_snapid = 0;
+  string offset_str = req->get_path2();
+  if (!offset_str.empty())
+    last_snapid = realm->resolve_snapname(offset_str, diri->ino());
+
+  //Empty DirStat
+  bufferlist dirbl;
+  static DirStat empty;
+  CDir::encode_dirstat(dirbl, mdr->session->info, empty);
+
+  // Reserve room for the reply front: the dirstat just encoded plus the
+  // entry count (__u32) and the flags (__u16) appended after the loop.
+  // (This used to subtract the __u32, under-reserving by 8 bytes.)
+  max_bytes -= dirbl.length() + sizeof(__u32) + sizeof(__u8) * 2;
+
+  __u32 num = 0;
+  bufferlist dnbl;
+  auto p = infomap.upper_bound(last_snapid);
+  for (; p != infomap.end() && num < max_entries; ++p) {
+    dout(10) << p->first << " -> " << *p->second << dendl;
+
+    // actual
+    // snapshots recorded on another inode are listed under their long name
+    string snap_name;
+    if (p->second->ino == diri->ino())
+      snap_name = p->second->name;
+    else
+      snap_name = p->second->get_long_name();
+
+    // stop before encoding an entry whose name + lease would not fit
+    unsigned start_len = dnbl.length();
+    if (int(start_len + snap_name.length() + sizeof(__u32) + sizeof(LeaseStat)) > max_bytes)
+      break;
+
+    encode(snap_name, dnbl);
+    //infinite lease
+    LeaseStat e(CEPH_LEASE_VALID, -1, 0);
+    mds->locker->encode_lease(dnbl, mdr->session->info, e);
+    dout(20) << "encode_infinite_lease" << dendl;
+
+    int r = diri->encode_inodestat(dnbl, mdr->session, realm, p->first, max_bytes - (int)dnbl.length());
+    if (r < 0) {
+      // the inodestat did not fit: roll dnbl back to the previous entry
+      bufferlist keep;
+      keep.substr_of(dnbl, 0, start_len);
+      dnbl.swap(keep);
+      break;
+    }
+    ++num;
+  }
+
+  encode(num, dirbl);
+  // END when the listing ran to the last snapshot; COMPLETE only if it
+  // also started from the beginning
+  __u16 flags = 0;
+  if (p == infomap.end()) {
+    flags = CEPH_READDIR_FRAG_END;
+    if (last_snapid == 0)
+      flags |= CEPH_READDIR_FRAG_COMPLETE;
+  }
+  encode(flags, dirbl);
+  dirbl.claim_append(dnbl);
+  
+  mdr->reply_extra_bl = dirbl;
+  mdr->tracei = diri;
+  respond_to_request(mdr, 0);
+}
+
+
+// MKSNAP
+
+/*
+ * Log context for mksnap: keeps the directory inode and a copy of the new
+ * SnapInfo, and forwards them to Server::_mksnap_finish() from finish().
+ */
+struct C_MDS_mksnap_finish : public ServerLogContext {
+  CInode *diri;
+  SnapInfo info;
+  C_MDS_mksnap_finish(Server *srv, MDRequestRef& req, CInode *dir_inode, SnapInfo &snap_info)
+    : ServerLogContext(srv, req),
+      diri(dir_inode),
+      info(snap_info)
+  {}
+  // the result code is not consulted
+  void finish(int /*r*/) override {
+    server->_mksnap_finish(mdr, diri, info);
+  }
+};
+
+/*
+ * Handle a client mksnap request: create a snapshot, named by the last
+ * dentry of the request filepath, on the directory given by the filepath's
+ * ino.
+ *
+ * Error replies:
+ *   -CEPHFS_EPERM    snapshots not allowed by the mdsmap, target is a system
+ *                    dir other than root, caller uid outside
+ *                    [mds_snap_min_uid, mds_snap_max_uid], or the directory
+ *                    is below a subvolume directory
+ *   -CEPHFS_ENOTDIR  target is not a directory
+ *   -CEPHFS_EMLINK   the directory's realm already has max_snaps_per_dir snaps
+ *   -CEPHFS_EEXIST   a snapshot with that name already exists
+ *   -CEPHFS_EINVAL   name is empty or starts with '_'
+ *
+ * On success the inode and snaprealm changes are projected and journaled as
+ * an EUpdate("mksnap"); the reply is sent from _mksnap_finish().
+ *
+ * The handler is re-entrant: several steps return early after handing a
+ * C_MDS_RetryRequest (or the mdr itself) to a helper, and the ALL_LOCKED
+ * flag and more()->stid record which steps are already done.
+ *
+ * This function takes responsibility for the passed mdr.
+ */
+void Server::handle_client_mksnap(MDRequestRef& mdr)
+{
+  const cref_t<MClientRequest> &req = mdr->client_request;
+  // make sure we have as new a map as the client
+  if (req->get_mdsmap_epoch() > mds->mdsmap->get_epoch()) {
+    mds->wait_for_mdsmap(req->get_mdsmap_epoch(), new C_MDS_RetryRequest(mdcache, mdr));
+    return;
+  }
+  if (!mds->mdsmap->allows_snaps()) {
+    // you can't make snapshots until you set an option right now
+    dout(5) << "new snapshots are disabled for this fs" << dendl;
+    respond_to_request(mdr, -CEPHFS_EPERM);
+    return;
+  }
+
+  CInode *diri = try_get_auth_inode(mdr, req->get_filepath().get_ino());
+  if (!diri)
+    return;
+
+  // dir only
+  if (!diri->is_dir()) {
+    respond_to_request(mdr, -CEPHFS_ENOTDIR);
+    return;
+  }
+  if (diri->is_system() && !diri->is_root()) {
+    // no snaps in system dirs (root is ok)
+    dout(5) << "is an internal system dir" << dendl;
+    respond_to_request(mdr, -CEPHFS_EPERM);
+    return;
+  }
+  
+  std::string_view snapname = req->get_filepath().last_dentry();
+
+  // only callers whose uid lies in the configured range may snapshot
+  if (mdr->client_request->get_caller_uid() < g_conf()->mds_snap_min_uid || mdr->client_request->get_caller_uid() > g_conf()->mds_snap_max_uid) {
+    dout(20) << "mksnap " << snapname << " on " << *diri << " denied to uid " << mdr->client_request->get_caller_uid() << dendl;
+    respond_to_request(mdr, -CEPHFS_EPERM);
+    return;
+  }
+  
+  dout(10) << "mksnap " << snapname << " on " << *diri << dendl;
+
+  // lock snap
+  // (skipped on re-entry once ALL_LOCKED has been recorded)
+  if (!(mdr->locking_state & MutationImpl::ALL_LOCKED)) {
+    MutationImpl::LockOpVec lov;
+    lov.add_xlock(&diri->snaplock);
+    if (!mds->locker->acquire_locks(mdr, lov))
+      return;
+
+    // also rdlock the snap layout of the parent directory's inode, if any
+    if (CDentry *pdn = diri->get_projected_parent_dn(); pdn) {
+      if (!mds->locker->try_rdlock_snap_layout(pdn->get_dir()->get_inode(), mdr))
+	return;
+    }
+    mdr->locking_state |= MutationImpl::ALL_LOCKED;
+  }
+
+  if (!check_access(mdr, diri, MAY_WRITE|MAY_SNAPSHOT))
+    return;
+
+  // refuse when the enclosing realm reports a subvolume ino other than diri
+  if (inodeno_t subvol_ino = diri->find_snaprealm()->get_subvolume_ino();
+      (subvol_ino && subvol_ino != diri->ino())) {
+    dout(5) << "is a descendent of a subvolume dir" << dendl;
+    respond_to_request(mdr, -CEPHFS_EPERM);
+    return;
+  }
+
+  // check if we can create any more snapshots
+  // we don't allow any more if we are already at or beyond the limit
+  if (diri->snaprealm &&
+      diri->snaprealm->get_snaps().size() >= max_snaps_per_dir) {
+    respond_to_request(mdr, -CEPHFS_EMLINK);
+    return;
+  }
+
+  // make sure name is unique
+  if (diri->snaprealm &&
+      diri->snaprealm->exists(snapname)) {
+    respond_to_request(mdr, -CEPHFS_EEXIST);
+    return;
+  }
+  // NOTE(review): a leading '_' is presumably reserved for snapshot names
+  // inherited from other realms -- confirm.
+  if (snapname.length() == 0 ||
+      snapname[0] == '_') {
+    respond_to_request(mdr, -CEPHFS_EINVAL);
+    return;
+  }
+
+  // allocate a snapid
+  // first pass: ask the snap client to prepare; it fills in stid and
+  // snapidbl, and the retry context brings us back here
+  if (!mdr->more()->stid) {
+    // prepare an stid
+    mds->snapclient->prepare_create(diri->ino(), snapname,
+				    mdr->get_mds_stamp(),
+				    &mdr->more()->stid, &mdr->more()->snapidbl,
+				    new C_MDS_RetryRequest(mdcache, mdr));
+    return;
+  }
+
+  // later pass: the prepared snapid is encoded in snapidbl
+  version_t stid = mdr->more()->stid;
+  snapid_t snapid;
+  auto p = mdr->more()->snapidbl.cbegin();
+  decode(snapid, p);
+  dout(10) << " stid " << stid << " snapid " << snapid << dendl;
+
+  ceph_assert(mds->snapclient->get_cached_version() >= stid);
+
+  // optional snapshot metadata carried in the request data
+  SnapPayload payload;
+  if (req->get_data().length()) {
+    try {
+      auto iter = req->get_data().cbegin();
+      decode(payload, iter);
+    } catch (const ceph::buffer::error &e) {
+      // backward compat -- client sends xattr bufferlist. however,
+      // that is not used anywhere -- so (log and) ignore.
+      dout(20) << ": no metadata in payload (old client?)" << dendl;
+    }
+  }
+
+  // journal
+  SnapInfo info;
+  info.ino = diri->ino();
+  info.snapid = snapid;
+  info.name = snapname;
+  info.stamp = mdr->get_op_stamp();
+  info.metadata = payload.metadata;
+
+  // project the inode: bump ctime, advance rctime if needed, count the snap
+  auto pi = diri->project_inode(mdr, false, true);
+  pi.inode->ctime = info.stamp;
+  if (info.stamp > pi.inode->rstat.rctime)
+    pi.inode->rstat.rctime = info.stamp;
+  pi.inode->rstat.rsnaps++;
+  pi.inode->version = diri->pre_dirty();
+
+  // project the snaprealm
+  auto &newsnap = *pi.snapnode;
+  newsnap.created = snapid;
+  // insert the new snap, overwriting any entry already stored under snapid
+  auto em = newsnap.snaps.emplace(std::piecewise_construct, std::forward_as_tuple(snapid), std::forward_as_tuple(info));
+  if (!em.second)
+    em.first->second = info;
+  newsnap.seq = snapid;
+  newsnap.last_created = snapid;
+
+  // journal the inode changes
+  mdr->ls = mdlog->get_current_segment();
+  EUpdate *le = new EUpdate(mdlog, "mksnap");
+  mdlog->start_entry(le);
+
+  le->metablob.add_client_req(req->get_reqid(), req->get_oldest_client_tid());
+  le->metablob.add_table_transaction(TABLE_SNAP, stid);
+  mdcache->predirty_journal_parents(mdr, &le->metablob, diri, 0, PREDIRTY_PRIMARY, false);
+  mdcache->journal_dirty_inode(mdr.get(), &le->metablob, diri);
+
+  // journal the snaprealm changes
+  submit_mdlog_entry(le, new C_MDS_mksnap_finish(this, mdr, diri, info),
+                     mdr, __func__);
+  mdlog->flush();
+}
+
+/*
+ * Finish a mksnap: apply the projected changes, commit the snap table
+ * transaction, notify other MDSs and realm users, then reply with the
+ * directory as trace inode and the new snapid.
+ */
+void Server::_mksnap_finish(MDRequestRef& mdr, CInode *diri, SnapInfo &info)
+{
+  dout(10) << "_mksnap_finish " << *mdr << " " << info << dendl;
+
+  // must be decided before apply(): a directory without a snaprealm at
+  // this point is reported as a split, otherwise as a create
+  int op;
+  if (diri->snaprealm)
+    op = CEPH_SNAP_OP_CREATE;
+  else
+    op = CEPH_SNAP_OP_SPLIT;
+
+  mdr->apply();
+
+  const version_t stid = mdr->more()->stid;
+  mds->snapclient->commit(stid, mdr->ls);
+
+  // create snap
+  dout(10) << "snaprealm now " << *diri->snaprealm << dendl;
+
+  // tell the other MDSs, then invalidate/notify for this realm
+  mdcache->send_snap_update(diri, stid, op);
+  mdcache->do_realm_invalidate_and_update_notify(diri, op);
+
+  // reply
+  mdr->in[0] = diri;
+  mdr->snapid = info.snapid;
+  mdr->tracei = diri;
+  respond_to_request(mdr, 0);
+}
+
+
+// RMSNAP
+
+/*
+ * Log context for rmsnap: keeps the directory inode and the snapid being
+ * removed, and forwards them to Server::_rmsnap_finish() from finish().
+ */
+struct C_MDS_rmsnap_finish : public ServerLogContext {
+  CInode *diri;
+  snapid_t snapid;
+  C_MDS_rmsnap_finish(Server *srv, MDRequestRef& req, CInode *dir_inode, snapid_t removed)
+    : ServerLogContext(srv, req),
+      diri(dir_inode),
+      snapid(removed)
+  {}
+  // the result code is not consulted
+  void finish(int /*r*/) override {
+    server->_rmsnap_finish(mdr, diri, snapid);
+  }
+};
+
+/*
+ * Handle a client rmsnap request: remove the snapshot named by the last
+ * dentry of the request filepath from the directory given by the
+ * filepath's ino.
+ *
+ * Error replies:
+ *   -CEPHFS_ENOTDIR  target is not a directory
+ *   -CEPHFS_EPERM    caller uid outside [mds_snap_min_uid, mds_snap_max_uid]
+ *   -CEPHFS_EINVAL   name is empty or starts with '_'
+ *   -CEPHFS_ENOENT   the directory has no snaprealm or no such snapshot
+ *
+ * On success the inode and snaprealm changes are projected and journaled as
+ * an EUpdate("rmsnap"); the reply is sent from _rmsnap_finish().
+ *
+ * The handler is re-entrant: the ALL_LOCKED flag and more()->stid record
+ * which steps have already completed on an earlier pass.
+ *
+ * This function takes responsibility for the passed mdr.
+ */
+void Server::handle_client_rmsnap(MDRequestRef& mdr)
+{
+  const cref_t<MClientRequest> &req = mdr->client_request;
+
+  CInode *diri = try_get_auth_inode(mdr, req->get_filepath().get_ino());
+  if (!diri)
+    return;
+
+  if (!diri->is_dir()) {
+    respond_to_request(mdr, -CEPHFS_ENOTDIR);
+    return;
+  }
+
+  std::string_view snapname = req->get_filepath().last_dentry();
+
+  // only callers whose uid lies in the configured range may remove snaps
+  if (mdr->client_request->get_caller_uid() < g_conf()->mds_snap_min_uid || mdr->client_request->get_caller_uid() > g_conf()->mds_snap_max_uid) {
+    dout(20) << "rmsnap " << snapname << " on " << *diri << " denied to uid " << mdr->client_request->get_caller_uid() << dendl;
+    respond_to_request(mdr, -CEPHFS_EPERM);
+    return;
+  }
+
+  dout(10) << "rmsnap " << snapname << " on " << *diri << dendl;
+
+  // does snap exist?
+  if (snapname.length() == 0 || snapname[0] == '_') {
+    respond_to_request(mdr, -CEPHFS_EINVAL);   // can't prune a parent snap, currently.
+    return;
+  }
+  if (!diri->snaprealm || !diri->snaprealm->exists(snapname)) {
+    respond_to_request(mdr, -CEPHFS_ENOENT);
+    return;
+  }
+  snapid_t snapid = diri->snaprealm->resolve_snapname(snapname, diri->ino());
+  dout(10) << " snapname " << snapname << " is " << snapid << dendl;
+
+  // lock snap (skipped on re-entry once ALL_LOCKED has been recorded)
+  if (!(mdr->locking_state & MutationImpl::ALL_LOCKED)) {
+    MutationImpl::LockOpVec lov;
+    lov.add_xlock(&diri->snaplock);
+    if (!mds->locker->acquire_locks(mdr, lov))
+      return;
+    // also rdlock the snap layout of the parent directory's inode, if any
+    if (CDentry *pdn = diri->get_projected_parent_dn(); pdn) {
+      if (!mds->locker->try_rdlock_snap_layout(pdn->get_dir()->get_inode(), mdr))
+	return;
+    }
+    mdr->locking_state |= MutationImpl::ALL_LOCKED;
+  }
+
+  if (!check_access(mdr, diri, MAY_WRITE|MAY_SNAPSHOT))
+    return;
+
+  // prepare
+  // first pass: ask the snap client to prepare the destroy; it fills in
+  // stid and snapidbl, and the retry context brings us back here
+  if (!mdr->more()->stid) {
+    mds->snapclient->prepare_destroy(diri->ino(), snapid,
+				     &mdr->more()->stid, &mdr->more()->snapidbl,
+				     new C_MDS_RetryRequest(mdcache, mdr));
+    return;
+  }
+  // later pass: snapidbl holds the sequence number for this removal
+  version_t stid = mdr->more()->stid;
+  auto p = mdr->more()->snapidbl.cbegin();
+  snapid_t seq;
+  decode(seq, p);  
+  dout(10) << " stid is " << stid << ", seq is " << seq << dendl;
+
+  ceph_assert(mds->snapclient->get_cached_version() >= stid);
+
+  // journal
+  // project the inode: bump ctime, advance rctime if needed, uncount the snap
+  auto pi = diri->project_inode(mdr, false, true);
+  pi.inode->version = diri->pre_dirty();
+  pi.inode->ctime = mdr->get_op_stamp();
+  if (mdr->get_op_stamp() > pi.inode->rstat.rctime)
+    pi.inode->rstat.rctime = mdr->get_op_stamp();
+  pi.inode->rstat.rsnaps--;
+  
+  mdr->ls = mdlog->get_current_segment();
+  EUpdate *le = new EUpdate(mdlog, "rmsnap");
+  mdlog->start_entry(le);
+  
+  // project the snaprealm
+  auto &newnode = *pi.snapnode;
+  newnode.snaps.erase(snapid);
+  newnode.seq = seq;
+  newnode.last_destroyed = seq;
+
+  le->metablob.add_client_req(req->get_reqid(), req->get_oldest_client_tid());
+  le->metablob.add_table_transaction(TABLE_SNAP, stid);
+  mdcache->predirty_journal_parents(mdr, &le->metablob, diri, 0, PREDIRTY_PRIMARY, false);
+  mdcache->journal_dirty_inode(mdr.get(), &le->metablob, diri);
+
+  submit_mdlog_entry(le, new C_MDS_rmsnap_finish(this, mdr, diri, snapid),
+                     mdr, __func__);
+  mdlog->flush();
+}
+
+/*
+ * Finish an rmsnap: apply the projected changes, commit the snap table
+ * transaction, notify other MDSs and realm users, reply to the client and
+ * finally purge snapshot data that is no longer covered by the realm's
+ * remaining snaps.
+ *
+ * @param mdr    the request being completed
+ * @param diri   directory inode the snapshot was removed from
+ * @param snapid id of the removed snapshot (only logged here)
+ */
+void Server::_rmsnap_finish(MDRequestRef& mdr, CInode *diri, snapid_t snapid)
+{
+  dout(10) << "_rmsnap_finish " << *mdr << " " << snapid << dendl;
+  // stid is the snap table transaction id (a version_t, as in
+  // handle_client_rmsnap), not a snapshot id
+  const version_t stid = mdr->more()->stid;
+
+  mdr->apply();
+
+  mds->snapclient->commit(stid, mdr->ls);
+
+  dout(10) << "snaprealm now " << *diri->snaprealm << dendl;
+
+  // notify other mds
+  mdcache->send_snap_update(diri, stid, CEPH_SNAP_OP_DESTROY);
+
+  mdcache->do_realm_invalidate_and_update_notify(diri, CEPH_SNAP_OP_DESTROY);
+
+  // yay
+  mdr->in[0] = diri;
+  respond_to_request(mdr, 0);
+
+  // purge snapshot data
+  diri->purge_stale_snap_data(diri->snaprealm->get_snaps());
+}
+
+/*
+ * Log context for renamesnap: keeps the directory inode and the renamed
+ * snapid, and forwards them to Server::_renamesnap_finish() from finish().
+ */
+struct C_MDS_renamesnap_finish : public ServerLogContext {
+  CInode *diri;
+  snapid_t snapid;
+  C_MDS_renamesnap_finish(Server *srv, MDRequestRef& req, CInode *dir_inode, snapid_t renamed)
+    : ServerLogContext(srv, req),
+      diri(dir_inode),
+      snapid(renamed)
+  {}
+  // the result code is not consulted
+  void finish(int /*r*/) override {
+    server->_renamesnap_finish(mdr, diri, snapid);
+  }
+};
+
+/*
+ * Handle a client renamesnap request: rename a snapshot of one directory.
+ * filepath2 names the existing snapshot (source) and filepath the new name
+ * (destination); both must refer to the same ino.
+ *
+ * Error replies:
+ *   -CEPHFS_EINVAL   the two paths have different inos, or either name is
+ *                    empty or starts with '_'
+ *   -CEPHFS_ENOTDIR  target is not a directory
+ *   -CEPHFS_EPERM    caller uid outside [mds_snap_min_uid, mds_snap_max_uid]
+ *   -CEPHFS_ENOENT   the directory has no snaprealm or no such source snap
+ *   -CEPHFS_EEXIST   a snapshot with the destination name already exists
+ *
+ * On success the inode and snaprealm changes are projected and journaled as
+ * an EUpdate("renamesnap"); the reply is sent from _renamesnap_finish().
+ *
+ * The handler is re-entrant: the ALL_LOCKED flag and more()->stid record
+ * which steps have already completed on an earlier pass.
+ *
+ * This function takes responsibility for the passed mdr.
+ */
+void Server::handle_client_renamesnap(MDRequestRef& mdr)
+{
+  const cref_t<MClientRequest> &req = mdr->client_request;
+  // source and destination must live on the same directory inode
+  if (req->get_filepath().get_ino() != req->get_filepath2().get_ino()) {
+    respond_to_request(mdr, -CEPHFS_EINVAL);
+    return;
+  }
+
+  CInode *diri = try_get_auth_inode(mdr, req->get_filepath().get_ino());
+  if (!diri)
+    return;
+
+  if (!diri->is_dir()) { // dir only
+    respond_to_request(mdr, -CEPHFS_ENOTDIR);
+    return;
+  }
+
+  // only callers whose uid lies in the configured range may rename snaps
+  if (mdr->client_request->get_caller_uid() < g_conf()->mds_snap_min_uid ||
+      mdr->client_request->get_caller_uid() > g_conf()->mds_snap_max_uid) {
+    respond_to_request(mdr, -CEPHFS_EPERM);
+    return;
+  }
+
+  // note the order: filepath carries the NEW name, filepath2 the OLD one
+  std::string_view dstname = req->get_filepath().last_dentry();
+  std::string_view srcname = req->get_filepath2().last_dentry();
+  dout(10) << "renamesnap " << srcname << "->" << dstname << " on " << *diri << dendl;
+
+  if (srcname.length() == 0 || srcname[0] == '_') {
+    respond_to_request(mdr, -CEPHFS_EINVAL);   // can't rename a parent snap.
+    return;
+  }
+  if (!diri->snaprealm || !diri->snaprealm->exists(srcname)) {
+    respond_to_request(mdr, -CEPHFS_ENOENT);
+    return;
+  }
+  if (dstname.length() == 0 || dstname[0] == '_') {
+    respond_to_request(mdr, -CEPHFS_EINVAL);
+    return;
+  }
+  if (diri->snaprealm->exists(dstname)) {
+    respond_to_request(mdr, -CEPHFS_EEXIST);
+    return;
+  }
+
+  snapid_t snapid = diri->snaprealm->resolve_snapname(srcname, diri->ino());
+  dout(10) << " snapname " << srcname << " is " << snapid << dendl;
+
+  // lock snap
+  // (skipped on re-entry once ALL_LOCKED has been recorded)
+  if (!(mdr->locking_state & MutationImpl::ALL_LOCKED)) {
+    MutationImpl::LockOpVec lov;
+    lov.add_xlock(&diri->snaplock);
+    if (!mds->locker->acquire_locks(mdr, lov))
+      return;
+    // also rdlock the snap layout of the parent directory's inode, if any
+    if (CDentry *pdn = diri->get_projected_parent_dn(); pdn) {
+      if (!mds->locker->try_rdlock_snap_layout(pdn->get_dir()->get_inode(), mdr))
+	return;
+    }
+    mdr->locking_state |= MutationImpl::ALL_LOCKED;
+  }
+
+  if (!check_access(mdr, diri, MAY_WRITE|MAY_SNAPSHOT))
+    return;
+
+    // prepare
+  // first pass: ask the snap client to prepare the update; it fills in
+  // stid, and the retry context brings us back here
+  if (!mdr->more()->stid) {
+    mds->snapclient->prepare_update(diri->ino(), snapid, dstname, utime_t(),
+				    &mdr->more()->stid,
+				    new C_MDS_RetryRequest(mdcache, mdr));
+    return;
+  }
+
+  version_t stid = mdr->more()->stid;
+  dout(10) << " stid is " << stid << dendl;
+
+  ceph_assert(mds->snapclient->get_cached_version() >= stid);
+
+  // journal
+  // project the inode: bump ctime and advance rctime if needed
+  auto pi = diri->project_inode(mdr, false, true);
+  pi.inode->ctime = mdr->get_op_stamp();
+  if (mdr->get_op_stamp() > pi.inode->rstat.rctime)
+    pi.inode->rstat.rctime = mdr->get_op_stamp();
+  pi.inode->version = diri->pre_dirty();
+
+  // project the snaprealm
+  // only the name stored under snapid changes
+  auto &newsnap = *pi.snapnode;
+  auto it = newsnap.snaps.find(snapid);
+  ceph_assert(it != newsnap.snaps.end());
+  it->second.name = dstname;
+
+  // journal the inode changes
+  mdr->ls = mdlog->get_current_segment();
+  EUpdate *le = new EUpdate(mdlog, "renamesnap");
+  mdlog->start_entry(le);
+
+  le->metablob.add_client_req(req->get_reqid(), req->get_oldest_client_tid());
+  le->metablob.add_table_transaction(TABLE_SNAP, stid);
+  mdcache->predirty_journal_parents(mdr, &le->metablob, diri, 0, PREDIRTY_PRIMARY, false);
+  mdcache->journal_dirty_inode(mdr.get(), &le->metablob, diri);
+
+  // journal the snaprealm changes
+  submit_mdlog_entry(le, new C_MDS_renamesnap_finish(this, mdr, diri, snapid),
+                     mdr, __func__);
+  mdlog->flush();
+}
+
+/*
+ * Finish a renamesnap: apply the projected changes, commit the snap table
+ * transaction, notify other MDSs and realm users, then reply with the
+ * directory as trace inode and the renamed snapid.
+ */
+void Server::_renamesnap_finish(MDRequestRef& mdr, CInode *diri, snapid_t snapid)
+{
+  dout(10) << "_renamesnap_finish " << *mdr << " " << snapid << dendl;
+
+  mdr->apply();
+
+  const version_t stid = mdr->more()->stid;
+  mds->snapclient->commit(stid, mdr->ls);
+
+  dout(10) << "snaprealm now " << *diri->snaprealm << dendl;
+
+  // tell the other MDSs, then invalidate/notify for this realm
+  const int op = CEPH_SNAP_OP_UPDATE;
+  mdcache->send_snap_update(diri, stid, op);
+  mdcache->do_realm_invalidate_and_update_notify(diri, op);
+
+  // reply
+  mdr->in[0] = diri;
+  mdr->tracei = diri;
+  mdr->snapid = snapid;
+  respond_to_request(mdr, 0);
+}
+
+/**
+ * Tell whether client @p c is still listed in client_reconnect_gather,
+ * i.e. the server is still waiting for that client to reconnect.
+ */
+bool Server::waiting_for_reconnect(client_t c) const
+{
+  const auto pos = client_reconnect_gather.find(c);
+  return pos != client_reconnect_gather.end();
+}
+
+/*
+ * Dump the set of clients still expected to reconnect as a
+ * "reconnect_status" object section on the given formatter.
+ */
+void Server::dump_reconnect_status(Formatter *f) const
+{
+  f->open_object_section("reconnect_status");
+  {
+    auto &out = f->dump_stream("client_reconnect_gather");
+    out << client_reconnect_gather;
+  }
+  f->close_section();
+}
diff --git a/src/mds/Server.h b/src/mds/Server.h
new file mode 100644
index 000000000..e4791dc6b
--- /dev/null
+++ b/src/mds/Server.h
@@ -0,0 +1,534 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software 
+ * Foundation.  See file COPYING.
+ * 
+ */
+
+#ifndef CEPH_MDS_SERVER_H
+#define CEPH_MDS_SERVER_H
+
+#include <string_view>
+
+#include <common/DecayCounter.h>
+
+#include "include/common_fwd.h"
+
+#include "messages/MClientReconnect.h"
+#include "messages/MClientReply.h"
+#include "messages/MClientRequest.h"
+#include "messages/MClientSession.h"
+#include "messages/MClientSnap.h"
+#include "messages/MClientReclaim.h"
+#include "messages/MClientReclaimReply.h"
+#include "messages/MLock.h"
+
+#include "CInode.h"
+#include "MDSRank.h"
+#include "Mutation.h"
+#include "MDSContext.h"
+
+class OSDMap;
+class LogEvent;
+class EMetaBlob;
+class EUpdate;
+class MDLog;
+struct SnapInfo;
+class MetricsHandler;
+
+enum {  // NOTE(review): presumably the perf counter indices used by Server::create_logger() -- confirm
+  l_mdss_first = 1000,  // sentinel: the first real enumerator below is 1001
+  l_mdss_dispatch_client_request,
+  l_mdss_dispatch_peer_request,
+  l_mdss_handle_client_request,
+  l_mdss_handle_client_session,
+  l_mdss_handle_peer_request,
+  l_mdss_req_create_latency,  // start of the alphabetically ordered l_mdss_req_*_latency run
+  l_mdss_req_getattr_latency,
+  l_mdss_req_getfilelock_latency,
+  l_mdss_req_link_latency,
+  l_mdss_req_lookup_latency,
+  l_mdss_req_lookuphash_latency,
+  l_mdss_req_lookupino_latency,
+  l_mdss_req_lookupname_latency,
+  l_mdss_req_lookupparent_latency,
+  l_mdss_req_lookupsnap_latency,
+  l_mdss_req_lssnap_latency,
+  l_mdss_req_mkdir_latency,
+  l_mdss_req_mknod_latency,
+  l_mdss_req_mksnap_latency,
+  l_mdss_req_open_latency,
+  l_mdss_req_readdir_latency,
+  l_mdss_req_rename_latency,
+  l_mdss_req_renamesnap_latency,
+  l_mdss_req_rmdir_latency,
+  l_mdss_req_rmsnap_latency,
+  l_mdss_req_rmxattr_latency,
+  l_mdss_req_setattr_latency,
+  l_mdss_req_setdirlayout_latency,
+  l_mdss_req_setfilelock_latency,
+  l_mdss_req_setlayout_latency,
+  l_mdss_req_setxattr_latency,
+  l_mdss_req_symlink_latency,
+  l_mdss_req_unlink_latency,
+  l_mdss_cap_revoke_eviction,
+  l_mdss_cap_acquisition_throttle,
+  l_mdss_req_getvxattr_latency,  // sits after the cap counters, outside the alphabetical req_* run
+  l_mdss_last,  // sentinel: one past the last real enumerator
+};
+
+class Server {
+public:
+  using clock = ceph::coarse_mono_clock;
+  using time = ceph::coarse_mono_time;
+
+  enum class RecallFlags : uint64_t {
+    NONE = 0,
+    STEADY = (1<<0),
+    ENFORCE_MAX = (1<<1),
+    TRIM = (1<<2),
+    ENFORCE_LIVENESS = (1<<3),
+  };
+
+  explicit Server(MDSRank *m, MetricsHandler *metrics_handler);
+  ~Server() {
+    if (logger)  // stays nullptr until the logger is created: nothing registered to remove
+      g_ceph_context->get_perfcounters_collection()->remove(logger);
+    delete logger;
+    delete reconnect_done;
+  }
+  void create_logger();
+
+  // message handler
+  void dispatch(const cref_t<Message> &m);
+
+  void handle_osd_map();
+
+  // -- sessions and recovery --
+  bool waiting_for_reconnect(client_t c) const;
+  void dump_reconnect_status(Formatter *f) const;
+
+  // Read accessor for last_recall_state (a ceph::coarse_mono_time value).
+  // NOTE(review): presumably updated by recall_client_state() -- confirm in Server.cc.
+  time last_recalled() const { return last_recall_state; }
+
+  void handle_client_session(const cref_t<MClientSession> &m);
+  void _session_logged(Session *session, uint64_t state_seq, bool open, version_t pv,
+		       const interval_set<inodeno_t>& inos_to_free, version_t piv,
+		       const interval_set<inodeno_t>& inos_to_purge, LogSegment *ls);
+  version_t prepare_force_open_sessions(map<client_t,entity_inst_t> &cm,
+					map<client_t,client_metadata_t>& cmm,
+					map<client_t,pair<Session*,uint64_t> >& smap);
+  void finish_force_open_sessions(const map<client_t,pair<Session*,uint64_t> >& smap,
+				  bool dec_import=true);
+  void flush_client_sessions(set<client_t>& client_set, MDSGatherBuilder& gather);
+  void finish_flush_session(Session *session, version_t seq);
+  void terminate_sessions();
+  void find_idle_sessions();
+
+  void kill_session(Session *session, Context *on_safe);
+  size_t apply_blocklist();
+  void journal_close_session(Session *session, int state, Context *on_safe);
+
+  size_t get_num_pending_reclaim() const { return client_reclaim_gather.size(); }  // number of entries in client_reclaim_gather
+  Session *find_session_by_uuid(std::string_view uuid);
+  void reclaim_session(Session *session, const cref_t<MClientReclaim> &m);
+  void finish_reclaim_session(Session *session, const ref_t<MClientReclaimReply> &reply=nullptr);
+  void handle_client_reclaim(const cref_t<MClientReclaim> &m);
+
+  void reconnect_clients(MDSContext *reconnect_done_);
+  void handle_client_reconnect(const cref_t<MClientReconnect> &m);
+  void infer_supported_features(Session *session, client_metadata_t& client_metadata);
+  void update_required_client_features();
+
+  //void process_reconnect_cap(CInode *in, int from, ceph_mds_cap_reconnect& capinfo);
+  void reconnect_gather_finish();
+  void reconnect_tick();
+  void recover_filelocks(CInode *in, bufferlist locks, int64_t client);
+
+  std::pair<bool, uint64_t> recall_client_state(MDSGatherBuilder* gather, RecallFlags=RecallFlags::NONE);
+  void force_clients_readonly();
+
+  // -- requests --
+  void handle_client_request(const cref_t<MClientRequest> &m);
+
+  void journal_and_reply(MDRequestRef& mdr, CInode *tracei, CDentry *tracedn,
+			 LogEvent *le, MDSLogContextBase *fin);
+  void submit_mdlog_entry(LogEvent *le, MDSLogContextBase *fin,
+                          MDRequestRef& mdr, std::string_view event);
+  void dispatch_client_request(MDRequestRef& mdr);
+  void perf_gather_op_latency(const cref_t<MClientRequest> &req, utime_t lat);
+  void early_reply(MDRequestRef& mdr, CInode *tracei, CDentry *tracedn);
+  void respond_to_request(MDRequestRef& mdr, int r = 0);
+  void set_trace_dist(const ref_t<MClientReply> &reply, CInode *in, CDentry *dn,
+		      MDRequestRef& mdr);
+
+  void handle_peer_request(const cref_t<MMDSPeerRequest> &m);
+  void handle_peer_request_reply(const cref_t<MMDSPeerRequest> &m);
+  void dispatch_peer_request(MDRequestRef& mdr);
+  void handle_peer_auth_pin(MDRequestRef& mdr);
+  void handle_peer_auth_pin_ack(MDRequestRef& mdr, const cref_t<MMDSPeerRequest> &ack);
+
+  // some helpers
+  bool check_fragment_space(MDRequestRef& mdr, CDir *in);
+  bool check_dir_max_entries(MDRequestRef& mdr, CDir *in);
+  bool check_access(MDRequestRef& mdr, CInode *in, unsigned mask);
+  bool _check_access(Session *session, CInode *in, unsigned mask, int caller_uid, int caller_gid, int setattr_uid, int setattr_gid);
+  CDentry *prepare_stray_dentry(MDRequestRef& mdr, CInode *in);
+  CInode* prepare_new_inode(MDRequestRef& mdr, CDir *dir, inodeno_t useino, unsigned mode,
+			    const file_layout_t *layout=nullptr);
+  void journal_allocated_inos(MDRequestRef& mdr, EMetaBlob *blob);
+  void apply_allocated_inos(MDRequestRef& mdr, Session *session);
+
+  void _try_open_ino(MDRequestRef& mdr, int r, inodeno_t ino);
+  CInode* rdlock_path_pin_ref(MDRequestRef& mdr, bool want_auth,
+			      bool no_want_auth=false);
+  CDentry* rdlock_path_xlock_dentry(MDRequestRef& mdr, bool create,
+				    bool okexist=false, bool want_layout=false);
+  std::pair<CDentry*, CDentry*>
+	    rdlock_two_paths_xlock_destdn(MDRequestRef& mdr, bool xlock_srcdn);
+
+  CDir* try_open_auth_dirfrag(CInode *diri, frag_t fg, MDRequestRef& mdr);
+
+  // requests on existing inodes.
+  void handle_client_getattr(MDRequestRef& mdr, bool is_lookup);
+  void handle_client_lookup_ino(MDRequestRef& mdr,
+				bool want_parent, bool want_dentry);
+  void _lookup_snap_ino(MDRequestRef& mdr);
+  void _lookup_ino_2(MDRequestRef& mdr, int r);
+  void handle_client_readdir(MDRequestRef& mdr);
+  void handle_client_file_setlock(MDRequestRef& mdr);
+  void handle_client_file_readlock(MDRequestRef& mdr);
+
+  bool xlock_policylock(MDRequestRef& mdr, CInode *in,
+			bool want_layout=false, bool xlock_snaplock=false);
+  CInode* try_get_auth_inode(MDRequestRef& mdr, inodeno_t ino);
+  void handle_client_setattr(MDRequestRef& mdr);
+  void handle_client_setlayout(MDRequestRef& mdr);
+  void handle_client_setdirlayout(MDRequestRef& mdr);
+
+  int parse_quota_vxattr(string name, string value, quota_info_t *quota);
+  void create_quota_realm(CInode *in);
+  int parse_layout_vxattr_json(std::string name, std::string value,
+			       const OSDMap& osdmap, file_layout_t *layout);
+  int parse_layout_vxattr_string(std::string name, std::string value, const OSDMap& osdmap,
+				 file_layout_t *layout);
+  int parse_layout_vxattr(std::string name, std::string value, const OSDMap& osdmap,
+			  file_layout_t *layout, bool validate=true);
+  int check_layout_vxattr(MDRequestRef& mdr,
+                          string name,
+                          string value,
+                          file_layout_t *layout);
+  void handle_set_vxattr(MDRequestRef& mdr, CInode *cur);
+  void handle_remove_vxattr(MDRequestRef& mdr, CInode *cur);
+  void handle_client_getvxattr(MDRequestRef& mdr);
+  void handle_client_setxattr(MDRequestRef& mdr);
+  void handle_client_removexattr(MDRequestRef& mdr);
+
+  void handle_client_fsync(MDRequestRef& mdr);
+
+  bool is_unlink_pending(CDentry *dn);
+  void wait_for_pending_unlink(CDentry *dn, MDRequestRef& mdr);
+
+  // open
+  void handle_client_open(MDRequestRef& mdr);
+  void handle_client_openc(MDRequestRef& mdr);  // O_CREAT variant.
+  void do_open_truncate(MDRequestRef& mdr, int cmode);  // O_TRUNC variant.
+
+  // namespace changes
+  void handle_client_mknod(MDRequestRef& mdr);
+  void handle_client_mkdir(MDRequestRef& mdr);
+  void handle_client_symlink(MDRequestRef& mdr);
+
+  // link
+  void handle_client_link(MDRequestRef& mdr);
+  void _link_local(MDRequestRef& mdr, CDentry *dn, CInode *targeti, SnapRealm *target_realm);
+  void _link_local_finish(MDRequestRef& mdr, CDentry *dn, CInode *targeti,
+			  version_t, version_t, bool);
+
+  void _link_remote(MDRequestRef& mdr, bool inc, CDentry *dn, CInode *targeti);
+  void _link_remote_finish(MDRequestRef& mdr, bool inc, CDentry *dn, CInode *targeti,
+			   version_t);
+
+  void handle_peer_link_prep(MDRequestRef& mdr);
+  void _logged_peer_link(MDRequestRef& mdr, CInode *targeti, bool adjust_realm);
+  void _commit_peer_link(MDRequestRef& mdr, int r, CInode *targeti);
+  void _committed_peer(MDRequestRef& mdr);  // use for rename, too
+  void handle_peer_link_prep_ack(MDRequestRef& mdr, const cref_t<MMDSPeerRequest> &m);
+  void do_link_rollback(bufferlist &rbl, mds_rank_t leader, MDRequestRef& mdr);
+  void _link_rollback_finish(MutationRef& mut, MDRequestRef& mdr,
+			     map<client_t,ref_t<MClientSnap>>& split);
+
+  // unlink
+  void handle_client_unlink(MDRequestRef& mdr);
+  bool _dir_is_nonempty_unlocked(MDRequestRef& mdr, CInode *rmdiri);
+  bool _dir_is_nonempty(MDRequestRef& mdr, CInode *rmdiri);
+  void _unlink_local(MDRequestRef& mdr, CDentry *dn, CDentry *straydn);
+  void _unlink_local_finish(MDRequestRef& mdr,
+			    CDentry *dn, CDentry *straydn,
+			    version_t);
+  bool _rmdir_prepare_witness(MDRequestRef& mdr, mds_rank_t who, vector<CDentry*>& trace, CDentry *straydn);
+  void handle_peer_rmdir_prep(MDRequestRef& mdr);
+  void _logged_peer_rmdir(MDRequestRef& mdr, CDentry *srcdn, CDentry *straydn);
+  void _commit_peer_rmdir(MDRequestRef& mdr, int r, CDentry *straydn);
+  void handle_peer_rmdir_prep_ack(MDRequestRef& mdr, const cref_t<MMDSPeerRequest> &ack);
+  void do_rmdir_rollback(bufferlist &rbl, mds_rank_t leader, MDRequestRef& mdr);
+  void _rmdir_rollback_finish(MDRequestRef& mdr, metareqid_t reqid, CDentry *dn, CDentry *straydn);
+
+  // rename
+  void handle_client_rename(MDRequestRef& mdr);
+  void _rename_finish(MDRequestRef& mdr,
+		      CDentry *srcdn, CDentry *destdn, CDentry *straydn);
+
+  void handle_client_lssnap(MDRequestRef& mdr);
+  void handle_client_mksnap(MDRequestRef& mdr);
+  void _mksnap_finish(MDRequestRef& mdr, CInode *diri, SnapInfo &info);
+  void handle_client_rmsnap(MDRequestRef& mdr);
+  void _rmsnap_finish(MDRequestRef& mdr, CInode *diri, snapid_t snapid);
+  void handle_client_renamesnap(MDRequestRef& mdr);
+  void _renamesnap_finish(MDRequestRef& mdr, CInode *diri, snapid_t snapid);
+
+  // helpers
+  bool _rename_prepare_witness(MDRequestRef& mdr, mds_rank_t who, set<mds_rank_t> &witnesse,
+			       vector<CDentry*>& srctrace, vector<CDentry*>& dsttrace, CDentry *straydn);
+  version_t _rename_prepare_import(MDRequestRef& mdr, CDentry *srcdn, bufferlist *client_map_bl);
+  bool _need_force_journal(CInode *diri, bool empty);
+  void _rename_prepare(MDRequestRef& mdr,
+		       EMetaBlob *metablob, bufferlist *client_map_bl,
+		       CDentry *srcdn, CDentry *destdn, std::string_view alternate_name,
+                       CDentry *straydn);
+  /* NOTE: stale comment -- _rename_apply() below has no not_journaling parameter.
+   * Original text: set not_journaling=true if you're going to discard the results;
+   * this bypasses the asserts that we're journaling the right things on the right nodes */
+  void _rename_apply(MDRequestRef& mdr, CDentry *srcdn, CDentry *destdn, CDentry *straydn);
+
+  // peer ("slave") side of rename
+  void handle_peer_rename_prep(MDRequestRef& mdr);
+  void handle_peer_rename_prep_ack(MDRequestRef& mdr, const cref_t<MMDSPeerRequest> &m);
+  void handle_peer_rename_notify_ack(MDRequestRef& mdr, const cref_t<MMDSPeerRequest> &m);
+  void _peer_rename_sessions_flushed(MDRequestRef& mdr);
+  void _logged_peer_rename(MDRequestRef& mdr, CDentry *srcdn, CDentry *destdn, CDentry *straydn);
+  void _commit_peer_rename(MDRequestRef& mdr, int r, CDentry *srcdn, CDentry *destdn, CDentry *straydn);
+  void do_rename_rollback(bufferlist &rbl, mds_rank_t leader, MDRequestRef& mdr, bool finish_mdr=false);
+  void _rename_rollback_finish(MutationRef& mut, MDRequestRef& mdr, CDentry *srcdn, version_t srcdnpv,
+			       CDentry *destdn, CDentry *staydn, map<client_t,ref_t<MClientSnap>> splits[2],
+			       bool finish_mdr);
+
+  void evict_cap_revoke_non_responders();
+  void handle_conf_change(const std::set<std::string>& changed);
+
+  bool terminating_sessions = false;
+
+  set<client_t> client_reclaim_gather;
+
+private:
+  friend class MDSContinuation;
+  friend class ServerContext;
+  friend class ServerLogContext;
+  friend class Batch_Getattr_Lookup;
+
+  // Polymorphic base for xattr-specific data that a validation handler stores
+  // in XattrOp::xinfo.  The virtual destructor lets derived payloads be
+  // destroyed through a std::unique_ptr<XattrInfo>.
+  struct XattrInfo {
+    virtual ~XattrInfo() = default;
+  };
+
+  struct MirrorXattrInfo : XattrInfo {  // XattrInfo payload carrying a cluster id / fs id pair
+    std::string cluster_id;
+    std::string fs_id;
+    // string constants declared here; their values are defined outside this header
+    static const std::string MIRROR_INFO_REGEX;
+    static const std::string CLUSTER_ID;
+    static const std::string FS_ID;
+    // copies both views into the owned std::string members
+    MirrorXattrInfo(std::string_view cluster_id,
+                    std::string_view fs_id)
+      : cluster_id(cluster_id),
+        fs_id(fs_id) {
+    }
+  };
+
+  struct XattrOp {  // one xattr set/remove request as handed to the XattrHandler hooks
+    int op;
+    std::string xattr_name;
+    const bufferlist &xattr_value;  // held by reference: the caller's bufferlist must outlive this op
+    int flags = 0;
+    // optional handler-specific data (see the XattrHandler::validate comment); empty until set
+    std::unique_ptr<XattrInfo> xinfo;
+    // xattr_name is copied into the member; xattr_value is only bound, not copied
+    XattrOp(int op, std::string_view xattr_name, const bufferlist &xattr_value, int flags)
+      : op(op),
+        xattr_name(xattr_name),
+        xattr_value(xattr_value),
+        flags (flags) {
+    }
+  };
+
+  struct XattrHandler {  // element type of xattr_handlers[]: a name, a description and three hooks
+    const std::string xattr_name;
+    const std::string description;
+    // the three hooks below are pointers to Server member functions
+    // basic checks are to be done in this handler. return -errno to
+    // reject xattr request (set or remove), zero to proceed. handlers
+    // may parse xattr value for verification if needed and have an
+    // option to store custom data in XattrOp::xinfo.
+    int (Server::*validate)(CInode *cur, const InodeStoreBase::xattr_map_const_ptr xattrs,
+                            XattrOp *xattr_op);
+
+    // set xattr for an inode in xattr_map
+    void (Server::*setxattr)(CInode *cur, InodeStoreBase::xattr_map_ptr xattrs,
+                             const XattrOp &xattr_op);
+
+    // remove xattr for an inode from xattr_map
+    void (Server::*removexattr)(CInode *cur, InodeStoreBase::xattr_map_ptr xattrs,
+                                const XattrOp &xattr_op);
+  };
+
+  inline static const std::string DEFAULT_HANDLER = "<default>";
+  static const XattrHandler xattr_handlers[];
+
+  const XattrHandler* get_xattr_or_default_handler(std::string_view xattr_name);
+
+  // generic variant to set/remove xattr in/from xattr_map
+  int xattr_validate(CInode *cur, const InodeStoreBase::xattr_map_const_ptr xattrs,
+                     const std::string &xattr_name, int op, int flags);
+  void xattr_set(InodeStoreBase::xattr_map_ptr xattrs, const std::string &xattr_name,
+                 const bufferlist &xattr_value);
+  void xattr_rm(InodeStoreBase::xattr_map_ptr xattrs, const std::string &xattr_name);
+
+  // default xattr handlers
+  int default_xattr_validate(CInode *cur, const InodeStoreBase::xattr_map_const_ptr xattrs,
+                             XattrOp *xattr_op);
+  void default_setxattr_handler(CInode *cur, InodeStoreBase::xattr_map_ptr xattrs,
+                                const XattrOp &xattr_op);
+  void default_removexattr_handler(CInode *cur, InodeStoreBase::xattr_map_ptr xattrs,
+                                   const XattrOp &xattr_op);
+
+  // mirror info xattr handler
+  int parse_mirror_info_xattr(const std::string &name, const std::string &value,
+                              std::string &cluster_id, std::string &fs_id);
+  int mirror_info_xattr_validate(CInode *cur, const InodeStoreBase::xattr_map_const_ptr xattrs,
+                                 XattrOp *xattr_op);
+  void mirror_info_setxattr_handler(CInode *cur, InodeStoreBase::xattr_map_ptr xattrs,
+                                    const XattrOp &xattr_op);
+  void mirror_info_removexattr_handler(CInode *cur, InodeStoreBase::xattr_map_ptr xattrs,
+                                       const XattrOp &xattr_op);
+
+  static bool is_ceph_vxattr(std::string_view xattr_name) {
+    // Prefix families first (rfind(p, 0) == 0 means "starts with p"), then exact names.
+    for (std::string_view prefix : {"ceph.dir.layout"sv, "ceph.file.layout"sv, "ceph.quota"sv})
+      if (xattr_name.rfind(prefix, 0) == 0) return true;
+    for (std::string_view exact : {"ceph.dir.subvolume"sv, "ceph.dir.pin"sv,
+                                   "ceph.dir.pin.random"sv, "ceph.dir.pin.distributed"sv})
+      if (xattr_name == exact) return true;
+    return false;
+  }
+
+  static bool is_ceph_dir_vxattr(std::string_view xattr_name) {
+    // Exact-match list: the dir layout vxattr, each of its fields, and the pin variants.
+    for (const char *known : {
+           "ceph.dir.layout", "ceph.dir.layout.json",
+           "ceph.dir.layout.object_size", "ceph.dir.layout.stripe_unit",
+           "ceph.dir.layout.stripe_count", "ceph.dir.layout.pool",
+           "ceph.dir.layout.pool_name", "ceph.dir.layout.pool_id",
+           "ceph.dir.layout.pool_namespace", "ceph.dir.pin",
+           "ceph.dir.pin.random", "ceph.dir.pin.distributed"}) {
+      if (xattr_name == known)
+        return true;
+    }
+    return false;
+  }
+
+  static bool is_ceph_file_vxattr(std::string_view xattr_name) {
+    // Exact-match list: the file layout vxattr and each of its fields.
+    for (const char *known : {
+           "ceph.file.layout", "ceph.file.layout.json",
+           "ceph.file.layout.object_size", "ceph.file.layout.stripe_unit",
+           "ceph.file.layout.stripe_count", "ceph.file.layout.pool",
+           "ceph.file.layout.pool_name", "ceph.file.layout.pool_id",
+           "ceph.file.layout.pool_namespace"})
+      if (xattr_name == known) return true;
+    return false;
+  }
+
+  static bool is_allowed_ceph_xattr(std::string_view xattr_name) {
+    // Only names starting with "ceph." are restricted; anything else passes.
+    const bool in_ceph_namespace = xattr_name.rfind("ceph.", 0) == 0;
+    if (!in_ceph_namespace)
+      return true;
+    // Inside the "ceph." namespace just the two mirror xattrs are allowed.
+    return xattr_name == "ceph.mirror.info" ||
+           xattr_name == "ceph.mirror.dirty_snap_id";
+  }
+
+  void reply_client_request(MDRequestRef& mdr, const ref_t<MClientReply> &reply);
+  void flush_session(Session *session, MDSGatherBuilder& gather);
+
+  MDSRank *mds;
+  MDCache *mdcache;
+  MDLog *mdlog;
+  PerfCounters *logger = nullptr;
+
+  // OSDMap full status, used to generate CEPHFS_ENOSPC on some operations
+  bool is_full = false;
+
+  // State for while in reconnect
+  MDSContext *reconnect_done = nullptr;
+  int failed_reconnects = 0;
+  bool reconnect_evicting = false;  // true if I am waiting for evictions to complete
+                            // before proceeding to reconnect_gather_finish
+  time reconnect_start = clock::zero();
+  time reconnect_last_seen = clock::zero();
+  set<client_t> client_reconnect_gather;  // clients i need a reconnect msg from.
+  set<client_t> client_reconnect_denied;  // clients whose reconnect msgs have been denied.
+
+  feature_bitset_t supported_features;
+  feature_bitset_t supported_metric_spec;
+  feature_bitset_t required_client_features;
+
+  bool forward_all_requests_to_auth = false;
+  bool replay_unsafe_with_closed_session = false;
+  double cap_revoke_eviction_timeout = 0;
+  uint64_t max_snaps_per_dir = 100;
+  unsigned delegate_inos_pct = 0;
+  uint64_t dir_max_entries = 0;
+
+  DecayCounter recall_throttle;
+  time last_recall_state;
+
+  MetricsHandler *metrics_handler;
+
+  // Cache cap acquisition throttle configs
+  uint64_t max_caps_per_client;
+  uint64_t cap_acquisition_throttle;
+  double max_caps_throttle_ratio;
+  double caps_throttle_retry_request_timeout;
+
+  size_t alternate_name_max = g_conf().get_val<Option::size_t>("mds_alternate_name_max");
+};
+
+static inline constexpr auto operator|(Server::RecallFlags a, Server::RecallFlags b) {
+  using Bits = std::underlying_type_t<Server::RecallFlags>;  // bitwise OR on the raw values
+  return static_cast<Server::RecallFlags>(static_cast<Bits>(a) | static_cast<Bits>(b));
+}
+static inline constexpr auto operator&(Server::RecallFlags a, Server::RecallFlags b) {
+  using Bits = std::underlying_type_t<Server::RecallFlags>;  // bitwise AND on the raw values
+  return static_cast<Server::RecallFlags>(static_cast<Bits>(a) & static_cast<Bits>(b));
+}
+static inline std::ostream& operator<<(std::ostream& os, const Server::RecallFlags& f) {
+  const auto bits = static_cast<std::underlying_type_t<Server::RecallFlags>>(f);
+  return os << "0x" << std::hex << bits << std::dec;  // note: leaves the stream in decimal mode
+}
+static inline constexpr bool operator!(const Server::RecallFlags& f) {
+  // true exactly when no flag bit is set, i.e. f equals RecallFlags::NONE (value 0)
+  return f == Server::RecallFlags::NONE;
+}
+#endif
diff --git a/src/mds/SessionMap.cc b/src/mds/SessionMap.cc
new file mode 100644
index 000000000..b96fc37d6
--- /dev/null
+++ b/src/mds/SessionMap.cc
@@ -0,0 +1,1255 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software 
+ * Foundation.  See file COPYING.
+ * 
+ */
+
+#include "MDSRank.h"
+#include "MDCache.h"
+#include "Mutation.h"
+#include "SessionMap.h"
+#include "osdc/Filer.h"
+#include "common/Finisher.h"
+
+#include "common/config.h"
+#include "common/errno.h"
+#include "common/DecayCounter.h"
+#include "include/ceph_assert.h"
+#include "include/stringify.h"
+
+#define dout_context g_ceph_context
+#define dout_subsys ceph_subsys_mds
+#undef dout_prefix
+#define dout_prefix *_dout << "mds." << rank << ".sessionmap "
+
+namespace {
+// I/O completion base that keeps a non-null SessionMap and derives get_mds() from it.
+class SessionMapIOContext : public MDSIOContextBase {
+  public:
+    explicit SessionMapIOContext(SessionMap *sessionmap_) : sessionmap(sessionmap_) {
+      ceph_assert(sessionmap != nullptr);
+    }
+  protected:
+    SessionMap *sessionmap;
+    MDSRank *get_mds() override { return sessionmap->mds; }
+};
+}  // anonymous namespace
+
+void SessionMap::register_perfcounters()
+{
+  PerfCountersBuilder plb(g_ceph_context, "mds_sessions",
+      l_mdssm_first, l_mdssm_last);
+  // session_count is the only counter given an explicit PRIO_INTERESTING priority
+  plb.add_u64(l_mdssm_session_count, "session_count",
+      "Session count", "sess", PerfCountersBuilder::PRIO_INTERESTING);
+  // the default priority for the counters added below is set to PRIO_USEFUL
+  plb.set_prio_default(PerfCountersBuilder::PRIO_USEFUL);
+  plb.add_u64_counter(l_mdssm_session_add, "session_add",
+      "Sessions added");
+  plb.add_u64_counter(l_mdssm_session_remove, "session_remove",
+      "Sessions removed");
+  plb.add_u64(l_mdssm_session_open, "sessions_open",
+              "Sessions currently open");
+  plb.add_u64(l_mdssm_session_stale, "sessions_stale",
+              "Sessions currently stale");
+  plb.add_u64(l_mdssm_total_load, "total_load", "Total Load");
+  plb.add_u64(l_mdssm_avg_load, "average_load", "Average Load");
+  plb.add_u64(l_mdssm_avg_session_uptime, "avg_session_uptime",
+               "Average session uptime");
+  // build the "mds_sessions" counters and add them to the context's collection
+  logger = plb.create_perf_counters();
+  g_ceph_context->get_perfcounters_collection()->add(logger);
+}
+
+void SessionMap::dump()
+{
+  // Log, at debug level 10, one line per session: name, Session pointer, state,
+  // completed requests, and its free-preallocated / delegated inode sets.
+  dout(10) << "dump" << dendl;
+  for (const auto& [name, session] : session_map)
+    dout(10) << name << " " << session
+             << " state " << session->get_state_name()
+             << " completed " << session->info.completed_requests
+             << " free_prealloc_inos " << session->free_prealloc_inos
+             << " delegated_inos " << session->delegated_inos
+             << dendl;
+}
+
+
+// ----------------
+// LOAD
+
+
+object_t SessionMap::get_object_name() const
+{
+  char s[30];  // "mds" + int + "_sessionmap" + NUL needs at most 26 bytes for a 32-bit int
+  snprintf(s, sizeof(s), "mds%d_sessionmap", int(mds->get_nodeid()));  // e.g. "mds0_sessionmap"
+  return object_t(s);
+}
+
+namespace {
+// Completion for one batched OMAP read of the sessionmap object: the read
+// fills the public members below and finish() passes them to _load_finish().
+class C_IO_SM_Load : public SessionMapIOContext {
+public:
+  const bool first;  //< true for the initial read, which also fetches the header
+  int header_r = 0;  //< result code of the OMAP header read
+  int values_r = 0;  //< result code of the OMAP values read
+  bufferlist header_bl;
+  std::map<std::string, bufferlist> session_vals;
+  bool more_session_vals = false;
+  C_IO_SM_Load(SessionMap *cm, const bool f)
+    : SessionMapIOContext(cm), first(f) {}
+  void finish(int r) override {
+    sessionmap->_load_finish(r, header_r, values_r, first, header_bl, session_vals,
+      more_session_vals);
+  }
+  void print(ostream& out) const override {
+    out << "session_load";
+  }
+};
+}
+
+
+/**
+ * Decode the OMAP header into `version`; call once per load (_load_finish catches buffer::error).
+ */
+void SessionMapStore::decode_header(
+      bufferlist &header_bl)
+{
+  auto q = header_bl.cbegin();
+  DECODE_START(1, q)  // macro invoked without a trailing ';' in the original
+  decode(version, q);  // the header payload is just `version`
+  DECODE_FINISH(q);
+}
+
+void SessionMapStore::encode_header(
+    bufferlist *header_bl)
+{
+  ENCODE_START(1, 1, *header_bl);  // pairs with DECODE_START(1, q) in decode_header
+  encode(version, *header_bl);  // the header payload is just `version`
+  ENCODE_FINISH(*header_bl);
+}
+
+/**
+ * Decode and insert some serialized OMAP values.  Call this
+ * repeatedly to insert batched loads.
+ */
+void SessionMapStore::decode_values(std::map<std::string, bufferlist> &session_vals)
+{
+  // Each entry maps a textual entity name (the OMAP key) to one encoded Session.
+  for (auto& [key, encoded] : session_vals) {
+    entity_inst_t inst;
+    if (!inst.name.parse(key)) {
+      derr << "Corrupt entity name '" << key << "' in sessionmap" << dendl;
+      throw buffer::malformed_input("Corrupt entity name in sessionmap");
+    }
+
+    // Look up (or create) the session; one still in the closed state is
+    // switched to open and given the store's decay rate before decoding.
+    Session *s = get_or_add_session(inst);
+    if (s->is_closed()) {
+      s->set_state(Session::STATE_OPEN);
+      s->set_load_avg_decay_rate(decay_rate);
+    }
+
+    auto q = encoded.cbegin();
+    s->decode(q);
+  }
+}
+
+/**
+ * An OMAP read finished.  Read or decode failures mark the rank damaged; otherwise
+ * this batch is decoded and either the next batch is read or load waiters are woken. */
+void SessionMap::_load_finish(
+    int operation_r,
+    int header_r,
+    int values_r,
+    bool first,
+    bufferlist &header_bl,
+    std::map<std::string, bufferlist> &session_vals,
+    bool more_session_vals)
+{
+  if (operation_r < 0) {
+    derr << "_load_finish got " << cpp_strerror(operation_r) << dendl;
+    mds->clog->error() << "error reading sessionmap '" << get_object_name()
+                       << "' " << operation_r << " ("
+                       << cpp_strerror(operation_r) << ")";
+    mds->damaged();
+    ceph_abort();  // Should be unreachable because damaged() calls respawn()
+  }
+
+  // Decode header
+  if (first) {
+    if (header_r != 0) {
+      derr << __func__ << ": header error: " << cpp_strerror(header_r) << dendl;
+      mds->clog->error() << "error reading sessionmap header "
+                         << header_r << " (" << cpp_strerror(header_r) << ")";
+      mds->damaged();
+      ceph_abort();  // Should be unreachable because damaged() calls respawn()
+    }
+    if (header_bl.length() == 0) {
+      dout(4) << __func__ << ": header missing, loading legacy..." << dendl;
+      load_legacy();
+      return;
+    }
+    try {
+      decode_header(header_bl);
+    } catch (buffer::error &e) {
+      mds->clog->error() << "corrupt sessionmap header: " << e.what();
+      mds->damaged();
+      ceph_abort();  // Should be unreachable because damaged() calls respawn()
+    }
+    dout(10) << __func__ << " loaded version " << version << dendl;
+  }
+
+  if (values_r != 0) {
+    derr << __func__ << ": error reading values: "
+      << cpp_strerror(values_r) << dendl;
+    mds->clog->error() << "error reading sessionmap values: "
+                       << values_r << " (" << cpp_strerror(values_r) << ")";
+    mds->damaged();
+    ceph_abort();  // Should be unreachable because damaged() calls respawn()
+  }
+
+  // Decode session_vals
+  try {
+    decode_values(session_vals);
+  } catch (buffer::error &e) {
+    mds->clog->error() << "corrupt sessionmap values: " << e.what();
+    mds->damaged();
+    ceph_abort();  // Should be unreachable because damaged() calls respawn()
+  }
+
+  if (more_session_vals) {
+    // Issue another read if we're not at the end of the omap, resuming from
+    // this batch's last key; an empty batch here would make rbegin() undefined.
+    ceph_assert(!session_vals.empty());
+    const std::string last_key = session_vals.rbegin()->first;
+    dout(10) << __func__ << ": continue omap load from '"
+             << last_key << "'" << dendl;
+    object_t oid = get_object_name();
+    object_locator_t oloc(mds->get_metadata_pool());
+    C_IO_SM_Load *c = new C_IO_SM_Load(this, false);
+    ObjectOperation op;
+    op.omap_get_vals(last_key, "", g_conf()->mds_sessionmap_keys_per_op,
+                     &c->session_vals, &c->more_session_vals, &c->values_r);
+    mds->objecter->read(oid, oloc, op, CEPH_NOSNAP, NULL, 0,
+        new C_OnFinisher(c, mds->finisher));
+  } else {
+    // I/O is complete.  Update `by_state`
+    dout(10) << __func__ << ": omap load complete" << dendl;
+    for (ceph::unordered_map<entity_name_t, Session*>::iterator i = session_map.begin();
+         i != session_map.end(); ++i) {
+      Session *s = i->second;
+      auto by_state_entry = by_state.find(s->get_state());
+      if (by_state_entry == by_state.end())
+        by_state_entry = by_state.emplace(s->get_state(),
+                                          new xlist<Session*>).first;
+      by_state_entry->second->push_back(&s->item_session_list);
+    }
+
+    // Population is complete.  Trigger load waiters.
+    dout(10) << __func__ << ": v " << version
+           << ", " << session_map.size() << " sessions" << dendl;
+    projected = committing = committed = version;
+    dump();
+    finish_contexts(g_ceph_context, waiting_for_load);
+  }
+}
+
+/**
+ * Populate session state from OMAP records in this
+ * rank's sessionmap object.
+ */
+void SessionMap::load(MDSContext *onload)
+{
+  dout(10) << "load" << dendl;
+  // queue the optional completion; waiting_for_load is finished once loading completes
+  if (onload)
+    waiting_for_load.push_back(onload);
+  // first=true: this read also requests the OMAP header
+  C_IO_SM_Load *c = new C_IO_SM_Load(this, true);
+  object_t oid = get_object_name();
+  object_locator_t oloc(mds->get_metadata_pool());
+  // one compound op: the header plus a first batch of values (limit mds_sessionmap_keys_per_op)
+  ObjectOperation op;
+  op.omap_get_header(&c->header_bl, &c->header_r);
+  op.omap_get_vals("", "", g_conf()->mds_sessionmap_keys_per_op,
+		   &c->session_vals, &c->more_session_vals, &c->values_r);
+  // results land in *c; C_IO_SM_Load::finish() forwards them to _load_finish()
+  mds->objecter->read(oid, oloc, op, CEPH_NOSNAP, NULL, 0, new C_OnFinisher(c, mds->finisher));
+}
+
+namespace {
+class C_IO_SM_LoadLegacy : public SessionMapIOContext {  // completion for the legacy full-object read
+public:
+  bufferlist bl;  // destination buffer that load_legacy() passes to read_full()
+  explicit C_IO_SM_LoadLegacy(SessionMap *cm) : SessionMapIOContext(cm) {}
+  void finish(int r) override {
+    sessionmap->_load_legacy_finish(r, bl);  // hand the result code and data back to the SessionMap
+  }
+  void print(ostream& out) const override {
+    out << "session_load_legacy";
+  }
+};
+}
+
+
+/**
+ * Load legacy (object data blob) SessionMap format, assuming
+ * that waiting_for_load has already been populated with
+ * the relevant completion.  This is the fallback if we do not
+ * find an OMAP header when attempting to load normally.
+ */
+void SessionMap::load_legacy()
+{
+  dout(10) << __func__ << dendl;
+
+  // The whole object is read into the completion's own buffer.
+  auto *legacy_ctx = new C_IO_SM_LoadLegacy(this);
+  object_t sm_oid = get_object_name();
+  object_locator_t sm_loc(mds->get_metadata_pool());
+  Context *on_read = new C_OnFinisher(legacy_ctx, mds->finisher);
+  mds->objecter->read_full(sm_oid, sm_loc, CEPH_NOSNAP, &legacy_ctx->bl, 0, on_read);
+}
+
+void SessionMap::_load_legacy_finish(int r, bufferlist &bl)
+{
+  // A failed read of the legacy blob is fatal.
+  if (r < 0) {
+    derr << "_load_finish got " << cpp_strerror(r) << dendl;
+    ceph_abort_msg("failed to load sessionmap");
+  }
+
+  dump();
+  auto cursor = bl.cbegin();
+  decode_legacy(cursor);  // note: this sets last_cap_renew = now()
+  dout(10) << "_load_finish v " << version
+	   << ", " << session_map.size() << " sessions, "
+	   << bl.length() << " bytes" << dendl;
+  projected = committing = committed = version;
+  dump();
+
+  // Dirty every session so the next save() rewrites the complete map in
+  // OMAP form.  mark_dirty() is bypassed on purpose: the keys_per_op
+  // limit must not split the upgrade, which has to be one atomic write.
+  for (const auto& entry : session_map) {
+    dirty_sessions.insert(entry.first);
+  }
+  loaded_legacy = true;
+
+  // Everything is in memory now; release the load waiters.
+  finish_contexts(g_ceph_context, waiting_for_load);
+}
+
+
+// ----------------
+// SAVE
+
+namespace {
+class C_IO_SM_Save : public SessionMapIOContext {
+  version_t version;  // passed to _save_finish() once the write succeeded
+public:
+  C_IO_SM_Save(SessionMap *sm, version_t v) : SessionMapIOContext(sm), version(v) {}
+  void finish(int r) override {
+    if (r == 0) {
+      sessionmap->_save_finish(version);
+    } else {
+      get_mds()->handle_write_error(r);
+    }
+  }
+  void print(ostream& os) const override {
+    os << "session_save";
+  }
+};
+} // anonymous namespace
+
+void SessionMap::save(MDSContext *onsave, version_t needv)
+{
+  dout(10) << __func__ << ": needv " << needv << ", v " << version << dendl;
+  // needv is already covered by the write in flight: wait on that one instead.
+  if (needv && committing >= needv) {
+    ceph_assert(committing > committed);
+    commit_waiters[committing].push_back(onsave);
+    return;
+  }
+
+  commit_waiters[version].push_back(onsave);  // woken by _save_finish(version)
+
+  committing = version;
+  SnapContext snapc;
+  object_t oid = get_object_name();
+  object_locator_t oloc(mds->get_metadata_pool());
+
+  ObjectOperation op;
+
+  /* Compose OSD OMAP transaction for full write */
+  bufferlist header_bl;
+  encode_header(&header_bl);
+  op.omap_set_header(header_bl);
+
+  /* If we loaded a legacy sessionmap, then erase the old data.  If
+   * an old-versioned MDS tries to read it, it'll fail out safely
+   * with an end_of_buffer exception */
+  if (loaded_legacy) {
+    dout(4) << __func__ << " erasing legacy sessionmap" << dendl;
+    op.truncate(0);
+    loaded_legacy = false;  // only need to truncate once.
+  }
+
+  dout(20) << " updating keys:" << dendl;
+  map<string, bufferlist> to_set;  // key: printed entity name, value: encoded session info
+  for(std::set<entity_name_t>::iterator i = dirty_sessions.begin();
+      i != dirty_sessions.end(); ++i) {
+    const entity_name_t name = *i;
+    Session *session = session_map[name];
+
+    if (session->is_open() ||
+	session->is_closing() ||
+	session->is_stale() ||
+	session->is_killing()) {  // sessions in any other state are logged and left out
+      dout(20) << "  " << name << dendl;
+      // Serialize K
+      CachedStackStringStream css;
+      *css << name;
+
+      // Serialize V
+      bufferlist bl;
+      session->info.encode(bl, mds->mdsmap->get_up_features());
+
+      // Add to RADOS op
+      to_set[std::string(css->strv())] = bl;
+
+      session->clear_dirty_completed_requests();
+    } else {
+      dout(20) << "  " << name << " (ignoring)" << dendl;
+    }
+  }
+  if (!to_set.empty()) {
+    op.omap_set(to_set);
+  }
+
+  dout(20) << " removing keys:" << dendl;
+  set<string> to_remove;  // printed names of the sessions queued in null_sessions
+  for(std::set<entity_name_t>::const_iterator i = null_sessions.begin();
+      i != null_sessions.end(); ++i) {
+    dout(20) << "  " << *i << dendl;
+    CachedStackStringStream css;
+    *css << *i;
+    to_remove.insert(css->str());
+  }
+  if (!to_remove.empty()) {
+    op.omap_rm_keys(to_remove);
+  }
+
+  dirty_sessions.clear();  // both sets have been folded into op above
+  null_sessions.clear();
+
+  mds->objecter->mutate(oid, oloc, op, snapc,
+			ceph::real_clock::now(),
+			0,
+			new C_OnFinisher(new C_IO_SM_Save(this, version),
+					 mds->finisher));
+}
+
+void SessionMap::_save_finish(version_t v)
+{
+  dout(10) << "_save_finish v" << v << dendl;
+  committed = v;  // the write for this version has completed
+  auto& waiters = commit_waiters[v];
+  finish_contexts(g_ceph_context, waiters);
+  commit_waiters.erase(v);
+}
+
+
+/**
+ * Deserialize sessions, and update by_state index
+ */
+void SessionMap::decode_legacy(bufferlist::const_iterator &p)
+{
+  // Let the store layer fill session_map from the encoded blob.
+  SessionMapStore::decode_legacy(p);
+
+  // File each decoded session on the list for its current state,
+  // creating that list the first time a state is encountered.
+  for (const auto& entry : session_map) {
+    Session *session = entry.second;
+    const auto state = session->get_state();
+    auto slot = by_state.find(state);
+    if (slot == by_state.end())
+      slot = by_state.emplace(state, new xlist<Session*>).first;
+    slot->second->push_back(&session->item_session_list);
+  }
+}
+
+uint64_t SessionMap::set_state(Session *session, int s) {
+  if (session->state == s) {
+    // Nothing to do: already in the requested state.
+    return session->get_state_seq();
+  }
+
+  session->set_state(s);
+  // Append the session to the list kept for its new state.
+  auto slot = by_state.find(s);
+  if (slot == by_state.end())
+    slot = by_state.emplace(s, new xlist<Session*>).first;
+  slot->second->push_back(&session->item_session_list);
+
+  if (session->is_open() || session->is_stale())
+    session->set_load_avg_decay_rate(decay_rate);
+  // Refresh the per-state session counts that have perf counters.
+  logger->set(l_mdssm_session_open,
+              get_session_count_in_state(Session::STATE_OPEN));
+  logger->set(l_mdssm_session_stale,
+              get_session_count_in_state(Session::STATE_STALE));
+  return session->get_state_seq();
+}
+
+void SessionMapStore::decode_legacy(bufferlist::const_iterator& p)
+{
+  auto now = clock::now();  // becomes last_cap_renew of old-format sessions
+  uint64_t pre;  // leading word: the (uint64_t)-1 marker, or the version itself
+  decode(pre, p);
+  if (pre == (uint64_t)-1) {
+    DECODE_START_LEGACY_COMPAT_LEN(3, 3, 3, p);
+    ceph_assert(struct_v >= 2);
+    
+    decode(version, p);
+    
+    while (!p.end()) {
+      entity_inst_t inst;
+      decode(inst.name, p);
+      Session *s = get_or_add_session(inst);
+      if (s->is_closed()) {
+        s->set_state(Session::STATE_OPEN);
+        s->set_load_avg_decay_rate(decay_rate);
+      }
+      s->decode(p);
+    }
+
+    DECODE_FINISH(p);
+  } else {
+    // --- old format ----
+    version = pre;
+
+    // this is a meaningless upper bound.  can be ignored.
+    __u32 n;
+    decode(n, p);
+    
+    while (n-- && !p.end()) {
+      auto p2 = p;  // start of this record, kept for a possible re-decode
+      Session *s = new Session(ConnectionRef());
+      s->info.decode(p);
+      {
+        auto& name = s->info.inst.name;
+        auto it = session_map.find(name);
+        if (it != session_map.end()) {
+	  // eager client connected too fast!  aie.
+	  dout(10) << " already had session for " << name << ", recovering" << dendl;
+	  delete s;
+	  s = it->second;
+	  p = p2;
+	  s->info.decode(p);
+        } else {
+	  session_map[name] = s;  // `it` is end() here and must not be dereferenced
+        }
+      }
+      s->set_state(Session::STATE_OPEN);
+      s->set_load_avg_decay_rate(decay_rate);
+      s->last_cap_renew = now;
+    }
+  }
+}
+
+void Session::dump(Formatter *f, bool cap_dump) const
+{
+  f->dump_int("id", info.inst.name.num());
+  f->dump_object("entity", info.inst);
+  f->dump_string("state", get_state_name());
+  f->dump_int("num_leases", leases.size());
+  f->dump_int("num_caps", caps.size());
+  if (cap_dump) {  // the full per-cap listing is opt-in
+    f->open_array_section("caps");
+    for (const auto& held_cap : caps)
+      f->dump_object("cap", *held_cap);
+    f->close_section();
+  }
+
+  // The request load average is reported for open and stale sessions only.
+  if (is_open() || is_stale())
+    f->dump_unsigned("request_load_avg", get_load_avg());
+  f->dump_float("uptime", get_session_uptime());
+  f->dump_unsigned("requests_in_flight", get_request_count());
+  f->dump_unsigned("num_completed_requests", get_num_completed_requests());
+  f->dump_unsigned("num_completed_flushes", get_num_completed_flushes());
+  f->dump_bool("reconnecting", reconnecting);
+  f->dump_object("recall_caps", recall_caps);
+  f->dump_object("release_caps", release_caps);
+  f->dump_object("recall_caps_throttle", recall_caps_throttle);
+  f->dump_object("recall_caps_throttle2o", recall_caps_throttle2o);
+  f->dump_object("session_cache_liveness", session_cache_liveness);
+  f->dump_object("cap_acquisition", cap_acquisition);
+
+  // One {start, length} object per delegated inode range.
+  f->open_array_section("delegated_inos");
+  for (const auto& [first_ino, n_inos] : delegated_inos) {
+    f->open_object_section("ino_range");
+    f->dump_stream("start") << first_ino;
+    f->dump_unsigned("length", n_inos);
+    f->close_section();
+  }
+  f->close_section();
+  info.dump(f);
+}
+
+void SessionMapStore::dump(Formatter *f) const
+{
+  // Emit every known session as one element of a "sessions" array.
+  f->open_array_section("sessions");
+  for (const auto& [name, session] : session_map)
+    f->dump_object("session", *session);
+  f->close_section();
+}
+
+void SessionMapStore::generate_test_instances(std::list<SessionMapStore*>& ls)
+{
+  // A single default-constructed store is the only instance offered so far.
+  ls.emplace_back(new SessionMapStore());
+}
+
+void SessionMap::wipe()
+{
+  dout(1) << "wipe start" << dendl;
+  dump();
+  // remove_session() erases the map entry, so always take the first one.
+  while (!session_map.empty())
+    remove_session(session_map.begin()->second);
+  ++projected;
+  version = projected;
+  dout(1) << "wipe result" << dendl;
+  dump();
+  dout(1) << "wipe done" << dendl;
+}
+
+void SessionMap::wipe_ino_prealloc()
+{
+  // Clear every preallocated/delegated inode set, for all sessions.
+  for (auto& entry : session_map) {
+    Session *session = entry.second;
+    session->pending_prealloc_inos.clear();
+    session->free_prealloc_inos.clear();
+    session->delegated_inos.clear();
+    session->info.prealloc_inos.clear();
+  }
+  projected = ++version;
+}
+
+void SessionMap::add_session(Session *s)
+{
+  const auto& name = s->info.inst.name;
+  dout(10) << __func__ << " s=" << s << " name=" << name << dendl;
+  // A session may be registered only once under its entity name.
+  ceph_assert(session_map.count(name) == 0);
+  session_map[name] = s;
+  auto slot = by_state.find(s->state);
+  if (slot == by_state.end())
+    slot = by_state.emplace(s->state, new xlist<Session*>).first;
+  slot->second->push_back(&s->item_session_list);
+  s->get();  // reference dropped again by remove_session()
+
+  update_average_birth_time(*s);
+  logger->set(l_mdssm_session_count, session_map.size());
+  logger->inc(l_mdssm_session_add);
+}
+
+void SessionMap::remove_session(Session *s)
+{
+  const auto& name = s->info.inst.name;  // do not use after s->put() below
+  dout(10) << __func__ << " s=" << s << " name=" << name << dendl;
+  update_average_birth_time(*s, false);
+  s->trim_completed_requests(0);
+  s->item_session_list.remove_myself();
+
+  // Forget the session, and queue its key for removal on the next save().
+  session_map.erase(name);
+  dirty_sessions.erase(name);
+  null_sessions.insert(name);
+  s->put();
+  logger->set(l_mdssm_session_count, session_map.size());
+  logger->inc(l_mdssm_session_remove);
+}
+
+void SessionMap::touch_session(Session *session)
+{
+  dout(10) << __func__ << " s=" << session << " name=" << session->info.inst.name << dendl;
+
+  // The session must already sit on some per-state list (add_session and
+  // set_state put it there); re-append it to the list for its state.
+  ceph_assert(session->item_session_list.is_on_list());
+  auto slot = by_state.find(session->state);
+  if (slot == by_state.end())
+    slot = by_state.emplace(session->state, new xlist<Session*>).first;
+  slot->second->push_back(&session->item_session_list);
+
+  // Record the time of this touch.
+  session->last_cap_renew = clock::now();
+}
+
+void SessionMap::_mark_dirty(Session *s, bool may_save)
+{
+  const auto& name = s->info.inst.name;
+  if (dirty_sessions.count(name) > 0)
+    return;  // already queued for the next save
+
+  // Flush early once the dirty set has reached the per-op key limit,
+  // rather than waiting for the usual save() from journal segment trim,
+  // so that no oversized OMAP update builds up from many modifications.
+  if (may_save &&
+      dirty_sessions.size() >= g_conf()->mds_sessionmap_keys_per_op)
+    save(new C_MDSInternalNoop, version);
+
+  null_sessions.erase(name);
+  dirty_sessions.insert(name);
+}
+
+void SessionMap::mark_dirty(Session *s, bool may_save)
+{
+  dout(20) << __func__ << " s=" << s << " name=" << s->info.inst.name
+	   << " v=" << version << dendl;
+
+  _mark_dirty(s, may_save);
+  // Advance the map version and hand it to the session's pop_pv().
+  s->pop_pv(++version);
+}
+
+void SessionMap::replay_dirty_session(Session *s)
+{
+  dout(20) << __func__ << " s=" << s << " name=" << s->info.inst.name
+	   << " v=" << version << dendl;
+
+  // may_save=false: only queue the session, then bump the version.
+  _mark_dirty(s, false);
+  replay_advance_version();
+}
+
+void SessionMap::replay_advance_version()
+{
+  // Bump the version and keep the projected version in step with it.
+  projected = ++version;
+}
+
+void SessionMap::replay_open_sessions(version_t event_cmapv,
+			    map<client_t,entity_inst_t>& client_map,
+			    map<client_t,client_metadata_t>& client_metadata_map)
+{
+  unsigned already_saved;  // assigned below, once the sanity check has passed
+  // Open the sessions listed in client_map so that version reaches event_cmapv.
+  if (version + client_map.size() < event_cmapv)
+    goto bad;  // opening every client still could not reach event_cmapv
+
+  // Server::finish_force_open_sessions() marks sessions dirty one by one.
+  // Marking a session dirty may flush all existing dirty sessions. So it's
+  // possible that some sessions are already saved in sessionmap.
+  already_saved = client_map.size() - (event_cmapv - version);
+  for (const auto& p : client_map) {
+    Session *s = get_or_add_session(p.second);
+    auto q = client_metadata_map.find(p.first);
+    if (q != client_metadata_map.end())
+      s->info.client_metadata.merge(q->second);
+
+    if (already_saved > 0) {
+      if (s->is_closed())
+	goto bad;  // a session counted as already saved must not be closed
+
+      --already_saved;
+      continue;
+    }
+
+    set_state(s, Session::STATE_OPEN);
+    replay_dirty_session(s);  // advances version by one
+  }
+  return;
+
+bad:
+  mds->clog->error() << "error replaying open sessions(" << client_map.size()
+		     << ") sessionmap v " << event_cmapv << " table " << version;
+  ceph_assert(g_conf()->mds_wipe_sessions);  // fatal unless this option is set
+  mds->sessionmap.wipe();
+  mds->sessionmap.set_version(event_cmapv);
+}
+
+version_t SessionMap::mark_projected(Session *s)
+{
+  dout(20) << __func__ << " s=" << s << " name=" << s->info.inst.name
+	   << " pv=" << projected << " -> " << projected + 1 << dendl;
+  // Bump the projected version and push it onto the session.
+  s->push_pv(++projected);
+  return projected;
+}
+
+namespace {
+class C_IO_SM_Save_One : public SessionMapIOContext {
+  MDSContext *on_safe;  // completed only when the write succeeded
+public:
+  C_IO_SM_Save_One(SessionMap *sm, MDSContext *on_safe_)
+    : SessionMapIOContext(sm), on_safe(on_safe_) {}
+  void finish(int r) override {
+    if (r == 0) {
+      on_safe->complete(r);
+    } else {
+      get_mds()->handle_write_error(r);
+    }
+  }
+  void print(ostream& os) const override {
+    os << "session_save_one";
+  }
+};
+} // anonymous namespace
+
+
+void SessionMap::save_if_dirty(const std::set<entity_name_t> &tgt_sessions,
+                               MDSGatherBuilder *gather_bld)
+{
+  ceph_assert(gather_bld != NULL);
+  // Write out target sessions with unsaved completed_requests changes only.
+  std::vector<entity_name_t> write_sessions;
+
+  // Decide which sessions require a write
+  for (std::set<entity_name_t>::iterator i = tgt_sessions.begin();
+       i != tgt_sessions.end(); ++i) {
+    const entity_name_t &session_id = *i;
+
+    if (session_map.count(session_id) == 0) {
+      // Session isn't around any more, never mind.
+      continue;
+    }
+
+    Session *session = session_map[session_id];
+    if (!session->has_dirty_completed_requests()) {
+      // Session hasn't had completed_requests
+      // modified since last write, no need to
+      // write it now.
+      continue;
+    }
+
+    if (dirty_sessions.count(session_id) > 0) {
+      // Session is already dirtied, will be written, no
+      // need to pre-empt that.
+      continue;
+    }
+    // Okay, passed all our checks, now we write
+    // this session out.  The version we write
+    // into the OMAP may now be higher-versioned
+    // than the version in the header, but that's
+    // okay because it's never a problem to have
+    // an overly-fresh copy of a session.
+    write_sessions.push_back(*i);
+  }
+
+  dout(4) << __func__ << ": writing " << write_sessions.size() << dendl;
+
+  // Batch writes into ops of at most mds_sessionmap_keys_per_op keys each
+  const uint32_t kpo = g_conf()->mds_sessionmap_keys_per_op;  // NOTE(review): 0 would make the modulo below divide by zero
+  map<string, bufferlist> to_set;
+  for (uint32_t i = 0; i < write_sessions.size(); ++i) {
+    const entity_name_t &session_id = write_sessions[i];
+    Session *session = session_map[session_id];
+    session->clear_dirty_completed_requests();
+
+    // Serialize K
+    CachedStackStringStream css;
+    *css << session_id;
+
+    // Serialize V
+    bufferlist bl;
+    session->info.encode(bl, mds->mdsmap->get_up_features());
+
+    // Add to RADOS op
+    to_set[css->str()] = bl;
+
+    // Complete this write transaction?  (last session, or the batch is full)
+    if (i == write_sessions.size() - 1
+        || i % kpo == kpo - 1) {
+      ObjectOperation op;
+      op.omap_set(to_set);
+      to_set.clear(); // clear to start a new transaction      
+
+      SnapContext snapc;
+      object_t oid = get_object_name();
+      object_locator_t oloc(mds->get_metadata_pool());
+      MDSContext *on_safe = gather_bld->new_sub();  // one gather sub-context per write op
+      mds->objecter->mutate(oid, oloc, op, snapc,
+			    ceph::real_clock::now(), 0,
+			    new C_OnFinisher(
+			      new C_IO_SM_Save_One(this, on_safe),
+			      mds->finisher));
+    }
+  }
+}
+
+// =================
+// Session
+
+#undef dout_prefix
+#define dout_prefix *_dout << "Session "
+
+/**
+ * Calculate the length of the `requests` member list,
+ * because elist does not have a size() method.
+ *
+ * O(N) runtime.
+ */
+size_t Session::get_request_count() const
+{
+  size_t n_requests = 0;
+  // Walk the list; the iterator itself reports when it is exhausted.
+  for (auto it = requests.begin(); !it.end(); ++it, ++n_requests) {}
+  return n_requests;
+}
+
+/**
+ * Called in response to a CEPH_MSG_CLIENT_CAPRELEASE message, where
+ * n_caps is the number of caps released in that message.  Tracks how many
+ * caps the client has released since it was last told to RECALL_STATE.
+ */
+void Session::notify_cap_release(size_t n_caps)
+{
+  const auto released = static_cast<double>(n_caps);
+  recall_caps.hit(-released);
+  release_caps.hit(released);
+}
+
+/**
+ * Called when a CEPH_MSG_CLIENT_SESSION->CEPH_SESSION_RECALL_STATE
+ * message is sent to the client.  Update our recall-related state
+ * in order to generate health metrics if the session doesn't see
+ * a commensurate number of calls to ::notify_cap_release
+ */
+uint64_t Session::notify_recall_sent(size_t new_limit)
+{
+  const auto num_caps = caps.size();
+  ceph_assert(new_limit < num_caps);  // Behaviour of Server::recall_client_state
+  const auto count = num_caps-new_limit;
+
+  // The recall only counts as a change when the limit differs from the
+  // session's current recall_limit; otherwise report zero.
+  const uint64_t new_change = (recall_limit != new_limit) ? count : 0;
+
+  /* Hit the per-session counters unconditionally: a RECALL message still
+   * goes to the client, and the MDS should not burn its global counter
+   * tokens on a session that is not releasing caps.  Keeping the session
+   * counters current lets them throttle future RECALL messages.
+   */
+  recall_caps_throttle.hit(count);
+  recall_caps_throttle2o.hit(count);
+  recall_caps.hit(count);
+
+  // Number of caps newly asked for, or 0 when the limit did not move.
+  return new_change;
+}
+
+/**
+ * Use client metadata to generate a somewhat-friendlier
+ * name for the client than its session ID.
+ *
+ * This is *not* guaranteed to be unique, and any machine
+ * consumers of session-related output should always use
+ * the session ID as the primary identifier and use this only
+ * as a presentation hint.
+ */
+void Session::_update_human_name()
+{
+  const auto hostname = info.client_metadata.find("hostname");
+  if (hostname == info.client_metadata.end()) {
+    // No hostname known: fall back to the numeric ID, e.g. client.4567
+    human_name = stringify(info.inst.name.num());
+    return;
+  }
+
+  // Preferred form: refer to the client by its hostname.
+  human_name = hostname->second;
+  if (info.auth_name.has_default_id())
+    return;
+  // A non-default entity ID was chosen by the user, so presumably they
+  // want to see it; append it when it is reasonably short.  Long IDs
+  // (e.g. uuid-generated names) do not belong in a "human readable"
+  // rendering.
+  const int arbitrarily_short = 16;
+  const auto& entity_id = info.auth_name.get_id();
+  if (entity_id.size() < arbitrarily_short)
+    human_name += std::string(":") + entity_id;
+}
+
+void Session::decode(bufferlist::const_iterator &p)
+{
+  info.decode(p);
+  // Seed the free set from the decoded prealloc set.
+  free_prealloc_inos = info.prealloc_inos;
+  // The display name is derived from the info that was just decoded.
+  _update_human_name();
+}
+
+int Session::check_access(CInode *in, unsigned mask,
+			  int caller_uid, int caller_gid,
+			  const vector<uint64_t> *caller_gid_list,
+			  int new_uid, int new_gid)
+{
+  string path;  // path that is handed to the auth caps check below
+  CInode *diri = NULL;  // stays NULL for a base inode
+  if (!in->is_base())
+    diri = in->get_projected_parent_dn()->get_dir()->get_inode();
+  if (diri && diri->is_stray()){
+    path = in->get_projected_inode()->stray_prior_path;  // presumably the path held before becoming a stray
+    dout(20) << __func__ << " stray_prior_path " << path << dendl;
+  } else {
+    in->make_path_string(path, true);
+    dout(20) << __func__ << " path " << path << dendl;
+  }
+  if (path.length())
+    path = path.substr(1);    // drop leading /
+
+  const auto& inode = in->get_inode();
+  if (in->is_dir() &&
+      inode->has_layout() &&
+      inode->layout.pool_ns.length() &&
+      !connection->has_feature(CEPH_FEATURE_FS_FILE_LAYOUT_V2)) {
+    dout(10) << __func__ << " client doesn't support FS_FILE_LAYOUT_V2" << dendl;
+    return -CEPHFS_EIO;  // directory layout has a pool namespace but the connection lacks the feature
+  }
+
+  if (!auth_caps.is_capable(path, inode->uid, inode->gid, inode->mode,
+			    caller_uid, caller_gid, caller_gid_list, mask,
+			    new_uid, new_gid,
+			    info.inst.addr)) {
+    return -CEPHFS_EACCES;  // the session's auth caps do not allow this access
+  }
+  return 0;  // access allowed
+}
+
+// track total and per session load
+void SessionMap::hit_session(Session *session) {
+  const uint64_t n_open = get_session_count_in_state(Session::STATE_OPEN);
+  const uint64_t n_stale = get_session_count_in_state(Session::STATE_STALE);
+  const uint64_t n_closing = get_session_count_in_state(Session::STATE_CLOSING);
+  const uint64_t sessions = n_open + n_stale + n_closing;
+  ceph_assert(sessions != 0);
+
+  const double total_load = total_load_avg.hit();
+  logger->set(l_mdssm_total_load, (uint64_t)total_load);
+  logger->set(l_mdssm_avg_load, (uint64_t)(total_load / sessions));
+
+  // Finally account the hit on the session itself.
+  session->hit_session();
+}
+
+void SessionMap::handle_conf_change(const std::set<std::string>& changed)
+{
+  auto apply_to_open_sessions = [this](auto f) {  // calls f on each OPEN, then each STALE session
+    if (auto it = by_state.find(Session::STATE_OPEN); it != by_state.end()) {
+      for (const auto &session : *(it->second)) {
+        f(session);
+      }
+    }
+    if (auto it = by_state.find(Session::STATE_STALE); it != by_state.end()) {
+      for (const auto &session : *(it->second)) {
+        f(session);
+      }
+    }
+  };
+
+  if (changed.count("mds_request_load_average_decay_rate")) {
+    auto d = g_conf().get_val<double>("mds_request_load_average_decay_rate");
+
+    decay_rate = d;  // also used for sessions opened later (see set_state)
+    total_load_avg = DecayCounter(d);  // replaced by a new counter built with the new rate
+
+    auto mut = [d](auto s) {
+      s->set_load_avg_decay_rate(d);
+    };
+    apply_to_open_sessions(mut);
+  }
+  if (changed.count("mds_recall_max_decay_rate")) {
+    auto d = g_conf().get_val<double>("mds_recall_max_decay_rate");
+    auto mut = [d](auto s) {
+      s->recall_caps_throttle = DecayCounter(d);  // new counter; the old one is discarded
+    };
+    apply_to_open_sessions(mut);
+  }
+  if (changed.count("mds_recall_warning_decay_rate")) {
+    auto d = g_conf().get_val<double>("mds_recall_warning_decay_rate");
+    auto mut = [d](auto s) {
+      s->recall_caps = DecayCounter(d);  // both warning counters share this rate
+      s->release_caps = DecayCounter(d);
+    };
+    apply_to_open_sessions(mut);
+  }
+  if (changed.count("mds_session_cache_liveness_decay_rate")) {
+    auto d = g_conf().get_val<double>("mds_session_cache_liveness_decay_rate");
+    auto mut = [d](auto s) {
+      s->session_cache_liveness = DecayCounter(d);
+      s->session_cache_liveness.hit(s->caps.size()); /* so the MDS doesn't immediately start trimming a new session */
+    };
+    apply_to_open_sessions(mut);
+  }
+  if (changed.count("mds_session_cap_acquisition_decay_rate")) {
+    auto d = g_conf().get_val<double>("mds_session_cap_acquisition_decay_rate");
+    auto mut = [d](auto s) {
+      s->cap_acquisition = DecayCounter(d);  // new counter; the old one is discarded
+    };
+    apply_to_open_sessions(mut);
+  }
+}
+
+void SessionMap::update_average_session_age() {
+  // With no sessions there is no average to report.
+  if (session_map.empty())
+    return;
+  const std::chrono::duration<double> since_avg_birth = clock::now() - avg_birth_time;
+  const double avg_uptime = since_avg_birth.count();
+  logger->set(l_mdssm_avg_session_uptime, (uint64_t)avg_uptime);
+}
+
+int SessionFilter::parse(
+    const std::vector<std::string> &args,
+    std::ostream *ss)
+{
+  ceph_assert(ss != NULL);
+  // Each argument is either a bare session id or a key=value filter term.
+  for (const auto &s : args) {
+    dout(20) << __func__ << " parsing filter '" << s << "'" << dendl;
+
+    auto eq = s.find("=");
+    if (eq == std::string::npos || eq == s.size()) {
+      // allow this to be a bare id for compatibility with pre-octopus asok
+      // 'session evict'.
+      std::string err;
+      id = strict_strtoll(s.c_str(), 10, &err);
+      if (!err.empty()) {
+	*ss << "Invalid filter '" << s << "'";
+	return -CEPHFS_EINVAL;
+      }
+      return 0;  // a bare id ends parsing; later arguments are not examined
+    }
+
+    // Keys that start with this are to be taken as referring
+    // to freeform client metadata fields.
+    const std::string metadata_prefix("client_metadata.");
+
+    auto k = s.substr(0, eq);
+    auto v = s.substr(eq + 1);
+
+    dout(20) << __func__ << " parsed k='" << k << "', v='" << v << "'" << dendl;
+
+    if (k.compare(0, metadata_prefix.size(), metadata_prefix) == 0
+        && k.size() > metadata_prefix.size()) {
+      // Filter on arbitrary metadata key (no fixed schema for this,
+      // so anything after the dot is a valid field to filter on)
+      auto metadata_key = k.substr(metadata_prefix.size());
+      metadata.insert(std::make_pair(metadata_key, v));
+    } else if (k == "auth_name") {
+      // Filter on client entity name
+      auth_name = v;
+    } else if (k == "state") {
+      state = v;
+    } else if (k == "id") {
+      std::string err;
+      id = strict_strtoll(v.c_str(), 10, &err);
+      if (!err.empty()) {
+        *ss << err;
+        return -CEPHFS_EINVAL;
+      }
+    } else if (k == "reconnecting") {
+
+      /**
+       * Strict boolean parser: true/false/0/1 set *out and give 0;
+       * anything else gives -CEPHFS_EINVAL.
+       */
+      auto is_true = [](std::string_view bstr, bool *out) -> int
+      {
+        ceph_assert(out != nullptr);
+
+        if (bstr == "true" || bstr == "1") {
+          *out = true;
+          return 0;
+        } else if (bstr == "false" || bstr == "0") {
+          *out = false;
+          return 0;
+        } else {
+          return -CEPHFS_EINVAL;
+        }
+      };
+
+      bool bval;
+      int r = is_true(v, &bval);  // 0 on success, negative error code otherwise
+      if (r == 0) {
+        set_reconnecting(bval);
+      } else {
+        *ss << "Invalid boolean value '" << v << "'";
+        return -CEPHFS_EINVAL;
+      }
+    } else {
+      *ss << "Invalid filter key '" << k << "'";
+      return -CEPHFS_EINVAL;
+    }
+  }
+
+  return 0;
+}
+
+bool SessionFilter::match(
+    const Session &session,
+    std::function<bool(client_t)> is_reconnecting) const
+{
+  // Every requested metadata field must be present on the session
+  // with exactly the requested value.
+  const auto &client_md = session.info.client_metadata;
+  for (const auto &[key, wanted] : metadata) {
+    const auto found = client_md.find(key);
+    if (found == client_md.end() || found->second != wanted)
+      return false;
+  }
+
+  // An unset (empty) auth_name matches any session.
+  if (!auth_name.empty() && auth_name != session.info.auth_name.get_id())
+    return false;
+
+  // An unset (empty) state matches any session.
+  if (!state.empty() && state != session.get_state_name())
+    return false;
+
+  // id 0 means no filtering on the session ID.
+  const auto session_id = session.info.inst.name.num();
+  if (id != 0 && id != session_id)
+    return false;
+
+  // reconnecting.first says whether that filter is set at all;
+  // reconnecting.second is the value a session must have.
+  if (reconnecting.first) {
+    const bool am_reconnecting = is_reconnecting(session_id);
+    if (reconnecting.second != am_reconnecting)
+      return false;
+  }
+
+  // None of the active criteria rejected the session.
+  return true;
+}
+
+std::ostream& operator<<(std::ostream &out, const Session &s)
+{
+  // Always print the human name; add the client ID in parentheses only
+  // when the name is not already just that ID.
+  out << s.get_human_name();
+  if (s.get_human_name() != stringify(s.get_client()))
+    out << " (" << std::dec << s.get_client() << ")";
+  return out;
+}
+
diff --git a/src/mds/SessionMap.h b/src/mds/SessionMap.h
new file mode 100644
index 000000000..e59f7f264
--- /dev/null
+++ b/src/mds/SessionMap.h
@@ -0,0 +1,844 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software 
+ * Foundation.  See file COPYING.
+ * 
+ */
+
+#ifndef CEPH_MDS_SESSIONMAP_H
+#define CEPH_MDS_SESSIONMAP_H
+
+#include <set>
+
+#include "include/unordered_map.h"
+
+#include "include/Context.h"
+#include "include/xlist.h"
+#include "include/elist.h"
+#include "include/interval_set.h"
+#include "mdstypes.h"
+#include "mds/MDSAuthCaps.h"
+#include "common/perf_counters.h"
+#include "common/DecayCounter.h"
+
+#include "CInode.h"
+#include "Capability.h"
+#include "MDSContext.h"
+#include "msg/Message.h"
+
+struct MDRequestImpl;
+
+enum {  // ids of the session map perf counters (used with SessionMapStore::logger)
+  l_mdssm_first = 5500,  // start of the l_mdssm_* id range
+  l_mdssm_session_count,  // set to session_map.size() by get_or_add_session()
+  l_mdssm_session_add,  // incremented when get_or_add_session() creates a session
+  l_mdssm_session_remove,
+  l_mdssm_session_open,
+  l_mdssm_session_stale,
+  l_mdssm_total_load,
+  l_mdssm_avg_load,
+  l_mdssm_avg_session_uptime,
+  l_mdssm_last,  // end of the l_mdssm_* id range
+};
+
+class CInode;
+
+/* 
+ * session
+ */
+
+class Session : public RefCountedObject {  // MDS-side state for one client session
+  // -- state etc --
+public:
+  /*
+    Session states (names match the STATE_* enum below):
+        <deleted> <-- closed <------------+
+             ^         |                  |
+             |         v                  |
+          killing <-- opening <----+      |
+             ^         |           |      |
+             |         v           |      |
+           stale <--> open --> closing ---+
+
+    + additional dimension of 'importing' (with counter)
+
+  */
+
+  using clock = ceph::coarse_mono_clock;
+  using time = ceph::coarse_mono_time;
+
+  enum {
+    STATE_CLOSED = 0,
+    STATE_OPENING = 1,   // journaling open
+    STATE_OPEN = 2,
+    STATE_CLOSING = 3,   // journaling close
+    STATE_STALE = 4,
+    STATE_KILLING = 5
+  };
+
+  Session() = delete;
+  Session(ConnectionRef con) :  // decay rates are read from g_conf() at construction
+    item_session_list(this),
+    requests(member_offset(MDRequestImpl, item_session_request)),
+    recall_caps(g_conf().get_val<double>("mds_recall_warning_decay_rate")),
+    release_caps(g_conf().get_val<double>("mds_recall_warning_decay_rate")),
+    recall_caps_throttle(g_conf().get_val<double>("mds_recall_max_decay_rate")),
+    recall_caps_throttle2o(0.5),  // fixed rate, not taken from the configuration
+    session_cache_liveness(g_conf().get_val<double>("mds_session_cache_liveness_decay_rate")),
+    cap_acquisition(g_conf().get_val<double>("mds_session_cap_acquisition_decay_rate")),
+    birth_time(clock::now())
+  {
+    set_connection(std::move(con));
+  }
+  ~Session() override {
+    ceph_assert(!item_session_list.is_on_list());  // must no longer be linked into a session list
+    preopen_out_queue.clear();
+  }
+
+  static std::string_view get_state_name(int s) {  // "???" for values outside the STATE_* enum
+    switch (s) {
+    case STATE_CLOSED: return "closed";
+    case STATE_OPENING: return "opening";
+    case STATE_OPEN: return "open";
+    case STATE_CLOSING: return "closing";
+    case STATE_STALE: return "stale";
+    case STATE_KILLING: return "killing";
+    default: return "???";
+    }
+  }
+
+  void dump(ceph::Formatter *f, bool cap_dump=false) const;
+  void push_pv(version_t pv)
+  {
+    ceph_assert(projected.empty() || projected.back() != pv);  // no immediate duplicate
+    projected.push_back(pv);
+  }
+
+  void pop_pv(version_t v)
+  {
+    ceph_assert(!projected.empty());
+    ceph_assert(projected.front() == v);  // versions leave in the order they were pushed
+    projected.pop_front();
+  }
+
+  int get_state() const { return state; }
+  void set_state(int new_state)
+  {
+    if (state != new_state) {
+      state = new_state;
+      state_seq++;  // only bumped when the state actually changes
+    }
+  }
+
+  void set_reconnecting(bool s) { reconnecting = s; }
+
+  void decode(ceph::buffer::list::const_iterator &p);
+  template<typename T>
+  void set_client_metadata(T&& meta)
+  {
+    info.client_metadata = std::forward<T>(meta);
+    _update_human_name();  // keep the derived human name in step with the metadata
+  }
+
+  const std::string& get_human_name() const {return human_name;}
+
+  size_t get_request_count() const;
+
+  void notify_cap_release(size_t n_caps);
+  uint64_t notify_recall_sent(size_t new_limit);
+  auto get_recall_caps_throttle() const {
+    return recall_caps_throttle.get();
+  }
+  auto get_recall_caps_throttle2o() const {
+    return recall_caps_throttle2o.get();
+  }
+  auto get_recall_caps() const {
+    return recall_caps.get();
+  }
+  auto get_release_caps() const {
+    return release_caps.get();
+  }
+  auto get_session_cache_liveness() const {
+    return session_cache_liveness.get();
+  }
+  auto get_cap_acquisition() const {
+    return cap_acquisition.get();
+  }
+
+  inodeno_t take_ino(inodeno_t ino = 0) {  // returns 0 when no ino could be taken
+    if (ino) {
+      if (!info.prealloc_inos.contains(ino))
+        return 0;  // requested ino is not preallocated to this session
+      if (delegated_inos.contains(ino)) {
+	delegated_inos.erase(ino);
+      } else if (free_prealloc_inos.contains(ino)) {
+	free_prealloc_inos.erase(ino);
+      } else {
+	ceph_assert(0);  // a preallocated ino must be in one of the two sets
+      }
+    } else if (!free_prealloc_inos.empty()) {
+      ino = free_prealloc_inos.range_start();  // no specific ino requested
+      free_prealloc_inos.erase(ino);
+    }
+    return ino;
+  }
+
+  void delegate_inos(int want, interval_set<inodeno_t>& inos) {  // newly delegated inos are added to 'inos'
+    want -= (int)delegated_inos.size();  // only top up to 'want' delegated inos
+    if (want <= 0)
+      return;
+
+    for (auto it = free_prealloc_inos.begin(); it != free_prealloc_inos.end(); ) {
+      if (want < (int)it.get_len()) {  // part of this interval is enough
+	inos.insert(it.get_start(), (inodeno_t)want);
+	delegated_inos.insert(it.get_start(), (inodeno_t)want);
+	free_prealloc_inos.erase(it.get_start(), (inodeno_t)want);
+	break;
+      }
+      want -= (int)it.get_len();  // otherwise move the whole interval
+      inos.insert(it.get_start(), it.get_len());
+      delegated_inos.insert(it.get_start(), it.get_len());
+      free_prealloc_inos.erase(it++);
+      if (want <= 0)
+	break;
+    }
+  }
+
+  // sans any delegated ones
+  int get_num_prealloc_inos() const {
+    return free_prealloc_inos.size();
+  }
+
+  int get_num_projected_prealloc_inos() const {  // free plus still-pending preallocations
+    return get_num_prealloc_inos() + pending_prealloc_inos.size();
+  }
+
+  client_t get_client() const {
+    return info.get_client();
+  }
+
+  std::string_view get_state_name() const { return get_state_name(state); }
+  uint64_t get_state_seq() const { return state_seq; }
+  bool is_closed() const { return state == STATE_CLOSED; }
+  bool is_opening() const { return state == STATE_OPENING; }
+  bool is_open() const { return state == STATE_OPEN; }
+  bool is_closing() const { return state == STATE_CLOSING; }
+  bool is_stale() const { return state == STATE_STALE; }
+  bool is_killing() const { return state == STATE_KILLING; }
+
+  void inc_importing() {
+    ++importing_count;
+  }
+  void dec_importing() {
+    ceph_assert(importing_count > 0);
+    --importing_count;
+  }
+  bool is_importing() const { return importing_count > 0; }
+
+  void set_load_avg_decay_rate(double rate) {
+    ceph_assert(is_open() || is_stale());
+    load_avg = DecayCounter(rate);  // replaces the counter with a fresh one
+  }
+  uint64_t get_load_avg() const {
+    return (uint64_t)load_avg.get();
+  }
+  void hit_session() {
+    load_avg.adjust();
+  }
+
+  double get_session_uptime() const {  // seconds elapsed since birth_time
+    std::chrono::duration<double> uptime = clock::now() - birth_time;
+    return uptime.count();
+  }
+
+  time get_birth_time() const {
+    return birth_time;
+  }
+
+  void inc_cap_gen() { ++cap_gen; }
+  uint32_t get_cap_gen() const { return cap_gen; }
+
+  version_t inc_push_seq() { return ++cap_push_seq; }
+  version_t get_push_seq() const { return cap_push_seq; }
+
+  version_t wait_for_flush(MDSContext* c) {  // queues c under the current push seq
+    waitfor_flush[get_push_seq()].push_back(c);
+    return get_push_seq();
+  }
+  void finish_flush(version_t seq, MDSContext::vec& ls) {  // moves waiters with key <= seq into ls
+    while (!waitfor_flush.empty()) {
+      auto it = waitfor_flush.begin();
+      if (it->first > seq)
+	break;
+      auto& v = it->second;
+      ls.insert(ls.end(), v.begin(), v.end());
+      waitfor_flush.erase(it);
+    }
+  }
+
+  void touch_readdir_cap(uint32_t count) {
+    cap_acquisition.hit(count);
+  }
+
+  void touch_cap(Capability *cap) {
+    session_cache_liveness.hit(1.0);
+    caps.push_front(&cap->item_session_caps);  // front of 'caps' = most recently used
+  }
+
+  void touch_cap_bottom(Capability *cap) {
+    session_cache_liveness.hit(1.0);
+    caps.push_back(&cap->item_session_caps);  // back of 'caps', i.e. the opposite end
+  }
+
+  void touch_lease(ClientLease *r) {
+    session_cache_liveness.hit(1.0);
+    leases.push_back(&r->item_session_lease);
+  }
+
+  bool is_any_flush_waiter() {
+    return !waitfor_flush.empty();
+  }
+
+  void add_completed_request(ceph_tid_t t, inodeno_t created) {
+    info.completed_requests[t] = created;
+    completed_requests_dirty = true;
+  }
+  bool trim_completed_requests(ceph_tid_t mintid) {
+    // trim entries with tid < mintid; mintid == 0 trims everything
+    bool erased_any = false;
+    while (!info.completed_requests.empty() && 
+	   (mintid == 0 || info.completed_requests.begin()->first < mintid)) {
+      info.completed_requests.erase(info.completed_requests.begin());
+      erased_any = true;
+    }
+
+    if (erased_any) {
+      completed_requests_dirty = true;
+    }
+    return erased_any;
+  }
+  bool have_completed_request(ceph_tid_t tid, inodeno_t *pcreated) const {
+    auto p = info.completed_requests.find(tid);
+    if (p == info.completed_requests.end())
+      return false;
+    if (pcreated)  // optional out-parameter
+      *pcreated = p->second;
+    return true;
+  }
+
+  void add_completed_flush(ceph_tid_t tid) {
+    info.completed_flushes.insert(tid);
+  }
+  bool trim_completed_flushes(ceph_tid_t mintid) {  // same mintid convention as trim_completed_requests()
+    bool erased_any = false;
+    while (!info.completed_flushes.empty() &&
+	(mintid == 0 || *info.completed_flushes.begin() < mintid)) {
+      info.completed_flushes.erase(info.completed_flushes.begin());
+      erased_any = true;
+    }
+    if (erased_any) {
+      completed_requests_dirty = true;  // flushes share the completed-requests dirty flag
+    }
+    return erased_any;
+  }
+  bool have_completed_flush(ceph_tid_t tid) const {
+    return info.completed_flushes.count(tid);
+  }
+
+  uint64_t get_num_caps() const {
+    return caps.size();
+  }
+
+  unsigned get_num_completed_flushes() const { return info.completed_flushes.size(); }
+  unsigned get_num_trim_flushes_warnings() const {
+    return num_trim_flushes_warnings;
+  }
+  void inc_num_trim_flushes_warnings() { ++num_trim_flushes_warnings; }
+  void reset_num_trim_flushes_warnings() { num_trim_flushes_warnings = 0; }
+
+  unsigned get_num_completed_requests() const { return info.completed_requests.size(); }
+  unsigned get_num_trim_requests_warnings() const {
+    return num_trim_requests_warnings;
+  }
+  void inc_num_trim_requests_warnings() { ++num_trim_requests_warnings; }
+  void reset_num_trim_requests_warnings() { num_trim_requests_warnings = 0; }
+
+  bool has_dirty_completed_requests() const
+  {
+    return completed_requests_dirty;
+  }
+
+  void clear_dirty_completed_requests()
+  {
+    completed_requests_dirty = false;
+  }
+
+  int check_access(CInode *in, unsigned mask, int caller_uid, int caller_gid,
+		   const std::vector<uint64_t> *gid_list, int new_uid, int new_gid);
+
+  bool fs_name_capable(std::string_view fs_name, unsigned mask) const {
+    return auth_caps.fs_name_capable(fs_name, mask);
+  }
+
+  void set_connection(ConnectionRef con) {
+    connection = std::move(con);
+    auto& c = connection;
+    if (c) {  // a null connection leaves the identity fields in 'info' untouched
+      info.auth_name = c->get_peer_entity_name();
+      info.inst.addr = c->get_peer_socket_addr();
+      info.inst.name = entity_name_t(c->get_peer_type(), c->get_peer_global_id());
+    }
+  }
+  const ConnectionRef& get_connection() const {
+    return connection;
+  }
+
+  void clear() {  // drops ino bookkeeping and info metadata, resets cap push seq and renew time
+    pending_prealloc_inos.clear();
+    free_prealloc_inos.clear();
+    delegated_inos.clear();
+    info.clear_meta();
+
+    cap_push_seq = 0;
+    last_cap_renew = clock::zero();
+  }
+
+  Session *reclaiming_from = nullptr;
+  session_info_t info;                         ///< durable bits
+  MDSAuthCaps auth_caps;
+
+  xlist<Session*>::item item_session_list;
+
+  std::list<ceph::ref_t<Message>> preopen_out_queue;  ///< messages for client, queued before they connect
+
+  /* This is mutable to allow get_request_count to be const. elist does not
+   * support const iterators yet.
+   */
+  mutable elist<MDRequestImpl*> requests;
+
+  interval_set<inodeno_t> pending_prealloc_inos; // journaling prealloc, will be added to prealloc_inos
+  interval_set<inodeno_t> free_prealloc_inos; // available to take_ino() / delegate_inos()
+  interval_set<inodeno_t> delegated_inos; // hand these out to client
+
+  xlist<Capability*> caps;     // inodes with caps; front=most recently used
+  xlist<ClientLease*> leases;  // metadata leases to clients
+  time last_cap_renew = clock::zero();
+  time last_seen = clock::zero();
+
+  // -- leases --
+  uint32_t lease_seq = 0;
+
+protected:
+  ConnectionRef connection;
+
+private:
+  friend class SessionMap;
+
+  // Human (friendly) name is soft state generated from client metadata
+  void _update_human_name();
+
+  int state = STATE_CLOSED;
+  bool reconnecting = false;
+  uint64_t state_seq = 0;
+  int importing_count = 0;
+
+  std::string human_name;
+
+  // Versions in which this session was projected: used to verify
+  // that appropriate mark_dirty calls follow.
+  std::deque<version_t> projected;
+
+  // request load average for this session
+  DecayCounter load_avg;
+
+  // Ephemeral state for tracking progress of capability recalls
+  // caps being recalled recently by this session; used for Beacon warnings
+  DecayCounter recall_caps;
+  DecayCounter release_caps;  // caps that have been released
+  // throttle on caps recalled
+  DecayCounter recall_caps_throttle;
+  // second order throttle that prevents recalling too quickly
+  DecayCounter recall_caps_throttle2o;
+  // New limit in SESSION_RECALL
+  uint32_t recall_limit = 0;
+
+  // session caps liveness
+  DecayCounter session_cache_liveness;
+
+  // cap acquisition via readdir
+  DecayCounter cap_acquisition;
+
+  // session start time -- used to track average session time
+  // note that this is initialized in the constructor rather
+  // than at the time of adding a session to the sessionmap
+  // as journal replay of sessionmap will not call add_session().
+  time birth_time;
+
+  // -- caps --
+  uint32_t cap_gen = 0;
+  version_t cap_push_seq = 0;        // cap push seq #
+  std::map<version_t, MDSContext::vec > waitfor_flush; // flush session messages
+
+  // Has completed_requests been modified since the last time we
+  // wrote this session out?
+  bool completed_requests_dirty = false;
+
+  unsigned num_trim_flushes_warnings = 0;
+  unsigned num_trim_requests_warnings = 0;
+};
+
+// Selection criteria for client sessions; see match() and parse().
+class SessionFilter
+{
+public:
+  SessionFilter() : reconnecting{false, false} {}
+
+  // True when `session` satisfies every criterion that has been set.
+  bool match(const Session &session,
+             std::function<bool(client_t)> is_reconnecting) const;
+  // Returns 0 on success; a parse error is described on *ss and returned < 0.
+  int parse(const std::vector<std::string> &args, std::ostream *ss);
+  // Activates the reconnecting criterion and records the required value.
+  void set_reconnecting(bool v) { reconnecting = std::make_pair(true, v); }
+
+  std::map<std::string, std::string> metadata;  // pairs the session must carry
+  std::string auth_name;                        // empty = not filtered
+  std::string state;                            // empty = not filtered
+  int64_t id = 0;                               // 0 = not filtered
+
+protected:
+  // .first: is the criterion active?  .second: value match() requires.
+  std::pair<bool, bool> reconnecting;
+};
+
+/*
+ * session map
+ */
+
+class MDSRank;
+
+/**
+ * Encapsulate the serialized state associated with SessionMap.  Allows
+ * encode/decode outside of live MDS instance.
+ */
+class SessionMapStore {
+public:
+  using clock = Session::clock;
+  using time = Session::time;
+
+  // decay_rate is declared before total_load_avg, so it is already
+  // initialised when it is used here.
+  SessionMapStore() : total_load_avg(decay_rate) {}
+  virtual ~SessionMapStore() {}
+
+  version_t get_version() const { return version; }
+
+  // Serialisation entry points; the virtual ones may be overridden.
+  virtual void encode_header(ceph::buffer::list *header_bl);
+  virtual void decode_header(ceph::buffer::list &header_bl);
+  virtual void decode_values(std::map<std::string, ceph::buffer::list> &session_vals);
+  virtual void decode_legacy(ceph::buffer::list::const_iterator& blp);
+  void dump(ceph::Formatter *f) const;
+
+  void set_rank(mds_rank_t r) { rank = r; }
+
+  // Returns the session registered under i.name.  When there is none, a
+  // connection-less Session is created, registered and counted first.
+  Session* get_or_add_session(const entity_inst_t& i) {
+    if (auto known = session_map.find(i.name); known != session_map.end()) {
+      return known->second;
+    }
+
+    Session *fresh = new Session(ConnectionRef());
+    session_map[i.name] = fresh;
+    fresh->info.inst = i;
+    fresh->last_cap_renew = Session::clock::now();
+    if (logger) {
+      logger->set(l_mdssm_session_count, session_map.size());
+      logger->inc(l_mdssm_session_add);
+    }
+    return fresh;
+  }
+
+  static void generate_test_instances(std::list<SessionMapStore*>& ls);
+
+  // Forgets every session pointer held in the map.
+  void reset_state() { session_map.clear(); }
+
+  mds_rank_t rank = MDS_RANK_NONE;
+
+protected:
+  version_t version = 0;
+  // entity name -> session, stored as raw pointers
+  ceph::unordered_map<entity_name_t, Session*> session_map;
+  PerfCounters *logger = nullptr;
+
+  // Decay rate for total_load_avg, read from the configuration when the
+  // store is constructed.
+  double decay_rate = g_conf().get_val<double>("mds_request_load_average_decay_rate");
+  DecayCounter total_load_avg;  // total request load avg
+};
+
+class SessionMap : public SessionMapStore {  // live session table attached to one MDSRank
+public:
+  SessionMap() = delete;
+  explicit SessionMap(MDSRank *m) : mds(m) {}
+
+  ~SessionMap() override
+  {
+    for (const auto &p : by_state)  // the per-state lists are deleted here
+      delete p.second;
+
+    if (logger) {
+      g_ceph_context->get_perfcounters_collection()->remove(logger);
+    }
+
+    delete logger;
+  }
+
+  uint64_t set_state(Session *session, int state);
+  void update_average_session_age();
+
+  void register_perfcounters();
+
+  void set_version(const version_t v)  // also resets the projected version to v
+  {
+    version = projected = v;
+  }
+
+  void set_projected(const version_t v)
+  {
+    projected = v;
+  }
+
+  version_t get_projected() const
+  {
+    return projected;
+  }
+
+  version_t get_committed() const
+  {
+    return committed;
+  }
+
+  version_t get_committing() const
+  {
+    return committing;
+  }
+
+  // sessions
+  void decode_legacy(ceph::buffer::list::const_iterator& blp) override;
+  bool empty() const { return session_map.empty(); }
+  const auto& get_sessions() const {
+    return session_map;
+  }
+
+  bool is_any_state(int state) const {  // true when at least one session is listed under 'state'
+    auto it = by_state.find(state);
+    if (it == by_state.end() || it->second->empty())
+      return false;
+    return true;
+  }
+
+  bool have_unclosed_sessions() const {  // any state other than STATE_CLOSED
+    return
+      is_any_state(Session::STATE_OPENING) ||
+      is_any_state(Session::STATE_OPEN) ||
+      is_any_state(Session::STATE_CLOSING) ||
+      is_any_state(Session::STATE_STALE) ||
+      is_any_state(Session::STATE_KILLING);
+  }
+  bool have_session(entity_name_t w) const {
+    return session_map.count(w);
+  }
+  Session* get_session(entity_name_t w) {  // nullptr when w has no session
+    auto session_map_entry = session_map.find(w);
+    return (session_map_entry != session_map.end() ?
+	    session_map_entry->second : nullptr);
+  }
+  const Session* get_session(entity_name_t w) const {  // nullptr when w has no session
+    auto p = session_map.find(w);
+    return p != session_map.end() ? p->second : nullptr;
+  }
+
+  void add_session(Session *s);
+  void remove_session(Session *s);
+  void touch_session(Session *session);
+
+  Session *get_oldest_session(int state) {  // front of the state's list, or nullptr
+    auto by_state_entry = by_state.find(state);
+    if (by_state_entry == by_state.end() || by_state_entry->second->empty())
+      return nullptr;
+    return by_state_entry->second->front();
+  }
+
+  void dump();
+
+  template<typename F>
+  void get_client_sessions(F&& f) const {  // calls f(Session*) for every client-named session
+    for (const auto& p : session_map) {
+      auto& session = p.second;
+      if (session->info.inst.name.is_client())
+	f(session);
+    }
+  }
+  template<typename C>
+  void get_client_session_set(C& c) const {  // inserts those sessions into c
+    auto f = [&c](auto& s) {
+      c.insert(s);
+    };
+    get_client_sessions(f);
+  }
+
+  // helpers
+  entity_inst_t& get_inst(entity_name_t w) {  // w must have a session
+    auto it = session_map.find(w);
+    ceph_assert(it != session_map.end());
+    return it->second->info.inst;
+  }
+  version_t get_push_seq(client_t client) {  // the client must have a session
+    Session *session = get_session(entity_name_t::CLIENT(client.v));
+    ceph_assert(session);  // fail loudly instead of dereferencing a null session
+    return session->get_push_seq();
+  }
+  bool have_completed_request(metareqid_t rid) {  // false when rid.name has no session
+    Session *session = get_session(rid.name);
+    return session && session->have_completed_request(rid.tid, nullptr);
+  }
+  void trim_completed_requests(entity_name_t c, ceph_tid_t tid) {
+    Session *session = get_session(c);
+    ceph_assert(session);
+    session->trim_completed_requests(tid);
+  }
+
+  void wipe();
+  void wipe_ino_prealloc();
+
+  object_t get_object_name() const;
+
+  void load(MDSContext *onload);
+  void _load_finish(
+      int operation_r,
+      int header_r,
+      int values_r,
+      bool first,
+      ceph::buffer::list &header_bl,
+      std::map<std::string, ceph::buffer::list> &session_vals,
+      bool more_session_vals);
+
+  void load_legacy();
+  void _load_legacy_finish(int r, ceph::buffer::list &bl);
+
+  void save(MDSContext *onsave, version_t needv=0);
+  void _save_finish(version_t v);
+
+  /**
+   * Advance the version, and mark this session
+   * as dirty within the new version.
+   *
+   * Dirty means journalled but needing writeback
+   * to the backing store.  Must have called
+   * mark_projected previously for this session.
+   */
+  void mark_dirty(Session *session, bool may_save=true);
+
+  /**
+   * Advance the projected version, and mark this
+   * session as projected within the new version
+   *
+   * Projected means the session is updated in memory
+   * but we're waiting for the journal write of the update
+   * to finish.  Must subsequently call mark_dirty
+   * for sessions in the same global order as calls
+   * to mark_projected.
+   */
+  version_t mark_projected(Session *session);
+
+  /**
+   * During replay, advance versions to account
+   * for a session modification, and mark the
+   * session dirty.
+   */
+  void replay_dirty_session(Session *session);
+
+  /**
+   * During replay, if a session no longer present
+   * would have consumed a version, advance `version`
+   * and `projected` to account for that.
+   */
+  void replay_advance_version();
+
+  /**
+   * During replay, open sessions, advance versions and
+   * mark these sessions as dirty.
+   */
+  void replay_open_sessions(version_t event_cmapv,
+			    std::map<client_t,entity_inst_t>& client_map,
+			    std::map<client_t,client_metadata_t>& client_metadata_map);
+
+  /**
+   * For these session IDs, if a session exists with this ID, and it has
+   * dirty completed_requests, then persist it immediately
+   * (ahead of usual project/dirty versioned writes
+   *  of the map).
+   */
+  void save_if_dirty(const std::set<entity_name_t> &tgt_sessions,
+                     MDSGatherBuilder *gather_bld);
+
+  void hit_session(Session *session);
+  void handle_conf_change(const std::set <std::string> &changed);
+
+  MDSRank *mds;
+  std::map<int,xlist<Session*>*> by_state;  // Session::STATE_* -> list of sessions
+  std::map<version_t, MDSContext::vec> commit_waiters;
+
+  // -- loading, saving --
+  inodeno_t ino;
+  MDSContext::vec waiting_for_load;
+
+protected:
+  void _mark_dirty(Session *session, bool may_save);
+
+  version_t projected = 0, committing = 0, committed = 0;
+  std::set<entity_name_t> dirty_sessions;
+  std::set<entity_name_t> null_sessions;
+  bool loaded_legacy = false;
+
+private:
+  uint64_t get_session_count_in_state(int state) {  // 0 when the state has no list
+    auto it = by_state.find(state);  // find() never inserts, unlike operator[]
+    return (it == by_state.end()) ? 0 : it->second->size();
+  }
+
+  void update_average_birth_time(const Session &s, bool added=true) {  // running mean over session_map.size() sessions
+    uint32_t sessions = session_map.size();
+    time birth_time = s.get_birth_time();
+
+    if (sessions == 1) {  // single session: the mean is its birth time, or zero on removal
+      avg_birth_time = added ? birth_time : clock::zero();
+      return;
+    }
+
+    if (added) {  // s is already counted in 'sessions'
+      avg_birth_time = clock::time_point(
+        ((avg_birth_time - clock::zero()) / sessions) * (sessions - 1) +
+        (birth_time - clock::zero()) / sessions);
+    } else {  // s is still counted in 'sessions'
+      avg_birth_time = clock::time_point(
+        ((avg_birth_time - clock::zero()) / (sessions - 1)) * sessions -
+        (birth_time - clock::zero()) / (sessions - 1));
+    }
+  }
+
+  time avg_birth_time = clock::zero();
+};
+
+std::ostream& operator<<(std::ostream &out, const Session &s);
+#endif
diff --git a/src/mds/SimpleLock.cc b/src/mds/SimpleLock.cc
new file mode 100644
index 000000000..76448ee9d
--- /dev/null
+++ b/src/mds/SimpleLock.cc
@@ -0,0 +1,108 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2015 Red Hat
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software 
+ * Foundation.  See file COPYING.
+ * 
+ */
+
+
+#include "SimpleLock.h"
+#include "Mutation.h"
+
+void SimpleLock::dump(ceph::Formatter *f) const {
+  ceph_assert(f != NULL);
+  // Nothing is emitted while is_sync_and_unlocked() holds.
+  if (is_sync_and_unlocked())
+    return;
+
+  // "gather_set" is always opened and closed; entries need have_more().
+  f->open_array_section("gather_set");
+  if (have_more()) {
+    for (const auto &rank : more()->gather_set)
+      f->dump_int("rank", rank);
+  }
+  f->close_section();
+
+  f->dump_string("state", get_state_name(get_state()));
+  f->dump_bool("is_leased", is_leased());
+  f->dump_int("num_rdlocks", get_num_rdlocks());
+  f->dump_int("num_wrlocks", get_num_wrlocks());
+  f->dump_int("num_xlocks", get_num_xlocks());
+  // "xlock_by" stays an empty object when get_xlock_by() is null.
+  f->open_object_section("xlock_by");
+  if (const auto holder = get_xlock_by(); holder)
+    holder->dump(f);
+  f->close_section();
+}
+
+int SimpleLock::get_wait_shift() const {
+  constexpr int base = 8, width = SimpleLock::WAIT_BITS;  // first bit used, bits per lock type
+  switch (get_type()) {  // each known lock type gets its own `width`-bit field
+    case CEPH_LOCK_DN:       return base;
+    case CEPH_LOCK_DVERSION: return base +  1*width;
+    case CEPH_LOCK_IAUTH:    return base +  2*width;
+    case CEPH_LOCK_ILINK:    return base +  3*width;
+    case CEPH_LOCK_IDFT:     return base +  4*width;
+    case CEPH_LOCK_IFILE:    return base +  5*width;
+    case CEPH_LOCK_IVERSION: return base +  6*width;
+    case CEPH_LOCK_IXATTR:   return base +  7*width;
+    case CEPH_LOCK_ISNAP:    return base +  8*width;
+    case CEPH_LOCK_INEST:    return base +  9*width;
+    case CEPH_LOCK_IFLOCK:   return base + 10*width;
+    case CEPH_LOCK_IPOLICY:  return base + 11*width;
+    default: ceph_abort();  // unknown lock type
+  }
+}
+
+int SimpleLock::get_cap_shift() const {
+  // CEPH_CAP_S* shift for the four lock types listed here; 0 for the rest.
+  const auto type = get_type();
+  if (type == CEPH_LOCK_IAUTH)  return CEPH_CAP_SAUTH;
+  if (type == CEPH_LOCK_ILINK)  return CEPH_CAP_SLINK;
+  if (type == CEPH_LOCK_IFILE)  return CEPH_CAP_SFILE;
+  if (type == CEPH_LOCK_IXATTR) return CEPH_CAP_SXATTR;
+  return 0;
+}
+
+int SimpleLock::get_cap_mask() const {
+  // The file lock uses CEPH_CAP_FILE_BITS cap bits, every other lock type
+  // CEPH_CAP_SIMPLE_BITS; the mask has that many low bits set.
+  const int nbits = (get_type() == CEPH_LOCK_IFILE) ? CEPH_CAP_FILE_BITS : CEPH_CAP_SIMPLE_BITS;
+  return (1 << nbits) - 1;
+}
+
+SimpleLock::unstable_bits_t::unstable_bits_t() :  // lock_caches links items through their item_lock member
+  lock_caches(member_offset(MDLockCache::LockItem, item_lock)) {}
+
+void SimpleLock::add_cache(MDLockCacheItem& item) {  // attach a lock-cache item to this lock
+  more()->lock_caches.push_back(&item.item_lock);
+  state_flags |= CACHED;  // cleared again by remove_cache() once the list is empty
+}
+
+void SimpleLock::remove_cache(MDLockCacheItem& item) {
+  // Unlink the item; the CACHED flag goes away with the last attached item.
+  auto& remaining = more()->lock_caches;
+  item.item_lock.remove_myself();
+  if (!remaining.empty()) return;
+  state_flags &= ~CACHED;
+  try_clear_more();
+}
+
+std::vector<MDLockCache*> SimpleLock::get_active_caches() {
+  // Parent caches of all attached items, minus those marked invalidating.
+  std::vector<MDLockCache*> active;
+  if (!have_more()) return active;
+  auto it = more()->lock_caches.begin_use_current();
+  for (; !it.end(); ++it) {
+    if (auto cache = (*it)->parent; !cache->invalidating)
+      active.push_back(cache);
+  }
+  return active;
+}
diff --git a/src/mds/SimpleLock.h b/src/mds/SimpleLock.h
new file mode 100644
index 000000000..725c4488c
--- /dev/null
+++ b/src/mds/SimpleLock.h
@@ -0,0 +1,667 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software 
+ * Foundation.  See file COPYING.
+ * 
+ */
+
+
+#ifndef CEPH_SIMPLELOCK_H
+#define CEPH_SIMPLELOCK_H
+
+#include <boost/intrusive_ptr.hpp>
+
+#include "MDSCacheObject.h"
+#include "MDSContext.h"
+
+// -- lock types --
+// see CEPH_LOCK_*
+
+extern "C" {
+#include "locks.h"
+}
+
+#define CAP_ANY     0
+#define CAP_LONER   1
+#define CAP_XLOCKER 2
+
+struct MDLockCache;
+struct MDLockCacheItem;
+struct MutationImpl;
+typedef boost::intrusive_ptr<MutationImpl> MutationRef;
+
+struct LockType {
+  // Bind lock type t to the state machine table that governs it.
+  explicit LockType(int t) : type(t), sm(nullptr) {
+    switch (t) {
+    case CEPH_LOCK_IDFT:
+    case CEPH_LOCK_INEST:
+      sm = &sm_scatterlock;
+      break;
+    case CEPH_LOCK_IFILE:
+      sm = &sm_filelock;
+      break;
+    case CEPH_LOCK_DVERSION:
+    case CEPH_LOCK_IVERSION:
+      sm = &sm_locallock;
+      break;
+    case CEPH_LOCK_DN:
+    case CEPH_LOCK_IAUTH:
+    case CEPH_LOCK_ILINK:
+    case CEPH_LOCK_IXATTR:
+    case CEPH_LOCK_ISNAP:
+    case CEPH_LOCK_IFLOCK:
+    case CEPH_LOCK_IPOLICY:
+      sm = &sm_simplelock;
+      break;
+    default: break;  // unrecognized type: sm stays null
+    }
+  }
+
+  int type;
+  const sm_t *sm;
+};
+
+
+class SimpleLock {  // lock on an MDSCacheObject: state driven by the LockType's sm_t table, plus rd/wr/x lock counts
+public:
+  // waiting: per-lock wait bits; callers' masks are shifted by get_wait_shift() before reaching the parent
+  static const uint64_t WAIT_RD          = (1<<0);  // to read
+  static const uint64_t WAIT_WR          = (1<<1);  // to write
+  static const uint64_t WAIT_XLOCK       = (1<<2);  // to xlock   (** dup: same bit as WAIT_STABLE)
+  static const uint64_t WAIT_STABLE      = (1<<2);  // for a stable state
+  static const uint64_t WAIT_REMOTEXLOCK = (1<<3);  // for a remote xlock
+  static const int WAIT_BITS        = 4;  // number of wait bits used per lock
+  static const uint64_t WAIT_ALL         = ((1<<WAIT_BITS)-1);  // mask covering all WAIT_BITS bits
+
+  static std::string_view get_state_name(int n) {  // printable name of a LOCK_* state; aborts on an unknown value
+    switch (n) {
+    case LOCK_UNDEF: return "UNDEF";
+    case LOCK_SYNC: return "sync";
+    case LOCK_LOCK: return "lock";
+
+    case LOCK_PREXLOCK: return "prexlock";
+    case LOCK_XLOCK: return "xlock";
+    case LOCK_XLOCKDONE: return "xlockdone";
+    case LOCK_XLOCKSNAP: return "xlocksnap";
+    case LOCK_LOCK_XLOCK: return "lock->xlock";
+
+    case LOCK_SYNC_LOCK: return "sync->lock";
+    case LOCK_LOCK_SYNC: return "lock->sync";
+    case LOCK_REMOTEXLOCK: return "remote_xlock";
+    case LOCK_EXCL: return "excl";
+    case LOCK_EXCL_SYNC: return "excl->sync";
+    case LOCK_EXCL_LOCK: return "excl->lock";
+    case LOCK_SYNC_EXCL: return "sync->excl";
+    case LOCK_LOCK_EXCL: return "lock->excl";      
+
+    case LOCK_XSYN: return "xsyn";
+    case LOCK_XSYN_EXCL: return "xsyn->excl";
+    case LOCK_EXCL_XSYN: return "excl->xsyn";
+    case LOCK_XSYN_SYNC: return "xsyn->sync";
+    case LOCK_XSYN_LOCK: return "xsyn->lock";
+    case LOCK_XSYN_MIX: return "xsyn->mix";
+
+    case LOCK_SYNC_MIX: return "sync->mix";
+    case LOCK_SYNC_MIX2: return "sync->mix(2)";
+    case LOCK_LOCK_TSYN: return "lock->tsyn";
+      
+    case LOCK_MIX_LOCK: return "mix->lock";
+    case LOCK_MIX_LOCK2: return "mix->lock(2)";
+    case LOCK_MIX: return "mix";
+    case LOCK_MIX_TSYN: return "mix->tsyn";
+      
+    case LOCK_TSYN_MIX: return "tsyn->mix";
+    case LOCK_TSYN_LOCK: return "tsyn->lock";
+    case LOCK_TSYN: return "tsyn";
+
+    case LOCK_MIX_SYNC: return "mix->sync";
+    case LOCK_MIX_SYNC2: return "mix->sync(2)";
+    case LOCK_EXCL_MIX: return "excl->mix";
+    case LOCK_MIX_EXCL: return "mix->excl";
+
+    case LOCK_PRE_SCAN: return "*->scan";
+    case LOCK_SCAN: return "scan";
+
+    case LOCK_SNAP_SYNC: return "snap->sync";
+
+    default: ceph_abort(); return std::string_view();
+    }
+  }
+
+  static std::string_view get_lock_type_name(int t) {  // printable name of a CEPH_LOCK_* type; "unknown" otherwise
+    switch (t) {
+      case CEPH_LOCK_DN: return "dn";
+      case CEPH_LOCK_DVERSION: return "dversion";
+      case CEPH_LOCK_IVERSION: return "iversion";
+      case CEPH_LOCK_IFILE: return "ifile";
+      case CEPH_LOCK_IAUTH: return "iauth";
+      case CEPH_LOCK_ILINK: return "ilink";
+      case CEPH_LOCK_IDFT: return "idft";
+      case CEPH_LOCK_INEST: return "inest";
+      case CEPH_LOCK_IXATTR: return "ixattr";
+      case CEPH_LOCK_ISNAP: return "isnap";
+      case CEPH_LOCK_IFLOCK: return "iflock";
+      case CEPH_LOCK_IPOLICY: return "ipolicy";
+      default: return "unknown";
+    }
+  }
+
+  static std::string_view get_lock_action_name(int a) {  // printable name of a LOCK_AC_* action; "???" otherwise
+    switch (a) {
+      case LOCK_AC_SYNC: return "sync";
+      case LOCK_AC_MIX: return "mix";
+      case LOCK_AC_LOCK: return "lock";
+      case LOCK_AC_LOCKFLUSHED: return "lockflushed";
+
+      case LOCK_AC_SYNCACK: return "syncack";
+      case LOCK_AC_MIXACK: return "mixack";
+      case LOCK_AC_LOCKACK: return "lockack";
+
+      case LOCK_AC_REQSCATTER: return "reqscatter";
+      case LOCK_AC_REQUNSCATTER: return "requnscatter";
+      case LOCK_AC_NUDGE: return "nudge";
+      case LOCK_AC_REQRDLOCK: return "reqrdlock";
+      default: return "???";
+    }
+  }
+
+  SimpleLock(MDSCacheObject *o, LockType *lt) :  // o: the object this lock belongs to; lt: its type descriptor
+    type(lt),
+    parent(o)
+  {}
+  virtual ~SimpleLock() {}
+
+  client_t get_excl_client() const {  // -1 when no unstable bits are allocated
+    return have_more() ? more()->excl_client : -1;
+  }
+  void set_excl_client(client_t c) {
+    if (c < 0 && !have_more())
+      return;  // default is -1; do not allocate unstable bits just to store it
+    more()->excl_client = c;
+  }
+
+  virtual bool is_scatterlock() const {  // base implementation: false
+    return false;
+  }
+  virtual bool is_locallock() const {  // base implementation: false
+    return false;
+  }
+
+  // parent and type accessors
+  MDSCacheObject *get_parent() { return parent; }
+  int get_type() const { return type->type; }
+  const sm_t* get_sm() const { return type->sm; }
+
+  int get_wait_shift() const;  // per-type bit offset applied to WAIT_* masks
+  int get_cap_shift() const;   // CEPH_CAP_S* shift for this lock type (0 for types without one)
+  int get_cap_mask() const;    // mask of CEPH_CAP_FILE_BITS (IFILE) or CEPH_CAP_SIMPLE_BITS bits
+
+  void decode_locked_state(const ceph::buffer::list& bl) {  // delegated to the parent, keyed by lock type
+    parent->decode_lock_state(type->type, bl);
+  }
+  void encode_locked_state(ceph::buffer::list& bl) {  // delegated to the parent, keyed by lock type
+    parent->encode_lock_state(type->type, bl);
+  }
+  void finish_waiters(uint64_t mask, int r=0) {  // mask is in WAIT_* bits; shifted into this lock's range
+    parent->finish_waiting(mask << get_wait_shift(), r);
+  }
+  void take_waiting(uint64_t mask, MDSContext::vec& ls) {  // hands the shifted mask and ls to the parent
+    parent->take_waiting(mask << get_wait_shift(), ls);
+  }
+  void add_waiter(uint64_t mask, MDSContext *c) {  // always ORs in MDSCacheObject::WAIT_ORDERED
+    parent->add_waiter((mask << get_wait_shift()) | MDSCacheObject::WAIT_ORDERED, c);
+  }
+  bool is_waiter_for(uint64_t mask) const {  // asks the parent, using the shifted mask
+    return parent->is_waiter_for(mask << get_wait_shift());
+  }
+
+  bool is_cached() const {  // tests the CACHED flag (set by add_cache, cleared by remove_cache)
+    return state_flags & CACHED;
+  }
+  void add_cache(MDLockCacheItem& item);
+  void remove_cache(MDLockCacheItem& item);
+  std::vector<MDLockCache*> get_active_caches();  // parents of lock_caches entries that are not invalidating
+
+  // state
+  int get_state() const { return state; }
+  int set_state(int s) { 
+    state = s; 
+    //assert(!is_stable() || gather_set.size() == 0);  // gather should be empty in stable states.
+    return s;
+  }
+  void set_state_rejoin(int s, MDSContext::vec& waiters, bool survivor) {
+    ceph_assert(!get_parent()->is_auth());
+
+    // If the lock in the replica object was not in SYNC state when the auth mds of the object
+    // failed, the auth mds may take an xlock on the lock and change the object while replaying
+    // unsafe requests.
+    if (!survivor || state != LOCK_SYNC)
+      mark_need_recover();
+
+    state = s;
+
+    if (is_stable())  // a stable state releases every waiter on this lock into 'waiters'
+      take_waiting(SimpleLock::WAIT_ALL, waiters);
+  }
+
+  bool is_stable() const {  // stable == the sm entry for the current state has no next state
+    return get_sm()->states[state].next == 0;
+  }
+  bool is_unstable_and_locked() const {
+    return (!is_stable() && is_locked());
+  }
+  bool is_locked() const {  // any rdlock, wrlock or xlock held
+    return is_rdlocked() || is_wrlocked() || is_xlocked();
+  }
+  int get_next_state() {  // 0 when the current state is stable (see is_stable)
+    return get_sm()->states[state].next;
+  }
+
+  bool is_sync_and_unlocked() const {  // LOCK_SYNC with no rd/wr/x locks and no client lease
+    return
+      get_state() == LOCK_SYNC &&
+      !is_rdlocked() &&
+      !is_leased() &&
+      !is_wrlocked() &&
+      !is_xlocked();
+  }
+
+  /*
+  bool fw_rdlock_to_auth() {
+    return get_sm()->states[state].can_rdlock == FW;
+  }
+  */
+  bool req_rdlock_from_auth() {  // true when the sm marks can_rdlock as REQ for the current state
+    return get_sm()->states[state].can_rdlock == REQ;
+  }
+
+  // gather set; empty_gather_set is the shared fallback returned when no unstable bits exist
+  static std::set<int32_t> empty_gather_set;
+
+  // int32_t: <0 is client, >=0 is MDS rank
+  const std::set<int32_t>& get_gather_set() const {
+    return have_more() ? more()->gather_set : empty_gather_set;
+  }
+
+  void init_gather() {  // adds the key of every parent replica entry to the gather set
+    for (const auto& p : parent->get_replicas()) {
+      more()->gather_set.insert(p.first);
+    }
+  }
+  bool is_gathering() const {  // true while the gather set is non-empty
+    return have_more() && !more()->gather_set.empty();
+  }
+  bool is_gathering(int32_t i) const {  // true while i is still in the gather set
+    return have_more() && more()->gather_set.count(i);
+  }
+  void clear_gather() {  // no-op when no unstable bits are allocated
+    if (have_more())
+      more()->gather_set.clear();
+  }
+  void remove_gather(int32_t i) {  // no-op when no unstable bits are allocated
+    if (have_more())
+      more()->gather_set.erase(i);
+  }
+
+  virtual bool is_dirty() const { return false; }
+  virtual bool is_stale() const { return false; }
+  virtual bool is_flushing() const { return false; }
+  virtual bool is_flushed() const { return false; }
+  virtual void clear_flushed() { }
+
+  // can_*: consult the sm entry for the current state: ANY, AUTH (parent is auth), or XCL (client holds the xlock)
+  bool can_lease(client_t client) const {
+    return get_sm()->states[state].can_lease == ANY ||
+      (get_sm()->states[state].can_lease == AUTH && parent->is_auth()) ||
+      (get_sm()->states[state].can_lease == XCL && client >= 0 && get_xlock_by_client() == client);
+  }
+  bool can_read(client_t client) const {
+    return get_sm()->states[state].can_read == ANY ||
+      (get_sm()->states[state].can_read == AUTH && parent->is_auth()) ||
+      (get_sm()->states[state].can_read == XCL && client >= 0 && get_xlock_by_client() == client);
+  }
+  bool can_read_projected(client_t client) const {
+    return get_sm()->states[state].can_read_projected == ANY ||
+      (get_sm()->states[state].can_read_projected == AUTH && parent->is_auth()) ||
+      (get_sm()->states[state].can_read_projected == XCL && client >= 0 && get_xlock_by_client() == client);
+  }
+  bool can_rdlock(client_t client) const {
+    return get_sm()->states[state].can_rdlock == ANY ||
+      (get_sm()->states[state].can_rdlock == AUTH && parent->is_auth()) ||
+      (get_sm()->states[state].can_rdlock == XCL && client >= 0 && get_xlock_by_client() == client);
+  }
+  bool can_wrlock(client_t client) const {  // XCL additionally accepts the excl client
+    return get_sm()->states[state].can_wrlock == ANY ||
+      (get_sm()->states[state].can_wrlock == AUTH && parent->is_auth()) ||
+      (get_sm()->states[state].can_wrlock == XCL && client >= 0 && (get_xlock_by_client() == client ||
+								    get_excl_client() == client));
+  }
+  bool can_force_wrlock(client_t client) const {  // XCL additionally accepts the excl client
+    return get_sm()->states[state].can_force_wrlock == ANY ||
+      (get_sm()->states[state].can_force_wrlock == AUTH && parent->is_auth()) ||
+      (get_sm()->states[state].can_force_wrlock == XCL && client >= 0 && (get_xlock_by_client() == client ||
+									  get_excl_client() == client));
+  }
+  bool can_xlock(client_t client) const {
+    return get_sm()->states[state].can_xlock == ANY ||
+      (get_sm()->states[state].can_xlock == AUTH && parent->is_auth()) ||
+      (get_sm()->states[state].can_xlock == XCL && client >= 0 && get_xlock_by_client() == client);
+  }
+
+  // rdlock: the first rdlock takes a PIN_LOCK reference on the parent, the last one drops it
+  bool is_rdlocked() const { return num_rdlock > 0; }
+  int get_rdlock() { 
+    if (!num_rdlock)
+      parent->get(MDSCacheObject::PIN_LOCK);
+    return ++num_rdlock; 
+  }
+  int put_rdlock() {  // returns the remaining rdlock count
+    ceph_assert(num_rdlock>0);
+    --num_rdlock;
+    if (num_rdlock == 0)
+      parent->put(MDSCacheObject::PIN_LOCK);
+    return num_rdlock;
+  }
+  int get_num_rdlocks() const {
+    return num_rdlock;
+  }
+
+  // wrlock: the count lives in the unstable bits; first/last wrlock takes/drops PIN_LOCK on the parent
+  void get_wrlock(bool force=false) {
+    //assert(can_wrlock() || force);
+    if (more()->num_wrlock == 0)
+      parent->get(MDSCacheObject::PIN_LOCK);
+    ++more()->num_wrlock;
+  }
+  void put_wrlock() {  // NOTE(review): unlike put_rdlock(), nothing asserts that a wrlock is actually held
+    --more()->num_wrlock;
+    if (more()->num_wrlock == 0) {
+      parent->put(MDSCacheObject::PIN_LOCK);
+      try_clear_more();
+    }
+  }
+  bool is_wrlocked() const {
+    return have_more() && more()->num_wrlock > 0;
+  }
+  int get_num_wrlocks() const {
+    return have_more() ? more()->num_wrlock : 0;
+  }
+
+  // xlock: every get_xlock/put_xlock pair takes and drops its own PIN_LOCK reference on the parent
+  void get_xlock(MutationRef who, client_t client) { 
+    ceph_assert(get_xlock_by() == MutationRef());
+    ceph_assert(state == LOCK_XLOCK || is_locallock() ||
+	   state == LOCK_LOCK /* if we are a peer */);
+    parent->get(MDSCacheObject::PIN_LOCK);
+    more()->num_xlock++;
+    more()->xlock_by = who; 
+    more()->xlock_by_client = client;
+  }
+  void set_xlock_done() {  // forgets the xlocking mutation; non-local locks move to LOCK_XLOCKDONE
+    ceph_assert(more()->xlock_by);
+    ceph_assert(state == LOCK_XLOCK || is_locallock() ||
+	   state == LOCK_LOCK /* if we are a peer */);
+    if (!is_locallock())
+      state = LOCK_XLOCKDONE;
+    more()->xlock_by.reset();
+  }
+  void put_xlock() {  // when the last xlock goes, xlock_by / xlock_by_client are reset as well
+    ceph_assert(state == LOCK_XLOCK || state == LOCK_XLOCKDONE ||
+	   state == LOCK_XLOCKSNAP || state == LOCK_LOCK_XLOCK ||
+	   state == LOCK_LOCK  || /* if we are a leader of a peer */
+	   is_locallock());
+    --more()->num_xlock;
+    parent->put(MDSCacheObject::PIN_LOCK);
+    if (more()->num_xlock == 0) {
+      more()->xlock_by.reset();
+      more()->xlock_by_client = -1;
+      try_clear_more();
+    }
+  }
+  bool is_xlocked() const {
+    return have_more() && more()->num_xlock > 0;
+  }
+  int get_num_xlocks() const {
+    return have_more() ? more()->num_xlock : 0;
+  }
+  client_t get_xlock_by_client() const {  // -1 when no unstable bits are allocated
+    return have_more() ? more()->xlock_by_client : -1;
+  }
+  bool is_xlocked_by_client(client_t c) const {
+    return have_more() ? more()->xlock_by_client == c : false;
+  }
+  MutationRef get_xlock_by() const {  // empty MutationRef when no unstable bits are allocated
+    return have_more() ? more()->xlock_by : MutationRef();
+  }
+  
+  // lease: a single LEASED flag; get/put assert that they are strictly paired
+  bool is_leased() const {
+    return state_flags & LEASED;
+  }
+  void get_client_lease() {
+    ceph_assert(!is_leased());
+    state_flags |= LEASED;
+  }
+  void put_client_lease() {
+    ceph_assert(is_leased());
+    state_flags &= ~LEASED;
+  }
+
+  bool needs_recover() const {  // NEED_RECOVER is set by mark_need_recover (e.g. from set_state_rejoin)
+    return state_flags & NEED_RECOVER;
+  }
+  void mark_need_recover() {
+    state_flags |= NEED_RECOVER;
+  }
+  void clear_need_recover() {
+    state_flags &= ~NEED_RECOVER;
+  }
+
+  // encode/decode: payload is the state followed by the gather set (version 2, compat 2)
+  void encode(ceph::buffer::list& bl) const {
+    ENCODE_START(2, 2, bl);
+    encode(state, bl);
+    if (have_more())
+      encode(more()->gather_set, bl);
+    else
+      encode(empty_gather_set, bl);  // keep the layout identical when there are no unstable bits
+    ENCODE_FINISH(bl);
+  }
+  void decode(ceph::buffer::list::const_iterator& p) {
+    DECODE_START(2, p);
+    decode(state, p);
+    std::set<__s32> g;
+    decode(g, p);
+    if (!g.empty())  // only allocate unstable bits for a non-empty gather set
+      more()->gather_set.swap(g);
+    DECODE_FINISH(p);
+  }
+  void encode_state_for_replica(ceph::buffer::list& bl) const {  // writes get_replica_state() as __s16
+    __s16 s = get_replica_state();
+    using ceph::encode;
+    encode(s, bl);
+  }
+  void decode_state(ceph::buffer::list::const_iterator& p, bool is_new=true) {
+    using ceph::decode;
+    __s16 s;
+    decode(s, p);  // always consumed, even when the value is not applied
+    if (is_new)
+      state = s;
+  }
+  void decode_state_rejoin(ceph::buffer::list::const_iterator& p, MDSContext::vec& waiters, bool survivor) {
+    __s16 s;
+    using ceph::decode;
+    decode(s, p);
+    set_state_rejoin(s, waiters, survivor);
+  }
+
+  // caps
+  bool is_loner_mode() const {  // 'loner' field of the sm entry for the current state
+    return get_sm()->states[state].loner;
+  }
+  int gcaps_allowed_ever() const {  // auth and replica have separate "ever allowed" values in the sm
+    return parent->is_auth() ? get_sm()->allowed_ever_auth : get_sm()->allowed_ever_replica;
+  }
+  int gcaps_allowed(int who, int s=-1) const {  // who: CAP_ANY/CAP_LONER/CAP_XLOCKER; s<0 means the current state
+    if (s < 0) s = state;
+    if (parent->is_auth()) {
+      if (get_xlock_by_client() >= 0 && who == CAP_XLOCKER)
+	return get_sm()->states[s].xlocker_caps | get_sm()->states[s].caps; // xlocker always gets more
+      else if (is_loner_mode() && who == CAP_ANY)  // NOTE(review): is_loner_mode() reads the current state, not s
+	return get_sm()->states[s].caps;
+      else 
+	return get_sm()->states[s].loner_caps | get_sm()->states[s].caps;  // loner always gets more
+    } else 
+      return get_sm()->states[s].replica_caps;
+  }
+  int gcaps_careful() const {  // the sm 'careful' value while wrlocked, otherwise 0
+    if (get_num_wrlocks())
+      return get_sm()->careful;
+    return 0;
+  }
+
+  int gcaps_xlocker_mask(client_t client) const {  // non-zero only when client equals the xlocking client
+    if (client == get_xlock_by_client())
+      return type->type == CEPH_LOCK_IFILE ? 0xf : (CEPH_CAP_GSHARED|CEPH_CAP_GEXCL);
+    return 0;
+  }
+
+  // simplelock specifics
+  int get_replica_state() const {  // replica_state field of the sm entry for the current state
+    return get_sm()->states[state].replica_state;
+  }
+  void export_twiddle() {  // clears the gather set and switches to the replica state
+    clear_gather();
+    state = get_replica_state();
+  }
+
+  bool remove_replica(int from) {  // true when removing 'from' emptied the gather set
+    if (is_gathering(from)) {
+      remove_gather(from);
+      if (!is_gathering())
+	return true;
+    }
+    return false;
+  }
+  bool do_import(int from, int to) {  // true when the lock is unstable and no longer gathering
+    if (!is_stable()) {
+      remove_gather(from);
+      remove_gather(to);
+      if (!is_gathering())
+	return true;
+    }
+    if (!is_stable() && !is_gathering())  // NOTE(review): appears redundant with the block above
+      return true;
+    return false;
+  }
+
+  void _print(std::ostream& out) const {  // "<type> <state>" plus g=, l, r=, w=, x= markers when applicable
+    out << get_lock_type_name(get_type()) << " ";
+    out << get_state_name(get_state());
+    if (!get_gather_set().empty())
+      out << " g=" << get_gather_set();
+    if (is_leased())
+      out << " l";
+    if (is_rdlocked()) 
+      out << " r=" << get_num_rdlocks();
+    if (is_wrlocked()) 
+      out << " w=" << get_num_wrlocks();
+    if (is_xlocked()) {
+      out << " x=" << get_num_xlocks();
+      if (get_xlock_by())
+	out << " by " << get_xlock_by();
+    }
+    /*if (is_stable())
+      out << " stable";
+    else
+      out << " unstable";
+    */
+  }
+
+  /**
+   * Write bare values (caller must be in an object section)
+   * to formatter, or nothing if is_sync_and_unlocked.
+   */
+  void dump(ceph::Formatter *f) const;
+
+  virtual void print(std::ostream& out) const {  // _print() output wrapped in parentheses
+    out << "(";
+    _print(out);
+    out << ")";
+  }
+
+  LockType *type;  // lock type id plus its state machine table (see get_type / get_sm)
+
+protected:
+  // parent (what i lock)
+  MDSCacheObject *parent;
+
+  // lock state: starts in LOCK_SYNC with no flags set
+  __s16 state = LOCK_SYNC;
+  __s16 state_flags = 0;
+
+  enum {  // bits of state_flags
+    LEASED		= 1 << 0,
+    NEED_RECOVER	= 1 << 1,
+    CACHED		= 1 << 2,
+  };
+
+private:
+  // XXX not in mempool; allocated on demand by more() and released by try_clear_more() once empty()
+  struct unstable_bits_t {
+    unstable_bits_t();
+
+    bool empty() {  // true when every field holds its default value
+      return
+	gather_set.empty() &&
+	num_wrlock == 0 &&
+	num_xlock == 0 &&
+	xlock_by.get() == NULL &&
+	xlock_by_client == -1 &&
+	excl_client == -1 &&
+	lock_caches.empty();
+    }
+
+    std::set<__s32> gather_set;  // auth+rep.  >= 0 is mds, < 0 is client
+
+    // local state
+    int num_wrlock = 0, num_xlock = 0;
+    MutationRef xlock_by;
+    client_t xlock_by_client = -1;
+    client_t excl_client = -1;
+
+    elist<MDLockCacheItem*> lock_caches;
+  };
+
+  bool have_more() const { return _unstable ? true : false; }
+  unstable_bits_t *more() const {  // allocates on first use; possible in a const method because _unstable is mutable
+    if (!_unstable)
+      _unstable.reset(new unstable_bits_t);
+    return _unstable.get();
+  }
+  void try_clear_more() {  // frees the unstable bits only when they hold nothing but defaults
+    if (_unstable && _unstable->empty()) {
+      _unstable.reset();
+    }
+  }
+
+  int num_rdlock = 0;  // kept outside the unstable bits, unlike the wrlock/xlock counts
+
+  mutable std::unique_ptr<unstable_bits_t> _unstable;
+};
+WRITE_CLASS_ENCODER(SimpleLock)
+
+inline std::ostream& operator<<(std::ostream& out, const SimpleLock& l)
+{
+  // Delegate to the virtual print() so the dynamic type decides the rendering.
+  return l.print(out), out;
+}
+#endif
diff --git a/src/mds/SnapClient.cc b/src/mds/SnapClient.cc
new file mode 100644
index 000000000..9f83ed79e
--- /dev/null
+++ b/src/mds/SnapClient.cc
@@ -0,0 +1,316 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+
+#include "MDSMap.h"
+#include "MDSRank.h"
+#include "msg/Messenger.h"
+#include "messages/MMDSTableRequest.h"
+#include "SnapClient.h"
+
+#include "common/config.h"
+#include "include/ceph_assert.h"
+
+#define dout_context g_ceph_context
+#define dout_subsys ceph_subsys_mds
+#undef dout_prefix
+#define dout_prefix *_dout << "mds." << mds->get_nodeid() << ".snapclient "
+
+void SnapClient::resend_queries()
+{
+  // Only resend when a waiter is queued or a sync request is still outstanding.
+  const bool have_waiters = !waiting_for_version.empty();
+  if (!have_waiters && (synced || sync_reqid == 0))
+    return;
+
+  // Ask for the highest version anyone waits for, or version 1 when nobody waits.
+  const version_t min_want = have_waiters ? waiting_for_version.rbegin()->first : 1;
+  refresh(std::max<version_t>(cached_version, min_want), NULL);
+  if (!synced)
+    sync_reqid = last_reqid;  // a query reply with reqid >= this value marks us synced
+}
+
+void SnapClient::handle_query_result(const cref_t<MMDSTableRequest> &m)
+{  // applies a table reply to the local cache, then wakes waiters whose version is now cached
+  dout(10) << __func__ << " " << *m << dendl;
+
+  char type;  // first byte of the payload selects the reply kind
+  using ceph::decode;
+  auto p = m->bl.cbegin();
+  decode(type, p);
+
+  switch (type) {
+  case 'U': // uptodate: no table data is decoded; the cached version must equal the message tid
+    ceph_assert(cached_version == m->get_tid());
+    break;
+  case 'F': // full: the cached tables are replaced by the decoded ones
+    {
+      decode(cached_snaps, p);
+      decode(cached_pending_update, p);
+      decode(cached_pending_destroy, p);
+
+      snapid_t last_created, last_destroyed;
+      decode(last_created, p);
+      decode(last_destroyed, p);
+
+      if (last_created > cached_last_created)  // the cached high-water marks only move forward
+	cached_last_created = last_created;
+      if (last_destroyed > cached_last_destroyed)
+	cached_last_destroyed = last_destroyed;
+
+      cached_version = m->get_tid();
+    }
+    break;
+  default:
+    ceph_abort();
+  };
+
+  if (!committing_tids.empty()) {  // reconcile committing tids that the cached version now covers
+    for (auto p = committing_tids.begin();
+	 p != committing_tids.end() && *p <= cached_version; ) {
+      if (cached_pending_update.count(*p)) {  // still pending: fold its snapid into cached_last_created
+	if (cached_pending_update[*p].snapid > cached_last_created)
+	  cached_last_created = cached_pending_update[*p].snapid;
+	++p;
+      } else if (cached_pending_destroy.count(*p)) {  // still pending: fold into cached_last_destroyed
+	if (cached_pending_destroy[*p].second > cached_last_destroyed)
+	  cached_last_destroyed = cached_pending_destroy[*p].second;
+	++p;
+      } else {
+	// pending update/destroy have been committed.
+	committing_tids.erase(p++);
+      }
+    }
+  }
+
+  if (m->op == TABLESERVER_OP_QUERY_REPLY && m->reqid >= sync_reqid)  // reply to (or after) the sync query
+    synced = true;
+
+  if (synced && !waiting_for_version.empty()) {  // wake every waiter whose version is <= cached_version
+    MDSContext::vec finished;
+    while (!waiting_for_version.empty()) {
+      auto it = waiting_for_version.begin();
+      if (it->first > cached_version)
+	break;  // map is ordered by version, so all later entries are newer too
+      auto& v = it->second;
+      finished.insert(finished.end(), v.begin(), v.end());
+      waiting_for_version.erase(it);
+    }
+    if (!finished.empty())
+      mds->queue_waiters(finished);
+  }
+}
+
+void SnapClient::handle_notify_prep(const cref_t<MMDSTableRequest> &m)
+{  // apply the table state carried by the message, then acknowledge it
+  dout(10) << __func__ << " " << *m << dendl;
+  handle_query_result(m);  // the payload is processed exactly like a query result
+  auto ack = make_message<MMDSTableRequest>(table, TABLESERVER_OP_NOTIFY_ACK, 0, m->get_tid());  // ack carries the same tid
+  mds->send_message(ack, m->get_connection());  // sent over the connection the message came from
+}
+
+void SnapClient::notify_commit(version_t tid)
+{
+  dout(10) << __func__ << " tid " << tid << dendl;
+
+  ceph_assert(cached_version == 0 || cached_version >= tid);
+  if (cached_version == 0) {
+    committing_tids.insert(tid);  // nothing cached yet: just remember the tid
+    return;
+  }
+  if (auto upd = cached_pending_update.find(tid); upd != cached_pending_update.end()) {
+    committing_tids.insert(tid);
+    if (upd->second.snapid > cached_last_created)
+      cached_last_created = upd->second.snapid;
+  } else if (auto del = cached_pending_destroy.find(tid); del != cached_pending_destroy.end()) {
+    committing_tids.insert(tid);
+    if (del->second.second > cached_last_destroyed)
+      cached_last_destroyed = del->second.second;
+  } else if (cached_version <= tid) {
+    ceph_abort();  // neither pending nor already committed (which would need cached_version > tid)
+  }
+}
+
+void SnapClient::refresh(version_t want, MDSContext *onfinish)
+{
+  dout(10) << __func__ << " want " << want << dendl;
+
+  ceph_assert(want >= cached_version);
+  if (onfinish)
+    waiting_for_version[want].push_back(onfinish);  // queued under the version it waits for
+
+  if (!server_ready)
+    return;  // cannot query now; any waiter registered above stays queued
+
+  mds_rank_t server_rank = mds->mdsmap->get_tableserver();
+  auto query = make_message<MMDSTableRequest>(table, TABLESERVER_OP_QUERY, ++last_reqid, 0);
+  char op_code = 'F';
+  using ceph::encode;
+  encode(op_code, query->bl);         // payload: op character ...
+  encode(cached_version, query->bl);  // ... followed by the version we already hold
+  mds->send_message_mds(query, server_rank);
+}
+
+void SnapClient::sync(MDSContext *onfinish)
+{
+  dout(10) << __func__ << dendl;
+
+  refresh(std::max<version_t>(cached_version, 1), onfinish);
+  synced = false;
+  if (!server_ready)  // refresh() sent nothing: use the id the next query will get (1 after wrap-around)
+    sync_reqid = (last_reqid == ~0ULL) ? 1 : last_reqid + 1;
+  else
+    sync_reqid = last_reqid;  // id of the query refresh() just sent
+}
+
+void SnapClient::get_snaps(set<snapid_t>& result) const
+{
+  ceph_assert(cached_version > 0);
+  // Seed the result with every cached snapshot id.
+  for (const auto& cached : cached_snaps)
+    result.insert(cached.first);
+
+  // Apply the pending update/destroy recorded for each committing tid.
+  for (const auto& tid : committing_tids) {
+    if (auto upd = cached_pending_update.find(tid); upd != cached_pending_update.end())
+      result.insert(upd->second.snapid);
+
+    if (auto del = cached_pending_destroy.find(tid); del != cached_pending_destroy.end())
+      result.erase(del->second.first);
+  }
+}
+
+set<snapid_t> SnapClient::filter(const set<snapid_t>& snaps) const
+{
+  ceph_assert(cached_version > 0);
+  if (snaps.empty())
+    return snaps;  // nothing to filter
+
+  // Keep the requested ids that are present in the cache ...
+  set<snapid_t> kept;
+
+  for (const auto& id : snaps) {
+    if (cached_snaps.find(id) != cached_snaps.end())
+      kept.insert(id);
+  }
+
+  // ... then apply the pending changes of the committing tids.
+  for (const auto& tid : committing_tids) {
+    const auto upd = cached_pending_update.find(tid);
+    if (upd != cached_pending_update.end() && snaps.count(upd->second.snapid))
+      kept.insert(upd->second.snapid);
+
+    const auto del = cached_pending_destroy.find(tid);
+    if (del != cached_pending_destroy.end())
+      kept.erase(del->second.first);
+  }
+
+  dout(10) << __func__ << " " << snaps << " -> " << kept <<  dendl;
+  return kept;
+}
+
+const SnapInfo* SnapClient::get_snap_info(snapid_t snapid) const
+{
+  ceph_assert(cached_version > 0);
+
+  const auto cached = cached_snaps.find(snapid);
+  const SnapInfo* info = (cached != cached_snaps.end()) ? &cached->second : nullptr;
+
+  // The first committing tid that touches snapid overrides the cached answer.
+  for (const auto& tid : committing_tids) {
+    const auto upd = cached_pending_update.find(tid);
+    if (upd != cached_pending_update.end() && upd->second.snapid == snapid) {
+      info = &upd->second;
+      break;
+    }
+
+    const auto del = cached_pending_destroy.find(tid);
+    if (del != cached_pending_destroy.end() && del->second.first == snapid) {
+      // a pending destroy of snapid makes the lookup report "not found"
+      info = nullptr;
+      break;
+    }
+  }
+
+  dout(10) << __func__ << " snapid " << snapid << " -> " << info <<  dendl;
+  return info;
+}
+
+void SnapClient::get_snap_infos(map<snapid_t, const SnapInfo*>& infomap,
+			        const set<snapid_t>& snaps) const
+{
+  ceph_assert(cached_version > 0);
+
+  if (snaps.empty())
+    return;
+
+  // Resolve each requested id against the cache first.
+  map<snapid_t, const SnapInfo*> found;
+  for (const auto& id : snaps) {
+    if (auto cached = cached_snaps.find(id); cached != cached_snaps.end())
+      found[id] = &cached->second;
+  }
+
+  // Then let the committing tids override or remove entries.
+  for (const auto& tid : committing_tids) {
+    const auto upd = cached_pending_update.find(tid);
+    if (upd != cached_pending_update.end() && snaps.count(upd->second.snapid))
+      found[upd->second.snapid] = &upd->second;
+
+    const auto del = cached_pending_destroy.find(tid);
+    if (del != cached_pending_destroy.end())
+      found.erase(del->second.first);
+  }
+
+  // insert() leaves any entry the caller already had in infomap untouched.
+  infomap.insert(found.begin(), found.end());
+}
+
+int SnapClient::dump_cache(Formatter *f) const
+{
+  if (!is_synced()) {
+    dout(5) << "dump_cache: not synced" << dendl;
+    return -CEPHFS_EINVAL;
+  }
+
+  // Effective snapshot table: cached snaps overlaid with the committing tids.
+  map<snapid_t, const SnapInfo*> effective;
+  for (const auto& cached : cached_snaps)
+    effective[cached.first] = &cached.second;
+
+  for (const auto& tid : committing_tids) {
+    if (auto upd = cached_pending_update.find(tid); upd != cached_pending_update.end())
+      effective[upd->second.snapid] = &upd->second;
+
+    if (auto del = cached_pending_destroy.find(tid); del != cached_pending_destroy.end())
+      effective.erase(del->second.first);
+  }
+
+  f->open_object_section("snapclient");
+
+  f->dump_int("last_created", get_last_created());
+  f->dump_int("last_destroyed", get_last_destroyed());
+
+  f->open_array_section("snaps");
+  for (const auto& entry : effective) {
+    const SnapInfo *info = entry.second;
+    f->open_object_section("snap");
+    info->dump(f);
+    f->close_section();
+  }
+  f->close_section();  // "snaps"
+
+  f->close_section();  // "snapclient"
+
+  return 0;
+}
diff --git a/src/mds/SnapClient.h b/src/mds/SnapClient.h
new file mode 100644
index 000000000..e259f7924
--- /dev/null
+++ b/src/mds/SnapClient.h
@@ -0,0 +1,112 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software 
+ * Foundation.  See file COPYING.
+ * 
+ */
+
+#ifndef CEPH_SNAPCLIENT_H
+#define CEPH_SNAPCLIENT_H
+
+#include <string_view>
+
+#include "MDSTableClient.h"
+#include "snap.h"
+#include "MDSContext.h"
+
+class MDSRank;
+class LogSegment;
+
+class SnapClient : public MDSTableClient {  // MDSTableClient for TABLE_SNAP that also keeps a local cache of the table
+public:
+  explicit SnapClient(MDSRank *m) :
+    MDSTableClient(m, TABLE_SNAP) {}
+
+  void resend_queries() override;  // re-issues a refresh when waiters are queued or a sync is outstanding
+  void handle_query_result(const cref_t<MMDSTableRequest> &m) override;  // applies a reply to the cache
+  void handle_notify_prep(const cref_t<MMDSTableRequest> &m) override;   // applies the update, then sends NOTIFY_ACK
+  void notify_commit(version_t tid) override;  // records tid in committing_tids when it is still pending
+
+  void prepare_create(inodeno_t dirino, std::string_view name, utime_t stamp,
+		      version_t *pstid, bufferlist *pbl, MDSContext *onfinish) {
+    bufferlist bl;  // request payload: TABLE_OP_CREATE, dirino, name, stamp
+    __u32 op = TABLE_OP_CREATE;
+    encode(op, bl);
+    encode(dirino, bl);
+    encode(name, bl);
+    encode(stamp, bl);
+    _prepare(bl, pstid, pbl, onfinish);
+  }
+
+  void prepare_create_realm(inodeno_t ino, version_t *pstid, bufferlist *pbl, MDSContext *onfinish) {
+    bufferlist bl;  // request payload: TABLE_OP_CREATE followed only by ino
+    __u32 op = TABLE_OP_CREATE;
+    encode(op, bl);
+    encode(ino, bl);
+    _prepare(bl, pstid, pbl, onfinish);
+  }
+
+  void prepare_destroy(inodeno_t ino, snapid_t snapid, version_t *pstid, bufferlist *pbl, MDSContext *onfinish) {
+    bufferlist bl;  // request payload: TABLE_OP_DESTROY, ino, snapid
+    __u32 op = TABLE_OP_DESTROY;
+    encode(op, bl);
+    encode(ino, bl);
+    encode(snapid, bl);
+    _prepare(bl, pstid, pbl, onfinish);
+  }
+
+  void prepare_update(inodeno_t ino, snapid_t snapid, std::string_view name, utime_t stamp,
+		      version_t *pstid, MDSContext *onfinish) {
+    bufferlist bl;  // request payload: TABLE_OP_UPDATE, ino, snapid, name, stamp
+    __u32 op = TABLE_OP_UPDATE;
+    encode(op, bl);
+    encode(ino, bl);
+    encode(snapid, bl);
+    encode(name, bl);
+    encode(stamp, bl);
+    _prepare(bl, pstid, NULL, onfinish);  // unlike the other prepare_* calls, no reply buffer is passed
+  }
+
+  version_t get_cached_version() const { return cached_version; }
+  void refresh(version_t want, MDSContext *onfinish);  // queue onfinish for 'want' and query the table server if ready
+
+  void sync(MDSContext *onfinish);  // clears 'synced' and requests a refresh
+
+  bool is_synced() const { return synced; }
+  void wait_for_sync(MDSContext *c) {  // only valid while not synced
+    ceph_assert(!synced);
+    waiting_for_version[std::max<version_t>(cached_version, 1)].push_back(c);
+  }
+
+  snapid_t get_last_created() const { return cached_last_created; }
+  snapid_t get_last_destroyed() const { return cached_last_destroyed; }
+
+  void get_snaps(set<snapid_t>& snaps) const;  // adds the effective snap ids (cache + committing tids) to snaps
+  set<snapid_t> filter(const set<snapid_t>& snaps) const;  // subset of snaps that currently exists
+  const SnapInfo* get_snap_info(snapid_t snapid) const;    // NULL when snapid is unknown or being destroyed
+  void get_snap_infos(map<snapid_t, const SnapInfo*>& infomap, const set<snapid_t>& snaps) const;
+
+  int dump_cache(Formatter *f) const;  // returns -CEPHFS_EINVAL when not synced, 0 otherwise
+
+private:
+  version_t cached_version = 0;  // 0 means no table state has been cached yet
+  snapid_t cached_last_created = 0, cached_last_destroyed = 0;
+  map<snapid_t, SnapInfo> cached_snaps;
+  map<version_t, SnapInfo> cached_pending_update;  // keyed by tid
+  map<version_t, pair<snapid_t,snapid_t> > cached_pending_destroy;  // tid -> (snapid removed, value compared with cached_last_destroyed)
+
+  set<version_t> committing_tids;  // tids reported via notify_commit that still matter to the cache
+
+  map<version_t, MDSContext::vec > waiting_for_version;  // contexts queued until that version is cached
+
+  uint64_t sync_reqid = 0;  // a QUERY_REPLY with reqid >= this value completes the sync
+  bool synced = false;
+};
+#endif
diff --git a/src/mds/SnapRealm.cc b/src/mds/SnapRealm.cc
new file mode 100644
index 000000000..09ddd9181
--- /dev/null
+++ b/src/mds/SnapRealm.cc
@@ -0,0 +1,450 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software 
+ * Foundation.  See file COPYING.
+ * 
+ */
+
+#include "SnapRealm.h"
+#include "MDCache.h"
+#include "MDSRank.h"
+#include "SnapClient.h"
+
+#include <string_view>
+
+
+/*
+ * SnapRealm
+ */
+
+#define dout_context g_ceph_context
+#define dout_subsys ceph_subsys_mds
+#undef dout_prefix
+#define dout_prefix _prefix(_dout, mdcache->mds->get_nodeid(), inode, srnode.seq, this)
+// Log-line prefix used by dout_prefix in this file: identifies the MDS rank,
+// the realm's inode number, its seq and the realm's address.
+static ostream& _prefix(std::ostream *_dout, int whoami, const CInode *inode,
+			uint64_t seq, const SnapRealm *realm) {
+  std::ostream& os = *_dout;
+  os << " mds." << whoami << ".cache.snaprealm(" << inode->ino();
+  os << " seq " << seq << " " << realm << ") ";
+  return os;
+}
+
+// Human-readable one-line description of a realm: inode number, seq,
+// last_created, created, optionally current_parent_since, its own snaps,
+// past parent snaps, a "global" marker and finally the realm's address.
+ostream& operator<<(ostream& out, const SnapRealm& realm)
+{
+  const auto& sr = realm.srnode;
+
+  out << "snaprealm(" << realm.inode->ino()
+      << " seq " << sr.seq
+      << " lc " << sr.last_created
+      << " cr " << sr.created;
+
+  // current_parent_since is only printed when it differs from created
+  if (sr.created != sr.current_parent_since)
+    out << " cps " << sr.current_parent_since;
+
+  out << " snaps=" << sr.snaps
+      << " past_parent_snaps=" << sr.past_parent_snaps;
+
+  if (sr.is_parent_global())
+    out << " global ";
+
+  return out << " " << &realm << ")";
+}
+
+// Construct the realm rooted at inode `in`.  inodes_with_caps is an intrusive
+// list threaded through CInode::item_caps.  The realm is flagged `global`
+// when its inode number is CEPH_INO_GLOBAL_SNAPREALM.
+SnapRealm::SnapRealm(MDCache *c, CInode *in) :
+    mdcache(c), inode(in), inodes_with_caps(member_offset(CInode, item_caps))
+{
+  global = (inode->ino() == CEPH_INO_GLOBAL_SNAPREALM);
+}
+
+/*
+ * Rebuild cached_snaps for this realm.
+ *
+ * The set is the union of
+ *  - this realm's own snapshots,
+ *  - past parents' snapshots that the snap client's filter() lets through,
+ *  - the current parent's snapshots from current_parent_since onwards.
+ * cached_seq and cached_last_created are raised along the way so they
+ * also cover the inherited snapshots.  The global realm simply takes the
+ * snap client's full list.
+ */
+void SnapRealm::build_snap_set() const
+{
+  dout(10) << "build_snap_set on " << *this << dendl;
+
+  cached_snaps.clear();
+
+  if (global) {
+    mdcache->mds->snapclient->get_snaps(cached_snaps);
+    return;
+  }
+
+  // snapshots taken on this realm itself
+  for (auto it = srnode.snaps.begin(); it != srnode.snaps.end(); ++it)
+    cached_snaps.insert(it->first);
+
+  // snapshots inherited from earlier parents
+  if (!srnode.past_parent_snaps.empty()) {
+    const set<snapid_t> inherited =
+      mdcache->mds->snapclient->filter(srnode.past_parent_snaps);
+    if (!inherited.empty()) {
+      const snapid_t newest = *inherited.rbegin();
+      cached_seq = std::max(cached_seq, newest);
+      cached_last_created = std::max(cached_last_created, newest);
+      cached_snaps.insert(inherited.begin(), inherited.end());
+    }
+  }
+
+  // snapshots of the current parent, taken since it became our parent
+  const snapid_t parent_seq = parent ? parent->get_newest_seq() : snapid_t(0);
+  if (parent_seq < srnode.current_parent_since)
+    return;
+
+  const auto& parent_snaps = parent->get_snaps();
+  cached_snaps.insert(parent_snaps.lower_bound(srnode.current_parent_since),
+		      parent_snaps.end());
+  cached_seq = std::max(cached_seq, parent_seq);
+  cached_last_created = std::max(cached_last_created, parent->get_last_created());
+}
+
+/*
+ * Make sure the cached snapshot state is current, rebuilding it if not.
+ *
+ * The reference seq/last_created come from the snap client when this realm
+ * is global or has a global parent, and from srnode otherwise.  The cache is
+ * considered fresh when cached_seq has reached the reference seq and
+ * cached_last_destroyed matches the snap client's value; otherwise the snap
+ * context is cleared and the subvolume ino, snap set and snap trace are
+ * rebuilt.
+ */
+void SnapRealm::check_cache() const
+{
+  snapid_t seq;
+  snapid_t last_created;
+  snapid_t last_destroyed = mdcache->mds->snapclient->get_last_destroyed();
+  if (global || srnode.is_parent_global()) {
+    last_created = mdcache->mds->snapclient->get_last_created();
+    seq = std::max(last_created, last_destroyed);
+  } else {
+    last_created = srnode.last_created;
+    seq = srnode.seq;
+  }
+  // nothing changed since the last rebuild
+  if (cached_seq >= seq &&
+      cached_last_destroyed == last_destroyed)
+    return;
+
+  // cleared here so get_snap_context() regenerates it on next use
+  cached_snap_context.clear();
+
+  cached_seq = seq;
+  cached_last_created = last_created;
+  cached_last_destroyed = last_destroyed;
+
+  // the parent's subvolume wins; otherwise this realm's own inode if it is
+  // flagged as a subvolume
+  cached_subvolume_ino = 0;
+  if (parent)
+    cached_subvolume_ino = parent->get_subvolume_ino();
+  if (!cached_subvolume_ino && srnode.is_subvolume())
+    cached_subvolume_ino = inode->ino();
+
+  // build_snap_set() may raise cached_seq / cached_last_created further
+  build_snap_set();
+
+  build_snap_trace();
+  
+  dout(10) << "check_cache rebuilt " << cached_snaps
+	   << " seq " << seq
+	   << " cached_seq " << cached_seq
+	   << " cached_last_created " << cached_last_created
+	   << " cached_last_destroyed " << cached_last_destroyed
+	   << ")" << dendl;
+}
+
+// Return the cached set of snapids for this realm, refreshing the cache
+// first via check_cache().  The reference points at the realm's own cache
+// member, so a later rebuild changes its contents.
+const set<snapid_t>& SnapRealm::get_snaps() const
+{
+  check_cache();
+  dout(10) << "get_snaps " << cached_snaps
+	   << " (seq " << srnode.seq << " cached_seq " << cached_seq << ")"
+	   << dendl;
+  return cached_snaps;
+}
+
+/*
+ * Return the realm's SnapContext, (re)generating it on demand.
+ *
+ * The context is rebuilt whenever its seq is zero: seq is taken from
+ * cached_seq and the snaps vector holds cached_snaps in descending order.
+ */
+const SnapContext& SnapRealm::get_snap_context() const
+{
+  check_cache();
+
+  if (cached_snap_context.seq)
+    return cached_snap_context;
+
+  cached_snap_context.seq = cached_seq;
+  // reverse iteration over the ordered set yields newest-first order
+  cached_snap_context.snaps.assign(cached_snaps.rbegin(), cached_snaps.rend());
+
+  return cached_snap_context;
+}
+
+/*
+ * Collect SnapInfo pointers for every snapshot in [first,last] that applies
+ * to this realm into infomap: the realm's own snapshots, past-parent
+ * snapshots (looked up through the snap client) and, recursively, the
+ * current parent's snapshots since it became the parent.
+ */
+void SnapRealm::get_snap_info(map<snapid_t, const SnapInfo*>& infomap, snapid_t first, snapid_t last)
+{
+  const set<snapid_t>& snaps = get_snaps();
+  dout(10) << "get_snap_info snaps " << snaps << dendl;
+
+  // own snapshots within [first,last]
+  auto mine = srnode.snaps.lower_bound(first);
+  while (mine != srnode.snaps.end() && mine->first <= last) {
+    infomap[mine->first] = &mine->second;
+    ++mine;
+  }
+
+  // past-parent snapshots within [first,last]
+  if (!srnode.past_parent_snaps.empty()) {
+    set<snapid_t> wanted;
+    auto past = srnode.past_parent_snaps.lower_bound(first);
+    while (past != srnode.past_parent_snaps.end() && *past <= last) {
+      wanted.insert(*past);
+      ++past;
+    }
+
+    map<snapid_t, const SnapInfo*> fetched;
+    mdcache->mds->snapclient->get_snap_infos(fetched, wanted);
+    infomap.insert(fetched.begin(), fetched.end());
+  }
+
+  // current parent, limited to the time it has been our parent
+  if (parent && srnode.current_parent_since <= last)
+    parent->get_snap_info(infomap, std::max(first, srnode.current_parent_since), last);
+}
+
+/*
+ * Return the name of snapshot `snapid` as seen from inode `atino`.
+ *
+ * The short name is returned when `atino` is the inode the snapshot was
+ * taken on, the long name otherwise.  Lookup order: this realm's own
+ * snapshots, then past-parent snapshots (via the snap client), then the
+ * current parent realm.  Asserts if the snapshot cannot be delegated to a
+ * parent.
+ */
+std::string_view SnapRealm::get_snapname(snapid_t snapid, inodeno_t atino)
+{
+  // own snapshot?
+  const auto own = srnode.snaps.find(snapid);
+  if (own != srnode.snaps.end()) {
+    const SnapInfo& info = own->second;
+    return (atino == inode->ino()) ? std::string_view(info.name)
+				   : std::string_view(info.get_long_name());
+  }
+
+  // snapshot inherited from a past parent?
+  if (!srnode.past_parent_snaps.empty() &&
+      srnode.past_parent_snaps.count(snapid)) {
+    const SnapInfo *sinfo = mdcache->mds->snapclient->get_snap_info(snapid);
+    if (sinfo) {
+      return (atino == sinfo->ino) ? std::string_view(sinfo->name)
+				   : std::string_view(sinfo->get_long_name());
+    }
+  }
+
+  // otherwise it has to come from the current parent
+  ceph_assert(srnode.current_parent_since <= snapid);
+  ceph_assert(parent);
+  return parent->get_snapname(snapid, atino);
+}
+
+/*
+ * Resolve snapshot name `n`, as seen from inode `atino`, to a snapid within
+ * [first,last].
+ *
+ * A name of the form "_<name>_<ino>" is additionally parsed into a parent
+ * snapshot name and the inode number it was taken on.  Lookup order: this
+ * realm's own snapshots, then past-parent snapshots (via the snap client),
+ * then the current parent realm.  When `atino` is the inode a snapshot was
+ * taken on, the plain name must match; otherwise the parsed name and inode
+ * number must match.
+ *
+ * Returns 0 when nothing matches.
+ */
+snapid_t SnapRealm::resolve_snapname(std::string_view n, inodeno_t atino, snapid_t first, snapid_t last)
+{
+  // first try me
+  dout(10) << "resolve_snapname '" << n << "' in [" << first << "," << last << "]" << dendl;
+
+  bool actual = (atino == inode->ino());
+  string pname;
+  inodeno_t pino;
+  if (n.length() && n[0] == '_') {
+    size_t next_ = n.find_last_of('_');
+    if (next_ > 1 && next_ + 1 < n.length()) {
+      pname = n.substr(1, next_ - 1);
+      // A string_view is not guaranteed to be NUL-terminated, so copy the
+      // numeric tail into a std::string before handing it to atoll();
+      // parsing n.data() directly could read past the end of the view.
+      const std::string ino_str(n.substr(next_ + 1));
+      pino = atoll(ino_str.c_str());
+      dout(10) << " " << n << " parses to name '" << pname << "' dirino " << pino << dendl;
+    }
+  }
+
+  // own snapshots within [first,last]
+  for (auto p = srnode.snaps.lower_bound(first); // first element >= first
+       p != srnode.snaps.end() && p->first <= last;
+       ++p) {
+    dout(15) << " ? " << p->second << dendl;
+    //if (num && p->second.snapid == num)
+    //return p->first;
+    if (actual && p->second.name == n)
+      return p->first;
+    if (!actual && p->second.name == pname && p->second.ino == pino)
+      return p->first;
+  }
+
+  // past-parent snapshots within [first,last]
+  if (!srnode.past_parent_snaps.empty()) {
+    set<snapid_t> snaps;
+    for (auto p = srnode.past_parent_snaps.lower_bound(first); // first element >= first
+	 p != srnode.past_parent_snaps.end() && *p <= last;
+	 ++p)
+      snaps.insert(*p);
+
+    map<snapid_t, const SnapInfo*> _infomap;
+    mdcache->mds->snapclient->get_snap_infos(_infomap, snaps);
+
+    for (auto& it : _infomap) {
+      dout(15) << " ? " << *it.second << dendl;
+      // here "actual" is judged per snapshot, against the inode it was taken on
+      actual = (it.second->ino == atino);
+      if (actual && it.second->name == n)
+	return it.first;
+      if (!actual && it.second->name == pname && it.second->ino == pino)
+	return it.first;
+    }
+  }
+
+  // finally delegate to the current parent, limited to its tenure
+  if (parent && srnode.current_parent_since <= last)
+    return parent->resolve_snapname(n, atino, std::max(first, srnode.current_parent_since), last);
+  return 0;
+}
+
+
+/*
+ * Recompute this realm's parent and relink it if the parent changed.
+ *
+ * The new parent is the global snaprealm when srnode says the parent is
+ * global; otherwise it is the snaprealm found from the directory inode
+ * holding this inode's parent dentry (or none when there is no parent
+ * dentry).  On a change the open_children sets of the old and new parent
+ * are updated and the cached snaps are invalidated.
+ */
+void SnapRealm::adjust_parent()
+{
+  SnapRealm *newparent = nullptr;
+  if (srnode.is_parent_global()) {
+    newparent = mdcache->get_global_snaprealm();
+  } else if (CDentry *pdn = inode->get_parent_dn()) {
+    newparent = pdn->get_dir()->get_inode()->find_snaprealm();
+  }
+
+  if (newparent == parent)
+    return;
+
+  dout(10) << "adjust_parent " << parent << " -> " << newparent << dendl;
+
+  if (parent)
+    parent->open_children.erase(this);
+  parent = newparent;
+  if (parent)
+    parent->open_children.insert(this);
+
+  invalidate_cached_snaps();
+}
+
+/*
+ * Split the newly created realm `child` off this realm.
+ *
+ * When this realm's inode is an mdsdir or the child's inode is not a
+ * directory, at most the child's own inode is moved into the child realm.
+ * Otherwise every open child realm and every inode with caps that lies
+ * underneath the child's inode is handed over to `child`.
+ */
+void SnapRealm::split_at(SnapRealm *child)
+{
+  dout(10) << "split_at " << *child 
+	   << " on " << *child->inode << dendl;
+
+  if (inode->is_mdsdir() || !child->inode->is_dir()) {
+    // it's not a dir.
+    if (child->inode->containing_realm) {
+      //  - no open children.
+      //  - only need to move this child's inode's caps.
+      child->inode->move_to_realm(child);
+    } else {
+      // no caps, nothing to move/split.
+      dout(20) << " split no-op, no caps to move on file " << *child->inode << dendl;
+      ceph_assert(!child->inode->is_any_caps());
+    }
+    return;
+  }
+
+  // it's a dir.
+
+  // split open_children
+  dout(10) << " open_children are " << open_children << dendl;
+  for (set<SnapRealm*>::iterator p = open_children.begin();
+       p != open_children.end(); ) {
+    SnapRealm *realm = *p;
+    if (realm != child &&
+	child->inode->is_ancestor_of(realm->inode)) {
+      dout(20) << " child gets child realm " << *realm << " on " << *realm->inode << dendl;
+      realm->parent = child;
+      child->open_children.insert(realm);
+      // post-increment keeps the loop iterator valid across the erase
+      open_children.erase(p++);
+    } else {
+      dout(20) << "    keeping child realm " << *realm << " on " << *realm->inode << dendl;
+      ++p;
+    }
+  }
+
+  // split inodes_with_caps
+  for (auto p = inodes_with_caps.begin(); !p.end(); ) {
+    CInode *in = *p;
+    // advance before the inode is possibly moved to the child realm
+    ++p;
+    // does inode fall within the child realm?
+    if (child->inode->is_ancestor_of(in)) {
+      dout(20) << " child gets " << *in << dendl;
+      in->move_to_realm(child);
+    } else {
+      dout(20) << "    keeping " << *in << dendl;
+    }
+  }
+}
+
+/*
+ * Merge this realm into `newparent` (or into the current parent when
+ * `newparent` is null): all open child realms are re-parented, all inodes
+ * with caps are moved over, and the realm is then closed through its inode.
+ *
+ * NOTE(review): the target realm is dereferenced unconditionally, so this
+ * appears to rely on `parent` being set when `newparent` is null -- confirm
+ * against callers.
+ */
+void SnapRealm::merge_to(SnapRealm *newparent)
+{
+  if (!newparent)
+    newparent = parent;
+  dout(10) << "merge to " << *newparent << " on " << *newparent->inode << dendl;
+
+  dout(10) << " open_children are " << open_children << dendl;
+  for (auto realm : open_children) {
+    dout(20) << " child realm " << *realm << " on " << *realm->inode << dendl;
+    newparent->open_children.insert(realm);
+    realm->parent = newparent;
+  }
+  open_children.clear();
+
+  for (auto p = inodes_with_caps.begin(); !p.end(); ) {
+    CInode *in = *p;
+    // advance before the inode is moved to the other realm
+    ++p;
+    in->move_to_realm(newparent);
+  }
+  ceph_assert(inodes_with_caps.empty());
+
+  // delete this
+  inode->close_snaprealm();
+}
+
+// Return the cached encoded snap trace, refreshing the cache first via
+// check_cache().
+const bufferlist& SnapRealm::get_snap_trace() const
+{
+  check_cache();
+  return cached_snap_trace;
+}
+
+/*
+ * Rebuild cached_snap_trace: this realm's encoded SnapRealmInfo followed by
+ * the parent's snap trace.
+ *
+ * For the global realm the info carries all cached snaps, newest first, and
+ * no parent trace is appended.  Otherwise the info carries the realm's own
+ * snapshots (newest first) and, when there is a parent, the parent's inode
+ * number plus the filtered past-parent snapshots as prior_parent_snaps.
+ */
+void SnapRealm::build_snap_trace() const
+{
+  cached_snap_trace.clear();
+
+  if (global) {
+    SnapRealmInfo info(inode->ino(), 0, cached_seq, 0);
+    info.my_snaps.reserve(cached_snaps.size());
+    for (auto p = cached_snaps.rbegin(); p != cached_snaps.rend(); ++p)
+      info.my_snaps.push_back(*p);
+
+    dout(10) << "build_snap_trace my_snaps " << info.my_snaps << dendl;
+    encode(info, cached_snap_trace);
+    return;
+  }
+
+  SnapRealmInfo info(inode->ino(), srnode.created, srnode.seq, srnode.current_parent_since);
+  if (parent) {
+    info.h.parent = parent->inode->ino();
+
+    set<snapid_t> past;
+    if (!srnode.past_parent_snaps.empty()) {
+      past = mdcache->mds->snapclient->filter(srnode.past_parent_snaps);
+      // with a global parent, drop everything from current_parent_since on
+      if (srnode.is_parent_global()) {
+	auto p = past.lower_bound(srnode.current_parent_since);
+	past.erase(p, past.end());
+      }
+    }
+
+    if (!past.empty()) {
+      info.prior_parent_snaps.reserve(past.size());
+      for (set<snapid_t>::reverse_iterator p = past.rbegin(); p != past.rend(); ++p)
+	info.prior_parent_snaps.push_back(*p);
+      dout(10) << "build_snap_trace prior_parent_snaps from [1," << *past.rbegin() << "] "
+	       << info.prior_parent_snaps << dendl;
+    }
+  }
+
+  // own snapshots, newest first
+  info.my_snaps.reserve(srnode.snaps.size());
+  for (auto p = srnode.snaps.rbegin();
+       p != srnode.snaps.rend();
+       ++p)
+    info.my_snaps.push_back(p->first);
+  dout(10) << "build_snap_trace my_snaps " << info.my_snaps << dendl;
+
+  encode(info, cached_snap_trace);
+
+  // the parent's trace (which refreshes the parent's cache) follows ours
+  if (parent)
+    cached_snap_trace.append(parent->get_snap_trace());
+}
+
+/*
+ * Drop every entry of srnode.past_parent_snaps that is no longer present in
+ * the (freshly checked) cached snap set.
+ */
+void SnapRealm::prune_past_parent_snaps()
+{
+  dout(10) << __func__ << dendl;
+  check_cache();
+
+  auto it = srnode.past_parent_snaps.begin();
+  while (it != srnode.past_parent_snaps.end()) {
+    if (cached_snaps.find(*it) != cached_snaps.end()) {
+      dout(10) << __func__ << " keeping " << *it << dendl;
+      ++it;
+      continue;
+    }
+    dout(10) << __func__ << " pruning " << *it << dendl;
+    // post-increment keeps the loop iterator valid across the erase
+    srnode.past_parent_snaps.erase(it++);
+  }
+}
+
diff --git a/src/mds/SnapRealm.h b/src/mds/SnapRealm.h
new file mode 100644
index 000000000..7ddffe0ba
--- /dev/null
+++ b/src/mds/SnapRealm.h
@@ -0,0 +1,150 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software 
+ * Foundation.  See file COPYING.
+ * 
+ */
+
+#ifndef CEPH_MDS_SNAPREALM_H
+#define CEPH_MDS_SNAPREALM_H
+
+#include <string_view>
+
+#include "mdstypes.h"
+#include "snap.h"
+#include "include/xlist.h"
+#include "include/elist.h"
+#include "common/snap_types.h"
+#include "MDSContext.h"
+
+/*
+ * A snapshot realm rooted at an inode.
+ *
+ * Holds the realm state (srnode), links to the parent and open child
+ * realms, the inodes and client capabilities attached to the realm, and a
+ * lazily rebuilt cache (seq, snap set, snap context, snap trace, subvolume
+ * ino) that check_cache() keeps up to date.
+ */
+struct SnapRealm {
+public:
+  SnapRealm(MDCache *c, CInode *in);
+
+  // True if one of this realm's own snapshots carries the given name.
+  bool exists(std::string_view name) const {
+    for (auto p = srnode.snaps.begin(); p != srnode.snaps.end(); ++p) {
+      if (p->second.name == name)
+	return true;
+    }
+    return false;
+  }
+
+  void prune_past_parent_snaps();
+  bool has_past_parent_snaps() const {
+    return !srnode.past_parent_snaps.empty();
+  }
+
+  void build_snap_set() const;
+  void get_snap_info(std::map<snapid_t, const SnapInfo*>& infomap, snapid_t first=0, snapid_t last=CEPH_NOSNAP);
+
+  const ceph::buffer::list& get_snap_trace() const;
+  void build_snap_trace() const;
+
+  std::string_view get_snapname(snapid_t snapid, inodeno_t atino);
+  snapid_t resolve_snapname(std::string_view name, inodeno_t atino, snapid_t first=0, snapid_t last=CEPH_NOSNAP);
+
+  const std::set<snapid_t>& get_snaps() const;
+  const SnapContext& get_snap_context() const;
+  // Reset cached_seq to 0.  NOTE(review): presumably this forces the next
+  // check_cache() to rebuild the cache -- confirm against check_cache().
+  void invalidate_cached_snaps() {
+    cached_seq = 0;
+  }
+  // The getters below refresh the cache via check_cache() before reading it.
+  snapid_t get_last_created() {
+    check_cache();
+    return cached_last_created;
+  }
+  snapid_t get_last_destroyed() {
+    check_cache();
+    return cached_last_destroyed;
+  }
+  // Largest snapid in the cached snap set, or 0 if the set is empty.
+  snapid_t get_newest_snap() {
+    check_cache();
+    if (cached_snaps.empty())
+      return 0;
+    else
+      return *cached_snaps.rbegin();
+  }
+  snapid_t get_newest_seq() {
+    check_cache();
+    return cached_seq;
+  }
+
+  // Smallest cached snapid strictly greater than `follows`, or CEPH_NOSNAP
+  // if there is none.
+  snapid_t get_snap_following(snapid_t follows) {
+    check_cache();
+    const std::set<snapid_t>& s = get_snaps();
+    auto p = s.upper_bound(follows);
+    if (p != s.end())
+      return *p;
+    return CEPH_NOSNAP;
+  }
+
+  // True if the cached snap set has a snapid within [first,last].
+  bool has_snaps_in_range(snapid_t first, snapid_t last) {
+    check_cache();
+    const auto& s = get_snaps();
+    auto p = s.lower_bound(first);
+    return (p != s.end() && *p <= last);
+  }
+
+  inodeno_t get_subvolume_ino() {
+    check_cache();
+    return cached_subvolume_ino;
+  }
+
+  void adjust_parent();
+
+  void split_at(SnapRealm *child);
+  void merge_to(SnapRealm *newparent);
+
+  // Link `cap` into the per-client capability list of this realm, creating
+  // the list on first use.
+  void add_cap(client_t client, Capability *cap) {
+    auto client_caps_entry = client_caps.find(client);
+    if (client_caps_entry == client_caps.end())
+      client_caps_entry = client_caps.emplace(client,
+					      new xlist<Capability*>).first;
+    client_caps_entry->second->push_back(&cap->item_snaprealm_caps);
+  }
+  // Unlink `cap` from its list; the client's list is deleted and dropped
+  // from the map once it has become empty.
+  void remove_cap(client_t client, Capability *cap) {
+    cap->item_snaprealm_caps.remove_myself();
+    auto found = client_caps.find(client);
+    if (found != client_caps.end() && found->second->empty()) {
+      delete found->second;
+      client_caps.erase(found);
+    }
+  }
+
+  // realm state
+  sr_t srnode;
+
+  // in-memory state
+  MDCache *mdcache;
+  CInode *inode;
+
+  SnapRealm *parent = nullptr;
+  std::set<SnapRealm*> open_children;    // active children that are currently open
+
+  elist<CInode*> inodes_with_caps;             // for efficient realm splits
+  // the xlist values are heap-allocated in add_cap() and freed in remove_cap()
+  std::map<client_t, xlist<Capability*>* > client_caps;   // to identify clients who need snap notifications
+
+protected:
+  // Rebuilds the mutable cache members below when they are stale.
+  void check_cache() const;
+
+private:
+  // set by the constructor: true for the CEPH_INO_GLOBAL_SNAPREALM inode
+  bool global;
+
+  // cache
+  mutable snapid_t cached_seq;           // max seq over self and all past+present parents.
+  mutable snapid_t cached_last_created;  // max last_created over all past+present parents
+  mutable snapid_t cached_last_destroyed;
+  mutable std::set<snapid_t> cached_snaps;
+  mutable SnapContext cached_snap_context;
+  mutable ceph::buffer::list cached_snap_trace;
+  mutable inodeno_t cached_subvolume_ino = 0;
+};
+
+std::ostream& operator<<(std::ostream& out, const SnapRealm &realm);
+#endif
diff --git a/src/mds/SnapServer.cc b/src/mds/SnapServer.cc
new file mode 100644
index 000000000..f16480035
--- /dev/null
+++ b/src/mds/SnapServer.cc
@@ -0,0 +1,510 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software 
+ * Foundation.  See file COPYING.
+ * 
+ */
+
+#include "SnapServer.h"
+#include "MDSRank.h"
+#include "osd/OSDMap.h"
+#include "osdc/Objecter.h"
+#include "mon/MonClient.h"
+
+#include "include/types.h"
+#include "messages/MMDSTableRequest.h"
+#include "messages/MRemoveSnaps.h"
+
+#include "msg/Messenger.h"
+
+#include "common/config.h"
+#include "include/ceph_assert.h"
+
+#define dout_context g_ceph_context
+#define dout_subsys ceph_subsys_mds
+#undef dout_prefix
+#define dout_prefix *_dout << "mds." << rank << ".snap "
+
+
+/*
+ * Reset the snap table to its initial state.
+ *
+ * All snapshot and pending-operation state is cleared.  When running inside
+ * a live MDS, last_snap is additionally raised to the largest snap_seq found
+ * on any data pool present in the OSDMap.  last_created, last_destroyed and
+ * snaprealm_v2_since are then derived from last_snap before the base class
+ * state is reset.
+ */
+void SnapServer::reset_state()
+{
+  last_snap = 1;  /* snapid 1 reserved for initial root snaprealm */
+  snaps.clear();
+  need_to_purge.clear();
+  pending_update.clear();
+  pending_destroy.clear();
+  pending_noop.clear();
+
+  // find any removed snapshot in data pools
+  if (mds) {  // only if I'm running in a live MDS
+    snapid_t first_free = 0;
+    mds->objecter->with_osdmap([&](const OSDMap& o) {
+	for (const auto p : mds->mdsmap->get_data_pools()) {
+	  const pg_pool_t *pi = o.get_pg_pool(p);
+	  if (!pi) {
+	    // If pool isn't in OSDMap yet then can't have any snaps
+	    // needing removal, skip.
+	    continue;
+	  }
+	  // track the maximum snap_seq over all data pools
+	  if (pi->snap_seq > first_free) {
+	    first_free = pi->snap_seq;
+	  }
+	}
+      });
+    if (first_free > last_snap)
+      last_snap = first_free;
+  }
+  last_created = last_snap;
+  last_destroyed = last_snap;
+  snaprealm_v2_since = last_snap + 1;
+
+  MDSTableServer::reset_state();
+}
+
+
+// SERVER
+
+/*
+ * Decode a client request from `bl` and record it as pending under the
+ * current table `version`.
+ *
+ *  - TABLE_OP_CREATE: with a name and stamp present, allocates a new snapid
+ *    (++last_snap) and records a pending update; with only an inode number
+ *    present, records a pending no-op.  last_snap is encoded into `out`.
+ *  - TABLE_OP_DESTROY: bumps last_snap, records (snapid, new last_snap) as a
+ *    pending destroy and encodes last_snap into `out`.
+ *  - TABLE_OP_UPDATE: records the decoded SnapInfo as a pending update;
+ *    nothing is written to `out`.
+ *
+ * Any other op aborts.  `reqid` and `bymds` are not used here.
+ */
+void SnapServer::_prepare(const bufferlist& bl, uint64_t reqid, mds_rank_t bymds, bufferlist& out)
+{
+  using ceph::decode;
+  using ceph::encode;
+  auto p = bl.cbegin();
+  __u32 op;
+  decode(op, p);
+
+  switch (op) {
+  case TABLE_OP_CREATE:
+    {
+      SnapInfo info;
+      decode(info.ino, p);
+      // a payload that ends after the inode number is treated as a no-op
+      if (!p.end()) {
+	decode(info.name, p);
+	decode(info.stamp, p);
+	info.snapid = ++last_snap;
+	pending_update[version] = info;
+	dout(10) << "prepare v" << version << " create " << info << dendl;
+      } else {
+	pending_noop.insert(version);
+	dout(10) << "prepare v" << version << " noop" << dendl;
+      }
+
+      encode(last_snap, out);
+    }
+    break;
+
+  case TABLE_OP_DESTROY:
+    {
+      inodeno_t ino;
+      snapid_t snapid;
+      decode(ino, p);    // not used, currently.
+      decode(snapid, p);
+
+      // bump last_snap... we use it as a version value on the snaprealm.
+      ++last_snap;
+
+      pending_destroy[version] = pair<snapid_t,snapid_t>(snapid, last_snap);
+      dout(10) << "prepare v" << version << " destroy " << snapid << " seq " << last_snap << dendl;
+
+      encode(last_snap, out);
+    }
+    break;
+
+  case TABLE_OP_UPDATE:
+    {
+      SnapInfo info;
+      decode(info.ino, p);
+      decode(info.snapid, p);
+      decode(info.name, p);
+      decode(info.stamp, p);
+
+      pending_update[version] = info;
+      dout(10) << "prepare v" << version << " update " << info << dendl;
+    }
+    break;
+
+  default:
+    ceph_abort();
+  }
+  //dump();
+}
+
+/*
+ * Fill the reply payload for the pending transaction `tid`.
+ *
+ *  - pending update whose snapid is not yet in `snaps` (i.e. a create):
+ *    the new snapid is encoded;
+ *  - pending update of an existing snapshot: nothing is encoded;
+ *  - pending destroy: the recorded seq is encoded;
+ *  - pending no-op: last_snap is encoded.
+ *
+ * `pbl` may be null, in which case nothing is written.  A tid that is not
+ * pending at all is a logic error and aborts, in line with the
+ * ceph_abort()/ceph_assert() handling used elsewhere in this file (a plain
+ * assert() would be compiled out under NDEBUG).
+ */
+void SnapServer::_get_reply_buffer(version_t tid, bufferlist *pbl) const
+{
+  using ceph::encode;
+  auto p = pending_update.find(tid);
+  if (p != pending_update.end()) {
+    if (pbl && !snaps.count(p->second.snapid)) // create
+      encode(p->second.snapid, *pbl);
+    return;
+  }
+  auto q = pending_destroy.find(tid);
+  if (q != pending_destroy.end()) {
+    if (pbl)
+      encode(q->second.second, *pbl);
+    return;
+  }
+  auto r = pending_noop.find(tid);
+  if (r != pending_noop.end()) {
+    if (pbl)
+      encode(last_snap, *pbl);
+    return;
+  }
+  ceph_abort_msg("tid not found");
+}
+
+/*
+ * Apply the pending transaction `tid` to the table.
+ *
+ *  - pending update: stored into `snaps`.  For an existing snapshot an
+ *    unset stamp is inherited from the stored entry; for a new snapshot
+ *    last_created is raised if needed.
+ *  - pending destroy: the snapshot is removed, last_destroyed is raised if
+ *    needed, and both the snapid and the seq are queued for purging on
+ *    every data pool.
+ *  - pending no-op: simply dropped.
+ *
+ * A tid that is not pending aborts.  `req` is not used here.
+ */
+void SnapServer::_commit(version_t tid, cref_t<MMDSTableRequest> req)
+{
+  if (auto upd = pending_update.find(tid); upd != pending_update.end()) {
+    SnapInfo &info = upd->second;
+    const char *opname;
+    if (auto existing = snaps.find(info.snapid); existing != snaps.end()) {
+      opname = "update";
+      if (info.stamp == utime_t())
+	info.stamp = existing->second.stamp;
+    } else {
+      opname = "create";
+      if (info.snapid > last_created)
+	last_created = info.snapid;
+    }
+    dout(7) << "commit " << tid << " " << opname << " " << info << dendl;
+    snaps[info.snapid] = info;
+    pending_update.erase(upd);
+    //dump();
+    return;
+  }
+
+  if (auto del = pending_destroy.find(tid); del != pending_destroy.end()) {
+    const snapid_t sn = del->second.first;
+    const snapid_t seq = del->second.second;
+    dout(7) << "commit " << tid << " destroy " << sn << " seq " << seq << dendl;
+    snaps.erase(sn);
+    if (seq > last_destroyed)
+      last_destroyed = seq;
+
+    for (const auto pool : mds->mdsmap->get_data_pools()) {
+      auto& to_purge = need_to_purge[pool];
+      to_purge.insert(sn);
+      to_purge.insert(seq);
+    }
+
+    pending_destroy.erase(del);
+    //dump();
+    return;
+  }
+
+  if (auto noop = pending_noop.find(tid); noop != pending_noop.end()) {
+    dout(7) << "commit " << tid << " noop" << dendl;
+    pending_noop.erase(noop);
+    //dump();
+    return;
+  }
+
+  ceph_abort();
+}
+
+/*
+ * Discard the pending transaction `tid` without applying it.  The tid must
+ * be a pending update, destroy or no-op; anything else aborts.
+ */
+void SnapServer::_rollback(version_t tid)
+{
+  if (auto upd = pending_update.find(tid); upd != pending_update.end()) {
+    const SnapInfo &info = upd->second;
+    // an update of a known snapshot vs. creation of a new one
+    const char *opname = snaps.count(info.snapid) ? "update" : "create";
+    dout(7) << "rollback " << tid << " " << opname << " " << info << dendl;
+    pending_update.erase(upd);
+    //dump();
+    return;
+  }
+
+  if (auto del = pending_destroy.find(tid); del != pending_destroy.end()) {
+    dout(7) << "rollback " << tid << " destroy " << del->second << dendl;
+    pending_destroy.erase(del);
+    //dump();
+    return;
+  }
+
+  if (auto noop = pending_noop.find(tid); noop != pending_noop.end()) {
+    dout(7) << "rollback " << tid << " noop" << dendl;
+    pending_noop.erase(noop);
+    //dump();
+    return;
+  }
+
+  ceph_abort();
+}
+
+/*
+ * Apply a server-side update: `bl` carries a map of pool id -> purged
+ * snapids.  Each listed snapid is removed from need_to_purge for its pool,
+ * and a pool whose set has become empty is dropped from the map.
+ */
+void SnapServer::_server_update(bufferlist& bl)
+{
+  using ceph::decode;
+  auto bp = bl.cbegin();
+  map<int, vector<snapid_t> > purge;
+  decode(purge, bp);
+
+  dout(7) << "_server_update purged " << purge << dendl;
+  for (const auto& entry : purge) {
+    const int pool = entry.first;
+    auto& remaining = need_to_purge[pool];
+    for (const auto& snapid : entry.second)
+      remaining.erase(snapid);
+    if (remaining.empty())
+      need_to_purge.erase(pool);
+  }
+}
+
+/*
+ * Send a TABLESERVER_OP_NOTIFY_PREP message carrying a full ('F') copy of
+ * the table state (snaps, pending updates, pending destroys, last_created,
+ * last_destroyed) to every MDS in active_clients.  `tid` must equal the
+ * current table version.  Always returns true.
+ */
+bool SnapServer::_notify_prep(version_t tid)
+{
+  using ceph::encode;
+  bufferlist bl;
+  char type = 'F';
+  encode(type, bl);
+  encode(snaps, bl);
+  encode(pending_update, bl);
+  encode(pending_destroy, bl);
+  encode(last_created, bl);
+  encode(last_destroyed, bl);
+  ceph_assert(version == tid);
+
+  // the same payload is attached to each client's message
+  for (auto &p : active_clients) {
+    auto m = make_message<MMDSTableRequest>(table, TABLESERVER_OP_NOTIFY_PREP, 0, version);
+    m->bl = bl;
+    mds->send_message_mds(m, p);
+  }
+  return true;
+}
+
+/*
+ * Answer a client query with a TABLESERVER_OP_QUERY_REPLY sent back on the
+ * request's connection.
+ *
+ * Only the 'F' (full) query is supported; any other op aborts.  The client
+ * sends the version it already has: if that equals the server's version the
+ * reply is just the type byte 'U', otherwise it is 'F' followed by the full
+ * table state.  A client version newer than the server's trips an assert.
+ */
+void SnapServer::handle_query(const cref_t<MMDSTableRequest> &req)
+{
+  using ceph::encode;
+  using ceph::decode;
+  char op;
+  auto p = req->bl.cbegin();
+  decode(op, p);
+
+  auto reply = make_message<MMDSTableRequest>(table, TABLESERVER_OP_QUERY_REPLY, req->reqid, version);
+
+  switch (op) {
+    case 'F': // full
+      version_t have_version;
+      decode(have_version, p);
+      ceph_assert(have_version <= version);
+      if (have_version == version) {
+	// client is up to date: type byte only
+	char type = 'U';
+	encode(type, reply->bl);
+      } else {
+	char type = 'F';
+	encode(type, reply->bl);
+	encode(snaps, reply->bl);
+	encode(pending_update, reply->bl);
+	encode(pending_destroy, reply->bl);
+	encode(last_created, reply->bl);
+	encode(last_destroyed, reply->bl);
+      }
+      // FIXME: implement incremental change
+      break;
+    default:
+      ceph_abort();
+  };
+
+  mds->send_message(reply, req->get_connection());
+}
+
+/*
+ * Reconcile need_to_purge with the current OSDMap.
+ *
+ * Skipped when the table version has not changed since the last check,
+ * unless `force` is set.  Snapshots of pools that no longer exist, or that
+ * the OSDMap already marks as removed, are dropped from need_to_purge via a
+ * server update; for the remaining ones an MRemoveSnaps request is sent to
+ * the monitor.
+ */
+void SnapServer::check_osd_map(bool force)
+{
+  if (!force && version == last_checked_osdmap) {
+    dout(10) << "check_osd_map - version unchanged" << dendl;
+    return;
+  }
+  dout(10) << "check_osd_map need_to_purge=" << need_to_purge << dendl;
+
+  // all_purge: still to be removed; all_purged: already gone
+  map<int32_t, vector<snapid_t> > all_purge;
+  map<int32_t, vector<snapid_t> > all_purged;
+
+  // NOTE: this is only needed for support during upgrades from pre-octopus,
+  // since starting with octopus we now get an explicit ack after we remove a
+  // snap.
+  mds->objecter->with_osdmap(
+    [this, &all_purged, &all_purge](const OSDMap& osdmap) {
+      for (const auto& p : need_to_purge) {
+	int id = p.first;
+	const pg_pool_t *pi = osdmap.get_pg_pool(id);
+	if (pi == NULL) {
+	  // The pool is gone.  So are the snapshots.
+	  all_purged[id] = std::vector<snapid_t>(p.second.begin(),
+						 p.second.end());
+	  continue;
+	}
+
+	for (const auto& q : p.second) {
+	  if (pi->is_removed_snap(q)) {
+	    dout(10) << " osdmap marks " << q << " as removed" << dendl;
+	    all_purged[id].push_back(q);
+	  } else {
+	    all_purge[id].push_back(q);
+	  }
+	}
+      }
+  });
+
+  if (!all_purged.empty()) {
+    // prepare to remove from need_to_purge list
+    bufferlist bl;
+    using ceph::encode;
+    encode(all_purged, bl);
+    do_server_update(bl);
+  }
+
+  if (!all_purge.empty()) {
+    dout(10) << "requesting removal of " << all_purge << dendl;
+    auto m = make_message<MRemoveSnaps>(all_purge);
+    mon_client->send_mon_message(m.detach());
+  }
+
+  // remember the table version this check was made at
+  last_checked_osdmap = version;
+}
+
+/*
+ * Process an MRemoveSnaps message: every snapid in need_to_purge that the
+ * message lists for the same pool is collected and, if any were found,
+ * dropped from need_to_purge through a server update.
+ */
+void SnapServer::handle_remove_snaps(const cref_t<MRemoveSnaps> &m)
+{
+  dout(10) << __func__ << " " << *m << dendl;
+
+  map<int32_t, vector<snapid_t> > all_purged;
+  int num = 0;
+
+  for (const auto& [id, snaps] : need_to_purge) {
+    auto i = m->snaps.find(id);
+    if (i == m->snaps.end()) {
+      // the message says nothing about this pool
+      continue;
+    }
+    for (const auto& q : snaps) {
+      if (std::find(i->second.begin(), i->second.end(), q) != i->second.end()) {
+	dout(10) << " mon reports " << q << " is removed" << dendl;
+	all_purged[id].push_back(q);
+	++num;
+      }
+    }
+  }
+
+  dout(10) << __func__ << " " << num << " now removed" << dendl;
+  if (num) {
+    bufferlist bl;
+    using ceph::encode;
+    encode(all_purged, bl);
+    do_server_update(bl);
+  }
+}
+
+
+/*
+ * Dump the snap table into `f` as a "snapserver" object: the three
+ * counters, the pending no-op versions, all snapshots, the per-pool purge
+ * queue, and the pending updates and destroys.
+ */
+void SnapServer::dump(Formatter *f) const
+{
+  f->open_object_section("snapserver");
+
+  f->dump_int("last_snap", last_snap);
+  f->dump_int("last_created", last_created);
+  f->dump_int("last_destroyed", last_destroyed);
+
+  f->open_array_section("pending_noop");
+  for (const auto& tid : pending_noop)
+    f->dump_unsigned("version", tid);
+  f->close_section();
+
+  f->open_array_section("snaps");
+  for (const auto& snap : snaps) {
+    f->open_object_section("snap");
+    snap.second.dump(f);
+    f->close_section();
+  }
+  f->close_section();
+
+  // one array per pool, named after the pool id
+  f->open_object_section("need_to_purge");
+  for (const auto& pool : need_to_purge) {
+    CachedStackStringStream css;
+    *css << pool.first;
+    f->open_array_section(css->strv());
+    for (const auto& snapid : pool.second)
+      f->dump_unsigned("snapid", snapid.val);
+    f->close_section();
+  }
+  f->close_section();
+
+  f->open_array_section("pending_update");
+  for (const auto& upd : pending_update) {
+    f->open_object_section("snap");
+    f->dump_unsigned("version", upd.first);
+    f->open_object_section("snapinfo");
+    upd.second.dump(f);
+    f->close_section();
+    f->close_section();
+  }
+  f->close_section();
+
+  f->open_array_section("pending_destroy");
+  for (const auto& del : pending_destroy) {
+    f->open_object_section("snap");
+    f->dump_unsigned("version", del.first);
+    f->dump_unsigned("removed_snap", del.second.first);
+    f->dump_unsigned("seq", del.second.second);
+    f->close_section();
+  }
+  f->close_section();
+
+  f->close_section();
+}
+
+/*
+ * Append two heap-allocated test instances to `ls`: a default-constructed
+ * server and one with every container populated.  Ownership of both passes
+ * to the caller.
+ */
+void SnapServer::generate_test_instances(std::list<SnapServer*>& ls)
+{
+  // borrow the last SnapInfo test instance, then free the helper objects
+  list<SnapInfo*> snapinfo_instances;
+  SnapInfo::generate_test_instances(snapinfo_instances);
+  const SnapInfo populated_snapinfo = *snapinfo_instances.back();
+  for (SnapInfo *info : snapinfo_instances)
+    delete info;
+  snapinfo_instances.clear();
+
+  ls.push_back(new SnapServer());
+
+  SnapServer *populated = new SnapServer();
+  populated->last_snap = 123;
+  populated->snaps[456] = populated_snapinfo;
+  populated->need_to_purge[2].insert(10);
+  populated->pending_update[234] = populated_snapinfo;
+  populated->pending_destroy[345] = pair<snapid_t,snapid_t>(567, 768);
+  populated->pending_noop.insert(890);
+  ls.push_back(populated);
+}
+
+/*
+ * Forcibly bring the table in line with externally supplied values.
+ *
+ * last_snap (together with last_created/last_destroyed) and
+ * snaprealm_v2_since are only ever raised, never lowered; `snaps` is
+ * replaced whenever it differs from `_snaps`.  If anything changed, all
+ * pending and purge state is dropped and the base class state is reset.
+ *
+ * Returns true if the table was modified.
+ */
+bool SnapServer::force_update(snapid_t last, snapid_t v2_since,
+			      map<snapid_t, SnapInfo>& _snaps)
+{
+  bool modified = false;
+  if (last > last_snap) {
+    derr << " updating last_snap " << last_snap << " -> " << last << dendl;
+    last_snap = last;
+    last_created = last;
+    last_destroyed = last;
+    modified = true;
+  }
+  if (v2_since > snaprealm_v2_since) {
+    derr << " updating snaprealm_v2_since " << snaprealm_v2_since
+	 << " -> " << v2_since << dendl;
+    snaprealm_v2_since = v2_since;
+    modified = true;
+  }
+  if (snaps != _snaps) {
+    derr << " updating snaps {" << snaps << "} -> {" << _snaps << "}" << dendl;
+    snaps = _snaps;
+    modified = true;
+  }
+
+  // pending transactions no longer match the forced state; discard them
+  if (modified) {
+    need_to_purge.clear();
+    pending_update.clear();
+    pending_destroy.clear();
+    pending_noop.clear();
+    MDSTableServer::reset_state();
+  }
+  return modified;
+}
diff --git a/src/mds/SnapServer.h b/src/mds/SnapServer.h
new file mode 100644
index 000000000..a1019c3c8
--- /dev/null
+++ b/src/mds/SnapServer.h
@@ -0,0 +1,141 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software 
+ * Foundation.  See file COPYING.
+ * 
+ */
+
+#ifndef CEPH_SNAPSERVER_H
+#define CEPH_SNAPSERVER_H
+
+#include "MDSTableServer.h"
+#include "snap.h"
+
+#include "messages/MRemoveSnaps.h"
+
+class MDSRank;
+class MonClient;
+
+/**
+ * Table server for the snapshot table (TABLE_SNAP).
+ *
+ * Holds the set of known snapshots plus the updates/destroys that have
+ * been prepared but not yet committed, and knows how to encode/decode
+ * that state.
+ */
+class SnapServer : public MDSTableServer {
+public:
+  SnapServer(MDSRank *m, MonClient *monc)
+    : MDSTableServer(m, TABLE_SNAP), mon_client(monc) {}
+  // No rank and no mon client; used by generate_test_instances().
+  SnapServer() : MDSTableServer(NULL, TABLE_SNAP) {}
+
+  void handle_remove_snaps(const cref_t<MRemoveSnaps> &m);
+
+  void reset_state() override;
+
+  /**
+   * Bring a table written by an older release up to date.
+   *
+   * Requires the server to be active and last_snap to be non-zero.
+   *
+   * @return true if anything was changed
+   */
+  bool upgrade_format() {
+    // upgraded from old filesystem
+    ceph_assert(is_active());
+    ceph_assert(last_snap > 0);
+    bool upgraded = false;
+    if (get_version() == 0) {
+      // version 0 confuses snapclient code
+      reset();
+      upgraded = true;
+    }
+    if (snaprealm_v2_since == CEPH_NOSNAP) {
+      // new snapshots will have new format snaprealms
+      snaprealm_v2_since = last_snap + 1;
+      upgraded = true;
+    }
+    return upgraded;
+  }
+
+  void check_osd_map(bool force);
+
+  // True when there are no snapshots, or the lowest-keyed entry in snaps
+  // is not older than snaprealm_v2_since.
+  bool can_allow_multimds_snaps() const {
+    return snaps.empty() || snaps.begin()->first >= snaprealm_v2_since;
+  }
+
+  // encode()/decode() simply forward to the server-state codecs below.
+  void encode(bufferlist& bl) const {
+    encode_server_state(bl);
+  }
+  void decode(bufferlist::const_iterator& bl) {
+    decode_server_state(bl);
+  }
+
+  void dump(Formatter *f) const;
+  static void generate_test_instances(std::list<SnapServer*>& ls);
+
+  bool force_update(snapid_t last, snapid_t v2_since,
+		    map<snapid_t, SnapInfo>& _snaps);
+
+protected:
+  // Writes struct version 5 (compat 3).  The field order here must stay
+  // in step with decode_server_state().
+  void encode_server_state(bufferlist& bl) const override {
+    ENCODE_START(5, 3, bl);
+    encode(last_snap, bl);
+    encode(snaps, bl);
+    encode(need_to_purge, bl);
+    encode(pending_update, bl);
+    encode(pending_destroy, bl);
+    encode(pending_noop, bl);
+    encode(last_created, bl);
+    encode(last_destroyed, bl);
+    encode(snaprealm_v2_since, bl);
+    ENCODE_FINISH(bl);
+  }
+  // Reads any struct version up to 5, filling in defaults for fields
+  // that older versions did not carry.
+  void decode_server_state(bufferlist::const_iterator& bl) override {
+    DECODE_START_LEGACY_COMPAT_LEN(5, 3, 3, bl);
+    decode(last_snap, bl);
+    decode(snaps, bl);
+    decode(need_to_purge, bl);
+    decode(pending_update, bl);
+    if (struct_v >= 2)
+      decode(pending_destroy, bl);
+    else {
+      // v1 stored only the removed snapid per version; the seq half of
+      // the pair is left at its default.
+      map<version_t, snapid_t> t;
+      decode(t, bl);
+      for (map<version_t, snapid_t>::iterator p = t.begin(); p != t.end(); ++p)
+	pending_destroy[p->first].first = p->second; 
+    } 
+    decode(pending_noop, bl);
+    if (struct_v >= 4) {
+      decode(last_created, bl);
+      decode(last_destroyed, bl);
+    } else {
+      // before v4 these were not tracked separately from last_snap
+      last_created = last_snap;
+      last_destroyed = last_snap;
+    }
+    if (struct_v >= 5)
+      decode(snaprealm_v2_since, bl);
+    else
+      // CEPH_NOSNAP marks "not yet set"; see upgrade_format()
+      snaprealm_v2_since = CEPH_NOSNAP;
+
+    DECODE_FINISH(bl);
+  }
+
+  // server bits
+  void _prepare(const bufferlist &bl, uint64_t reqid, mds_rank_t bymds, bufferlist &out) override;
+  void _get_reply_buffer(version_t tid, bufferlist *pbl) const override;
+  void _commit(version_t tid, cref_t<MMDSTableRequest> req) override;
+  void _rollback(version_t tid) override;
+  void _server_update(bufferlist& bl) override;
+  bool _notify_prep(version_t tid) override;
+  void handle_query(const cref_t<MMDSTableRequest> &m) override;
+
+  MonClient *mon_client = nullptr;
+  snapid_t last_snap = 0;
+  snapid_t last_created, last_destroyed;
+  // CEPH_NOSNAP until upgrade_format() or decode sets a real value
+  snapid_t snaprealm_v2_since;
+  map<snapid_t, SnapInfo> snaps;
+  map<int, set<snapid_t> > need_to_purge;
+
+  // Tables keyed by version_t.
+  // NOTE(review): presumably the table transaction id -- confirm.
+  map<version_t, SnapInfo> pending_update;
+  map<version_t, pair<snapid_t,snapid_t> > pending_destroy; // (removed_snap, seq)
+  set<version_t> pending_noop;
+
+  version_t last_checked_osdmap = 0;
+};
+WRITE_CLASS_ENCODER(SnapServer)
+
+#endif
diff --git a/src/mds/StrayManager.cc b/src/mds/StrayManager.cc
new file mode 100644
index 000000000..9bf1a83d3
--- /dev/null
+++ b/src/mds/StrayManager.cc
@@ -0,0 +1,763 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2015 Red Hat
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software 
+ * Foundation.  See file COPYING.
+ * 
+ */
+
+
+#include "common/perf_counters.h"
+
+#include "mds/MDSRank.h"
+#include "mds/MDCache.h"
+#include "mds/MDLog.h"
+#include "mds/CDir.h"
+#include "mds/CDentry.h"
+#include "events/EUpdate.h"
+#include "messages/MClientRequest.h"
+
+#include "StrayManager.h"
+
+#define dout_context g_ceph_context
+#define dout_subsys ceph_subsys_mds
+#undef dout_prefix
+#define dout_prefix _prefix(_dout, mds)
+// Prefix for every dout/derr line in this file: "mds.<nodeid>.cache.strays ".
+static ostream& _prefix(std::ostream *_dout, MDSRank *mds) {
+  std::ostream& out = *_dout;
+  out << "mds." << mds->get_nodeid() << ".cache.strays ";
+  return out;
+}
+
+/**
+ * Base for StrayManager I/O completions: remembers the owning
+ * StrayManager and resolves the MDSRank through it.
+ */
+class StrayManagerIOContext : public virtual MDSIOContextBase {
+public:
+  explicit StrayManagerIOContext(StrayManager *owner) : sm(owner) {}
+protected:
+  MDSRank *get_mds() override { return sm->mds; }
+
+  StrayManager *sm;
+};
+
+/**
+ * Base for StrayManager journal completions: remembers the owning
+ * StrayManager and resolves the MDSRank through it.
+ */
+class StrayManagerLogContext : public virtual MDSLogContextBase {
+public:
+  explicit StrayManagerLogContext(StrayManager *owner) : sm(owner) {}
+protected:
+  MDSRank *get_mds() override { return sm->mds; }
+
+  StrayManager *sm;
+};
+
+/**
+ * Base for plain StrayManager callbacks: remembers the owning
+ * StrayManager and resolves the MDSRank through it.
+ */
+class StrayManagerContext : public virtual MDSContext {
+public:
+  explicit StrayManagerContext(StrayManager *owner) : sm(owner) {}
+protected:
+  MDSRank *get_mds() override { return sm->mds; }
+
+  StrayManager *sm;
+};
+
+
+/**
+ * Completion passed to PurgeQueue::push() by purge() and truncate();
+ * forwards to StrayManager::_purge_stray_purged().
+ */
+class C_IO_PurgeStrayPurged : public StrayManagerIOContext {
+  CDentry *dn;
+  bool only_head;
+public:
+  C_IO_PurgeStrayPurged(StrayManager *sm_, CDentry *d, bool oh)
+    : StrayManagerIOContext(sm_),
+      dn(d),
+      only_head(oh)
+  {}
+
+  // Only success or ENOENT are acceptable results.
+  void finish(int r) override {
+    ceph_assert(r == 0 || r == -CEPHFS_ENOENT);
+    sm->_purge_stray_purged(dn, only_head);
+  }
+
+  void print(ostream& out) const override {
+    out << "purge_stray("
+        << dn->get_projected_linkage()->get_inode()->ino()
+        << ")";
+  }
+};
+
+
+/**
+ * Build a PurgeItem describing the stray inode behind @p dn and push it
+ * onto the purge queue.  _purge_stray_purged() runs (with only_head =
+ * false) when the push completes.
+ */
+void StrayManager::purge(CDentry *dn)
+{
+  CInode *in = dn->get_projected_linkage()->get_inode();
+  dout(10) << __func__ << " " << *dn << " " << *in << dendl;
+  ceph_assert(!dn->is_replicated());
+
+  // CHEAT.  there's no real need to journal our intent to purge, since
+  // that is implicit in the dentry's presence and non-use in the stray
+  // dir.  on recovery, we'll need to re-eval all strays anyway.
+
+  PurgeItem item;
+  item.ino = in->ino();
+  item.stamp = ceph_clock_now();
+
+  if (in->is_dir()) {
+    item.action = PurgeItem::PURGE_DIR;
+    item.fragtree = in->dirfragtree;
+  } else {
+    item.action = PurgeItem::PURGE_FILE;
+
+    // snap context: the realm's if there is one, otherwise an empty one
+    SnapRealm *realm = in->find_snaprealm();
+    if (realm) {
+      dout(10) << " realm " << *realm << dendl;
+      item.snapc = realm->get_snap_context();
+    } else {
+      dout(10) << " NO realm, using null context" << dendl;
+      item.snapc = SnapContext();
+      ceph_assert(in->last == CEPH_NOSNAP);
+    }
+
+    const auto& pi = in->get_projected_inode();
+
+    // Non-regular files are purged with size 0.
+    uint64_t purge_len = 0;
+    if (in->is_file()) {
+      purge_len = std::max(pi->size, pi->get_max_size());
+      // when truncating a file, the filer does not delete stripe objects
+      // that are truncated to zero. so we need to purge stripe objects up
+      // to the max size the file has ever been.
+      purge_len = std::max(pi->max_size_ever, purge_len);
+    }
+    item.size = purge_len;
+    item.layout = pi->layout;
+
+    // every old pool except the one the current layout already names
+    item.old_pools.reserve(pi->old_pools.size());
+    for (const auto &pool : pi->old_pools) {
+      if (pool == pi->layout.pool_id)
+        continue;
+      item.old_pools.push_back(pool);
+    }
+  }
+
+  purge_queue.push(item, new C_IO_PurgeStrayPurged(this, dn, false));
+}
+
+/**
+ * Journal completion for the "purge_stray" EUpdate; forwards to
+ * StrayManager::_purge_stray_logged().
+ */
+class C_PurgeStrayLogged : public StrayManagerLogContext {
+  CDentry *dn;
+  version_t pdv;
+  MutationRef mut;
+public:
+  C_PurgeStrayLogged(StrayManager *sm_, CDentry *d, version_t v, MutationRef& m)
+    : StrayManagerLogContext(sm_),
+      dn(d),
+      pdv(v),
+      mut(m)
+  {}
+
+  void finish(int r) override {
+    sm->_purge_stray_logged(dn, pdv, mut);
+  }
+};
+
+/**
+ * Journal completion for the "purge_stray truncate" EUpdate; forwards
+ * to StrayManager::_truncate_stray_logged().
+ */
+class C_TruncateStrayLogged : public StrayManagerLogContext {
+  CDentry *dn;
+  MutationRef mut;
+public:
+  C_TruncateStrayLogged(StrayManager *sm_, CDentry *d, MutationRef& m)
+    : StrayManagerLogContext(sm_),
+      dn(d),
+      mut(m)
+  {}
+
+  void finish(int r) override {
+    sm->_truncate_stray_logged(dn, mut);
+  }
+};
+
+/**
+ * Runs once the PurgeQueue push issued by purge() or truncate() has
+ * completed.
+ *
+ * @param dn         the stray dentry that was enqueued
+ * @param only_head  true when the push came from truncate(): the inode
+ *                   is kept and only its size-related fields are zeroed.
+ *                   false when it came from purge(): the dentry is
+ *                   nulled and the inode recorded as destroyed.
+ *
+ * Either way the change is journaled as an EUpdate and finished in
+ * _truncate_stray_logged() / _purge_stray_logged().
+ */
+void StrayManager::_purge_stray_purged(
+    CDentry *dn, bool only_head)
+{
+  CInode *in = dn->get_projected_linkage()->get_inode();
+  dout(10) << "_purge_stray_purged " << *dn << " " << *in << dendl;
+
+  logger->inc(l_mdc_strays_enqueued);
+  num_strays_enqueuing--;
+  logger->set(l_mdc_num_strays_enqueuing, num_strays_enqueuing);
+
+  if (only_head) {
+    /* This was a ::truncate */
+    MutationRef mut(new MutationImpl());
+    mut->ls = mds->mdlog->get_current_segment();
+
+    // project an inode with all size/truncation state zeroed
+    auto pi = in->project_inode(mut);
+    pi.inode->size = 0;
+    pi.inode->max_size_ever = 0;
+    pi.inode->client_ranges.clear();
+    pi.inode->truncate_size = 0;
+    pi.inode->truncate_from = 0;
+    pi.inode->version = in->pre_dirty();
+    in->clear_clientwriteable();
+
+    CDir *dir = dn->get_dir();
+    auto pf = dir->project_fnode(mut);
+    pf->version = dir->pre_dirty();
+
+    EUpdate *le = new EUpdate(mds->mdlog, "purge_stray truncate");
+    mds->mdlog->start_entry(le);
+
+    le->metablob.add_dir_context(dir);
+    auto& dl = le->metablob.add_dir(dn->dir, true);
+    le->metablob.add_primary_dentry(dl, dn, in, EMetaBlob::fullbit::STATE_DIRTY);
+
+    mds->mdlog->submit_entry(le, new C_TruncateStrayLogged(this, dn, mut));
+  } else {
+    if (in->get_num_ref() != (int)in->is_dirty() ||
+        dn->get_num_ref() !=
+          (int)dn->is_dirty() +
+          !!dn->state_test(CDentry::STATE_FRAGMENTING) +
+          !!in->get_num_ref() + 1 /* PIN_PURGING */) {
+      // Nobody should be taking new references to an inode while it is
+      // being purged.  The only references tolerated are the ones counted
+      // in the condition above: the dirty pins, the fragmenting state and
+      // PIN_PURGING.
+
+      derr << "Rogue reference after purge to " << *dn << dendl;
+      ceph_abort_msg("rogue reference to purging inode");
+    }
+
+    MutationRef mut(new MutationImpl());
+    mut->ls = mds->mdlog->get_current_segment();
+
+    // kill dentry.
+    version_t pdv = dn->pre_dirty();
+    dn->push_projected_linkage(); // NULL
+
+    EUpdate *le = new EUpdate(mds->mdlog, "purge_stray");
+    mds->mdlog->start_entry(le);
+
+    // update dirfrag fragstat, rstat
+    CDir *dir = dn->get_dir();
+    auto pf = dir->project_fnode(mut);
+    pf->version = dir->pre_dirty();
+    if (in->is_dir())
+      pf->fragstat.nsubdirs--;
+    else
+      pf->fragstat.nfiles--;
+    pf->rstat.sub(in->get_inode()->accounted_rstat);
+
+    le->metablob.add_dir_context(dn->dir);
+    auto& dl = le->metablob.add_dir(dn->dir, true);
+    le->metablob.add_null_dentry(dl, dn, true);
+    le->metablob.add_destroyed_inode(in->ino());
+
+    mds->mdlog->submit_entry(le, new C_PurgeStrayLogged(this, dn, pdv, mut));
+  }
+}
+
+/**
+ * Runs after the "purge_stray" EUpdate has been journaled: unlinks the
+ * inode from @p dn, applies the mutation, drops the purging state and
+ * pin, and removes the inode from the cache.
+ *
+ * @param dn   the stray dentry being purged
+ * @param pdv  dentry version obtained from pre_dirty() when the update
+ *             was prepared
+ * @param mut  mutation carrying the projected fnode change
+ */
+void StrayManager::_purge_stray_logged(CDentry *dn, version_t pdv, MutationRef& mut)
+{
+  CInode *in = dn->get_linkage()->get_inode();
+  CDir *dir = dn->get_dir();
+  dout(10) << "_purge_stray_logged " << *dn << " " << *in << dendl;
+
+  ceph_assert(!in->state_test(CInode::STATE_RECOVERING));
+  ceph_assert(!dir->is_frozen_dir());
+
+  // sampled before mark_dirty() below
+  bool new_dn = dn->is_new();
+
+  // unlink
+  ceph_assert(dn->get_projected_linkage()->is_null());
+  dir->unlink_inode(dn, !new_dn);
+  dn->pop_projected_linkage();
+  dn->mark_dirty(pdv, mut->ls);
+
+  mut->apply();
+
+  in->state_clear(CInode::STATE_ORPHAN);
+  dn->state_clear(CDentry::STATE_PURGING | CDentry::STATE_PURGINGPINNED);
+  dn->put(CDentry::PIN_PURGING);
+
+
+  // drop dentry?
+  if (new_dn) {
+    dout(20) << " dn is new, removing" << dendl;
+    dn->mark_clean();
+    dir->remove_dentry(dn);
+  }
+
+  // drop inode
+  // ino is copied out first because it is still needed after remove_inode()
+  inodeno_t ino = in->ino();
+  if (in->is_dirty())
+    in->mark_clean();
+  mds->mdcache->remove_inode(in);
+
+  // releases the auth_pin taken in _enqueue()
+  dir->auth_unpin(this);
+
+  if (mds->is_stopping())
+    mds->mdcache->shutdown_export_stray_finish(ino);
+}
+
+/**
+ * Mark @p dn and its inode as purging, pin the dentry, bump the
+ * "enqueuing" counter and hand over to _enqueue().
+ *
+ * @param trunc  true to truncate the head only, false to purge entirely
+ */
+void StrayManager::enqueue(CDentry *dn, bool trunc)
+{
+  CDentry::linkage_t *dnl = dn->get_projected_linkage();
+  ceph_assert(dnl);
+  CInode *in = dnl->get_inode();
+  ceph_assert(in);
+
+  /* We consider a stray to be purging as soon as it is enqueued, to avoid
+   * enqueing it twice */
+  dn->state_set(CDentry::STATE_PURGING);
+  in->state_set(CInode::STATE_PURGING);
+
+  /* We must clear this as soon as enqueuing it, to prevent the journal
+   * expiry code from seeing a dirty parent and trying to write a backtrace */
+  if (!trunc && in->is_dirty_parent())
+    in->clear_dirty_parent();
+
+  dout(20) << __func__ << ": purging dn: " << *dn << dendl;
+
+  // take PIN_PURGING once only; STATE_PURGINGPINNED remembers that we did
+  const bool already_pinned = dn->state_test(CDentry::STATE_PURGINGPINNED);
+  if (!already_pinned) {
+    dn->get(CDentry::PIN_PURGING);
+    dn->state_set(CDentry::STATE_PURGINGPINNED);
+  }
+
+  ++num_strays_enqueuing;
+  logger->set(l_mdc_num_strays_enqueuing, num_strays_enqueuing);
+
+  // Resources are available, acquire them and execute the purge
+  _enqueue(dn, trunc);
+
+  dout(10) << __func__ << ": purging this dentry immediately: "
+           << *dn << dendl;
+}
+
+/**
+ * Waiter that re-runs StrayManager::_enqueue() for a dentry.
+ */
+class C_RetryEnqueue : public StrayManagerContext {
+  CDentry *dn;
+  bool trunc;
+public:
+  C_RetryEnqueue(StrayManager *sm_, CDentry *dn_, bool t)
+    : StrayManagerContext(sm_),
+      dn(dn_),
+      trunc(t)
+  {}
+
+  void finish(int r) override {
+    sm->_enqueue(dn, trunc);
+  }
+};
+
+/**
+ * Auth-pin the dentry's directory and start the truncate or purge.  If
+ * the directory cannot be auth-pinned right now, wait for it to unfreeze
+ * and try again.
+ */
+void StrayManager::_enqueue(CDentry *dn, bool trunc)
+{
+  ceph_assert(started);
+
+  CDir *dir = dn->get_dir();
+  if (dir->can_auth_pin()) {
+    // pin is released in _purge_stray_logged() / _truncate_stray_logged()
+    dir->auth_pin(this);
+    if (trunc)
+      truncate(dn);
+    else
+      purge(dn);
+    return;
+  }
+
+  dout(10) << " can't auth_pin (freezing?) " << *dir << ", waiting" << dendl;
+  dir->add_waiter(CDir::WAIT_UNFREEZE, new C_RetryEnqueue(this, dn, trunc));
+}
+
+/**
+ * Put @p dn on the delayed-evaluation list unless the manager has not
+ * started, the dentry is being evaluated right now, or it is already
+ * on the list.
+ */
+void StrayManager::queue_delayed(CDentry *dn)
+{
+  if (!started || dn->state_test(CDentry::STATE_EVALUATINGSTRAY))
+    return;
+
+  if (dn->item_stray.is_on_list())
+    return;
+
+  delayed_eval_stray.push_back(&dn->item_stray);
+  num_strays_delayed++;
+  logger->set(l_mdc_num_strays_delayed, num_strays_delayed);
+}
+
+/**
+ * Drain the delayed-evaluation list, calling eval_stray() on every
+ * dentry that still has a non-null projected linkage.
+ */
+void StrayManager::advance_delayed()
+{
+  if (!started)
+    return;
+
+  while (!delayed_eval_stray.empty()) {
+    CDentry *next = delayed_eval_stray.front();
+    next->item_stray.remove_myself();
+    num_strays_delayed--;
+
+    if (!next->get_projected_linkage()->is_null()) {
+      eval_stray(next);
+    } else {
+      /* A special case: a stray dentry can go null if its inode is being
+       * re-linked into another MDS's stray dir during a shutdown migration. */
+      dout(4) << __func__ << ": delayed dentry is now null: " << *next << dendl;
+    }
+  }
+
+  // the counter is published once, after the whole list has been drained
+  logger->set(l_mdc_num_strays_delayed, num_strays_delayed);
+}
+
+// Seed the stray counter; only permitted before activate().
+void StrayManager::set_num_strays(uint64_t num)
+{
+  ceph_assert(!started);
+
+  num_strays = num;
+  logger->set(l_mdc_num_strays, num);
+}
+
+// Account for one more stray: bump the gauge and the creation counter.
+void StrayManager::notify_stray_created()
+{
+  ++num_strays;
+  logger->set(l_mdc_num_strays, num_strays);
+  logger->inc(l_mdc_strays_created);
+}
+
+// Account for one stray fewer and publish the new gauge value.
+void StrayManager::notify_stray_removed()
+{
+  --num_strays;
+  logger->set(l_mdc_num_strays, num_strays);
+}
+
+// Callback that re-runs StrayManager::eval_stray() on a dentry.
+struct C_EvalStray : public StrayManagerContext {
+  CDentry *dn;
+
+  C_EvalStray(StrayManager *sm_, CDentry *d)
+    : StrayManagerContext(sm_), dn(d)
+  {}
+
+  void finish(int r) override {
+    sm->eval_stray(dn);
+  }
+};
+
+// Callback that re-runs StrayManager::eval_stray() on a dentry; same
+// behaviour as C_EvalStray.
+struct C_MDC_EvalStray : public StrayManagerContext {
+  CDentry *dn;
+
+  C_MDC_EvalStray(StrayManager *sm_, CDentry *d)
+    : StrayManagerContext(sm_), dn(d)
+  {}
+
+  void finish(int r) override {
+    sm->eval_stray(dn);
+  }
+};
+
+/**
+ * Decide what to do with a stray dentry.
+ *
+ * With nlink == 0 the inode is a purge candidate: it is enqueued for
+ * purge (or for head truncation when it still has snapshot data) unless
+ * something still holds on to it (replicas, leases, caps, pending
+ * recovery, extra references, past-parent snaps on a directory).
+ * With nlink > 0 the remaining links are evaluated through
+ * _eval_stray_remote().
+ *
+ * @return true if the nlink == 0 path ran to completion, false otherwise
+ */
+bool StrayManager::_eval_stray(CDentry *dn)
+{
+  dout(10) << "eval_stray " << *dn << dendl;
+  CDentry::linkage_t *dnl = dn->get_projected_linkage();
+  ceph_assert(dnl->is_primary());
+  CInode *in = dnl->get_inode();
+  // validate the inode pointer before the log line below dereferences it
+  ceph_assert(in);
+  dout(10) << " inode is " << *in << dendl;
+  ceph_assert(!in->state_test(CInode::STATE_REJOINUNDEF));
+
+  // The only dentries eligible for purging are those
+  // in the stray directories
+  ceph_assert(dn->get_dir()->get_inode()->is_stray());
+
+  // Inode may not pass through this function if it
+  // was already identified for purging (i.e. cannot
+  // call eval_stray() after purge())
+  ceph_assert(!dn->state_test(CDentry::STATE_PURGING));
+
+  if (!dn->is_auth())
+    return false;
+
+  if (!started)
+    return false;
+
+  // we are evaluating it now, so it no longer needs to sit on the
+  // delayed list
+  if (dn->item_stray.is_on_list()) {
+    dn->item_stray.remove_myself();
+    num_strays_delayed--;
+    logger->set(l_mdc_num_strays_delayed, num_strays_delayed);
+  }
+
+  // purge?
+  if (in->get_inode()->nlink == 0) {
+    // past snaprealm parents imply snapped dentry remote links.
+    // only important for directories.  normal file data snaps are handled
+    // by the object store.
+    if (in->snaprealm) {
+      in->snaprealm->prune_past_parent_snaps();
+      in->purge_stale_snap_data(in->snaprealm->get_snaps());
+    }
+    if (in->is_dir()) {
+      if (in->snaprealm && in->snaprealm->has_past_parent_snaps()) {
+        // NOTE(review): this streams the snaprealm pointer, not the realm
+        // itself -- confirm that is intended.
+        dout(20) << "  directory has past parents "
+                 << in->snaprealm << dendl;
+        if (in->state_test(CInode::STATE_MISSINGOBJS)) {
+          mds->clog->error() << "previous attempt at committing dirfrag of ino "
+                             << in->ino() << " has failed, missing object";
+          mds->handle_write_error(-CEPHFS_ENOENT);
+        }
+        return false;  // not until some snaps are deleted.
+      }
+
+      mds->mdcache->clear_dirty_bits_for_stray(in);
+
+      if (!in->remote_parents.empty()) {
+        // unlink any stale remote snap dentry.
+        for (auto it = in->remote_parents.begin(); it != in->remote_parents.end(); ) {
+          CDentry *remote_dn = *it;
+          // advance before unlinking the current entry
+          ++it;
+          ceph_assert(remote_dn->last != CEPH_NOSNAP);
+          remote_dn->unlink_remote(remote_dn->get_linkage());
+        }
+      }
+    }
+    if (dn->is_replicated()) {
+      dout(20) << " replicated" << dendl;
+      return false;
+    }
+    if (dn->is_any_leases() || in->is_any_caps()) {
+      dout(20) << " caps | leases" << dendl;
+      return false;  // wait
+    }
+    if (in->state_test(CInode::STATE_NEEDSRECOVER) ||
+        in->state_test(CInode::STATE_RECOVERING)) {
+      dout(20) << " pending recovery" << dendl;
+      return false;  // don't mess with file size probing
+    }
+    // the only inode refs tolerated are the dirty / dirty-parent pins
+    if (in->get_num_ref() > (int)in->is_dirty() + (int)in->is_dirty_parent()) {
+      dout(20) << " too many inode refs" << dendl;
+      return false;
+    }
+    if (dn->get_num_ref() > (int)dn->is_dirty() + !!in->get_num_ref()) {
+      dout(20) << " too many dn refs" << dendl;
+      return false;
+    }
+    // don't purge multiversion inode with snap data
+    if (in->snaprealm && in->snaprealm->has_past_parent_snaps() &&
+        in->is_any_old_inodes()) {
+      // A file with snapshots: we will truncate the HEAD revision
+      // but leave the metadata intact.
+      ceph_assert(!in->is_dir());
+      dout(20) << " file has past parents "
+               << in->snaprealm << dendl;
+      if (in->is_file() && in->get_projected_inode()->size > 0) {
+        enqueue(dn, true); // truncate head objects
+      }
+    } else {
+      // A straightforward file, ready to be purged.  Enqueue it.
+      if (in->is_dir()) {
+        in->close_dirfrags();
+      }
+
+      enqueue(dn, false);
+    }
+
+    return true;
+  } else {
+    /*
+     * Where a stray has some links, they should be remotes, check
+     * if we can do anything with them if we happen to have them in
+     * cache.
+     */
+    _eval_stray_remote(dn, NULL);
+    return false;
+  }
+}
+
+// Flip the manager into the started state and activate the purge queue.
+// Several other methods are no-ops or assert until this has run.
+void StrayManager::activate()
+{
+  dout(10) << __func__ << dendl;
+
+  started = true;
+  purge_queue.activate();
+}
+
+/**
+ * Re-entrancy guard around _eval_stray(): STATE_EVALUATINGSTRAY is set
+ * for the duration of the call, and a nested call on the same dentry
+ * returns false straight away.
+ *
+ * @return whatever _eval_stray() returned, or false for a nested call
+ */
+bool StrayManager::eval_stray(CDentry *dn)
+{
+  const bool nested = dn->state_test(CDentry::STATE_EVALUATINGSTRAY);
+  if (nested)
+    return false;
+
+  dn->state_set(CDentry::STATE_EVALUATINGSTRAY);
+  const bool result = _eval_stray(dn);
+  dn->state_clear(CDentry::STATE_EVALUATINGSTRAY);
+  return result;
+}
+
+/**
+ * Given a remote (hard-link) dentry, check whether the inode it refers
+ * to currently has its primary dentry in a stray directory, and if so
+ * evaluate it via _eval_stray_remote().
+ *
+ * Nothing happens when the inode is not available through the linkage
+ * or when @p remote_dn is a snapshot dentry.
+ */
+void StrayManager::eval_remote(CDentry *remote_dn)
+{
+  dout(10) << __func__ << " " << *remote_dn << dendl;
+
+  CDentry::linkage_t *dnl = remote_dn->get_projected_linkage();
+  ceph_assert(dnl->is_remote());
+
+  CInode *in = dnl->get_inode();
+  if (!in) {
+    dout(20) << __func__ << ": no inode, cannot evaluate" << dendl;
+    return;
+  }
+
+  if (remote_dn->last != CEPH_NOSNAP) {
+    dout(20) << __func__ << ": snap dentry, cannot evaluate" << dendl;
+    return;
+  }
+
+  // refers to stray?
+  CDentry *primary_dn = in->get_projected_parent_dn();
+  ceph_assert(primary_dn != NULL);
+  const bool primary_is_stray =
+    primary_dn->get_dir()->get_inode()->is_stray();
+  if (!primary_is_stray) {
+    dout(20) << __func__ << ": inode's primary dn not stray" << dendl;
+    return;
+  }
+  _eval_stray_remote(primary_dn, remote_dn);
+}
+
+/**
+ * Waiter that re-runs StrayManager::eval_remote() on a dentry.  The
+ * dentry is pinned with PIN_PTRWAITER from construction until finish().
+ */
+class C_RetryEvalRemote : public StrayManagerContext {
+  CDentry *dn;
+public:
+  C_RetryEvalRemote(StrayManager *sm_, CDentry *dn_)
+    : StrayManagerContext(sm_), dn(dn_)
+  {
+    dn->get(CDentry::PIN_PTRWAITER);
+  }
+
+  void finish(int r) override {
+    // the linkage may have changed while we waited
+    const bool still_remote = dn->get_projected_linkage()->is_remote();
+    if (still_remote)
+      sm->eval_remote(dn);
+    dn->put(CDentry::PIN_PTRWAITER);
+  }
+};
+
+/**
+ * Try to get a still-linked stray inode out of the stray directory.
+ *
+ * @param stray_dn   primary dentry of the inode, inside a stray dir
+ * @param remote_dn  remote dentry to reintegrate into, or NULL to pick
+ *                   one from the inode's remote_parents
+ *
+ * If the chosen remote dentry is not projected:
+ *  - we are auth for it and its dir can be auth-pinned: reintegrate;
+ *  - we are auth for it but its dir cannot be auth-pinned: retry after
+ *    the dir unfreezes;
+ *  - we are not auth for it but are auth for the stray: migrate the
+ *    stray to the remote dentry's auth MDS.
+ * In every other case nothing is done.
+ */
+void StrayManager::_eval_stray_remote(CDentry *stray_dn, CDentry *remote_dn)
+{
+  // check the pointer before the log line below dereferences it
+  ceph_assert(stray_dn != NULL);
+  dout(20) << __func__ << " " << *stray_dn << dendl;
+  ceph_assert(stray_dn->get_dir()->get_inode()->is_stray());
+  CDentry::linkage_t *stray_dnl = stray_dn->get_projected_linkage();
+  ceph_assert(stray_dnl->is_primary());
+  CInode *stray_in = stray_dnl->get_inode();
+  ceph_assert(stray_in->get_inode()->nlink >= 1);
+  ceph_assert(stray_in->last == CEPH_NOSNAP);
+
+  /* If no remote_dn hinted, pick one arbitrarily */
+  if (remote_dn == NULL) {
+    if (!stray_in->remote_parents.empty()) {
+      // Preference order: an auth dentry whose dir can be auth-pinned
+      // (stop there), else the last auth dentry seen, else the first
+      // non-auth one.
+      for (const auto &dn : stray_in->remote_parents) {
+        if (dn->last == CEPH_NOSNAP && !dn->is_projected()) {
+          if (dn->is_auth()) {
+            remote_dn = dn;
+            if (remote_dn->dir->can_auth_pin())
+              break;
+          } else if (!remote_dn) {
+            remote_dn = dn;
+          }
+        }
+      }
+    }
+    if (!remote_dn) {
+      dout(20) << __func__ << ": not reintegrating (no remote parents in cache)" << dendl;
+      return;
+    }
+  }
+  ceph_assert(remote_dn->last == CEPH_NOSNAP);
+  // NOTE: we repeat this check in _rename(), since our submission path is racey.
+  if (!remote_dn->is_projected()) {
+    if (remote_dn->is_auth()) {
+      if (remote_dn->dir->can_auth_pin()) {
+        reintegrate_stray(stray_dn, remote_dn);
+      } else {
+        remote_dn->dir->add_waiter(CDir::WAIT_UNFREEZE, new C_RetryEvalRemote(this, remote_dn));
+        dout(20) << __func__ << ": not reintegrating (can't authpin remote parent)" << dendl;
+      }
+
+    } else if (stray_dn->is_auth()) {
+      // remote_dn is known to be non-auth on this branch
+      migrate_stray(stray_dn, remote_dn->authority().first);
+    } else {
+      dout(20) << __func__ << ": not reintegrating" << dendl;
+    }
+  } else {
+    // don't do anything if the remote parent is projected, or we may
+    // break user-visible semantics!
+    dout(20) << __func__ << ": not reintegrating (projected)" << dendl;
+  }
+}
+
+/**
+ * Send a rename request (stray dentry -> remote dentry) to the MDS that
+ * is auth for @p rdn.
+ */
+void StrayManager::reintegrate_stray(CDentry *straydn, CDentry *rdn)
+{
+  dout(10) << __func__ << " " << *straydn << " to " << *rdn << dendl;
+
+  logger->inc(l_mdc_strays_reintegrated);
+
+  // rename it to remote linkage .
+  const filepath from(straydn->get_name(), straydn->get_dir()->ino());
+  const filepath to(rdn->get_name(), rdn->get_dir()->ino());
+
+  auto rename_req = make_message<MClientRequest>(CEPH_MDS_OP_RENAME);
+  rename_req->set_filepath(to);     // destination
+  rename_req->set_filepath2(from);  // source
+  rename_req->set_tid(mds->issue_tid());
+
+  mds->send_message_mds(rename_req, rdn->authority().first);
+}
+ 
+/**
+ * Send a rename request to rank @p to that moves @p dn, under the same
+ * name, into that rank's stray directory with the same stray index.
+ */
+void StrayManager::migrate_stray(CDentry *dn, mds_rank_t to)
+{
+  dout(10) << __func__ << " " << *dn << " to mds." << to << dendl;
+
+  logger->inc(l_mdc_strays_migrated);
+
+  // rename it to another mds.
+  const inodeno_t dirino = dn->get_dir()->ino();
+  ceph_assert(MDS_INO_IS_STRAY(dirino));
+
+  const filepath from(dn->get_name(), dirino);
+  const filepath dest(dn->get_name(),
+                      MDS_INO_STRAY(to, MDS_INO_STRAY_INDEX(dirino)));
+
+  auto rename_req = make_message<MClientRequest>(CEPH_MDS_OP_RENAME);
+  rename_req->set_filepath(dest);   // destination
+  rename_req->set_filepath2(from);  // source
+  rename_req->set_tid(mds->issue_tid());
+
+  mds->send_message_mds(rename_req, to);
+}
+
+// The delayed-eval list is threaded through CDentry::item_stray.
+// @p mds must not be null.
+StrayManager::StrayManager(MDSRank *mds, PurgeQueue &purge_queue_)
+  : delayed_eval_stray(member_offset(CDentry, item_stray)),
+    mds(mds),
+    purge_queue(purge_queue_)
+{
+  ceph_assert(mds != NULL);
+}
+
+/**
+ * Push a TRUNCATE_FILE item for the inode behind @p dn onto the purge
+ * queue.  _purge_stray_purged() runs (with only_head = true) when the
+ * push completes.  The inode must have a snaprealm and a non-zero
+ * truncation extent.
+ */
+void StrayManager::truncate(CDentry *dn)
+{
+  const CDentry::linkage_t *dnl = dn->get_projected_linkage();
+  const CInode *in = dnl->get_inode();
+  ceph_assert(in);
+  dout(10) << __func__ << ": " << *dn << " " << *in << dendl;
+  ceph_assert(!dn->is_replicated());
+
+  const SnapRealm *realm = in->find_snaprealm();
+  ceph_assert(realm);
+  dout(10) << " realm " << *realm << dendl;
+
+  const auto& inode = in->get_inode();
+  uint64_t to = std::max(inode->size, inode->get_max_size());
+  // when truncating a file, the filer does not delete stripe objects that are
+  // truncated to zero. so we need to purge stripe objects up to the max size
+  // the file has ever been.
+  to = std::max(inode->max_size_ever, to);
+
+  ceph_assert(to > 0);
+
+  PurgeItem item;
+  item.action = PurgeItem::TRUNCATE_FILE;
+  item.ino = in->ino();
+  item.layout = inode->layout;
+  item.snapc = realm->get_snap_context();
+  item.size = to;
+  item.stamp = ceph_clock_now();
+
+  purge_queue.push(item, new C_IO_PurgeStrayPurged(this, dn, true));
+}
+
+/**
+ * Runs after the "purge_stray truncate" EUpdate has been journaled:
+ * applies the mutation, drops the purging state and pin, then
+ * re-evaluates the dentry.
+ */
+void StrayManager::_truncate_stray_logged(CDentry *dn, MutationRef& mut)
+{
+  CInode *in = dn->get_projected_linkage()->get_inode();
+
+  dout(10) << __func__ << ": " << *dn << " " << *in << dendl;
+
+  mut->apply();
+
+  // undo what enqueue() set up
+  in->state_clear(CInode::STATE_PURGING);
+  dn->state_clear(CDentry::STATE_PURGING | CDentry::STATE_PURGINGPINNED);
+  dn->put(CDentry::PIN_PURGING);
+
+  // releases the auth_pin taken in _enqueue()
+  dn->get_dir()->auth_unpin(this);
+
+  // the purging state must be cleared before this call: _eval_stray()
+  // asserts that STATE_PURGING is not set
+  eval_stray(dn);
+
+  // STATE_PURGING set again here means eval_stray() re-enqueued the dentry
+  if (!dn->state_test(CDentry::STATE_PURGING) &&  mds->is_stopping())
+    mds->mdcache->shutdown_export_stray_finish(in->ino());
+}
+
diff --git a/src/mds/StrayManager.h b/src/mds/StrayManager.h
new file mode 100644
index 000000000..86b6941a5
--- /dev/null
+++ b/src/mds/StrayManager.h
@@ -0,0 +1,198 @@
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2015 Red Hat
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software 
+ * Foundation.  See file COPYING.
+ * 
+ */
+
+#ifndef STRAY_MANAGER_H
+#define STRAY_MANAGER_H
+
+#include "include/common_fwd.h"
+#include "include/elist.h"
+#include <list>
+#include "Mutation.h"
+#include "PurgeQueue.h"
+
+class MDSRank;
+class CInode;
+class CDentry;
+
+/**
+ * Tracks the stray dentries of one MDS rank and decides, per dentry,
+ * whether to purge it, truncate it, reintegrate it into a remaining
+ * hard link, or migrate it to another rank.
+ */
+class StrayManager
+{
+  // My public interface is for consumption by MDCache
+public:
+  explicit StrayManager(MDSRank *mds, PurgeQueue &purge_queue_);
+  void set_logger(PerfCounters *l) {logger = l;}
+  void activate();
+
+  /**
+   * Evaluate a stray dentry (see _eval_stray()); nested calls on a
+   * dentry that is already being evaluated return false immediately.
+   */
+  bool eval_stray(CDentry *dn);
+
+  void set_num_strays(uint64_t num);
+  uint64_t get_num_strays() const { return num_strays; }
+
+  /**
+   * Queue dentry for later evaluation. (evaluate it while not in the
+   * middle of another metadata operation)
+   */
+  void queue_delayed(CDentry *dn);
+
+  /**
+   * Eval strays in the delayed_eval_stray list
+   */
+  void advance_delayed();
+
+  /**
+   * Remote dentry potentially points to a stray. When it is touched,
+   * call in here to evaluate it for migration (move the stray to the
+   * MDS that is auth for the remote dentry) or reintegration (move a
+   * stray dentry's inode into a non-stray hardlink dentry and clean up
+   * the stray).
+   *
+   * @param remote_dn the remote dentry that was touched. If its inode's
+   *                  primary dentry is in a stray directory, this dentry
+   *                  is used as the one to reintegrate into.
+   */
+  void eval_remote(CDentry *remote_dn);
+
+  /**
+   * Given a dentry within one of my stray directories,
+   * send it off to a stray directory in another MDS.
+   *
+   * This is for use:
+   *  * Case A: when shutting down a rank, we migrate strays
+   *    away from ourselves rather than waiting for purge
+   *  * Case B: when a client request has a trace that refers to
+   *    a stray inode on another MDS, we migrate that inode from
+   *    there to here, in order that we can later re-integrate it
+   *    here.
+   *
+   * In case B, the receiver should be calling into eval_stray
+   * on completion of mv (i.e. inode put), resulting in a subsequent
+   * reintegration.
+   */
+  void migrate_stray(CDentry *dn, mds_rank_t dest);
+
+  /**
+   * Update stats to reflect a newly created stray dentry. Needed
+   * because stats on strays live here, but creation happens
+   * in Server or MDCache. For our purposes "creation" includes
+   * loading a stray from a dirfrag and migrating a stray from
+   * another MDS, in addition to creations per-se.
+   */
+  void notify_stray_created();
+
+  /**
+   * Update stats to reflect a removed stray dentry. Needed because
+   * stats on strays live here, but removal happens in Server or
+   * MDCache. Also includes migration (rename) of strays from
+   * this MDS to another MDS.
+   */
+  void notify_stray_removed();
+
+protected:
+  friend class StrayManagerIOContext;
+  friend class StrayManagerLogContext;
+  friend class StrayManagerContext;
+
+  friend class C_StraysFetched;
+  friend class C_RetryEnqueue;
+  friend class C_PurgeStrayLogged;
+  friend class C_TruncateStrayLogged;
+  friend class C_IO_PurgeStrayPurged;
+
+  /**
+   * Push a truncate of the inode's data onto the PurgeQueue, keeping
+   * the inode and dentry themselves.
+   */
+  void truncate(CDentry *dn);
+
+  /**
+   * Purge a dentry from a stray directory. This function
+   * is called once eval_stray is satisfied and the dentry's
+   * directory has been auth-pinned. There is no going back
+   * at this stage!
+   */
+  void purge(CDentry *dn);
+
+  /**
+   * Completion handler for the PurgeQueue push issued by purge()
+   * (only_head = false) or truncate() (only_head = true).
+   */
+  void _purge_stray_purged(CDentry *dn, bool only_head);
+
+  /**
+   * Callback: the journal entry that nulls the stray dentry has been
+   * logged.
+   */
+  void _purge_stray_logged(CDentry *dn, version_t pdv, MutationRef& mut);
+
+  /**
+   * Callback: we have logged the update to an inode's metadata
+   * reflecting its newly-zeroed length.
+   */
+  void _truncate_stray_logged(CDentry *dn, MutationRef &mut);
+  /**
+   * Call this on a dentry that has been identified as
+   * eligible for purging. It will be passed on to PurgeQueue.
+   */
+  void enqueue(CDentry *dn, bool trunc);
+  /**
+   * Final part of enqueue() which we may have to retry
+   * once the dentry's directory can be auth-pinned.
+   */
+  void _enqueue(CDentry *dn, bool trunc);
+
+  /**
+   * When hard links exist to an inode whose primary dentry
+   * is unlinked, the inode gets a stray primary dentry.
+   *
+   * We may later "reintegrate" the inode into a remaining
+   * non-stray dentry (one of what was previously a remote
+   * dentry) by issuing a rename from the stray to the other
+   * dentry.
+   */
+  void reintegrate_stray(CDentry *dn, CDentry *rlink);
+
+  /**
+   * Evaluate a stray dentry for purging or reintegration.
+   *
+   * purging: If the inode has no linkage, and no more references, then
+   *          we may decide to purge it.
+   *
+   * reintegration: If the inode still has linkage, then it means someone else
+   *                (a hard link) is still referring to it, and we should
+   *                think about reintegrating that inode into the remote dentry.
+   *
+   * @returns true if the dentry will be purged (caller should never
+   *          take more refs after this happens), else false.
+   */
+  bool _eval_stray(CDentry *dn);
+
+  /**
+   * Reintegrate or migrate a stray that still has links.
+   *
+   * @param stray_dn a stray dentry whose inode has been referenced
+   *                 by a remote dentry
+   * @param remote_dn (optional, may be NULL) which remote dentry was
+   *                  touched in an operation that led us here: this is
+   *                  used as a hint for which remote to reintegrate into
+   *                  if there are multiple remotes.
+   */
+  void _eval_stray_remote(CDentry *stray_dn, CDentry *remote_dn);
+
+  // Has passed through eval_stray and still has refs
+  elist<CDentry*> delayed_eval_stray;
+
+  // strays that have been trimmed from cache
+  std::set<std::string> trimmed_strays;
+
+  // Global references for doing I/O
+  MDSRank *mds;
+  PerfCounters *logger = nullptr;
+
+  // set by activate()
+  bool started = false;
+
+  // Stray dentries for this rank (including those not in cache)
+  uint64_t num_strays = 0;
+
+  // Stray dentries currently on the delayed_eval_stray list
+  uint64_t num_strays_delayed = 0;
+  /**
+   * Entries that have entered enqueue() but not been persistently
+   * recorded by PurgeQueue yet
+   */
+  uint64_t num_strays_enqueuing = 0;
+
+  PurgeQueue &purge_queue;
+};
+#endif  // STRAY_MANAGER_H
diff --git a/src/mds/balancers/greedyspill.lua b/src/mds/balancers/greedyspill.lua
new file mode 100644
index 000000000..20576cdb8
--- /dev/null
+++ b/src/mds/balancers/greedyspill.lua
@@ -0,0 +1,49 @@
+local metrics = {"auth.meta_load", "all.meta_load", "req_rate", "queue_len", "cpu_load_avg"}
+
+-- Set each rank's "load" to its all.meta_load and log every listed metric for it
+local function mds_load()
+  for rank, entry in pairs(mds) do
+    local line = "MDS"..rank..": < "
+    for _, name in ipairs(metrics) do
+      line = line..name.."="..entry[name].." "
+    end
+    entry.load = entry["all.meta_load"]
+    BAL_LOG(5, line.."> load="..entry.load)
+  end
+end
+
+-- Decide whether to migrate: true only when this rank has load and rank whoami+1 has (almost) none
+local function when()
+  if not mds[whoami+1] then
+    -- i'm the last rank
+    BAL_LOG(5, "when: not migrating! I am the last rank, nothing to spill to.");
+    return false
+  end
+  local my_load = mds[whoami]["load"]    -- local: do not leak into the global environment
+  local his_load = mds[whoami+1]["load"]
+  if my_load > 0.01 and his_load < 0.01 then
+    BAL_LOG(5, "when: migrating! my_load="..my_load.." hisload="..his_load)
+    return true
+  end
+  BAL_LOG(5, "when: not migrating! my_load="..my_load.." hisload="..his_load)
+  return false
+end
+
+-- Shed half your load to your neighbor
+-- the neighbor is rank whoami+1, the same entry when() inspects before migrating
+local function where(targets)
+  targets[whoami+1] = mds[whoami]["load"]/2
+  return targets
+end
+
+local targets = {}          -- one entry per rank; this table is the script's return value
+for rank in pairs(mds) do
+  targets[rank] = 0         -- default: nothing assigned to this rank
+end
+
+mds_load()                  -- must run first: when()/where() read the "load" field it sets
+if when() then
+  where(targets)            -- fills in the neighbor's entry in place
+end
+
+return targets
diff --git a/src/mds/cephfs_features.cc b/src/mds/cephfs_features.cc
new file mode 100644
index 000000000..3c7949c5e
--- /dev/null
+++ b/src/mds/cephfs_features.cc
@@ -0,0 +1,77 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include <array>
+#include "cephfs_features.h"
+#include "mdstypes.h"
+
+static const std::array feature_names  // printable names, indexed by feature bit number
+{
+  "reserved",  // bits 0-4: placeholders, never returned by cephfs_feature_from_name()
+  "reserved",
+  "reserved",
+  "reserved",
+  "reserved",
+  "jewel",
+  "kraken",
+  "luminous",
+  "mimic",
+  "reply_encoding",
+  "reclaim_client",
+  "lazy_caps_wanted",
+  "multi_reconnect",
+  "deleg_ino",
+  "metric_collect",
+  "alternate_name",
+  "notify_session_state",
+  "op_getvxattr",
+};
+static_assert(feature_names.size() == CEPHFS_FEATURE_MAX + 1);  // one name per bit 0..CEPHFS_FEATURE_MAX
+
+std::string_view cephfs_feature_name(size_t id)  // name of feature bit `id`, or "unknown" when out of range
+{
+  if (id >= feature_names.size())  // id == size() is already one past the last entry
+    return "unknown"sv;
+  return feature_names[id];
+}
+
+int cephfs_feature_from_name(std::string_view name)
+{
+  // Map a feature name to its bit number; -1 when the name is not known.
+  // "reserved" labels several placeholder bits, so it is never matched.
+  const size_t limit = (name == "reserved"sv) ? 0 : feature_names.size();
+  for (size_t bit = 0; bit != limit; ++bit) {
+    if (name == feature_names[bit])
+      return bit;
+  }
+  return -1;
+}
+
+std::string cephfs_stringify_features(const feature_bitset_t& features)
+{
+  // Render the set bits as "{<bit>=<name>,<bit>=<name>,...}".
+  // Only bits that have an entry in feature_names are considered.
+  CachedStackStringStream css;
+  const char *sep = "";
+  *css << "{";
+  for (size_t bit = 0; bit < feature_names.size(); ++bit) {
+    if (features.test(bit)) {
+      *css << sep << bit << "=" << cephfs_feature_name(bit);
+      sep = ",";
+    }
+  }
+  *css << "}";
+  return css->str();
+}
+
+void cephfs_dump_features(ceph::Formatter *f, const feature_bitset_t& features)
+{
+  for (size_t i = 0; i < feature_names.size(); ++i) {  // one "feature_<bit>" entry per set bit
+    if (!features.test(i))
+      continue;
+    char s[18];  // "feature_" + bit number; i stays below feature_names.size()
+    snprintf(s, sizeof(s), "feature_%zu", i);  // %zu is the conversion for size_t
+    f->dump_string(s, cephfs_feature_name(i));
+  }
+}
+
diff --git a/src/mds/cephfs_features.h b/src/mds/cephfs_features.h
new file mode 100644
index 000000000..d1fc94266
--- /dev/null
+++ b/src/mds/cephfs_features.h
@@ -0,0 +1,99 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+
+#ifndef CEPHFS_FEATURES_H
+#define CEPHFS_FEATURES_H
+
+#include "include/cephfs/metrics/Types.h"
+
+class feature_bitset_t;
+namespace ceph {
+  class Formatter;
+}
+
+// When adding a new release, please update the "current" release below, add a
+// feature bit for that release, add that feature bit to CEPHFS_FEATURES_ALL,
+// and update Server::update_required_client_features(). This feature bit
+// is used to indicate that operator only wants clients from that release or
+// later to mount CephFS.
+#define CEPHFS_CURRENT_RELEASE  CEPH_RELEASE_PACIFIC
+
+// The first 5 bits are reserved for old ceph releases.
+#define CEPHFS_FEATURE_JEWEL                5
+#define CEPHFS_FEATURE_KRAKEN               6
+#define CEPHFS_FEATURE_LUMINOUS             7
+#define CEPHFS_FEATURE_MIMIC                8
+#define CEPHFS_FEATURE_REPLY_ENCODING       9
+#define CEPHFS_FEATURE_RECLAIM_CLIENT       10
+#define CEPHFS_FEATURE_LAZY_CAP_WANTED      11
+#define CEPHFS_FEATURE_MULTI_RECONNECT      12
+#define CEPHFS_FEATURE_NAUTILUS             12  // release alias: same bit as MULTI_RECONNECT
+#define CEPHFS_FEATURE_DELEG_INO            13
+#define CEPHFS_FEATURE_OCTOPUS              13  // release alias: same bit as DELEG_INO
+#define CEPHFS_FEATURE_METRIC_COLLECT       14
+#define CEPHFS_FEATURE_ALTERNATE_NAME       15
+#define CEPHFS_FEATURE_NOTIFY_SESSION_STATE 16
+#define CEPHFS_FEATURE_OP_GETVXATTR         17
+#define CEPHFS_FEATURE_MAX                  17  // highest bit above; cephfs_features.cc asserts feature_names has MAX+1 entries
+
+#define CEPHFS_FEATURES_ALL {		\
+  0, 1, 2, 3, 4,			\
+  CEPHFS_FEATURE_JEWEL,			\
+  CEPHFS_FEATURE_KRAKEN,		\
+  CEPHFS_FEATURE_LUMINOUS,		\
+  CEPHFS_FEATURE_MIMIC,			\
+  CEPHFS_FEATURE_REPLY_ENCODING,        \
+  CEPHFS_FEATURE_RECLAIM_CLIENT,	\
+  CEPHFS_FEATURE_LAZY_CAP_WANTED,	\
+  CEPHFS_FEATURE_MULTI_RECONNECT,	\
+  CEPHFS_FEATURE_NAUTILUS,              \
+  CEPHFS_FEATURE_DELEG_INO,             \
+  CEPHFS_FEATURE_OCTOPUS,               \
+  CEPHFS_FEATURE_METRIC_COLLECT,        \
+  CEPHFS_FEATURE_ALTERNATE_NAME,        \
+  CEPHFS_FEATURE_NOTIFY_SESSION_STATE,  \
+  CEPHFS_FEATURE_OP_GETVXATTR,          \
+}
+
+#define CEPHFS_METRIC_FEATURES_ALL {		\
+    CLIENT_METRIC_TYPE_CAP_INFO,		\
+    CLIENT_METRIC_TYPE_READ_LATENCY,		\
+    CLIENT_METRIC_TYPE_WRITE_LATENCY,		\
+    CLIENT_METRIC_TYPE_METADATA_LATENCY,	\
+    CLIENT_METRIC_TYPE_DENTRY_LEASE,		\
+    CLIENT_METRIC_TYPE_OPENED_FILES,		\
+    CLIENT_METRIC_TYPE_PINNED_ICAPS,		\
+    CLIENT_METRIC_TYPE_OPENED_INODES,		\
+    CLIENT_METRIC_TYPE_READ_IO_SIZES,		\
+    CLIENT_METRIC_TYPE_WRITE_IO_SIZES,		\
+    CLIENT_METRIC_TYPE_AVG_READ_LATENCY,	\
+    CLIENT_METRIC_TYPE_STDEV_READ_LATENCY,	\
+    CLIENT_METRIC_TYPE_AVG_WRITE_LATENCY,	\
+    CLIENT_METRIC_TYPE_STDEV_WRITE_LATENCY,	\
+    CLIENT_METRIC_TYPE_AVG_METADATA_LATENCY,	\
+    CLIENT_METRIC_TYPE_STDEV_METADATA_LATENCY,	\
+}
+
+#define CEPHFS_FEATURES_MDS_SUPPORTED CEPHFS_FEATURES_ALL
+#define CEPHFS_FEATURES_MDS_REQUIRED {}
+
+#define CEPHFS_FEATURES_CLIENT_SUPPORTED CEPHFS_FEATURES_ALL
+#define CEPHFS_FEATURES_CLIENT_REQUIRED {}
+
+extern std::string_view cephfs_feature_name(size_t id);
+extern int cephfs_feature_from_name(std::string_view name);
+std::string cephfs_stringify_features(const feature_bitset_t& features);
+void cephfs_dump_features(ceph::Formatter *f, const feature_bitset_t& features);
+
+#endif
diff --git a/src/mds/events/ECommitted.h b/src/mds/events/ECommitted.h
new file mode 100644
index 000000000..41b120813
--- /dev/null
+++ b/src/mds/events/ECommitted.h
@@ -0,0 +1,43 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software 
+ * Foundation.  See file COPYING.
+ * 
+ */
+
+#ifndef CEPH_MDS_ECOMMITTED_H
+#define CEPH_MDS_ECOMMITTED_H
+
+#include "../LogEvent.h"
+#include "EMetaBlob.h"
+
+class ECommitted : public LogEvent {  // LogEvent of type EVENT_COMMITTED carrying a single metareqid_t
+public:
+  metareqid_t reqid;  // request id this event refers to; written out by print()
+
+  ECommitted() : LogEvent(EVENT_COMMITTED) { }  // reqid is default-constructed
+  explicit ECommitted(metareqid_t r) :
+    LogEvent(EVENT_COMMITTED), reqid(r) { }
+
+  void print(ostream& out) const override {  // emits "ECommitted <reqid>"
+    out << "ECommitted " << reqid;
+  }
+
+  void encode(bufferlist &bl, uint64_t features) const override;
+  void decode(bufferlist::const_iterator &bl) override;
+  void dump(Formatter *f) const override;
+  static void generate_test_instances(std::list<ECommitted*>& ls);
+
+  void update_segment() override {}  // no-op override for this event type
+  void replay(MDSRank *mds) override;
+};
+WRITE_CLASS_ENCODER_FEATURES(ECommitted)
+
+#endif
diff --git a/src/mds/events/EExport.h b/src/mds/events/EExport.h
new file mode 100644
index 000000000..589062db2
--- /dev/null
+++ b/src/mds/events/EExport.h
@@ -0,0 +1,58 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software 
+ * Foundation.  See file COPYING.
+ * 
+ */
+
+#ifndef CEPH_EEXPORT_H
+#define CEPH_EEXPORT_H
+
+#include "common/config.h"
+#include "include/types.h"
+
+#include "../MDSRank.h"
+
+#include "EMetaBlob.h"
+#include "../LogEvent.h"
+
+class EExport : public LogEvent {  // LogEvent of type EVENT_EXPORT: dirfrag `base` exported to rank `target`
+public:
+  EMetaBlob metablob; // exported dir
+protected:
+  dirfrag_t      base;    // dirfrag() of the CDir passed to the constructor
+  set<dirfrag_t> bounds;  // left empty by both constructors; mutable access via get_bounds()
+  mds_rank_t target;      // MDS_RANK_NONE when default-constructed
+  
+public:
+  EExport() :
+    LogEvent(EVENT_EXPORT), target(MDS_RANK_NONE) { }
+  EExport(MDLog *mdlog, CDir *dir, mds_rank_t t) :  // NOTE(review): mdlog is not used in this constructor
+    LogEvent(EVENT_EXPORT),
+    base(dir->dirfrag()), target(t) { }
+  
+  set<dirfrag_t> &get_bounds() { return bounds; }
+  
+  void print(ostream& out) const override {  // emits "EExport <base> to mds.<target> <metablob>"
+    out << "EExport " << base << " to mds." << target << " " << metablob;
+  }
+
+  EMetaBlob *get_metablob() override { return &metablob; }
+
+  void encode(bufferlist& bl, uint64_t features) const override;
+  void decode(bufferlist::const_iterator &bl) override;
+  void dump(Formatter *f) const override;
+  static void generate_test_instances(std::list<EExport*>& ls);
+  void replay(MDSRank *mds) override;
+
+};
+WRITE_CLASS_ENCODER_FEATURES(EExport)
+
+#endif
diff --git a/src/mds/events/EFragment.h b/src/mds/events/EFragment.h
new file mode 100644
index 000000000..1cbbf7a8a
--- /dev/null
+++ b/src/mds/events/EFragment.h
@@ -0,0 +1,81 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software 
+ * Foundation.  See file COPYING.
+ * 
+ */
+
+#ifndef CEPH_MDS_EFRAGMENT_H
+#define CEPH_MDS_EFRAGMENT_H
+
+#include "../LogEvent.h"
+#include "EMetaBlob.h"
+
+struct dirfrag_rollback {  // encodable holder for one fnode; EFragment::add_orig_frag() appends its encoding to `rollback`
+  CDir::fnode_const_ptr fnode;
+  dirfrag_rollback() { }
+  void encode(bufferlist& bl) const;
+  void decode(bufferlist::const_iterator& bl);
+};
+WRITE_CLASS_ENCODER(dirfrag_rollback)
+
+class EFragment : public LogEvent {  // LogEvent of type EVENT_FRAGMENT describing a split/merge step on a directory
+public:
+  EMetaBlob metablob;
+  __u8 op{0};            // one of the OP_* values below
+  inodeno_t ino;
+  frag_t basefrag;
+  __s32 bits{0};         // positive for split (from basefrag), negative for merge (to basefrag)
+  frag_vec_t orig_frags; // frags recorded through add_orig_frag()
+  bufferlist rollback;   // concatenated encodings of the dirfrag_rollback entries passed to add_orig_frag()
+
+  EFragment() : LogEvent(EVENT_FRAGMENT) { }
+  EFragment(MDLog *mdlog, int o, dirfrag_t df, int b) :  // NOTE(review): mdlog is not used in this constructor
+    LogEvent(EVENT_FRAGMENT),
+    op(o), ino(df.ino), basefrag(df.frag), bits(b) { }
+
+  void print(ostream& out) const override {
+    out << "EFragment " << op_name(op) << " " << ino << " " << basefrag << " by " << bits << " " << metablob;
+  }
+
+  enum {
+    OP_PREPARE = 1,
+    OP_COMMIT = 2,
+    OP_ROLLBACK = 3,
+    OP_FINISH = 4 // finish deleting orphan dirfrags
+  };
+  static std::string_view op_name(int o) {  // printable name of an OP_* value, "???" for anything else
+    switch (o) {
+    case OP_PREPARE: return "prepare";
+    case OP_COMMIT: return "commit";
+    case OP_ROLLBACK: return "rollback";
+    case OP_FINISH: return "finish";
+    default: return "???";
+    }
+  }
+
+  void add_orig_frag(frag_t df, dirfrag_rollback *drb=nullptr) {  // drb is optional
+    using ceph::encode;
+    orig_frags.push_back(df);
+    if (drb)
+      encode(*drb, rollback);  // rollback data is appended only when supplied
+  }
+
+  EMetaBlob *get_metablob() override { return &metablob; }
+
+  void encode(bufferlist &bl, uint64_t features) const override;
+  void decode(bufferlist::const_iterator &bl) override;
+  void dump(Formatter *f) const override;
+  static void generate_test_instances(std::list<EFragment*>& ls);
+  void replay(MDSRank *mds) override;
+};
+WRITE_CLASS_ENCODER_FEATURES(EFragment)
+
+#endif
diff --git a/src/mds/events/EImportFinish.h b/src/mds/events/EImportFinish.h
new file mode 100644
index 000000000..24bb1cd13
--- /dev/null
+++ b/src/mds/events/EImportFinish.h
@@ -0,0 +1,53 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software 
+ * Foundation.  See file COPYING.
+ * 
+ */
+
+#ifndef CEPH_EIMPORTFINISH_H
+#define CEPH_EIMPORTFINISH_H
+
+#include "common/config.h"
+#include "include/types.h"
+
+#include "../MDSRank.h"
+#include "../LogEvent.h"
+
+// LogEvent of type EVENT_IMPORTFINISH: records whether the import of
+// dirfrag `base` succeeded.
+class EImportFinish : public LogEvent {
+ protected:
+  dirfrag_t base; // imported dir
+  bool success;   // outcome, printed as " success" / " failed"
+
+ public:
+  EImportFinish() : LogEvent(EVENT_IMPORTFINISH), base(), success(false) { }
+  EImportFinish(CDir *dir, bool s) :
+    LogEvent(EVENT_IMPORTFINISH), base(dir->dirfrag()), success(s) { }
+
+  // Writes "EImportFinish <base>" followed by the outcome suffix.
+  void print(ostream& out) const override {
+    const char *outcome = success ? " success" : " failed";
+    out << "EImportFinish " << base << outcome;
+  }
+
+  // (De)serialization and debugging hooks; defined out of line.
+  void encode(bufferlist& bl, uint64_t features) const override;
+  void decode(bufferlist::const_iterator &bl) override;
+  void dump(Formatter *f) const override;
+  static void generate_test_instances(std::list<EImportFinish*>& ls);
+
+  // Defined out of line.
+  void replay(MDSRank *mds) override;
+};
+WRITE_CLASS_ENCODER_FEATURES(EImportFinish)
+
+#endif
diff --git a/src/mds/events/EImportStart.h b/src/mds/events/EImportStart.h
new file mode 100644
index 000000000..d4673a84a
--- /dev/null
+++ b/src/mds/events/EImportStart.h
@@ -0,0 +1,61 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software 
+ * Foundation.  See file COPYING.
+ * 
+ */
+
+#ifndef CEPH_EIMPORTSTART_H
+#define CEPH_EIMPORTSTART_H
+
+#include "common/config.h"
+#include "include/types.h"
+
+class MDLog;
+class MDSRank;
+
+#include "EMetaBlob.h"
+#include "../LogEvent.h"
+
+class EImportStart : public LogEvent {  // LogEvent of type EVENT_IMPORTSTART: dirfrag `base` imported from rank `from`
+protected:
+  dirfrag_t base;
+  vector<dirfrag_t> bounds;  // copied from constructor argument `b`
+  mds_rank_t from;           // MDS_RANK_NONE when default-constructed
+
+public:
+  EMetaBlob metablob;
+  bufferlist client_map;  // encoded map<__u32,entity_inst_t>
+  version_t cmapv{0};
+
+  EImportStart(MDLog *log, dirfrag_t di, const vector<dirfrag_t>& b, mds_rank_t f) :  // NOTE(review): `log` is not used here
+    LogEvent(EVENT_IMPORTSTART),
+    base(di), bounds(b), from(f) { }
+  EImportStart() :
+    LogEvent(EVENT_IMPORTSTART), from(MDS_RANK_NONE) { }
+  
+  void print(ostream& out) const override {  // emits "EImportStart <base> from mds.<from> <metablob>"
+    out << "EImportStart " << base << " from mds." << from << " " << metablob;
+  }
+
+  EMetaBlob *get_metablob() override { return &metablob; }
+  
+  void encode(bufferlist &bl, uint64_t features) const override;
+  void decode(bufferlist::const_iterator &bl) override;
+  void dump(Formatter *f) const override;
+  static void generate_test_instances(std::list<EImportStart*>& ls);
+  
+  void update_segment() override;
+  void replay(MDSRank *mds) override;
+
+};
+WRITE_CLASS_ENCODER_FEATURES(EImportStart)
+
+#endif
diff --git a/src/mds/events/EMetaBlob.h b/src/mds/events/EMetaBlob.h
new file mode 100644
index 000000000..d555627a3
--- /dev/null
+++ b/src/mds/events/EMetaBlob.h
@@ -0,0 +1,613 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software 
+ * Foundation.  See file COPYING.
+ * 
+ */
+
+#ifndef CEPH_MDS_EMETABLOB_H
+#define CEPH_MDS_EMETABLOB_H
+
+#include <string_view>
+
+#include "../CInode.h"
+#include "../CDir.h"
+#include "../CDentry.h"
+#include "../LogSegment.h"
+
+#include "include/interval_set.h"
+#include "common/strescape.h"
+
+class MDSRank;
+class MDLog;
+class LogSegment;
+struct MDPeerUpdate;
+
+/*
+ * a bunch of metadata in the journal
+ */
+
+/* notes:
+ *
+ * - make sure you adjust the inode.version for any modified inode you
+ *   journal.  CDir and CDentry maintain a projected_version, but CInode
+ *   doesn't, since the journaled inode usually has to be modified 
+ *   manually anyway (to delay the change in the MDS's cache until after
+ *   it is journaled).
+ *
+ */
+
+
+class EMetaBlob {
+
+public:
+  /* fullbit - a regular dentry + inode
+   *
+   * We encode this one a bit weirdly, just because (also, it's marginally faster
+   * on multiple encodes, which I think can happen):
+   * Encode a bufferlist on struct creation with all data members, without a struct_v.
+   * When encode is called, encode struct_v and then append the bufferlist.
+   * Decode straight into the appropriate variables.
+   *
+   * So, if you add members, encode them in the constructor and then change
+   * the struct_v in the encode function!
+   */
+  struct fullbit {
+    static const int STATE_DIRTY =	 (1<<0);
+    static const int STATE_DIRTYPARENT = (1<<1);
+    static const int STATE_DIRTYPOOL   = (1<<2);
+    static const int STATE_NEED_SNAPFLUSH = (1<<3);
+    static const int STATE_EPHEMERAL_RANDOM = (1<<4);
+    std::string  dn;         // dentry
+    std::string alternate_name;
+    snapid_t dnfirst, dnlast;
+    version_t dnv{0};
+    CInode::inode_const_ptr inode;      // XXX should not be part of mempool; wait for std::pmr to simplify
+    CInode::xattr_map_const_ptr xattrs;
+    fragtree_t dirfragtree;  // only copied in by the constructor when the inode is a directory
+    std::string symlink;     // only copied in by the constructor when the inode is a symlink
+    snapid_t oldest_snap;
+    bufferlist snapbl;
+    __u8 state{0};           // bitmask of the STATE_* flags above
+    CInode::old_inode_map_const_ptr old_inodes; // XXX should not be part of mempool; wait for std::pmr to simplify
+
+    fullbit(std::string_view d, std::string_view an, snapid_t df, snapid_t dl,
+	    version_t v, const CInode::inode_const_ptr& i, const fragtree_t &dft,
+	    const CInode::xattr_map_const_ptr& xa, std::string_view sym,
+	    snapid_t os, const bufferlist &sbl, __u8 st,
+	    const CInode::old_inode_map_const_ptr& oi) :
+      dn(d), alternate_name(an), dnfirst(df), dnlast(dl), dnv(v), inode(i), xattrs(xa),
+      oldest_snap(os), state(st), old_inodes(oi)
+    {
+      if (i->is_symlink())
+	symlink = sym;
+      if (i->is_dir())
+	dirfragtree = dft;
+      snapbl = sbl;
+    }
+    explicit fullbit(bufferlist::const_iterator &p) {
+      decode(p);
+    }
+    fullbit() = default;
+    fullbit(const fullbit&) = delete;
+    ~fullbit() {}
+    fullbit& operator=(const fullbit&) = delete;
+
+    void encode(bufferlist& bl, uint64_t features) const;
+    void decode(bufferlist::const_iterator &bl);
+    void dump(Formatter *f) const;
+    static void generate_test_instances(std::list<EMetaBlob::fullbit*>& ls);
+
+    void update_inode(MDSRank *mds, CInode *in);
+    bool is_dirty() const { return (state & STATE_DIRTY); }
+    bool is_dirty_parent() const { return (state & STATE_DIRTYPARENT); }
+    bool is_dirty_pool() const { return (state & STATE_DIRTYPOOL); }
+    bool need_snapflush() const { return (state & STATE_NEED_SNAPFLUSH); }
+    bool is_export_ephemeral_random() const { return (state & STATE_EPHEMERAL_RANDOM); }
+
+    void print(ostream& out) const {
+      out << " fullbit dn " << dn << " [" << dnfirst << "," << dnlast << "] dnv " << dnv
+	  << " inode " << inode->ino
+	  << " state=" << static_cast<int>(state);  // __u8 would otherwise be streamed as a raw character
+      if (!alternate_name.empty()) {
+          out << " altn " << binstrprint(alternate_name, 8);
+      }
+      out << std::endl;
+    }
+    string state_string() const {  // "+"-joined names of the dirty flags; dirty_pool is only reported under dirty_parent
+      string state_string;
+      bool marked_already = false;
+      if (is_dirty()) {
+	state_string.append("dirty");
+	marked_already = true;
+      }
+      if (is_dirty_parent()) {
+	state_string.append(marked_already ? "+dirty_parent" : "dirty_parent");
+	if (is_dirty_pool())
+	  state_string.append("+dirty_pool");
+      }
+      return state_string;
+    }
+  };
+  WRITE_CLASS_ENCODER_FEATURES(fullbit)
+  
+  /* remotebit - journaled form of a remote dentry: the dentry fields plus
+   * the ino and d_type of the inode it links to (just an ino, no inode data) */
+  struct remotebit {
+    std::string dn;
+    std::string alternate_name;
+    snapid_t dnfirst = 0, dnlast = 0;
+    version_t dnv = 0;
+    inodeno_t ino = 0;
+    unsigned char d_type = '\0';
+    bool dirty = false;
+
+    remotebit() = default;
+    explicit remotebit(bufferlist::const_iterator &p) { decode(p); }
+    remotebit(std::string_view d, std::string_view an, snapid_t df, snapid_t dl,
+	      version_t v, inodeno_t i, unsigned char dt, bool dr) :
+      dn(d), alternate_name(an), dnfirst(df), dnlast(dl), dnv(v), ino(i), d_type(dt), dirty(dr) { }
+
+    void encode(bufferlist& bl) const;
+    void decode(bufferlist::const_iterator &bl);
+    void dump(Formatter *f) const;
+    static void generate_test_instances(std::list<remotebit*>& ls);
+
+    void print(ostream& out) const {  // one line; the alternate name is shown only when non-empty
+      out << " remotebit dn " << dn << " [" << dnfirst << "," << dnlast << "] dnv " << dnv
+	  << " ino " << ino << " dirty=" << dirty;
+      if (!alternate_name.empty())
+        out << " altn " << binstrprint(alternate_name, 8);
+      out << std::endl;
+    }
+  };
+  WRITE_CLASS_ENCODER(remotebit)
+
+  /*
+   * nullbit - a null dentry: name, snap range, version and dirty flag only
+   */
+  struct nullbit {
+    std::string dn;
+    snapid_t dnfirst = 0, dnlast = 0;
+    version_t dnv = 0;
+    bool dirty = false;
+
+    nullbit() = default;
+    explicit nullbit(bufferlist::const_iterator &p) { decode(p); }
+    nullbit(std::string_view d, snapid_t df, snapid_t dl, version_t v, bool dr) :
+      dn(d), dnfirst(df), dnlast(dl), dnv(v), dirty(dr) { }
+
+    void encode(bufferlist& bl) const;
+    void decode(bufferlist::const_iterator &bl);
+    void dump(Formatter *f) const;
+    static void generate_test_instances(std::list<nullbit*>& ls);
+    void print(ostream& out) const {  // one line: name, [first,last], version, dirty flag
+      out << " nullbit dn " << dn << " [" << dnfirst << "," << dnlast << "] dnv " << dnv
+	  << " dirty=" << dirty << std::endl;
+    }
+  };
+  WRITE_CLASS_ENCODER(nullbit)
+
+
+  /* dirlump - contains metadata for any dir we have contents for.
+   */
+public:
+  struct dirlump {
+    static const int STATE_COMPLETE =    (1<<1);
+    static const int STATE_DIRTY =       (1<<2);  // dirty due to THIS journal item, that is!
+    static const int STATE_NEW =         (1<<3);  // new directory
+    static const int STATE_IMPORTING =	 (1<<4);  // importing directory
+    static const int STATE_DIRTYDFT =	 (1<<5);  // dirty dirfragtree
+
+    //version_t  dirv;
+    CDir::fnode_const_ptr fnode;
+    __u32 state;                  // bitmask of the STATE_* flags above
+    __u32 nfull, nremote, nnull;  // counters bumped by EMetaBlob's add_*_dentry helpers
+
+  private:
+    mutable bufferlist dnbl;      // encoded dfull/dremote/dnull (see _encode_bits/_decode_bits)
+    mutable bool dn_decoded;      // true when the three containers below hold the decoded entries
+    mutable list<fullbit> dfull;
+    mutable vector<remotebit> dremote;
+    mutable vector<nullbit> dnull;
+
+  public:
+    dirlump() : state(0), nfull(0), nremote(0), nnull(0), dn_decoded(true) { }
+    dirlump(const dirlump&) = delete;
+    dirlump& operator=(const dirlump&) = delete;
+    
+    bool is_complete() const { return state & STATE_COMPLETE; }
+    void mark_complete() { state |= STATE_COMPLETE; }
+    bool is_dirty() const { return state & STATE_DIRTY; }
+    void mark_dirty() { state |= STATE_DIRTY; }
+    bool is_new() const { return state & STATE_NEW; }
+    void mark_new() { state |= STATE_NEW; }
+    bool is_importing() const { return state & STATE_IMPORTING; }  // const, like the other predicates
+    void mark_importing() { state |= STATE_IMPORTING; }
+    bool is_dirty_dft() const { return state & STATE_DIRTYDFT; }   // const, like the other predicates
+    void mark_dirty_dft() { state |= STATE_DIRTYDFT; }
+
+    const list<fullbit>			&get_dfull() const { return dfull; }
+    list<fullbit>			&_get_dfull() { return dfull; }
+    const vector<remotebit>		&get_dremote() const { return dremote; }
+    const vector<nullbit>		&get_dnull() const { return dnull; }
+
+    template< class... Args>
+    void add_dfull(Args&&... args) {  // constructs a fullbit in place; the caller maintains nfull
+      dfull.emplace_back(std::forward<Args>(args)...);
+    }
+    template< class... Args>
+    void add_dremote(Args&&... args) {  // constructs a remotebit in place; the caller maintains nremote
+      dremote.emplace_back(std::forward<Args>(args)...);
+    }
+    template< class... Args>
+    void add_dnull(Args&&... args) {  // constructs a nullbit in place; the caller maintains nnull
+      dnull.emplace_back(std::forward<Args>(args)...);
+    }
+
+    void print(dirfrag_t dirfrag, ostream& out) const {
+      out << "dirlump " << dirfrag << " v " << fnode->version
+	  << " state " << state
+	  << " num " << nfull << "/" << nremote << "/" << nnull
+	  << std::endl;
+      _decode_bits();  // make sure the containers are populated before walking them
+      for (const auto& p : dfull)
+	p.print(out);
+      for (const auto& p : dremote)
+	p.print(out);
+      for (const auto& p : dnull)
+	p.print(out);
+    }
+
+    string state_string() const {  // "+"-joined names of the complete/dirty/new flags only
+      string state_string;
+      bool marked_already = false;
+      if (is_complete()) {
+	state_string.append("complete");
+	marked_already = true;
+      }
+      if (is_dirty()) {
+	state_string.append(marked_already ? "+dirty" : "dirty");
+	marked_already = true;
+      }
+      if (is_new()) {
+	state_string.append(marked_already ? "+new" : "new");
+      }
+      return state_string;
+    }
+
+    // if this changes, update the versioning in encode for it!
+    void _encode_bits(uint64_t features) const {
+      using ceph::encode;
+      if (!dn_decoded) return;  // nothing was decoded, so the containers are not encoded again
+      encode(dfull, dnbl, features);
+      encode(dremote, dnbl);
+      encode(dnull, dnbl);
+    }
+    void _decode_bits() const { 
+      using ceph::decode;
+      if (dn_decoded) return;  // already decoded
+      auto p = dnbl.cbegin();
+      decode(dfull, p);
+      decode(dremote, p);
+      decode(dnull, p);
+      dn_decoded = true;
+    }
+
+    void encode(bufferlist& bl, uint64_t features) const;
+    void decode(bufferlist::const_iterator &bl);
+    void dump(Formatter *f) const;
+    static void generate_test_instances(std::list<dirlump*>& ls);
+  };
+  WRITE_CLASS_ENCODER_FEATURES(dirlump)
+
+  // my lumps.  preserve the order we added them in a list.
+  vector<dirfrag_t>         lump_order;
+  map<dirfrag_t, dirlump> lump_map;
+  list<fullbit> roots;
+public:
+  vector<pair<__u8,version_t> > table_tids;  // tableclient transactions
+
+  inodeno_t opened_ino;
+public:
+  inodeno_t renamed_dirino;
+  vector<frag_t> renamed_dir_frags;
+private:
+  
+  // ino (pre)allocation.  may involve both inotable AND session state.
+  version_t inotablev, sessionmapv;
+  inodeno_t allocated_ino;            // inotable
+  interval_set<inodeno_t> preallocated_inos; // inotable + session
+  inodeno_t used_preallocated_ino;    //            session
+  entity_name_t client_name;          //            session
+
+  // inodes i've truncated
+  vector<inodeno_t> truncate_start;        // start truncate
+  map<inodeno_t, LogSegment::seq_t> truncate_finish;  // finished truncate (started in segment blah)
+
+public:
+  vector<inodeno_t> destroyed_inodes;
+private:
+
+  // idempotent op(s)
+  vector<pair<metareqid_t,uint64_t> > client_reqs;
+  vector<pair<metareqid_t,uint64_t> > client_flushes;
+
+ public:
+  void encode(bufferlist& bl, uint64_t features) const;
+  void decode(bufferlist::const_iterator& bl);
+  void get_inodes(std::set<inodeno_t> &inodes) const;
+  void get_paths(std::vector<std::string> &paths) const;
+  void get_dentries(std::map<dirfrag_t, std::set<std::string> > &dentries) const;
+  entity_name_t get_client_name() const {return client_name;}
+
+  void dump(Formatter *f) const;
+  static void generate_test_instances(std::list<EMetaBlob*>& ls);
+  // soft state
+  uint64_t last_subtree_map;
+  uint64_t event_seq;
+
+  // for replay, in certain cases
+  //LogSegment *_segment;
+
+  EMetaBlob() :
+    opened_ino(0), renamed_dirino(0),
+    inotablev(0), sessionmapv(0), allocated_ino(0),
+    last_subtree_map(0), event_seq(0) { }  // every member listed here starts at 0
+  EMetaBlob(const EMetaBlob&) = delete;
+  ~EMetaBlob() { }
+  EMetaBlob& operator=(const EMetaBlob&) = delete;
+
+  void print(ostream& out) {
+    for (const auto &frag : lump_order)
+      lump_map[frag].print(frag, out);  // walk lumps in the order recorded in lump_order
+  }
+
+  void add_client_req(metareqid_t r, uint64_t tid=0) {
+    client_reqs.emplace_back(r, tid);  // append the (request id, tid) pair
+  }
+  void add_client_flush(metareqid_t r, uint64_t tid=0) {
+    client_flushes.emplace_back(r, tid);  // append the (request id, tid) pair
+  }
+
+  void add_table_transaction(int table, version_t tid) {
+    table_tids.emplace_back(table, tid);  // `table` is stored as __u8 in the pair
+  }
+
+  void add_opened_ino(inodeno_t ino) {  // record the opened ino; asserts none has been recorded yet
+    ceph_assert(!opened_ino);
+    opened_ino = ino;
+  }
+
+  void set_ino_alloc(inodeno_t alloc,
+		     inodeno_t used_prealloc,
+		     interval_set<inodeno_t>& prealloc,
+		     entity_name_t client,
+		     version_t sv, version_t iv) {
+    inotablev = iv;                          // table versions
+    sessionmapv = sv;
+    allocated_ino = alloc;                   // inos covered by this blob
+    preallocated_inos = prealloc;
+    used_preallocated_ino = used_prealloc;
+    client_name = client;                    // session the (pre)allocation belongs to
+  }
+
+  void add_truncate_start(inodeno_t ino) {
+    truncate_start.emplace_back(ino);  // remember that a truncate began on this ino
+  }
+  void add_truncate_finish(inodeno_t ino, uint64_t segoff) {
+    truncate_finish.insert_or_assign(ino, segoff);  // a later call for the same ino overwrites the earlier value
+  }
+  
+  bool rewrite_truncate_finish(MDSRank const *mds, std::map<uint64_t, uint64_t> const &old_to_new);
+
+  void add_destroyed_inode(inodeno_t ino) {
+    destroyed_inodes.emplace_back(ino);  // append to the list of destroyed inos
+  }
+  
+  // journal dn as a null dentry; the short form looks up the lump of dn's dir first
+  void add_null_dentry(CDentry *dn, bool dirty) {
+    dirlump& lump = add_dir(dn->get_dir(), false);
+    add_null_dentry(lump, dn, dirty);
+  }
+  void add_null_dentry(dirlump& lump, CDentry *dn, bool dirty) {
+    ++lump.nnull;
+    lump.add_dnull(dn->get_name(), dn->first, dn->last, dn->get_projected_version(), dirty);
+  }
+
+  void add_remote_dentry(CDentry *dn, bool dirty) {
+    add_remote_dentry(dn, dirty, 0, 0);  // rino 0: target is read from the projected linkage
+  }
+  void add_remote_dentry(CDentry *dn, bool dirty, inodeno_t rino, int rdt) {
+    add_remote_dentry(add_dir(dn->get_dir(), false), dn, dirty, rino, rdt);
+  }
+  void add_remote_dentry(dirlump& lump, CDentry *dn, bool dirty,
+			 inodeno_t rino=0, unsigned char rdt=0) {
+    if (!rino) {  // no explicit target: take ino and d_type from dn's projected linkage
+      const auto dnl = dn->get_projected_linkage();
+      rino = dnl->get_remote_ino();
+      rdt = dnl->get_remote_d_type();
+    }
+    ++lump.nremote;
+    lump.add_dremote(dn->get_name(), dn->get_alternate_name(), dn->first, dn->last, dn->get_projected_version(), rino, rdt, dirty);
+  }
+
+  // Journal `dn` as a primary dentry in the lump of its containing directory.
+  // The boolean flags are folded into a fullbit state mask, then the work is
+  // delegated to the lump-taking overload.  Returns nothing.
+  void add_primary_dentry(CDentry *dn, CInode *in, bool dirty,
+                          bool dirty_parent=false, bool dirty_pool=false,
+                          bool need_snapflush=false)
+  {
+    __u8 state = 0;
+    if (dirty)
+      state |= fullbit::STATE_DIRTY;
+    if (dirty_parent)
+      state |= fullbit::STATE_DIRTYPARENT;
+    if (dirty_pool)
+      state |= fullbit::STATE_DIRTYPOOL;
+    if (need_snapflush)
+      state |= fullbit::STATE_NEED_SNAPFLUSH;
+
+    dirlump& dir_lump = add_dir(dn->get_dir(), false);
+    add_primary_dentry(dir_lump, dn, in, state);
+  }
+  // Append a full (primary) dentry record for `dn` and its inode to `lump`.
+  // A null `in` means "use the inode from dn's projected linkage".  `state`
+  // may gain the STATE_EPHEMERAL_RANDOM and STATE_DIRTYPARENT bits here before
+  // it is recorded.  The inode's last_journaled is set to this blob's
+  // event_seq.
+  void add_primary_dentry(dirlump& lump, CDentry *dn, CInode *in, __u8 state)
+  {
+    if (in == nullptr)
+      in = dn->get_projected_linkage()->get_inode();
+
+    if (in->is_ephemeral_rand())
+      state |= fullbit::STATE_EPHEMERAL_RANDOM;
+
+    const auto& projected = in->get_projected_inode();
+    ceph_assert(projected->version > 0);
+
+    // a dirty record whose projected inode reports an updated backtrace is
+    // also flagged dirty-parent
+    const bool is_dirty = (state & fullbit::STATE_DIRTY) != 0;
+    if (is_dirty && projected->is_backtrace_updated())
+      state |= fullbit::STATE_DIRTYPARENT;
+
+    // encode the projected srnode, if the inode has one; otherwise the
+    // bufferlist stays empty
+    bufferlist snap_blob;
+    if (const sr_t *srnode = in->get_projected_srnode())
+      srnode->encode(snap_blob);
+
+    ++lump.nfull;
+    lump.add_dfull(dn->get_name(), dn->get_alternate_name(),
+                   dn->first, dn->last, dn->get_projected_version(),
+                   projected, in->dirfragtree, in->get_projected_xattrs(),
+                   in->symlink, in->oldest_snap, snap_blob, state,
+                   in->get_old_inodes());
+
+    // make note of where this inode was last journaled
+    in->last_journaled = event_seq;
+  }
+
+  // Convenience wrapper: journal `dn` as primary, remote or null, whichever
+  // its projected linkage says, with dirty_parent and dirty_pool both false.
+  void add_dentry(CDentry *dn, bool dirty)
+  {
+    add_dentry(add_dir(dn->get_dir(), false), dn, dirty, false, false);
+  }
+  // Journal `dn` using its own dirty flag.  For a primary linkage the
+  // dirty_parent / dirty_pool flags are taken from the linked inode; for any
+  // other linkage both are false.
+  void add_import_dentry(CDentry *dn)
+  {
+    bool dirty_parent = false;
+    bool dirty_pool = false;
+
+    if (dn->get_linkage()->is_primary()) {
+      auto linked_inode = dn->get_linkage()->get_inode();
+      dirty_parent = linked_inode->is_dirty_parent();
+      dirty_pool = linked_inode->is_dirty_pool();
+    }
+
+    add_dentry(add_dir(dn->get_dir(), false), dn,
+               dn->is_dirty(), dirty_parent, dirty_pool);
+  }
+  // Dispatch on dn's projected linkage: remote and null dentries go to their
+  // dedicated helpers, anything else must be primary.
+  // NOTE(review): `lump` is never used by this body -- each branch calls the
+  // CDentry*-taking overload, which fetches the lump again via add_dir().
+  void add_dentry(dirlump& lump, CDentry *dn, bool dirty, bool dirty_parent, bool dirty_pool)
+  {
+    if (dn->get_projected_linkage()->is_remote()) {
+      add_remote_dentry(dn, dirty);
+      return;
+    }
+
+    if (dn->get_projected_linkage()->is_null()) {
+      add_null_dentry(dn, dirty);
+      return;
+    }
+
+    ceph_assert(dn->get_projected_linkage()->is_primary());
+    // null inode pointer: the callee takes the inode from the projected linkage
+    add_primary_dentry(dn, nullptr, dirty, dirty_parent, dirty_pool);
+  }
+
+  // Journal the inode `in` in the `roots` list.  An existing entry with the
+  // same ino is removed first, so at most one entry per ino is appended by
+  // this path.  The inode's last_journaled is set to this blob's event_seq.
+  void add_root(bool dirty, CInode *in)
+  {
+    in->last_journaled = event_seq;
+
+    const auto& projected_inode = in->get_projected_inode();
+    const auto& projected_xattrs = in->get_projected_xattrs();
+    const auto& fragtree = in->dirfragtree;
+
+    // encode the projected srnode, if the inode has one
+    bufferlist snap_blob;
+    if (const sr_t *srnode = in->get_projected_srnode())
+      srnode->encode(snap_blob);
+
+    // drop a previously journaled copy of the same inode (first match only)
+    for (auto it = roots.begin(); it != roots.end(); ++it) {
+      if (it->inode->ino != in->ino())
+        continue;
+      roots.erase(it);
+      break;
+    }
+
+    // the entry is recorded with an empty name and an empty alternate name
+    string no_name;
+    roots.emplace_back(no_name, "", in->first, in->last, 0,
+                       projected_inode, fragtree, projected_xattrs,
+                       in->symlink, in->oldest_snap, snap_blob,
+                       (dirty ? fullbit::STATE_DIRTY : 0),
+                       in->get_old_inodes());
+  }
+  
+  // Fetch (creating on first use) the lump for `dir`, refresh its fnode from
+  // the dir's projected fnode, and apply the dirty / complete marks.
+  dirlump& add_dir(CDir *dir, bool dirty, bool complete=false)
+  {
+    const dirfrag_t df = dir->dirfrag();
+    return add_dir(df, dir->get_projected_fnode(), dirty, complete);
+  }
+  // Lump for a newly created directory: marked dirty, complete and new.
+  dirlump& add_new_dir(CDir *dir)
+  {
+    const dirfrag_t df = dir->dirfrag();
+    return add_dir(df, dir->get_projected_fnode(),
+                   /* dirty */ true, /* complete */ true, /* isnew */ true);
+  }
+  // Lump for a directory being imported: the dirty, complete and dirty_dft
+  // marks mirror the dir's current state, and the lump is marked importing.
+  dirlump& add_import_dir(CDir *dir)
+  {
+    // dirty=false would be okay in some cases
+    const bool dirty = dir->is_dirty();
+    const bool complete = dir->is_complete();
+    const bool dirty_dft = dir->is_dirty_dft();
+    return add_dir(dir->dirfrag(), dir->get_projected_fnode(),
+                   dirty, complete,
+                   /* isnew */ false, /* importing */ true, dirty_dft);
+  }
+  // Lump for a fragmented directory: only the caller-supplied dirty and
+  // dirty_dft marks are applied; complete, new and importing stay off.
+  dirlump& add_fragmented_dir(CDir *dir, bool dirty, bool dirtydft)
+  {
+    const dirfrag_t df = dir->dirfrag();
+    return add_dir(df, dir->get_projected_fnode(),
+                   dirty,
+                   /* complete */ false, /* isnew */ false,
+                   /* importing */ false, dirtydft);
+  }
+  // Core lump accessor.  The first time `df` is seen it is appended to
+  // lump_order; the lump itself is created on demand by lump_map[df].  The
+  // lump's fnode is always overwritten with `pf`.  Each flag that is true
+  // sets the corresponding mark; a false flag leaves an existing mark alone.
+  dirlump& add_dir(dirfrag_t df, const CDir::fnode_const_ptr& pf, bool dirty,
+                   bool complete=false, bool isnew=false,
+                   bool importing=false, bool dirty_dft=false)
+  {
+    const bool first_time = (lump_map.count(df) == 0);
+    if (first_time)
+      lump_order.push_back(df);
+
+    dirlump& lump = lump_map[df];
+    lump.fnode = pf;
+
+    if (complete)
+      lump.mark_complete();
+    if (dirty)
+      lump.mark_dirty();
+    if (isnew)
+      lump.mark_new();
+    if (importing)
+      lump.mark_importing();
+    if (dirty_dft)
+      lump.mark_dirty_dft();
+
+    return lump;
+  }
+  
+  // Values for the `mode` argument of add_dir_context().
+  static const int TO_AUTH_SUBTREE_ROOT = 0;  // default.
+  static const int TO_ROOT = 1;
+  
+  // Declared here, defined out of line.  `mode` defaults to
+  // TO_AUTH_SUBTREE_ROOT.
+  // NOTE(review): presumably journals the ancestry of `dir` up to the point
+  // selected by `mode` -- confirm against the definition.
+  void add_dir_context(CDir *dir, int mode = TO_AUTH_SUBTREE_ROOT);
+
+  // True when none of the checked pieces of state is set: no roots, no dir
+  // lumps, no table tids, no truncate start/finish entries, no destroyed
+  // inodes, no client requests, and opened_ino / inotablev / sessionmapv are
+  // all zero.  Read-only, hence const so it can be called on const blobs.
+  // NOTE(review): allocated_ino, preallocated_inos and used_preallocated_ino
+  // are not consulted here -- confirm that is intended.
+  bool empty() const {
+    return roots.empty() && lump_order.empty() && table_tids.empty() &&
+	   truncate_start.empty() && truncate_finish.empty() &&
+	   destroyed_inodes.empty() && client_reqs.empty() &&
+	   opened_ino == 0 && inotablev == 0 && sessionmapv == 0;
+  }
+
+  // Write a one-line summary of the blob to `out`: the first dirfrag and the
+  // number of dir lumps, the table tids, and the inode-allocation fields that
+  // are set.
+  //
+  // used_preallocated_ino is printed whenever it is set.  Previously it was
+  // only reachable when allocated_ino or preallocated_inos was also set, so a
+  // blob that only consumed a preallocated ino printed nothing about it.  The
+  // inotable version is still printed only alongside alloc_ino/prealloc_ino,
+  // so output for every case that printed before is unchanged.
+  void print(ostream& out) const {
+    out << "[metablob";
+    if (!lump_order.empty()) 
+      out << " " << lump_order.front() << ", " << lump_map.size() << " dirs";
+    if (!table_tids.empty())
+      out << " table_tids=" << table_tids;
+
+    const bool has_alloc = allocated_ino || preallocated_inos.size();
+    if (allocated_ino)
+      out << " alloc_ino=" << allocated_ino;
+    if (preallocated_inos.size())
+      out << " prealloc_ino=" << preallocated_inos;
+    if (used_preallocated_ino)
+      out << " used_prealloc_ino=" << used_preallocated_ino;
+    if (has_alloc)
+      out << " v" << inotablev;
+
+    out << "]";
+  }
+
+  // Both declared here and defined out of line.
+  void update_segment(LogSegment *ls);
+  // `su` is optional and defaults to NULL.
+  void replay(MDSRank *mds, LogSegment *ls, MDPeerUpdate *su=NULL);
+};
+WRITE_CLASS_ENCODER_FEATURES(EMetaBlob)
+WRITE_CLASS_ENCODER_FEATURES(EMetaBlob::fullbit)
+WRITE_CLASS_ENCODER(EMetaBlob::remotebit)
+WRITE_CLASS_ENCODER(EMetaBlob::nullbit)
+WRITE_CLASS_ENCODER_FEATURES(EMetaBlob::dirlump)
+
+// Stream an EMetaBlob by delegating to its print() method.
+inline ostream& operator<<(ostream& out, const EMetaBlob& blob)
+{
+  blob.print(out);
+  return out;
+}
+
+#endif
diff --git a/src/mds/events/ENoOp.h b/src/mds/events/ENoOp.h
new file mode 100644
index 000000000..1bf5161e8
--- /dev/null
+++ b/src/mds/events/ENoOp.h
@@ -0,0 +1,35 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software 
+ * Foundation.  See file COPYING.
+ * 
+ */
+
+#ifndef CEPH_MDS_ENOOP_H
+#define CEPH_MDS_ENOOP_H
+
+#include "../LogEvent.h"
+
+// Journal event of type EVENT_NOOP.  Its only state is pad_size.
+// NOTE(review): presumably the amount of padding encode() emits -- confirm
+// against the out-of-line encode()/decode().
+class ENoOp : public LogEvent {
+  uint32_t pad_size;
+
+public:
+  // Default: no padding requested (pad_size = 0).
+  ENoOp() : LogEvent(EVENT_NOOP), pad_size(0) { }
+  explicit ENoOp(uint32_t size_) : LogEvent(EVENT_NOOP), pad_size(size_){ }
+
+  void encode(bufferlist& bl, uint64_t features) const override;
+  void decode(bufferlist::const_iterator& bl) override;
+  // Emits nothing to the formatter.
+  void dump(Formatter *f) const override {}
+
+  void replay(MDSRank *mds) override;
+};
+WRITE_CLASS_ENCODER_FEATURES(ENoOp)
+
+#endif
diff --git a/src/mds/events/EOpen.h b/src/mds/events/EOpen.h
new file mode 100644
index 000000000..06bfd3e88
--- /dev/null
+++ b/src/mds/events/EOpen.h
@@ -0,0 +1,61 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software 
+ * Foundation.  See file COPYING.
+ * 
+ */
+
+#ifndef CEPH_MDS_EOPEN_H
+#define CEPH_MDS_EOPEN_H
+
+#include "../LogEvent.h"
+#include "EMetaBlob.h"
+
+// Journal event of type EVENT_OPEN.  Holds a metablob with the journaled
+// dentries plus two lists of inode identifiers: `inos` for inodes whose
+// `last` is CEPH_NOSNAP and `snap_inos` (ino + snapid) for all others.
+class EOpen : public LogEvent {
+public:
+  EMetaBlob metablob;
+  vector<inodeno_t> inos;
+  vector<vinodeno_t> snap_inos;
+
+  EOpen()
+    : LogEvent(EVENT_OPEN)
+  {}
+  // `mdlog` is accepted but not used by this constructor.
+  explicit EOpen(MDLog *mdlog)
+    : LogEvent(EVENT_OPEN)
+  {}
+
+  void print(ostream& out) const override
+  {
+    out << "EOpen " << metablob;
+    out << ", " << inos.size() << " open files";
+  }
+
+  EMetaBlob *get_metablob() override
+  {
+    return &metablob;
+  }
+
+  // Journal `in` as a non-dirty primary dentry (together with the context of
+  // its projected parent's directory) and remember its identifier.  Base
+  // inodes are ignored entirely.
+  void add_clean_inode(CInode *in)
+  {
+    if (in->is_base())
+      return;
+
+    auto parent_dn = in->get_projected_parent_dn();
+    metablob.add_dir_context(parent_dn->get_dir());
+    metablob.add_primary_dentry(parent_dn, nullptr, false);
+
+    if (in->last == CEPH_NOSNAP)
+      inos.push_back(in->ino());
+    else
+      snap_inos.push_back(in->vino());
+  }
+
+  // Remember `ino` without journaling any metadata for it.
+  void add_ino(inodeno_t ino)
+  {
+    inos.push_back(ino);
+  }
+
+  void encode(bufferlist& bl, uint64_t features) const override;
+  void decode(bufferlist::const_iterator& bl) override;
+  void dump(Formatter *f) const override;
+  static void generate_test_instances(std::list<EOpen*>& ls);
+
+  void update_segment() override;
+  void replay(MDSRank *mds) override;
+};
+WRITE_CLASS_ENCODER_FEATURES(EOpen)
+
+#endif
diff --git a/src/mds/events/EPeerUpdate.h b/src/mds/events/EPeerUpdate.h
new file mode 100644
index 000000000..38f53735e
--- /dev/null
+++ b/src/mds/events/EPeerUpdate.h
@@ -0,0 +1,157 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+
+#ifndef CEPH_MDS_EPEERUPDATE_H
+#define CEPH_MDS_EPEERUPDATE_H
+
+#include <string_view>
+
+#include "../LogEvent.h"
+#include "EMetaBlob.h"
+
+/*
+ * Rollback records for remote/peer updates, which may need to be manually
+ * rolled back during journal replay.  (Or while active if the leader fails,
+ * but in that case these records aren't needed.)
+ *
+ * link_rollback is the record for a link operation.  Field meanings below
+ * are taken from the member names.
+ */
+struct link_rollback {
+  metareqid_t reqid;
+  inodeno_t ino;
+  bool was_inc;
+  utime_t old_ctime;
+  utime_t old_dir_mtime;
+  utime_t old_dir_rctime;
+  bufferlist snapbl;
+
+  // Default state: ino 0, was_inc false; the other members use their own
+  // default constructors.
+  link_rollback() : ino(0), was_inc(false) {}
+
+  void encode(bufferlist& bl) const;
+  void decode(bufferlist::const_iterator& bl);
+  void dump(Formatter *f) const;
+  static void generate_test_instances(std::list<link_rollback*>& ls);
+};
+WRITE_CLASS_ENCODER(link_rollback)
+
+/*
+ * Rollback record for rmdir.
+ *
+ * This is only used on an empty dir with a dirfrag on a remote node.
+ * We are auth for nothing.  All we need to do is relink the directory
+ * in the hierarchy properly during replay to avoid breaking the
+ * subtree map.
+ */
+struct rmdir_rollback {
+  metareqid_t reqid;
+  // source dirfrag + dentry name, and destination dirfrag + dentry name
+  dirfrag_t src_dir;
+  string src_dname;
+  dirfrag_t dest_dir;
+  string dest_dname;
+  bufferlist snapbl;
+
+  void encode(bufferlist& bl) const;
+  void decode(bufferlist::const_iterator& bl);
+  void dump(Formatter *f) const;
+  static void generate_test_instances(std::list<rmdir_rollback*>& ls);
+};
+WRITE_CLASS_ENCODER(rmdir_rollback)
+
+// Rollback record for rename.  It carries one `drec` each for the original
+// source dentry, the original destination dentry and the stray dentry, plus
+// a ctime and two snap blobs (named for the source and destination inodes).
+struct rename_rollback {
+  // Per-dentry record: the dirfrag with its old mtime/rctime, the dentry
+  // name, the ino / remote ino with the remote d_type, and an old ctime.
+  struct drec {
+    dirfrag_t dirfrag;
+    utime_t dirfrag_old_mtime;
+    utime_t dirfrag_old_rctime;
+    inodeno_t ino, remote_ino;
+    string dname;
+    char remote_d_type;
+    utime_t old_ctime;
+
+    // NOTE(review): S_IFREG is a mode_t file-type mask rather than a d_type
+    // value; where it does not fit in a char this cast truncates it --
+    // confirm the intended default.
+    drec() : remote_d_type((char)S_IFREG) {}
+
+    void encode(bufferlist& bl) const;
+    void decode(bufferlist::const_iterator& bl);
+    void dump(Formatter *f) const;
+    static void generate_test_instances(std::list<drec*>& ls);
+  };
+  WRITE_CLASS_MEMBER_ENCODER(drec)
+
+  metareqid_t reqid;
+  drec orig_src, orig_dest;
+  drec stray; // we know this is null, but we want dname, old mtime/rctime
+  utime_t ctime;
+  bufferlist srci_snapbl;
+  bufferlist desti_snapbl;
+
+  void encode(bufferlist& bl) const;
+  void decode(bufferlist::const_iterator& bl);
+  void dump(Formatter *f) const;
+  static void generate_test_instances(std::list<rename_rollback*>& ls);
+};
+WRITE_CLASS_ENCODER(rename_rollback::drec)
+WRITE_CLASS_ENCODER(rename_rollback)
+
+
+// Journal event of type EVENT_PEERUPDATE.  `op` records the phase (one of the
+// OP_* constants) and `origop` the kind of operation (LINK, RENAME or RMDIR).
+class EPeerUpdate : public LogEvent {
+public:
+  // values for `op`
+  const static int OP_PREPARE = 1;
+  const static int OP_COMMIT = 2;
+  const static int OP_ROLLBACK = 3;
+
+  // values for `origop`
+  const static int LINK = 1;
+  const static int RENAME = 2;
+  const static int RMDIR = 3;
+
+  /*
+   * we journal a rollback metablob that contains the unmodified metadata
+   * too, because we may be updating previously dirty metadata, which
+   * will allow old log segments to be trimmed.  if we end up rolling back,
+   * those updates could be lost.. so we re-journal the unmodified metadata,
+   * and replay will apply _either_ commit or rollback.
+   */
+  EMetaBlob commit;
+  bufferlist rollback;
+  string type;
+  metareqid_t reqid;
+  mds_rank_t leader;
+  __u8 op;  // prepare, commit, rollback
+  __u8 origop; // link | rename | rmdir
+
+  EPeerUpdate() : LogEvent(EVENT_PEERUPDATE), leader(0), op(0), origop(0) { }
+  // `mdlog` is accepted but not used by this constructor.
+  EPeerUpdate(MDLog *mdlog, std::string_view s, metareqid_t ri, int leadermds, int o, int oo) :
+    LogEvent(EVENT_PEERUPDATE),
+    type(s),
+    reqid(ri),
+    leader(leadermds),
+    op(o), origop(oo) { }
+
+  // Summarise the event: optional type tag, numeric op, the name of the
+  // original operation, the request id, the leader rank and the commit blob.
+  // All three origop values are named; RMDIR used to be printed as nothing.
+  void print(ostream& out) const override {
+    if (type.length())
+      out << type << " ";
+    out << " " << (int)op;
+    if (origop == LINK) out << " link";
+    if (origop == RENAME) out << " rename";
+    if (origop == RMDIR) out << " rmdir";
+    out << " " << reqid;
+    out << " for mds." << leader;
+    out << commit;
+  }
+
+  EMetaBlob *get_metablob() override { return &commit; }
+
+  void encode(bufferlist& bl, uint64_t features) const override;
+  void decode(bufferlist::const_iterator& bl) override;
+  void dump(Formatter *f) const override;
+  static void generate_test_instances(std::list<EPeerUpdate*>& ls);
+
+  void replay(MDSRank *mds) override;
+};
+WRITE_CLASS_ENCODER_FEATURES(EPeerUpdate)
+
+#endif
diff --git a/src/mds/events/EPurged.h b/src/mds/events/EPurged.h
new file mode 100644
index 000000000..cda1b2ecf
--- /dev/null
+++ b/src/mds/events/EPurged.h
@@ -0,0 +1,46 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+
+#ifndef CEPH_MDS_EPURGE_H
+#define CEPH_MDS_EPURGE_H
+
+#include "common/config.h"
+#include "include/types.h"
+
+#include "../LogEvent.h"
+
+// Journal event of type EVENT_PURGED.  Carries a set of inode numbers, a log
+// segment sequence number and an inotable version.
+class EPurged : public LogEvent {
+public:
+  // Default state: empty ino set, seq 0, inotablev 0.
+  EPurged() : LogEvent(EVENT_PURGED) { }
+  EPurged(const interval_set<inodeno_t>& _inos, LogSegment::seq_t _seq, version_t iv)
+    : LogEvent(EVENT_PURGED), inos(_inos), seq(_seq), inotablev(iv) {
+  }
+  void encode(bufferlist& bl, uint64_t features) const override;
+  void decode(bufferlist::const_iterator& bl) override;
+  void dump(Formatter *f) const override;
+  // The event tag was previously misspelled "Eurged".
+  void print(ostream& out) const override {
+    out << "EPurged " << inos.size() << " inos, inotable v" << inotablev;
+  }
+
+  void update_segment() override;
+  void replay(MDSRank *mds) override;
+
+protected:
+  interval_set<inodeno_t> inos;
+  // Initialized in-class so a default-constructed event never exposes an
+  // indeterminate value (e.g. if dumped before decode()).
+  LogSegment::seq_t seq{0};
+  version_t inotablev{0};
+};
+WRITE_CLASS_ENCODER_FEATURES(EPurged)
+
+#endif // CEPH_MDS_EPURGE_H
diff --git a/src/mds/events/EResetJournal.h b/src/mds/events/EResetJournal.h
new file mode 100644
index 000000000..302227fc6
--- /dev/null
+++ b/src/mds/events/EResetJournal.h
@@ -0,0 +1,39 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software 
+ * Foundation.  See file COPYING.
+ * 
+ */
+
+
+#ifndef CEPH_MDS_ERESETJOURNAL_H
+#define CEPH_MDS_ERESETJOURNAL_H
+
+#include "../LogEvent.h"
+
+// Journal event of type EVENT_RESETJOURNAL.  It has no members of its own.
+class EResetJournal : public LogEvent {
+ public:
+  EResetJournal()
+    : LogEvent(EVENT_RESETJOURNAL)
+  {}
+  ~EResetJournal() override = default;
+
+  void encode(bufferlist& bl, uint64_t features) const override;
+  void decode(bufferlist::const_iterator& bl) override;
+  void dump(Formatter *f) const override;
+  static void generate_test_instances(std::list<EResetJournal*>& ls);
+
+  void print(ostream& out) const override
+  {
+    out << "EResetJournal";
+  }
+
+  void replay(MDSRank *mds) override;
+};
+WRITE_CLASS_ENCODER_FEATURES(EResetJournal)
+
+#endif
diff --git a/src/mds/events/ESession.h b/src/mds/events/ESession.h
new file mode 100644
index 000000000..c6586e3b5
--- /dev/null
+++ b/src/mds/events/ESession.h
@@ -0,0 +1,71 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software 
+ * Foundation.  See file COPYING.
+ * 
+ */
+
+#ifndef CEPH_MDS_ESESSION_H
+#define CEPH_MDS_ESESSION_H
+
+#include "common/config.h"
+#include "include/types.h"
+
+#include "../LogEvent.h"
+
+// Journal event of type EVENT_SESSION: one client session being opened or
+// closed, together with the client map version, optional sets of inode
+// numbers to free / purge, and the client's metadata.
+class ESession : public LogEvent {
+ protected:
+  entity_inst_t client_inst;
+  bool open;    // open or close
+  version_t cmapv{0};  // client map version
+
+  interval_set<inodeno_t> inos_to_free;
+  version_t inotablev{0};
+
+  interval_set<inodeno_t> inos_to_purge;
+
+  // Client metadata stored during open
+  client_metadata_t client_metadata;
+
+ public:
+  ESession()
+    : LogEvent(EVENT_SESSION),
+      open(false)
+  {}
+
+  // Variant carrying client metadata; the ino sets stay empty.
+  ESession(const entity_inst_t& inst, bool o, version_t v,
+           const client_metadata_t& cm)
+    : LogEvent(EVENT_SESSION),
+      client_inst(inst),
+      open(o),
+      cmapv(v),
+      inotablev(0),
+      client_metadata(cm)
+  {}
+
+  // Variant carrying inos to free / purge and the inotable version; the
+  // client metadata stays default-constructed.
+  ESession(const entity_inst_t& inst, bool o, version_t v,
+           const interval_set<inodeno_t>& to_free, version_t iv,
+           const interval_set<inodeno_t>& to_purge)
+    : LogEvent(EVENT_SESSION),
+      client_inst(inst),
+      open(o),
+      cmapv(v),
+      inos_to_free(to_free),
+      inotablev(iv),
+      inos_to_purge(to_purge)
+  {}
+
+  void encode(bufferlist& bl, uint64_t features) const override;
+  void decode(bufferlist::const_iterator& bl) override;
+  void dump(Formatter *f) const override;
+  static void generate_test_instances(std::list<ESession*>& ls);
+
+  // "ESession <inst> open|close cmapv <v>", followed by the free/purge
+  // counts only when at least one of the two sets is non-empty.
+  void print(ostream& out) const override
+  {
+    out << "ESession " << client_inst;
+    if (open)
+      out << " open cmapv ";
+    else
+      out << " close cmapv ";
+    out << cmapv;
+
+    const bool has_inos = inos_to_free.size() || inos_to_purge.size();
+    if (!has_inos)
+      return;
+
+    out << " (" << inos_to_free.size() << " to free, v" << inotablev;
+    out << ", " << inos_to_purge.size() << " to purge)";
+  }
+
+  void update_segment() override;
+  void replay(MDSRank *mds) override;
+
+  entity_inst_t get_client_inst() const
+  {
+    return client_inst;
+  }
+};
+WRITE_CLASS_ENCODER_FEATURES(ESession)
+
+#endif
diff --git a/src/mds/events/ESessions.h b/src/mds/events/ESessions.h
new file mode 100644
index 000000000..fad702120
--- /dev/null
+++ b/src/mds/events/ESessions.h
@@ -0,0 +1,61 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software 
+ * Foundation.  See file COPYING.
+ * 
+ */
+
+#ifndef CEPH_MDS_ESESSIONS_H
+#define CEPH_MDS_ESESSIONS_H
+
+#include "common/config.h"
+#include "include/types.h"
+
+#include "../LogEvent.h"
+
+// Journal event of type EVENT_SESSIONS: a client map (client -> entity
+// instance), a parallel client metadata map, and the client map version.
+// decode() picks between two wire formats based on old_style_encode.
+class ESessions : public LogEvent {
+protected:
+  version_t cmapv;  // client map version
+  bool old_style_encode;
+
+public:
+  map<client_t,entity_inst_t> client_map;
+  map<client_t,client_metadata_t> client_metadata_map;
+
+  ESessions()
+    : LogEvent(EVENT_SESSIONS),
+      cmapv(0),
+      old_style_encode(false)
+  {}
+
+  // Takes ownership of both maps by moving from them.
+  ESessions(version_t pv, map<client_t,entity_inst_t>&& cm,
+            map<client_t,client_metadata_t>&& cmm)
+    : LogEvent(EVENT_SESSIONS),
+      cmapv(pv),
+      old_style_encode(false),
+      client_map(std::move(cm)),
+      client_metadata_map(std::move(cmm))
+  {}
+
+  // Make the next decode() call use decode_old().
+  void mark_old_encoding()
+  {
+    old_style_encode = true;
+  }
+
+  void encode(bufferlist &bl, uint64_t features) const override;
+  void decode_old(bufferlist::const_iterator &bl);
+  void decode_new(bufferlist::const_iterator &bl);
+
+  void decode(bufferlist::const_iterator &bl) override
+  {
+    if (!old_style_encode) {
+      decode_new(bl);
+      return;
+    }
+    decode_old(bl);
+  }
+
+  void dump(Formatter *f) const override;
+  static void generate_test_instances(std::list<ESessions*>& ls);
+
+  void print(ostream& out) const override
+  {
+    out << "ESessions " << client_map.size();
+    out << " opens cmapv " << cmapv;
+  }
+
+  void update_segment() override;
+  void replay(MDSRank *mds) override;
+};
+WRITE_CLASS_ENCODER_FEATURES(ESessions)
+
+#endif
diff --git a/src/mds/events/ESubtreeMap.h b/src/mds/events/ESubtreeMap.h
new file mode 100644
index 000000000..3dc824087
--- /dev/null
+++ b/src/mds/events/ESubtreeMap.h
@@ -0,0 +1,48 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software 
+ * Foundation.  See file COPYING.
+ * 
+ */
+
+#ifndef CEPH_MDS_ESUBTREEMAP_H
+#define CEPH_MDS_ESUBTREEMAP_H
+
+#include "../LogEvent.h"
+#include "EMetaBlob.h"
+
+// Journal event of type EVENT_SUBTREEMAP: a metablob, a map from a dirfrag
+// to a list of dirfrags, a set of ambiguous subtrees, and two 64-bit
+// counters (expire_pos, event_seq) that start at zero.
+class ESubtreeMap : public LogEvent {
+public:
+  EMetaBlob metablob;
+  map<dirfrag_t, vector<dirfrag_t> > subtrees;
+  set<dirfrag_t> ambiguous_subtrees;
+  uint64_t expire_pos;
+  uint64_t event_seq;
+
+  ESubtreeMap()
+    : LogEvent(EVENT_SUBTREEMAP),
+      expire_pos(0),
+      event_seq(0)
+  {}
+
+  void print(ostream& out) const override
+  {
+    out << "ESubtreeMap " << subtrees.size() << " subtrees ";
+    out << ", " << ambiguous_subtrees.size() << " ambiguous ";
+    out << metablob;
+  }
+
+  EMetaBlob *get_metablob() override
+  {
+    return &metablob;
+  }
+
+  void encode(bufferlist& bl, uint64_t features) const override;
+  void decode(bufferlist::const_iterator& bl) override;
+  void dump(Formatter *f) const override;
+  static void generate_test_instances(std::list<ESubtreeMap*>& ls);
+
+  void replay(MDSRank *mds) override;
+};
+WRITE_CLASS_ENCODER_FEATURES(ESubtreeMap)
+
+#endif
diff --git a/src/mds/events/ETableClient.h b/src/mds/events/ETableClient.h
new file mode 100644
index 000000000..3f6e454e6
--- /dev/null
+++ b/src/mds/events/ETableClient.h
@@ -0,0 +1,49 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software 
+ * Foundation.  See file COPYING.
+ * 
+ */
+
+#ifndef CEPH_MDS_ETABLECLIENT_H
+#define CEPH_MDS_ETABLECLIENT_H
+
+#include "common/config.h"
+#include "include/types.h"
+
+#include "../mds_table_types.h"
+#include "../LogEvent.h"
+
+// Journal event of type EVENT_TABLECLIENT: a table id, an operation code and
+// a transaction id.
+struct ETableClient : public LogEvent {
+  __u16 table;
+  __s16 op;
+  version_t tid;
+
+  ETableClient()
+    : LogEvent(EVENT_TABLECLIENT),
+      table(0), op(0), tid(0)
+  {}
+  ETableClient(int t, int o, version_t ti)
+    : LogEvent(EVENT_TABLECLIENT),
+      table(t), op(o), tid(ti)
+  {}
+
+  void encode(bufferlist& bl, uint64_t features) const override;
+  void decode(bufferlist::const_iterator& bl) override;
+  void dump(Formatter *f) const override;
+  static void generate_test_instances(std::list<ETableClient*>& ls);
+
+  // Table name and op name, followed by the tid when it is non-zero.
+  void print(ostream& out) const override
+  {
+    out << "ETableClient " << get_mdstable_name(table);
+    out << " " << get_mdstableserver_opname(op);
+    if (tid)
+      out << " tid " << tid;
+  }
+
+  //void update_segment();
+  void replay(MDSRank *mds) override;
+};
+WRITE_CLASS_ENCODER_FEATURES(ETableClient)
+
+#endif
diff --git a/src/mds/events/ETableServer.h b/src/mds/events/ETableServer.h
new file mode 100644
index 000000000..59fe8ff1c
--- /dev/null
+++ b/src/mds/events/ETableServer.h
@@ -0,0 +1,59 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software 
+ * Foundation.  See file COPYING.
+ * 
+ */
+
+#ifndef CEPH_MDS_ETABLESERVER_H
+#define CEPH_MDS_ETABLESERVER_H
+
+#include "common/config.h"
+#include "include/types.h"
+
+#include "../mds_table_types.h"
+#include "../LogEvent.h"
+
+// Journal event of type EVENT_TABLESERVER: table id, operation code, request
+// id, the rank the request came from (bymds), an opaque mutation payload,
+// a transaction id and a table version.
+struct ETableServer : public LogEvent {
+  __u16 table;
+  __s16 op;
+  uint64_t reqid;
+  mds_rank_t bymds;
+  bufferlist mutation;
+  version_t tid;
+  version_t version;
+
+  ETableServer()
+    : LogEvent(EVENT_TABLESERVER),
+      table(0), op(0), reqid(0), bymds(MDS_RANK_NONE), tid(0), version(0)
+  {}
+  ETableServer(int t, int o, uint64_t ri, mds_rank_t m, version_t ti, version_t v)
+    : LogEvent(EVENT_TABLESERVER),
+      table(t), op(o), reqid(ri), bymds(m), tid(ti), version(v)
+  {}
+
+  void encode(bufferlist& bl, uint64_t features) const override;
+  void decode(bufferlist::const_iterator& bl) override;
+  void dump(Formatter *f) const override;
+  static void generate_test_instances(std::list<ETableServer*>& ls);
+
+  // Table name and op name, then each optional field only when it is set
+  // (non-zero, non-negative rank, non-empty mutation).
+  void print(ostream& out) const override
+  {
+    out << "ETableServer " << get_mdstable_name(table);
+    out << " " << get_mdstableserver_opname(op);
+    if (reqid)
+      out << " reqid " << reqid;
+    if (bymds >= 0)
+      out << " mds." << bymds;
+    if (tid)
+      out << " tid " << tid;
+    if (version)
+      out << " version " << version;
+    if (mutation.length())
+      out << " mutation=" << mutation.length() << " bytes";
+  }
+
+  void update_segment() override;
+  void replay(MDSRank *mds) override;
+};
+WRITE_CLASS_ENCODER_FEATURES(ETableServer)
+
+#endif
diff --git a/src/mds/events/EUpdate.h b/src/mds/events/EUpdate.h
new file mode 100644
index 000000000..d320014a1
--- /dev/null
+++ b/src/mds/events/EUpdate.h
@@ -0,0 +1,55 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software 
+ * Foundation.  See file COPYING.
+ * 
+ */
+
+#ifndef CEPH_MDS_EUPDATE_H
+#define CEPH_MDS_EUPDATE_H
+
+#include <string_view>
+
+#include "../LogEvent.h"
+#include "EMetaBlob.h"
+
+// Journal event of type EVENT_UPDATE: a metablob plus a free-form type tag,
+// an encoded client map with its version, the originating request id and a
+// had_peers flag.
+class EUpdate : public LogEvent {
+public:
+  EMetaBlob metablob;
+  string type;
+  bufferlist client_map;
+  version_t cmapv;
+  metareqid_t reqid;
+  bool had_peers;
+
+  EUpdate()
+    : LogEvent(EVENT_UPDATE),
+      cmapv(0),
+      had_peers(false)
+  {}
+  // `mdlog` is accepted but not used by this constructor.
+  EUpdate(MDLog *mdlog, std::string_view s)
+    : LogEvent(EVENT_UPDATE),
+      type(s),
+      cmapv(0),
+      had_peers(false)
+  {}
+
+  // The "EUpdate <type> " prefix is only written when a type tag is set; the
+  // metablob is always written.
+  void print(ostream& out) const override
+  {
+    if (!type.empty())
+      out << "EUpdate " << type << " ";
+    out << metablob;
+  }
+
+  EMetaBlob *get_metablob() override
+  {
+    return &metablob;
+  }
+
+  void encode(bufferlist& bl, uint64_t features) const override;
+  void decode(bufferlist::const_iterator& bl) override;
+  void dump(Formatter *f) const override;
+  static void generate_test_instances(std::list<EUpdate*>& ls);
+
+  void update_segment() override;
+  void replay(MDSRank *mds) override;
+};
+WRITE_CLASS_ENCODER_FEATURES(EUpdate)
+
+#endif
diff --git a/src/mds/flock.cc b/src/mds/flock.cc
new file mode 100644
index 000000000..69d579d30
--- /dev/null
+++ b/src/mds/flock.cc
@@ -0,0 +1,600 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+#include <errno.h>
+
+#include "common/debug.h"
+#include "mdstypes.h"
+#include "mds/flock.h"
+
+#define dout_subsys ceph_subsys_mds
+
+using std::list;
+using std::pair;
+using std::multimap;
+
+// File-scope index of waiting locks.  add_waiting() inserts an entry here
+// (waiting lock -> owning ceph_lock_state_t) whenever the lock state's type
+// is CEPH_LOCK_FCNTL; is_deadlock() scans it to find what a conflicting
+// lock's owner is itself waiting for.
+static multimap<ceph_filelock, ceph_lock_state_t*> global_waiting_locks;
+
+/**
+ * Remove from global_waiting_locks the one entry that maps @p fl to
+ * @p lock_state, if such an entry exists.
+ *
+ * @param fl         the waiting lock used as the key
+ * @param lock_state the lock state that registered the waiting lock
+ */
+static void remove_global_waiting(ceph_filelock &fl, ceph_lock_state_t *lock_state)
+{
+  // multimap::find() may return *any* element with an equivalent key, so a
+  // forward scan starting there could skip the entry we are looking for.
+  // lower_bound() always yields the first element of the equal-key run.
+  for (auto p = global_waiting_locks.lower_bound(fl);
+       p != global_waiting_locks.end();
+       ++p) {
+    if (p->first != fl)
+      break;  // walked past the run of keys equal to fl
+    if (p->second == lock_state) {
+      global_waiting_locks.erase(p);
+      break;
+    }
+  }
+}
+
+// On destruction, withdraw every waiting lock of an fcntl-type state from
+// the file-scope global_waiting_locks index.
+ceph_lock_state_t::~ceph_lock_state_t()
+{
+  if (type != CEPH_LOCK_FCNTL)
+    return;
+
+  for (auto& waiter : waiting_locks)
+    remove_global_waiting(waiter.second, this);
+}
+
+/**
+ * Check whether a lock matching @p fl is queued in waiting_locks.
+ *
+ * A waiting lock matches when it has the same start, the same length and
+ * the same owner (per ceph_filelock_owner_equal).
+ *
+ * @param fl the lock to look for
+ * @returns true if a matching waiting lock exists, false otherwise
+ */
+bool ceph_lock_state_t::is_waiting(const ceph_filelock &fl) const
+{
+  // Start at the first entry keyed at fl.start: multimap::find() is allowed
+  // to return any of several equal-keyed entries, which could hide a match
+  // from a forward scan.
+  auto p = waiting_locks.lower_bound(fl.start);
+  while (p != waiting_locks.end()) {
+    if (p->second.start > fl.start)
+      return false;  // past all candidates starting at fl.start
+    if (p->second.length == fl.length &&
+        ceph_filelock_owner_equal(p->second, fl))
+      return true;
+    ++p;
+  }
+  return false;
+}
+
+/**
+ * Remove the first waiting lock that matches @p fl (same start, length and
+ * owner) and update the per-client waiting counter accordingly.
+ *
+ * For fcntl-type states the corresponding global_waiting_locks entry is
+ * removed as well.  Does nothing if no matching waiting lock exists.
+ *
+ * @param fl the lock to remove from the waiting list
+ */
+void ceph_lock_state_t::remove_waiting(const ceph_filelock& fl)
+{
+  // lower_bound() rather than find(): find() may return any element of an
+  // equal-key run, so scanning forward from it could miss the match.
+  for (auto p = waiting_locks.lower_bound(fl.start);
+       p != waiting_locks.end();
+       ++p) {
+    if (p->second.start > fl.start)
+      break;
+    if (p->second.length == fl.length &&
+        ceph_filelock_owner_equal(p->second, fl)) {
+      if (type == CEPH_LOCK_FCNTL) {
+        remove_global_waiting(p->second, this);
+      }
+      waiting_locks.erase(p);
+      // drop the client's counter entry once it reaches zero
+      --client_waiting_lock_counts[(client_t)fl.client];
+      if (!client_waiting_lock_counts[(client_t)fl.client]) {
+        client_waiting_lock_counts.erase((client_t)fl.client);
+      }
+      break;
+    }
+  }
+}
+
+/**
+ * Decide whether queueing @p fl behind @p overlapping_locks would close a
+ * wait cycle back to the owner of the lock that started the check.
+ *
+ * Always returns false unless this state's type is CEPH_LOCK_FCNTL.  The
+ * search follows "owner of a conflicting lock -> what that owner is waiting
+ * for" edges through global_waiting_locks and stops recursing once @p depth
+ * reaches MAX_DEADLK_DEPTH.
+ *
+ * @param fl                the lock that is (or would be) blocked
+ * @param overlapping_locks held locks that overlap @p fl
+ * @param first_fl          the lock the outermost call was made for; null
+ *                          on the outermost call
+ * @param depth             current recursion depth
+ * @returns true if a conflicting lock is owned by first_fl's owner
+ */
+bool ceph_lock_state_t::is_deadlock(const ceph_filelock& fl,
+				    list<multimap<uint64_t, ceph_filelock>::iterator>&
+				      overlapping_locks,
+				    const ceph_filelock *first_fl, unsigned depth) const
+{
+  ldout(cct,15) << "is_deadlock " << fl << dendl;
+
+  // only for posix lock
+  if (type != CEPH_LOCK_FCNTL)
+    return false;
+
+  // find conflict locks' owners
+  std::set<ceph_filelock> lock_owners;
+  for (auto p = overlapping_locks.begin();
+       p != overlapping_locks.end();
+       ++p) {
+
+    // two shared locks do not conflict with each other
+    if (fl.type == CEPH_LOCK_SHARED &&
+	(*p)->second.type == CEPH_LOCK_SHARED)
+      continue;
+
+    // circle detected
+    if (first_fl && ceph_filelock_owner_equal(*first_fl, (*p)->second)) {
+      ldout(cct,15) << " detect deadlock" << dendl;
+      return true;
+    }
+
+    // keep only the owner identity; zeroed start/length/type collapse all
+    // locks of one owner into a single set element
+    ceph_filelock tmp = (*p)->second;
+    tmp.start = 0;
+    tmp.length = 0;
+    tmp.type = 0;
+    lock_owners.insert(tmp);
+  }
+
+  // bound the recursion
+  if (depth >= MAX_DEADLK_DEPTH)
+    return false;
+
+  first_fl = first_fl ? first_fl : &fl;
+  for (auto p = lock_owners.begin();
+       p != lock_owners.end();
+       ++p) {
+    ldout(cct,15) << " conflict lock owner " << *p << dendl;
+    // is the conflicting lock's owner itself waiting for some other lock?
+    for (auto q = global_waiting_locks.lower_bound(*p);
+	 q != global_waiting_locks.end();
+	 ++q) {
+      if (!ceph_filelock_owner_equal(q->first, *p))
+	break;
+
+      // locks in the waiter's own lock state that overlap the waiting lock,
+      // minus those held by the waiter itself
+      list<multimap<uint64_t, ceph_filelock>::iterator>
+	_overlapping_locks, _self_overlapping_locks;
+      ceph_lock_state_t& state = *(q->second);
+      if (state.get_overlapping_locks(q->first, _overlapping_locks)) {
+	state.split_by_owner(q->first, _overlapping_locks, _self_overlapping_locks);
+      }
+      if (!_overlapping_locks.empty()) {
+	if (is_deadlock(q->first, _overlapping_locks, first_fl, depth + 1))
+	  return true;
+      }
+    }
+  }
+  return false;
+}
+
+/**
+ * Queue @p fl as a waiting lock: record it in waiting_locks (keyed by its
+ * start offset), bump the client's waiting counter and, for fcntl-type
+ * states, register it in global_waiting_locks.
+ *
+ * @param fl the lock to queue
+ */
+void ceph_lock_state_t::add_waiting(const ceph_filelock& fl)
+{
+  const pair<uint64_t, ceph_filelock> local_entry(fl.start, fl);
+  waiting_locks.insert(local_entry);
+
+  ++client_waiting_lock_counts[(client_t)fl.client];
+
+  if (CEPH_LOCK_FCNTL != type)
+    return;
+  const pair<ceph_filelock, ceph_lock_state_t*> global_entry(fl, this);
+  global_waiting_locks.insert(global_entry);
+}
+
+/**
+ * Try to acquire @p new_lock.
+ *
+ * Overlapping held locks are split into those of other owners (which may
+ * block the request) and those of the same owner (which are merged with or
+ * trimmed around the new lock by adjust_locks()).  On success the lock is
+ * inserted into held_locks, any matching waiting entry is removed, and the
+ * client's held-lock counter is incremented.
+ *
+ * @param new_lock     the lock to set; may be modified by adjust_locks()
+ * @param wait_on_fail when blocked, queue the lock via add_waiting()
+ *                     (unless replaying or a deadlock is detected)
+ * @param replay       when true, a blocked lock is never queued and no
+ *                     deadlock check is made
+ * @param deadlock     set to true when is_deadlock() reports a cycle; it is
+ *                     dereferenced without a null check on the blocked,
+ *                     wait_on_fail && !replay path
+ * @returns true if the lock was set, false if it is blocked
+ */
+bool ceph_lock_state_t::add_lock(ceph_filelock& new_lock,
+                                 bool wait_on_fail, bool replay,
+				 bool *deadlock)
+{
+  ldout(cct,15) << "add_lock " << new_lock << dendl;
+  bool ret = false;
+  list<multimap<uint64_t, ceph_filelock>::iterator>
+    overlapping_locks, self_overlapping_locks, neighbor_locks;
+
+  // first, get any overlapping locks and split them into owned-by-us and not
+  if (get_overlapping_locks(new_lock, overlapping_locks, &neighbor_locks)) {
+    ldout(cct,15) << "got overlapping lock, splitting by owner" << dendl;
+    split_by_owner(new_lock, overlapping_locks, self_overlapping_locks);
+  }
+  if (!overlapping_locks.empty()) { //overlapping locks owned by others :(
+    if (CEPH_LOCK_EXCL == new_lock.type) {
+      //can't set, we want an exclusive
+      ldout(cct,15) << "overlapping lock, and this lock is exclusive, can't set"
+              << dendl;
+      if (wait_on_fail && !replay) {
+	if (is_deadlock(new_lock, overlapping_locks))
+	  *deadlock = true;
+	else
+	  add_waiting(new_lock);
+      }
+    } else { //shared lock, check for any exclusive locks blocking us
+      if (contains_exclusive_lock(overlapping_locks)) { //blocked :(
+        ldout(cct,15) << " blocked by exclusive lock in overlapping_locks" << dendl;
+	if (wait_on_fail && !replay) {
+	  if (is_deadlock(new_lock, overlapping_locks))
+	    *deadlock = true;
+	  else
+	    add_waiting(new_lock);
+	}
+      } else {
+        //yay, we can insert a shared lock
+        ldout(cct,15) << "inserting shared lock" << dendl;
+        // remove_waiting() must see new_lock before adjust_locks() rewrites
+        // its start/length
+        remove_waiting(new_lock);
+        adjust_locks(self_overlapping_locks, new_lock, neighbor_locks);
+        held_locks.insert(pair<uint64_t, ceph_filelock>(new_lock.start, new_lock));
+        ret = true;
+      }
+    }
+  } else { //no overlapping locks except our own
+    remove_waiting(new_lock);
+    adjust_locks(self_overlapping_locks, new_lock, neighbor_locks);
+    ldout(cct,15) << "no conflicts, inserting " << new_lock << dendl;
+    held_locks.insert(pair<uint64_t, ceph_filelock>
+                      (new_lock.start, new_lock));
+    ret = true;
+  }
+  if (ret) {
+    // account for the lock just inserted into held_locks
+    ++client_held_lock_counts[(client_t)new_lock.client];
+  }
+  return ret;
+}
+
+/**
+ * Test whether @p testing_lock would be blocked by a lock of another owner.
+ *
+ * If it would, @p testing_lock is overwritten with a blocking lock (the
+ * first overlapping one for an exclusive request, otherwise the first
+ * exclusive overlapping one).  If nothing blocks it, only its type field is
+ * changed, to CEPH_LOCK_UNLOCK.
+ *
+ * @param testing_lock in: the lock to test; out: the blocker or "unlocked"
+ */
+void ceph_lock_state_t::look_for_lock(ceph_filelock& testing_lock)
+{
+  list<multimap<uint64_t, ceph_filelock>::iterator> foreign_locks, own_locks;
+  if (get_overlapping_locks(testing_lock, foreign_locks))
+    split_by_owner(testing_lock, foreign_locks, own_locks);
+
+  // only our own locks (or none at all) overlap: nothing blocks
+  if (foreign_locks.empty()) {
+    testing_lock.type = CEPH_LOCK_UNLOCK;
+    return;
+  }
+
+  // an exclusive request is blocked by any foreign overlapping lock
+  if (CEPH_LOCK_EXCL == testing_lock.type) {
+    testing_lock = foreign_locks.front()->second;
+    return;
+  }
+
+  // a shared request is blocked only by an exclusive lock
+  ceph_filelock *blocker = contains_exclusive_lock(foreign_locks);
+  if (blocker)
+    testing_lock = *blocker;
+  else
+    testing_lock.type = CEPH_LOCK_UNLOCK;
+}
+
+/**
+ * Release the byte range described by @p removal_lock from the locks held
+ * by the same owner.
+ *
+ * Each overlapping lock of that owner is erased, shortened, or split in two
+ * (a shortened head plus a newly inserted tail) so that the removed range
+ * is no longer covered.  client_held_lock_counts is kept in step with the
+ * entries inserted into and erased from held_locks.
+ *
+ * @param removal_lock    range and owner to unlock; length 0 means "from
+ *                        start to end of file"
+ * @param activated_locks not read or written by this implementation
+ */
+void ceph_lock_state_t::remove_lock(ceph_filelock removal_lock,
+                 list<ceph_filelock>& activated_locks)
+{
+  list<multimap<uint64_t, ceph_filelock>::iterator> overlapping_locks,
+    self_overlapping_locks;
+  if (get_overlapping_locks(removal_lock, overlapping_locks)) {
+    ldout(cct,15) << "splitting by owner" << dendl;
+    split_by_owner(removal_lock, overlapping_locks, self_overlapping_locks);
+  } else ldout(cct,15) << "attempt to remove lock at " << removal_lock.start
+                 << " but no locks there!" << dendl;
+  bool remove_to_end = (0 == removal_lock.length);
+  uint64_t removal_start = removal_lock.start;
+  // only meaningful (and only used) when !remove_to_end
+  uint64_t removal_end = removal_start + removal_lock.length - 1;
+  __s64 old_lock_client = 0;
+  ceph_filelock *old_lock;
+
+  ldout(cct,15) << "examining " << self_overlapping_locks.size()
+          << " self-overlapping locks for removal" << dendl;
+  for (list<multimap<uint64_t, ceph_filelock>::iterator>::iterator
+         iter = self_overlapping_locks.begin();
+       iter != self_overlapping_locks.end();
+       ++iter) {
+    ldout(cct,15) << "self overlapping lock " << (*iter)->second << dendl;
+    old_lock = &(*iter)->second;
+    bool old_lock_to_end = (0 == old_lock->length);
+    // only meaningful when !old_lock_to_end
+    uint64_t old_lock_end = old_lock->start + old_lock->length - 1;
+    // cached because old_lock dangles once its entry is erased below
+    old_lock_client = old_lock->client;
+    if (remove_to_end) {
+      // removal runs to EOF: keep only the part before removal_start
+      if (old_lock->start < removal_start) {
+        old_lock->length = removal_start - old_lock->start;
+      } else {
+        ldout(cct,15) << "erasing " << (*iter)->second << dendl;
+        held_locks.erase(*iter);
+        --client_held_lock_counts[old_lock_client];
+      }
+    } else if (old_lock_to_end) {
+      // old lock runs to EOF: a tail after the removed range always remains
+      ceph_filelock append_lock = *old_lock;
+      append_lock.start = removal_end+1;
+      held_locks.insert(pair<uint64_t, ceph_filelock>
+                        (append_lock.start, append_lock));
+      ++client_held_lock_counts[(client_t)old_lock->client];
+      if (old_lock->start >= removal_start) {
+        ldout(cct,15) << "erasing " << (*iter)->second << dendl;
+        held_locks.erase(*iter);
+        --client_held_lock_counts[old_lock_client];
+      } else old_lock->length = removal_start - old_lock->start;
+    } else {
+      // both ranges are finite
+      if (old_lock_end  > removal_end) {
+        // keep the tail that extends past the removed range
+        ceph_filelock append_lock = *old_lock;
+        append_lock.start = removal_end + 1;
+        append_lock.length = old_lock_end - append_lock.start + 1;
+        held_locks.insert(pair<uint64_t, ceph_filelock>
+                          (append_lock.start, append_lock));
+        ++client_held_lock_counts[(client_t)old_lock->client];
+      }
+      if (old_lock->start < removal_start) {
+        old_lock->length = removal_start - old_lock->start;
+      } else {
+        ldout(cct,15) << "erasing " << (*iter)->second << dendl;
+        held_locks.erase(*iter);
+        --client_held_lock_counts[old_lock_client];
+      }
+    }
+    // drop the counter entry once the client holds no more locks
+    if (!client_held_lock_counts[old_lock_client]) {
+      client_held_lock_counts.erase(old_lock_client);
+    }
+  }
+}
+
+/**
+ * Drop every held and waiting lock that belongs to @p client.
+ *
+ * @param client the client whose locks are purged
+ * @returns true if the client had an entry in client_held_lock_counts
+ *          (i.e. held locks were cleared); removing only waiting locks
+ *          leaves the result false
+ */
+bool ceph_lock_state_t::remove_all_from (client_t client)
+{
+  bool cleared_any = false;
+
+  if (client_held_lock_counts.count(client)) {
+    for (auto held = held_locks.begin(); held != held_locks.end(); ) {
+      if ((client_t)held->second.client == client)
+        held = held_locks.erase(held);
+      else
+        ++held;
+    }
+    client_held_lock_counts.erase(client);
+    cleared_any = true;
+  }
+
+  if (client_waiting_lock_counts.count(client)) {
+    for (auto waiter = waiting_locks.begin(); waiter != waiting_locks.end(); ) {
+      if ((client_t)waiter->second.client == client) {
+        // fcntl waiters are also tracked in the file-scope index
+        if (type == CEPH_LOCK_FCNTL)
+          remove_global_waiting(waiter->second, this);
+        waiter = waiting_locks.erase(waiter);
+      } else {
+        ++waiter;
+      }
+    }
+    client_waiting_lock_counts.erase(client);
+  }
+
+  return cleared_any;
+}
+
+/**
+ * Reconcile the locks already held by new_lock's owner with @p new_lock,
+ * before the caller inserts new_lock into held_locks.
+ *
+ * Overlapping locks of the same type are absorbed into new_lock (its start
+ * and length grow to cover them) and erased.  Overlapping locks of a
+ * different type are trimmed to the part before new_lock, with a tail lock
+ * inserted for any part after it, or erased when nothing remains.
+ * Same-type neighbor locks are coalesced into new_lock.
+ * client_held_lock_counts is updated for every insert/erase done here.
+ *
+ * A length of 0 means "to end of file" for either lock.
+ *
+ * @param old_locks      overlapping locks held by the same owner; the
+ *                       iterators of erased locks are left dangling
+ * @param new_lock       the lock being set; modified in place
+ * @param neighbor_locks same-owner locks adjacent to new_lock
+ */
+void ceph_lock_state_t::adjust_locks(list<multimap<uint64_t, ceph_filelock>::iterator> old_locks,
+                  ceph_filelock& new_lock,
+                  list<multimap<uint64_t, ceph_filelock>::iterator>
+                  neighbor_locks)
+{
+  ldout(cct,15) << "adjust_locks" << dendl;
+  bool new_lock_to_end = (0 == new_lock.length);
+  __s64 old_lock_client = 0;
+  ceph_filelock *old_lock;
+  for (list<multimap<uint64_t, ceph_filelock>::iterator>::iterator
+         iter = old_locks.begin();
+       iter != old_locks.end();
+       ++iter) {
+    old_lock = &(*iter)->second;
+    ldout(cct,15) << "adjusting lock: " << *old_lock << dendl;
+    bool old_lock_to_end = (0 == old_lock->length);
+    uint64_t old_lock_start = old_lock->start;
+    uint64_t old_lock_end = old_lock->start + old_lock->length - 1;
+    // recomputed every pass: new_lock may have grown in an earlier iteration
+    uint64_t new_lock_start = new_lock.start;
+    uint64_t new_lock_end = new_lock.start + new_lock.length - 1;
+    // cached because old_lock dangles once its entry is erased below
+    old_lock_client = old_lock->client;
+    if (new_lock_to_end || old_lock_to_end) {
+      //special code path to deal with a length set at 0
+      ldout(cct,15) << "one lock extends forever" << dendl;
+      if (old_lock->type == new_lock.type) {
+        //just unify them in new lock, remove old lock
+        ldout(cct,15) << "same lock type, unifying" << dendl;
+        new_lock.start = (new_lock_start < old_lock_start) ? new_lock_start :
+          old_lock_start;
+        new_lock.length = 0;
+        held_locks.erase(*iter);
+        --client_held_lock_counts[old_lock_client];
+      } else { //not same type, have to keep any remains of old lock around
+        ldout(cct,15) << "shrinking old lock" << dendl;
+        if (new_lock_to_end) {
+          if (old_lock_start < new_lock_start) {
+            old_lock->length = new_lock_start - old_lock_start;
+          } else {
+            held_locks.erase(*iter);
+            --client_held_lock_counts[old_lock_client];
+          }
+        } else { //old lock extends past end of new lock
+          ceph_filelock appended_lock = *old_lock;
+          appended_lock.start = new_lock_end + 1;
+          held_locks.insert(pair<uint64_t, ceph_filelock>
+                            (appended_lock.start, appended_lock));
+          ++client_held_lock_counts[(client_t)old_lock->client];
+          if (old_lock_start < new_lock_start) {
+            old_lock->length = new_lock_start - old_lock_start;
+          } else {
+            held_locks.erase(*iter);
+            --client_held_lock_counts[old_lock_client];
+          }
+        }
+      }
+    } else {
+      if (old_lock->type == new_lock.type) { //just merge them!
+        ldout(cct,15) << "merging locks, they're the same type" << dendl;
+        new_lock.start = (old_lock_start < new_lock_start ) ? old_lock_start :
+          new_lock_start;
+        // must be 64-bit: file offsets do not fit in an int, and a truncated
+        // end would yield a bogus merged length
+        uint64_t new_end = (new_lock_end > old_lock_end) ? new_lock_end :
+          old_lock_end;
+        new_lock.length = new_end - new_lock.start + 1;
+        ldout(cct,15) << "erasing lock " << (*iter)->second << dendl;
+        held_locks.erase(*iter);
+        --client_held_lock_counts[old_lock_client];
+      } else { //we'll have to update sizes and maybe make new locks
+        ldout(cct,15) << "locks aren't same type, changing sizes" << dendl;
+        if (old_lock_end > new_lock_end) { //add extra lock after new_lock
+          ceph_filelock appended_lock = *old_lock;
+          appended_lock.start = new_lock_end + 1;
+          appended_lock.length = old_lock_end - appended_lock.start + 1;
+          held_locks.insert(pair<uint64_t, ceph_filelock>
+                            (appended_lock.start, appended_lock));
+          ++client_held_lock_counts[(client_t)old_lock->client];
+        }
+        if (old_lock_start < new_lock_start) {
+          old_lock->length = new_lock_start - old_lock_start;
+        } else { //old_lock starts inside new_lock, so remove it
+          //if it extended past new_lock_end it's been replaced
+          held_locks.erase(*iter);
+          --client_held_lock_counts[old_lock_client];
+        }
+      }
+    }
+    if (!client_held_lock_counts[old_lock_client]) {
+      client_held_lock_counts.erase(old_lock_client);
+    }
+  }
+
+  //make sure to coalesce neighboring locks
+  for (list<multimap<uint64_t, ceph_filelock>::iterator>::iterator
+         iter = neighbor_locks.begin();
+       iter != neighbor_locks.end();
+       ++iter) {
+    old_lock = &(*iter)->second;
+    old_lock_client = old_lock->client;
+    ldout(cct,15) << "lock to coalesce: " << *old_lock << dendl;
+    /* because if it's a neighboring lock there can't be any self-overlapping
+       locks that covered it */
+    if (old_lock->type == new_lock.type) { //merge them
+      if (0 == new_lock.length) {
+        if (old_lock->start + old_lock->length == new_lock.start) {
+          new_lock.start = old_lock->start;
+        } else ceph_abort(); /* if there's no end to new_lock, the neighbor
+                             HAS TO be to left side */
+      } else if (0 == old_lock->length) {
+        if (new_lock.start + new_lock.length == old_lock->start) {
+          new_lock.length = 0;
+        } else ceph_abort(); //same as before, but reversed
+      } else {
+        if (old_lock->start + old_lock->length == new_lock.start) {
+          new_lock.start = old_lock->start;
+          new_lock.length = old_lock->length + new_lock.length;
+        } else if (new_lock.start + new_lock.length == old_lock->start) {
+          new_lock.length = old_lock->length + new_lock.length;
+        }
+      }
+      held_locks.erase(*iter);
+      --client_held_lock_counts[old_lock_client];
+    }
+    if (!client_held_lock_counts[old_lock_client]) {
+      client_held_lock_counts.erase(old_lock_client);
+    }
+  }
+}
+
+/**
+ * Find the lock at which a scan for offset @p start should begin: the first
+ * lock keyed exactly at @p start or, failing that, the lock immediately
+ * preceding that position (when one exists and @p start is non-zero).
+ *
+ * @param start    offset to search for
+ * @param lock_map map of locks keyed by start offset
+ * @returns iterator into @p lock_map, or end() if no suitable lock exists
+ */
+multimap<uint64_t, ceph_filelock>::iterator
+ceph_lock_state_t::get_lower_bound(uint64_t start,
+                                   multimap<uint64_t, ceph_filelock>& lock_map)
+{
+  multimap<uint64_t, ceph_filelock>::iterator lower_bound =
+    lock_map.lower_bound(start);
+  // end() must be tested before dereferencing: every key may be < start
+  const bool exact_match =
+    (lower_bound != lock_map.end()) && (lower_bound->first == start);
+  if (!exact_match
+      && (start != 0)
+      && (lower_bound != lock_map.begin())) --lower_bound;
+  if (lock_map.end() == lower_bound)
+    ldout(cct,15) << "get_lower_bound returning end()" << dendl;
+  else ldout(cct,15) << "get_lower_bound returning iterator pointing to "
+               << lower_bound->second << dendl;
+  return lower_bound;
+}
+
+/**
+ * Return the entry just before the first lock keyed after @p end.
+ *
+ * When no lock is keyed at or below @p end, upper_bound() is begin() and is
+ * returned as-is; for an empty map that is end().
+ *
+ * @param end      offset to search up to (inclusive)
+ * @param lock_map map of locks keyed by start offset
+ * @returns iterator into @p lock_map as described above
+ */
+multimap<uint64_t, ceph_filelock>::iterator
+ceph_lock_state_t::get_last_before(uint64_t end,
+                                   multimap<uint64_t, ceph_filelock>& lock_map)
+{
+  auto candidate = lock_map.upper_bound(end);
+  if (lock_map.begin() != candidate) {
+    --candidate;
+  }
+
+  if (candidate == lock_map.end()) {
+    ldout(cct,15) << "get_last_before returning end()" << dendl;
+  } else {
+    ldout(cct,15) << "get_last_before returning iterator pointing to "
+                  << candidate->second << dendl;
+  }
+  return candidate;
+}
+
+/**
+ * Tell whether the lock at @p iter covers any byte of [start, end].
+ *
+ * A lock keyed inside the range always overlaps.  A lock keyed before the
+ * range overlaps if it is unbounded (length 0) or its last byte reaches
+ * @p start.
+ *
+ * @param iter  the held/waiting lock to test
+ * @param start first byte of the range
+ * @param end   last byte of the range
+ * @returns true if the lock and the range share at least one byte
+ */
+bool ceph_lock_state_t::share_space(
+    multimap<uint64_t, ceph_filelock>::iterator& iter,
+    uint64_t start, uint64_t end)
+{
+  bool ret;
+  if (iter->first >= start) {
+    // lock begins at or after the range start: overlaps iff it begins by end
+    ret = (iter->first <= end);
+  } else {
+    // lock begins before the range: overlaps iff it extends up to start
+    ret = (((iter->first + iter->second.length - 1) >= start) ||
+           (0 == iter->second.length));
+  }
+  ldout(cct,15) << "share_space got start: " << start << ", end: " << end
+          << ", lock: " << iter->second << ", returning " << ret << dendl;
+  return ret;
+}
+
+/**
+ * Collect the held locks whose range overlaps @p lock and, optionally, the
+ * same-owner locks that merely touch it.
+ *
+ * held_locks is walked backwards from the last lock keyed at or before the
+ * byte just past @p lock (or the maximum offset when the length is 0).
+ * Results are pushed to the front of the lists, so they end up ordered by
+ * increasing start offset.
+ *
+ * @param lock           the range (and owner, for neighbors) to compare with
+ * @param overlaps       receives iterators to the overlapping locks
+ * @param self_neighbors if non-null, receives same-owner locks that do not
+ *                       overlap @p lock but do overlap it once it is widened
+ *                       by one byte on each side
+ * @returns true if at least one overlapping lock was found
+ */
+bool ceph_lock_state_t::get_overlapping_locks(const ceph_filelock& lock,
+                           list<multimap<uint64_t,
+                               ceph_filelock>::iterator> & overlaps,
+                           list<multimap<uint64_t,
+                               ceph_filelock>::iterator> *self_neighbors)
+{
+  ldout(cct,15) << "get_overlapping_locks" << dendl;
+  // create a lock starting one earlier and ending one later
+  // to check for neighbors
+  ceph_filelock neighbor_check_lock = lock;
+  if (neighbor_check_lock.start != 0) {
+    neighbor_check_lock.start = neighbor_check_lock.start - 1;
+    if (neighbor_check_lock.length)
+      neighbor_check_lock.length = neighbor_check_lock.length + 2;
+  } else {
+    // cannot widen to the left of offset 0; widen to the right only
+    if (neighbor_check_lock.length)
+      neighbor_check_lock.length = neighbor_check_lock.length + 1;
+  }
+  //find the last held lock starting at the point after lock
+  uint64_t endpoint = lock.start;
+  if (lock.length) {
+    endpoint += lock.length;
+  } else {
+    endpoint = uint64_t(-1); // max offset
+  }
+  multimap<uint64_t, ceph_filelock>::iterator iter =
+    get_last_before(endpoint, held_locks);
+  bool cont = iter != held_locks.end();
+  while(cont) {
+    if (share_space(iter, lock)) {
+      overlaps.push_front(iter);
+    } else if (self_neighbors &&
+	       ceph_filelock_owner_equal(neighbor_check_lock, iter->second) &&
+               share_space(iter, neighbor_check_lock)) {
+      self_neighbors->push_front(iter);
+    }
+    if ((iter->first < lock.start) && (CEPH_LOCK_EXCL == iter->second.type)) {
+      //can't be any more overlapping locks or they'd interfere with this one
+      cont = false;
+    } else if (held_locks.begin() == iter) cont = false; // reached the first lock
+    else --iter;
+  }
+  return !overlaps.empty();
+}
+
+/**
+ * Collect the waiting locks whose range overlaps @p lock.
+ *
+ * waiting_locks is walked backwards from the last lock keyed at or before
+ * the final byte of @p lock; matches are pushed to the front of
+ * @p overlaps, so they end up ordered by increasing start offset.
+ *
+ * @param lock     the range to compare with; length 0 means "to end of file"
+ * @param overlaps receives iterators to the overlapping waiting locks
+ * @returns true if at least one waiting lock overlaps
+ */
+bool ceph_lock_state_t::get_waiting_overlaps(const ceph_filelock& lock,
+                                             list<multimap<uint64_t,
+                                               ceph_filelock>::iterator>&
+                                               overlaps)
+{
+  ldout(cct,15) << "get_waiting_overlaps" << dendl;
+  // a zero length covers everything up to the maximum offset; computing
+  // start + 0 - 1 would instead stop the search just before the lock
+  uint64_t last_byte = uint64_t(-1);
+  if (lock.length) {
+    last_byte = lock.start + lock.length - 1;
+  }
+  multimap<uint64_t, ceph_filelock>::iterator iter =
+    get_last_before(last_byte, waiting_locks);
+  bool cont = iter != waiting_locks.end();
+  while(cont) {
+    if (share_space(iter, lock)) overlaps.push_front(iter);
+    // never decrement begin(): that is undefined behaviour
+    if (waiting_locks.begin() == iter) cont = false;
+    else --iter;
+  }
+  return !overlaps.empty();
+}
+
+/**
+ * Partition @p locks by ownership: every lock owned by @p owner (per
+ * ceph_filelock_owner_equal) is moved, in order, from @p locks to the back
+ * of @p owned_locks; all other locks stay in @p locks.
+ *
+ * @param owner       lock identifying the owner
+ * @param locks       in: candidate locks; out: locks of other owners
+ * @param owned_locks receives the locks owned by @p owner
+ */
+void ceph_lock_state_t::split_by_owner(const ceph_filelock& owner,
+                                       list<multimap<uint64_t,
+                                           ceph_filelock>::iterator>& locks,
+                                       list<multimap<uint64_t,
+                                           ceph_filelock>::iterator>&
+                                           owned_locks)
+{
+  ldout(cct,15) << "owner lock: " << owner << dendl;
+  for (auto cur = locks.begin(); cur != locks.end(); ) {
+    ldout(cct,15) << "comparing to " << (*cur)->second << dendl;
+    if (!ceph_filelock_owner_equal((*cur)->second, owner)) {
+      ldout(cct,15) << "failure, something not equal in this group "
+              << (*cur)->second.client << ":" << owner.client << ","
+              << (*cur)->second.owner << ":" << owner.owner << ","
+              << (*cur)->second.pid << ":" << owner.pid << dendl;
+      ++cur;
+      continue;
+    }
+    ldout(cct,15) << "success, pushing to owned_locks" << dendl;
+    owned_locks.push_back(*cur);
+    cur = locks.erase(cur);
+  }
+}
+
+/**
+ * Find the first exclusive lock in @p locks.
+ *
+ * @param locks iterators to the locks to inspect
+ * @returns pointer to the first lock of type CEPH_LOCK_EXCL, or null if the
+ *          list contains none
+ */
+ceph_filelock *
+ceph_lock_state_t::contains_exclusive_lock(list<multimap<uint64_t,
+                                               ceph_filelock>::iterator>& locks)
+{
+  for (auto& entry : locks) {
+    if (CEPH_LOCK_EXCL == entry->second.type)
+      return &entry->second;
+  }
+  return nullptr;
+}
diff --git a/src/mds/flock.h b/src/mds/flock.h
new file mode 100644
index 000000000..915d912e1
--- /dev/null
+++ b/src/mds/flock.h
@@ -0,0 +1,289 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+// vim: ts=8 sw=2 smarttab
+#ifndef CEPH_MDS_FLOCK_H
+#define CEPH_MDS_FLOCK_H
+
+#include <errno.h>
+
+#include "common/debug.h"
+#include "mdstypes.h"
+
+// Print every field of a ceph_filelock on one line, terminated by std::endl.
+inline std::ostream& operator<<(std::ostream& out, const ceph_filelock& l) {
+  out << "start: " << l.start;
+  out << ", length: " << l.length;
+  out << ", client: " << l.client;
+  out << ", owner: " << l.owner;
+  out << ", pid: " << l.pid;
+  out << ", type: " << (int)l.type;
+  out << std::endl;
+  return out;
+}
+
+/**
+ * Tell whether two locks belong to the same owner.
+ *
+ * Client and owner must always match.  When the most significant bit of
+ * l.owner is set, that is sufficient; otherwise the lock comes from an old
+ * client, which identifies an owner by both 'owner' and 'pid', so the pids
+ * must match as well.
+ */
+inline bool ceph_filelock_owner_equal(const ceph_filelock& l, const ceph_filelock& r)
+{
+  if (l.client != r.client)
+    return false;
+  if (l.owner != r.owner)
+    return false;
+
+  const bool owner_is_sufficient = (l.owner & (1ULL << 63)) != 0;
+  return owner_is_sufficient || l.pid == r.pid;
+}
+
+/**
+ * Three-way comparison of two locks' owners.
+ *
+ * Orders by client, then owner; the pid takes part only when the most
+ * significant bit of l.owner is clear.
+ *
+ * @returns -1, 0 or 1 when l's owner sorts before, equal to or after r's
+ */
+inline int ceph_filelock_owner_compare(const ceph_filelock& l, const ceph_filelock& r)
+{
+  if (l.client != r.client) {
+    return (l.client > r.client) ? 1 : -1;
+  }
+  if (l.owner != r.owner) {
+    return (l.owner > r.owner) ? 1 : -1;
+  }
+
+  // same client and owner: pid is the tie-breaker unless the top bit is set
+  const bool ignore_pid = (l.owner & (1ULL << 63)) != 0;
+  if (ignore_pid || !(l.pid != r.pid)) {
+    return 0;
+  }
+  return (l.pid > r.pid) ? 1 : -1;
+}
+
+/**
+ * Three-way comparison of two locks: by owner first (see
+ * ceph_filelock_owner_compare), then start, then length, then type.
+ *
+ * @returns -1, 0 or 1 when l sorts before, equal to or after r
+ */
+inline int ceph_filelock_compare(const ceph_filelock& l, const ceph_filelock& r)
+{
+  if (int by_owner = ceph_filelock_owner_compare(l, r)) {
+    return by_owner;
+  }
+  if (l.start != r.start) {
+    return (l.start > r.start) ? 1 : -1;
+  }
+  if (l.length != r.length) {
+    return (l.length > r.length) ? 1 : -1;
+  }
+  if (l.type != r.type) {
+    return (l.type > r.type) ? 1 : -1;
+  }
+  return 0;
+}
+
+// Strict ordering of locks, as defined by ceph_filelock_compare().
+inline bool operator<(const ceph_filelock& l, const ceph_filelock& r)
+{
+  const int order = ceph_filelock_compare(l, r);
+  return order < 0;
+}
+
+// Two locks are equal when ceph_filelock_compare() reports no difference.
+inline bool operator==(const ceph_filelock& l, const ceph_filelock& r) {
+  const int order = ceph_filelock_compare(l, r);
+  return order == 0;
+}
+
+// Two locks differ when ceph_filelock_compare() reports any difference.
+inline bool operator!=(const ceph_filelock& l, const ceph_filelock& r) {
+  const int order = ceph_filelock_compare(l, r);
+  return order != 0;
+}
+
+/**
+ * Set of byte-range file locks of one lock type.
+ *
+ * Tracks the locks currently held and the locks waiting to be granted, both
+ * keyed by start offset, together with per-client counters for each.  A
+ * lock length of 0 means "from start to end of file".
+ */
+class ceph_lock_state_t {
+public:
+  explicit ceph_lock_state_t(CephContext *cct_, int type_) : cct(cct_), type(type_) {}
+  ~ceph_lock_state_t();
+  /**
+   * Check if a lock is on the waiting_locks list.
+   *
+   * @param fl The filelock to check for
+   * @returns True if the lock is waiting, false otherwise
+   */
+  bool is_waiting(const ceph_filelock &fl) const;
+  /**
+   * Remove a lock from the waiting_locks list
+   *
+   * @param fl The filelock to remove
+   */
+  void remove_waiting(const ceph_filelock& fl);
+  /*
+   * Try to set a new lock. If it's blocked and wait_on_fail is true,
+   * add the lock to waiting_locks.
+   * The lock needs to be of type CEPH_LOCK_EXCL or CEPH_LOCK_SHARED.
+   * This may merge previous locks, or convert the type of already-owned
+   * locks.
+   *
+   * @param new_lock The lock to set
+   * @param wait_on_fail whether to wait until the lock can be set.
+   * Otherwise it fails immediately when blocked.
+   * @param replay if true, a blocked lock is not added to waiting_locks
+   * and no deadlock check is made.
+   * @param deadlock set to true when the lock is blocked, would have been
+   * queued, and queueing it would cause a deadlock; left untouched otherwise.
+   *
+   * @returns true if set, false if not set.
+   */
+  bool add_lock(ceph_filelock& new_lock, bool wait_on_fail, bool replay,
+		bool *deadlock);
+  /**
+   * See if a lock is blocked by existing locks. If the lock is blocked,
+   * it will be set to the value of the first blocking lock. Otherwise,
+   * it will be returned unchanged, except for setting the type field
+   * to CEPH_LOCK_UNLOCK.
+   *
+   * @param testing_lock The lock to check for conflicts on.
+   */
+  void look_for_lock(ceph_filelock& testing_lock);
+
+  /*
+   * Remove lock(s) described in removal_lock. This may involve splitting a
+   * previous lock or making a previous lock smaller.
+   *
+   * @param removal_lock The lock to remove
+   * @param activated_locks A return parameter, holding activated wait locks.
+   */
+  void remove_lock(const ceph_filelock removal_lock,
+                   std::list<ceph_filelock>& activated_locks);
+
+  /**
+   * Remove all held and waiting locks belonging to a client.
+   *
+   * @param client The client whose locks are removed
+   * @returns true if the client had held locks that were cleared
+   */
+  bool remove_all_from(client_t client);
+
+  // Only the held locks and their per-client counts are encoded; waiting
+  // locks are not part of the encoding.
+  void encode(ceph::bufferlist& bl) const {
+    using ceph::encode;
+    encode(held_locks, bl);
+    encode(client_held_lock_counts, bl);
+  }
+  void decode(ceph::bufferlist::const_iterator& bl) {
+    using ceph::decode;
+    decode(held_locks, bl);
+    decode(client_held_lock_counts, bl);
+  }
+  // True when there are no held or waiting locks and no client counters.
+  bool empty() const {
+    return held_locks.empty() && waiting_locks.empty() &&
+	   client_held_lock_counts.empty() &&
+	   client_waiting_lock_counts.empty();
+  }
+
+  std::multimap<uint64_t, ceph_filelock> held_locks;    // current locks
+  std::multimap<uint64_t, ceph_filelock> waiting_locks; // locks waiting for other locks
+  // both of the above are keyed by starting offset
+  std::map<client_t, int> client_held_lock_counts;    // number of held locks per client
+  std::map<client_t, int> client_waiting_lock_counts; // number of waiting locks per client
+
+private:
+  // maximum recursion depth of is_deadlock()
+  static const unsigned MAX_DEADLK_DEPTH = 5;
+
+  /**
+   * Check if adding the lock causes deadlock
+   *
+   * @param fl The blocking filelock 
+   * @param overlapping_locks list of all overlapping locks 
+   * @param first_fl The lock the outermost call was made for (NULL on the
+   *    outermost call)
+   * @param depth recursion call depth
+   */
+  bool is_deadlock(const ceph_filelock& fl,
+		   std::list<std::multimap<uint64_t, ceph_filelock>::iterator>&
+		   overlapping_locks,
+		   const ceph_filelock *first_fl=NULL, unsigned depth=0) const;
+
+  /**
+   * Add a lock to the waiting_locks list
+   *
+   * @param fl The filelock to add
+   */
+  void add_waiting(const ceph_filelock& fl);
+
+  /**
+   * Adjust old locks owned by a single process so that process can set
+   * a new lock of different type. Handle any changes needed to the old locks
+   * (and the new lock) so that once the new lock is inserted into the 
+   * held_locks list the process has a coherent, non-fragmented set of lock
+   * ranges. Make sure any overlapping locks are combined, trimmed, and removed
+   * as needed.
+   * This function should only be called once you know the lock will be
+   * inserted, as it DOES adjust new_lock. You can call this function
+   * on an empty list, in which case it does nothing.
+   * This function does not remove elements from old_locks, so regard the list
+   * as bad information following function invocation.
+   *
+   * @param new_lock The new lock the process has requested.
+   * @param old_locks list of all locks currently held by same
+   *    client/process that overlap new_lock.
+   * @param neighbor_locks locks owned by same process that neighbor new_lock on
+   *    left or right side.
+   */
+  void adjust_locks(std::list<std::multimap<uint64_t, ceph_filelock>::iterator> old_locks,
+                    ceph_filelock& new_lock,
+                    std::list<std::multimap<uint64_t, ceph_filelock>::iterator>
+                      neighbor_locks);
+
+  //get the lock keyed at "start", or else the last lock keyed before it
+  std::multimap<uint64_t, ceph_filelock>::iterator
+  get_lower_bound(uint64_t start,
+                  std::multimap<uint64_t, ceph_filelock>& lock_map);
+  //get latest-starting lock that goes over the byte "end"
+  std::multimap<uint64_t, ceph_filelock>::iterator
+  get_last_before(uint64_t end,
+                  std::multimap<uint64_t, ceph_filelock>& lock_map);
+
+  /*
+   * See if an iterator's lock covers any of the same bounds as a given range
+   * Rules: locks cover "length" bytes from "start", so the last covered
+   * byte is at start + length - 1.
+   * If the length is 0, the lock covers from "start" to the end of the file.
+   */
+  bool share_space(std::multimap<uint64_t, ceph_filelock>::iterator& iter,
+		   uint64_t start, uint64_t end);
+  
+  // Convenience overload: derive the [start, end] range from a lock.
+  bool share_space(std::multimap<uint64_t, ceph_filelock>::iterator& iter,
+                   const ceph_filelock &lock) {
+    uint64_t end = lock.start;
+    if (lock.length) {
+      end += lock.length - 1;
+    } else { // zero length means end of file
+      end = uint64_t(-1);
+    }
+    return share_space(iter, lock.start, end);
+  }
+  /*
+   * Get a list of all locks overlapping with the given lock's range
+   * lock: the lock to compare with.
+   * overlaps: an empty list, to be filled.
+   * self_neighbors: if non-NULL, filled with locks of the same owner that
+   *                 are adjacent to (but do not overlap) the given lock.
+   * Returns: true if at least one lock overlaps.
+   */
+  bool get_overlapping_locks(const ceph_filelock& lock,
+                             std::list<std::multimap<uint64_t,
+                                 ceph_filelock>::iterator> & overlaps,
+                             std::list<std::multimap<uint64_t,
+                                 ceph_filelock>::iterator> *self_neighbors);
+
+  
+  // Convenience overload that does not collect neighbors.
+  bool get_overlapping_locks(const ceph_filelock& lock,
+			     std::list<std::multimap<uint64_t, ceph_filelock>::iterator>& overlaps) {
+    return get_overlapping_locks(lock, overlaps, NULL);
+  }
+
+  /**
+   * Get a list of all waiting locks that overlap with the given lock's range.
+   * lock: specifies the range to compare with
+   * overlaps: an empty list, to be filled
+   * Returns: true if at least one waiting_lock overlaps
+   */
+  bool get_waiting_overlaps(const ceph_filelock& lock,
+                            std::list<std::multimap<uint64_t,
+                                ceph_filelock>::iterator>& overlaps);
+  /*
+   * split a list of locks up by whether they're owned by same
+   * process as given lock
+   * owner: the owning lock
+   * locks: the list of locks (obtained from get_overlapping_locks, probably)
+   *        Will have all locks owned by owner removed
+   * owned_locks: an empty list, to be filled with the locks owned by owner
+   */
+  void split_by_owner(const ceph_filelock& owner,
+		      std::list<std::multimap<uint64_t,
+		          ceph_filelock>::iterator> & locks,
+		      std::list<std::multimap<uint64_t,
+		          ceph_filelock>::iterator> & owned_locks);
+
+  // Return the first exclusive lock in the list, or NULL if there is none.
+  ceph_filelock *contains_exclusive_lock(std::list<std::multimap<uint64_t,
+                                         ceph_filelock>::iterator>& locks);
+
+  CephContext *cct; // context used by the ldout() debug output
+  int type;         // lock type of this state, e.g. CEPH_LOCK_FCNTL
+};
+WRITE_CLASS_ENCODER(ceph_lock_state_t)
+
+// Dump a lock state: list sizes, per-client counters, then every held lock
+// followed by every waiting lock.
+inline std::ostream& operator<<(std::ostream &out, const ceph_lock_state_t &l) {
+  out << "ceph_lock_state_t. held_locks.size()=" << l.held_locks.size();
+  out << ", waiting_locks.size()=" << l.waiting_locks.size();
+  out << ", client_held_lock_counts -- " << l.client_held_lock_counts;
+  out << "\n client_waiting_lock_counts -- " << l.client_waiting_lock_counts;
+
+  out << "\n held_locks -- ";
+  for (const auto& held : l.held_locks) {
+    out << held.second;
+  }
+
+  out << "\n waiting_locks -- ";
+  for (const auto& waiting : l.waiting_locks) {
+    out << waiting.second << "\n";
+  }
+  return out;
+}
+
+#endif
diff --git a/src/mds/inode_backtrace.cc b/src/mds/inode_backtrace.cc
new file mode 100644
index 000000000..be58d1bc3
--- /dev/null
+++ b/src/mds/inode_backtrace.cc
@@ -0,0 +1,163 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+// vim: ts=8 sw=2 smarttab
+
+#include "inode_backtrace.h"
+
+#include "common/Formatter.h"
+
+/* inode_backpointer_t */
+
+void inode_backpointer_t::encode(ceph::buffer::list& bl) const
+{ // Appends dirino, dname and version to bl, in that order.
+  ENCODE_START(2, 2, bl);  // NOTE(review): presumably (struct version, compat version) -- confirm against the encoding macros
+  encode(dirino, bl);
+  encode(dname, bl);
+  encode(version, bl);
+  ENCODE_FINISH(bl);
+}
+
+void inode_backpointer_t::decode(ceph::buffer::list::const_iterator& bl)
+{ // Reads dirino, dname and version from bl, in the order encode() writes them.
+  DECODE_START_LEGACY_COMPAT_LEN(2, 2, 2, bl);  // NOTE(review): the meaning of the three numeric arguments is defined by the macro -- confirm there
+  decode(dirino, bl);
+  decode(dname, bl);
+  decode(version, bl);
+  DECODE_FINISH(bl);
+}
+
+void inode_backpointer_t::decode_old(ceph::buffer::list::const_iterator& bl)
+{ // Reads the same three fields as decode(), but without a DECODE_START/DECODE_FINISH wrapper.
+  using ceph::decode;  // presumably needed because no DECODE_START macro is used here -- confirm
+  decode(dirino, bl);
+  decode(dname, bl);
+  decode(version, bl);
+}
+
+void inode_backpointer_t::dump(ceph::Formatter *f) const
+{ // Emits the three fields to f under the keys "dirino", "dname" and "version".
+  f->dump_unsigned("dirino", dirino);
+  f->dump_string("dname", dname);
+  f->dump_unsigned("version", version);
+}
+
+void inode_backpointer_t::generate_test_instances(std::list<inode_backpointer_t*>& ls)
+{
+  // First instance: default-constructed.
+  ls.push_back(new inode_backpointer_t);
+  // Second instance: dirino=1, dname="foo", version=123.
+  inode_backpointer_t *populated = new inode_backpointer_t(1, "foo", 123);
+  ls.push_back(populated);
+}
+
+
+/*
+ * inode_backtrace_t
+ */
+
+void inode_backtrace_t::encode(ceph::buffer::list& bl) const
+{ // Appends ino, ancestors, pool and old_pools to bl, in that order.
+  ENCODE_START(5, 4, bl);  // NOTE(review): presumably (struct version, compat version) -- confirm against the encoding macros
+  encode(ino, bl);
+  encode(ancestors, bl);
+  encode(pool, bl);
+  encode(old_pools, bl);
+  ENCODE_FINISH(bl);
+}
+
+void inode_backtrace_t::decode(ceph::buffer::list::const_iterator& bl)
+{ // Decodes a backtrace; struct_v < 3 is ignored, v3 uses the legacy ancestor format, pool data exists from v5.
+  DECODE_START_LEGACY_COMPAT_LEN(5, 4, 4, bl);
+  if (struct_v < 3)
+    return;  // sorry, the old data was crap
+  decode(ino, bl);
+  if (struct_v >= 4) {
+    decode(ancestors, bl);
+  } else {
+    __u32 n;  // v3 layout: a count followed by n backpointers read with decode_old()
+    decode(n, bl);
+    ancestors.clear();  // start empty so decoding into a reused object replaces its ancestors instead of appending
+    while (n--) {
+      ancestors.emplace_back().decode_old(bl);
+    }
+  }
+  if (struct_v >= 5) {  // older encodings leave pool and old_pools as they were
+    decode(pool, bl);
+    decode(old_pools, bl);
+  }
+  DECODE_FINISH(bl);
+}
+
+void inode_backtrace_t::dump(ceph::Formatter *f) const
+{
+  // Emits ino, one object per ancestor, pool, then the old_pools array.
+  f->dump_unsigned("ino", ino);
+  f->open_array_section("ancestors");
+  for (const auto& backpointer : ancestors) {
+    f->open_object_section("backpointer");
+    backpointer.dump(f);
+    f->close_section();  // backpointer
+  }
+  f->close_section();  // ancestors
+  f->dump_int("pool", pool);
+  f->open_array_section("old_pools");
+  for (const auto& old_pool : old_pools)
+    f->dump_int("old_pool", old_pool);
+  f->close_section();  // old_pools
+}
+
+void inode_backtrace_t::generate_test_instances(std::list<inode_backtrace_t*>& ls)
+{
+  // First instance: default-constructed.
+  ls.push_back(new inode_backtrace_t);
+  // Second instance: one ancestor, pool 0 and two old pools.
+  inode_backtrace_t *populated = new inode_backtrace_t;
+  populated->ino = 1;
+  populated->ancestors.emplace_back(123, "bar", 456);
+  populated->pool = 0;
+  populated->old_pools.push_back(10);
+  populated->old_pools.push_back(7);
+  ls.push_back(populated);
+}
+
+int inode_backtrace_t::compare(const inode_backtrace_t& other,
+                               bool *equivalent, bool *divergent) const
+{
+  // Only the leading entries present in both backtraces are compared.
+  const int common = std::min(ancestors.size(),other.ancestors.size());
+  *equivalent = true;
+  *divergent = false;
+  if (common == 0)
+    return 0;
+  // Entry 0 seeds the result from its version: 1 if ours is higher, -1 if lower.
+  const inode_backpointer_t& head = ancestors[0];
+  const inode_backpointer_t& other_head = other.ancestors[0];
+  int result = 0;
+  if (head.version > other_head.version)
+    result = 1;
+  else if (head.version < other_head.version)
+    result = -1;
+  // A different dirino or dname at entry 0 is divergent; the loop is then skipped.
+  if (head.dirino != other_head.dirino || head.dname != other_head.dname)
+    *divergent = true;
+  for (int i = 1; i < common && !*divergent; ++i) {
+    const inode_backpointer_t& mine = ancestors[i];
+    const inode_backpointer_t& theirs = other.ancestors[i];
+    if (mine.dirino != theirs.dirino || mine.dname != theirs.dname) {
+      // A differing dentry past entry 0: not equivalent, return the result so far.
+      *equivalent = false;
+      return result;
+    }
+    if (mine.version > theirs.version) {
+      if (result < 0)  // earlier entries said "older", this one says "newer"
+        *divergent = true;
+      result = 1;
+    } else if (mine.version < theirs.version) {
+      if (result > 0)  // earlier entries said "newer", this one says "older"
+        *divergent = true;
+      result = -1;
+    }
+  }
+  if (*divergent)
+    *equivalent = false;
+  return result;
+}
diff --git a/src/mds/inode_backtrace.h b/src/mds/inode_backtrace.h
new file mode 100644
index 000000000..3e731e954
--- /dev/null
+++ b/src/mds/inode_backtrace.h
@@ -0,0 +1,100 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+// vim: ts=8 sw=2 smarttab
+#ifndef CEPH_INODE_BACKTRACE_H
+#define CEPH_INODE_BACKTRACE_H
+
+#include <string_view>
+
+#include "mdstypes.h"
+
+namespace ceph {
+  class Formatter;
+}
+
+/** metadata backpointers **/
+
+/*
+ * - inode_backpointer_t is just the _pointer_ portion; it doesn't
+ *   tell us who we point _from_.
+ *
+ * - it _does_ include a version of the source object, so we can look
+ *   at two different pointers (from the same inode) and tell which is
+ *   newer.
+ */
+struct inode_backpointer_t {
+  inode_backpointer_t() {}
+  inode_backpointer_t(inodeno_t i, std::string_view d, version_t v) : dirino(i), dname(d), version(v) {}
+
+  void encode(ceph::buffer::list& bl) const;  // writes dirino, dname, version
+  void decode(ceph::buffer::list::const_iterator &bl);  // reads what encode() wrote
+  void decode_old(ceph::buffer::list::const_iterator &bl);  // same fields, no version wrapper; used by inode_backtrace_t::decode for v3 data
+  void dump(ceph::Formatter *f) const;  // emits the three fields by name
+  static void generate_test_instances(std::list<inode_backpointer_t*>& ls);  // appends heap-allocated samples to ls
+
+  inodeno_t dirino;    // containing directory ino
+  std::string dname;        // linking dentry name
+  version_t version = 0;   // child's version at time of backpointer creation
+};
+WRITE_CLASS_ENCODER(inode_backpointer_t)
+
+inline bool operator==(const inode_backpointer_t& l, const inode_backpointer_t& r) {
+  return !(l.dirino != r.dirino || l.version != r.version || l.dname != r.dname);  // equal only if no field differs
+}
+
+inline std::ostream& operator<<(std::ostream& out, const inode_backpointer_t& ib) {
+  return out << "<" << ib.dirino << "/" << ib.dname << " v" << ib.version << ">";  // printed form: <dirino/dname vversion>
+}
+
+/*
+ * inode_backtrace_t is the complete ancestor backtrace for a given inode.
+ * we include who _we_ are, so that the backtrace can stand alone (as, say,
+ * an xattr on an object).
+ */
+struct inode_backtrace_t {
+  inode_backtrace_t() {}
+
+  void encode(ceph::buffer::list& bl) const;  // writes ino, ancestors, pool, old_pools
+  void decode(ceph::buffer::list::const_iterator &bl);  // also understands older encodings (see the .cc)
+  void dump(ceph::Formatter *f) const;
+  static void generate_test_instances(std::list<inode_backtrace_t*>& ls);  // appends heap-allocated samples to ls
+
+  /**
+   * Compare two backtraces *for the same inode*.
+   * @pre The backtraces are for the same inode
+   *
+   * @param other The backtrace to compare ourselves with
+   * @param equivalent A bool pointer which will be set to true if
+   * the other backtrace is equivalent to our own (has the same dentries)
+   * @param divergent A bool pointer which will be set to true if
+   * the backtraces have differing entries without versions supporting them
+   *
+   * @returns 1 if we are newer than the other, 0 if equal, -1 if older
+   */
+  int compare(const inode_backtrace_t& other,
+               bool *equivalent, bool *divergent) const;
+
+  void clear() {  // empties both vectors; ino and pool are left as they are
+    ancestors.clear();
+    old_pools.clear();
+  }
+
+  inodeno_t ino;       // my ino
+  std::vector<inode_backpointer_t> ancestors;  // one backpointer per ancestor dentry
+  int64_t pool = -1;  // stays -1 unless assigned or decoded (only encodings from v5 on carry it)
+  std::vector<int64_t> old_pools;  // decoded together with pool
+};
+WRITE_CLASS_ENCODER(inode_backtrace_t)
+
+inline std::ostream& operator<<(std::ostream& out, const inode_backtrace_t& it) {
+  return out << "(" << it.pool << ")" << it.ino << ":" << it.ancestors << "//" << it.old_pools;  // printed form: (pool)ino:ancestors//old_pools
+}
+
+inline bool operator==(const inode_backtrace_t& l,
+                       const inode_backtrace_t& r) {
+  // Equal only when ino, pool, old_pools and ancestors all match.
+  if (!(l.ino == r.ino) || !(l.pool == r.pool))
+    return false;
+  return l.old_pools == r.old_pools && l.ancestors == r.ancestors;
+}
+
+#endif
diff --git a/src/mds/journal.cc b/src/mds/journal.cc
new file mode 100644
index 000000000..09c39d15d
--- /dev/null
+++ b/src/mds/journal.cc
@@ -0,0 +1,3304 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software 
+ * Foundation.  See file COPYING.
+ * 
+ */
+
+#include "common/config.h"
+#include "osdc/Journaler.h"
+#include "events/ESubtreeMap.h"
+#include "events/ESession.h"
+#include "events/ESessions.h"
+
+#include "events/EMetaBlob.h"
+#include "events/EResetJournal.h"
+#include "events/ENoOp.h"
+
+#include "events/EUpdate.h"
+#include "events/EPeerUpdate.h"
+#include "events/EOpen.h"
+#include "events/ECommitted.h"
+#include "events/EPurged.h"
+
+#include "events/EExport.h"
+#include "events/EImportStart.h"
+#include "events/EImportFinish.h"
+#include "events/EFragment.h"
+
+#include "events/ETableClient.h"
+#include "events/ETableServer.h"
+
+#include "include/stringify.h"
+
+#include "LogSegment.h"
+
+#include "MDSRank.h"
+#include "MDLog.h"
+#include "MDCache.h"
+#include "Server.h"
+#include "Migrator.h"
+#include "Mutation.h"
+
+#include "InoTable.h"
+#include "MDSTableClient.h"
+#include "MDSTableServer.h"
+
+#include "Locker.h"
+
+#define dout_context g_ceph_context
+#define dout_subsys ceph_subsys_mds
+#undef dout_prefix
+#define dout_prefix *_dout << "mds." << mds->get_nodeid() << ".journal "
+
+
+// -----------------------
+// LogSegment
+
+struct BatchStoredBacktrace : public MDSIOContext {
+  MDSContext *fin;  // completed with r once every inode has been notified
+  std::vector<CInodeCommitOperations> ops_vec;
+  // Takes over the ops vector; f is the context completed by finish().
+  BatchStoredBacktrace(MDSRank *m, MDSContext *f,
+		       std::vector<CInodeCommitOperations>&& ops)
+    : MDSIOContext(m), fin(f), ops_vec(std::move(ops)) {}
+  // Passes result r to each inode in the batch, then completes fin.
+  void finish(int r) override {
+    for (auto& stored : ops_vec)
+      stored.in->_stored_backtrace(r, stored.version, nullptr);
+    fin->complete(r);
+  }
+  void print(ostream& out) const override {
+    out << "batch backtrace_store";
+  }
+};
+
+struct BatchCommitBacktrace : public Context {
+  MDSRank *mds;
+  MDSContext *fin;  // handed on to the BatchStoredBacktrace created in finish()
+  std::vector<CInodeCommitOperations> ops_vec;
+  // Takes over the ops vector collected by the caller.
+  BatchCommitBacktrace(MDSRank *m, MDSContext *f,
+		       std::vector<CInodeCommitOperations>&& ops)
+    : mds(m), fin(f), ops_vec(std::move(ops)) {}
+  void finish(int r) override {
+    C_GatherBuilder gather(g_ceph_context);
+    // Issue every inode's commit ops against the gather, then clear that
+    // entry's op list and backtrace.
+    for (auto &pending : ops_vec) {
+      pending.in->_commit_ops(r, gather, pending.ops_vec, pending.bt);
+      pending.ops_vec.clear();
+      pending.bt.clear();
+    }
+    ceph_assert(gather.has_subs());
+    auto *stored = new BatchStoredBacktrace(mds, fin, std::move(ops_vec));
+    gather.set_finisher(new C_OnFinisher(stored, mds->finisher));
+    gather.activate();
+  }
+};
+
+void LogSegment::try_to_expire(MDSRank *mds, MDSGatherBuilder &gather_bld, int op_prio)
+{ // Registers on gather_bld everything that still has to finish before this segment can expire; no subs means "success".
+  set<CDir*> commit;  // dirfrags to commit; the set de-duplicates across the lists walked below
+
+  dout(6) << "LogSegment(" << seq << "/" << offset << ").try_to_expire" << dendl;
+
+  ceph_assert(g_conf()->mds_kill_journal_expire_at != 1);  // fails when the option equals 1; NOTE(review): presumably a test-only kill point
+
+  // commit dirs
+  for (elist<CDir*>::iterator p = new_dirfrags.begin(); !p.end(); ++p) {
+    dout(20) << " new_dirfrag " << **p << dendl;
+    ceph_assert((*p)->is_auth());
+    commit.insert(*p);
+  }
+  for (elist<CDir*>::iterator p = dirty_dirfrags.begin(); !p.end(); ++p) {
+    dout(20) << " dirty_dirfrag " << **p << dendl;
+    ceph_assert((*p)->is_auth());
+    commit.insert(*p);
+  }
+  for (elist<CDentry*>::iterator p = dirty_dentries.begin(); !p.end(); ++p) {
+    dout(20) << " dirty_dentry " << **p << dendl;
+    ceph_assert((*p)->is_auth());
+    commit.insert((*p)->get_dir());
+  }
+  for (elist<CInode*>::iterator p = dirty_inodes.begin(); !p.end(); ++p) {
+    dout(20) << " dirty_inode " << **p << dendl;
+    ceph_assert((*p)->is_auth());
+    if ((*p)->is_base()) {
+      (*p)->store(gather_bld.new_sub());  // base inodes are stored directly
+    } else
+      commit.insert((*p)->get_parent_dn()->get_dir());  // others: commit the dirfrag of their parent dentry
+  }
+
+  if (!commit.empty()) {
+    for (set<CDir*>::iterator p = commit.begin();
+	 p != commit.end();
+	 ++p) {
+      CDir *dir = *p;
+      ceph_assert(dir->is_auth());
+      if (dir->can_auth_pin()) {
+	dout(15) << "try_to_expire committing " << *dir << dendl;
+	dir->commit(0, gather_bld.new_sub(), false, op_prio);  // op_prio is passed through to the commit
+      } else {
+	dout(15) << "try_to_expire waiting for unfreeze on " << *dir << dendl;
+	dir->add_waiter(CDir::WAIT_UNFREEZE, gather_bld.new_sub());
+      }
+    }
+  }
+
+  // leader ops with possibly uncommitted peers
+  for (set<metareqid_t>::iterator p = uncommitted_leaders.begin();
+       p != uncommitted_leaders.end();
+       ++p) {
+    dout(10) << "try_to_expire waiting for peers to ack commit on " << *p << dendl;
+    mds->mdcache->wait_for_uncommitted_leader(*p, gather_bld.new_sub());
+  }
+
+  // peer ops that haven't been committed
+  for (set<metareqid_t>::iterator p = uncommitted_peers.begin();
+       p != uncommitted_peers.end();
+       ++p) {
+    dout(10) << "try_to_expire waiting for leader to ack OP_FINISH on " << *p << dendl;
+    mds->mdcache->wait_for_uncommitted_peer(*p, gather_bld.new_sub());
+  }
+
+  // uncommitted fragments
+  for (set<dirfrag_t>::iterator p = uncommitted_fragments.begin();
+       p != uncommitted_fragments.end();
+       ++p) {
+    dout(10) << "try_to_expire waiting for uncommitted fragment " << *p << dendl;
+    mds->mdcache->wait_for_uncommitted_fragment(*p, gather_bld.new_sub());
+  }
+
+  // nudge scatterlocks
+  for (elist<CInode*>::iterator p = dirty_dirfrag_dir.begin(); !p.end(); ++p) {
+    CInode *in = *p;
+    dout(10) << "try_to_expire waiting for dirlock flush on " << *in << dendl;
+    mds->locker->scatter_nudge(&in->filelock, gather_bld.new_sub());
+  }
+  for (elist<CInode*>::iterator p = dirty_dirfrag_dirfragtree.begin(); !p.end(); ++p) {
+    CInode *in = *p;
+    dout(10) << "try_to_expire waiting for dirfragtreelock flush on " << *in << dendl;
+    mds->locker->scatter_nudge(&in->dirfragtreelock, gather_bld.new_sub());
+  }
+  for (elist<CInode*>::iterator p = dirty_dirfrag_nest.begin(); !p.end(); ++p) {
+    CInode *in = *p;
+    dout(10) << "try_to_expire waiting for nest flush on " << *in << dendl;
+    mds->locker->scatter_nudge(&in->nestlock, gather_bld.new_sub());
+  }
+
+  ceph_assert(g_conf()->mds_kill_journal_expire_at != 2);
+
+  // open files and snap inodes 
+  if (!open_files.empty()) {
+    ceph_assert(!mds->mdlog->is_capped()); // hmm FIXME
+    EOpen *le = 0;  // created lazily, only if some inode has to be rejournaled
+    LogSegment *ls = mds->mdlog->get_current_segment();
+    ceph_assert(ls != this);  // inodes are re-queued onto ls, so it must be a different segment
+    elist<CInode*>::iterator p = open_files.begin(member_offset(CInode, item_open_file));
+    while (!p.end()) {
+      CInode *in = *p;
+      ++p;  // advance before in's list item is re-queued or removed below
+      if (in->last != CEPH_NOSNAP && in->is_auth() && !in->client_snap_caps.empty()) {
+	// journal snap inodes that need flush. This simplifies the mds failover handling
+	dout(20) << "try_to_expire requeueing snap needflush inode " << *in << dendl;
+	if (!le) {
+	  le = new EOpen(mds->mdlog);
+	  mds->mdlog->start_entry(le);
+	}
+	le->add_clean_inode(in);
+	ls->open_files.push_back(&in->item_open_file);
+      } else {
+	// open files are tracked by open file table, no need to journal them again
+	in->item_open_file.remove_myself();
+      }
+    }
+    if (le) {
+      mds->mdlog->submit_entry(le);
+      mds->mdlog->wait_for_safe(gather_bld.new_sub());
+      dout(10) << "try_to_expire waiting for open files to rejournal" << dendl;
+    }
+  }
+
+  ceph_assert(g_conf()->mds_kill_journal_expire_at != 3);
+
+  size_t count = 0;  // the list is walked once here only to size the reserve() below
+  for (elist<CInode*>::iterator it = dirty_parent_inodes.begin(); !it.end(); ++it)
+    count++;
+
+  std::vector<CInodeCommitOperations> ops_vec;
+  ops_vec.reserve(count);
+  // backtraces to be stored/updated
+  for (elist<CInode*>::iterator p = dirty_parent_inodes.begin(); !p.end(); ++p) {
+    CInode *in = *p;
+    ceph_assert(in->is_auth());
+    if (in->can_auth_pin()) {
+      dout(15) << "try_to_expire waiting for storing backtrace on " << *in << dendl;
+      ops_vec.resize(ops_vec.size() + 1);  // append an empty slot for store_backtrace() to fill in
+      in->store_backtrace(ops_vec.back(), op_prio);
+    } else {
+      dout(15) << "try_to_expire waiting for unfreeze on " << *in << dendl;
+      in->add_waiter(CInode::WAIT_UNFREEZE, gather_bld.new_sub());
+    }
+  }
+  if (!ops_vec.empty())  // one gather sub covers the whole batch of backtrace stores
+    mds->finisher->queue(new BatchCommitBacktrace(mds, gather_bld.new_sub(), std::move(ops_vec)));
+
+  ceph_assert(g_conf()->mds_kill_journal_expire_at != 4);
+
+  // idalloc
+  if (inotablev > mds->inotable->get_committed_version()) {
+    dout(10) << "try_to_expire saving inotable table, need " << inotablev
+	      << ", committed is " << mds->inotable->get_committed_version()
+	      << " (" << mds->inotable->get_committing_version() << ")"
+	      << dendl;
+    mds->inotable->save(gather_bld.new_sub(), inotablev);
+  }
+
+  // sessionmap
+  if (sessionmapv > mds->sessionmap.get_committed()) {
+    dout(10) << "try_to_expire saving sessionmap, need " << sessionmapv 
+	      << ", committed is " << mds->sessionmap.get_committed()
+	      << " (" << mds->sessionmap.get_committing() << ")"
+	      << dendl;
+    mds->sessionmap.save(gather_bld.new_sub(), sessionmapv);
+  }
+
+  // updates to sessions for completed_requests
+  mds->sessionmap.save_if_dirty(touched_sessions, &gather_bld);
+  touched_sessions.clear();
+
+  // pending commit atids
+  for (map<int, ceph::unordered_set<version_t> >::iterator p = pending_commit_tids.begin();
+       p != pending_commit_tids.end();
+       ++p) {
+    MDSTableClient *client = mds->get_table_client(p->first);
+    ceph_assert(client);
+    for (ceph::unordered_set<version_t>::iterator q = p->second.begin();
+	 q != p->second.end();
+	 ++q) {
+      dout(10) << "try_to_expire " << get_mdstable_name(p->first) << " transaction " << *q 
+	       << " pending commit (not yet acked), waiting" << dendl;
+      ceph_assert(!client->has_committed(*q));
+      client->wait_for_ack(*q, gather_bld.new_sub());
+    }
+  }
+  
+  // table servers
+  for (map<int, version_t>::iterator p = tablev.begin();
+       p != tablev.end();
+       ++p) {
+    MDSTableServer *server = mds->get_table_server(p->first);
+    ceph_assert(server);
+    if (p->second > server->get_committed_version()) {
+      dout(10) << "try_to_expire waiting for " << get_mdstable_name(p->first) 
+	       << " to save, need " << p->second << dendl;
+      server->save(gather_bld.new_sub());
+    }
+  }
+
+  // truncating
+  for (set<CInode*>::iterator p = truncating_inodes.begin();
+       p != truncating_inodes.end();
+       ++p) {
+    dout(10) << "try_to_expire waiting for truncate of " << **p << dendl;
+    (*p)->add_waiter(CInode::WAIT_TRUNC, gather_bld.new_sub());
+  }
+  // purge inodes
+  dout(10) << "try_to_expire waiting for purge of " << purging_inodes << dendl;  // logged even when purging_inodes is empty
+  if (purging_inodes.size())
+    set_purged_cb(gather_bld.new_sub());
+  
+  if (gather_bld.has_subs()) {  // something above is still outstanding
+    dout(6) << "LogSegment(" << seq << "/" << offset << ").try_to_expire waiting" << dendl;
+    mds->mdlog->flush();
+  } else {
+    ceph_assert(g_conf()->mds_kill_journal_expire_at != 5);
+    dout(6) << "LogSegment(" << seq << "/" << offset << ").try_to_expire success" << dendl;
+  }
+}
+
+// -----------------------
+// EMetaBlob
+
+void EMetaBlob::add_dir_context(CDir *dir, int mode)
+{ // Walks up from dir via projected parent dentries and adds the needed ancestors to this blob, outermost first.
+  MDSRank *mds = dir->mdcache->mds;
+
+  list<CDentry*> parents;  // ancestors that will definitely be added
+
+  // it may be okay not to include the maybe items, if
+  //  - we journaled the maybe child inode in this segment
+  //  - that subtree turns out to be unambiguously auth
+  list<CDentry*> maybe;
+  bool maybenot = false;  // while set, newly visited ancestors go to maybe instead of parents
+
+  while (true) {
+    // already have this dir?  (we must always add in order)
+    if (lump_map.count(dir->dirfrag())) {
+      dout(20) << "EMetaBlob::add_dir_context(" << dir << ") have lump " << dir->dirfrag() << dendl;
+      break;
+    }
+
+    // stop at root/stray
+    CInode *diri = dir->get_inode();
+    CDentry *parent = diri->get_projected_parent_dn();  // null ends the walk further down
+
+    if (mode == TO_AUTH_SUBTREE_ROOT) {
+      // subtree root?
+      if (dir->is_subtree_root()) {
+	// match logic in MDCache::create_subtree_map()
+	if (dir->get_dir_auth().first == mds->get_nodeid()) {
+	  mds_authority_t parent_auth = parent ? parent->authority() : CDIR_AUTH_UNDEF;
+	  if (parent_auth.first == dir->get_dir_auth().first) {
+	    if (parent_auth.second == CDIR_AUTH_UNKNOWN &&
+		!dir->is_ambiguous_dir_auth() &&
+		!dir->state_test(CDir::STATE_EXPORTBOUND) &&
+		!dir->state_test(CDir::STATE_AUXSUBTREE) &&
+		!diri->state_test(CInode::STATE_AMBIGUOUSAUTH)) {
+	      dout(0) << "EMetaBlob::add_dir_context unexpected subtree " << *dir << dendl;
+	      ceph_abort();
+	    }
+	    dout(20) << "EMetaBlob::add_dir_context(" << dir << ") ambiguous or transient subtree " << dendl;
+	  } else {
+	    // it's an auth subtree, we don't need maybe (if any), and we're done.
+	    dout(20) << "EMetaBlob::add_dir_context(" << dir << ") reached unambig auth subtree, don't need " << maybe
+		     << " at " << *dir << dendl;
+	    maybe.clear();
+	    break;
+	  }
+	} else {
+	  dout(20) << "EMetaBlob::add_dir_context(" << dir << ") reached ambig or !auth subtree, need " << maybe
+		   << " at " << *dir << dendl;
+	  // we need the maybe list after all!
+	  parents.splice(parents.begin(), maybe);
+	  maybenot = false;
+	}
+      }
+
+      // was the inode journaled in this blob?
+      if (event_seq && diri->last_journaled == event_seq) {
+	dout(20) << "EMetaBlob::add_dir_context(" << dir << ") already have diri this blob " << *diri << dendl;
+	break;
+      }
+
+      // have we journaled this inode since the last subtree map?
+      if (!maybenot && last_subtree_map && diri->last_journaled >= last_subtree_map) {
+	dout(20) << "EMetaBlob::add_dir_context(" << dir << ") already have diri in this segment (" 
+		 << diri->last_journaled << " >= " << last_subtree_map << "), setting maybenot flag "
+		 << *diri << dendl;
+	maybenot = true;
+      }
+    }
+
+    if (!parent)
+      break;
+
+    if (maybenot) {
+      dout(25) << "EMetaBlob::add_dir_context(" << dir << ")      maybe " << *parent << dendl;
+      maybe.push_front(parent);  // push_front keeps the list ordered outermost-first
+    } else {
+      dout(25) << "EMetaBlob::add_dir_context(" << dir << ") definitely " << *parent << dendl;
+      parents.push_front(parent);
+    }
+    
+    dir = parent->get_dir();  // continue with the dirfrag that holds the parent dentry
+  }
+  
+  parents.splice(parents.begin(), maybe);  // anything still in maybe (not cleared above) is added too
+
+  dout(20) << "EMetaBlob::add_dir_context final: " << parents << dendl;
+  for (const auto& dentry : parents) {
+    ceph_assert(dentry->get_projected_linkage()->is_primary());
+    add_dentry(dentry, false);
+  }
+}
+
+void EMetaBlob::update_segment(LogSegment *ls)
+{
+  // Copy this blob's table versions onto the segment; a version that
+  // tests false leaves the segment's value untouched.
+  if (inotablev) {
+    ls->inotablev = inotablev;
+  }
+  if (sessionmapv) {
+    ls->sessionmapv = sessionmapv;
+  }
+
+  // Not done here, per the original notes: dirty inode mtimes are handled
+  // directly by Server.cc and replay(); truncated inodes by Server.cc.
+
+  // Disabled code, kept as found: noting the newest request per client.
+  //if (!client_reqs.empty())
+    //    ls->last_client_tid[client_reqs.rbegin()->client] = client_reqs.rbegin()->tid);
+
+}
+
+// EMetaBlob::fullbit
+
+void EMetaBlob::fullbit::encode(bufferlist& bl, uint64_t features) const {  // Writes the dentry fields, the inode and its optional parts; decode() reads them in the same order.
+  ENCODE_START(9, 5, bl);
+  encode(dn, bl);
+  encode(dnfirst, bl);
+  encode(dnlast, bl);
+  encode(dnv, bl);
+  encode(*inode, bl, features);
+  if (xattrs)
+    encode(*xattrs, bl);
+  else
+    encode((__u32)0, bl);  // no xattr map: a zero __u32 is written in its place
+
+  if (inode->is_symlink())
+    encode(symlink, bl);
+  if (inode->is_dir()) {
+    encode(dirfragtree, bl);
+    encode(snapbl, bl);  // for directories snapbl is written here, before state
+  }
+  encode(state, bl);
+  if (!old_inodes || old_inodes->empty()) {
+    encode(false, bl);  // presence flag only
+  } else {
+    encode(true, bl);
+    encode(*old_inodes, bl, features);
+  }
+  if (!inode->is_dir())
+    encode(snapbl, bl);  // for non-directories snapbl follows the old inodes instead
+  encode(oldest_snap, bl);
+  encode(alternate_name, bl);
+  ENCODE_FINISH(bl);
+}
+
+void EMetaBlob::fullbit::decode(bufferlist::const_iterator &bl) {  // Reads the fields in the order encode() writes them, allocating the inode, xattr and old-inode objects.
+  DECODE_START(9, bl);
+  decode(dn, bl);
+  decode(dnfirst, bl);
+  decode(dnlast, bl);
+  decode(dnv, bl);
+  {
+    auto _inode = CInode::allocate_inode();  // decode into a fresh object, then install it
+    decode(*_inode, bl);
+    inode = std::move(_inode);
+  }
+  {
+    CInode::mempool_xattr_map tmp;
+    decode_noshare(tmp, bl);
+    if (!tmp.empty())  // xattrs is only assigned when the decoded map has entries
+      xattrs = CInode::allocate_xattr_map(std::move(tmp));
+  }
+  if (inode->is_symlink())
+    decode(symlink, bl);
+  if (inode->is_dir()) {
+    decode(dirfragtree, bl);
+    decode(snapbl, bl);
+  }
+  decode(state, bl);
+  bool old_inodes_present;
+  decode(old_inodes_present, bl);
+  if (old_inodes_present) {
+    auto _old_inodes = CInode::allocate_old_inode_map();
+    decode(*_old_inodes, bl);
+    old_inodes = std::move(_old_inodes);
+  }
+  if (!inode->is_dir()) {
+    decode(snapbl, bl);  // non-directories carry snapbl after the old inodes
+  }
+  decode(oldest_snap, bl);
+  if (struct_v >= 9) {  // alternate_name is only read from encodings with struct_v 9 or later
+    decode(alternate_name, bl);
+  }
+  DECODE_FINISH(bl);
+}
+
+void EMetaBlob::fullbit::dump(Formatter *f) const
+{
+  f->dump_string("dentry", dn);
+  f->dump_stream("snapid.first") << dnfirst;
+  f->dump_stream("snapid.last") << dnlast;
+  f->dump_int("dentry version", dnv);
+  f->open_object_section("inode");
+  inode->dump(f);
+  f->close_section(); // inode
+  f->open_object_section("xattrs");  // the section is emitted even when there are none
+  if (xattrs) {
+    for (const auto &attr : *xattrs) {
+      const std::string value(attr.second.c_str(), attr.second.length());
+      f->dump_string(attr.first.c_str(), value);
+    }
+  }
+  f->close_section(); // xattrs
+  if (inode->is_symlink())
+    f->dump_string("symlink", symlink);
+  if (inode->is_dir()) {
+    f->dump_stream("frag tree") << dirfragtree;
+    f->dump_string("has_snapbl", snapbl.length() ? "true" : "false");
+    if (inode->has_layout()) {
+      // FIXME: only records that a layout exists, not its contents
+      f->open_object_section("file layout policy");
+      f->dump_string("layout", "the layout exists");
+      f->close_section(); // file layout policy
+    }
+  }
+  f->dump_string("state", state_string());
+  const bool have_old_inodes = old_inodes && !old_inodes->empty();
+  if (have_old_inodes) {
+    f->open_array_section("old inodes");
+    for (const auto &old : *old_inodes) {
+      f->open_object_section("inode");
+      f->dump_int("snapid", old.first);
+      old.second.dump(f);
+      f->close_section(); // inode
+    }
+    f->close_section(); // old inodes
+  }
+  f->dump_string("alternate_name", alternate_name);
+}
+
+void EMetaBlob::fullbit::generate_test_instances(std::list<EMetaBlob::fullbit*>& ls)
+{
+  // A single sample built from freshly allocated, otherwise untouched parts.
+  bufferlist empty_snapbl;
+  fragtree_t fragtree;
+  auto _inode = CInode::allocate_inode();
+  auto _xattrs = CInode::allocate_xattr_map();
+  ls.push_back(new fullbit("/testdn", "", 0, 0, 0,
+                           _inode, fragtree, _xattrs, "", 0, empty_snapbl,
+                           false, NULL));
+}
+
+void EMetaBlob::fullbit::update_inode(MDSRank *mds, CInode *in)
+{ // Installs this bit's inode, xattrs, dirfragtree/symlink, old inodes and snap data into in, then sanity-checks file layouts.
+  in->reset_inode(std::move(inode));  // this bit's inode and xattrs are moved from, not copied
+  in->reset_xattrs(std::move(xattrs));
+  if (in->is_dir()) {
+    if (is_export_ephemeral_random()) {
+      dout(15) << "random ephemeral pin on " << *in << dendl;
+      in->set_ephemeral_pin(false, true);
+    }
+    in->maybe_export_pin();
+    if (!(in->dirfragtree == dirfragtree)) {
+      dout(10) << "EMetaBlob::fullbit::update_inode dft " << in->dirfragtree << " -> "
+	       << dirfragtree << " on " << *in << dendl;
+      in->dirfragtree = std::move(dirfragtree);  // moved only after the old and new trees were logged above
+      in->force_dirfrags();
+      if (in->get_num_dirfrags() && in->authority() == CDIR_AUTH_UNDEF) {
+	auto&& ls = in->get_nested_dirfrags();
+	for (const auto& dir : ls) {
+	  if (dir->get_num_any() == 0 &&
+	      mds->mdcache->can_trim_non_auth_dirfrag(dir)) {
+	    dout(10) << " closing empty non-auth dirfrag " << *dir << dendl;
+	    in->close_dirfrag(dir->get_frag());
+	  }
+	}
+      }
+    }
+  } else if (in->is_symlink()) {
+    in->symlink = symlink;
+  }
+  in->reset_old_inodes(std::move(old_inodes));
+  if (in->is_any_old_inodes()) {
+    snapid_t min_first = in->get_old_inodes()->rbegin()->first + 1;  // one past the last key of the old-inode map
+    if (min_first > in->first)
+      in->first = min_first;  // in->first is only ever raised here, never lowered
+  }
+
+  /*
+   * we can do this before linking the inode bc the split_at would
+   * be a no-op.. we have no children (namely open snaprealms) to
+   * divy up
+   */
+  in->oldest_snap = oldest_snap;
+  in->decode_snap_blob(snapbl);
+
+  /*
+   * In case there was anything malformed in the journal that we are
+   * replaying, do sanity checks on the inodes we're replaying and
+   * go damaged instead of letting any trash into a live cache
+   */
+  if (in->is_file()) {
+    // Files must have valid layouts with a pool set
+    if (in->get_inode()->layout.pool_id == -1 ||
+	!in->get_inode()->layout.is_valid()) {
+      dout(0) << "EMetaBlob.replay invalid layout on ino " << *in
+              << ": " << in->get_inode()->layout << dendl;
+      CachedStackStringStream css;
+      *css << "Invalid layout for inode " << in->ino() << " in journal";
+      mds->clog->error() << css->strv();
+      mds->damaged();
+      ceph_abort();  // Should be unreachable because damaged() calls respawn()
+    }
+  }
+}
+
+// EMetaBlob::remotebit
+
+void EMetaBlob::remotebit::encode(bufferlist& bl) const
+{ // Appends dn, dnfirst, dnlast, dnv, ino, d_type, dirty and alternate_name to bl, in that order.
+  ENCODE_START(3, 2, bl);  // NOTE(review): presumably (struct version, compat version) -- confirm against the encoding macros
+  encode(dn, bl);
+  encode(dnfirst, bl);
+  encode(dnlast, bl);
+  encode(dnv, bl);
+  encode(ino, bl);
+  encode(d_type, bl);
+  encode(dirty, bl);
+  encode(alternate_name, bl);
+  ENCODE_FINISH(bl);
+}
+
+void EMetaBlob::remotebit::decode(bufferlist::const_iterator &bl)
+{ // Reads the fields in the order encode() writes them.
+  DECODE_START_LEGACY_COMPAT_LEN(3, 2, 2, bl);
+  decode(dn, bl);
+  decode(dnfirst, bl);
+  decode(dnlast, bl);
+  decode(dnv, bl);
+  decode(ino, bl);
+  decode(d_type, bl);
+  decode(dirty, bl);
+  if (struct_v >= 3)  // alternate_name is only read from encodings with struct_v 3 or later
+    decode(alternate_name, bl);
+  DECODE_FINISH(bl);
+}
+
+void EMetaBlob::remotebit::dump(Formatter *f) const
+{ // Emits the remote dentry's fields, with d_type rendered as a file-type name ("unknown" if unrecognised).
+  f->dump_string("dentry", dn);
+  f->dump_int("snapid.first", dnfirst);
+  f->dump_int("snapid.last", dnlast);
+  f->dump_int("dentry version", dnv);
+  f->dump_int("inodeno", ino);
+  uint32_t type = DTTOIF(d_type) & S_IFMT; // convert to type entries
+  string type_string;
+  switch(type) {
+  case S_IFREG:
+    type_string = "file"; break;
+  case S_IFLNK:
+    type_string = "symlink"; break;
+  case S_IFDIR:
+    type_string = "directory"; break;
+  case S_IFIFO:
+    type_string = "fifo"; break;
+  case S_IFCHR:
+    type_string = "chr"; break;
+  case S_IFBLK:
+    type_string = "blk"; break;
+  case S_IFSOCK:
+    type_string = "sock"; break;
+  default:
+    type_string = "unknown"; break;  // e.g. DT_UNKNOWN: keep dumping rather than assert (debug) or emit "" (NDEBUG)
+  }
+  f->dump_string("d_type", type_string);
+  f->dump_string("dirty", dirty ? "true" : "false");
+  f->dump_string("alternate_name", alternate_name);
+}
+
+void EMetaBlob::remotebit::
+generate_test_instances(std::list<EMetaBlob::remotebit*>& ls)
+{
+  // Two samples with d_type IFTODT(S_IFREG); they differ only in the
+  // first two string arguments.
+  ls.push_back(new remotebit("/test/dn", "", 0, 10, 15, 1, IFTODT(S_IFREG), false));
+  ls.push_back(new remotebit("/test/dn2", "foo", 0, 10, 15, 1, IFTODT(S_IFREG), false));
+}
+
+// EMetaBlob::nullbit
+
+void EMetaBlob::nullbit::encode(bufferlist& bl) const
+{ // Appends dn, dnfirst, dnlast, dnv and dirty to bl, in that order.
+  ENCODE_START(2, 2, bl);  // NOTE(review): presumably (struct version, compat version) -- confirm against the encoding macros
+  encode(dn, bl);
+  encode(dnfirst, bl);
+  encode(dnlast, bl);
+  encode(dnv, bl);
+  encode(dirty, bl);
+  ENCODE_FINISH(bl);
+}
+
+void EMetaBlob::nullbit::decode(bufferlist::const_iterator &bl)
+{ // Reads dn, dnfirst, dnlast, dnv and dirty, in the order encode() writes them.
+  DECODE_START_LEGACY_COMPAT_LEN(2, 2, 2, bl);
+  decode(dn, bl);
+  decode(dnfirst, bl);
+  decode(dnlast, bl);
+  decode(dnv, bl);
+  decode(dirty, bl);
+  DECODE_FINISH(bl);
+}
+
+void EMetaBlob::nullbit::dump(Formatter *f) const
+{ // Emits the null dentry's fields; dirty is written as the string "true" or "false".
+  f->dump_string("dentry", dn);
+  f->dump_int("snapid.first", dnfirst);
+  f->dump_int("snapid.last", dnlast);
+  f->dump_int("dentry version", dnv);
+  f->dump_string("dirty", dirty ? "true" : "false");
+}
+
+void EMetaBlob::nullbit::generate_test_instances(std::list<nullbit*>& ls)
+{
+  // Two samples with different names and numeric arguments; the last
+  // argument is false for the first and true for the second.
+  ls.push_back(new nullbit("/test/dentry", 0, 10, 15, false));
+  ls.push_back(new nullbit("/test/dirty", 10, 20, 25, true));
+}
+
+// EMetaBlob::dirlump
+
+void EMetaBlob::dirlump::encode(bufferlist& bl, uint64_t features) const
+{ // Append this dirlump to bl: fnode, state, the three bit counts, then the dentry blob dnbl.
+  ENCODE_START(2, 2, bl);
+  encode(*fnode, bl);  // fnode is dereferenced unconditionally, so it must be set
+  encode(state, bl);
+  encode(nfull, bl);
+  encode(nremote, bl);
+  encode(nnull, bl);
+  _encode_bits(features);  // NOTE(review): presumably fills dnbl from the in-memory bits -- confirm
+  encode(dnbl, bl);
+  ENCODE_FINISH(bl);
+}
+
+void EMetaBlob::dirlump::decode(bufferlist::const_iterator &bl)
+{ // Read a dirlump from bl; dnbl is read as a blob and dn_decoded is cleared.
+  DECODE_START_LEGACY_COMPAT_LEN(2, 2, 2, bl)
+  { // decode into a freshly allocated fnode, then install it as this lump's fnode
+    auto _fnode = CDir::allocate_fnode();
+    decode(*_fnode, bl);
+    fnode = std::move(_fnode);
+  }
+  decode(state, bl);
+  decode(nfull, bl);
+  decode(nremote, bl);
+  decode(nnull, bl);
+  decode(dnbl, bl);
+  dn_decoded = false;      // don't decode bits unless we need them.
+  DECODE_FINISH(bl);
+}
+
+/**
+ * Dump this dirlump: the fnode, the state string, the three bit counts and
+ * then one array each for the full, remote and null bits.  The bits are
+ * decoded first if that has not happened yet, which is why constness is
+ * cast away.
+ */
+void EMetaBlob::dirlump::dump(Formatter *f) const
+{
+  if (!dn_decoded)
+    const_cast<dirlump*>(this)->_decode_bits();
+
+  f->open_object_section("fnode");
+  fnode->dump(f);
+  f->close_section();
+
+  f->dump_string("state", state_string());
+  f->dump_int("nfull", nfull);
+  f->dump_int("nremote", nremote);
+  f->dump_int("nnull", nnull);
+
+  // Each bit list becomes an array section holding one object per bit.
+  auto dump_bits = [f](const char *array_name, const char *item_name,
+                       const auto& bits) {
+    f->open_array_section(array_name);
+    for (const auto& bit : bits) {
+      f->open_object_section(item_name);
+      bit.dump(f);
+      f->close_section();
+    }
+    f->close_section();
+  };
+
+  dump_bits("full bits", "fullbit", dfull);
+  dump_bits("remote bits", "remotebit", dremote);
+  dump_bits("null bits", "null bit", dnull);
+}
+
+/**
+ * Append one default-constructed dirlump, given a freshly allocated fnode,
+ * to ls.
+ */
+void EMetaBlob::dirlump::generate_test_instances(std::list<dirlump*>& ls)
+{
+  dirlump *sample = new dirlump();
+  sample->fnode = CDir::allocate_fnode();
+  ls.push_back(sample);
+}
+
+/**
+ * EMetaBlob proper
+ *
+ * encode(): append this metablob to bl.  The fields are written in a fixed
+ * order that EMetaBlob::decode() reads back; `features` is forwarded only to
+ * the lump_map and roots encoders.
+ */
+void EMetaBlob::encode(bufferlist& bl, uint64_t features) const
+{
+  ENCODE_START(8, 5, bl);
+  encode(lump_order, bl);
+  encode(lump_map, bl, features);
+  encode(roots, bl, features);
+  encode(table_tids, bl);
+  encode(opened_ino, bl);
+  encode(allocated_ino, bl);
+  encode(used_preallocated_ino, bl);
+  encode(preallocated_inos, bl);
+  encode(client_name, bl);
+  encode(inotablev, bl);
+  encode(sessionmapv, bl);
+  encode(truncate_start, bl);
+  encode(truncate_finish, bl);
+  encode(destroyed_inodes, bl);
+  encode(client_reqs, bl);
+  encode(renamed_dirino, bl);
+  encode(renamed_dir_frags, bl);
+  {
+    // make MDSRank use v6 format happy: write a placeholder int64/bool pair, which decode() reads and discards when struct_v >= 6
+    int64_t i = -1;
+    bool b = false;
+    encode(i, bl);
+    encode(b, bl);
+  }
+  encode(client_flushes, bl);  // decode() reads this only when struct_v >= 8
+  ENCODE_FINISH(bl);
+}
+void EMetaBlob::decode(bufferlist::const_iterator &bl)
+{ // Read a metablob from bl; the struct_v checks below select the layout of older encodings.
+  DECODE_START_LEGACY_COMPAT_LEN(8, 5, 5, bl);
+  decode(lump_order, bl);
+  decode(lump_map, bl);
+  if (struct_v >= 4) {
+    decode(roots, bl);
+  } else { // before v4: one bufferlist that holds a single root when non-empty
+    bufferlist rootbl;
+    decode(rootbl, bl);
+    if (rootbl.length()) {
+      auto p = rootbl.cbegin();
+      roots.emplace_back(p);  // the root entry is constructed from the iterator
+    }
+  }
+  decode(table_tids, bl);
+  decode(opened_ino, bl);
+  decode(allocated_ino, bl);
+  decode(used_preallocated_ino, bl);
+  decode(preallocated_inos, bl);
+  decode(client_name, bl);
+  decode(inotablev, bl);
+  decode(sessionmapv, bl);
+  decode(truncate_start, bl);
+  decode(truncate_finish, bl);
+  decode(destroyed_inodes, bl);
+  if (struct_v >= 2) {
+    decode(client_reqs, bl);
+  } else { // before v2: a list of bare request ids; each is paired with 0
+    list<metareqid_t> r;
+    decode(r, bl);
+    while (!r.empty()) {
+	client_reqs.push_back(pair<metareqid_t,uint64_t>(r.front(), 0));
+	r.pop_front();
+    }
+  }
+  if (struct_v >= 3) {
+    decode(renamed_dirino, bl);
+    decode(renamed_dir_frags, bl);
+  }
+  if (struct_v >= 6) {
+    // ignore: read and discard the int64/bool pair that encode() writes as placeholders
+    int64_t i;
+    bool b;
+    decode(i, bl);
+    decode(b, bl);
+  }
+  if (struct_v >= 8) {
+    decode(client_flushes, bl);
+  }
+  DECODE_FINISH(bl);
+}
+
+
+/**
+ * Collect every inode number this metablob refers to: the directory inode
+ * of each dirlump, the inode of each full bit and the target inode of each
+ * remote bit.  Null bits are not visited.
+ */
+void EMetaBlob::get_inodes(
+    std::set<inodeno_t> &inodes) const
+{
+  for (const auto& [frag, lump] : lump_map) {
+    // The directory that owns this dirlump.
+    inodes.insert(frag.ino);
+
+    // The bit lists are only usable once the lump has been decoded.
+    lump._decode_bits();
+
+    for (const auto& fb : lump.get_dfull())
+      inodes.insert(fb.inode->ino);
+
+    for (const auto& rb : lump.get_dremote())
+      inodes.insert(rb.ino);
+  }
+}
+
+
+/**
+ * Fill `dentries` with, per dirfrag, the names of all dentries (full,
+ * remote and null) that this metablob carries for that dirfrag.  A dirfrag
+ * whose lump has no bits gets no entry in the map.
+ */
+void EMetaBlob::get_dentries(std::map<dirfrag_t, std::set<std::string> > &dentries) const
+{
+  for (const auto& entry : lump_map) {
+    const dirfrag_t& df = entry.first;
+    const dirlump& lump = entry.second;
+
+    lump._decode_bits();
+
+    // Index the map inside the loops so that an empty lump adds nothing.
+    for (const auto& fb : lump.get_dfull())
+      dentries[df].insert(fb.dn);
+    for (const auto& rb : lump.get_dremote())
+      dentries[df].insert(rb.dn);
+    for (const auto& nb : lump.get_dnull())
+      dentries[df].insert(nb.dn);
+  }
+}
+
+
+
+/**
+ * Work out the paths this metablob touches, using only what the blob itself
+ * contains; a path may therefore start part-way down the tree rather than
+ * at the filesystem root.  One path is appended to `paths` per leaf dentry.
+ */
+void EMetaBlob::get_paths(
+    std::vector<std::string> &paths) const
+{
+  // (parent directory inode, dentry name)
+  using Location = std::pair<inodeno_t, std::string>;
+
+  // A blob that touches only the root inode fills `roots` and has no lumps.
+  if (lump_map.empty() && !roots.empty()) {
+    paths.push_back("/");
+    return;
+  }
+
+  // directory inode -> names of the dentries seen beneath it
+  std::map<inodeno_t, std::vector<std::string> > children;
+  // inode -> where that inode is linked (known for full bits only)
+  std::map<inodeno_t, Location> ino_locations;
+
+  // Pass 1: index the hierarchy described by the lumps.
+  for (const auto& entry : lump_map) {
+    const inodeno_t dir_ino = entry.first.ino;
+    const dirlump& lump = entry.second;
+    lump._decode_bits();
+
+    for (const auto& fb : lump.get_dfull()) {
+      children[dir_ino].emplace_back(fb.dn);
+      ino_locations[fb.inode->ino] = Location(dir_ino, std::string(fb.dn));
+    }
+    for (const auto& rb : lump.get_dremote())
+      children[dir_ino].emplace_back(rb.dn);
+    for (const auto& nb : lump.get_dnull())
+      children[dir_ino].emplace_back(nb.dn);
+  }
+
+  // Pass 2: pick the leaves.  A full bit is a leaf when nothing was recorded
+  // beneath its inode; remote and null bits are always leaves.
+  std::vector<Location> leaf_locations;
+  for (const auto& entry : lump_map) {
+    const inodeno_t dir_ino = entry.first.ino;
+    const dirlump& lump = entry.second;
+    lump._decode_bits();
+
+    for (const auto& fb : lump.get_dfull()) {
+      if (children.count(fb.inode->ino) == 0)
+        leaf_locations.emplace_back(dir_ino, fb.dn);
+    }
+    for (const auto& rb : lump.get_dremote())
+      leaf_locations.emplace_back(dir_ino, rb.dn);
+    for (const auto& nb : lump.get_dnull())
+      leaf_locations.emplace_back(dir_ino, nb.dn);
+  }
+
+  // Walk each leaf upwards through the known locations, prepending one
+  // component per step, until an inode with no recorded location is reached.
+  for (const Location& leaf : leaf_locations) {
+    std::string path = leaf.second;
+    auto up = ino_locations.find(leaf.first);
+    while (up != ino_locations.end()) {
+      const Location& parent = up->second;
+      if (path.empty())
+        path = parent.second;
+      else
+        path = parent.second + "/" + path;
+      up = ino_locations.find(parent.first);
+    }
+    paths.push_back(path);
+  }
+}
+
+
+/**
+ * Dump this metablob through the formatter: the dirlumps in lump_order, the
+ * roots, table client transactions, rename information, table versions,
+ * inode allocations, the client name, truncates, destroyed inodes and
+ * client requests.
+ */
+void EMetaBlob::dump(Formatter *f) const
+{
+  // Dirlumps, in the order they were journaled.
+  f->open_array_section("lumps");
+  for (const auto& frag : lump_order) {
+    f->open_object_section("lump");
+
+    f->open_object_section("dirfrag");
+    f->dump_stream("dirfrag") << frag;
+    f->close_section();
+
+    f->open_object_section("dirlump");
+    lump_map.at(frag).dump(f);
+    f->close_section();
+
+    f->close_section();
+  }
+  f->close_section();
+
+  // Root inodes.
+  f->open_array_section("roots");
+  for (const auto& root : roots) {
+    f->open_object_section("root");
+    root.dump(f);
+    f->close_section();
+  }
+  f->close_section();
+
+  // Table client transactions.
+  f->open_array_section("tableclient tranactions");
+  for (const auto& [tid, version] : table_tids) {
+    f->open_object_section("transaction");
+    f->dump_int("tid", tid);
+    f->dump_int("version", version);
+    f->close_section();
+  }
+  f->close_section();
+
+  // Directory rename.
+  f->dump_int("renamed directory inodeno", renamed_dirino);
+  f->open_array_section("renamed directory fragments");
+  for (const auto& frag : renamed_dir_frags)
+    f->dump_int("frag", frag);
+  f->close_section();
+
+  // Table versions and inode allocation.
+  f->dump_int("inotable version", inotablev);
+  f->dump_int("SessionMap version", sessionmapv);
+  f->dump_int("allocated ino", allocated_ino);
+  f->dump_stream("preallocated inos") << preallocated_inos;
+  f->dump_int("used preallocated ino", used_preallocated_ino);
+
+  f->open_object_section("client name");
+  client_name.dump(f);
+  f->close_section();
+
+  // Truncates that start, then truncates that finish.
+  f->open_array_section("inodes starting a truncate");
+  for (const auto& started : truncate_start)
+    f->dump_int("inodeno", started);
+  f->close_section();
+
+  f->open_array_section("inodes finishing a truncated");
+  for (const auto& [ino, segment] : truncate_finish) {
+    f->open_object_section("inode+segment");
+    f->dump_int("inodeno", ino);
+    f->dump_int("truncate starting segment", segment);
+    f->close_section();
+  }
+  f->close_section();
+
+  // Destroyed inodes.
+  f->open_array_section("destroyed inodes");
+  for (const auto& gone : destroyed_inodes)
+    f->dump_int("inodeno", gone);
+  f->close_section();
+
+  // Client requests.
+  f->open_array_section("client requests");
+  for (const auto& [reqid, oldest] : client_reqs) {
+    f->open_object_section("Client request");
+    f->dump_stream("request ID") << reqid;
+    f->dump_int("oldest request on client", oldest);
+    f->close_section();
+  }
+  f->close_section();
+}
+
+/**
+ * Append a single default-constructed EMetaBlob to ls.
+ */
+void EMetaBlob::generate_test_instances(std::list<EMetaBlob*>& ls)
+{
+  EMetaBlob *blank = new EMetaBlob();
+  ls.push_back(blank);
+}
+
+void EMetaBlob::replay(MDSRank *mds, LogSegment *logseg, MDPeerUpdate *peerup)
+{ /*
+   * Apply this metablob to the MDS in-memory state during journal replay.
+   *
+   * Steps, in order: root inodes; every dirlump in lump_order (directory
+   * fnode and state, then its full, remote and null dentries); subtree
+   * adjustments for a renamed directory; handling of inodes that were
+   * unlinked and not relinked; table client transactions; the opened inode;
+   * inotable and sessionmap versions; truncates; destroyed inodes; completed
+   * client requests and flushes; and finally update_segment(logseg).
+   *
+   * mds    - rank whose mdcache, inotable, sessionmap, mdlog, ... are updated
+   * logseg - segment that dirtied items are attached to; asserted non-null
+   * peerup - may be null; when set, old dirs and unlinked inodes are recorded
+   *          in it instead of being trimmed or removed here
+   *
+   * `count` tallies processed items; mds->heartbeat_reset() is called each
+   * time it reaches a multiple of mds->heartbeat_reset_grace().
+   */
+  dout(10) << "EMetaBlob.replay " << lump_map.size() << " dirlumps by " << client_name << dendl;
+
+  ceph_assert(logseg);
+
+  ceph_assert(g_conf()->mds_kill_journal_replay_at != 1);  // asserts when the option is set to 1
+
+  for (auto& p : roots) {
+    CInode *in = mds->mdcache->get_inode(p.inode->ino);
+    bool isnew = in ? false:true;  // true when the root is not in the cache yet
+    if (!in)
+      in = new CInode(mds->mdcache, false, 2, CEPH_NOSNAP);
+    p.update_inode(mds, in);
+
+    if (isnew)
+      mds->mdcache->add_inode(in);
+    if (p.is_dirty()) in->_mark_dirty(logseg);
+    dout(10) << "EMetaBlob.replay " << (isnew ? " added root ":" updated root ") << *in << dendl;    
+  }
+
+  CInode *renamed_diri = 0;
+  CDir *olddir = 0;  // set while replaying null dentries; may be overridden from `unlinked` below
+  if (renamed_dirino) {
+    renamed_diri = mds->mdcache->get_inode(renamed_dirino);
+    if (renamed_diri)
+      dout(10) << "EMetaBlob.replay renamed inode is " << *renamed_diri << dendl;
+    else
+      dout(10) << "EMetaBlob.replay don't have renamed ino " << renamed_dirino << dendl;
+
+    int nnull = 0;
+    for (const auto& lp : lump_order) {
+      dirlump &lump = lump_map[lp];
+      if (lump.nnull) {
+	dout(10) << "EMetaBlob.replay found null dentry in dir " << lp << dendl;
+	nnull += lump.nnull;
+      }
+    }
+    ceph_assert(nnull <= 1);  // a directory-rename blob carries at most one null dentry in total
+  }
+
+  // keep track of any inodes we unlink and don't relink elsewhere
+  map<CInode*, CDir*> unlinked;  // inode -> dir it was unlinked from
+  set<CInode*> linked;           // inodes from `unlinked` that were linked again
+
+  // walk through my dirs (in order!)
+  int count = 0;
+  for (const auto& lp : lump_order) {
+    dout(10) << "EMetaBlob.replay dir " << lp << dendl;
+    dirlump &lump = lump_map[lp];
+
+    // the dir 
+    CDir *dir = mds->mdcache->get_force_dirfrag(lp, true);
+    if (!dir) {
+      // hmm.  do i have the inode?
+      CInode *diri = mds->mdcache->get_inode((lp).ino);
+      if (!diri) {
+	if (MDS_INO_IS_MDSDIR(lp.ino)) {
+	  ceph_assert(MDS_INO_MDSDIR(mds->get_nodeid()) != lp.ino);
+	  diri = mds->mdcache->create_system_inode(lp.ino, S_IFDIR|0755);
+	  diri->state_clear(CInode::STATE_AUTH);
+	  dout(10) << "EMetaBlob.replay created base " << *diri << dendl;
+	} else {
+	  dout(0) << "EMetaBlob.replay missing dir ino  " << lp.ino << dendl;
+          mds->clog->error() << "failure replaying journal (EMetaBlob)";
+          mds->damaged();
+          ceph_abort();  // Should be unreachable because damaged() calls respawn()
+	}
+      }
+
+      // create the dirfrag
+      dir = diri->get_or_open_dirfrag(mds->mdcache, lp.frag);
+
+      if (MDS_INO_IS_BASE(lp.ino))
+	mds->mdcache->adjust_subtree_auth(dir, CDIR_AUTH_UNDEF);
+
+      dout(10) << "EMetaBlob.replay added dir " << *dir << dendl;  
+    }
+    dir->reset_fnode(std::move(lump.fnode));  // lump.fnode is moved from here
+    dir->update_projected_version();
+
+    if (lump.is_importing()) {
+      dir->state_set(CDir::STATE_AUTH);
+      dir->state_clear(CDir::STATE_COMPLETE);
+    }
+    if (lump.is_dirty()) {
+      dir->_mark_dirty(logseg);
+
+      if (!(dir->get_fnode()->rstat == dir->get_fnode()->accounted_rstat)) {
+	dout(10) << "EMetaBlob.replay      dirty nestinfo on " << *dir << dendl;
+	mds->locker->mark_updated_scatterlock(&dir->inode->nestlock);
+	logseg->dirty_dirfrag_nest.push_back(&dir->inode->item_dirty_dirfrag_nest);
+      } else {
+	dout(10) << "EMetaBlob.replay      clean nestinfo on " << *dir << dendl;
+      }
+      if (!(dir->get_fnode()->fragstat == dir->get_fnode()->accounted_fragstat)) {
+	dout(10) << "EMetaBlob.replay      dirty fragstat on " << *dir << dendl;
+	mds->locker->mark_updated_scatterlock(&dir->inode->filelock);
+	logseg->dirty_dirfrag_dir.push_back(&dir->inode->item_dirty_dirfrag_dir);
+      } else {
+	dout(10) << "EMetaBlob.replay      clean fragstat on " << *dir << dendl;
+      }
+    }
+    if (lump.is_dirty_dft()) {
+      dout(10) << "EMetaBlob.replay      dirty dirfragtree on " << *dir << dendl;
+      dir->state_set(CDir::STATE_DIRTYDFT);
+      mds->locker->mark_updated_scatterlock(&dir->inode->dirfragtreelock);
+      logseg->dirty_dirfrag_dirfragtree.push_back(&dir->inode->item_dirty_dirfrag_dirfragtree);
+    }
+    if (lump.is_new())
+      dir->mark_new(logseg);
+    if (lump.is_complete())
+      dir->mark_complete();
+    
+    dout(10) << "EMetaBlob.replay updated dir " << *dir << dendl;  
+
+    // decode bits
+    lump._decode_bits();
+
+    // full dentry+inode pairs
+    for (auto& fb : lump._get_dfull()) {
+      CDentry *dn = dir->lookup_exact_snap(fb.dn, fb.dnlast);
+      if (!dn) {
+	dn = dir->add_null_dentry(fb.dn, fb.dnfirst, fb.dnlast);
+	dn->set_version(fb.dnv);
+	if (fb.is_dirty()) dn->_mark_dirty(logseg);
+	dout(10) << "EMetaBlob.replay added (full) " << *dn << dendl;
+      } else {
+	dn->set_version(fb.dnv);
+	if (fb.is_dirty()) dn->_mark_dirty(logseg);
+	dout(10) << "EMetaBlob.replay for [" << fb.dnfirst << "," << fb.dnlast << "] had " << *dn << dendl;
+	dn->first = fb.dnfirst;
+	ceph_assert(dn->last == fb.dnlast);
+      }
+      if (lump.is_importing())
+	dn->state_set(CDentry::STATE_AUTH);
+
+      CInode *in = mds->mdcache->get_inode(fb.inode->ino, fb.dnlast);
+      if (!in) {  // inode not cached: create it and link it under dn
+	in = new CInode(mds->mdcache, dn->is_auth(), fb.dnfirst, fb.dnlast);
+	fb.update_inode(mds, in);
+	mds->mdcache->add_inode(in);
+	if (!dn->get_linkage()->is_null()) {
+	  if (dn->get_linkage()->is_primary()) {
+	    unlinked[dn->get_linkage()->get_inode()] = dir;
+	    CachedStackStringStream css;
+	    *css << "EMetaBlob.replay FIXME had dentry linked to wrong inode " << *dn
+	       << " " << *dn->get_linkage()->get_inode() << " should be " << in->ino();
+	    dout(0) << css->strv() << dendl;
+	    mds->clog->warn() << css->strv();
+	  }
+	  dir->unlink_inode(dn, false);
+	}
+	if (unlinked.count(in))
+	  linked.insert(in);
+	dir->link_primary_inode(dn, in);
+	dout(10) << "EMetaBlob.replay added " << *in << dendl;
+      } else {  // inode already cached: update it and move it under dn if it lives elsewhere
+	in->first = fb.dnfirst;
+	fb.update_inode(mds, in);
+	if (dn->get_linkage()->get_inode() != in && in->get_parent_dn()) {
+	  dout(10) << "EMetaBlob.replay unlinking " << *in << dendl;
+	  unlinked[in] = in->get_parent_dir();
+	  in->get_parent_dir()->unlink_inode(in->get_parent_dn());
+	}
+	if (dn->get_linkage()->get_inode() != in) {
+	  if (!dn->get_linkage()->is_null()) { // note: might be remote.  as with stray reintegration.
+	    if (dn->get_linkage()->is_primary()) {
+	      unlinked[dn->get_linkage()->get_inode()] = dir;
+	      CachedStackStringStream css;
+	      *css << "EMetaBlob.replay FIXME had dentry linked to wrong inode " << *dn
+		 << " " << *dn->get_linkage()->get_inode() << " should be " << in->ino();
+	      dout(0) << css->strv() << dendl;
+	      mds->clog->warn() << css->strv();
+	    }
+	    dir->unlink_inode(dn, false);
+	  }
+	  if (unlinked.count(in))
+	    linked.insert(in);
+	  dir->link_primary_inode(dn, in);
+	  dout(10) << "EMetaBlob.replay linked " << *in << dendl;
+	} else {
+	  dout(10) << "EMetaBlob.replay for [" << fb.dnfirst << "," << fb.dnlast << "] had " << *in << dendl;
+	}
+	ceph_assert(in->first == fb.dnfirst ||
+	       (in->is_multiversion() && in->first > fb.dnfirst));
+      }
+      if (fb.is_dirty())
+	in->_mark_dirty(logseg);
+      if (fb.is_dirty_parent())
+	in->mark_dirty_parent(logseg, fb.is_dirty_pool());
+      if (fb.need_snapflush())
+	logseg->open_files.push_back(&in->item_open_file);
+      if (dn->is_auth())
+	in->state_set(CInode::STATE_AUTH);
+      else
+	in->state_clear(CInode::STATE_AUTH);
+      ceph_assert(g_conf()->mds_kill_journal_replay_at != 2);  // asserts when the option is set to 2
+
+      if (!(++count % mds->heartbeat_reset_grace()))
+        mds->heartbeat_reset();
+    }
+
+    // remote dentries
+    for (const auto& rb : lump.get_dremote()) {
+      CDentry *dn = dir->lookup_exact_snap(rb.dn, rb.dnlast);
+      if (!dn) {
+	dn = dir->add_remote_dentry(rb.dn, rb.ino, rb.d_type, mempool::mds_co::string(rb.alternate_name), rb.dnfirst, rb.dnlast);
+	dn->set_version(rb.dnv);
+	if (rb.dirty) dn->_mark_dirty(logseg);
+	dout(10) << "EMetaBlob.replay added " << *dn << dendl;
+      } else {
+	if (!dn->get_linkage()->is_null()) {
+	  dout(10) << "EMetaBlob.replay unlinking " << *dn << dendl;
+	  if (dn->get_linkage()->is_primary()) {
+	    unlinked[dn->get_linkage()->get_inode()] = dir;
+	    CachedStackStringStream css;
+	    *css << "EMetaBlob.replay FIXME had dentry linked to wrong inode " << *dn
+	       << " " << *dn->get_linkage()->get_inode() << " should be remote " << rb.ino;
+	    dout(0) << css->strv() << dendl;  // unlike the full-bit paths, this one is not sent to clog
+	  }
+	  dir->unlink_inode(dn, false);
+	}
+        dn->set_alternate_name(mempool::mds_co::string(rb.alternate_name));
+	dir->link_remote_inode(dn, rb.ino, rb.d_type);
+	dn->set_version(rb.dnv);
+	if (rb.dirty) dn->_mark_dirty(logseg);
+	dout(10) << "EMetaBlob.replay for [" << rb.dnfirst << "," << rb.dnlast << "] had " << *dn << dendl;
+	dn->first = rb.dnfirst;
+	ceph_assert(dn->last == rb.dnlast);
+      }
+      if (lump.is_importing())
+	dn->state_set(CDentry::STATE_AUTH);
+
+      if (!(++count % mds->heartbeat_reset_grace()))
+        mds->heartbeat_reset();
+    }
+
+    // null dentries
+    for (const auto& nb : lump.get_dnull()) {
+      CDentry *dn = dir->lookup_exact_snap(nb.dn, nb.dnlast);
+      if (!dn) {
+	dn = dir->add_null_dentry(nb.dn, nb.dnfirst, nb.dnlast);
+	dn->set_version(nb.dnv);
+	if (nb.dirty) dn->_mark_dirty(logseg);
+	dout(10) << "EMetaBlob.replay added (nullbit) " << *dn << dendl;
+      } else {
+	dn->first = nb.dnfirst;
+	if (!dn->get_linkage()->is_null()) {
+	  dout(10) << "EMetaBlob.replay unlinking " << *dn << dendl;
+	  CInode *in = dn->get_linkage()->get_inode();
+	  // For renamed inode, We may call CInode::force_dirfrag() later.
+	  // CInode::force_dirfrag() doesn't work well when inode is detached
+	  // from the hierarchy.
+	  if (!renamed_diri || renamed_diri != in) {
+	    if (dn->get_linkage()->is_primary())
+	      unlinked[in] = dir;
+	    dir->unlink_inode(dn);
+	  }
+	}
+	dn->set_version(nb.dnv);
+	if (nb.dirty) dn->_mark_dirty(logseg);
+	dout(10) << "EMetaBlob.replay had " << *dn << dendl;
+	ceph_assert(dn->last == nb.dnlast);
+      }
+      olddir = dir;  // remember the dir that held the (last) null dentry
+      if (lump.is_importing())
+	dn->state_set(CDentry::STATE_AUTH);
+
+      // Make null dentries the first things we trim
+      dout(10) << "EMetaBlob.replay pushing to bottom of lru " << *dn << dendl;  // NOTE(review): only a log line; no LRU call is made in this function
+
+      if (!(++count % mds->heartbeat_reset_grace()))
+        mds->heartbeat_reset();
+    }
+  }
+
+  ceph_assert(g_conf()->mds_kill_journal_replay_at != 3);  // asserts when the option is set to 3
+
+  if (renamed_dirino) {
+    if (renamed_diri) {
+      ceph_assert(unlinked.count(renamed_diri));
+      ceph_assert(linked.count(renamed_diri));
+      olddir = unlinked[renamed_diri];
+    } else {
+      // we imported a diri we haven't seen before
+      renamed_diri = mds->mdcache->get_inode(renamed_dirino);
+      ceph_assert(renamed_diri);  // it was in the metablob
+    }
+
+    if (olddir) {
+      if (olddir->authority() != CDIR_AUTH_UNDEF &&
+	  renamed_diri->authority() == CDIR_AUTH_UNDEF) {
+	ceph_assert(peerup); // auth to non-auth, must be peer prepare
+        frag_vec_t leaves;
+	renamed_diri->dirfragtree.get_leaves(leaves);
+	for (const auto& leaf : leaves) {
+	  CDir *dir = renamed_diri->get_dirfrag(leaf);
+	  ceph_assert(dir);
+	  if (dir->get_dir_auth() == CDIR_AUTH_UNDEF)
+	    // preserve subtree bound until peer commit
+	    peerup->olddirs.insert(dir->inode);
+	  else
+	    dir->state_set(CDir::STATE_AUTH);
+
+          if (!(++count % mds->heartbeat_reset_grace()))
+            mds->heartbeat_reset();
+	}
+      }
+
+      mds->mdcache->adjust_subtree_after_rename(renamed_diri, olddir, false);
+      
+      // see if we can discard the subtree we renamed out of
+      CDir *root = mds->mdcache->get_subtree_root(olddir);
+      if (root->get_dir_auth() == CDIR_AUTH_UNDEF) {
+	if (peerup) // preserve the old dir until peer commit
+	  peerup->olddirs.insert(olddir->inode);
+	else
+	  mds->mdcache->try_trim_non_auth_subtree(root);
+      }
+    }
+
+    // if we are the srci importer, we'll also have some dirfrags we have to open up...
+    if (renamed_diri->authority() != CDIR_AUTH_UNDEF) {
+      for (const auto& p : renamed_dir_frags) {
+	CDir *dir = renamed_diri->get_dirfrag(p);
+	if (dir) {
+	  // we already had the inode before, and we already adjusted this subtree accordingly.
+	  dout(10) << " already had+adjusted rename import bound " << *dir << dendl;
+	  ceph_assert(olddir); 
+	  continue;
+	}
+	dir = renamed_diri->get_or_open_dirfrag(mds->mdcache, p);
+	dout(10) << " creating new rename import bound " << *dir << dendl;
+	dir->state_clear(CDir::STATE_AUTH);
+	mds->mdcache->adjust_subtree_auth(dir, CDIR_AUTH_UNDEF);
+
+        if (!(++count % mds->heartbeat_reset_grace()))
+          mds->heartbeat_reset();
+      }
+    }
+
+    // rename may overwrite an empty directory and move it into stray dir.
+    unlinked.erase(renamed_diri);
+    for (map<CInode*, CDir*>::iterator p = unlinked.begin(); p != unlinked.end(); ++p) {
+      if (!linked.count(p->first))
+	continue;
+      ceph_assert(p->first->is_dir());
+      mds->mdcache->adjust_subtree_after_rename(p->first, p->second, false);
+
+      if (!(++count % mds->heartbeat_reset_grace()))
+        mds->heartbeat_reset();
+    }
+  }
+
+  if (!unlinked.empty()) {
+    for (set<CInode*>::iterator p = linked.begin(); p != linked.end(); ++p)
+      unlinked.erase(*p);  // relinked inodes are no longer considered unlinked
+    dout(10) << " unlinked set contains " << unlinked << dendl;
+    for (map<CInode*, CDir*>::iterator p = unlinked.begin(); p != unlinked.end(); ++p) {
+      CInode *in = p->first;
+      if (peerup) { // preserve unlinked inodes until peer commit
+	peerup->unlinked.insert(in);
+	if (in->snaprealm)
+	  in->snaprealm->adjust_parent();
+      } else
+	mds->mdcache->remove_inode_recursive(in);
+
+      if (!(++count % mds->heartbeat_reset_grace()))
+        mds->heartbeat_reset();
+    }
+  }
+
+  // table client transactions
+  for (const auto& p : table_tids) {
+    dout(10) << "EMetaBlob.replay noting " << get_mdstable_name(p.first)
+	     << " transaction " << p.second << dendl;
+    MDSTableClient *client = mds->get_table_client(p.first);
+    if (client)
+      client->got_journaled_agree(p.second, logseg);
+
+    if (!(++count % mds->heartbeat_reset_grace()))
+      mds->heartbeat_reset();
+  }
+
+  // opened ino?
+  if (opened_ino) {
+    CInode *in = mds->mdcache->get_inode(opened_ino);
+    ceph_assert(in);
+    dout(10) << "EMetaBlob.replay noting opened inode " << *in << dendl;
+    logseg->open_files.push_back(&in->item_open_file);
+  }
+
+  // allocated_inos
+  if (inotablev) {
+    if (mds->inotable->get_version() >= inotablev) {  // table already at or past this event: nothing to replay
+      dout(10) << "EMetaBlob.replay inotable tablev " << inotablev
+	       << " <= table " << mds->inotable->get_version() << dendl;
+    } else {
+      dout(10) << "EMetaBlob.replay inotable v " << inotablev
+	       << " - 1 == table " << mds->inotable->get_version()
+	       << " allocated+used " << allocated_ino
+	       << " prealloc " << preallocated_inos
+	       << dendl;
+      if (allocated_ino)
+	mds->inotable->replay_alloc_id(allocated_ino);
+      if (preallocated_inos.size())
+	mds->inotable->replay_alloc_ids(preallocated_inos);
+
+      // [repair bad inotable updates]
+      if (inotablev > mds->inotable->get_version()) {
+	mds->clog->error() << "journal replay inotablev mismatch "
+	    << mds->inotable->get_version() << " -> " << inotablev;
+	mds->inotable->force_replay_version(inotablev);
+      }
+
+      ceph_assert(inotablev == mds->inotable->get_version());
+    }
+  }
+  if (sessionmapv) {
+    unsigned diff = (used_preallocated_ino && !preallocated_inos.empty()) ? 2 : 1;  // expected version step: 2 when an ino was both used and preallocated, else 1
+    if (mds->sessionmap.get_version() >= sessionmapv) {
+      dout(10) << "EMetaBlob.replay sessionmap v " << sessionmapv
+	       << " <= table " << mds->sessionmap.get_version() << dendl;
+    } else if (mds->sessionmap.get_version() + diff == sessionmapv) {
+      dout(10) << "EMetaBlob.replay sessionmap v " << sessionmapv
+	       << " - " << diff << " == table " << mds->sessionmap.get_version()
+	       << " prealloc " << preallocated_inos
+	       << " used " << used_preallocated_ino
+	       << dendl;
+      Session *session = mds->sessionmap.get_session(client_name);
+      if (session) {
+	dout(20) << " (session prealloc " << session->info.prealloc_inos << ")" << dendl;
+	if (used_preallocated_ino) {
+	  if (!session->info.prealloc_inos.empty()) {
+	    inodeno_t ino = session->take_ino(used_preallocated_ino);
+	    session->info.prealloc_inos.erase(ino);
+	    ceph_assert(ino == used_preallocated_ino);
+	  }
+          mds->sessionmap.replay_dirty_session(session);
+	}
+	if (!preallocated_inos.empty()) {
+	  session->free_prealloc_inos.insert(preallocated_inos);
+	  session->info.prealloc_inos.insert(preallocated_inos);
+          mds->sessionmap.replay_dirty_session(session);
+	}
+
+      } else {  // no session: only advance the sessionmap version, once per would-be update
+	dout(10) << "EMetaBlob.replay no session for " << client_name << dendl;
+	if (used_preallocated_ino)
+	  mds->sessionmap.replay_advance_version();
+
+	if (!preallocated_inos.empty())
+	  mds->sessionmap.replay_advance_version();
+      }
+      ceph_assert(sessionmapv == mds->sessionmap.get_version());
+    } else {
+      mds->clog->error() << "EMetaBlob.replay sessionmap v " << sessionmapv
+			 << " - " << diff << " > table " << mds->sessionmap.get_version();
+      ceph_assert(g_conf()->mds_wipe_sessions);  // a version gap is tolerated only when mds_wipe_sessions is set
+      mds->sessionmap.wipe();
+      mds->sessionmap.set_version(sessionmapv);
+    }
+  }
+
+  // truncating inodes
+  for (const auto& ino : truncate_start) {
+    CInode *in = mds->mdcache->get_inode(ino);
+    ceph_assert(in);
+    mds->mdcache->add_recovered_truncate(in, logseg);
+
+    if (!(++count % mds->heartbeat_reset_grace()))
+      mds->heartbeat_reset();
+  }
+  for (const auto& p : truncate_finish) {
+    LogSegment *ls = mds->mdlog->get_segment(p.second);
+    if (ls) {  // skipped when the starting segment is no longer known to mdlog
+      CInode *in = mds->mdcache->get_inode(p.first);
+      ceph_assert(in);
+      mds->mdcache->remove_recovered_truncate(in, ls);
+    }
+
+    if (!(++count % mds->heartbeat_reset_grace()))
+      mds->heartbeat_reset();
+  }
+
+  // destroyed inodes
+  if (!destroyed_inodes.empty()) {
+    for (vector<inodeno_t>::iterator p = destroyed_inodes.begin();
+	p != destroyed_inodes.end();
+	++p) {
+      CInode *in = mds->mdcache->get_inode(*p);
+      if (in) {
+	dout(10) << "EMetaBlob.replay destroyed " << *p << ", dropping " << *in << dendl;
+	CDentry *parent = in->get_parent_dn();  // fetched before remove_inode(); checked for a null linkage afterwards
+	mds->mdcache->remove_inode(in);
+	if (parent) {
+	  dout(10) << "EMetaBlob.replay unlinked from dentry " << *parent << dendl;
+	  ceph_assert(parent->get_linkage()->is_null());
+	}
+      } else {
+	dout(10) << "EMetaBlob.replay destroyed " << *p << ", not in cache" << dendl;
+      }
+
+      if (!(++count % mds->heartbeat_reset_grace()))
+        mds->heartbeat_reset();
+    }
+    mds->mdcache->open_file_table.note_destroyed_inos(logseg->seq, destroyed_inodes);
+  }
+
+  // client requests
+  for (const auto& p : client_reqs) {
+    if (p.first.name.is_client()) {
+      dout(10) << "EMetaBlob.replay request " << p.first << " trim_to " << p.second << dendl;
+      inodeno_t created = allocated_ino ? allocated_ino : used_preallocated_ino;
+      // if we allocated an inode, there should be exactly one client request id.
+      ceph_assert(created == inodeno_t() || client_reqs.size() == 1);
+
+      Session *session = mds->sessionmap.get_session(p.first.name);
+      if (session) {
+	session->add_completed_request(p.first.tid, created);
+	if (p.second)
+	  session->trim_completed_requests(p.second);
+      }
+    }
+
+    if (!(++count % mds->heartbeat_reset_grace()))
+      mds->heartbeat_reset();
+  }
+
+  // client flushes
+  for (const auto& p : client_flushes) {
+    if (p.first.name.is_client()) {
+      dout(10) << "EMetaBlob.replay flush " << p.first << " trim_to " << p.second << dendl;
+      Session *session = mds->sessionmap.get_session(p.first.name);
+      if (session) {
+	session->add_completed_flush(p.first.tid);
+	if (p.second)
+	  session->trim_completed_flushes(p.second);
+      }
+    }
+
+    if (!(++count % mds->heartbeat_reset_grace()))
+      mds->heartbeat_reset();
+  }
+
+  // update segment
+  update_segment(logseg);
+
+  ceph_assert(g_conf()->mds_kill_journal_replay_at != 4);  // asserts when the option is set to 4
+}
+
+// -----------------------
+// EPurged
+// Stamp this event's inotable version onto its log segment, but only when
+// the event carries purged inodes and a non-zero inotable version.
+void EPurged::update_segment()
+{
+  if (!inos.size() || !inotablev) {
+    return;
+  }
+  get_segment()->inotablev = inotablev;
+}
+
+// Replay a purge event.  When the event lists inodes:
+//  - remove them from the purging_inodes set of the segment identified by
+//    `seq` (if that segment can still be looked up), and
+//  - if the inotable version is behind `inotablev`, release the inode
+//    numbers via replay_release_ids() and verify the table reaches
+//    `inotablev`.
+// The event's own segment is always updated at the end.
+void EPurged::replay(MDSRank *mds)
+{
+  if (inos.size()) {
+    LogSegment *ls = mds->mdlog->get_segment(seq);
+    if (ls)
+      ls->purging_inodes.subtract(inos);
+
+    if (mds->inotable->get_version() >= inotablev) {
+      dout(10) << "EPurged.replay inotable " << mds->inotable->get_version()
+	       << " >= " << inotablev << ", noop" << dendl;
+    } else {
+      dout(10) << "EPurged.replay inotable " << mds->inotable->get_version()
+	       << " < " << inotablev << " " << dendl;
+      mds->inotable->replay_release_ids(inos);
+      // Use ceph_assert like the equivalent inotable check in
+      // ESession::replay; a plain assert() is compiled out under NDEBUG.
+      ceph_assert(mds->inotable->get_version() == inotablev);
+    }
+  }
+  update_segment();
+}
+
+void EPurged::encode(bufferlist& bl, uint64_t features) const  // serialize into bl; `features` is not referenced in this body
+{
+  ENCODE_START(1, 1, bl);
+  encode(inos, bl);
+  encode(inotablev, bl);
+  encode(seq, bl);  // field order mirrors EPurged::decode
+  ENCODE_FINISH(bl);
+}
+
+void EPurged::decode(bufferlist::const_iterator& bl)  // read the fields back in the order EPurged::encode writes them
+{
+  DECODE_START(1, bl);
+  decode(inos, bl);
+  decode(inotablev, bl);
+  decode(seq, bl);
+  DECODE_FINISH(bl);
+}
+
+void EPurged::dump(Formatter *f) const  // emit the three encoded fields through the formatter
+{
+  f->dump_stream("inos") << inos;
+  f->dump_int("inotable version", inotablev);
+  f->dump_int("segment seq", seq);
+}
+
+// -----------------------
+// ESession
+
+// Record the session map version on the segment; additionally record the
+// inotable version when this event frees inodes and carries a non-zero
+// inotable version.
+void ESession::update_segment()
+{
+  auto&& segment = get_segment();
+  segment->sessionmapv = cmapv;
+  if (inos_to_free.size() && inotablev) {
+    segment->inotablev = inotablev;
+  }
+}
+
+// Replay a session open/close event.
+//
+// Session map handling depends on how the table version compares to cmapv:
+//  - table >= cmapv:     nothing to do;
+//  - table + 1 == cmapv: apply the open or close and check that the table
+//                        lands exactly on cmapv;
+//  - otherwise:          log an error, require mds_wipe_sessions, wipe the
+//                        session map and force its version to cmapv.
+// Afterwards any inos_to_free are released to the inotable if it is behind
+// inotablev, and the segment is updated.
+void ESession::replay(MDSRank *mds)
+{
+  if (inos_to_purge.size()) {
+    get_segment()->purging_inodes.insert(inos_to_purge);
+  }
+
+  const auto table_v = mds->sessionmap.get_version();
+  if (table_v >= cmapv) {
+    dout(10) << "ESession.replay sessionmap " << table_v
+             << " >= " << cmapv << ", noop" << dendl;
+  } else if (table_v + 1 == cmapv) {
+    dout(10) << "ESession.replay sessionmap " << table_v
+             << " < " << cmapv << " " << (open ? "open":"close") << " " << client_inst << dendl;
+    Session *session = nullptr;
+    if (open) {
+      session = mds->sessionmap.get_or_add_session(client_inst);
+      mds->sessionmap.set_state(session, Session::STATE_OPEN);
+      session->set_client_metadata(client_metadata);
+      dout(10) << " opened session " << session->info.inst << dendl;
+    } else {
+      session = mds->sessionmap.get_session(client_inst.name);
+      if (!session) {
+        // there always should be a session, but there's a bug
+        mds->clog->error() << "replayed stray Session close event for " << client_inst
+                           << " from time " << stamp << ", ignoring";
+      } else if (!session->get_connection()) {
+        dout(10) << " removed session " << session->info.inst << dendl;
+        mds->sessionmap.remove_session(session);
+        session = nullptr;
+      } else {
+        // the client has reconnected; keep the Session, but reset
+        session->clear();
+        dout(10) << " reset session " << session->info.inst << " (they reconnected)" << dendl;
+      }
+    }
+
+    // Either path must move the session map version forward by one.
+    if (session) {
+      mds->sessionmap.replay_dirty_session(session);
+    } else {
+      mds->sessionmap.replay_advance_version();
+    }
+    ceph_assert(mds->sessionmap.get_version() == cmapv);
+  } else {
+    mds->clog->error() << "ESession.replay sessionmap v " << cmapv
+                       << " - 1 > table " << table_v;
+    ceph_assert(g_conf()->mds_wipe_sessions);
+    mds->sessionmap.wipe();
+    mds->sessionmap.set_version(cmapv);
+  }
+
+  if (inos_to_free.size() && inotablev) {
+    const auto ino_v = mds->inotable->get_version();
+    if (ino_v >= inotablev) {
+      dout(10) << "ESession.replay inotable " << ino_v
+               << " >= " << inotablev << ", noop" << dendl;
+    } else {
+      dout(10) << "ESession.replay inotable " << ino_v
+               << " < " << inotablev << " " << (open ? "add":"remove") << dendl;
+      ceph_assert(!open);  // for now
+      mds->inotable->replay_release_ids(inos_to_free);
+      ceph_assert(mds->inotable->get_version() == inotablev);
+    }
+  }
+
+  update_segment();
+}
+
+void ESession::encode(bufferlist &bl, uint64_t features) const  // serialize; `features` is forwarded only to client_inst
+{
+  ENCODE_START(6, 5, bl);
+  encode(stamp, bl);
+  encode(client_inst, bl, features);
+  encode(open, bl);
+  encode(cmapv, bl);
+  encode(inos_to_free, bl);
+  encode(inotablev, bl);
+  encode(client_metadata, bl);
+  encode(inos_to_purge, bl);  // last field; ESession::decode reads it only when struct_v >= 6
+  ENCODE_FINISH(bl);
+}
+
+void ESession::decode(bufferlist::const_iterator &bl)  // version-gated inverse of ESession::encode
+{
+  DECODE_START_LEGACY_COMPAT_LEN(6, 3, 3, bl);
+  if (struct_v >= 2)  // stamp is read only for struct_v >= 2
+    decode(stamp, bl);
+  decode(client_inst, bl);
+  decode(open, bl);
+  decode(cmapv, bl);
+  decode(inos_to_free, bl);
+  decode(inotablev, bl);
+  if (struct_v == 4) {  // version 4 carries only the kv_map part of client_metadata
+    decode(client_metadata.kv_map, bl);
+  } else if (struct_v >= 5) {  // version 5+ carries the whole client_metadata
+    decode(client_metadata, bl);
+  }
+  if (struct_v >= 6){  // inos_to_purge is read only for struct_v >= 6
+    decode(inos_to_purge, bl);
+  }
+    
+  DECODE_FINISH(bl);
+}
+
+void ESession::dump(Formatter *f) const  // emit the event's fields through the formatter
+{
+  f->dump_stream("client instance") << client_inst;
+  f->dump_string("open", open ? "true" : "false");
+  f->dump_int("client map version", cmapv);
+  f->dump_stream("inos_to_free") << inos_to_free;
+  f->dump_int("inotable version", inotablev);
+  f->open_object_section("client_metadata");
+  f->dump_stream("inos_to_purge") << inos_to_purge;  // NOTE(review): emitted inside the "client_metadata" section; confirm that nesting is intended before relying on it
+  client_metadata.dump(f);
+  f->close_section();  // client_metadata
+}
+
+// Append one default-constructed ESession to the caller's list.
+void ESession::generate_test_instances(std::list<ESession*>& ls)
+{
+  auto *blank = new ESession;
+  ls.push_back(blank);
+}
+
+// -----------------------
+// ESessions
+
+void ESessions::encode(bufferlist &bl, uint64_t features) const  // serialize; `features` is forwarded only to client_map
+{
+  ENCODE_START(2, 1, bl);
+  encode(client_map, bl, features);
+  encode(cmapv, bl);
+  encode(stamp, bl);
+  encode(client_metadata_map, bl);  // decode_new reads this only when struct_v >= 2
+  ENCODE_FINISH(bl);
+}
+
+void ESessions::decode_old(bufferlist::const_iterator &bl)  // reads fields directly, without the DECODE_START/DECODE_FINISH envelope
+{
+  using ceph::decode;
+  decode(client_map, bl);
+  decode(cmapv, bl);
+  if (!bl.end())  // stamp is read only when bytes remain after cmapv
+    decode(stamp, bl);
+}
+
+void ESessions::decode_new(bufferlist::const_iterator &bl)  // enveloped counterpart of ESessions::encode
+{
+  DECODE_START(2, bl);
+  decode(client_map, bl);
+  decode(cmapv, bl);
+  decode(stamp, bl);
+  if (struct_v >= 2)  // client_metadata_map is read only for struct_v >= 2
+    decode(client_metadata_map, bl);
+  DECODE_FINISH(bl);
+}
+
+// Emit the client map version followed by an array with one object per
+// client_map entry (its id and entity).
+void ESessions::dump(Formatter *f) const
+{
+  f->dump_int("client map version", cmapv);
+
+  f->open_array_section("client map");
+  for (const auto& entry : client_map) {
+    f->open_object_section("client");
+    f->dump_int("client id", entry.first.v);
+    f->dump_stream("client entity") << entry.second;
+    f->close_section(); // client
+  }
+  f->close_section(); // client map
+}
+
+// Append one default-constructed ESessions to the caller's list.
+void ESessions::generate_test_instances(std::list<ESessions*>& ls)
+{
+  auto *blank = new ESessions();
+  ls.push_back(blank);
+}
+
+// Record this event's session map version on its log segment.
+void ESessions::update_segment()
+{
+  auto&& segment = get_segment();
+  segment->sessionmapv = cmapv;
+}
+
+// Replay a bulk session-open event: if the session map is older than cmapv,
+// open the recorded sessions; otherwise do nothing.  The segment is updated
+// in both cases.
+void ESessions::replay(MDSRank *mds)
+{
+  const auto table_v = mds->sessionmap.get_version();
+  if (table_v < cmapv) {
+    dout(10) << "ESessions.replay sessionmap " << table_v
+             << " < " << cmapv << dendl;
+    mds->sessionmap.replay_open_sessions(cmapv, client_map, client_metadata_map);
+  } else {
+    dout(10) << "ESessions.replay sessionmap " << table_v
+             << " >= " << cmapv << ", noop" << dendl;
+  }
+  update_segment();
+}
+
+
+// -----------------------
+// ETableServer
+
+void ETableServer::encode(bufferlist& bl, uint64_t features) const  // serialize; `features` is not referenced in this body
+{
+  ENCODE_START(3, 3, bl);
+  encode(stamp, bl);
+  encode(table, bl);
+  encode(op, bl);
+  encode(reqid, bl);
+  encode(bymds, bl);
+  encode(mutation, bl);
+  encode(tid, bl);
+  encode(version, bl);  // field order mirrors ETableServer::decode
+  ENCODE_FINISH(bl);
+}
+
+void ETableServer::decode(bufferlist::const_iterator &bl)  // inverse of ETableServer::encode
+{
+  DECODE_START_LEGACY_COMPAT_LEN(3, 3, 3, bl);
+  if (struct_v >= 2)  // stamp is read only for struct_v >= 2
+    decode(stamp, bl);
+  decode(table, bl);
+  decode(op, bl);
+  decode(reqid, bl);
+  decode(bymds, bl);
+  decode(mutation, bl);
+  decode(tid, bl);
+  decode(version, bl);
+  DECODE_FINISH(bl);
+}
+
+void ETableServer::dump(Formatter *f) const  // emits the integer fields; stamp and mutation are not dumped here
+{
+  f->dump_int("table id", table);
+  f->dump_int("op", op);
+  f->dump_int("request id", reqid);
+  f->dump_int("by mds", bymds);
+  f->dump_int("tid", tid);
+  f->dump_int("version", version);
+}
+
+// Append one default-constructed ETableServer to the caller's list.
+void ETableServer::generate_test_instances(std::list<ETableServer*>& ls)
+{
+  auto *blank = new ETableServer();
+  ls.push_back(blank);
+}
+
+
+// Record this event's table version on its log segment, keyed by table id.
+void ETableServer::update_segment()
+{
+  auto&& segment = get_segment();
+  segment->tablev[table] = version;
+}
+
+// Replay a table-server journal event.
+//
+// Returns early if this rank has no server for `table`, or if the server is
+// already at or past `version`.  Otherwise the server must be exactly one
+// version behind; the recorded op is re-applied and the server is expected
+// to end up at `version`, after which the segment is updated.
+void ETableServer::replay(MDSRank *mds)
+{
+  MDSTableServer *server = mds->get_table_server(table);
+  if (!server) {
+    return;
+  }
+
+  if (server->get_version() >= version) {
+    dout(10) << "ETableServer.replay " << get_mdstable_name(table)
+             << " " << get_mdstableserver_opname(op)
+             << " event " << version
+             << " <= table " << server->get_version() << dendl;
+    return;
+  }
+
+  dout(10) << " ETableServer.replay " << get_mdstable_name(table)
+           << " " << get_mdstableserver_opname(op)
+           << " event " << version << " - 1 == table " << server->get_version() << dendl;
+  ceph_assert(version-1 == server->get_version());
+
+  switch (op) {
+  case TABLESERVER_OP_PREPARE: {
+    server->_note_prepare(bymds, reqid, true);
+    // _prepare writes into a scratch buffer which then replaces `mutation`.
+    bufferlist prepared;
+    server->_prepare(mutation, reqid, bymds, prepared);
+    mutation = std::move(prepared);
+    break;
+  }
+  case TABLESERVER_OP_COMMIT: {
+    server->_commit(tid, ref_t<MMDSTableRequest>());
+    server->_note_commit(tid, true);
+    break;
+  }
+  case TABLESERVER_OP_ROLLBACK: {
+    server->_rollback(tid);
+    server->_note_rollback(tid, true);
+    break;
+  }
+  case TABLESERVER_OP_SERVER_UPDATE: {
+    server->_server_update(mutation);
+    server->_note_server_update(mutation, true);
+    break;
+  }
+  default:
+    mds->clog->error() << "invalid tableserver op in ETableServer";
+    mds->damaged();
+    ceph_abort();  // Should be unreachable because damaged() calls respawn()
+  }
+
+  ceph_assert(version == server->get_version());
+  update_segment();
+}
+
+
+// ---------------------
+// ETableClient
+
+void ETableClient::encode(bufferlist& bl, uint64_t features) const  // serialize; `features` is not referenced in this body
+{
+  ENCODE_START(3, 3, bl);
+  encode(stamp, bl);
+  encode(table, bl);
+  encode(op, bl);
+  encode(tid, bl);  // field order mirrors ETableClient::decode
+  ENCODE_FINISH(bl);
+}
+
+void ETableClient::decode(bufferlist::const_iterator &bl)  // inverse of ETableClient::encode
+{
+  DECODE_START_LEGACY_COMPAT_LEN(3, 3, 3, bl);
+  if (struct_v >= 2)  // stamp is read only for struct_v >= 2
+    decode(stamp, bl);
+  decode(table, bl);
+  decode(op, bl);
+  decode(tid, bl);
+  DECODE_FINISH(bl);
+}
+
+void ETableClient::dump(Formatter *f) const  // emits table, op and tid; stamp is not dumped here
+{
+  f->dump_int("table", table);
+  f->dump_int("op", op);
+  f->dump_int("tid", tid);
+}
+
+// Append one default-constructed ETableClient to the caller's list.
+void ETableClient::generate_test_instances(std::list<ETableClient*>& ls)
+{
+  auto *blank = new ETableClient();
+  ls.push_back(blank);
+}
+
+// Replay a table-client event: when this rank has a client for `table`,
+// the op must be an ACK and the client is told the ack for `tid` was
+// journaled.  Without a client the event is only logged.
+void ETableClient::replay(MDSRank *mds)
+{
+  dout(10) << " ETableClient.replay " << get_mdstable_name(table)
+           << " op " << get_mdstableserver_opname(op)
+           << " tid " << tid << dendl;
+
+  if (MDSTableClient *client = mds->get_table_client(table)) {
+    ceph_assert(op == TABLESERVER_OP_ACK);
+    client->got_journaled_ack(tid);
+  }
+}
+
+
+// -----------------------
+// ESnap
+/*
+void ESnap::update_segment()
+{
+  get_segment()->tablev[TABLE_SNAP] = version;
+}
+
+void ESnap::replay(MDSRank *mds)
+{
+  if (mds->snaptable->get_version() >= version) {
+    dout(10) << "ESnap.replay event " << version
+	     << " <= table " << mds->snaptable->get_version() << dendl;
+    return;
+  } 
+  
+  dout(10) << " ESnap.replay event " << version
+	   << " - 1 == table " << mds->snaptable->get_version() << dendl;
+  ceph_assert(version-1 == mds->snaptable->get_version());
+
+  if (create) {
+    version_t v;
+    snapid_t s = mds->snaptable->create(snap.dirino, snap.name, snap.stamp, &v);
+    ceph_assert(s == snap.snapid);
+  } else {
+    mds->snaptable->remove(snap.snapid);
+  }
+
+  ceph_assert(version == mds->snaptable->get_version());
+}
+*/
+
+
+
+// -----------------------
+// EUpdate
+
+void EUpdate::encode(bufferlist &bl, uint64_t features) const  // serialize; `features` is forwarded only to metablob
+{
+  ENCODE_START(4, 4, bl);
+  encode(stamp, bl);
+  encode(type, bl);
+  encode(metablob, bl, features);
+  encode(client_map, bl);
+  encode(cmapv, bl);
+  encode(reqid, bl);
+  encode(had_peers, bl);  // field order mirrors EUpdate::decode
+  ENCODE_FINISH(bl);
+}
+ 
+void EUpdate::decode(bufferlist::const_iterator &bl)  // inverse of EUpdate::encode
+{
+  DECODE_START_LEGACY_COMPAT_LEN(4, 4, 4, bl);
+  if (struct_v >= 2)  // stamp is read only for struct_v >= 2
+    decode(stamp, bl);
+  decode(type, bl);
+  decode(metablob, bl);
+  decode(client_map, bl);
+  if (struct_v >= 3)  // cmapv is read only for struct_v >= 3
+    decode(cmapv, bl);
+  decode(reqid, bl);
+  decode(had_peers, bl);
+  DECODE_FINISH(bl);
+}
+
+void EUpdate::dump(Formatter *f) const  // emits the metablob as a nested object, then the scalar fields
+{
+  f->open_object_section("metablob");
+  metablob.dump(f);
+  f->close_section(); // metablob
+
+  f->dump_string("type", type);
+  f->dump_int("client map length", client_map.length());  // only the length of client_map is dumped, not its contents
+  f->dump_int("client map version", cmapv);
+  f->dump_stream("reqid") << reqid;
+  f->dump_string("had peers", had_peers ? "true" : "false");
+}
+
+// Append one default-constructed EUpdate to the caller's list.
+void EUpdate::generate_test_instances(std::list<EUpdate*>& ls)
+{
+  auto *blank = new EUpdate();
+  ls.push_back(blank);
+}
+
+
+// Propagate this event's bookkeeping to its log segment: the metablob's
+// own segment updates, the session map version (only when a client map is
+// attached), and the request id (only when peers were involved).
+void EUpdate::update_segment()
+{
+  auto&& seg = get_segment();
+  metablob.update_segment(seg);
+
+  if (client_map.length()) {
+    seg->sessionmapv = cmapv;
+  }
+
+  if (had_peers) {
+    seg->uncommitted_leaders.insert(reqid);
+  }
+}
+
+// Replay an update event: apply the metablob, register the request as an
+// uncommitted leader when peers were involved, and, when a client map is
+// attached and the session map is older than cmapv, decode that map and
+// open the recorded sessions.  The segment is updated last.
+void EUpdate::replay(MDSRank *mds)
+{
+  auto&& seg = get_segment();
+  metablob.replay(mds, seg);
+
+  if (had_peers) {
+    dout(10) << "EUpdate.replay " << reqid << " had peers, expecting a matching ECommitted" << dendl;
+    seg->uncommitted_leaders.insert(reqid);
+    set<mds_rank_t> no_peers;
+    mds->mdcache->add_uncommitted_leader(reqid, seg, no_peers, true);
+  }
+
+  if (client_map.length()) {
+    const auto table_v = mds->sessionmap.get_version();
+    if (table_v < cmapv) {
+      dout(10) << "EUpdate.replay sessionmap " << table_v
+               << " < " << cmapv << dendl;
+      // open client sessions?
+      using ceph::decode;
+      map<client_t,entity_inst_t> clients;
+      map<client_t,client_metadata_t> clients_meta;
+      auto it = client_map.cbegin();
+      decode(clients, it);
+      // The metadata map is decoded only if bytes remain after the client map.
+      if (!it.end()) {
+        decode(clients_meta, it);
+      }
+      mds->sessionmap.replay_open_sessions(cmapv, clients, clients_meta);
+    } else {
+      dout(10) << "EUpdate.replay sessionmap v " << cmapv
+               << " <= table " << table_v << dendl;
+    }
+  }
+  update_segment();
+}
+
+
+// ------------------------
+// EOpen
+
+void EOpen::encode(bufferlist &bl, uint64_t features) const {  // serialize; `features` is forwarded only to metablob
+  ENCODE_START(4, 3, bl);
+  encode(stamp, bl);
+  encode(metablob, bl, features);
+  encode(inos, bl);
+  encode(snap_inos, bl);  // EOpen::decode reads this only when struct_v >= 4
+  ENCODE_FINISH(bl);
+} 
+
+// Inverse of EOpen::encode.  stamp is read only for struct_v >= 2 and
+// snap_inos only for struct_v >= 4.
+void EOpen::decode(bufferlist::const_iterator &bl) {
+  // Declare version 4: this decoder handles the struct_v >= 4 field below
+  // and EOpen::encode writes ENCODE_START(4, 3, ...).  The other
+  // encode/decode pairs in this file use matching versions.
+  DECODE_START_LEGACY_COMPAT_LEN(4, 3, 3, bl);
+  if (struct_v >= 2)
+    decode(stamp, bl);
+  decode(metablob, bl);
+  decode(inos, bl);
+  if (struct_v >= 4)
+    decode(snap_inos, bl);
+  DECODE_FINISH(bl);
+}
+
+// Emit the metablob as a nested object followed by an array holding every
+// entry of `inos`.
+void EOpen::dump(Formatter *f) const
+{
+  f->open_object_section("metablob");
+  metablob.dump(f);
+  f->close_section(); // metablob
+
+  f->open_array_section("inos involved");
+  for (const auto& ino : inos) {
+    f->dump_int("ino", ino);
+  }
+  f->close_section(); // inos
+}
+
+// Append two instances: a default-constructed EOpen, and one with ino 0
+// added.
+void EOpen::generate_test_instances(std::list<EOpen*>& ls)
+{
+  ls.push_back(new EOpen());
+
+  auto *with_ino = new EOpen();
+  with_ino->add_ino(0);
+  ls.push_back(with_ino);
+}
+
+void EOpen::update_segment()
+{
+  // ?? -- empty as written: no segment state is updated here for EOpen.
+}
+
+// Replay an open event: apply the metablob, then link every inode named in
+// `inos` and `snap_inos` onto the segment's open_files list.  Each inode
+// must be in the cache afterwards; a missing one is logged and asserts.
+void EOpen::replay(MDSRank *mds)
+{
+  dout(10) << "EOpen.replay " << dendl;
+  auto&& seg = get_segment();
+  metablob.replay(mds, seg);
+
+  // note which segments inodes belong to, so we don't have to start rejournaling them
+  auto note_open = [&](const auto& id) {
+    CInode *in = mds->mdcache->get_inode(id);
+    if (!in) {
+      dout(0) << "EOpen.replay ino " << id << " not in metablob" << dendl;
+    }
+    ceph_assert(in);
+    seg->open_files.push_back(&in->item_open_file);
+  };
+
+  for (const auto &ino : inos) {
+    note_open(ino);
+  }
+  for (const auto &vino : snap_inos) {
+    note_open(vino);
+  }
+}
+
+
+// -----------------------
+// ECommitted
+
+// Replay a commit marker: if the request is tracked as an uncommitted
+// leader, drop it from both its segment's set and the cache's map;
+// otherwise just log that the original op was not seen.
+void ECommitted::replay(MDSRank *mds)
+{
+  auto& pending = mds->mdcache->uncommitted_leaders;
+  auto it = pending.find(reqid);
+  if (it == pending.end()) {
+    dout(10) << "ECommitted.replay " << reqid << " -- didn't see original op" << dendl;
+    return;
+  }
+
+  dout(10) << "ECommitted.replay " << reqid << dendl;
+  it->second.ls->uncommitted_leaders.erase(reqid);
+  pending.erase(it);
+}
+
+void ECommitted::encode(bufferlist& bl, uint64_t features) const  // serialize; `features` is not referenced in this body
+{
+  ENCODE_START(3, 3, bl);
+  encode(stamp, bl);
+  encode(reqid, bl);  // field order mirrors ECommitted::decode
+  ENCODE_FINISH(bl);
+} 
+
+void ECommitted::decode(bufferlist::const_iterator& bl)  // inverse of ECommitted::encode
+{
+  DECODE_START_LEGACY_COMPAT_LEN(3, 3, 3, bl);
+  if (struct_v >= 2)  // stamp is read only for struct_v >= 2
+    decode(stamp, bl);
+  decode(reqid, bl);
+  DECODE_FINISH(bl);
+}
+
+void ECommitted::dump(Formatter *f) const {  // emits both encoded fields as streams
+  f->dump_stream("stamp") << stamp;
+  f->dump_stream("reqid") << reqid;
+}
+
+// Append two instances: a default-constructed ECommitted, and one with a
+// fixed stamp and request id.
+void ECommitted::generate_test_instances(std::list<ECommitted*>& ls)
+{
+  ls.push_back(new ECommitted);
+
+  auto *populated = new ECommitted;
+  populated->stamp = utime_t(1, 2);
+  populated->reqid = metareqid_t(entity_name_t::CLIENT(123), 456);
+  ls.push_back(populated);
+}
+
+// -----------------------
+// EPeerUpdate
+
+void link_rollback::encode(bufferlist &bl) const  // serialize the rollback record into bl
+{
+  ENCODE_START(3, 2, bl);
+  encode(reqid, bl);
+  encode(ino, bl);
+  encode(was_inc, bl);
+  encode(old_ctime, bl);
+  encode(old_dir_mtime, bl);
+  encode(old_dir_rctime, bl);
+  encode(snapbl, bl);  // link_rollback::decode reads this only when struct_v >= 3
+  ENCODE_FINISH(bl);
+}
+
+void link_rollback::decode(bufferlist::const_iterator &bl)  // inverse of link_rollback::encode
+{
+  DECODE_START_LEGACY_COMPAT_LEN(3, 2, 2, bl);
+  decode(reqid, bl);
+  decode(ino, bl);
+  decode(was_inc, bl);
+  decode(old_ctime, bl);
+  decode(old_dir_mtime, bl);
+  decode(old_dir_rctime, bl);
+  if (struct_v >= 3)  // snapbl is read only for struct_v >= 3
+    decode(snapbl, bl);
+  DECODE_FINISH(bl);
+}
+
+void link_rollback::dump(Formatter *f) const  // emits every field except snapbl
+{
+  f->dump_stream("metareqid") << reqid;
+  f->dump_int("ino", ino);
+  f->dump_string("was incremented", was_inc ? "true" : "false");
+  f->dump_stream("old_ctime") << old_ctime;
+  f->dump_stream("old_dir_mtime") << old_dir_mtime;
+  f->dump_stream("old_dir_rctime") << old_dir_rctime;
+}
+
+// Append one default-constructed link_rollback to the caller's list.
+void link_rollback::generate_test_instances(std::list<link_rollback*>& ls)
+{
+  auto *blank = new link_rollback();
+  ls.push_back(blank);
+}
+
+void rmdir_rollback::encode(bufferlist& bl) const  // serialize the rollback record into bl
+{
+  ENCODE_START(3, 2, bl);
+  encode(reqid, bl);
+  encode(src_dir, bl);
+  encode(src_dname, bl);
+  encode(dest_dir, bl);
+  encode(dest_dname, bl);
+  encode(snapbl, bl);  // rmdir_rollback::decode reads this only when struct_v >= 3
+  ENCODE_FINISH(bl);
+}
+
+void rmdir_rollback::decode(bufferlist::const_iterator& bl)  // inverse of rmdir_rollback::encode
+{
+  DECODE_START_LEGACY_COMPAT_LEN(3, 2, 2, bl);
+  decode(reqid, bl);
+  decode(src_dir, bl);
+  decode(src_dname, bl);
+  decode(dest_dir, bl);
+  decode(dest_dname, bl);
+  if (struct_v >= 3)  // snapbl is read only for struct_v >= 3
+    decode(snapbl, bl);
+  DECODE_FINISH(bl);
+}
+
+void rmdir_rollback::dump(Formatter *f) const  // emits every field except snapbl
+{
+  f->dump_stream("metareqid") << reqid;
+  f->dump_stream("source directory") << src_dir;
+  f->dump_string("source dname", src_dname);
+  f->dump_stream("destination directory") << dest_dir;
+  f->dump_string("destination dname", dest_dname);
+}
+
+// Append one default-constructed rmdir_rollback to the caller's list.
+void rmdir_rollback::generate_test_instances(std::list<rmdir_rollback*>& ls)
+{
+  auto *blank = new rmdir_rollback();
+  ls.push_back(blank);
+}
+
+void rename_rollback::drec::encode(bufferlist &bl) const  // serialize one dentry record into bl
+{
+  ENCODE_START(2, 2, bl);
+  encode(dirfrag, bl);
+  encode(dirfrag_old_mtime, bl);
+  encode(dirfrag_old_rctime, bl);
+  encode(ino, bl);
+  encode(remote_ino, bl);
+  encode(dname, bl);
+  encode(remote_d_type, bl);
+  encode(old_ctime, bl);  // field order mirrors drec::decode
+  ENCODE_FINISH(bl);
+}
+
+void rename_rollback::drec::decode(bufferlist::const_iterator &bl)  // inverse of drec::encode; no version-gated fields
+{
+  DECODE_START_LEGACY_COMPAT_LEN(2, 2, 2, bl);
+  decode(dirfrag, bl);
+  decode(dirfrag_old_mtime, bl);
+  decode(dirfrag_old_rctime, bl);
+  decode(ino, bl);
+  decode(remote_ino, bl);
+  decode(dname, bl);
+  decode(remote_d_type, bl);
+  decode(old_ctime, bl);
+  DECODE_FINISH(bl);
+}
+
+// Emit the record's fields.  remote_d_type is converted to a file-type
+// value and printed as "file", "symlink" or "directory"; any other value
+// is printed as "UNKNOWN-<n>".
+void rename_rollback::drec::dump(Formatter *f) const
+{
+  f->dump_stream("directory fragment") << dirfrag;
+  f->dump_stream("directory old mtime") << dirfrag_old_mtime;
+  f->dump_stream("directory old rctime") << dirfrag_old_rctime;
+  f->dump_int("ino", ino);
+  f->dump_int("remote ino", remote_ino);
+  f->dump_string("dname", dname);
+
+  // convert to type entries
+  const uint32_t type = DTTOIF(remote_d_type) & S_IFMT;
+  string type_string;
+  if (type == S_IFREG) {
+    type_string = "file";
+  } else if (type == S_IFLNK) {
+    type_string = "symlink";
+  } else if (type == S_IFDIR) {
+    type_string = "directory";
+  } else {
+    type_string = "UNKNOWN-" + stringify((int)type);
+  }
+  f->dump_string("remote dtype", type_string);
+  f->dump_stream("old ctime") << old_ctime;
+}
+
+// Append one drec whose remote_d_type is set to the regular-file type.
+void rename_rollback::drec::generate_test_instances(std::list<drec*>& ls)
+{
+  auto *rec = new drec();
+  ls.push_back(rec);
+  rec->remote_d_type = IFTODT(S_IFREG);
+}
+
+void rename_rollback::encode(bufferlist &bl) const  // serialize the rollback record into bl
+{
+  ENCODE_START(3, 2, bl);
+  encode(reqid, bl);
+  encode(orig_src, bl);
+  encode(orig_dest, bl);
+  encode(stray, bl);
+  encode(ctime, bl);
+  encode(srci_snapbl, bl);
+  encode(desti_snapbl, bl);  // rename_rollback::decode reads both snapbl fields only when struct_v >= 3
+  ENCODE_FINISH(bl);
+}
+
+void rename_rollback::decode(bufferlist::const_iterator &bl)  // inverse of rename_rollback::encode
+{
+  DECODE_START_LEGACY_COMPAT_LEN(3, 2, 2, bl);
+  decode(reqid, bl);
+  decode(orig_src, bl);
+  decode(orig_dest, bl);
+  decode(stray, bl);
+  decode(ctime, bl);
+  if (struct_v >= 3) {  // both snapbl fields are read only for struct_v >= 3
+    decode(srci_snapbl, bl);
+    decode(desti_snapbl, bl);
+  }
+  DECODE_FINISH(bl);
+}
+
+void rename_rollback::dump(Formatter *f) const  // emits reqid, the three drecs as nested objects, then ctime; snapbl fields are not dumped
+{
+  f->dump_stream("request id") << reqid;
+  f->open_object_section("original src drec");
+  orig_src.dump(f);
+  f->close_section(); // original src drec
+  f->open_object_section("original dest drec");
+  orig_dest.dump(f);
+  f->close_section(); // original dest drec
+  f->open_object_section("stray drec");
+  stray.dump(f);
+  f->close_section(); // stray drec
+  f->dump_stream("ctime") << ctime;
+}
+
+// Append one rename_rollback whose three drecs all have remote_d_type set
+// to the regular-file type.
+void rename_rollback::generate_test_instances(std::list<rename_rollback*>& ls)
+{
+  auto *rb = new rename_rollback();
+  ls.push_back(rb);
+  rb->orig_src.remote_d_type = IFTODT(S_IFREG);
+  rb->orig_dest.remote_d_type = IFTODT(S_IFREG);
+  rb->stray.remote_d_type = IFTODT(S_IFREG);
+}
+
+void EPeerUpdate::encode(bufferlist &bl, uint64_t features) const  // serialize; `features` is forwarded only to commit
+{
+  ENCODE_START(3, 3, bl);
+  encode(stamp, bl);
+  encode(type, bl);
+  encode(reqid, bl);
+  encode(leader, bl);
+  encode(op, bl);
+  encode(origop, bl);
+  encode(commit, bl, features);
+  encode(rollback, bl);  // field order mirrors EPeerUpdate::decode
+  ENCODE_FINISH(bl);
+} 
+
+void EPeerUpdate::decode(bufferlist::const_iterator &bl)  // inverse of EPeerUpdate::encode
+{
+  DECODE_START_LEGACY_COMPAT_LEN(3, 3, 3, bl);
+  if (struct_v >= 2)  // stamp is read only for struct_v >= 2
+    decode(stamp, bl);
+  decode(type, bl);
+  decode(reqid, bl);
+  decode(leader, bl);
+  decode(op, bl);
+  decode(origop, bl);
+  decode(commit, bl);
+  decode(rollback, bl);
+  DECODE_FINISH(bl);
+}
+
+void EPeerUpdate::dump(Formatter *f) const  // emits the commit blob as a nested object, then the scalar fields
+{
+  f->open_object_section("metablob");
+  commit.dump(f);  // the `commit` member is what appears under the "metablob" key
+  f->close_section(); // metablob
+
+  f->dump_int("rollback length", rollback.length());  // only the length of rollback is dumped, not its contents
+  f->dump_string("type", type);
+  f->dump_stream("metareqid") << reqid;
+  f->dump_int("leader", leader);
+  f->dump_int("op", op);
+  f->dump_int("original op", origop);
+}
+
+// Append one default-constructed EPeerUpdate to the caller's list.
+void EPeerUpdate::generate_test_instances(std::list<EPeerUpdate*>& ls)
+{
+  auto *blank = new EPeerUpdate();
+  ls.push_back(blank);
+}
+
+// Replay a peer-update event according to its op:
+//  - PREPARE:  apply the commit blob while collecting an MDPeerUpdate built
+//              from origop/rollback, and register it as an uncommitted peer;
+//  - COMMIT:   finish the uncommitted peer request;
+//  - ROLLBACK: apply the commit blob, then finish the uncommitted peer
+//              request.
+// Any other op is reported as damage.
+void EPeerUpdate::replay(MDSRank *mds)
+{
+  auto&& seg = get_segment();
+  switch (op) {
+  case EPeerUpdate::OP_PREPARE: {
+    dout(10) << "EPeerUpdate.replay prepare " << reqid << " for mds." << leader
+             << ": applying commit, saving rollback info" << dendl;
+    MDPeerUpdate *su = new MDPeerUpdate(origop, rollback);
+    commit.replay(mds, seg, su);
+    mds->mdcache->add_uncommitted_peer(reqid, seg, leader, su);
+    break;
+  }
+
+  case EPeerUpdate::OP_COMMIT: {
+    dout(10) << "EPeerUpdate.replay commit " << reqid << " for mds." << leader << dendl;
+    mds->mdcache->finish_uncommitted_peer(reqid, false);
+    break;
+  }
+
+  case EPeerUpdate::OP_ROLLBACK: {
+    dout(10) << "EPeerUpdate.replay abort " << reqid << " for mds." << leader
+             << ": applying rollback commit blob" << dendl;
+    commit.replay(mds, seg);
+    mds->mdcache->finish_uncommitted_peer(reqid, false);
+    break;
+  }
+
+  default:
+    mds->clog->error() << "invalid op in EPeerUpdate";
+    mds->damaged();
+    ceph_abort();  // Should be unreachable because damaged() calls respawn()
+  }
+}
+
+
+// -----------------------
+// ESubtreeMap
+
+void ESubtreeMap::encode(bufferlist& bl, uint64_t features) const  // serialize; `features` is forwarded only to metablob
+{
+  ENCODE_START(6, 5, bl);
+  encode(stamp, bl);
+  encode(metablob, bl, features);
+  encode(subtrees, bl);
+  encode(ambiguous_subtrees, bl);
+  encode(expire_pos, bl);
+  encode(event_seq, bl);  // ESubtreeMap::decode reads this only when struct_v >= 6
+  ENCODE_FINISH(bl);
+}
+ 
+void ESubtreeMap::decode(bufferlist::const_iterator &bl)  // version-gated inverse of ESubtreeMap::encode
+{
+  DECODE_START_LEGACY_COMPAT_LEN(6, 5, 5, bl);
+  if (struct_v >= 2)  // stamp is read only for struct_v >= 2
+    decode(stamp, bl);
+  decode(metablob, bl);
+  decode(subtrees, bl);
+  if (struct_v >= 4)  // ambiguous_subtrees is read only for struct_v >= 4
+    decode(ambiguous_subtrees, bl);
+  if (struct_v >= 3)  // expire_pos is read only for struct_v >= 3; it follows ambiguous_subtrees on the wire
+    decode(expire_pos, bl);
+  if (struct_v >= 6)  // event_seq is read only for struct_v >= 6
+    decode(event_seq, bl);
+  DECODE_FINISH(bl);
+}
+
+// Emit the metablob, then one "tree" object per subtree (root dirfrag plus
+// each bound), then the ambiguous subtrees, and finally the expire
+// position.
+void ESubtreeMap::dump(Formatter *f) const
+{
+  f->open_object_section("metablob");
+  metablob.dump(f);
+  f->close_section(); // metablob
+
+  f->open_array_section("subtrees");
+  for (const auto& tree : subtrees) {
+    f->open_object_section("tree");
+    f->dump_stream("root dirfrag") << tree.first;
+    for (const auto& bound : tree.second) {
+      f->dump_stream("bound dirfrag") << bound;
+    }
+    f->close_section(); // tree
+  }
+  f->close_section(); // subtrees
+
+  f->open_array_section("ambiguous subtrees");
+  for (const auto& ambig : ambiguous_subtrees) {
+    f->dump_stream("dirfrag") << ambig;
+  }
+  f->close_section(); // ambiguous subtrees
+
+  f->dump_int("expire position", expire_pos);
+}
+
+// Append one default-constructed ESubtreeMap to the caller's list.
+void ESubtreeMap::generate_test_instances(std::list<ESubtreeMap*>& ls)
+{
+  auto *blank = new ESubtreeMap();
+  ls.push_back(blank);
+}
+
+// Replay a subtree map event.
+//
+// First the journaler's expire position is advanced if this event records
+// a later one.  Then:
+//  - if the cache already has subtrees, the journaled map is only verified
+//    against the cache; every mismatch is reported to the cluster log and
+//    counted, and the function returns without modifying subtree auth;
+//  - otherwise the metablob is replayed and the subtree/ambiguous-import
+//    state is rebuilt from the journaled map.
+void ESubtreeMap::replay(MDSRank *mds)
+{
+  if (expire_pos && expire_pos > mds->mdlog->journaler->get_expire_pos()) {
+    mds->mdlog->journaler->set_expire_pos(expire_pos);
+  }
+
+  // suck up the subtree map?
+  if (mds->mdcache->is_subtrees()) {
+    dout(10) << "ESubtreeMap.replay -- i already have import map; verifying" << dendl;
+    int errors = 0;
+
+    for (auto& tree : subtrees) {
+      const auto& root = tree.first;
+      auto& journal_bounds = tree.second;
+
+      CDir *dir = mds->mdcache->get_dirfrag(root);
+      if (!dir) {
+        mds->clog->error() << " replayed ESubtreeMap at " << get_start_off()
+                           << " subtree root " << root << " not in cache";
+        ++errors;
+        continue;
+      }
+
+      if (!mds->mdcache->is_subtree(dir)) {
+        mds->clog->error() << " replayed ESubtreeMap at " << get_start_off()
+                           << " subtree root " << root << " not a subtree in cache";
+        ++errors;
+        continue;
+      }
+      if (dir->get_dir_auth().first != mds->get_nodeid()) {
+        mds->clog->error() << " replayed ESubtreeMap at " << get_start_off()
+                           << " subtree root " << root
+                           << " is not mine in cache (it's " << dir->get_dir_auth() << ")";
+        ++errors;
+        continue;
+      }
+
+      for (auto& bound : journal_bounds) {
+        mds->mdcache->get_force_dirfrag(bound, true);
+      }
+
+      // Tick off each journaled bound against the cache's bounds; whatever
+      // is left over afterwards exists only in the cache.
+      set<CDir*> cache_bounds;
+      mds->mdcache->get_subtree_bounds(dir, cache_bounds);
+      for (auto& bound : journal_bounds) {
+        CDir *b = mds->mdcache->get_dirfrag(bound);
+        if (!b) {
+          mds->clog->error() << " replayed ESubtreeMap at " << get_start_off()
+                             << " subtree " << root << " bound " << bound << " not in cache";
+          ++errors;
+          continue;
+        }
+        if (cache_bounds.count(b) == 0) {
+          mds->clog->error() << " replayed ESubtreeMap at " << get_start_off()
+                             << " subtree " << root << " bound " << bound << " not a bound in cache";
+          ++errors;
+          continue;
+        }
+        cache_bounds.erase(b);
+      }
+      for (CDir *extra : cache_bounds) {
+        mds->clog->error() << " replayed ESubtreeMap at " << get_start_off()
+                           << " subtree " << root << " has extra bound in cache " << extra->dirfrag();
+        ++errors;
+      }
+
+      if (ambiguous_subtrees.count(root)) {
+        if (!mds->mdcache->have_ambiguous_import(root)) {
+          mds->clog->error() << " replayed ESubtreeMap at " << get_start_off()
+                             << " subtree " << root << " is ambiguous but is not in our cache";
+          ++errors;
+        }
+      } else if (mds->mdcache->have_ambiguous_import(root)) {
+        mds->clog->error() << " replayed ESubtreeMap at " << get_start_off()
+                           << " subtree " << root << " is not ambiguous but is in our cache";
+        ++errors;
+      }
+    }
+
+    // Reverse check: every cache subtree this rank is auth for must appear
+    // in the journaled map.
+    std::vector<CDir*> cached;
+    mds->mdcache->get_subtrees(cached);
+    for (const auto& dir : cached) {
+      if (dir->get_dir_auth().first != mds->get_nodeid()) {
+        continue;
+      }
+      if (subtrees.count(dir->dirfrag()) == 0) {
+        mds->clog->error() << " replayed ESubtreeMap at " << get_start_off()
+                           << " does not include cache subtree " << dir->dirfrag();
+        ++errors;
+      }
+    }
+
+    if (errors) {
+      dout(0) << "journal subtrees: " << subtrees << dendl;
+      dout(0) << "journal ambig_subtrees: " << ambiguous_subtrees << dendl;
+      mds->mdcache->show_subtrees();
+      ceph_assert(!g_conf()->mds_debug_subtrees || errors == 0);
+    }
+    return;
+  }
+
+  dout(10) << "ESubtreeMap.replay -- reconstructing (auth) subtree spanning tree" << dendl;
+
+  // first, stick the spanning tree in my cache
+  //metablob.print(*_dout);
+  metablob.replay(mds, get_segment());
+
+  // restore import/export maps
+  for (auto& tree : subtrees) {
+    CDir *dir = mds->mdcache->get_dirfrag(tree.first);
+    ceph_assert(dir);
+    if (ambiguous_subtrees.count(tree.first)) {
+      // ambiguous!
+      mds->mdcache->add_ambiguous_import(tree.first, tree.second);
+      mds->mdcache->adjust_bounded_subtree_auth(dir, tree.second,
+                                                mds_authority_t(mds->get_nodeid(), mds->get_nodeid()));
+    } else {
+      // not ambiguous
+      mds->mdcache->adjust_bounded_subtree_auth(dir, tree.second, mds->get_nodeid());
+    }
+  }
+
+  mds->mdcache->recalc_auth_bits(true);
+
+  mds->mdcache->show_subtrees();
+}
+
+
+
+// -----------------------
+// EFragment
+
+// Replay a directory fragmentation event according to its op, then replay
+// the attached metablob.  When the inode was in the cache before the
+// metablob was applied and mds_debug_frag is set, its dirfrags are
+// verified afterwards.
+void EFragment::replay(MDSRank *mds)
+{
+  dout(10) << "EFragment.replay " << op_name(op) << " " << ino << " " << basefrag << " by " << bits << dendl;
+
+  std::vector<CDir*> resultfrags;
+  MDSContext::vec waiters;
+
+  // in may be NULL if it wasn't in our cache yet.  if it's a prepare
+  // it will be once we replay the metablob , but first we need to
+  // refragment anything we already have in the cache.
+  CInode *in = mds->mdcache->get_inode(ino);
+
+  auto&& seg = get_segment();
+  switch (op) {
+  case OP_PREPARE: {
+    mds->mdcache->add_uncommitted_fragment(dirfrag_t(ino, basefrag), bits, orig_frags, seg, &rollback);
+
+    if (in) {
+      mds->mdcache->adjust_dir_fragments(in, basefrag, bits, &resultfrags, waiters, true);
+    }
+    break;
+  }
+
+  case OP_ROLLBACK: {
+    // Capture the current leaves under basefrag before undoing the split.
+    frag_vec_t old_frags;
+    if (in) {
+      in->dirfragtree.get_leaves_under(basefrag, old_frags);
+      if (!orig_frags.empty()) {
+        for (const auto& fg : orig_frags) {
+          mds->mdcache->force_dir_fragment(in, fg);
+        }
+      } else {
+        // old format EFragment
+        mds->mdcache->adjust_dir_fragments(in, basefrag, -bits, &resultfrags, waiters, true);
+      }
+    }
+    mds->mdcache->rollback_uncommitted_fragment(dirfrag_t(ino, basefrag), std::move(old_frags));
+    break;
+  }
+
+  case OP_COMMIT:
+  case OP_FINISH: {
+    mds->mdcache->finish_uncommitted_fragment(dirfrag_t(ino, basefrag), op);
+    break;
+  }
+
+  default:
+    ceph_abort();
+  }
+
+  metablob.replay(mds, seg);
+  if (in && g_conf()->mds_debug_frag) {
+    in->verify_dirfrags();
+  }
+}
+
+// Serialize this event into bl as encoding version 5 (compat 4).
+// The field order below is the on-disk layout and has to stay in step with
+// EFragment::decode().
+void EFragment::encode(bufferlist &bl, uint64_t features) const {
+  ENCODE_START(5, 4, bl);
+  encode(stamp, bl);
+  encode(op, bl);
+  encode(ino, bl);
+  encode(basefrag, bl);
+  encode(bits, bl);
+  // the metablob is the only member encoded with the feature bits
+  encode(metablob, bl, features);
+  // the next two fields are only read back when struct_v >= 5
+  encode(orig_frags, bl);
+  encode(rollback, bl);
+  ENCODE_FINISH(bl);
+}
+
+// Deserialize an EFragment from bl.  Fields introduced in later encoding
+// versions are read only when the encoded struct_v is new enough; otherwise
+// the corresponding members are left untouched.
+void EFragment::decode(bufferlist::const_iterator &bl) {
+  DECODE_START_LEGACY_COMPAT_LEN(5, 4, 4, bl);
+  // stamp is present from version 2 on
+  if (struct_v >= 2)
+    decode(stamp, bl);
+  // op is present from version 3 on
+  if (struct_v >= 3)
+    decode(op, bl);
+  decode(ino, bl);
+  decode(basefrag, bl);
+  decode(bits, bl);
+  decode(metablob, bl);
+  // orig_frags and rollback are present from version 5 on
+  if (struct_v >= 5) {
+    decode(orig_frags, bl);
+    decode(rollback, bl);
+  }
+  DECODE_FINISH(bl);
+}
+
+// Write op (as its name), ino, base frag and bits to the formatter.
+// The metablob, stamp, orig_frags and rollback members are not dumped.
+void EFragment::dump(Formatter *f) const
+{
+  /*f->open_object_section("Metablob");
+  metablob.dump(f); // sadly we don't have this; dunno if we'll get it
+  f->close_section();*/
+  f->dump_string("op", op_name(op));
+  f->dump_stream("ino") << ino;
+  f->dump_stream("base frag") << basefrag;
+  f->dump_int("bits", bits);
+}
+
+// Append two heap-allocated samples to ls: a default-constructed event and
+// an OP_PREPARE event with ino 1 and bits 5.
+void EFragment::generate_test_instances(std::list<EFragment*>& ls)
+{
+  ls.push_back(new EFragment);
+
+  EFragment *prepare = new EFragment;
+  ls.push_back(prepare);
+  prepare->op = OP_PREPARE;
+  prepare->ino = 1;
+  prepare->bits = 5;
+}
+
+// Serialize the rollback record (version 1, compat 1): just the fnode that
+// the fnode member points at.
+void dirfrag_rollback::encode(bufferlist &bl) const
+{
+  ENCODE_START(1, 1, bl);
+  // fnode is dereferenced unconditionally here
+  encode(*fnode, bl);
+  ENCODE_FINISH(bl);
+}
+
+// Deserialize the rollback record: decode into a freshly allocated fnode and
+// install it as this record's fnode only once decoding has succeeded.
+void dirfrag_rollback::decode(bufferlist::const_iterator &bl)
+{
+  DECODE_START(1, bl);
+  auto fresh = CDir::allocate_fnode();
+  decode(*fresh, bl);
+  fnode = std::move(fresh);
+  DECODE_FINISH(bl);
+}
+
+
+
+// =========================================================================
+
+// -----------------------
+// EExport
+
+/**
+ * Replay a subtree export.
+ *
+ * Replays the metablob, resolves the base dirfrag and every bound to cached
+ * CDirs (asserting that each one is present), then sets the bounded
+ * subtree's authority to CDIR_AUTH_UNDEF and tries to trim it.
+ */
+void EExport::replay(MDSRank *mds)
+{
+  dout(10) << "EExport.replay " << base << dendl;
+  auto&& seg = get_segment();
+  metablob.replay(mds, seg);
+
+  CDir *dir = mds->mdcache->get_dirfrag(base);
+  ceph_assert(dir);
+
+  // translate the journaled bound dirfrags into cached CDir pointers
+  set<CDir*> realbounds;
+  for (const auto& bound_df : bounds) {
+    CDir *bd = mds->mdcache->get_dirfrag(bound_df);
+    ceph_assert(bd);
+    realbounds.insert(bd);
+  }
+
+  // adjust auth away
+  mds->mdcache->adjust_bounded_subtree_auth(dir, realbounds, CDIR_AUTH_UNDEF);
+
+  mds->mdcache->try_trim_non_auth_subtree(dir);
+}
+
+// Serialize this event into bl as encoding version 4 (compat 3).
+// The field order is the on-disk layout and must match EExport::decode().
+void EExport::encode(bufferlist& bl, uint64_t features) const
+{
+  ENCODE_START(4, 3, bl);
+  encode(stamp, bl);
+  // the metablob is the only member encoded with the feature bits
+  encode(metablob, bl, features);
+  encode(base, bl);
+  encode(bounds, bl);
+  // target is only read back when struct_v >= 4
+  encode(target, bl);
+  ENCODE_FINISH(bl);
+}
+
+// Deserialize an EExport from bl.  stamp is read from version 2 on and
+// target from version 4 on; older encodings leave those members untouched.
+void EExport::decode(bufferlist::const_iterator &bl)
+{
+  // NOTE(review): the decoder announces version 3 although encode() writes
+  // version 4 and struct_v >= 4 is handled below -- confirm whether the
+  // first argument should be 4.
+  DECODE_START_LEGACY_COMPAT_LEN(3, 3, 3, bl);
+  if (struct_v >= 2)
+    decode(stamp, bl);
+  decode(metablob, bl);
+  decode(base, bl);
+  decode(bounds, bl);
+  if (struct_v >= 4)
+    decode(target, bl);
+  DECODE_FINISH(bl);
+}
+
+// Write the stamp, the base dirfrag and the list of bound dirfrags to the
+// formatter.  The metablob is not dumped.
+void EExport::dump(Formatter *f) const
+{
+  f->dump_float("stamp", (double)stamp);
+  /*f->open_object_section("Metablob");
+  metablob.dump(f); // sadly we don't have this; dunno if we'll get it
+  f->close_section();*/
+  f->dump_stream("base dirfrag") << base;
+
+  f->open_array_section("bounds dirfrags");
+  for (const auto& bound : bounds)
+    f->dump_stream("dirfrag") << bound;
+  f->close_section(); // bounds dirfrags
+}
+
+// Append a single default-constructed, heap-allocated EExport to ls.
+void EExport::generate_test_instances(std::list<EExport*>& ls)
+{
+  ls.push_back(new EExport());
+}
+
+
+// -----------------------
+// EImportStart
+
+// Record this event's client map version (cmapv) as the sessionmapv of the
+// log segment the event belongs to.
+void EImportStart::update_segment()
+{
+  auto&& seg = get_segment();
+  seg->sessionmapv = cmapv;
+}
+
+/**
+ * Replay the start of a subtree import.
+ *
+ * Replays the metablob, registers base/bounds as an ambiguous import, sets
+ * the bounded subtree's authority to (this rank, this rank) and, when the
+ * local sessionmap version is older than cmapv, replays the client sessions
+ * encoded in client_map.  Finally updates the log segment's sessionmapv.
+ */
+void EImportStart::replay(MDSRank *mds)
+{
+  dout(10) << "EImportStart.replay " << base << " bounds " << bounds << dendl;
+  //metablob.print(*_dout);
+  auto&& seg = get_segment();
+  metablob.replay(mds, seg);
+
+  // put in ambiguous import list
+  mds->mdcache->add_ambiguous_import(base, bounds);
+
+  // set auth partially to us so we don't trim it
+  CDir *dir = mds->mdcache->get_dirfrag(base);
+  ceph_assert(dir);
+
+  set<CDir*> realbounds;
+  for (const auto& bound_df : bounds) {
+    CDir *bd = mds->mdcache->get_dirfrag(bound_df);
+    ceph_assert(bd);
+    // bounds that are not subtree roots lose their AUTH state bit
+    if (!bd->is_subtree_root())
+      bd->state_clear(CDir::STATE_AUTH);
+    realbounds.insert(bd);
+  }
+
+  const mds_authority_t ambiguous_auth(mds->get_nodeid(), mds->get_nodeid());
+  mds->mdcache->adjust_bounded_subtree_auth(dir, realbounds, ambiguous_auth);
+
+  // open client sessions?
+  if (mds->sessionmap.get_version() < cmapv) {
+    dout(10) << "EImportStart.replay sessionmap " << mds->sessionmap.get_version()
+	     << " < " << cmapv << dendl;
+    using ceph::decode;
+    map<client_t,entity_inst_t> cm;
+    map<client_t,client_metadata_t> cmm;
+    auto blp = client_map.cbegin();
+    decode(cm, blp);
+    // the client metadata map is only decoded if bytes remain after cm
+    if (!blp.end())
+      decode(cmm, blp);
+    mds->sessionmap.replay_open_sessions(cmapv, cm, cmm);
+  } else {
+    dout(10) << "EImportStart.replay sessionmap " << mds->sessionmap.get_version()
+	     << " >= " << cmapv << ", noop" << dendl;
+  }
+  update_segment();
+}
+
+// Serialize this event into bl as encoding version 4 (compat 3).
+// The field order is the on-disk layout and must match
+// EImportStart::decode().
+void EImportStart::encode(bufferlist &bl, uint64_t features) const {
+  ENCODE_START(4, 3, bl);
+  encode(stamp, bl);
+  encode(base, bl);
+  // the metablob is the only member encoded with the feature bits
+  encode(metablob, bl, features);
+  encode(bounds, bl);
+  encode(cmapv, bl);
+  encode(client_map, bl);
+  // from is only read back when struct_v >= 4
+  encode(from, bl);
+  ENCODE_FINISH(bl);
+}
+
+// Deserialize an EImportStart from bl.  stamp is read from version 2 on and
+// from from version 4 on; older encodings leave those members untouched.
+void EImportStart::decode(bufferlist::const_iterator &bl) {
+  // NOTE(review): the decoder announces version 3 although encode() writes
+  // version 4 and struct_v >= 4 is handled below -- confirm whether the
+  // first argument should be 4.
+  DECODE_START_LEGACY_COMPAT_LEN(3, 3, 3, bl);
+  if (struct_v >= 2)
+    decode(stamp, bl);
+  decode(base, bl);
+  decode(metablob, bl);
+  decode(bounds, bl);
+  decode(cmapv, bl);
+  decode(client_map, bl);
+  if (struct_v >= 4)
+    decode(from, bl);
+  DECODE_FINISH(bl);
+}
+
+// Write the base dirfrag and the array of boundary dirfrags to the
+// formatter.  No other member is dumped.
+void EImportStart::dump(Formatter *f) const
+{
+  f->dump_stream("base dirfrag") << base;
+
+  f->open_array_section("boundary dirfrags");
+  for (const auto& bound : bounds)
+    f->dump_stream("frag") << bound;
+  f->close_section();
+}
+
+// Append a single default-constructed, heap-allocated EImportStart to ls.
+void EImportStart::generate_test_instances(std::list<EImportStart*>& ls)
+{
+  ls.push_back(new EImportStart);
+}
+
+// -----------------------
+// EImportFinish
+
+/**
+ * Replay the end of a subtree import.
+ *
+ * If base is a known ambiguous import it is either finished (success) or
+ * cancelled: authority over the bounded subtree is set to CDIR_AUTH_UNDEF,
+ * the ambiguous import is cancelled and the subtree is trimmed if possible.
+ * If base is not marked ambiguous, the rank is reported as damaged.
+ */
+void EImportFinish::replay(MDSRank *mds)
+{
+  if (!mds->mdcache->have_ambiguous_import(base)) {
+    // this shouldn't happen unless this is an old journal
+    dout(10) << "EImportFinish.replay " << base << " success=" << success
+	     << " on subtree not marked as ambiguous"
+	     << dendl;
+    mds->clog->error() << "failure replaying journal (EImportFinish)";
+    mds->damaged();
+    ceph_abort();  // Should be unreachable because damaged() calls respawn()
+  }
+
+  dout(10) << "EImportFinish.replay " << base << " success=" << success << dendl;
+  if (success) {
+    mds->mdcache->finish_ambiguous_import(base);
+    return;
+  }
+
+  // failed import: give the subtree up again
+  CDir *dir = mds->mdcache->get_dirfrag(base);
+  ceph_assert(dir);
+  vector<dirfrag_t> bounds;
+  mds->mdcache->get_ambiguous_import_bounds(base, bounds);
+  mds->mdcache->adjust_bounded_subtree_auth(dir, bounds, CDIR_AUTH_UNDEF);
+  mds->mdcache->cancel_ambiguous_import(dir);
+  mds->mdcache->try_trim_non_auth_subtree(dir);
+}
+
+// Serialize stamp, base and success into bl as encoding version 3
+// (compat 3).  The features argument is not used.
+void EImportFinish::encode(bufferlist& bl, uint64_t features) const
+{
+  ENCODE_START(3, 3, bl);
+  encode(stamp, bl);
+  encode(base, bl);
+  encode(success, bl);
+  ENCODE_FINISH(bl);
+}
+
+// Deserialize an EImportFinish from bl.  stamp is only read when the
+// encoded struct_v is at least 2.
+void EImportFinish::decode(bufferlist::const_iterator &bl)
+{
+  DECODE_START_LEGACY_COMPAT_LEN(3, 3, 3, bl);
+  if (struct_v >= 2)
+    decode(stamp, bl);
+  decode(base, bl);
+  decode(success, bl);
+  DECODE_FINISH(bl);
+}
+
+// Write the base dirfrag and the success flag (as the string "true" or
+// "false") to the formatter.
+void EImportFinish::dump(Formatter *f) const
+{
+  const char *outcome = success ? "true" : "false";
+  f->dump_stream("base dirfrag") << base;
+  f->dump_string("success", outcome);
+}
+// Append two heap-allocated samples to ls: a default-constructed event and
+// one with success set to true.
+void EImportFinish::generate_test_instances(std::list<EImportFinish*>& ls)
+{
+  ls.push_back(new EImportFinish);
+
+  EImportFinish *succeeded = new EImportFinish;
+  ls.push_back(succeeded);
+  succeeded->success = true;
+}
+
+
+// ------------------------
+// EResetJournal
+
+// Serialize the event into bl as encoding version 2 (compat 2); the stamp
+// is the only payload.  The features argument is not used.
+void EResetJournal::encode(bufferlist& bl, uint64_t features) const
+{
+  ENCODE_START(2, 2, bl);
+  encode(stamp, bl);
+  ENCODE_FINISH(bl);
+}
+ 
+// Deserialize an EResetJournal from bl; the stamp is read unconditionally.
+void EResetJournal::decode(bufferlist::const_iterator &bl)
+{
+  DECODE_START_LEGACY_COMPAT_LEN(2, 2, 2, bl);
+  decode(stamp, bl);
+  DECODE_FINISH(bl);
+}
+
+// Write the event's stamp to the formatter under the key "timestamp".
+void EResetJournal::dump(Formatter *f) const
+{
+  f->dump_stream("timestamp") << stamp;
+}
+
+// Append a single default-constructed, heap-allocated EResetJournal to ls.
+void EResetJournal::generate_test_instances(std::list<EResetJournal*>& ls)
+{
+  ls.push_back(new EResetJournal());
+}
+
+/**
+ * Replay a journal reset.
+ *
+ * Wipes the session map, resets the inode table for replay, and makes this
+ * rank the subtree authority of its own "my" directory -- and of the root
+ * directory as well when the mdsmap names this rank as root.  Auth bits are
+ * then recalculated and the subtree map is logged.
+ */
+void EResetJournal::replay(MDSRank *mds)
+{
+  dout(1) << "EResetJournal" << dendl;
+
+  mds->sessionmap.wipe();
+  mds->inotable->replay_reset();
+
+  MDCache *cache = mds->mdcache;
+
+  if (mds->mdsmap->get_root() == mds->get_nodeid()) {
+    CDir *rootdir = cache->get_root()->get_or_open_dirfrag(cache, frag_t());
+    cache->adjust_subtree_auth(rootdir, mds->get_nodeid());
+  }
+
+  CDir *mydir = cache->get_myin()->get_or_open_dirfrag(cache, frag_t());
+  cache->adjust_subtree_auth(mydir, mds->get_nodeid());
+
+  cache->recalc_auth_bits(true);
+
+  cache->show_subtrees();
+}
+
+
+// Serialize a no-op entry (version 2, compat 2): pad_size followed by
+// pad_size filler bytes of value 0xff.  The features argument is not used.
+void ENoOp::encode(bufferlist &bl, uint64_t features) const
+{
+  ENCODE_START(2, 2, bl);
+  encode(pad_size, bl);
+
+  const uint8_t filler = 0xff;
+  unsigned int written = 0;
+  while (written < pad_size) {
+    encode(filler, bl);
+    ++written;
+  }
+  ENCODE_FINISH(bl);
+}
+
+
+// Deserialize a no-op entry: read pad_size and skip exactly that many
+// padding bytes.  Throws buffer::end_of_buffer when the bytes remaining in
+// the iterator do not equal pad_size.
+void ENoOp::decode(bufferlist::const_iterator &bl)
+{
+  DECODE_START(2, bl);
+  decode(pad_size, bl);
+  if (bl.get_remaining() == pad_size) {
+    bl += pad_size;
+  } else {
+    // This is spiritually an assertion, but expressing in a way that will let
+    // journal debug tools catch it and recognise a malformed entry.
+    throw buffer::end_of_buffer();
+  }
+  DECODE_FINISH(bl);
+}
+
+
+// Replaying a no-op entry changes nothing; it only logs (at debug level 4)
+// how many padding bytes the entry carried.
+void ENoOp::replay(MDSRank *mds)
+{
+  dout(4) << "ENoOp::replay, " << pad_size << " bytes skipped in journal" << dendl;
+}
+
+/**
+ * Update the segment sequence numbers stored in truncate_finish.
+ *
+ * Intended for re-formatting an old journal that used absolute log
+ * position references as segment sequence numbers.  Every entry whose
+ * sequence number has a mapping in old_to_new is rewritten to the mapped
+ * value; entries without a mapping are kept unchanged.
+ *
+ * @param mds
+ * MDSRank instance, just used for logging
+ * @param old_to_new
+ * Map of old journal segment sequence numbers to new journal segment sequence numbers
+ *
+ * @return
+ * True if the event was modified.
+ */
+bool EMetaBlob::rewrite_truncate_finish(MDSRank const *mds,
+    std::map<LogSegment::seq_t, LogSegment::seq_t> const &old_to_new)
+{
+  map<inodeno_t, LogSegment::seq_t> remapped;
+  bool modified = false;
+
+  for (const auto& entry : truncate_finish) {
+    const auto mapping = old_to_new.find(entry.second);
+    if (mapping == old_to_new.end()) {
+      dout(20) << __func__ << " no segment seq mapping found for "
+        << entry.second << dendl;
+      remapped.insert(entry);
+      continue;
+    }
+
+    dout(20) << __func__ << " applying segment seq mapping "
+      << entry.second << " -> " << mapping->second << dendl;
+    remapped.emplace(entry.first, mapping->second);
+    modified = true;
+  }
+
+  truncate_finish.swap(remapped);
+  return modified;
+}
diff --git a/src/mds/locks.c b/src/mds/locks.c
new file mode 100644
index 000000000..dbe3ab8eb
--- /dev/null
+++ b/src/mds/locks.c
@@ -0,0 +1,159 @@
+#include "include/int_types.h"
+
+#include "locks.h"
+
+/* Duplicated from ceph_fs.h, which we cannot include into a C file.  */
+/* Each value is a distinct bit; the tables below OR them together. */
+#define CEPH_CAP_GSHARED     1  /* client can read */
+#define CEPH_CAP_GEXCL       2  /* client can read and update */
+#define CEPH_CAP_GCACHE      4  /* (file) client can cache reads */
+#define CEPH_CAP_GRD         8  /* (file) client can read */
+#define CEPH_CAP_GWR        16  /* (file) client can write */
+#define CEPH_CAP_GBUFFER    32  /* (file) client can buffer writes */
+#define CEPH_CAP_GWREXTEND  64  /* (file) client can extend EOF */
+#define CEPH_CAP_GLAZYIO   128  /* (file) client can perform lazy io */
+
+/*
+ * State table for the simple lock state machine, indexed by LOCK_* state.
+ * Each row initializes a struct sm_state_t in field order: next (the
+ * "stable" column, 0 if the state itself is stable), loner, replica_state,
+ * then can_read, can_read_projected, can_rdlock, can_wrlock,
+ * can_force_wrlock, can_lease and can_xlock (each 0, ANY, AUTH, XCL or REQ),
+ * and finally caps, loner_caps, xlocker_caps and replica_caps.
+ * States without a row are zero-initialized.
+ */
+static const struct sm_state_t simplelock[LOCK_MAX] = {
+                      // stable     loner  rep state  r     rp   rd   wr   fwr  l    x    caps,other
+    [LOCK_SYNC]      = { 0,         false, LOCK_SYNC, ANY,  0,   ANY, 0,   0,   ANY, 0,   CEPH_CAP_GSHARED,0,0,CEPH_CAP_GSHARED },
+    [LOCK_LOCK_SYNC] = { LOCK_SYNC, false, LOCK_LOCK, AUTH, XCL, XCL, 0,   0,   XCL, 0,   0,0,0,0 },
+    [LOCK_EXCL_SYNC] = { LOCK_SYNC, true,  LOCK_LOCK, 0,    0,   0,   0,   XCL, 0,   0,   0,CEPH_CAP_GSHARED,0,0 },
+    [LOCK_SNAP_SYNC] = { LOCK_SYNC, false, LOCK_LOCK, 0,    0,   0,   0,   AUTH,0,   0,   0,0,0,0 },
+
+    [LOCK_LOCK]      = { 0,         false, LOCK_LOCK, AUTH, 0,   REQ, 0,   0,   0,   0,   0,0,0,0 },
+    [LOCK_SYNC_LOCK] = { LOCK_LOCK, false, LOCK_LOCK, ANY,  0,   0,   0,   0,   0,   0,   0,0,0,0 }, 
+    [LOCK_EXCL_LOCK] = { LOCK_LOCK, false, LOCK_LOCK, 0,    0,   0,   0,   XCL, 0,   0,   0,0,0,0 },
+
+    [LOCK_PREXLOCK]  = { LOCK_LOCK, false, LOCK_LOCK, 0,    XCL, 0,   0,   0,   0,   ANY, 0,0,0,0 },
+    [LOCK_XLOCK]     = { LOCK_SYNC, false, LOCK_LOCK, 0,    XCL, 0,   0,   0,   0,   0,   0,0,0,0 },
+    [LOCK_XLOCKDONE] = { LOCK_SYNC, false, LOCK_LOCK, XCL,  XCL, XCL, 0,   0,   XCL, 0,   0,0,CEPH_CAP_GSHARED,0 },
+    [LOCK_LOCK_XLOCK]= { LOCK_PREXLOCK,false,LOCK_LOCK,0,   XCL, 0,   0,   0,   0,   XCL, 0,0,0,0 },
+
+    [LOCK_EXCL]      = { 0,         true,  LOCK_LOCK, 0,    0,   REQ, XCL, 0,   0,   0,   0,CEPH_CAP_GEXCL|CEPH_CAP_GSHARED,0,0 },
+    [LOCK_SYNC_EXCL] = { LOCK_EXCL, true,  LOCK_LOCK, ANY,  0,   0,   0,   0,   0,   0,   0,CEPH_CAP_GSHARED,0,0 },
+    [LOCK_LOCK_EXCL] = { LOCK_EXCL, false, LOCK_LOCK, AUTH, 0,   0,   0,   0,   0,   0,   CEPH_CAP_GSHARED,0,0,0 },
+
+    [LOCK_REMOTEXLOCK]={ LOCK_LOCK, false, LOCK_LOCK, 0,    0,   0,   0,   0,   0,   0,   0,0,0,0 },
+
+};
+
+/*
+ * Exported descriptor for the simple lock state machine: the simplelock
+ * state table plus its cap masks.  Remote xlock is enabled for this type.
+ */
+const struct sm_t sm_simplelock = {
+	.states = simplelock,
+	.allowed_ever_auth = CEPH_CAP_GSHARED | CEPH_CAP_GEXCL,
+	.allowed_ever_replica = CEPH_CAP_GSHARED,
+	.careful = CEPH_CAP_GSHARED | CEPH_CAP_GEXCL,
+	.can_remote_xlock = 1,
+};
+
+
+// lock state machine states:
+//  Sync  --  Lock  --  sCatter
+//  Tempsync _/
+// (out of date)
+
+/*
+ * State table for the scatter lock state machine, indexed by LOCK_* state.
+ * Rows follow the field order of struct sm_state_t (see the column header
+ * comment); states without a row are zero-initialized.
+ */
+static const struct sm_state_t scatterlock[LOCK_MAX] = {
+                      // stable     loner  rep state  r     rp   rd   wr   fwr  l    x    caps,other
+    [LOCK_SYNC]      = { 0,         false, LOCK_SYNC, ANY,  0,   ANY, 0,   0,   ANY, 0,   CEPH_CAP_GSHARED,0,0,CEPH_CAP_GSHARED },
+    [LOCK_LOCK_SYNC] = { LOCK_SYNC, false, LOCK_LOCK, AUTH, 0,   0,   0,   0,   0,   0,   0,0,0,0 },
+    [LOCK_MIX_SYNC]  = { LOCK_SYNC, false, LOCK_LOCK, 0,    0,   0,   0,   0,   0,   0,   0,0,0,0 },
+    [LOCK_SNAP_SYNC] = { LOCK_SYNC, false, LOCK_LOCK, 0,    0,   0,   0,   AUTH,0,   0,   0,0,0,0 },
+   
+    [LOCK_LOCK]      = { 0,         false, LOCK_LOCK, AUTH, 0,   REQ, AUTH,0,   0,   ANY, 0,0,0,0 },
+    [LOCK_SYNC_LOCK] = { LOCK_LOCK, false, LOCK_LOCK, ANY,  0,   0,   0,   0,   0,   0,   0,0,0,0 },
+    [LOCK_MIX_LOCK]  = { LOCK_LOCK, false, LOCK_MIX,  0,    0,   0,   0,   0,   0,   0,   0,0,0,0 },
+    [LOCK_MIX_LOCK2] = { LOCK_LOCK, false, LOCK_LOCK, 0,    0,   0,   0,   0,   0,   0,   0,0,0,0 },
+    [LOCK_TSYN_LOCK] = { LOCK_LOCK, false, LOCK_LOCK, AUTH, 0,   0,   0,   0,   0,   0,   0,0,0,0 },
+    
+    [LOCK_TSYN]      = { 0,         false, LOCK_LOCK, AUTH, 0,   AUTH,0,   0,   0,   0,   0,0,0,0 },
+    [LOCK_LOCK_TSYN] = { LOCK_TSYN, false, LOCK_LOCK, 0,    0,   0,   0,   0,   0,   0,   0,0,0,0 },
+    [LOCK_MIX_TSYN]  = { LOCK_TSYN, false, LOCK_LOCK, 0,    0,   0,   0,   0,   0,   0,   0,0,0,0 },
+
+    [LOCK_MIX]       = { 0,         false, LOCK_MIX,  0,    0,   REQ, ANY, 0,   0,   0,   0,0,0,0 },
+    [LOCK_TSYN_MIX]  = { LOCK_MIX,  false, LOCK_LOCK, 0,    0,   0,   0,   0,   0,   0,   0,0,0,0 },
+    [LOCK_SYNC_MIX]  = { LOCK_MIX,  false, LOCK_SYNC_MIX2,ANY,0, 0,   0,   0,   0,   0,   0,0,0,0 },
+    [LOCK_SYNC_MIX2] = { LOCK_MIX,  false, 0,         0,    0,   0,   0,   0,   0,   0,   0,0,0,0 },
+};
+
+/*
+ * Exported descriptor for the scatter lock state machine: the scatterlock
+ * state table plus its cap masks.  Remote xlock is disabled for this type.
+ */
+const struct sm_t sm_scatterlock = {
+	.states = scatterlock,
+	.allowed_ever_auth = CEPH_CAP_GSHARED | CEPH_CAP_GEXCL,
+	.allowed_ever_replica = CEPH_CAP_GSHARED,
+	.careful = CEPH_CAP_GSHARED | CEPH_CAP_GEXCL,
+	.can_remote_xlock = 0,
+};
+
+/*
+ * State table for the file lock state machine, indexed by LOCK_* state.
+ * Rows follow the field order of struct sm_state_t; the last four columns
+ * are the cap masks for any client, the loner, the xlocker and replicas.
+ * States without a row are zero-initialized.
+ * NOTE(review): unlike simplelock and scatterlock this table is not
+ * declared static -- confirm whether external linkage is intended.
+ */
+const struct sm_state_t filelock[LOCK_MAX] = {
+                      // stable     loner  rep state  r     rp   rd   wr   fwr  l    x    caps(any,loner,xlocker,replica)
+    [LOCK_SYNC]      = { 0,         false, LOCK_SYNC, ANY,  0,   ANY, 0,   0,   ANY, 0,   CEPH_CAP_GSHARED|CEPH_CAP_GCACHE|CEPH_CAP_GRD|CEPH_CAP_GLAZYIO,0,0,CEPH_CAP_GSHARED|CEPH_CAP_GCACHE|CEPH_CAP_GRD },
+    [LOCK_LOCK_SYNC] = { LOCK_SYNC, false, LOCK_LOCK, AUTH, 0,   0,   0,   0,   0,   0,   CEPH_CAP_GCACHE,0,0,0 },
+    [LOCK_EXCL_SYNC] = { LOCK_SYNC, true,  LOCK_LOCK, 0,    0,   0,   0,   XCL, 0,   0,   0,CEPH_CAP_GSHARED|CEPH_CAP_GCACHE|CEPH_CAP_GRD,0,0 },
+    [LOCK_MIX_SYNC]  = { LOCK_SYNC, false, LOCK_MIX_SYNC2,0,0,   0,   0,   0,   0,   0,   CEPH_CAP_GRD|CEPH_CAP_GLAZYIO,0,0,CEPH_CAP_GRD },
+    [LOCK_MIX_SYNC2] = { LOCK_SYNC, false, 0,         0,    0,   0,   0,   0,   0,   0,   CEPH_CAP_GRD|CEPH_CAP_GLAZYIO,0,0,CEPH_CAP_GRD },
+    [LOCK_SNAP_SYNC] = { LOCK_SYNC, false, LOCK_LOCK, 0,    0,   0,   0,   AUTH,0,   0,   0,0,0,0 },
+    [LOCK_XSYN_SYNC] = { LOCK_SYNC, true,  LOCK_LOCK, AUTH, 0,   AUTH,0,   0,   0,   0,   0,CEPH_CAP_GCACHE,0,0 },
+  
+    [LOCK_LOCK]      = { 0,         false, LOCK_LOCK, AUTH, 0,   REQ, AUTH,0,   0,   0,   CEPH_CAP_GCACHE|CEPH_CAP_GBUFFER,0,0,0 },
+    [LOCK_SYNC_LOCK] = { LOCK_LOCK, false, LOCK_LOCK, ANY,  0,   REQ, 0,   0,   0,   0,   CEPH_CAP_GCACHE,0,0,0 },
+    [LOCK_EXCL_LOCK] = { LOCK_LOCK, false, LOCK_LOCK, 0,    0,   0,   0,   XCL, 0,   0,   CEPH_CAP_GCACHE|CEPH_CAP_GBUFFER,0,0,0 },
+    [LOCK_MIX_LOCK]  = { LOCK_LOCK, false, LOCK_MIX,  0,    0,   REQ, 0,   0,   0,   0,   0,0,0,0 },
+    [LOCK_MIX_LOCK2] = { LOCK_LOCK, false, LOCK_LOCK, 0,    0,   REQ, 0,   0,   0,   0,   0,0,0,0 },
+    [LOCK_XSYN_LOCK] = { LOCK_LOCK, true,  LOCK_LOCK, AUTH, 0,   0,   XCL, 0,   0,   0,   0,CEPH_CAP_GCACHE|CEPH_CAP_GBUFFER,0,0 },
+
+    [LOCK_PREXLOCK]  = { LOCK_LOCK, false, LOCK_LOCK, 0,    XCL, 0,   0,   0,   0,   ANY, CEPH_CAP_GCACHE|CEPH_CAP_GBUFFER,0,0,0 },
+    [LOCK_XLOCK]     = { LOCK_LOCK, false, LOCK_LOCK, 0,    XCL, 0,   0,   0,   0,   0,   CEPH_CAP_GCACHE|CEPH_CAP_GBUFFER,0,0,0 },
+    [LOCK_XLOCKDONE] = { LOCK_LOCK, false, LOCK_LOCK, XCL,  XCL, XCL, 0,   0,   XCL, 0,   CEPH_CAP_GCACHE|CEPH_CAP_GBUFFER,0,CEPH_CAP_GSHARED,0 },
+    [LOCK_XLOCKSNAP] = { LOCK_LOCK, false, LOCK_LOCK, 0,    XCL, 0,   0,   0,   0,   0,   CEPH_CAP_GCACHE,0,0,0 },
+    [LOCK_LOCK_XLOCK]= { LOCK_PREXLOCK,false,LOCK_LOCK,0,   XCL, 0,   0,   0,   0,   XCL, CEPH_CAP_GCACHE|CEPH_CAP_GBUFFER,0,0,0 },
+
+    [LOCK_MIX]       = { 0,         false, LOCK_MIX,  0,    0,   REQ, ANY, 0,   0,   0,   CEPH_CAP_GRD|CEPH_CAP_GWR|CEPH_CAP_GLAZYIO,0,0,CEPH_CAP_GRD },
+    [LOCK_SYNC_MIX]  = { LOCK_MIX,  false, LOCK_SYNC_MIX2,ANY,0, 0,   0,   0,   0,   0,   CEPH_CAP_GRD|CEPH_CAP_GLAZYIO,0,0,CEPH_CAP_GRD },
+    [LOCK_SYNC_MIX2] = { LOCK_MIX,  false, 0,         0,    0,   0,   0,   0,   0,   0,   CEPH_CAP_GRD|CEPH_CAP_GLAZYIO,0,0,CEPH_CAP_GRD },
+    [LOCK_EXCL_MIX]  = { LOCK_MIX,  true,  LOCK_LOCK, 0,    0,   0,   XCL, 0,   0,   0,   0,CEPH_CAP_GRD|CEPH_CAP_GWR,0,0 },
+    [LOCK_XSYN_MIX]  = { LOCK_MIX,  true,  LOCK_LOCK, 0,    0,   0,   XCL, 0,   0,   0,   0,0,0,0 },
+    
+    [LOCK_EXCL]      = { 0,         true,  LOCK_LOCK, 0,    0,   XCL, XCL, 0,   0,   0,   0,CEPH_CAP_GSHARED|CEPH_CAP_GEXCL|CEPH_CAP_GCACHE|CEPH_CAP_GRD|CEPH_CAP_GWR|CEPH_CAP_GBUFFER,0,0 },
+    [LOCK_SYNC_EXCL] = { LOCK_EXCL, true,  LOCK_LOCK, ANY,  0,   0,   0,   0,   0,   0,   0,CEPH_CAP_GSHARED|CEPH_CAP_GCACHE|CEPH_CAP_GRD,0,0 },
+    [LOCK_MIX_EXCL]  = { LOCK_EXCL, true,  LOCK_LOCK, 0,    0,   0,   XCL, 0,   0,   0,   0,CEPH_CAP_GRD|CEPH_CAP_GWR,0,0 },
+    [LOCK_LOCK_EXCL] = { LOCK_EXCL, true,  LOCK_LOCK, AUTH, 0,   0,   0,   0,   0,   0,   0,CEPH_CAP_GCACHE|CEPH_CAP_GBUFFER,0,0 },
+    [LOCK_XSYN_EXCL] = { LOCK_EXCL, true,  LOCK_LOCK, AUTH, 0,   XCL, 0,   0,   0,   0,   0,CEPH_CAP_GCACHE|CEPH_CAP_GBUFFER,0,0 },
+
+    [LOCK_XSYN]      = { 0,         true,  LOCK_LOCK, AUTH, AUTH,AUTH,XCL, 0,   0,   0,   0,CEPH_CAP_GCACHE|CEPH_CAP_GBUFFER,0,0 },
+    [LOCK_EXCL_XSYN] = { LOCK_XSYN, false, LOCK_LOCK, 0,    0,   XCL, 0,   0,   0,   0,   0,CEPH_CAP_GCACHE|CEPH_CAP_GBUFFER,0,0 },
+
+    [LOCK_PRE_SCAN]  = { LOCK_SCAN, false, LOCK_LOCK, 0,    0,   0,   0,   0,   0,   0,   0,0,0,0 },
+    [LOCK_SCAN]      = { LOCK_LOCK, false, LOCK_LOCK, 0,    0,   0,   0,   0,   0,   0,   0,0,0,0 },
+};
+
+/*
+ * Exported descriptor for the file lock state machine: the filelock state
+ * table plus its cap masks.  Remote xlock is disabled for this type.
+ */
+const struct sm_t sm_filelock = {
+	.states = filelock,
+	.allowed_ever_auth = (CEPH_CAP_GSHARED |
+			      CEPH_CAP_GEXCL |
+			      CEPH_CAP_GCACHE |
+			      CEPH_CAP_GRD |
+			      CEPH_CAP_GWR |
+			      CEPH_CAP_GWREXTEND |
+			      CEPH_CAP_GBUFFER | 
+			      CEPH_CAP_GLAZYIO),
+	.allowed_ever_replica = (CEPH_CAP_GSHARED |
+				 CEPH_CAP_GCACHE |
+				 CEPH_CAP_GRD | 
+				 CEPH_CAP_GLAZYIO),
+	.careful = (CEPH_CAP_GSHARED | 
+		    CEPH_CAP_GEXCL | 
+		    CEPH_CAP_GCACHE |
+		    CEPH_CAP_GBUFFER),
+	.can_remote_xlock = 0,
+};
+
+
+/*
+ * State table for the local lock state machine.  LOCK_LOCK is the only
+ * state with a row; it is stable (next == 0) and grants no caps.
+ */
+const struct sm_state_t locallock[LOCK_MAX] = {
+                      // stable     loner  rep state  r     rp   rd   wr   fwr  l    x    caps(any,loner,xlocker,replica)
+    [LOCK_LOCK]      = { 0,         false, LOCK_LOCK, ANY,  0,   ANY, 0,   0,   ANY, AUTH,0,0,0,0 },
+};
+
+/*
+ * Exported descriptor for the local lock state machine: the locallock state
+ * table with all cap masks empty and remote xlock disabled.
+ */
+const struct sm_t sm_locallock = {
+  .states = locallock,
+  .allowed_ever_auth = 0,
+  .allowed_ever_replica = 0,
+  .careful = 0,
+  .can_remote_xlock = 0,
+};
diff --git a/src/mds/locks.h b/src/mds/locks.h
new file mode 100644
index 000000000..e6fdc1cfc
--- /dev/null
+++ b/src/mds/locks.h
@@ -0,0 +1,126 @@
+#ifndef CEPH_MDS_LOCKS_H
+#define CEPH_MDS_LOCKS_H
+#include <stdbool.h>
+
+/*
+ * One row of a lock state machine table: the description of a single lock
+ * state.  The can_* members hold 0 or one of the ANY / AUTH / XCL / REQ
+ * codes defined below; the *caps members hold bitmasks of CEPH_CAP_G* bits.
+ */
+struct sm_state_t {
+  int next;         // 0 if stable
+  bool loner;
+  int replica_state;        // LOCK_* state listed for replicas
+  char can_read;
+  char can_read_projected;
+  char can_rdlock;
+  char can_wrlock;
+  char can_force_wrlock;
+  char can_lease;
+  char can_xlock;
+  int caps;                 // caps column "any"
+  int loner_caps;
+  int xlocker_caps;
+  int replica_caps;
+};
+
+/*
+ * Descriptor of one lock state machine: its state table plus cap bitmasks
+ * and a flag that apply to the machine as a whole.
+ */
+struct sm_t {
+  const struct sm_state_t *states;  // table of LOCK_MAX entries, indexed by LOCK_* state
+  int allowed_ever_auth;
+  int allowed_ever_replica;
+  int careful;
+  int can_remote_xlock;             // used as a boolean (0 or 1)
+};
+
+// Permission codes stored in the can_* members of struct sm_state_t.
+// A value of 0 is used in the tables where none of these applies.
+#define ANY  1 // auth or replica
+#define AUTH 2 // auth only
+#define XCL  3 // auth or exclusive client
+//#define FW   4 // fw to auth, if replica
+#define REQ  5 // req state change from auth, if replica
+
+// The four lock state machine descriptors defined in locks.c.
+extern const struct sm_t sm_simplelock;
+extern const struct sm_t sm_filelock;
+extern const struct sm_t sm_scatterlock;
+extern const struct sm_t sm_locallock;
+
+
+
+// -- lock states --
+// sync <-> lock
+// Lock state identifiers.  They index the sm_state_t tables, so LOCK_MAX
+// (the last enumerator) is the table size.  LOCK_UNDEF is 0, which is also
+// the value the tables use for "stable" in the next column.
+enum {
+  LOCK_UNDEF = 0,
+
+  //                                    auth               rep
+  LOCK_SYNC,    // AR   R . RD L . / C .   R RD L . / C . 
+  LOCK_LOCK,    // AR   R . .. . X / . .   . .. . . / . .
+
+  LOCK_PREXLOCK,    // A    . . .. . . / . .   (lock)
+  LOCK_XLOCK,       // A    . . .. . . / . .   (lock)
+  LOCK_XLOCKDONE,   // A    r p rd l x / . .   (lock)  <-- by same client only!!
+  LOCK_XLOCKSNAP,   // also revoke Fb
+  LOCK_LOCK_XLOCK,
+
+  LOCK_SYNC_LOCK,    // AR   R . .. . . / . .   R .. . . / . .
+  LOCK_LOCK_SYNC,    // A    R p rd l . / . .   (lock)  <-- lc by same client only
+
+  LOCK_EXCL,         // A    . . .. . . / c x * (lock)
+  LOCK_EXCL_SYNC,    // A    . . .. . . / c . * (lock)
+  LOCK_EXCL_LOCK,    // A    . . .. . . / . .   (lock)
+  LOCK_SYNC_EXCL,    // Ar   R . .. . . / c . * (sync->lock)
+  LOCK_LOCK_EXCL,    // A    R . .. . . / . .   (lock)
+
+  LOCK_REMOTEXLOCK,  // on NON-auth
+
+  // * = loner mode
+
+  LOCK_MIX,
+  LOCK_SYNC_MIX,
+  LOCK_SYNC_MIX2,
+  LOCK_LOCK_MIX,
+  LOCK_EXCL_MIX,
+  LOCK_MIX_SYNC,
+  LOCK_MIX_SYNC2,
+  LOCK_MIX_LOCK,
+  LOCK_MIX_LOCK2,
+  LOCK_MIX_EXCL,
+
+  LOCK_TSYN,
+  LOCK_TSYN_LOCK,
+  LOCK_TSYN_MIX,
+  LOCK_LOCK_TSYN,
+  LOCK_MIX_TSYN,
+
+  LOCK_PRE_SCAN,
+  LOCK_SCAN,
+
+  LOCK_SNAP_SYNC,
+
+  LOCK_XSYN,
+  LOCK_XSYN_EXCL,
+  LOCK_EXCL_XSYN,
+  LOCK_XSYN_SYNC,
+  LOCK_XSYN_LOCK,
+  LOCK_XSYN_MIX,
+
+  LOCK_MAX,
+};
+
+// -------------------------
+// lock actions
+
+// for replicas
+// Lock action codes.  The sign encodes the addressee: negative codes are
+// for replicas and positive codes for the auth, which is exactly what the
+// LOCK_AC_FOR_REPLICA / LOCK_AC_FOR_AUTH macros test.
+#define LOCK_AC_SYNC        -1
+#define LOCK_AC_MIX         -2
+#define LOCK_AC_LOCK        -3
+#define LOCK_AC_LOCKFLUSHED -4
+
+// for auth
+#define LOCK_AC_SYNCACK      1
+#define LOCK_AC_MIXACK     2
+#define LOCK_AC_LOCKACK      3
+
+#define LOCK_AC_REQSCATTER   7
+#define LOCK_AC_REQUNSCATTER 8
+#define LOCK_AC_NUDGE        9
+#define LOCK_AC_REQRDLOCK   10
+
+#define LOCK_AC_FOR_REPLICA(a)  ((a) < 0)
+#define LOCK_AC_FOR_AUTH(a)     ((a) > 0)
+
+
+#endif
diff --git a/src/mds/mds_table_types.h b/src/mds/mds_table_types.h
new file mode 100644
index 000000000..bfb2baa95
--- /dev/null
+++ b/src/mds/mds_table_types.h
@@ -0,0 +1,81 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software 
+ * Foundation.  See file COPYING.
+ * 
+ */
+
+#ifndef CEPH_MDSTABLETYPES_H
+#define CEPH_MDSTABLETYPES_H
+
+// MDS TABLES
+
+#include <string_view>
+
+// Identifiers of the MDS tables; get_mdstable_name() maps them to names.
+enum {
+  TABLE_ANCHOR,
+  TABLE_SNAP,
+};
+
+// Return the name of MDS table t ("anchortable" or "snaptable").
+// Any other value aborts via ceph_abort().
+inline std::string_view get_mdstable_name(int t) {
+  std::string_view name;  // stays empty only on the aborting default path
+  switch (t) {
+  case TABLE_ANCHOR:
+    name = "anchortable";
+    break;
+  case TABLE_SNAP:
+    name = "snaptable";
+    break;
+  default:
+    ceph_abort();
+  }
+  return name;
+}
+
+// Table server operation codes; get_mdstableserver_opname() maps them to
+// names.  Magnitudes run from 1 to 11 without gaps; some codes are negative.
+enum {
+  TABLESERVER_OP_QUERY        =  1,
+  TABLESERVER_OP_QUERY_REPLY  = -2,
+  TABLESERVER_OP_PREPARE      =  3,
+  TABLESERVER_OP_AGREE        = -4,
+  TABLESERVER_OP_COMMIT       =  5,
+  TABLESERVER_OP_ACK          = -6,
+  TABLESERVER_OP_ROLLBACK     =  7,
+  TABLESERVER_OP_SERVER_UPDATE = 8,
+  TABLESERVER_OP_SERVER_READY = -9,
+  TABLESERVER_OP_NOTIFY_ACK   = 10,
+  TABLESERVER_OP_NOTIFY_PREP  = -11,
+};
+
+// Return the lower-case name of table server operation op.
+// An unknown op aborts via ceph_abort().
+inline std::string_view get_mdstableserver_opname(int op) {
+  std::string_view name;  // stays empty only on the aborting default path
+  switch (op) {
+  case TABLESERVER_OP_QUERY:
+    name = "query";
+    break;
+  case TABLESERVER_OP_QUERY_REPLY:
+    name = "query_reply";
+    break;
+  case TABLESERVER_OP_PREPARE:
+    name = "prepare";
+    break;
+  case TABLESERVER_OP_AGREE:
+    name = "agree";
+    break;
+  case TABLESERVER_OP_COMMIT:
+    name = "commit";
+    break;
+  case TABLESERVER_OP_ACK:
+    name = "ack";
+    break;
+  case TABLESERVER_OP_ROLLBACK:
+    name = "rollback";
+    break;
+  case TABLESERVER_OP_SERVER_UPDATE:
+    name = "server_update";
+    break;
+  case TABLESERVER_OP_SERVER_READY:
+    name = "server_ready";
+    break;
+  case TABLESERVER_OP_NOTIFY_ACK:
+    name = "notify_ack";
+    break;
+  case TABLESERVER_OP_NOTIFY_PREP:
+    name = "notify_prep";
+    break;
+  default:
+    ceph_abort();
+  }
+  return name;
+}
+
+// Table operation codes; get_mdstable_opname() maps them to names.
+enum {
+  TABLE_OP_CREATE,
+  TABLE_OP_UPDATE,
+  TABLE_OP_DESTROY,
+};
+
+// Return the name of table operation op ("create", "update" or "destroy").
+// Any other value aborts via ceph_abort().
+inline std::string_view get_mdstable_opname(int op) {
+  std::string_view name;  // stays empty only on the aborting default path
+  switch (op) {
+  case TABLE_OP_CREATE:
+    name = "create";
+    break;
+  case TABLE_OP_UPDATE:
+    name = "update";
+    break;
+  case TABLE_OP_DESTROY:
+    name = "destroy";
+    break;
+  default:
+    ceph_abort();
+  }
+  return name;
+}
+
+#endif
diff --git a/src/mds/mdstypes.cc b/src/mds/mdstypes.cc
new file mode 100644
index 000000000..9d46c8d86
--- /dev/null
+++ b/src/mds/mdstypes.cc
@@ -0,0 +1,969 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+// vim: ts=8 sw=2 smarttab
+
+#include "mdstypes.h"
+#include "MDSContext.h"
+#include "common/Formatter.h"
+#include "common/StackStringStream.h"
+
+const mds_gid_t MDS_GID_NONE = mds_gid_t(0);  // definition of the constant declared extern in mdstypes.h
+
+using std::list;
+using std::make_pair;
+using std::ostream;
+using std::set;
+using std::vector;
+
+using ceph::bufferlist;
+using ceph::Formatter;
+
+/*
+ * frag_info_t
+ */
+
+void frag_info_t::encode(bufferlist &bl) const
+{  // Serialize into bl; decode() reads the fields back in this same order.
+  ENCODE_START(3, 2, bl);
+  encode(version, bl);
+  encode(mtime, bl);
+  encode(nfiles, bl);
+  encode(nsubdirs, bl);
+  encode(change_attr, bl);  // decode() reads this only when struct_v >= 3
+  ENCODE_FINISH(bl);
+}
+
+void frag_info_t::decode(bufferlist::const_iterator &bl)
+{  // Inverse of encode(): reads the fields in the order encode() wrote them.
+  DECODE_START_LEGACY_COMPAT_LEN(3, 2, 2, bl);
+  decode(version, bl);
+  decode(mtime, bl);
+  decode(nfiles, bl);
+  decode(nsubdirs, bl);
+  if (struct_v >= 3)
+    decode(change_attr, bl);
+  else
+    change_attr = 0;  // encodings older than v3 carry no change_attr
+  DECODE_FINISH(bl);
+}
+
+void frag_info_t::dump(Formatter *f) const
+{  // Emit the fields through f; decode_json() looks up these same key names.
+  f->dump_unsigned("version", version);
+  f->dump_stream("mtime") << mtime;
+  f->dump_unsigned("num_files", nfiles);
+  f->dump_unsigned("num_subdirs", nsubdirs);
+  f->dump_unsigned("change_attr", change_attr);
+}
+
+void frag_info_t::decode_json(JSONObj *obj){
+  // Populate each field from obj, using the key names that dump() writes.
+  JSONDecoder::decode_json("version", version, obj, true);
+  JSONDecoder::decode_json("mtime", mtime, obj, true);
+  JSONDecoder::decode_json("num_files", nfiles, obj, true);
+  JSONDecoder::decode_json("num_subdirs", nsubdirs, obj, true);
+  JSONDecoder::decode_json("change_attr", change_attr, obj, true);  // NOTE(review): trailing `true` presumably marks the key mandatory -- confirm
+}
+
+void frag_info_t::generate_test_instances(std::list<frag_info_t*>& ls)
+{
+  ls.push_back(new frag_info_t);  // first: all-default instance
+  frag_info_t *sample = ls.emplace_back(new frag_info_t);  // second: populated below
+  sample->version = 1;
+  sample->mtime = utime_t(2, 3);
+  sample->nfiles = 4;
+  sample->nsubdirs = 5;
+}
+
+ostream& operator<<(ostream &out, const frag_info_t &f)
+{
+  // "f()" when default, else "f(v<version>[ m<mtime>][ <size>=<nfiles>+<nsubdirs>])".
+  if (f == frag_info_t())
+    return out << "f()";
+  const bool has_mtime = f.mtime != utime_t();
+  const bool has_entries = f.nfiles || f.nsubdirs;
+  out << "f(v" << f.version;
+  if (has_mtime) out << " m" << f.mtime;
+  if (has_entries) out << " " << f.size() << "=" << f.nfiles << "+" << f.nsubdirs;
+  return out << ")";
+}
+
+
+/*
+ * nest_info_t
+ */
+
+void nest_info_t::encode(bufferlist &bl) const
+{  // Serialize into bl; decode() reads the fields back in this same order.
+  ENCODE_START(3, 2, bl);
+  encode(version, bl);
+  encode(rbytes, bl);
+  encode(rfiles, bl);
+  encode(rsubdirs, bl);
+  {
+    // removed field: a zero is still written in its slot, and decode() reads and drops it
+    int64_t ranchors = 0;
+    encode(ranchors, bl);
+  }
+  encode(rsnaps, bl);
+  encode(rctime, bl);
+  ENCODE_FINISH(bl);
+}
+
+void nest_info_t::decode(bufferlist::const_iterator &bl)
+{  // Inverse of encode(): reads the fields in the order encode() wrote them.
+  DECODE_START_LEGACY_COMPAT_LEN(3, 2, 2, bl);
+  decode(version, bl);
+  decode(rbytes, bl);
+  decode(rfiles, bl);
+  decode(rsubdirs, bl);
+  {
+    int64_t ranchors;  // slot of a removed field: read to advance the iterator, then discarded
+    decode(ranchors, bl);
+  }
+  decode(rsnaps, bl);
+  decode(rctime, bl);
+  DECODE_FINISH(bl);
+}
+
+void nest_info_t::dump(Formatter *f) const
+{  // Emit the fields through f; decode_json() looks up these same key names.
+  f->dump_unsigned("version", version);
+  f->dump_unsigned("rbytes", rbytes);
+  f->dump_unsigned("rfiles", rfiles);
+  f->dump_unsigned("rsubdirs", rsubdirs);
+  f->dump_unsigned("rsnaps", rsnaps);
+  f->dump_stream("rctime") << rctime;
+}
+
+void nest_info_t::decode_json(JSONObj *obj){
+  // Populate each field from obj, using the key names that dump() writes.
+  JSONDecoder::decode_json("version", version, obj, true);
+  JSONDecoder::decode_json("rbytes", rbytes, obj, true);
+  JSONDecoder::decode_json("rfiles", rfiles, obj, true);
+  JSONDecoder::decode_json("rsubdirs", rsubdirs, obj, true);
+  JSONDecoder::decode_json("rsnaps", rsnaps, obj, true);
+  JSONDecoder::decode_json("rctime", rctime, obj, true);
+}
+
+void nest_info_t::generate_test_instances(std::list<nest_info_t*>& ls)
+{
+  ls.push_back(new nest_info_t);  // first: all-default instance
+  nest_info_t *sample = ls.emplace_back(new nest_info_t);  // second: populated below
+  sample->version = 1;
+  sample->rbytes = 2;
+  sample->rfiles = 3;
+  sample->rsubdirs = 4;
+  sample->rsnaps = 6;
+  sample->rctime = utime_t(7, 8);
+}
+
+ostream& operator<<(ostream &out, const nest_info_t &n)
+{
+  // "n()" for a default-constructed value; otherwise "n(v<version>" followed
+  // by whichever of the rc / b / rs / size parts are non-zero, then ")".
+  if (n == nest_info_t()) {
+    out << "n()";
+    return out;
+  }
+  out << "n(v" << n.version;
+  if (n.rctime != utime_t()) out << " rc" << n.rctime;
+  if (n.rbytes) out << " b" << n.rbytes;
+  if (n.rsnaps) out << " rs" << n.rsnaps;
+  if (n.rfiles || n.rsubdirs)
+    out << " " << n.rsize() << "=" << n.rfiles << "+" << n.rsubdirs;
+  return out << ")";
+}
+
+/*
+ * quota_info_t
+ */
+void quota_info_t::dump(Formatter *f) const
+{  // Emit both limits through f; decode_json() looks up these same key names.
+  f->dump_int("max_bytes", max_bytes);
+  f->dump_int("max_files", max_files);
+}
+
+void  quota_info_t::decode_json(JSONObj *obj){
+  // Populate both limits from obj, using the key names that dump() writes.
+  JSONDecoder::decode_json("max_bytes", max_bytes, obj, true);
+  JSONDecoder::decode_json("max_files", max_files, obj, true);
+}
+
+void quota_info_t::generate_test_instances(std::list<quota_info_t *>& ls)
+{
+  ls.push_back(new quota_info_t);  // first: all-default instance
+  quota_info_t *limited = ls.emplace_back(new quota_info_t);  // second: both limits set
+  limited->max_bytes = 16;
+  limited->max_files = 16;
+}
+
+ostream& operator<<(ostream &out, const quota_info_t &n)
+{
+  // Output shape: quota(max_bytes = <N> max_files = <M>)
+  out << "quota(";
+  out << "max_bytes = " << n.max_bytes;
+  out << " max_files = " << n.max_files;
+  return out << ")";
+}
+
+/*
+ * client_writeable_range_t
+ */
+
+void client_writeable_range_t::encode(bufferlist &bl) const
+{  // Serialize into bl; decode() reads the fields back in this same order.
+  ENCODE_START(2, 2, bl);
+  encode(range.first, bl);
+  encode(range.last, bl);
+  encode(follows, bl);
+  ENCODE_FINISH(bl);
+}
+
+void client_writeable_range_t::decode(bufferlist::const_iterator& bl)
+{  // Inverse of encode(): range.first, range.last, then follows.
+  DECODE_START_LEGACY_COMPAT_LEN(2, 2, 2, bl);
+  decode(range.first, bl);
+  decode(range.last, bl);
+  decode(follows, bl);
+  DECODE_FINISH(bl);
+}
+
+void client_writeable_range_t::dump(Formatter *f) const
+{  // Emit the range as a nested "byte range" object, then follows beside it.
+  f->open_object_section("byte range");
+  f->dump_unsigned("first", range.first);  // byte_range_t::decode_json() reads "first"/"last"
+  f->dump_unsigned("last", range.last);
+  f->close_section();
+  f->dump_unsigned("follows", follows);
+}
+
+void client_writeable_range_t::byte_range_t::decode_json(JSONObj *obj){
+  // Populate first/last from obj's "first" and "last" keys.
+  JSONDecoder::decode_json("first", first, obj, true);
+  JSONDecoder::decode_json("last", last, obj, true);
+}
+
+void client_writeable_range_t::generate_test_instances(std::list<client_writeable_range_t*>& ls)
+{
+  ls.push_back(new client_writeable_range_t);  // first: all-default instance
+  client_writeable_range_t *sample = ls.emplace_back(new client_writeable_range_t);
+  sample->range.first = 123;
+  sample->range.last = 456;
+  sample->follows = 12;
+}
+
+ostream& operator<<(ostream& out, const client_writeable_range_t& r) {
+  out << r.range.first << '-' << r.range.last;  // "<first>-<last>"
+  return out << "@" << r.follows;               // then "@<follows>"
+}
+
+/*
+ * inline_data_t
+ */
+void inline_data_t::encode(bufferlist &bl) const
+{
+  using ceph::encode;
+  encode(version, bl);
+  if (!blp)
+    encode(bufferlist(), bl);  // no inline payload held: write an empty buffer
+  else
+    encode(*blp, bl);
+}
+void inline_data_t::decode(bufferlist::const_iterator &p) {
+  using ceph::decode;
+  decode(version, p);
+  uint32_t inline_len;  // byte length of the payload that follows
+  decode(inline_len, p);
+  if (inline_len == 0)
+    free_data();  // nothing encoded: drop any payload currently held
+  else {
+    ceph::buffer::list payload;
+    decode_nohead(inline_len, payload, p);
+    set_data(payload);
+  }
+}
+
+
+/*
+ * fnode_t
+ */
+void fnode_t::encode(bufferlist &bl) const
+{  // Serialize into bl; decode() reads the fields back in this same order.
+  ENCODE_START(4, 3, bl);
+  encode(version, bl);
+  encode(snap_purged_thru, bl);
+  encode(fragstat, bl);
+  encode(accounted_fragstat, bl);
+  encode(rstat, bl);
+  encode(accounted_rstat, bl);
+  encode(damage_flags, bl);  // decode() reads this only when struct_v >= 3
+  encode(recursive_scrub_version, bl);  // this and the next three: struct_v >= 4 in decode()
+  encode(recursive_scrub_stamp, bl);
+  encode(localized_scrub_version, bl);
+  encode(localized_scrub_stamp, bl);
+  ENCODE_FINISH(bl);
+}
+
+void fnode_t::decode(bufferlist::const_iterator &bl)
+{
+  // Declares v4 as the newest understood version, matching encode() and the struct_v >= 4 branch below.
+  DECODE_START_LEGACY_COMPAT_LEN(4, 2, 2, bl);
+  decode(version, bl);
+  decode(snap_purged_thru, bl);
+  decode(fragstat, bl);
+  decode(accounted_fragstat, bl);
+  decode(rstat, bl);
+  decode(accounted_rstat, bl);
+  if (struct_v >= 3)
+    decode(damage_flags, bl);
+  if (struct_v >= 4) {
+    decode(recursive_scrub_version, bl);
+    decode(recursive_scrub_stamp, bl);
+    decode(localized_scrub_version, bl);
+    decode(localized_scrub_stamp, bl);
+  }
+  DECODE_FINISH(bl);
+}
+
+void fnode_t::dump(Formatter *f) const
+{  // Emit version, snap_purged_thru and the four stat members; damage_flags and the scrub fields are not dumped.
+  f->dump_unsigned("version", version);
+  f->dump_unsigned("snap_purged_thru", snap_purged_thru);
+
+  f->open_object_section("fragstat");
+  fragstat.dump(f);
+  f->close_section();
+
+  f->open_object_section("accounted_fragstat");
+  accounted_fragstat.dump(f);
+  f->close_section();
+
+  f->open_object_section("rstat");
+  rstat.dump(f);
+  f->close_section();
+
+  f->open_object_section("accounted_rstat");
+  accounted_rstat.dump(f);
+  f->close_section();
+}
+void fnode_t::decode_json(JSONObj *obj){  // Populate the fields that dump() writes, using the same key names.
+  JSONDecoder::decode_json("version", version, obj, true);
+  uint64_t tmp;  // snap_purged_thru is decoded as a plain integer, then stored into .val
+  JSONDecoder::decode_json("snap_purged_thru", tmp, obj, true);
+  snap_purged_thru.val = tmp;
+  JSONDecoder::decode_json("fragstat", fragstat, obj, true);
+  JSONDecoder::decode_json("accounted_fragstat", accounted_fragstat, obj, true);
+  JSONDecoder::decode_json("rstat", rstat, obj, true);
+  JSONDecoder::decode_json("accounted_rstat", accounted_rstat, obj, true);
+}
+void fnode_t::generate_test_instances(std::list<fnode_t*>& ls)
+{
+  ls.push_back(new fnode_t);  // first: all-default instance
+  fnode_t *sample = ls.emplace_back(new fnode_t);  // second: populated below
+  sample->version = 1;
+  sample->snap_purged_thru = 2;
+  list<frag_info_t*> frag_samples;  // NOTE(review): these helper instances are copied from but never deleted here
+  frag_info_t::generate_test_instances(frag_samples);
+  sample->fragstat = *frag_samples.back();
+  sample->accounted_fragstat = *frag_samples.front();
+  list<nest_info_t*> nest_samples;  // NOTE(review): same as above, never deleted here
+  nest_info_t::generate_test_instances(nest_samples);
+  sample->rstat = *nest_samples.front();
+  sample->accounted_rstat = *nest_samples.back();
+}
+
+
+/*
+ * old_rstat_t
+ */
+void old_rstat_t::encode(bufferlist& bl) const
+{  // Serialize into bl; decode() reads the fields back in this same order.
+  ENCODE_START(2, 2, bl);
+  encode(first, bl);
+  encode(rstat, bl);
+  encode(accounted_rstat, bl);
+  ENCODE_FINISH(bl);
+}
+
+void old_rstat_t::decode(bufferlist::const_iterator& bl)
+{  // Inverse of encode(): first, rstat, then accounted_rstat.
+  DECODE_START_LEGACY_COMPAT_LEN(2, 2, 2, bl);
+  decode(first, bl);
+  decode(rstat, bl);
+  decode(accounted_rstat, bl);
+  DECODE_FINISH(bl);
+}
+
+void old_rstat_t::dump(Formatter *f) const
+{  // Emit first, then each nest_info_t member as a nested object.
+  f->dump_unsigned("snapid", first);  // the member `first` is emitted under the key "snapid"
+  f->open_object_section("rstat");
+  rstat.dump(f);
+  f->close_section();
+  f->open_object_section("accounted_rstat");
+  accounted_rstat.dump(f);
+  f->close_section();
+}
+
+void old_rstat_t::generate_test_instances(std::list<old_rstat_t*>& ls)
+{
+  ls.push_back(new old_rstat_t());  // first: all-default instance
+  old_rstat_t *sample = ls.emplace_back(new old_rstat_t());  // second: populated below
+  sample->first = 12;
+  list<nest_info_t*> nest_samples;  // NOTE(review): these helper instances are copied from but never deleted here
+  nest_info_t::generate_test_instances(nest_samples);
+  sample->rstat = *nest_samples.back();
+  sample->accounted_rstat = *nest_samples.front();
+}
+
+/*
+ * feature_bitset_t
+ */
+feature_bitset_t::feature_bitset_t(unsigned long value)
+{
+  // Zero leaves _vec empty; otherwise value is split into bits_per_block-wide blocks, low bits first.
+  if (!value)
+    return;
+  for (size_t shift = 0; shift < sizeof(value) * 8; shift += bits_per_block)
+    _vec.push_back((block_type)(value >> shift));
+}
+
+feature_bitset_t::feature_bitset_t(const vector<size_t>& array)
+{
+  // Build the bitset from a list of bit positions.  The list must be sorted
+  // ascending (duplicates allowed) because the storage is sized from back().
+  if (array.empty())
+    return;
+  const size_t highest = array.back();
+  _vec.resize((highest + bits_per_block) / bits_per_block, 0);
+
+  size_t last = 0;
+  for (auto& bit : array) {
+    // Checked before the write: an out-of-order list could otherwise index
+    // past the end of _vec before the ordering violation is noticed.
+    ceph_assert(bit >= last && bit <= highest);
+    last = bit;
+    _vec[bit / bits_per_block] |= (block_type)1 << (bit % bits_per_block);
+  }
+}
+
+feature_bitset_t& feature_bitset_t::operator-=(const feature_bitset_t& other)
+{
+  // Clear every bit that is set in other; blocks beyond other's length are left untouched.
+  const size_t overlap = other._vec.size() < _vec.size() ? other._vec.size() : _vec.size();
+  for (size_t idx = 0; idx < overlap; ++idx)
+    _vec[idx] &= ~other._vec[idx];
+
+  return *this;
+}
+
+void feature_bitset_t::encode(bufferlist& bl) const {
+  using ceph::encode;
+  using ceph::encode_nohead;
+  uint32_t len = _vec.size() * sizeof(block_type);  // the prefix is a byte count, not a block count
+  encode(len, bl);
+  encode_nohead(_vec, bl);  // blocks follow the byte-count prefix written above
+}
+
+void feature_bitset_t::decode(bufferlist::const_iterator &p) {
+  using ceph::decode;
+  using ceph::decode_nohead;
+  uint32_t len;  // payload length in bytes, as written by encode()
+  decode(len, p);
+
+  _vec.clear();
+  if (len >= sizeof(block_type))
+    decode_nohead(len / sizeof(block_type), _vec, p);  // the whole blocks
+
+  if (len % sizeof(block_type)) {
+    ceph_le64 buf{};  // zero-initialized, so bytes not covered by the partial copy stay 0
+    p.copy(len % sizeof(block_type), (char*)&buf);
+    _vec.push_back((block_type)buf);
+  }
+}
+
+void feature_bitset_t::dump(Formatter *f) const {
+  CachedStackStringStream css;
+  print(*css);  // same text form that print() produces
+  f->dump_string("feature_bits", css->strv());
+}
+
+void feature_bitset_t::print(ostream& out) const {  // writes "0x" then every block in hex, highest block first
+  const std::ios_base::fmtflags saved_flags(out.flags());
+  const char saved_fill = out.fill('0');  // the fill character is sticky and is not part of flags()
+  out << "0x" << std::hex;
+  for (int i = _vec.size() - 1; i >= 0; --i)
+    out << std::setw(sizeof(block_type) * 2) << _vec[i];  // zero-padded to the full block width
+  out.flags(saved_flags);
+  out.fill(saved_fill);  // hand the stream back with the caller's fill character
+}
+
+/*
+ * metric_spec_t
+ */
+void metric_spec_t::encode(bufferlist& bl) const {
+  using ceph::encode;
+  ENCODE_START(1, 1, bl);
+  encode(metric_flags, bl);  // the only field carried in this encoding
+  ENCODE_FINISH(bl);
+}
+
+void metric_spec_t::decode(bufferlist::const_iterator &p) {
+  using ceph::decode;
+  DECODE_START(1, p);
+  decode(metric_flags, p);  // inverse of encode(): the single metric_flags field
+  DECODE_FINISH(p);
+}
+
+void metric_spec_t::dump(Formatter *f) const {
+  f->dump_object("metric_flags", metric_flags);  // emitted as a nested object under "metric_flags"
+}
+
+void metric_spec_t::print(ostream& out) const {
+  out << "{metric_flags: '";    // output shape: {metric_flags: '<flags>'}
+  out << metric_flags << "'}";
+}
+
+/*
+ * client_metadata_t
+ */
+void client_metadata_t::encode(bufferlist& bl) const
+{  // Serialize into bl; decode() reads the fields back in this same order.
+  ENCODE_START(3, 1, bl);
+  encode(kv_map, bl);
+  encode(features, bl);     // decode() reads this only when struct_v >= 2
+  encode(metric_spec, bl);  // decode() reads this only when struct_v >= 3
+  ENCODE_FINISH(bl);
+}
+
+void client_metadata_t::decode(bufferlist::const_iterator& p)
+{  // Inverse of encode(); the later fields are read only when struct_v says they are present.
+  DECODE_START(3, p);
+  decode(kv_map, p);
+  if (struct_v >= 2)
+    decode(features, p);
+  if (struct_v >= 3) {
+    decode(metric_spec, p);
+  }
+  DECODE_FINISH(p);
+}
+
+void client_metadata_t::dump(Formatter *f) const
+{  // Emit features and metric_spec as objects, then every kv_map entry as a string field.
+  f->dump_object("client_features", features);
+  f->dump_object("metric_spec", metric_spec);
+  for (const auto& [name, val] : kv_map)
+    f->dump_string(name.c_str(), val);  // each map key becomes a field name at this same level
+}
+
+/*
+ * session_info_t
+ */
+void session_info_t::encode(bufferlist& bl, uint64_t features) const
+{  // Serialize into bl; `features` is forwarded only to the encoding of inst.
+  ENCODE_START(7, 7, bl);
+  encode(inst, bl, features);
+  encode(completed_requests, bl);
+  encode(prealloc_inos, bl);   // hacky, see below.
+  encode((__u32)0, bl); // used_inos: a zero is written in the slot decode() reads into an interval_set
+  encode(completed_flushes, bl);
+  encode(auth_name, bl);
+  encode(client_metadata, bl);
+  ENCODE_FINISH(bl);
+}
+
+void session_info_t::decode(bufferlist::const_iterator& p)
+{  // Decode a session_info_t, accepting the older layouts selected by struct_v below.
+  DECODE_START_LEGACY_COMPAT_LEN(7, 2, 2, p);
+  decode(inst, p);
+  if (struct_v <= 2) {
+    set<ceph_tid_t> s;  // v<=2 carried a set of tids; each is mapped to a default inodeno_t
+    decode(s, p);
+    while (!s.empty()) {
+      completed_requests[*s.begin()] = inodeno_t();
+      s.erase(s.begin());
+    }
+  } else {
+    decode(completed_requests, p);
+  }
+  decode(prealloc_inos, p);
+  {
+    interval_set<inodeno_t> used_inos;
+    decode(used_inos, p);
+    prealloc_inos.insert(used_inos);  // any encoded used_inos are merged into prealloc_inos
+  }
+  if (struct_v >= 4 && struct_v < 7) {
+    decode(client_metadata.kv_map, p);  // v4-v6 carried only the kv_map at this position
+  }
+  if (struct_v >= 5) {
+    decode(completed_flushes, p);
+  }
+  if (struct_v >= 6) {
+    decode(auth_name, p);
+  }
+  if (struct_v >= 7) {
+    decode(client_metadata, p);  // v7 onwards: the full client_metadata_t comes last
+  }
+  DECODE_FINISH(p);
+}
+
+void session_info_t::dump(Formatter *f) const
+{  // Emit inst, completed_requests, prealloc_inos and client_metadata; completed_flushes and auth_name are not dumped.
+  f->dump_stream("inst") << inst;
+
+  f->open_array_section("completed_requests");
+  for (const auto& [tid, ino] : completed_requests) {
+    f->open_object_section("request");
+    f->dump_unsigned("tid", tid);
+    f->dump_stream("created_ino") << ino;
+    f->close_section();
+  }
+  f->close_section();
+
+  f->open_array_section("prealloc_inos");
+  for (const auto& [start, len] : prealloc_inos) {
+    f->open_object_section("ino_range");
+    f->dump_stream("start") << start;
+    f->dump_unsigned("length", len);
+    f->close_section();
+  }
+  f->close_section();
+
+  f->dump_object("client_metadata", client_metadata);
+}
+
+void session_info_t::generate_test_instances(std::list<session_info_t*>& ls)
+{
+  ls.push_back(new session_info_t);  // first: all-default instance
+  session_info_t *sample = ls.emplace_back(new session_info_t);  // second: populated below
+  sample->inst = entity_inst_t(entity_name_t::MDS(12), entity_addr_t());
+  sample->completed_requests.insert(make_pair(234, inodeno_t(111222)));
+  sample->completed_requests.insert(make_pair(237, inodeno_t(222333)));
+  sample->prealloc_inos.insert(333, 12);
+  sample->prealloc_inos.insert(377, 112);
+  // used inos are deliberately not added: they are cleared on decode
+}
+
+
+/*
+ * string_snap_t
+ */
+void string_snap_t::encode(bufferlist& bl) const
+{  // Serialize into bl: name, then snapid (decode() reads them in this order).
+  ENCODE_START(2, 2, bl);
+  encode(name, bl);
+  encode(snapid, bl);
+  ENCODE_FINISH(bl);
+}
+
+void string_snap_t::decode(bufferlist::const_iterator& bl)
+{  // Inverse of encode(): name, then snapid.
+  DECODE_START_LEGACY_COMPAT_LEN(2, 2, 2, bl);
+  decode(name, bl);
+  decode(snapid, bl);
+  DECODE_FINISH(bl);
+}
+
+void string_snap_t::dump(Formatter *f) const
+{  // Emit both members through f under the keys "name" and "snapid".
+  f->dump_string("name", name);
+  f->dump_unsigned("snapid", snapid);
+}
+
+void string_snap_t::generate_test_instances(std::list<string_snap_t*>& ls)
+{
+  ls.push_back(new string_snap_t);  // first: all-default instance
+  string_snap_t *foo = ls.emplace_back(new string_snap_t);
+  foo->name = "foo";
+  foo->snapid = 123;
+  string_snap_t *bar = ls.emplace_back(new string_snap_t);
+  bar->name = "bar";
+  bar->snapid = 456;
+}
+
+
+/*
+ * MDSCacheObjectInfo
+ */
+void MDSCacheObjectInfo::encode(bufferlist& bl) const
+{  // Serialize into bl; decode() reads the fields back in this same order.
+  ENCODE_START(2, 2, bl);
+  encode(ino, bl);
+  encode(dirfrag, bl);
+  encode(dname, bl);
+  encode(snapid, bl);
+  ENCODE_FINISH(bl);
+}
+
+void MDSCacheObjectInfo::decode(bufferlist::const_iterator& p)
+{  // Inverse of encode(): ino, dirfrag, dname, then snapid.
+  DECODE_START_LEGACY_COMPAT_LEN(2, 2, 2, p);
+  decode(ino, p);
+  decode(dirfrag, p);
+  decode(dname, p);
+  decode(snapid, p);
+  DECODE_FINISH(p);
+}
+
+void MDSCacheObjectInfo::dump(Formatter *f) const
+{  // Emit all four members through f.
+  f->dump_unsigned("ino", ino);
+  f->dump_stream("dirfrag") << dirfrag;
+  f->dump_string("name", dname);  // the member dname is emitted under the key "name"
+  f->dump_unsigned("snapid", snapid);
+}
+
+void MDSCacheObjectInfo::generate_test_instances(std::list<MDSCacheObjectInfo*>& ls)
+{
+  ls.push_back(new MDSCacheObjectInfo);  // first: all-default instance
+  MDSCacheObjectInfo *head = ls.emplace_back(new MDSCacheObjectInfo);
+  head->ino = 1;
+  head->dirfrag = dirfrag_t(2, 3);
+  head->dname = "fooname";
+  head->snapid = CEPH_NOSNAP;
+  MDSCacheObjectInfo *snapped = ls.emplace_back(new MDSCacheObjectInfo);
+  snapped->ino = 121;
+  snapped->dirfrag = dirfrag_t(222, 0);
+  snapped->dname = "bar foo";
+  snapped->snapid = 21322;
+}
+
+/*
+ * mds_table_pending_t
+ */
+void mds_table_pending_t::encode(bufferlist& bl) const
+{  // Serialize into bl: reqid, mds, then tid (decode() reads them in this order).
+  ENCODE_START(2, 2, bl);
+  encode(reqid, bl);
+  encode(mds, bl);
+  encode(tid, bl);
+  ENCODE_FINISH(bl);
+}
+
+void mds_table_pending_t::decode(bufferlist::const_iterator& bl)
+{  // Inverse of encode(): reqid, mds, then tid.
+  DECODE_START_LEGACY_COMPAT_LEN(2, 2, 2, bl);
+  decode(reqid, bl);
+  decode(mds, bl);
+  decode(tid, bl);
+  DECODE_FINISH(bl);
+}
+
+void mds_table_pending_t::dump(Formatter *f) const
+{  // Emit all three members through f as unsigned values.
+  f->dump_unsigned("reqid", reqid);
+  f->dump_unsigned("mds", mds);
+  f->dump_unsigned("tid", tid);
+}
+
+void mds_table_pending_t::generate_test_instances(std::list<mds_table_pending_t*>& ls)
+{
+  ls.push_back(new mds_table_pending_t);  // first: all-default instance
+  mds_table_pending_t *sample = ls.emplace_back(new mds_table_pending_t);
+  sample->reqid = 234;
+  sample->mds = 2;
+  sample->tid = 35434;
+}
+
+
+/*
+ * inode_load_vec_t
+ */
+void inode_load_vec_t::encode(bufferlist &bl) const
+{  // Serialize every element of vec, in order; no element count is written here.
+  ENCODE_START(2, 2, bl);
+  for (const auto &i : vec) {
+    encode(i, bl);
+  }
+  ENCODE_FINISH(bl);
+}
+
+void inode_load_vec_t::decode(bufferlist::const_iterator &p)
+{  // Inverse of encode(): decodes into the elements vec already holds, in order.
+  DECODE_START_LEGACY_COMPAT_LEN(2, 2, 2, p);
+  for (auto &i : vec) {
+    decode(i, p);
+  }
+  DECODE_FINISH(p);
+}
+
+void inode_load_vec_t::dump(Formatter *f) const
+{  // Emit an array with one "Decay Counter" object per element of vec.
+  f->open_array_section("Decay Counters");
+  for (const auto &i : vec) {
+    f->open_object_section("Decay Counter");
+    i.dump(f);
+    f->close_section();
+  }
+  f->close_section();
+}
+
+void inode_load_vec_t::generate_test_instances(std::list<inode_load_vec_t*>& ls)
+{  // Appends a single instance, built from a default-constructed DecayRate.
+  ls.push_back(new inode_load_vec_t(DecayRate()));
+}
+
+
+/*
+ * dirfrag_load_vec_t
+ */
+void dirfrag_load_vec_t::dump(Formatter *f) const
+{  // Emit an array with one "Decay Counter" object per element of vec.
+  f->open_array_section("Decay Counters");
+  for (const auto &i : vec) {
+    f->open_object_section("Decay Counter");
+    i.dump(f);
+    f->close_section();
+  }
+  f->close_section();
+}
+
+void dirfrag_load_vec_t::dump(Formatter *f, const DecayRate& rate) const
+{  // Emit meta_load() plus one float per META_POP_* counter; `rate` is not referenced in this body.
+  f->dump_float("meta_load", meta_load());
+  f->dump_float("IRD", get(META_POP_IRD).get());
+  f->dump_float("IWR", get(META_POP_IWR).get());
+  f->dump_float("READDIR", get(META_POP_READDIR).get());
+  f->dump_float("FETCH", get(META_POP_FETCH).get());
+  f->dump_float("STORE", get(META_POP_STORE).get());
+}
+
+void dirfrag_load_vec_t::generate_test_instances(std::list<dirfrag_load_vec_t*>& ls)
+{  // Appends a single instance, built from a default-constructed DecayRate.
+  ls.push_back(new dirfrag_load_vec_t(DecayRate()));
+}
+
+/*
+ * mds_load_t
+ */
+void mds_load_t::encode(bufferlist &bl) const {  // decode() reads the fields back in this same order
+  ENCODE_START(2, 2, bl);
+  encode(auth, bl);
+  encode(all, bl);
+  encode(req_rate, bl);
+  encode(cache_hit_rate, bl);
+  encode(queue_len, bl);
+  encode(cpu_load_avg, bl);
+  ENCODE_FINISH(bl);
+}
+
+void mds_load_t::decode(bufferlist::const_iterator &bl) {  // inverse of encode(), same field order
+  DECODE_START_LEGACY_COMPAT_LEN(2, 2, 2, bl);
+  decode(auth, bl);
+  decode(all, bl);
+  decode(req_rate, bl);
+  decode(cache_hit_rate, bl);
+  decode(queue_len, bl);
+  decode(cpu_load_avg, bl);
+  DECODE_FINISH(bl);
+}
+
+void mds_load_t::dump(Formatter *f) const
+{  // Emit the four scalar values, then the auth and all load vectors as nested objects.
+  f->dump_float("request rate", req_rate);
+  f->dump_float("cache hit rate", cache_hit_rate);
+  f->dump_float("queue length", queue_len);
+  f->dump_float("cpu load", cpu_load_avg);
+  f->open_object_section("auth dirfrag");
+  auth.dump(f);
+  f->close_section();
+  f->open_object_section("all dirfrags");
+  all.dump(f);
+  f->close_section();
+}
+
+void mds_load_t::generate_test_instances(std::list<mds_load_t*>& ls)
+{  // Appends a single instance, built from a default-constructed DecayRate.
+  ls.push_back(new mds_load_t(DecayRate()));
+}
+
+/*
+ * cap_reconnect_t
+ */
+void cap_reconnect_t::encode(bufferlist& bl) const {  // versioned form: encode_old() payload, then snap_follows
+  ENCODE_START(2, 1, bl);
+  encode_old(bl); // extract out when something changes
+  encode(snap_follows, bl);  // decode() reads this only when struct_v >= 2
+  ENCODE_FINISH(bl);
+}
+
+void cap_reconnect_t::encode_old(bufferlist& bl) const {  // unversioned form: path, capinfo, raw flock bytes
+  using ceph::encode;
+  encode(path, bl);
+  capinfo.flock_len = flockbl.length();  // refreshed here, inside a const method, before capinfo is written
+  encode(capinfo, bl);
+  ceph::encode_nohead(flockbl, bl);  // no length prefix: decode_old() takes the length from capinfo.flock_len
+}
+
+void cap_reconnect_t::decode(bufferlist::const_iterator& bl) {  // inverse of encode()
+  DECODE_START(2, bl);
+  decode_old(bl); // extract out when something changes
+  if (struct_v >= 2)
+    decode(snap_follows, bl);
+  DECODE_FINISH(bl);
+}
+
+void cap_reconnect_t::decode_old(bufferlist::const_iterator& bl) {  // inverse of encode_old()
+  using ceph::decode;
+  decode(path, bl);
+  decode(capinfo, bl);
+  ceph::decode_nohead(capinfo.flock_len, flockbl, bl);  // flock byte count comes from the capinfo just decoded
+}
+
+void cap_reconnect_t::dump(Formatter *f) const
+{  // Emit path and selected capinfo fields; snap_follows and the flock bytes are not dumped.
+  f->dump_string("path", path);
+  f->dump_int("cap_id", capinfo.cap_id);
+  f->dump_string("cap wanted", ccap_string(capinfo.wanted));
+  f->dump_string("cap issued", ccap_string(capinfo.issued));
+  f->dump_int("snaprealm", capinfo.snaprealm);
+  f->dump_int("path base ino", capinfo.pathbase);
+  f->dump_string("has file locks", capinfo.flock_len ? "true" : "false");  // a string, not a bool field
+}
+
+void cap_reconnect_t::generate_test_instances(std::list<cap_reconnect_t*>& ls)
+{
+  cap_reconnect_t *sample = ls.emplace_back(new cap_reconnect_t);  // the only instance produced
+  sample->path = "/test/path";
+  sample->capinfo.cap_id = 1;
+}
+
+/*
+ * snaprealm_reconnect_t
+ */
+void snaprealm_reconnect_t::encode(bufferlist& bl) const {  // versioned wrapper around encode_old()
+  ENCODE_START(1, 1, bl);
+  encode_old(bl); // extract out when something changes
+  ENCODE_FINISH(bl);
+}
+
+void snaprealm_reconnect_t::encode_old(bufferlist& bl) const {  // unversioned form: just realm
+  using ceph::encode;
+  encode(realm, bl);
+}
+
+void snaprealm_reconnect_t::decode(bufferlist::const_iterator& bl) {  // inverse of encode()
+  DECODE_START(1, bl);
+  decode_old(bl); // extract out when something changes
+  DECODE_FINISH(bl);
+}
+
+void snaprealm_reconnect_t::decode_old(bufferlist::const_iterator& bl) {  // inverse of encode_old()
+  using ceph::decode;
+  decode(realm, bl);
+}
+
+void snaprealm_reconnect_t::dump(Formatter *f) const
+{  // Emit the three realm fields through f as signed integers.
+  f->dump_int("ino", realm.ino);
+  f->dump_int("seq", realm.seq);
+  f->dump_int("parent", realm.parent);
+}
+
+void snaprealm_reconnect_t::generate_test_instances(std::list<snaprealm_reconnect_t*>& ls)
+{
+  snaprealm_reconnect_t *sample = ls.emplace_back(new snaprealm_reconnect_t);  // the only instance produced
+  sample->realm.ino = 0x10000000001ULL;
+  sample->realm.seq = 2;
+  sample->realm.parent = 1;
+}
diff --git a/src/mds/mdstypes.h b/src/mds/mdstypes.h
new file mode 100644
index 000000000..bfb279108
--- /dev/null
+++ b/src/mds/mdstypes.h
@@ -0,0 +1,1938 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+// vim: ts=8 sw=2 smarttab
+#ifndef CEPH_MDSTYPES_H
+#define CEPH_MDSTYPES_H
+
+#include "include/int_types.h"
+
+#include <ostream>
+#include <set>
+#include <map>
+#include <string_view>
+
+#include "common/config.h"
+#include "common/Clock.h"
+#include "common/DecayCounter.h"
+#include "common/StackStringStream.h"
+#include "common/entity_name.h"
+
+#include "include/compat.h"
+#include "include/Context.h"
+#include "include/frag.h"
+#include "include/xlist.h"
+#include "include/interval_set.h"
+#include "include/compact_set.h"
+#include "include/fs_types.h"
+#include "include/ceph_fs.h"
+
+#include "inode_backtrace.h"
+
+#include <boost/spirit/include/qi.hpp>
+#include <boost/pool/pool.hpp>
+#include "include/ceph_assert.h"
+#include <boost/serialization/strong_typedef.hpp>
+#include "common/ceph_json.h"
+
+#define CEPH_FS_ONDISK_MAGIC "ceph fs volume v011"
+
+#define MDS_PORT_CACHE   0x200
+#define MDS_PORT_LOCKER  0x300
+#define MDS_PORT_MIGRATOR 0x400
+
+#define MAX_MDS                   0x100
+#define NUM_STRAY                 10
+
+// For inode numbers 1, 2 and 4, see CEPH_INO_* in include/ceph_fs.h
+
+#define MDS_INO_MDSDIR_OFFSET     (1*MAX_MDS)
+#define MDS_INO_STRAY_OFFSET      (6*MAX_MDS)
+
+// Locations for journal data
+#define MDS_INO_LOG_OFFSET        (2*MAX_MDS)
+#define MDS_INO_LOG_BACKUP_OFFSET (3*MAX_MDS)
+#define MDS_INO_LOG_POINTER_OFFSET    (4*MAX_MDS)
+#define MDS_INO_PURGE_QUEUE       (5*MAX_MDS)
+
+#define MDS_INO_SYSTEM_BASE       ((6*MAX_MDS) + (MAX_MDS * NUM_STRAY))
+
+#define MDS_INO_STRAY(x,i)  (MDS_INO_STRAY_OFFSET+((((unsigned)(x))*NUM_STRAY)+((unsigned)(i))))
+#define MDS_INO_MDSDIR(x) (MDS_INO_MDSDIR_OFFSET+((unsigned)(x)))  // argument parenthesized so the cast covers the whole expression, as in MDS_INO_STRAY
+
+#define MDS_INO_IS_STRAY(i)  ((i) >= MDS_INO_STRAY_OFFSET  && (i) < (MDS_INO_STRAY_OFFSET+(MAX_MDS*NUM_STRAY)))  // half-open range test
+#define MDS_INO_IS_MDSDIR(i) ((i) >= MDS_INO_MDSDIR_OFFSET && (i) < (MDS_INO_MDSDIR_OFFSET+MAX_MDS))  // half-open range test
+#define MDS_INO_MDSDIR_OWNER(i) (signed ((unsigned (i)) - MDS_INO_MDSDIR_OFFSET))  // inverse of MDS_INO_MDSDIR(x)
+#define MDS_INO_IS_BASE(i)   ((i) == CEPH_INO_ROOT || (i) == CEPH_INO_GLOBAL_SNAPREALM || MDS_INO_IS_MDSDIR(i))
+#define MDS_INO_STRAY_OWNER(i) (signed (((unsigned (i)) - MDS_INO_STRAY_OFFSET) / NUM_STRAY))  // recovers x of MDS_INO_STRAY(x,i)
+#define MDS_INO_STRAY_INDEX(i) (((unsigned (i)) - MDS_INO_STRAY_OFFSET) % NUM_STRAY)  // recovers i of MDS_INO_STRAY(x,i)
+
+#define MDS_IS_PRIVATE_INO(i) ((i) < MDS_INO_SYSTEM_BASE && (i) >= MDS_INO_MDSDIR_OFFSET)  // [MDS_INO_MDSDIR_OFFSET, MDS_INO_SYSTEM_BASE)
+
+typedef int32_t mds_rank_t;  // signed: the special values below are all negative
+constexpr mds_rank_t MDS_RANK_NONE		= -1;  // tested by mds_role_t::is_none()
+constexpr mds_rank_t MDS_RANK_EPHEMERAL_DIST	= -2;
+constexpr mds_rank_t MDS_RANK_EPHEMERAL_RAND	= -3;
+
+BOOST_STRONG_TYPEDEF(uint64_t, mds_gid_t)
+extern const mds_gid_t MDS_GID_NONE;
+
+typedef int32_t fs_cluster_id_t;
+constexpr fs_cluster_id_t FS_CLUSTER_ID_NONE = -1;
+
+// The namespace ID of the anonymous default filesystem from legacy systems
+constexpr fs_cluster_id_t FS_CLUSTER_ID_ANONYMOUS = 0;
+
+class mds_role_t {
+public:
+  // A role is a (filesystem cluster id, rank) pair; both default to their *_NONE value.
+  mds_role_t() {}
+  mds_role_t(fs_cluster_id_t fscid_, mds_rank_t rank_)
+    : fscid(fscid_), rank(rank_)
+  {}
+
+  // Order by fscid first, then by rank.
+  bool operator<(mds_role_t const &rhs) const {
+    if (fscid != rhs.fscid)
+      return fscid < rhs.fscid;
+    return rank < rhs.rank;
+  }
+
+  // True when no rank is assigned; fscid is not consulted.
+  bool is_none() const {
+    return rank == MDS_RANK_NONE;
+  }
+
+  // Data members are public and carry in-class defaults.
+  fs_cluster_id_t fscid = FS_CLUSTER_ID_NONE;
+  mds_rank_t rank = MDS_RANK_NONE;
+};
+inline std::ostream& operator<<(std::ostream& out, const mds_role_t& role) {
+  return out << role.fscid << ":" << role.rank;  // rendered as "<fscid>:<rank>"
+}
+
+// CAPS
+inline std::string gcap_string(int cap)
+{
+  // Build the letter string for the generic cap bits set in cap.  Letters are
+  // appended in table order, so the output order is fixed: s x c r w b a l.
+  const struct { int bit; char letter; } table[] = {
+    {CEPH_CAP_GSHARED, 's'}, {CEPH_CAP_GEXCL, 'x'}, {CEPH_CAP_GCACHE, 'c'}, {CEPH_CAP_GRD, 'r'},
+    {CEPH_CAP_GWR, 'w'}, {CEPH_CAP_GBUFFER, 'b'}, {CEPH_CAP_GWREXTEND, 'a'}, {CEPH_CAP_GLAZYIO, 'l'},
+  };
+  std::string s;
+  for (const auto& entry : table)
+    if (cap & entry.bit) s += entry.letter;
+  return s;
+}
+inline std::string ccap_string(int cap)
+{
+  // Render a full cap mask: optional "p" for the pin bit, then one group per
+  // shift (SAUTH -> A, SLINK -> L, SXATTR -> X, SFILE -> F) made of the tag
+  // letter plus gcap_string() of that group's bits.  Groups with no bits set
+  // are omitted; an empty result becomes "-".
+  std::string s;
+  auto append_group = [&s](char tag, int bits) {
+    if (bits) {
+      s += tag;
+      s += gcap_string(bits);
+    }
+  };
+
+  if (cap & CEPH_CAP_PIN) s += "p";
+  append_group('A', (cap >> CEPH_CAP_SAUTH) & 3);
+  append_group('L', (cap >> CEPH_CAP_SLINK) & 3);
+  append_group('X', (cap >> CEPH_CAP_SXATTR) & 3);
+  append_group('F', cap >> CEPH_CAP_SFILE);  // not masked: takes every bit above the shift
+  return s.empty() ? std::string("-") : s;
+}
+
+struct scatter_info_t {  // common base of frag_info_t and nest_info_t
+  version_t version = 0;
+};
+
+struct frag_info_t : public scatter_info_t {
+  int64_t size() const { return nfiles + nsubdirs; }  // total entries: files plus subdirs
+
+  void zero() {
+    *this = frag_info_t();  // resets every field, including the inherited version
+  }
+
+  // *this += cur - acc;
+  void add_delta(const frag_info_t &cur, const frag_info_t &acc, bool *touched_mtime=0, bool *touched_chattr=0) {
+    if (cur.mtime > mtime) {  // mtime only ever moves forward
+      mtime = cur.mtime;
+      if (touched_mtime)
+	*touched_mtime = true;
+    }
+    if (cur.change_attr > change_attr) {  // change_attr only ever moves forward
+      change_attr = cur.change_attr;
+      if (touched_chattr)
+	*touched_chattr = true;
+    }
+    nfiles += cur.nfiles - acc.nfiles;
+    nsubdirs += cur.nsubdirs - acc.nsubdirs;
+  }
+
+  void add(const frag_info_t& other) {  // keep the max of mtime/change_attr, sum the counts
+    if (other.mtime > mtime)
+      mtime = other.mtime;
+    if (other.change_attr > change_attr)
+      change_attr = other.change_attr;
+    nfiles += other.nfiles;
+    nsubdirs += other.nsubdirs;
+  }
+
+  bool same_sums(const frag_info_t &o) const {
+    return mtime <= o.mtime &&  // note: <= rather than ==; version and change_attr are not compared
+	nfiles == o.nfiles &&
+	nsubdirs == o.nsubdirs;
+  }
+
+  void encode(ceph::buffer::list &bl) const;
+  void decode(ceph::buffer::list::const_iterator& bl);
+  void dump(ceph::Formatter *f) const;
+  void decode_json(JSONObj *obj);
+  static void generate_test_instances(std::list<frag_info_t*>& ls);
+
+  // this frag
+  utime_t mtime;
+  uint64_t change_attr = 0;
+  int64_t nfiles = 0;        // files
+  int64_t nsubdirs = 0;      // subdirs
+};
+WRITE_CLASS_ENCODER(frag_info_t)
+
+inline bool operator==(const frag_info_t &l, const frag_info_t &r) {
+  return memcmp(&l, &r, sizeof(l)) == 0;  // raw byte comparison of the whole object
+}
+inline bool operator!=(const frag_info_t &l, const frag_info_t &r) {
+  return !(l == r);  // defined as the negation of operator==
+}
+
+std::ostream& operator<<(std::ostream &out, const frag_info_t &f);
+
+
+struct nest_info_t : public scatter_info_t {
+  int64_t rsize() const { return rfiles + rsubdirs; }  // total entries: rfiles plus rsubdirs
+
+  void zero() {
+    *this = nest_info_t();  // resets every field, including the inherited version
+  }
+
+  void sub(const nest_info_t &other) {
+    add(other, -1);  // subtracts the counters; rctime is still only raised, never lowered
+  }
+  void add(const nest_info_t &other, int fac=1) {  // fac scales other's counters (sub() passes -1)
+    if (other.rctime > rctime)
+      rctime = other.rctime;
+    rbytes += fac*other.rbytes;
+    rfiles += fac*other.rfiles;
+    rsubdirs += fac*other.rsubdirs;
+    rsnaps += fac*other.rsnaps;
+  }
+
+  // *this += cur - acc;
+  void add_delta(const nest_info_t &cur, const nest_info_t &acc) {
+    if (cur.rctime > rctime)  // rctime only ever moves forward
+      rctime = cur.rctime;
+    rbytes += cur.rbytes - acc.rbytes;
+    rfiles += cur.rfiles - acc.rfiles;
+    rsubdirs += cur.rsubdirs - acc.rsubdirs;
+    rsnaps += cur.rsnaps - acc.rsnaps;
+  }
+
+  bool same_sums(const nest_info_t &o) const {
+    return rctime <= o.rctime &&  // note: <= rather than ==; version is not compared
+        rbytes == o.rbytes &&
+        rfiles == o.rfiles &&
+        rsubdirs == o.rsubdirs &&
+        rsnaps == o.rsnaps;
+  }
+
+  void encode(ceph::buffer::list &bl) const;
+  void decode(ceph::buffer::list::const_iterator& bl);
+  void dump(ceph::Formatter *f) const;
+  void decode_json(JSONObj *obj);
+  static void generate_test_instances(std::list<nest_info_t*>& ls);
+
+  // this frag + children
+  utime_t rctime;
+  int64_t rbytes = 0;
+  int64_t rfiles = 0;
+  int64_t rsubdirs = 0;
+  int64_t rsnaps = 0;
+};
+WRITE_CLASS_ENCODER(nest_info_t)
+
+// Two nest_info_t objects are equal when their object representations are
+// byte-for-byte identical.
+inline bool operator==(const nest_info_t &l, const nest_info_t &r) {
+  const int byte_diff = memcmp(&l, &r, sizeof(nest_info_t));
+  return byte_diff == 0;
+}
+// Inequality is the exact negation of operator== for nest_info_t.
+inline bool operator!=(const nest_info_t &l, const nest_info_t &r) {
+  if (l == r) {
+    return false;
+  }
+  return true;
+}
+
+std::ostream& operator<<(std::ostream &out, const nest_info_t &n);
+
+// An inode number paired with a snapshot id.
+struct vinodeno_t {
+  vinodeno_t() {}
+  vinodeno_t(inodeno_t i, snapid_t s) : ino(i), snapid(s) {}
+
+  // Serialize ino followed by snapid; no version header is written.
+  void encode(ceph::buffer::list& bl) const {
+    using ceph::encode;
+    encode(ino, bl);
+    encode(snapid, bl);
+  }
+  // Read the fields back in the same order encode() wrote them.
+  void decode(ceph::buffer::list::const_iterator& p) {
+    using ceph::decode;
+    decode(ino, p);
+    decode(snapid, p);
+  }
+
+  inodeno_t ino;
+  snapid_t snapid;
+};
+WRITE_CLASS_ENCODER(vinodeno_t)
+
+// Equal when both the inode number and the snapshot id match.
+inline bool operator==(const vinodeno_t &l, const vinodeno_t &r) {
+  if (!(l.ino == r.ino)) {
+    return false;
+  }
+  return l.snapid == r.snapid;
+}
+// Inequality is the exact negation of operator== for vinodeno_t.
+inline bool operator!=(const vinodeno_t &l, const vinodeno_t &r) {
+  if (l == r) {
+    return false;
+  }
+  return true;
+}
+// Lexicographic ordering: by inode number first, then by snapshot id.
+inline bool operator<(const vinodeno_t &l, const vinodeno_t &r) {
+  if (l.ino < r.ino) {
+    return true;
+  }
+  return l.ino == r.ino && l.snapid < r.snapid;
+}
+
+// Quota limits: a byte limit and a file-count limit, both defaulting to 0.
+struct quota_info_t
+{
+  // Versioned encoding (v1, compat 1): max_bytes then max_files.
+  void encode(ceph::buffer::list& bl) const {
+    ENCODE_START(1, 1, bl);
+    encode(max_bytes, bl);
+    encode(max_files, bl);
+    ENCODE_FINISH(bl);
+  }
+  // Mirror of encode(): reads max_bytes then max_files.
+  void decode(ceph::buffer::list::const_iterator& p) {
+    DECODE_START_LEGACY_COMPAT_LEN(1, 1, 1, p);
+    decode(max_bytes, p);
+    decode(max_files, p);
+    DECODE_FINISH(p);
+  }
+
+  void dump(ceph::Formatter *f) const;
+  static void generate_test_instances(std::list<quota_info_t *>& ls);
+
+  // Valid when neither limit is negative.
+  bool is_valid() const {
+    const bool bytes_ok = max_bytes >= 0;
+    const bool files_ok = max_files >= 0;
+    return bytes_ok && files_ok;
+  }
+  // Enabled when at least one limit is non-zero.
+  bool is_enable() const {
+    if (max_bytes != 0) {
+      return true;
+    }
+    return max_files != 0;
+  }
+  void decode_json(JSONObj *obj);
+
+  int64_t max_bytes = 0;
+  int64_t max_files = 0;
+};
+WRITE_CLASS_ENCODER(quota_info_t)
+
+// quota_info_t holds exactly two int64_t limits, so comparing them field by
+// field is equivalent to comparing the whole object representation.
+inline bool operator==(const quota_info_t &l, const quota_info_t &r) {
+  if (l.max_bytes != r.max_bytes) {
+    return false;
+  }
+  return l.max_files == r.max_files;
+}
+
+std::ostream& operator<<(std::ostream &out, const quota_info_t &n);
+
+namespace std {
+  // Hash of a vinodeno_t: XOR of the inode-number hash and the hash of the
+  // snapshot id taken as a uint64_t.
+  template<> struct hash<vinodeno_t> {
+    size_t operator()(const vinodeno_t &vino) const {
+      const size_t ino_hash = hash<inodeno_t>{}(vino.ino);
+      const size_t snap_hash = hash<uint64_t>{}(vino.snapid);
+      return ino_hash ^ snap_hash;
+    }
+  };
+}
+
+// Prints the inode number, followed by ".head" when snapid is CEPH_NOSNAP,
+// by ".<snapid>" when snapid is any other non-zero value, and by nothing
+// when snapid is zero.
+inline std::ostream& operator<<(std::ostream &out, const vinodeno_t &vino) {
+  out << vino.ino;
+  if (vino.snapid == CEPH_NOSNAP) {
+    out << ".head";
+    return out;
+  }
+  if (vino.snapid) {
+    out << '.' << vino.snapid;
+  }
+  return out;
+}
+
+// A byte interval a client may write to, together with a snapshot id.
+struct client_writeable_range_t {
+  // interval client can write to
+  struct byte_range_t {
+    uint64_t first = 0;
+    uint64_t last = 0;
+    byte_range_t() {}
+    void decode_json(JSONObj *obj);
+  };
+
+  void encode(ceph::buffer::list &bl) const;
+  void decode(ceph::buffer::list::const_iterator& bl);
+  void dump(ceph::Formatter *f) const;
+  static void generate_test_instances(std::list<client_writeable_range_t*>& ls);
+
+  byte_range_t range;
+  // aka "data+metadata flushed thru"
+  snapid_t follows = 0;
+};
+
+// Decode a bare byte_range_t: `first` followed by `last`, with no version
+// header.
+inline void decode(client_writeable_range_t::byte_range_t& range, ceph::buffer::list::const_iterator& bl) {
+  using ceph::decode;
+  decode(range.first, bl);
+  decode(range.last, bl);
+}
+
+WRITE_CLASS_ENCODER(client_writeable_range_t)
+
+std::ostream& operator<<(std::ostream& out, const client_writeable_range_t& r);
+
+// Equal when both interval endpoints and the `follows` snapshot id match.
+inline bool operator==(const client_writeable_range_t& l,
+		       const client_writeable_range_t& r) {
+  if (l.range.first != r.range.first) {
+    return false;
+  }
+  if (l.range.last != r.range.last) {
+    return false;
+  }
+  return l.follows == r.follows;
+}
+
+/**
+ * A version number plus an optional, lazily allocated buffer list.
+ * Copies are deep with respect to the owning pointer: each object owns its
+ * own ceph::buffer::list (or none).
+ */
+struct inline_data_t {
+public:
+  inline_data_t() {}
+  // Copy the version and, when the source holds data, a copy of that data.
+  inline_data_t(const inline_data_t& o) : version(o.version) {
+    if (o.blp) {
+      set_data(*o.blp);
+    }
+  }
+  // Copy the version; take a copy of o's data or drop ours if o has none.
+  inline_data_t& operator=(const inline_data_t& o) {
+    version = o.version;
+    if (!o.blp) {
+      free_data();
+      return *this;
+    }
+    set_data(*o.blp);
+    return *this;
+  }
+
+  // Release the buffer list, if any.
+  void free_data() {
+    blp.reset();
+  }
+  // Copy our data into ret, or clear ret when we hold none.
+  void get_data(ceph::buffer::list& ret) const {
+    if (!blp) {
+      ret.clear();
+      return;
+    }
+    ret = *blp;
+  }
+  // Store a copy of bl, allocating the buffer list on first use.
+  void set_data(const ceph::buffer::list& bl) {
+    if (!blp) {
+      blp = std::make_unique<ceph::buffer::list>();
+    }
+    *blp = bl;
+  }
+  // Length of the held data; 0 when no buffer list is allocated.
+  size_t length() const {
+    if (!blp) {
+      return 0;
+    }
+    return blp->length();
+  }
+
+  // Equal when lengths match and, for non-empty data, the buffer lists
+  // compare equal.  `version` does not take part in the comparison.
+  bool operator==(const inline_data_t& o) const {
+    const size_t mine = length();
+    if (mine != o.length()) {
+      return false;
+    }
+    if (mine == 0) {
+      return true;
+    }
+    return *const_cast<ceph::buffer::list*>(blp.get()) == *const_cast<ceph::buffer::list*>(o.blp.get());
+  }
+  bool operator!=(const inline_data_t& o) const {
+    if (*this == o) {
+      return false;
+    }
+    return true;
+  }
+  void encode(ceph::buffer::list &bl) const;
+  void decode(ceph::buffer::list::const_iterator& bl);
+
+  version_t version = 1;
+
+private:
+  std::unique_ptr<ceph::buffer::list> blp;
+};
+WRITE_CLASS_ENCODER(inline_data_t)
+
+// Kinds of damage that can be recorded against a directory fragment.
+// NOTE(review): the enumerators are sequential (0, 1, 2) rather than bit
+// masks; presumably they are used as bit positions inside damage_flags_t.
+// Confirm against the code that sets fnode_t::damage_flags.
+enum {
+  DAMAGE_STATS,     // statistics (dirstat, size, etc)
+  DAMAGE_RSTATS,    // recursive statistics (rstat, accounted_rstat)
+  DAMAGE_FRAGTREE   // fragtree -- repair by searching
+};
+// Storage type for the damage flags.
+typedef uint32_t damage_flags_t;
+
+/**
+ * Inode metadata, templated on the allocator used for its internal
+ * containers and strings.
+ *
+ * The encode/decode/dump/decode_json/compare members are declared here and
+ * defined as out-of-class templates.
+ */
+template<template<typename> class Allocator = std::allocator>
+struct inode_t {
+  /**
+   * ***************
+   * Do not forget to add any new fields to the compare() function.
+   * ***************
+   */
+  using client_range_map = std::map<client_t,client_writeable_range_t,std::less<client_t>,Allocator<std::pair<const client_t,client_writeable_range_t>>>;
+
+  inode_t()
+  {
+    clear_layout();
+  }
+
+  // file type
+  bool is_symlink() const { return (mode & S_IFMT) == S_IFLNK; }
+  bool is_dir()     const { return (mode & S_IFMT) == S_IFDIR; }
+  bool is_file()    const { return (mode & S_IFMT) == S_IFREG; }
+
+  /// True while at least one truncation recorded by truncate() is pending.
+  bool is_truncating() const { return (truncate_pending > 0); }
+
+  /**
+   * Record a truncation from old_size down to new_size.
+   *
+   * Asserts new_size < old_size.  Updates max_size_ever, size,
+   * rstat.rbytes and the truncate_* bookkeeping fields, and bumps both
+   * truncate_seq and truncate_pending.
+   */
+  void truncate(uint64_t old_size, uint64_t new_size) {
+    ceph_assert(new_size < old_size);
+    if (old_size > max_size_ever)
+      max_size_ever = old_size;
+    truncate_from = old_size;
+    size = new_size;
+    rstat.rbytes = new_size;
+    truncate_size = size;
+    truncate_seq++;
+    truncate_pending++;
+  }
+
+  /// True when the layout differs from a default-constructed file_layout_t.
+  bool has_layout() const {
+    return layout != file_layout_t();
+  }
+
+  /// Reset the layout to a default-constructed file_layout_t.
+  void clear_layout() {
+    layout = file_layout_t();
+  }
+
+  uint64_t get_layout_size_increment() const {
+    return layout.get_period();
+  }
+
+  /// True when rstat and accounted_rstat differ.
+  bool is_dirty_rstat() const { return !(rstat == accounted_rstat); }
+
+  /// Upper bound of the given client's writeable range, or 0 if it has none.
+  uint64_t get_client_range(client_t client) const {
+    auto it = client_ranges.find(client);
+    return it != client_ranges.end() ? it->second.range.last : 0;
+  }
+
+  /// Largest range.last over all client ranges, or 0 when there are none.
+  uint64_t get_max_size() const {
+    uint64_t max = 0;
+    // Iterate via the container itself: client_range_map's iterator type
+    // depends on Allocator and need not match std::map<...>::iterator.
+    for (const auto &p : client_ranges) {
+      if (p.second.range.last > max)
+        max = p.second.range.last;
+    }
+    return max;
+  }
+
+  /// Set every client's range.last to new_max; 0 drops all client ranges.
+  void set_max_size(uint64_t new_max) {
+    if (new_max == 0) {
+      client_ranges.clear();
+    } else {
+      for (auto &p : client_ranges)
+        p.second.range.last = new_max;
+    }
+  }
+
+  /// Remove every client range whose `follows` is >= last.
+  void trim_client_ranges(snapid_t last) {
+    for (auto p = client_ranges.begin(); p != client_ranges.end(); ) {
+      if (p->second.follows >= last)
+        p = client_ranges.erase(p);
+      else
+        ++p;
+    }
+  }
+
+  bool is_backtrace_updated() const {
+    return backtrace_version == version;
+  }
+  /// Set backtrace_version to pv, or to the current version when pv is 0.
+  void update_backtrace(version_t pv=0) {
+    backtrace_version = pv ? pv : version;
+  }
+
+  /// Remember pool l in old_pools and mark the backtrace as current.
+  void add_old_pool(int64_t l) {
+    backtrace_version = version;
+    old_pools.insert(l);
+  }
+
+  void encode(ceph::buffer::list &bl, uint64_t features) const;
+  void decode(ceph::buffer::list::const_iterator& bl);
+  void dump(ceph::Formatter *f) const;
+  static void client_ranges_cb(client_range_map& c, JSONObj *obj);
+  static void old_pools_cb(compact_set<int64_t, std::less<int64_t>, Allocator<int64_t> >& c, JSONObj *obj);
+  void decode_json(JSONObj *obj);
+  static void generate_test_instances(std::list<inode_t*>& ls);
+  /**
+   * Compare this inode_t with another that represent *the same inode*
+   * at different points in time.
+   * @pre The inodes are the same ino
+   *
+   * @param other The inode_t to compare ourselves with
+   * @param divergent A bool pointer which will be set to true
+   * if the values are different in a way that can't be explained
+   * by one being a newer version than the other.
+   *
+   * @returns 1 if we are newer than the other, 0 if equal, -1 if older.
+   */
+  int compare(const inode_t &other, bool *divergent) const;
+
+  // base (immutable)
+  inodeno_t ino = 0;
+  uint32_t   rdev = 0;    // if special file
+
+  // affected by any inode change...
+  utime_t    ctime;   // inode change time
+  utime_t    btime;   // birth time
+
+  // perm (namespace permissions)
+  uint32_t   mode = 0;
+  uid_t      uid = 0;
+  gid_t      gid = 0;
+
+  // nlink
+  int32_t    nlink = 0;
+
+  // file (data access)
+  ceph_dir_layout dir_layout = {};    // [dir only]
+  file_layout_t layout;
+  compact_set<int64_t, std::less<int64_t>, Allocator<int64_t>> old_pools;
+  uint64_t   size = 0;        // on directory, # dentries
+  uint64_t   max_size_ever = 0; // max size the file has ever been
+  uint32_t   truncate_seq = 0;
+  uint64_t   truncate_size = 0, truncate_from = 0;
+  uint32_t   truncate_pending = 0;
+  utime_t    mtime;   // file data modify time.
+  utime_t    atime;   // file data access time.
+  uint32_t   time_warp_seq = 0;  // count of (potential) mtime/atime timewarps (i.e., utimes())
+  inline_data_t inline_data; // FIXME check
+
+  // change attribute
+  uint64_t   change_attr = 0;
+
+  client_range_map client_ranges;  // client(s) can write to these ranges
+
+  // dirfrag, recursive accounting
+  frag_info_t dirstat;         // protected by my filelock
+  nest_info_t rstat;           // protected by my nestlock
+  nest_info_t accounted_rstat; // protected by parent's nestlock
+
+  quota_info_t quota;
+
+  mds_rank_t export_pin = MDS_RANK_NONE;
+
+  double export_ephemeral_random_pin = 0;
+  bool export_ephemeral_distributed_pin = false;
+
+  // special stuff
+  version_t version = 0;           // auth only
+  version_t file_data_version = 0; // auth only
+  version_t xattr_version = 0;
+
+  utime_t last_scrub_stamp;    // start time of last complete scrub
+  version_t last_scrub_version = 0;// (parent) start version of last complete scrub
+
+  version_t backtrace_version = 0;
+
+  snapid_t oldest_snap;
+
+  std::basic_string<char,std::char_traits<char>,Allocator<char>> stray_prior_path; //stores path before unlink
+
+  bool fscrypt = false; // fscrypt enabled ?
+
+private:
+  /// Helper for compare(); see its definition for the fields it checks.
+  bool older_is_consistent(const inode_t &other) const;
+};
+
+// These methods may be moved back to mdstypes.cc when we have pmr
+/**
+ * Serialize this inode into bl as struct version 17 (compat version 6).
+ *
+ * The order of the encode() calls is the wire format and must stay in sync
+ * with inode_t::decode(); newer fields are appended at the end.
+ *
+ * @param bl       destination buffer list
+ * @param features feature bits, forwarded to the layout encoder
+ */
+template<template<typename> class Allocator>
+void inode_t<Allocator>::encode(ceph::buffer::list &bl, uint64_t features) const
+{
+  ENCODE_START(17, 6, bl);
+
+  encode(ino, bl);
+  encode(rdev, bl);
+  encode(ctime, bl);
+
+  encode(mode, bl);
+  encode(uid, bl);
+  encode(gid, bl);
+
+  encode(nlink, bl);
+  {
+    // removed field
+    // (a placeholder bool is still written so the layout stays unchanged)
+    bool anchored = 0;
+    encode(anchored, bl);
+  }
+
+  encode(dir_layout, bl);
+  encode(layout, bl, features);
+  encode(size, bl);
+  encode(truncate_seq, bl);
+  encode(truncate_size, bl);
+  encode(truncate_from, bl);
+  encode(truncate_pending, bl);
+  encode(mtime, bl);
+  encode(atime, bl);
+  encode(time_warp_seq, bl);
+  encode(client_ranges, bl);
+
+  encode(dirstat, bl);
+  encode(rstat, bl);
+  encode(accounted_rstat, bl);
+
+  encode(version, bl);
+  encode(file_data_version, bl);
+  encode(xattr_version, bl);
+  encode(backtrace_version, bl);
+  encode(old_pools, bl);
+  encode(max_size_ever, bl);
+  encode(inline_data, bl);
+  encode(quota, bl);
+
+  encode(stray_prior_path, bl);
+
+  encode(last_scrub_version, bl);
+  encode(last_scrub_stamp, bl);
+
+  encode(btime, bl);
+  encode(change_attr, bl);
+
+  encode(export_pin, bl);
+
+  encode(export_ephemeral_random_pin, bl);
+  encode(export_ephemeral_distributed_pin, bl);
+
+  encode(fscrypt, bl);
+
+  ENCODE_FINISH(bl);
+}
+
+/**
+ * Deserialize an inode from p, accepting struct versions up to 17.
+ *
+ * Fields are read in the order inode_t::encode() writes them.  A field that
+ * was introduced in a later struct version is only read when struct_v is
+ * high enough; most such fields are otherwise reset to an explicit default.
+ *
+ * @param p iterator positioned at the start of an encoded inode_t
+ */
+template<template<typename> class Allocator>
+void inode_t<Allocator>::decode(ceph::buffer::list::const_iterator &p)
+{
+  DECODE_START_LEGACY_COMPAT_LEN(17, 6, 6, p);
+
+  decode(ino, p);
+  decode(rdev, p);
+  decode(ctime, p);
+
+  decode(mode, p);
+  decode(uid, p);
+  decode(gid, p);
+
+  decode(nlink, p);
+  {
+    // consume the placeholder for the removed "anchored" field
+    bool anchored;
+    decode(anchored, p);
+  }
+
+  if (struct_v >= 4)
+    decode(dir_layout, p);
+  else {
+    // FIPS zeroization audit 20191117: this memset is not security related.
+    memset(&dir_layout, 0, sizeof(dir_layout));
+  }
+  decode(layout, p);
+  decode(size, p);
+  decode(truncate_seq, p);
+  decode(truncate_size, p);
+  decode(truncate_from, p);
+  if (struct_v >= 5)
+    decode(truncate_pending, p);
+  else
+    truncate_pending = 0;
+  decode(mtime, p);
+  decode(atime, p);
+  decode(time_warp_seq, p);
+  if (struct_v >= 3) {
+    decode(client_ranges, p);
+  } else {
+    // older encodings stored only the byte range per client
+    std::map<client_t, client_writeable_range_t::byte_range_t> m;
+    decode(m, p);
+    for (auto q = m.begin(); q != m.end(); ++q)
+      client_ranges[q->first].range = q->second;
+  }
+
+  decode(dirstat, p);
+  decode(rstat, p);
+  decode(accounted_rstat, p);
+
+  decode(version, p);
+  decode(file_data_version, p);
+  decode(xattr_version, p);
+  if (struct_v >= 2)
+    decode(backtrace_version, p);
+  if (struct_v >= 7)
+    decode(old_pools, p);
+  if (struct_v >= 8)
+    decode(max_size_ever, p);
+  if (struct_v >= 9) {
+    decode(inline_data, p);
+  } else {
+    inline_data.version = CEPH_INLINE_NONE;
+  }
+  if (struct_v < 10)
+    backtrace_version = 0; // force update backtrace
+  if (struct_v >= 11)
+    decode(quota, p);
+
+  if (struct_v >= 12) {
+    // decode into a plain std::string, then copy into the
+    // allocator-specific string member
+    std::string tmp;
+    decode(tmp, p);
+    stray_prior_path = std::string_view(tmp);
+  }
+
+  if (struct_v >= 13) {
+    decode(last_scrub_version, p);
+    decode(last_scrub_stamp, p);
+  }
+  if (struct_v >= 14) {
+    decode(btime, p);
+    decode(change_attr, p);
+  } else {
+    btime = utime_t();
+    change_attr = 0;
+  }
+
+  if (struct_v >= 15) {
+    decode(export_pin, p);
+  } else {
+    export_pin = MDS_RANK_NONE;
+  }
+
+  if (struct_v >= 16) {
+    decode(export_ephemeral_random_pin, p);
+    decode(export_ephemeral_distributed_pin, p);
+  } else {
+    export_ephemeral_random_pin = 0;
+    export_ephemeral_distributed_pin = false;
+  }
+
+  if (struct_v >= 17) {
+    decode(fscrypt, p);
+  } else {
+    fscrypt = 0;
+  }
+
+  DECODE_FINISH(p);
+}
+
+/**
+ * Write the inode's fields to the formatter f.
+ *
+ * Scalars are emitted as individual keys; dir_layout, dirstat, rstat,
+ * accounted_rstat and quota are emitted as nested object sections, and
+ * old_pools and client_ranges as array sections.
+ *
+ * @param f formatter receiving the output
+ */
+template<template<typename> class Allocator>
+void inode_t<Allocator>::dump(ceph::Formatter *f) const
+{
+  f->dump_unsigned("ino", ino);
+  f->dump_unsigned("rdev", rdev);
+  f->dump_stream("ctime") << ctime;
+  f->dump_stream("btime") << btime;
+  f->dump_unsigned("mode", mode);
+  f->dump_unsigned("uid", uid);
+  f->dump_unsigned("gid", gid);
+  f->dump_unsigned("nlink", nlink);
+
+  f->open_object_section("dir_layout");
+  ::dump(dir_layout, f);
+  f->close_section();
+
+  f->dump_object("layout", layout);
+
+  f->open_array_section("old_pools");
+  for (const auto &p : old_pools) {
+    f->dump_int("pool", p);
+  }
+  f->close_section();
+
+  f->dump_unsigned("size", size);
+  f->dump_unsigned("truncate_seq", truncate_seq);
+  f->dump_unsigned("truncate_size", truncate_size);
+  f->dump_unsigned("truncate_from", truncate_from);
+  f->dump_unsigned("truncate_pending", truncate_pending);
+  f->dump_stream("mtime") << mtime;
+  f->dump_stream("atime") << atime;
+  f->dump_unsigned("time_warp_seq", time_warp_seq);
+  f->dump_unsigned("change_attr", change_attr);
+  f->dump_int("export_pin", export_pin);
+  // export_ephemeral_random_pin is a double; dumping it as an integer would
+  // truncate any fractional value.
+  f->dump_float("export_ephemeral_random_pin", export_ephemeral_random_pin);
+  f->dump_bool("export_ephemeral_distributed_pin", export_ephemeral_distributed_pin);
+
+  f->open_array_section("client_ranges");
+  for (const auto &p : client_ranges) {
+    f->open_object_section("client");
+    f->dump_unsigned("client", p.first.v);
+    p.second.dump(f);
+    f->close_section();
+  }
+  f->close_section();
+
+  f->open_object_section("dirstat");
+  dirstat.dump(f);
+  f->close_section();
+
+  f->open_object_section("rstat");
+  rstat.dump(f);
+  f->close_section();
+
+  f->open_object_section("accounted_rstat");
+  accounted_rstat.dump(f);
+  f->close_section();
+
+  f->dump_unsigned("version", version);
+  f->dump_unsigned("file_data_version", file_data_version);
+  f->dump_unsigned("xattr_version", xattr_version);
+  f->dump_unsigned("backtrace_version", backtrace_version);
+
+  f->dump_string("stray_prior_path", stray_prior_path);
+  f->dump_unsigned("max_size_ever", max_size_ever);
+
+  f->open_object_section("quota");
+  quota.dump(f);
+  f->close_section();
+
+  f->dump_stream("last_scrub_stamp") << last_scrub_stamp;
+  f->dump_unsigned("last_scrub_version", last_scrub_version);
+}
+
+// JSON callback for one element of "client_ranges": reads the "client",
+// "byte range" and "follows" keys from obj and stores the resulting range
+// in c under that client id.
+template<template<typename> class Allocator>
+void inode_t<Allocator>::client_ranges_cb(typename inode_t<Allocator>::client_range_map& c, JSONObj *obj){
+
+  int64_t client_id;
+  JSONDecoder::decode_json("client", client_id, obj, true);
+
+  client_writeable_range_t entry;
+  JSONDecoder::decode_json("byte range", entry.range, obj, true);
+  JSONDecoder::decode_json("follows", entry.follows.val, obj, true);
+
+  c[client_id] = entry;
+}
+
+// JSON callback for one element of "old_pools": decodes obj as an int64_t
+// pool id and inserts it into c.
+template<template<typename> class Allocator>
+void inode_t<Allocator>::old_pools_cb(compact_set<int64_t, std::less<int64_t>, Allocator<int64_t> >& c, JSONObj *obj){
+
+  int64_t pool_id;
+  decode_json_obj(pool_id, obj);
+  c.insert(pool_id);
+}
+
+/**
+ * Populate this inode from the JSON object obj.
+ *
+ * The keys read here use the same names that dump() writes.  The
+ * ctime/btime/mtime/atime lines are commented out, so those timestamps are
+ * not read; last_scrub_stamp is.  The export_ephemeral_* pins are not read
+ * either.
+ * NOTE(review): the trailing `true` argument is presumably JSONDecoder's
+ * "mandatory" flag - confirm against JSONDecoder::decode_json.
+ *
+ * @param obj JSON object to read from
+ */
+template<template<typename> class Allocator>
+void inode_t<Allocator>::decode_json(JSONObj *obj)
+{
+
+  JSONDecoder::decode_json("ino", ino.val, obj, true);
+  JSONDecoder::decode_json("rdev", rdev, obj, true);
+  //JSONDecoder::decode_json("ctime", ctime, obj, true);
+  //JSONDecoder::decode_json("btime", btime, obj, true);
+  JSONDecoder::decode_json("mode", mode, obj, true);
+  JSONDecoder::decode_json("uid", uid, obj, true);
+  JSONDecoder::decode_json("gid", gid, obj, true);
+  JSONDecoder::decode_json("nlink", nlink, obj, true);
+  JSONDecoder::decode_json("dir_layout", dir_layout, obj, true);
+  JSONDecoder::decode_json("layout", layout, obj, true);
+  // array members are filled one element at a time through the callbacks
+  JSONDecoder::decode_json("old_pools", old_pools, inode_t<Allocator>::old_pools_cb, obj, true);
+  JSONDecoder::decode_json("size", size, obj, true);
+  JSONDecoder::decode_json("truncate_seq", truncate_seq, obj, true);
+  JSONDecoder::decode_json("truncate_size", truncate_size, obj, true);
+  JSONDecoder::decode_json("truncate_from", truncate_from, obj, true);
+  JSONDecoder::decode_json("truncate_pending", truncate_pending, obj, true);
+  //JSONDecoder::decode_json("mtime", mtime, obj, true);
+  //JSONDecoder::decode_json("atime", atime, obj, true);
+  JSONDecoder::decode_json("time_warp_seq", time_warp_seq, obj, true);
+  JSONDecoder::decode_json("change_attr", change_attr, obj, true);
+  JSONDecoder::decode_json("export_pin", export_pin, obj, true);
+  JSONDecoder::decode_json("client_ranges", client_ranges, inode_t<Allocator>::client_ranges_cb, obj, true);
+  JSONDecoder::decode_json("dirstat", dirstat, obj, true);
+  JSONDecoder::decode_json("rstat", rstat, obj, true);
+  JSONDecoder::decode_json("accounted_rstat", accounted_rstat, obj, true);
+  JSONDecoder::decode_json("version", version, obj, true);
+  JSONDecoder::decode_json("file_data_version", file_data_version, obj, true);
+  JSONDecoder::decode_json("xattr_version", xattr_version, obj, true);
+  JSONDecoder::decode_json("backtrace_version", backtrace_version, obj, true);
+  JSONDecoder::decode_json("stray_prior_path", stray_prior_path, obj, true);
+  JSONDecoder::decode_json("max_size_ever", max_size_ever, obj, true);
+  JSONDecoder::decode_json("quota", quota, obj, true);
+  JSONDecoder::decode_json("last_scrub_stamp", last_scrub_stamp, obj, true);
+  JSONDecoder::decode_json("last_scrub_version", last_scrub_version, obj, true);
+}
+
+// Append two heap-allocated test inodes to ls: a default-constructed one
+// followed by one whose ino is 1.  Ownership passes to the caller.
+template<template<typename> class Allocator>
+void inode_t<Allocator>::generate_test_instances(std::list<inode_t*>& ls)
+{
+  auto *blank = new inode_t<Allocator>;
+  ls.push_back(blank);
+
+  auto *numbered = new inode_t<Allocator>;
+  ls.push_back(numbered);
+  numbered->ino = 1;
+}
+
+/**
+ * Compare two snapshots of the same inode by their `version` field.
+ *
+ * Asserts that both objects carry the same ino.  When the versions are
+ * equal, *divergent is set if any of the fields listed below differ.  When
+ * they differ, *divergent is set if the older object is not consistent with
+ * the newer one according to older_is_consistent().
+ * Fields not listed in the equal-version check (for example quota,
+ * export_pin and stray_prior_path) do not affect the result.
+ *
+ * @param other     the inode_t to compare against
+ * @param divergent out-parameter; always written
+ * @returns 1 if this is newer than other, 0 if same version, -1 if older
+ */
+template<template<typename> class Allocator>
+int inode_t<Allocator>::compare(const inode_t<Allocator> &other, bool *divergent) const
+{
+  ceph_assert(ino == other.ino);
+  *divergent = false;
+  if (version == other.version) {
+    // same version: every compared field must be identical
+    if (rdev != other.rdev ||
+        ctime != other.ctime ||
+        btime != other.btime ||
+        mode != other.mode ||
+        uid != other.uid ||
+        gid != other.gid ||
+        nlink != other.nlink ||
+        memcmp(&dir_layout, &other.dir_layout, sizeof(dir_layout)) ||
+        layout != other.layout ||
+        old_pools != other.old_pools ||
+        size != other.size ||
+        max_size_ever != other.max_size_ever ||
+        truncate_seq != other.truncate_seq ||
+        truncate_size != other.truncate_size ||
+        truncate_from != other.truncate_from ||
+        truncate_pending != other.truncate_pending ||
+	change_attr != other.change_attr ||
+        mtime != other.mtime ||
+        atime != other.atime ||
+        time_warp_seq != other.time_warp_seq ||
+        inline_data != other.inline_data ||
+        client_ranges != other.client_ranges ||
+        !(dirstat == other.dirstat) ||
+        !(rstat == other.rstat) ||
+        !(accounted_rstat == other.accounted_rstat) ||
+        file_data_version != other.file_data_version ||
+        xattr_version != other.xattr_version ||
+        backtrace_version != other.backtrace_version) {
+      *divergent = true;
+    }
+    return 0;
+  } else if (version > other.version) {
+    // we are newer: other must not be ahead of us in any monotonic field
+    *divergent = !older_is_consistent(other);
+    return 1;
+  } else {
+    ceph_assert(version < other.version);
+    // other is newer: run the same check from its point of view
+    *divergent = !other.older_is_consistent(*this);
+    return -1;
+  }
+}
+
+// Returns false when `other` is ahead of this inode in any of the counters
+// and sub-versions checked below, true otherwise.
+template<template<typename> class Allocator>
+bool inode_t<Allocator>::older_is_consistent(const inode_t<Allocator> &other) const
+{
+  const bool other_is_ahead =
+      max_size_ever < other.max_size_ever ||
+      truncate_seq < other.truncate_seq ||
+      time_warp_seq < other.time_warp_seq ||
+      inline_data.version < other.inline_data.version ||
+      dirstat.version < other.dirstat.version ||
+      rstat.version < other.rstat.version ||
+      accounted_rstat.version < other.accounted_rstat.version ||
+      file_data_version < other.file_data_version ||
+      xattr_version < other.xattr_version ||
+      backtrace_version < other.backtrace_version;
+  return !other_is_ahead;
+}
+
+// Free-function encoder for inode_t<Allocator>: forwards to the member
+// encode(), bracketed by the ENCODE_DUMP_PRE/ENCODE_DUMP_POST macros
+// (defined elsewhere).
+template<template<typename> class Allocator>
+inline void encode(const inode_t<Allocator> &c, ::ceph::buffer::list &bl, uint64_t features)
+{
+  ENCODE_DUMP_PRE();
+  c.encode(bl, features);
+  ENCODE_DUMP_POST(cl);
+}
+// Free-function decoder for inode_t<Allocator>: forwards to the member
+// decode().
+template<template<typename> class Allocator>
+inline void decode(inode_t<Allocator> &c, ::ceph::buffer::list::const_iterator &p)
+{
+  c.decode(p);
+}
+
+// A std::basic_string<char> whose storage comes from Allocator<char>.
+template<template<typename> class Allocator>
+using alloc_string = std::basic_string<char,std::char_traits<char>,Allocator<char>>;
+
+// Ordered map from xattr name to its value, with both the key strings and
+// the map nodes allocated through Allocator.
+template<template<typename> class Allocator>
+using xattr_map = std::map<alloc_string<Allocator>,
+			   ceph::bufferptr,
+			   std::less<alloc_string<Allocator>>,
+			   Allocator<std::pair<const alloc_string<Allocator>,
+					       ceph::bufferptr>>>; // FIXME bufferptr not in mempool
+
+// Decode an xattr map from p: a __u32 entry count followed, per entry, by
+// the key, a __u32 value length and the value bytes.  Each value is read
+// with copy_deep() into the map slot for its key.
+template<template<typename> class Allocator>
+inline void decode_noshare(xattr_map<Allocator>& xattrs, ceph::buffer::list::const_iterator &p)
+{
+  __u32 remaining;
+  decode(remaining, p);
+  for (; remaining > 0; --remaining) {
+    alloc_string<Allocator> name;
+    decode(name, p);
+    __u32 value_len;
+    decode(value_len, p);
+    p.copy_deep(value_len, xattrs[name]);
+  }
+}
+
+// An inode_t together with its xattrs and a `first` snapshot id.
+template<template<typename> class Allocator = std::allocator>
+struct old_inode_t {
+  snapid_t first;
+  inode_t<Allocator> inode;
+  xattr_map<Allocator> xattrs;
+
+  void encode(ceph::buffer::list &bl, uint64_t features) const;
+  void decode(ceph::buffer::list::const_iterator& bl);
+  void dump(ceph::Formatter *f) const;
+  static void generate_test_instances(std::list<old_inode_t*>& ls);
+};
+
+// These methods may be moved back to mdstypes.cc when we have pmr
+// Serialize as struct version 2 (compat 2): first, inode, xattrs.  The
+// feature bits are forwarded to the inode encoder only.
+template<template<typename> class Allocator>
+void old_inode_t<Allocator>::encode(ceph::buffer::list& bl, uint64_t features) const
+{
+  ENCODE_START(2, 2, bl);
+  encode(first, bl);
+  encode(inode, bl, features);
+  encode(xattrs, bl);
+  ENCODE_FINISH(bl);
+}
+
+// Mirror of encode(): reads first, inode and then the xattrs.  The xattrs
+// go through decode_noshare() rather than the generic map decoder.
+template<template<typename> class Allocator>
+void old_inode_t<Allocator>::decode(ceph::buffer::list::const_iterator& bl)
+{
+  DECODE_START_LEGACY_COMPAT_LEN(2, 2, 2, bl);
+  decode(first, bl);
+  decode(inode, bl);
+  decode_noshare<Allocator>(xattrs, bl);
+  DECODE_FINISH(bl);
+}
+
+// Dump `first`, then the inode's fields, then an "xattrs" object section
+// with one string entry per xattr (name -> raw value bytes).
+template<template<typename> class Allocator>
+void old_inode_t<Allocator>::dump(ceph::Formatter *f) const
+{
+  f->dump_unsigned("first", first);
+  inode.dump(f);
+
+  f->open_object_section("xattrs");
+  for (const auto &entry : xattrs) {
+    const auto &name = entry.first;
+    const auto &value = entry.second;
+    std::string v(value.c_str(), value.length());
+    f->dump_string(name.c_str(), v);
+  }
+  f->close_section();
+}
+
+// Append two heap-allocated test instances to ls: a default-constructed one
+// and one with first == 2, a copy of the last inode_t test instance, and two
+// xattrs (one printable, one not).  Ownership of the appended objects passes
+// to the caller.
+template<template<typename> class Allocator>
+void old_inode_t<Allocator>::generate_test_instances(std::list<old_inode_t<Allocator>*>& ls)
+{
+  ls.push_back(new old_inode_t<Allocator>);
+  ls.push_back(new old_inode_t<Allocator>);
+  ls.back()->first = 2;
+  std::list<inode_t<Allocator>*> ils;
+  inode_t<Allocator>::generate_test_instances(ils);
+  ls.back()->inode = *ils.back();
+  // The inode_t instances were allocated with new and only copied from, so
+  // release them here instead of leaking them.
+  for (auto *tmp : ils)
+    delete tmp;
+  ls.back()->xattrs["user.foo"] = ceph::buffer::copy("asdf", 4);
+  ls.back()->xattrs["user.unprintable"] = ceph::buffer::copy("\000\001\002", 3);
+}
+
+// Free-function encoder for old_inode_t<Allocator>: forwards to the member
+// encode(), bracketed by the ENCODE_DUMP_PRE/ENCODE_DUMP_POST macros
+// (defined elsewhere).
+template<template<typename> class Allocator>
+inline void encode(const old_inode_t<Allocator> &c, ::ceph::buffer::list &bl, uint64_t features)
+{
+  ENCODE_DUMP_PRE();
+  c.encode(bl, features);
+  ENCODE_DUMP_POST(cl);
+}
+// Free-function decoder for old_inode_t<Allocator>: forwards to the member
+// decode().
+template<template<typename> class Allocator>
+inline void decode(old_inode_t<Allocator> &c, ::ceph::buffer::list::const_iterator &p)
+{
+  c.decode(p);
+}
+
+/*
+ * like an inode, but for a dir frag
+ *
+ * Holds the fragment's version, its own and accounted fragstat/rstat, the
+ * damage flags and scrub bookkeeping.
+ */
+struct fnode_t {
+  void encode(ceph::buffer::list &bl) const;
+  void decode(ceph::buffer::list::const_iterator& bl);
+  void dump(ceph::Formatter *f) const;
+  void decode_json(JSONObj *obj);
+  static void generate_test_instances(std::list<fnode_t*>& ls);
+
+  version_t version = 0;
+  snapid_t snap_purged_thru;   // the max_last_destroy snapid we've been purged thru
+  frag_info_t fragstat, accounted_fragstat;
+  nest_info_t rstat, accounted_rstat;
+  damage_flags_t damage_flags = 0;
+
+  // we know we and all our descendants have been scrubbed since this version
+  version_t recursive_scrub_version = 0;
+  utime_t recursive_scrub_stamp;
+  // version at which we last scrubbed our personal data structures
+  version_t localized_scrub_version = 0;
+  utime_t localized_scrub_stamp;
+};
+WRITE_CLASS_ENCODER(fnode_t)
+
+
+// A pair of nest_info_t values (rstat and accounted_rstat) tagged with a
+// `first` snapshot id.
+struct old_rstat_t {
+  void encode(ceph::buffer::list& bl) const;
+  void decode(ceph::buffer::list::const_iterator& p);
+  void dump(ceph::Formatter *f) const;
+  static void generate_test_instances(std::list<old_rstat_t*>& ls);
+
+  snapid_t first;
+  nest_info_t rstat, accounted_rstat;
+};
+WRITE_CLASS_ENCODER(old_rstat_t)
+
+// Prints "old_rstat(first <first> <rstat> <accounted_rstat>)".
+inline std::ostream& operator<<(std::ostream& out, const old_rstat_t& o) {
+  out << "old_rstat(first " << o.first;
+  out << " " << o.rstat;
+  out << " " << o.accounted_rstat << ")";
+  return out;
+}
+
+/**
+ * A growable bit set stored as a vector of 64-bit blocks.
+ *
+ * Bits beyond the current storage read as unset; insert() grows the vector
+ * as needed and erase() trims trailing all-zero blocks when it clears a bit
+ * in the last block.
+ */
+class feature_bitset_t {
+public:
+  typedef uint64_t block_type;
+  static const size_t bits_per_block = sizeof(block_type) * 8;
+
+  feature_bitset_t(const feature_bitset_t& other) : _vec(other._vec) {}
+  // Moving only transfers the vector, which cannot throw; declaring that
+  // lets containers relocate elements by move instead of by copy.
+  feature_bitset_t(feature_bitset_t&& other) noexcept : _vec(std::move(other._vec)) {}
+  feature_bitset_t(unsigned long value = 0);
+  feature_bitset_t(const std::vector<size_t>& array);
+  feature_bitset_t& operator=(const feature_bitset_t& other) {
+    _vec = other._vec;
+    return *this;
+  }
+  feature_bitset_t& operator=(feature_bitset_t&& other) noexcept {
+    _vec = std::move(other._vec);
+    return *this;
+  }
+  feature_bitset_t& operator-=(const feature_bitset_t& other);
+  /// True when no bit is set, even if zero-valued blocks are still stored.
+  bool empty() const {
+    //block_type is a uint64_t. If the vector is only composed of 0s, then it's still "empty"
+    for (auto& v : _vec) {
+      if (v)
+	return false;
+    }
+    return true;
+  }
+  /// True when `bit` is set; bits past the stored blocks are unset.
+  bool test(size_t bit) const {
+    if (bit >= bits_per_block * _vec.size())
+      return false;
+    return _vec[bit / bits_per_block] & ((block_type)1 << (bit % bits_per_block));
+  }
+  /// Set `bit`, growing the storage if it lies past the last block.
+  void insert(size_t bit) {
+    size_t n = bit / bits_per_block;
+    if (n >= _vec.size())
+      _vec.resize(n + 1);
+    _vec[n] |= ((block_type)1 << (bit % bits_per_block));
+  }
+  /// Clear `bit`; a no-op when it lies past the stored blocks.
+  void erase(size_t bit) {
+    size_t n = bit / bits_per_block;
+    if (n >= _vec.size())
+      return;
+    _vec[n] &= ~((block_type)1 << (bit % bits_per_block));
+    // Only a change to the last block can leave trailing zero blocks.
+    if (n + 1 == _vec.size()) {
+      while (!_vec.empty() && _vec.back() == 0)
+	_vec.pop_back();
+    }
+  }
+  void clear() {
+    _vec.clear();
+  }
+  /// Block-wise comparison of the underlying vectors.
+  bool operator==(const feature_bitset_t& other) const {
+    return _vec == other._vec;
+  }
+  bool operator!=(const feature_bitset_t& other) const {
+    return _vec != other._vec;
+  }
+  void encode(ceph::buffer::list& bl) const;
+  void decode(ceph::buffer::list::const_iterator &p);
+  void dump(ceph::Formatter *f) const;
+  void print(std::ostream& out) const;
+private:
+  std::vector<block_type> _vec;
+};
+WRITE_CLASS_ENCODER(feature_bitset_t)
+
+// Stream insertion for feature_bitset_t: delegates to its print() member.
+inline std::ostream& operator<<(std::ostream& out, const feature_bitset_t& s) {
+  s.print(out);
+  return out;
+}
+
+// Wrapper around the set of metric flags a client advertises.
+struct metric_spec_t {
+  metric_spec_t() = default;
+  // Copy and move simply copy or move the single metric_flags member.
+  metric_spec_t(const metric_spec_t& other) = default;
+  metric_spec_t(metric_spec_t&& other) = default;
+  // Construct directly from a set of metric flags.
+  metric_spec_t(const feature_bitset_t& mf) :
+    metric_flags(mf) {}
+  metric_spec_t(feature_bitset_t&& mf) :
+    metric_flags(std::move(mf)) {}
+
+  metric_spec_t& operator=(const metric_spec_t& other) = default;
+  metric_spec_t& operator=(metric_spec_t&& other) = default;
+
+  // True when no metric flag is set.
+  bool empty() const {
+    const bool no_flags = metric_flags.empty();
+    return no_flags;
+  }
+
+  // Drop all metric flags.
+  void clear() {
+    metric_flags.clear();
+  }
+
+  void encode(ceph::buffer::list& bl) const;
+  void decode(ceph::buffer::list::const_iterator& p);
+  void dump(ceph::Formatter *f) const;
+  void print(std::ostream& out) const;
+
+  // set of metrics that a client is capable of forwarding
+  feature_bitset_t metric_flags;
+};
+WRITE_CLASS_ENCODER(metric_spec_t)
+
+// Stream insertion for metric_spec_t: delegates to its print() member.
+inline std::ostream& operator<<(std::ostream& out, const metric_spec_t& mst) {
+  mst.print(out);
+  return out;
+}
+
+/*
+ * client_metadata_t
+ *
+ * A string key/value map plus the client's feature bits and metric spec.
+ */
+struct client_metadata_t {
+  using kv_map_t = std::map<std::string,std::string>;
+  using iterator = kv_map_t::const_iterator;
+
+  client_metadata_t() {}
+  client_metadata_t(const kv_map_t& kv, const feature_bitset_t &f, const metric_spec_t &mst) :
+    kv_map(kv),
+    features(f),
+    metric_spec(mst) {}
+  // Member-wise copy of kv_map, features and metric_spec, in that order.
+  client_metadata_t& operator=(const client_metadata_t& other) = default;
+
+  // True only when the map, the features and the metric spec are all empty.
+  bool empty() const {
+    if (!kv_map.empty()) {
+      return false;
+    }
+    if (!features.empty()) {
+      return false;
+    }
+    return metric_spec.empty();
+  }
+  iterator find(const std::string& key) const { return kv_map.find(key); }
+  iterator begin() const { return kv_map.begin(); }
+  iterator end() const { return kv_map.end(); }
+  void erase(iterator it) { kv_map.erase(it); }
+  std::string& operator[](const std::string& key) { return kv_map[key]; }
+  // Insert other's key/value pairs (keys already present keep their current
+  // value) and replace features and metric_spec with other's.
+  void merge(const client_metadata_t& other) {
+    const auto first = other.kv_map.begin();
+    const auto last = other.kv_map.end();
+    kv_map.insert(first, last);
+    features = other.features;
+    metric_spec = other.metric_spec;
+  }
+  // Reset all three members to empty.
+  void clear() {
+    kv_map.clear();
+    features.clear();
+    metric_spec.clear();
+  }
+
+  void encode(ceph::buffer::list& bl) const;
+  void decode(ceph::buffer::list::const_iterator& p);
+  void dump(ceph::Formatter *f) const;
+
+  kv_map_t kv_map;
+  feature_bitset_t features;
+  metric_spec_t metric_spec;
+};
+WRITE_CLASS_ENCODER(client_metadata_t)
+
+/*
+ * session_info_t - durable part of a Session
+ */
+struct session_info_t {
+  // Client id built from the numeric part of the session's entity name.
+  client_t get_client() const { return client_t(inst.name.num()); }
+  // Whether feature bit `bit` is set in the client's metadata.
+  bool has_feature(size_t bit) const { return client_metadata.features.test(bit); }
+  const entity_name_t& get_source() const { return inst.name; }
+
+  // Drop preallocated inos, completed requests/flushes and client metadata.
+  // inst and auth_name are left untouched.
+  void clear_meta() {
+    prealloc_inos.clear();
+    completed_requests.clear();
+    completed_flushes.clear();
+    client_metadata.clear();
+  }
+
+  void encode(ceph::buffer::list& bl, uint64_t features) const;
+  void decode(ceph::buffer::list::const_iterator& p);
+  void dump(ceph::Formatter *f) const;
+  static void generate_test_instances(std::list<session_info_t*>& ls);
+
+  entity_inst_t inst;
+  std::map<ceph_tid_t,inodeno_t> completed_requests;
+  interval_set<inodeno_t> prealloc_inos;   // preallocated, ready to use.
+  client_metadata_t client_metadata;
+  std::set<ceph_tid_t> completed_flushes;
+  EntityName auth_name;
+};
+WRITE_CLASS_ENCODER_FEATURES(session_info_t)
+
+// dentries
+// Key identifying a dentry by (snapid, name), plus a hash used for ordering.
+//
+// `name` is a std::string_view, i.e. a non-owning view: the characters it
+// refers to must stay alive for as long as the key is used.
+struct dentry_key_t {
+  dentry_key_t() {}
+  dentry_key_t(snapid_t s, std::string_view n, __u32 h=0) :
+    snapid(s), name(n), hash(h) {}
+
+  // A key is valid when it has a non-empty name or a non-zero snapid.
+  // const-qualified so it can be called through const references.
+  bool is_valid() const { return name.length() || snapid; }
+
+  // encode into something that can be decoded as a string.
+  // name_head (snapid == CEPH_NOSNAP) or name_%x (hex snapid otherwise)
+  void encode(ceph::buffer::list& bl) const {
+    std::string key;
+    encode(key);  // member overload below; ceph::encode is not yet in scope
+    using ceph::encode;
+    encode(key, bl);
+  }
+  // Build the string form of the key into `key`, replacing its contents.
+  void encode(std::string& key) const {
+    char b[20];  // enough for 16 hex digits of a 64-bit value plus NUL
+    if (snapid != CEPH_NOSNAP) {
+      uint64_t val(snapid);
+      snprintf(b, sizeof(b), "%" PRIx64, val);
+    } else {
+      snprintf(b, sizeof(b), "%s", "head");
+    }
+    CachedStackStringStream css;
+    *css << name << "_" << b;
+    key = css->strv();
+  }
+  // Decode a string key from `bl` and split it into name and snapid.
+  static void decode_helper(ceph::buffer::list::const_iterator& bl, std::string& nm,
+			    snapid_t& sn) {
+    std::string key;
+    using ceph::decode;
+    decode(key, bl);
+    decode_helper(key, nm, sn);
+  }
+  // Split "<name>_<suffix>" at the LAST underscore (names may themselves
+  // contain '_').  Asserts that an underscore is present.
+  static void decode_helper(std::string_view key, std::string& nm, snapid_t& sn) {
+    size_t i = key.find_last_of('_');
+    ceph_assert(i != std::string::npos);
+    if (key.compare(i+1, std::string_view::npos, "head") == 0) {
+      // name_head
+      sn = CEPH_NOSNAP;
+    } else {
+      // name_%x
+      // if the suffix does not parse as hex, x keeps its initial value 0
+      long long unsigned x = 0;
+      std::string x_str(key.substr(i+1));
+      sscanf(x_str.c_str(), "%llx", &x);
+      sn = x;
+    }
+    nm = key.substr(0, i);
+  }
+
+  snapid_t snapid = 0;
+  std::string_view name;
+  __u32 hash = 0;
+};
+
+// Print a dentry key as "(name,snapid)"; the hash is not shown.
+inline std::ostream& operator<<(std::ostream& out, const dentry_key_t &k)
+{
+  return out << "(" << k.name << "," << k.snapid << ")";
+}
+
+// Strict ordering of dentry keys: first by ceph_frag_value() of the hash,
+// then by name, and finally by snapid.
+inline bool operator<(const dentry_key_t& k1, const dentry_key_t& k2)
+{
+  const int hash_order = ceph_frag_value(k1.hash) - ceph_frag_value(k2.hash);
+  if (hash_order != 0) {
+    return hash_order < 0;
+  }
+
+  const int name_order = k1.name.compare(k2.name);
+  if (name_order != 0) {
+    return name_order < 0;
+  }
+
+  return k1.snapid < k2.snapid;
+}
+
+/*
+ * string_snap_t is a simple (string, snapid_t) pair
+ */
+struct string_snap_t {
+  string_snap_t() {}
+  // Copies the characters of `n` into the owned std::string member.
+  string_snap_t(std::string_view n, snapid_t s) : name(n), snapid(s) {}
+
+  void encode(ceph::buffer::list& bl) const;
+  void decode(ceph::buffer::list::const_iterator& p);
+  void dump(ceph::Formatter *f) const;
+  static void generate_test_instances(std::list<string_snap_t*>& ls);
+
+  std::string name;
+  snapid_t snapid;
+};
+WRITE_CLASS_ENCODER(string_snap_t)
+
+// Order by name first; equal names are ordered by snapid.
+inline bool operator<(const string_snap_t& l, const string_snap_t& r) {
+  const int name_order = l.name.compare(r.name);
+  if (name_order != 0) {
+    return name_order < 0;
+  }
+  return l.snapid < r.snapid;
+}
+
+// Print as "(name,snapid)".
+inline std::ostream& operator<<(std::ostream& out, const string_snap_t &k)
+{
+  return out << "(" << k.name << "," << k.snapid << ")";
+}
+
+/*
+ * mds_table_pending_t
+ *
+ * For mds's requesting any pending ops, child needs to encode the corresponding
+ * pending mutation state in the table.
+ */
+struct mds_table_pending_t {
+  void encode(ceph::buffer::list& bl) const;
+  void decode(ceph::buffer::list::const_iterator& bl);
+  void dump(ceph::Formatter *f) const;
+  static void generate_test_instances(std::list<mds_table_pending_t*>& ls);
+
+  // All fields default to zero.
+  uint64_t reqid = 0;
+  __s32 mds = 0;      // NOTE(review): presumably the rank of the requesting mds -- confirm
+  version_t tid = 0;
+};
+WRITE_CLASS_ENCODER(mds_table_pending_t)
+
+// requests
+// Request identifier: an entity name paired with a transaction id.
+struct metareqid_t {
+  metareqid_t() {}
+  metareqid_t(entity_name_t n, ceph_tid_t t) : name(n), tid(t) {}
+  // Encoded without a version envelope: name first, then tid.
+  void encode(ceph::buffer::list& bl) const {
+    using ceph::encode;
+    encode(name, bl);
+    encode(tid, bl);
+  }
+  // Must read fields in the same order encode() writes them.
+  void decode(ceph::buffer::list::const_iterator &p) {
+    using ceph::decode;
+    decode(name, p);
+    decode(tid, p);
+  }
+
+  entity_name_t name;
+  uint64_t tid = 0;
+};
+WRITE_CLASS_ENCODER(metareqid_t)
+
+// Print as "name:tid".
+inline std::ostream& operator<<(std::ostream& out, const metareqid_t& r) {
+  return out << r.name << ":" << r.tid;
+}
+
+// Equality: both the entity name and the tid must match.
+inline bool operator==(const metareqid_t& l, const metareqid_t& r) {
+  if (!(l.name == r.name)) {
+    return false;
+  }
+  return l.tid == r.tid;
+}
+// Inequality: either the entity name or the tid differs.
+inline bool operator!=(const metareqid_t& l, const metareqid_t& r) {
+  if (l.name != r.name) {
+    return true;
+  }
+  return l.tid != r.tid;
+}
+// Lexicographic order on (name, tid).
+inline bool operator<(const metareqid_t& l, const metareqid_t& r) {
+  if (l.name < r.name) {
+    return true;
+  }
+  return l.name == r.name && l.tid < r.tid;
+}
+// Lexicographic order on (name, tid), allowing equality.
+inline bool operator<=(const metareqid_t& l, const metareqid_t& r) {
+  if (l.name < r.name) {
+    return true;
+  }
+  return l.name == r.name && l.tid <= r.tid;
+}
+// The remaining two are defined as negations of the ones above.
+inline bool operator>(const metareqid_t& l, const metareqid_t& r) {
+  return !(l <= r);
+}
+inline bool operator>=(const metareqid_t& l, const metareqid_t& r) {
+  return !(l < r);
+}
+
+namespace std {
+  // Hash support so metareqid_t can be used as a key in unordered containers.
+  // The hashes of the name's number, the name's type and the tid are XORed.
+  template<> struct hash<metareqid_t> {
+    size_t operator()(const metareqid_t &r) const {
+      hash<uint64_t> hash_u64;
+      const size_t num_part = hash_u64(r.name.num());
+      const size_t type_part = hash_u64(r.name.type());
+      const size_t tid_part = hash_u64(r.tid);
+      return num_part ^ type_part ^ tid_part;
+    }
+  };
+} // namespace std
+
+// cap info for client reconnect
+struct cap_reconnect_t {
+  cap_reconnect_t() {}
+  // Fill capinfo from the scalar arguments and take over the lock buffer.
+  // NOTE: `lb` is a non-const reference and is moved from; the caller's
+  // buffer list is left in a moved-from state.  capinfo.flock_len is set to 0
+  // here regardless of the contents of `lb`.
+  cap_reconnect_t(uint64_t cap_id, inodeno_t pino, std::string_view p, int w, int i,
+		  inodeno_t sr, snapid_t sf, ceph::buffer::list& lb) :
+    path(p) {
+    capinfo.cap_id = cap_id;
+    capinfo.wanted = w;
+    capinfo.issued = i;
+    capinfo.snaprealm = sr;
+    capinfo.pathbase = pino;
+    capinfo.flock_len = 0;
+    snap_follows = sf;
+    flockbl = std::move(lb);
+  }
+  void encode(ceph::buffer::list& bl) const;
+  void decode(ceph::buffer::list::const_iterator& bl);
+  void encode_old(ceph::buffer::list& bl) const;
+  void decode_old(ceph::buffer::list::const_iterator& bl);
+
+  void dump(ceph::Formatter *f) const;
+  static void generate_test_instances(std::list<cap_reconnect_t*>& ls);
+
+  std::string path;
+  // mutable: may be modified from const member functions; zero-initialized.
+  mutable ceph_mds_cap_reconnect capinfo = {};
+  snapid_t snap_follows = 0;
+  ceph::buffer::list flockbl;
+};
+WRITE_CLASS_ENCODER(cap_reconnect_t)
+
+// Thin wrapper around a ceph_mds_snaprealm_reconnect record.
+struct snaprealm_reconnect_t {
+  snaprealm_reconnect_t() {}
+  // Populate the realm record's ino, seq and parent fields.
+  snaprealm_reconnect_t(inodeno_t ino, snapid_t seq, inodeno_t parent) {
+    realm.ino = ino;
+    realm.seq = seq;
+    realm.parent = parent;
+  }
+  void encode(ceph::buffer::list& bl) const;
+  void decode(ceph::buffer::list::const_iterator& bl);
+  void encode_old(ceph::buffer::list& bl) const;
+  void decode_old(ceph::buffer::list::const_iterator& bl);
+
+  void dump(ceph::Formatter *f) const;
+  static void generate_test_instances(std::list<snaprealm_reconnect_t*>& ls);
+
+  // mutable: may be modified from const member functions; zero-initialized.
+  mutable ceph_mds_snaprealm_reconnect realm = {};
+};
+WRITE_CLASS_ENCODER(snaprealm_reconnect_t)
+
+// compat for pre-FLOCK feature
+// Packed record that is encoded raw (see WRITE_RAW_ENCODER below), so the
+// order and types of the members must not change.
+struct old_ceph_mds_cap_reconnect {
+	ceph_le64 cap_id;
+	ceph_le32 wanted;
+	ceph_le32 issued;
+  ceph_le64 old_size;
+  struct ceph_timespec old_mtime, old_atime;
+	ceph_le64 snaprealm;
+	ceph_le64 pathbase;        /* base ino for our path to this ino */
+} __attribute__ ((packed));
+WRITE_RAW_ENCODER(old_ceph_mds_cap_reconnect)
+
+// Older form of cap_reconnect_t: a path followed by the raw
+// old_ceph_mds_cap_reconnect record.  Convertible to and from cap_reconnect_t
+// for the fields both formats share.
+struct old_cap_reconnect_t {
+  // Copy the fields shared with cap_reconnect_t.  old_size, old_mtime and
+  // old_atime have no counterpart in `n` and are not assigned here.
+  const old_cap_reconnect_t& operator=(const cap_reconnect_t& n) {
+    path = n.path;
+    capinfo.cap_id = n.capinfo.cap_id;
+    capinfo.wanted = n.capinfo.wanted;
+    capinfo.issued = n.capinfo.issued;
+    capinfo.snaprealm = n.capinfo.snaprealm;
+    capinfo.pathbase = n.capinfo.pathbase;
+    return *this;
+  }
+  // Build a cap_reconnect_t carrying the shared fields; everything else in
+  // the result keeps its default value.  Does not modify *this, hence const.
+  operator cap_reconnect_t() const {
+    cap_reconnect_t n;
+    n.path = path;
+    n.capinfo.cap_id = capinfo.cap_id;
+    n.capinfo.wanted = capinfo.wanted;
+    n.capinfo.issued = capinfo.issued;
+    n.capinfo.snaprealm = capinfo.snaprealm;
+    n.capinfo.pathbase = capinfo.pathbase;
+    return n;
+  }
+
+  // Wire order: path, then the raw capinfo record.
+  void encode(ceph::buffer::list& bl) const {
+    using ceph::encode;
+    encode(path, bl);
+    encode(capinfo, bl);
+  }
+  void decode(ceph::buffer::list::const_iterator& bl) {
+    using ceph::decode;
+    decode(path, bl);
+    decode(capinfo, bl);
+  }
+
+  std::string path;
+  // Zero-initialized (like cap_reconnect_t::capinfo) so that the fields
+  // operator= never assigns are not indeterminate when the record is encoded.
+  old_ceph_mds_cap_reconnect capinfo = {};
+};
+WRITE_CLASS_ENCODER(old_cap_reconnect_t)
+
+// dir frag
+// A directory fragment identifier: an inode number plus a frag_t.
+struct dirfrag_t {
+  dirfrag_t() {}
+  dirfrag_t(inodeno_t i, frag_t f) : ino(i), frag(f) { }
+
+  // Encoded without a version envelope: ino first, then frag.
+  void encode(ceph::buffer::list& bl) const {
+    using ceph::encode;
+    encode(ino, bl);
+    encode(frag, bl);
+  }
+  // Must read fields in the same order encode() writes them.
+  void decode(ceph::buffer::list::const_iterator& bl) {
+    using ceph::decode;
+    decode(ino, bl);
+    decode(frag, bl);
+  }
+
+  inodeno_t ino = 0;
+  frag_t frag;
+};
+WRITE_CLASS_ENCODER(dirfrag_t)
+
+// Print the ino, followed by ".<frag>" only when the frag is not the root frag.
+inline std::ostream& operator<<(std::ostream& out, const dirfrag_t &df) {
+  out << df.ino;
+  if (!df.frag.is_root()) out << "." << df.frag;
+  return out;
+}
+// Lexicographic order on (ino, frag).
+inline bool operator<(dirfrag_t l, dirfrag_t r) {
+  return l.ino < r.ino ||
+    (l.ino == r.ino && l.frag < r.frag);
+}
+// Two dirfrags are equal when both ino and frag match.
+inline bool operator==(dirfrag_t l, dirfrag_t r) {
+  return l.ino == r.ino && l.frag == r.frag;
+}
+
+namespace std {
+  // Hash support so dirfrag_t can be used as a key in unordered containers:
+  // the rjhash of the ino XORed with the rjhash of the frag.
+  template<> struct hash<dirfrag_t> {
+    size_t operator()(const dirfrag_t &df) const { 
+      // function-local statics: constructed once and shared by all calls
+      static rjhash<uint64_t> H;
+      static rjhash<uint32_t> I;
+      return H(df.ino) ^ I(df.frag);
+    }
+  };
+} // namespace std
+
+// ================================================================
+// Indices into dirfrag_load_vec_t's counter array (see meta_load() there).
+// META_NPOP is the number of indices defined here.
+#define META_POP_IRD     0
+#define META_POP_IWR     1
+#define META_POP_READDIR 2
+#define META_POP_FETCH   3
+#define META_POP_STORE   4
+#define META_NPOP        5
+
+// Fixed-size vector of NUM (2) DecayCounters.
+class inode_load_vec_t {
+public:
+  using time = DecayCounter::time;
+  using clock = DecayCounter::clock;
+  static const size_t NUM = 2;
+
+  // Both counters use a default-constructed DecayRate ...
+  inode_load_vec_t() : vec{DecayCounter(DecayRate()), DecayCounter(DecayRate())} {}
+  // ... or the given rate.
+  inode_load_vec_t(const DecayRate &rate) : vec{DecayCounter(rate), DecayCounter(rate)} {}
+
+  // Access counter t.  No bounds check: t must be in [0, NUM).
+  DecayCounter &get(int t) {
+    return vec[t];
+  }
+  // Reset every counter.
+  void zero() {
+    for (auto &d : vec) {
+      d.reset();
+    }
+  }
+  void encode(ceph::buffer::list &bl) const;
+  void decode(ceph::buffer::list::const_iterator& p);
+  void dump(ceph::Formatter *f) const;
+  static void generate_test_instances(std::list<inode_load_vec_t*>& ls);
+
+private:
+  std::array<DecayCounter, NUM> vec;
+};
+// Free-function encode/decode wrappers forwarding to the member functions.
+inline void encode(const inode_load_vec_t &c, ceph::buffer::list &bl) {
+  c.encode(bl);
+}
+inline void decode(inode_load_vec_t & c, ceph::buffer::list::const_iterator &p) {
+  c.decode(p);
+}
+
+// Fixed-size vector of NUM (5) DecayCounters, indexed by the META_POP_*
+// constants.
+class dirfrag_load_vec_t {
+public:
+  using time = DecayCounter::time;
+  using clock = DecayCounter::clock;
+  static const size_t NUM = 5;
+
+  // All counters use a default-constructed DecayRate ...
+  dirfrag_load_vec_t() :
+      vec{DecayCounter(DecayRate()),
+          DecayCounter(DecayRate()),
+          DecayCounter(DecayRate()),
+          DecayCounter(DecayRate()),
+          DecayCounter(DecayRate())
+         }
+  {}
+  // ... or the given rate.
+  dirfrag_load_vec_t(const DecayRate &rate) :
+      vec{DecayCounter(rate), DecayCounter(rate), DecayCounter(rate), DecayCounter(rate), DecayCounter(rate)}
+  {}
+
+  // Encode the counters in index order.
+  void encode(ceph::buffer::list &bl) const {
+    ENCODE_START(2, 2, bl);
+    for (const auto &i : vec) {
+      encode(i, bl);
+    }
+    ENCODE_FINISH(bl);
+  }
+  // Decode the counters in the same order encode() writes them.
+  void decode(ceph::buffer::list::const_iterator &p) {
+    DECODE_START_LEGACY_COMPAT_LEN(2, 2, 2, p);
+    for (auto &i : vec) {
+      decode(i, p);
+    }
+    DECODE_FINISH(p);
+  }
+  void dump(ceph::Formatter *f) const;
+  void dump(ceph::Formatter *f, const DecayRate& rate) const;
+  static void generate_test_instances(std::list<dirfrag_load_vec_t*>& ls);
+
+  // Access counter t.  No bounds check: t must be in [0, NUM).
+  const DecayCounter &get(int t) const {
+    return vec[t];
+  }
+  DecayCounter &get(int t) {
+    return vec[t];
+  }
+  // Apply adjust(d) to every counter.
+  void adjust(double d) {
+    for (auto &i : vec) {
+      i.adjust(d);
+    }
+  }
+  // Reset every counter.
+  void zero() {
+    for (auto &i : vec) {
+      i.reset();
+    }
+  }
+  // Weighted sum of the counters: IRD*1 + IWR*2 + READDIR*1 + FETCH*2 + STORE*4.
+  double meta_load() const {
+    return
+      1*vec[META_POP_IRD].get() +
+      2*vec[META_POP_IWR].get() +
+      1*vec[META_POP_READDIR].get() +
+      2*vec[META_POP_FETCH].get() +
+      4*vec[META_POP_STORE].get();
+  }
+
+  // Element-wise: adjust each counter by the current value of r's counter.
+  // `r` is only read, so it is taken by const reference.
+  void add(const dirfrag_load_vec_t& r) {
+    for (size_t i=0; i<dirfrag_load_vec_t::NUM; i++)
+      vec[i].adjust(r.vec[i].get());
+  }
+  // Element-wise: adjust each counter by the negated value of r's counter.
+  void sub(const dirfrag_load_vec_t& r) {
+    for (size_t i=0; i<dirfrag_load_vec_t::NUM; i++)
+      vec[i].adjust(-r.vec[i].get());
+  }
+  // Apply scale(f) to every counter.
+  void scale(double f) {
+    for (size_t i=0; i<dirfrag_load_vec_t::NUM; i++)
+      vec[i].scale(f);
+  }
+
+private:
+  friend inline std::ostream& operator<<(std::ostream& out, const dirfrag_load_vec_t& dl);
+  std::array<DecayCounter, NUM> vec;
+};
+
+// Free-function encode/decode wrappers forwarding to the member functions.
+inline void encode(const dirfrag_load_vec_t &c, ceph::buffer::list &bl) {
+  c.encode(bl);
+}
+inline void decode(dirfrag_load_vec_t& c, ceph::buffer::list::const_iterator &p) {
+  c.decode(p);
+}
+
+// Print the five counters and the weighted meta_load() with one decimal
+// place, e.g. "[pop IRD:… IWR:… RDR:… FET:… STR:… *LOAD:…]".
+//
+// The text is formatted into a scratch stream first so the precision/fixed
+// manipulators do not leak into the caller's stream state.  The trailing
+// newline is kept so the emitted bytes are unchanged, but it is written as
+// '\n' rather than std::endl so that printing a load vector does not force a
+// flush of the destination stream.
+inline std::ostream& operator<<(std::ostream& out, const dirfrag_load_vec_t& dl)
+{
+  CachedStackStringStream css;
+  *css << std::setprecision(1) << std::fixed
+     << "[pop"
+        " IRD:" << dl.vec[0]
+     << " IWR:" << dl.vec[1]
+     << " RDR:" << dl.vec[2]
+     << " FET:" << dl.vec[3]
+     << " STR:" << dl.vec[4]
+     << " *LOAD:" << dl.meta_load() << "]";
+  return out << css->strv() << '\n';
+}
+
+// Load summary: two dirfrag load vectors (auth and all) plus a few scalar
+// rates.
+struct mds_load_t {
+  using clock = dirfrag_load_vec_t::clock;
+  using time = dirfrag_load_vec_t::time;
+
+  dirfrag_load_vec_t auth;
+  dirfrag_load_vec_t all;
+
+  // Both vectors use a default-constructed DecayRate, or the given rate.
+  mds_load_t() : auth(DecayRate()), all(DecayRate()) {}
+  mds_load_t(const DecayRate &rate) : auth(rate), all(rate) {}
+
+  double req_rate = 0.0;
+  double cache_hit_rate = 0.0;
+  double queue_len = 0.0;
+
+  double cpu_load_avg = 0.0;
+
+  double mds_load() const;  // defined in MDBalancer.cc
+  void encode(ceph::buffer::list& bl) const;
+  void decode(ceph::buffer::list::const_iterator& bl);
+  void dump(ceph::Formatter *f) const;
+  static void generate_test_instances(std::list<mds_load_t*>& ls);
+};
+// Free-function encode/decode wrappers forwarding to the member functions.
+inline void encode(const mds_load_t &c, ceph::buffer::list &bl) {
+  c.encode(bl);
+}
+inline void decode(mds_load_t &c, ceph::buffer::list::const_iterator &p) {
+  c.decode(p);
+}
+
+// Print as "mdsload<auth/all, req R, hr H, qlen Q, cpu C>", where auth and
+// all are printed with dirfrag_load_vec_t's own stream inserter.
+inline std::ostream& operator<<(std::ostream& out, const mds_load_t& load)
+{
+  return out << "mdsload<" << load.auth << "/" << load.all
+             << ", req " << load.req_rate 
+             << ", hr " << load.cache_hit_rate
+             << ", qlen " << load.queue_len
+	     << ", cpu " << load.cpu_load_avg
+             << ">";
+}
+
+// Remembers up to MAX distinct recent ids in a small ring and feeds a
+// DecayCounter each time an id that is not in the ring shows up.
+class load_spread_t {
+public:
+  using time = DecayCounter::time;
+  using clock = DecayCounter::clock;
+  static const int MAX = 4;
+
+  load_spread_t(const DecayRate &rate) : count(rate)
+  {}
+
+  // A decay rate is mandatory.
+  load_spread_t() = delete;
+
+  // Record an access by `who`.
+  //  - If `who` is among the n remembered ids, nothing is recorded and
+  //    count.get_last() is returned.
+  //  - Otherwise `who` is written at ring position p (overwriting the oldest
+  //    entry once the ring is full) and n grows up to MAX.  The very first
+  //    id only seeds the ring and returns 0.0; any later new id returns
+  //    count.hit().
+  double hit(int who) {
+    for (int i=0; i<n; i++)
+      if (last[i] == who) 
+	return count.get_last();
+
+    // we're new(ish)
+    last[p++] = who;
+    if (n < MAX) n++;
+    if (n == 1) return 0.0;
+
+    // wrap the write position
+    if (p == MAX) p = 0;
+
+    return count.hit();
+  }
+  double get() const {
+    return count.get();
+  }
+
+  std::array<int, MAX> last = {-1, -1, -1, -1};  // remembered ids; -1 = unused slot
+  int p = 0, n = 0;                              // next write position, number of valid slots
+  DecayCounter count;
+};
+
+// ================================================================
+// A pair of MDS ranks.
+// NOTE(review): presumably (primary auth, secondary auth) -- confirm against users.
+typedef std::pair<mds_rank_t, mds_rank_t> mds_authority_t;
+
+// -- authority delegation --
+// directory authority types
+//  >= 0 is the auth mds
+#define CDIR_AUTH_PARENT   mds_rank_t(-1)   // default
+#define CDIR_AUTH_UNKNOWN  mds_rank_t(-2)
+// (PARENT, UNKNOWN)
+#define CDIR_AUTH_DEFAULT  mds_authority_t(CDIR_AUTH_PARENT, CDIR_AUTH_UNKNOWN)
+// (UNKNOWN, UNKNOWN)
+#define CDIR_AUTH_UNDEF    mds_authority_t(CDIR_AUTH_UNKNOWN, CDIR_AUTH_UNKNOWN)
+//#define CDIR_AUTH_ROOTINODE pair<int,int>( 0, -2)
+
+// Identifies a cache object.  Judging by operator<< and operator== below, a
+// non-zero `ino` means the object is identified by (ino, snapid); otherwise
+// it is identified by `dirfrag`, optionally with a dentry name.
+class MDSCacheObjectInfo {
+public:
+  void encode(ceph::buffer::list& bl) const;
+  void decode(ceph::buffer::list::const_iterator& bl);
+  void dump(ceph::Formatter *f) const;
+  static void generate_test_instances(std::list<MDSCacheObjectInfo*>& ls);
+
+  inodeno_t ino = 0;
+  dirfrag_t dirfrag;
+  std::string dname;
+  snapid_t snapid;
+};
+
+// Print "ino.snapid" when ino is set; otherwise the dirfrag, followed by
+// "/dname snap snapid" when a dentry name is present.
+inline std::ostream& operator<<(std::ostream& out, const MDSCacheObjectInfo &info) {
+  if (info.ino) {
+    return out << info.ino << "." << info.snapid;
+  }
+
+  out << info.dirfrag;
+  if (info.dname.length()) {
+    out << "/" << info.dname << " snap " << info.snapid;
+  }
+  return out;
+}
+
+// If either side has an ino, compare (ino, snapid); otherwise compare
+// (dirfrag, dname).  Note that snapid is not compared in the second case.
+inline bool operator==(const MDSCacheObjectInfo& l, const MDSCacheObjectInfo& r) {
+  const bool by_ino = l.ino || r.ino;
+  if (by_ino) {
+    return l.ino == r.ino && l.snapid == r.snapid;
+  }
+  return l.dirfrag == r.dirfrag && l.dname == r.dname;
+}
+WRITE_CLASS_ENCODER(MDSCacheObjectInfo)
+
+// parse a map of keys/values.
+namespace qi = boost::spirit::qi;
+
+// Boost.Spirit Qi grammar that parses "key=value key=value ..." (pairs
+// separated by a single space) into a std::map<std::string, std::string>.
+template <typename Iterator>
+struct keys_and_values
+  : qi::grammar<Iterator, std::map<std::string, std::string>()>
+{
+    keys_and_values()
+      : keys_and_values::base_type(query)
+    {
+      // one pair, then any number of " pair"
+      query =  pair >> *(qi::lit(' ') >> pair);
+      pair  =  key >> '=' >> value;
+      // key: a letter or '_' followed by letters, digits or '_'
+      key   =  qi::char_("a-zA-Z_") >> *qi::char_("a-zA-Z_0-9");
+      // value: one or more characters from the set below.
+      // NOTE(review): the '-' after "0-9" sits between '9' and '_' and may be
+      // read as a range rather than a literal '-'; confirm against the Qi
+      // char_ set syntax.
+      value = +qi::char_("a-zA-Z0-9-_.");
+    }
+  qi::rule<Iterator, std::map<std::string, std::string>()> query;
+  qi::rule<Iterator, std::pair<std::string, std::string>()> pair;
+  qi::rule<Iterator, std::string()> key, value;
+};
+
+#endif
diff --git a/src/mds/snap.cc b/src/mds/snap.cc
new file mode 100644
index 000000000..f988ca513
--- /dev/null
+++ b/src/mds/snap.cc
@@ -0,0 +1,228 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004- Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software 
+ * Foundation.  See file COPYING.
+ * 
+ */
+
+#include <string_view>
+
+#include "snap.h"
+
+#include "common/Formatter.h"
+
+/*
+ * SnapInfo
+ */
+
+// Serialize snapid, ino, stamp, name and metadata, in that order, inside an
+// ENCODE_START(3, 2, ...) envelope.  long_name is a cache and is not encoded.
+void SnapInfo::encode(bufferlist& bl) const
+{
+  ENCODE_START(3, 2, bl);
+  encode(snapid, bl);
+  encode(ino, bl);
+  encode(stamp, bl);
+  encode(name, bl);
+  encode(metadata, bl);
+  ENCODE_FINISH(bl);
+}
+
+// Inverse of encode().  metadata is only present from struct version 3 on;
+// for older encodings the member is left as it was.
+void SnapInfo::decode(bufferlist::const_iterator& bl)
+{
+  DECODE_START_LEGACY_COMPAT_LEN(3, 2, 2, bl);
+  decode(snapid, bl);
+  decode(ino, bl);
+  decode(stamp, bl);
+  decode(name, bl);
+  if (struct_v >= 3) {
+    decode(metadata, bl);
+  }
+  DECODE_FINISH(bl);
+}
+
+// Emit snapid, ino, stamp and name as scalars, followed by a "metadata"
+// object section holding one string entry per metadata key.
+void SnapInfo::dump(Formatter *f) const
+{
+  f->dump_unsigned("snapid", snapid);
+  f->dump_unsigned("ino", ino);
+  f->dump_stream("stamp") << stamp;
+  f->dump_string("name", name);
+  f->open_object_section("metadata");
+  for (auto &[key, value] : metadata) {
+    f->dump_string(key, value);
+  }
+  f->close_section();
+}
+
+// Append two heap-allocated instances to `ls`: a default-constructed one and
+// a populated one.  The pointers are owned by whoever consumes the list.
+void SnapInfo::generate_test_instances(std::list<SnapInfo*>& ls)
+{
+  ls.push_back(new SnapInfo);
+
+  ls.push_back(new SnapInfo);
+  SnapInfo *populated = ls.back();
+  populated->snapid = 1;
+  populated->ino = 2;
+  populated->stamp = utime_t(3, 4);
+  populated->name = "foo";
+  populated->metadata = {{"foo", "bar"}};
+}
+
+// Print as "snap(<snapid> <ino> '<name>' <stamp>)"; metadata is not shown.
+ostream& operator<<(ostream& out, const SnapInfo &sn)
+{
+  return out << "snap(" << sn.snapid
+	     << " " << sn.ino
+	     << " '" << sn.name
+	     << "' " << sn.stamp << ")";
+}
+
+// Return the long form of the snapshot name, "_<name>_<ino>", caching it in
+// the mutable long_name member.
+//
+// The cache is rebuilt when it is empty, when it does not start with
+// "_<name>", or when its last underscore is not the one right after the name.
+// The string is assembled with std::string so names of any length are
+// preserved; a fixed-size buffer would truncate long names, and a truncated
+// value would then fail the cache check on every call.
+//
+// The returned view refers to long_name and is invalidated by the next call
+// that rebuilds the cache.
+std::string_view SnapInfo::get_long_name() const
+{
+  if (long_name.empty() ||
+      long_name.compare(1, name.size(), name) ||
+      long_name.find_last_of("_") != name.size() + 1) {
+    long_name = "_" + name + "_" +
+      std::to_string(static_cast<unsigned long long>(ino));
+  }
+  return long_name;
+}
+
+/*
+ * snaplink_t
+ */
+
+// Serialize ino then first inside an ENCODE_START(2, 2, ...) envelope.
+void snaplink_t::encode(bufferlist& bl) const
+{
+  ENCODE_START(2, 2, bl);
+  encode(ino, bl);
+  encode(first, bl);
+  ENCODE_FINISH(bl);
+}
+
+// Inverse of encode(): reads ino then first.
+void snaplink_t::decode(bufferlist::const_iterator& bl)
+{
+  DECODE_START_LEGACY_COMPAT_LEN(2, 2, 2, bl);
+  decode(ino, bl);
+  decode(first, bl);
+  DECODE_FINISH(bl);
+}
+
+// Emit both fields as unsigned values.
+void snaplink_t::dump(Formatter *f) const
+{
+  f->dump_unsigned("ino", ino);
+  f->dump_unsigned("first", first);
+}
+
+// Append two heap-allocated instances to `ls`: a default-constructed one and
+// one with ino = 2, first = 123.
+void snaplink_t::generate_test_instances(std::list<snaplink_t*>& ls)
+{
+  ls.push_back(new snaplink_t);
+
+  ls.push_back(new snaplink_t);
+  snaplink_t *populated = ls.back();
+  populated->ino = 2;
+  populated->first = 123;
+}
+
+// Print as "<ino>@<first>".
+ostream& operator<<(ostream& out, const snaplink_t &l)
+{
+  return out << l.ino << "@" << l.first;
+}
+
+/*
+ * sr_t
+ */
+
+// Serialize all fields inside an ENCODE_START(6, 4, ...) envelope.  The field
+// order here is the wire format and must match decode().
+void sr_t::encode(bufferlist& bl) const
+{
+  ENCODE_START(6, 4, bl);
+  encode(seq, bl);
+  encode(created, bl);
+  encode(last_created, bl);
+  encode(last_destroyed, bl);
+  encode(current_parent_since, bl);
+  encode(snaps, bl);
+  encode(past_parents, bl);
+  encode(past_parent_snaps, bl);
+  encode(flags, bl);
+  ENCODE_FINISH(bl);
+}
+
+// Inverse of encode(), tolerant of older encodings:
+//  - struct version 2 carries one extra leading byte, which is read and
+//    discarded;
+//  - past_parent_snaps is only present from version 5 on;
+//  - flags is only present from version 6 on and is reset to 0 otherwise.
+void sr_t::decode(bufferlist::const_iterator& p)
+{
+  DECODE_START_LEGACY_COMPAT_LEN(6, 4, 4, p);
+  if (struct_v == 2) {
+    // this local intentionally shadows the outer struct_v: the byte is
+    // consumed and thrown away, the outer version stays in effect below
+    __u8 struct_v;
+    decode(struct_v, p);  // yes, really: extra byte for v2 encoding only, see 6ee52e7d.
+  }
+  decode(seq, p);
+  decode(created, p);
+  decode(last_created, p);
+  decode(last_destroyed, p);
+  decode(current_parent_since, p);
+  decode(snaps, p);
+  decode(past_parents, p);
+  if (struct_v >= 5)
+    decode(past_parent_snaps, p);
+  if (struct_v >= 6)
+    decode(flags, p);
+  else
+    flags = 0;
+  DECODE_FINISH(p);
+}
+
+// Emit the scalar sequence fields, then three array sections: "snaps" and
+// "past_parents" (one object per map entry, with the map key dumped as
+// "last") and "past_parent_snaps" (one object per snapid).  flags is not
+// dumped.
+void sr_t::dump(Formatter *f) const
+{
+  f->dump_unsigned("seq", seq);
+  f->dump_unsigned("created", created);
+  f->dump_unsigned("last_created", last_created);
+  f->dump_unsigned("last_destroyed", last_destroyed);
+  f->dump_unsigned("current_parent_since", current_parent_since);
+
+  f->open_array_section("snaps");
+  for (const auto &[last, info] : snaps) {
+    f->open_object_section("snapinfo");
+    f->dump_unsigned("last", last);
+    info.dump(f);
+    f->close_section();
+  }
+  f->close_section();
+
+  f->open_array_section("past_parents");
+  for (const auto &[last, link] : past_parents) {
+    f->open_object_section("past_parent");
+    f->dump_unsigned("last", last);
+    link.dump(f);
+    f->close_section();
+  }
+  f->close_section();
+
+  f->open_array_section("past_parent_snaps");
+  for (const auto &parent_snap : past_parent_snaps) {
+    f->open_object_section("snapinfo");
+    f->dump_unsigned("snapid", parent_snap);
+    f->close_section();
+  }
+  f->close_section();
+}
+
+// Append two heap-allocated instances to `ls`: a default-constructed one and
+// one with every sequence field, one snap, one past parent and two past
+// parent snaps filled in.
+void sr_t::generate_test_instances(std::list<sr_t*>& ls)
+{
+  ls.push_back(new sr_t);
+
+  ls.push_back(new sr_t);
+  sr_t *populated = ls.back();
+  populated->seq = 1;
+  populated->created = 2;
+  populated->last_created = 3;
+  populated->last_destroyed = 4;
+  populated->current_parent_since = 5;
+
+  SnapInfo &snap = populated->snaps[123];
+  snap.snapid = 7;
+  snap.ino = 8;
+  snap.stamp = utime_t(9, 10);
+  snap.name = "name1";
+
+  snaplink_t &parent = populated->past_parents[12];
+  parent.ino = 12;
+  parent.first = 3;
+
+  populated->past_parent_snaps.insert(5);
+  populated->past_parent_snaps.insert(6);
+}
+
diff --git a/src/mds/snap.h b/src/mds/snap.h
new file mode 100644
index 000000000..f8b5b701a
--- /dev/null
+++ b/src/mds/snap.h
@@ -0,0 +1,106 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software 
+ * Foundation.  See file COPYING.
+ * 
+ */
+
+#ifndef CEPH_MDS_SNAP_H
+#define CEPH_MDS_SNAP_H
+
+#include <map>
+#include <string_view>
+
+#include "mdstypes.h"
+#include "common/snap_types.h"
+
+#include "Capability.h"
+
+/*
+ * generic snap descriptor.
+ */
+struct SnapInfo {
+  void encode(ceph::buffer::list &bl) const;
+  void decode(ceph::buffer::list::const_iterator &bl);
+  void dump(ceph::Formatter *f) const;
+  static void generate_test_instances(std::list<SnapInfo*>& ls);
+
+  // Returns a view into the cached long_name (rebuilt on demand).
+  std::string_view get_long_name() const;
+
+  snapid_t snapid;
+  inodeno_t ino;
+  utime_t stamp;
+  std::string name;
+
+  mutable std::string long_name; ///< cached "_<name>_<ino>", see get_long_name()
+  std::map<std::string,std::string> metadata;
+};
+WRITE_CLASS_ENCODER(SnapInfo)
+
+// Compares snapid, ino, stamp and name only; metadata and the cached
+// long_name do not take part in the comparison.
+inline bool operator==(const SnapInfo &l, const SnapInfo &r)
+{
+  return l.snapid == r.snapid && l.ino == r.ino &&
+	 l.stamp == r.stamp && l.name == r.name;
+}
+
+std::ostream& operator<<(std::ostream& out, const SnapInfo &sn);
+
+/*
+ * SnapRealm - a subtree that shares the same set of snapshots.
+ */
+struct SnapRealm;
+
+// An (ino, first snapid) pair; used as the value type of sr_t::past_parents.
+struct snaplink_t {
+  void encode(ceph::buffer::list &bl) const;
+  void decode(ceph::buffer::list::const_iterator &bl);
+  void dump(ceph::Formatter *f) const;
+  static void generate_test_instances(std::list<snaplink_t*>& ls);
+
+  inodeno_t ino;
+  snapid_t first;
+};
+WRITE_CLASS_ENCODER(snaplink_t)
+
+std::ostream& operator<<(std::ostream& out, const snaplink_t &l);
+
+// carry data about a specific version of a SnapRealm
+struct sr_t {
+  // Set, clear and test the PARENT_GLOBAL bit in flags.
+  void mark_parent_global() { flags |= PARENT_GLOBAL; }
+  void clear_parent_global() { flags &= ~PARENT_GLOBAL; }
+  bool is_parent_global() const { return flags & PARENT_GLOBAL; }
+
+  // Set, clear and test the SUBVOLUME bit in flags.
+  void mark_subvolume() { flags |= SUBVOLUME; }
+  void clear_subvolume() { flags &= ~SUBVOLUME; }
+  bool is_subvolume() const { return flags & SUBVOLUME; }
+
+  void encode(ceph::buffer::list &bl) const;
+  void decode(ceph::buffer::list::const_iterator &bl);
+  void dump(ceph::Formatter *f) const;
+  static void generate_test_instances(std::list<sr_t*>& ls);
+
+  snapid_t seq = 0;                     // basically, a version/seq # for changes to _this_ realm.
+  snapid_t created = 0;                 // when this realm was created.
+  snapid_t last_created = 0;            // last snap created in _this_ realm.
+  snapid_t last_destroyed = 0;          // seq for last removal
+  snapid_t current_parent_since = 1;
+  std::map<snapid_t, SnapInfo> snaps;
+  std::map<snapid_t, snaplink_t> past_parents;  // key is "last" (or NOSNAP)
+  std::set<snapid_t> past_parent_snaps;
+
+  // Bitmask of the enumerators below.
+  __u32 flags = 0;
+  enum {
+    PARENT_GLOBAL	= 1 << 0,
+    SUBVOLUME		= 1 << 1,
+  };
+};
+WRITE_CLASS_ENCODER(sr_t)
+
+class MDCache;
+#endif
author	Daniel Baumann <daniel.baumann@progress-linux.org>	2024-04-07 18:45:59 +0000
committer	Daniel Baumann <daniel.baumann@progress-linux.org>	2024-04-07 18:45:59 +0000
commit	19fcec84d8d7d21e796c7624e521b60d28ee21ed (patch)
tree	42d26aa27d1e3f7c0b8bd3fd14e7d7082f5008dc /src/mds
parent	Initial commit. (diff)
download	ceph-upstream.tar.xz ceph-upstream.zip