summaryrefslogtreecommitdiffstats
path: root/src/mds
diff options
context:
space:
mode:
authorDaniel Baumann <daniel.baumann@progress-linux.org>2024-04-27 18:24:20 +0000
committerDaniel Baumann <daniel.baumann@progress-linux.org>2024-04-27 18:24:20 +0000
commit483eb2f56657e8e7f419ab1a4fab8dce9ade8609 (patch)
treee5d88d25d870d5dedacb6bbdbe2a966086a0a5cf /src/mds
parentInitial commit. (diff)
downloadceph-483eb2f56657e8e7f419ab1a4fab8dce9ade8609.tar.xz
ceph-483eb2f56657e8e7f419ab1a4fab8dce9ade8609.zip
Adding upstream version 14.2.21.upstream/14.2.21upstream
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'src/mds')
-rw-r--r--src/mds/Anchor.cc60
-rw-r--r--src/mds/Anchor.h73
-rw-r--r--src/mds/Beacon.cc484
-rw-r--r--src/mds/Beacon.h114
-rw-r--r--src/mds/CDentry.cc630
-rw-r--r--src/mds/CDentry.h381
-rwxr-xr-xsrc/mds/CDir.cc3520
-rw-r--r--src/mds/CDir.h782
-rw-r--r--src/mds/CInode.cc4959
-rw-r--r--src/mds/CInode.h1227
-rw-r--r--src/mds/CMakeLists.txt47
-rw-r--r--src/mds/Capability.cc299
-rw-r--r--src/mds/Capability.h406
-rw-r--r--src/mds/DamageTable.cc280
-rw-r--r--src/mds/DamageTable.h207
-rw-r--r--src/mds/FSMap.cc1029
-rw-r--r--src/mds/FSMap.h532
-rw-r--r--src/mds/FSMapUser.cc81
-rw-r--r--src/mds/FSMapUser.h65
-rw-r--r--src/mds/InoTable.cc235
-rw-r--r--src/mds/InoTable.h103
-rw-r--r--src/mds/JournalPointer.cc122
-rw-r--r--src/mds/JournalPointer.h87
-rw-r--r--src/mds/LocalLock.h65
-rw-r--r--src/mds/Locker.cc5479
-rw-r--r--src/mds/Locker.h291
-rw-r--r--src/mds/LogEvent.cc209
-rw-r--r--src/mds/LogEvent.h132
-rw-r--r--src/mds/LogSegment.h98
-rw-r--r--src/mds/MDBalancer.cc1456
-rw-r--r--src/mds/MDBalancer.h161
-rw-r--r--src/mds/MDCache.cc13084
-rw-r--r--src/mds/MDCache.h1363
-rw-r--r--src/mds/MDLog.cc1530
-rw-r--r--src/mds/MDLog.h337
-rw-r--r--src/mds/MDSAuthCaps.cc434
-rw-r--r--src/mds/MDSAuthCaps.h191
-rw-r--r--src/mds/MDSCacheObject.cc71
-rw-r--r--src/mds/MDSCacheObject.h415
-rw-r--r--src/mds/MDSContext.cc140
-rw-r--r--src/mds/MDSContext.h212
-rw-r--r--src/mds/MDSContinuation.h33
-rw-r--r--src/mds/MDSDaemon.cc1268
-rw-r--r--src/mds/MDSDaemon.h176
-rw-r--r--src/mds/MDSMap.cc930
-rw-r--r--src/mds/MDSMap.h686
-rw-r--r--src/mds/MDSRank.cc3824
-rw-r--r--src/mds/MDSRank.h673
-rw-r--r--src/mds/MDSTable.cc201
-rw-r--r--src/mds/MDSTable.h95
-rw-r--r--src/mds/MDSTableClient.cc264
-rw-r--r--src/mds/MDSTableClient.h103
-rw-r--r--src/mds/MDSTableServer.cc373
-rw-r--r--src/mds/MDSTableServer.h126
-rw-r--r--src/mds/Mantle.cc139
-rw-r--r--src/mds/Mantle.h40
-rw-r--r--src/mds/Migrator.cc3611
-rw-r--r--src/mds/Migrator.h376
-rw-r--r--src/mds/Mutation.cc473
-rw-r--r--src/mds/Mutation.h432
-rw-r--r--src/mds/OpenFileTable.cc1189
-rw-r--r--src/mds/OpenFileTable.h151
-rw-r--r--src/mds/PurgeQueue.cc776
-rw-r--r--src/mds/PurgeQueue.h228
-rw-r--r--src/mds/RecoveryQueue.cc237
-rw-r--r--src/mds/RecoveryQueue.h55
-rw-r--r--src/mds/ScatterLock.h255
-rw-r--r--src/mds/ScrubHeader.h68
-rw-r--r--src/mds/ScrubStack.cc755
-rw-r--r--src/mds/ScrubStack.h306
-rw-r--r--src/mds/Server.cc10206
-rw-r--r--src/mds/Server.h384
-rw-r--r--src/mds/SessionMap.cc1226
-rw-r--r--src/mds/SessionMap.h838
-rw-r--r--src/mds/SimpleLock.cc43
-rw-r--r--src/mds/SimpleLock.h720
-rw-r--r--src/mds/SnapClient.cc316
-rw-r--r--src/mds/SnapClient.h114
-rw-r--r--src/mds/SnapRealm.cc726
-rw-r--r--src/mds/SnapRealm.h166
-rw-r--r--src/mds/SnapServer.cc476
-rw-r--r--src/mds/SnapServer.h149
-rw-r--r--src/mds/StrayManager.cc759
-rw-r--r--src/mds/StrayManager.h197
-rw-r--r--src/mds/balancers/greedyspill.lua49
-rw-r--r--src/mds/cephfs_features.h51
-rw-r--r--src/mds/events/ECommitted.h43
-rw-r--r--src/mds/events/EExport.h58
-rw-r--r--src/mds/events/EFragment.h81
-rw-r--r--src/mds/events/EImportFinish.h53
-rw-r--r--src/mds/events/EImportStart.h61
-rw-r--r--src/mds/events/EMetaBlob.h600
-rw-r--r--src/mds/events/ENoOp.h35
-rw-r--r--src/mds/events/EOpen.h61
-rw-r--r--src/mds/events/EResetJournal.h39
-rw-r--r--src/mds/events/ESession.h67
-rw-r--r--src/mds/events/ESessions.h61
-rw-r--r--src/mds/events/ESlaveUpdate.h157
-rw-r--r--src/mds/events/ESubtreeMap.h48
-rw-r--r--src/mds/events/ETableClient.h49
-rw-r--r--src/mds/events/ETableServer.h59
-rw-r--r--src/mds/events/EUpdate.h55
-rw-r--r--src/mds/flock.cc596
-rw-r--r--src/mds/flock.h290
-rw-r--r--src/mds/inode_backtrace.cc163
-rw-r--r--src/mds/inode_backtrace.h97
-rw-r--r--src/mds/journal.cc3170
-rw-r--r--src/mds/locks.c162
-rw-r--r--src/mds/locks.h126
-rw-r--r--src/mds/mds_table_types.h81
-rw-r--r--src/mds/mdstypes.cc895
-rw-r--r--src/mds/mdstypes.h1821
-rw-r--r--src/mds/snap.cc218
-rw-r--r--src/mds/snap.h115
114 files changed, 84915 insertions, 0 deletions
diff --git a/src/mds/Anchor.cc b/src/mds/Anchor.cc
new file mode 100644
index 00000000..02cc2d2b
--- /dev/null
+++ b/src/mds/Anchor.cc
@@ -0,0 +1,60 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2018 Red Hat
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#include "mds/Anchor.h"
+
+#include "common/Formatter.h"
+
+void Anchor::encode(bufferlist &bl) const
+{ // Serialize this anchor into bl; decode() reads the fields back in the same order.
+ ENCODE_START(1, 1, bl); // opens the envelope that ENCODE_FINISH closes
+ encode(ino, bl);
+ encode(dirino, bl);
+ encode(d_name, bl);
+ encode(d_type, bl); // last field written: omap_idx is never encoded here
+ ENCODE_FINISH(bl);
+}
+
+void Anchor::decode(bufferlist::const_iterator &bl)
+{ // Restore the four fields from bl, in the order encode() wrote them.
+ DECODE_START(1, bl); // opens the envelope that DECODE_FINISH closes
+ decode(ino, bl);
+ decode(dirino, bl);
+ decode(d_name, bl);
+ decode(d_type, bl); // omap_idx is not part of the encoding and is left untouched
+ DECODE_FINISH(bl);
+}
+
+void Anchor::dump(Formatter *f) const
+{ // Emit the four encoded fields to the formatter; omap_idx is not dumped.
+ f->dump_unsigned("ino", ino);
+ f->dump_unsigned("dirino", dirino);
+ f->dump_string("d_name", d_name);
+ f->dump_unsigned("d_type", d_type); // d_type is emitted as an unsigned number
+}
+
+// Append two heap-allocated sample anchors to ls: a default-constructed one
+// followed by one whose ino, dirino, d_name and d_type are all set.
+// NOTE(review): the raw pointers are presumably owned by the caller -- confirm.
+void Anchor::generate_test_instances(list<Anchor*>& ls)
+{
+  ls.push_back(new Anchor);
+
+  ls.push_back(new Anchor);
+  Anchor *populated = ls.back();
+  populated->ino = 1;
+  populated->dirino = 2;
+  populated->d_name = "hello";
+  populated->d_type = DT_DIR;
+}
+
+// Stream an anchor in the form: a(<ino> <dirino>/'<d_name>' <d_type>)
+ostream& operator<<(ostream& out, const Anchor &a)
+{
+  out << "a(" << a.ino << " " << a.dirino;
+  out << "/'" << a.d_name << "' " << a.d_type;
+  out << ")";
+  return out;
+}
diff --git a/src/mds/Anchor.h b/src/mds/Anchor.h
new file mode 100644
index 00000000..49b592b9
--- /dev/null
+++ b/src/mds/Anchor.h
@@ -0,0 +1,73 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2018 Red Hat
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#ifndef CEPH_ANCHOR_H
+#define CEPH_ANCHOR_H
+
+#include <string>
+
+#include "include/types.h"
+#include "mdstypes.h"
+#include "include/buffer.h"
+
+/*
+ * Anchor represents primary linkage of an inode. When adding inode to an
+ * anchor table, MDS ensures that the table also contains inode's ancestor
+ * inodes. MDS can get inode's path by looking up anchor table recursively.
+ */
+class Anchor {
+public:
+ inodeno_t ino; // anchored ino
+ inodeno_t dirino; // NOTE(review): presumably the inode of the directory holding the primary dentry -- confirm
+ std::string d_name; // NOTE(review): presumably the dentry name under dirino -- confirm
+ __u8 d_type = 0; // NOTE(review): presumably a DT_* dentry type value (the test instance uses DT_DIR) -- confirm
+
+ int omap_idx = -1; // stored in which omap object; not encoded, dumped or compared by operator==
+
+ Anchor() {} // d_type and omap_idx take their in-class defaults
+ Anchor(inodeno_t i, inodeno_t di, std::string_view str, __u8 tp) :
+ ino(i), dirino(di), d_name(str), d_type(tp) {} // str is copied into d_name; omap_idx keeps its default of -1
+
+ void encode(bufferlist &bl) const; // writes ino, dirino, d_name, d_type
+ void decode(bufferlist::const_iterator &bl); // reads the same four fields back
+ void dump(Formatter *f) const; // emits the same four fields to a Formatter
+ static void generate_test_instances(list<Anchor*>& ls); // appends sample instances to ls
+};
+WRITE_CLASS_ENCODER(Anchor)
+
+// Two anchors are equal when ino, dirino, d_name and d_type all match;
+// omap_idx does not take part in the comparison.
+inline bool operator==(const Anchor &l, const Anchor &r) {
+  if (!(l.ino == r.ino))
+    return false;
+  if (!(l.dirino == r.dirino))
+    return false;
+  if (!(l.d_name == r.d_name))
+    return false;
+  return l.d_type == r.d_type;
+}
+
+ostream& operator<<(ostream& out, const Anchor &a);
+
+class RecoveredAnchor : public Anchor { // an Anchor extended with one extra field, auth
+public:
+ RecoveredAnchor() {} // base Anchor is default-constructed; auth keeps MDS_RANK_NONE
+
+ mds_rank_t auth = MDS_RANK_NONE; // auth hint
+};
+
+class OpenedAnchor : public Anchor { // an Anchor extended with a child counter, nref
+public:
+ OpenedAnchor(inodeno_t i, inodeno_t di, std::string_view str, __u8 tp, int nr) :
+ Anchor(i, di, str, tp), // the first four arguments are forwarded unchanged to the Anchor constructor
+ nref(nr) // overrides the in-class default of 0
+ {}
+
+ mutable int nref = 0; // how many children; mutable, so it can be changed through a const OpenedAnchor
+};
+
+#endif
diff --git a/src/mds/Beacon.cc b/src/mds/Beacon.cc
new file mode 100644
index 00000000..b66550bd
--- /dev/null
+++ b/src/mds/Beacon.cc
@@ -0,0 +1,484 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2012 Red Hat
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+
+#include "common/dout.h"
+#include "common/HeartbeatMap.h"
+
+#include "include/stringify.h"
+#include "include/util.h"
+
+#include "mon/MonClient.h"
+#include "mds/MDLog.h"
+#include "mds/MDSRank.h"
+#include "mds/MDSMap.h"
+#include "mds/Locker.h"
+
+#include "Beacon.h"
+
+#include <chrono>
+
+#define dout_context g_ceph_context
+#define dout_subsys ceph_subsys_mds
+#undef dout_prefix
+#define dout_prefix *_dout << "mds.beacon." << name << ' '
+
+using namespace std::chrono_literals;
+
+Beacon::Beacon(CephContext *cct, MonClient *monc, std::string_view name)
+ :
+ Dispatcher(cct),
+ beacon_interval(g_conf()->mds_beacon_interval), // read from configuration once, at construction
+ monc(monc), // kept as a raw pointer; NOTE(review): presumably owned by the caller -- confirm
+ name(name) // copied into the std::string member
+{ // No thread is started here; the sender thread is created by init().
+}
+
+Beacon::~Beacon()
+{
+ shutdown(); // joins the sender thread if still running; does nothing when shutdown() already ran
+}
+
+// Stop the beacon: set the finished flag and join the sender thread.
+// Only the first call does any work; later calls return immediately.
+// The sender thread sleeps on a condition variable that nobody notifies
+// (see init()), so the join may last until its current timed wait expires.
+void Beacon::shutdown()
+{
+  std::unique_lock<std::mutex> lock(mutex);
+  if (finished)
+    return;
+
+  finished = true;
+  // Drop the mutex before joining: the sender thread needs to re-acquire it
+  // to observe the flag and exit its loop.
+  lock.unlock();
+  if (sender.joinable())
+    sender.join();
+}
+
+void Beacon::init(const MDSMap &mdsmap)
+{ // Record the initial MDSMap epoch/compat, then start the background sender thread.
+ std::unique_lock lock(mutex);
+
+ _notify_mdsmap(mdsmap);
+
+ sender = std::thread([this]() { // the thread body runs with mutex held except while waiting
+ std::unique_lock<std::mutex> lock(mutex);
+ std::condition_variable c; // no one wakes us
+ while (!finished) { // finished is set by shutdown()
+ auto now = clock::now();
+ auto since = std::chrono::duration<double>(now-last_send).count(); // seconds since the last beacon was sent
+ auto interval = beacon_interval;
+ if (since >= interval*.90) { // at least 90% of the interval has elapsed: send now
+ if (!_send()) {
+ interval = 0.5; /* 500ms: retry sooner because _send() skipped this beacon */
+ }
+ } else {
+ interval -= since; // sleep only for the remainder of the interval
+ }
+ dout(20) << "sender thread waiting interval " << interval << "s" << dendl;
+ c.wait_for(lock, interval*1s); // releases mutex while waiting; c is local, so nothing in this file notifies it
+ }
+ });
+}
+
+bool Beacon::ms_can_fast_dispatch2(const Message::const_ref& m) const
+{ // Only MSG_MDS_BEACON messages are accepted for fast dispatch.
+ return m->get_type() == MSG_MDS_BEACON;
+}
+
+void Beacon::ms_fast_dispatch2(const Message::ref& m)
+{ // Fast-dispatch entry point: forwards to ms_dispatch2().
+ bool handled = ms_dispatch2(m);
+ ceph_assert(handled); // ms_dispatch2() returns false only for message types other than MSG_MDS_BEACON
+}
+
+// Handle an incoming message. MSG_MDS_BEACON messages are always claimed
+// (return true) but are only processed when the peer is a monitor; any
+// other message type is declined (return false).
+bool Beacon::ms_dispatch2(const Message::ref& m)
+{
+  if (m->get_type() != MSG_MDS_BEACON)
+    return false;
+
+  const bool from_mon =
+    m->get_connection()->get_peer_type() == CEPH_ENTITY_TYPE_MON;
+  if (from_mon)
+    handle_mds_beacon(MMDSBeacon::msgref_cast(m));
+
+  return true;
+}
+
+
+/**
+ * Update lagginess state based on a beacon reply from the monitor.
+ *
+ * A reply whose sequence number is still tracked in seq_stamp records the
+ * send time of that beacon as last_acked_stamp, may clear the laggy flag,
+ * drops that entry and all earlier ones from seq_stamp, and wakes anyone
+ * waiting on cvar (see send_and_wait()). A reply with an untracked sequence
+ * number is logged and otherwise ignored.
+ */
+void Beacon::handle_mds_beacon(const MMDSBeacon::const_ref &m)
+{
+  std::unique_lock lock(mutex);
+
+  const version_t seq = m->get_seq();
+  auto acked = seq_stamp.find(seq);
+  if (acked == seq_stamp.end()) {
+    dout(1) << "discarding unexpected beacon reply " << ceph_mds_state_name(m->get_state())
+            << " seq " << m->get_seq() << " dne" << dendl;
+    return;
+  }
+
+  // Remember when the beacon that has just been acked was sent.
+  auto now = clock::now();
+  last_acked_stamp = acked->second;
+  auto rtt = std::chrono::duration<double>(now - last_acked_stamp).count();
+
+  dout(5) << "received beacon reply " << ceph_mds_state_name(m->get_state()) << " seq " << m->get_seq() << " rtt " << rtt << dendl;
+
+  if (laggy && rtt < g_conf()->mds_beacon_grace) {
+    dout(0) << " MDS is no longer laggy" << dendl;
+    laggy = false;
+    last_laggy = now;
+  }
+
+  // This entry and every earlier one no longer need tracking.
+  seq_stamp.erase(seq_stamp.begin(), ++acked);
+
+  // Wake a waiter up if present
+  cvar.notify_all();
+}
+
+
+void Beacon::send()
+{ // Public entry point: take the beacon mutex and attempt to send one beacon.
+ std::unique_lock lock(mutex);
+ _send(); // the result is ignored: a beacon skipped by _send() is not reported to the caller
+}
+
+
+// Attempt to send a beacon, then block until every tracked beacon with a
+// sequence number up to the current last_seq has been acked, or until 95%
+// of `duration` seconds have elapsed, whichever comes first.
+void Beacon::send_and_wait(const double duration)
+{
+  std::unique_lock lock(mutex);
+  _send();
+  auto awaiting_seq = last_seq;
+  dout(20) << __func__ << ": awaiting " << awaiting_seq
+           << " for up to " << duration << "s" << dendl;
+
+  const auto start = clock::now();
+  // Acked entries are erased from seq_stamp by handle_mds_beacon(), which
+  // also notifies cvar.
+  while (!seq_stamp.empty() && seq_stamp.begin()->first <= awaiting_seq) {
+    const auto elapsed = std::chrono::duration<double>(clock::now()-start).count();
+    const auto remaining = duration*.95-elapsed;
+    if (remaining < 0)
+      break;
+    cvar.wait_for(lock, remaining*1s);
+  }
+}
+
+
+/**
+ * Call periodically, or when you have updated the desired state
+ *
+ * Builds one MMDSBeacon from the state cached in this object and hands it
+ * to the MonClient. Every caller in this file holds `mutex` while calling.
+ *
+ * Returns true if a beacon was sent, false if it was skipped because the
+ * heartbeat map reports unhealthy (last_seq, seq_stamp and last_send are
+ * then left untouched).
+ */
+bool Beacon::_send()
+{
+ auto now = clock::now();
+ auto since = std::chrono::duration<double>(now-last_acked_stamp).count(); // seconds since the last acked beacon was sent; only used in the log line below
+
+ if (!cct->get_heartbeat_map()->is_healthy()) {
+ /* If anything isn't progressing, avoid sending a beacon so that
+ * the monitors will consider us laggy */
+ dout(0) << "Skipping beacon heartbeat to monitors (last acked " << since << "s ago); MDS internal heartbeat is not healthy!" << dendl;
+ return false;
+ }
+
+ ++last_seq;
+ dout(5) << "Sending beacon " << ceph_mds_state_name(want_state) << " seq " << last_seq << dendl;
+
+ seq_stamp[last_seq] = now; // remembered until handle_mds_beacon() sees the ack for this or a later seq
+
+ ceph_assert(want_state != MDSMap::STATE_NULL);
+
+ auto beacon = MMDSBeacon::create(
+ monc->get_fsid(), mds_gid_t(monc->get_global_id()),
+ name,
+ epoch,
+ want_state,
+ last_seq,
+ CEPH_FEATURES_SUPPORTED_DEFAULT);
+
+ beacon->set_health(health); // health as last rebuilt by notify_health()
+ beacon->set_compat(compat); // compat as last rebuilt by _notify_mdsmap()
+ // piggyback the sys info on beacon msg
+ if (want_state == MDSMap::STATE_BOOT) {
+ map<string, string> sys_info;
+ collect_sys_info(&sys_info, cct);
+ sys_info["addr"] = stringify(monc->get_myaddrs());
+ beacon->set_sys_info(sys_info);
+ }
+ monc->send_mon_message(beacon.detach());
+ last_send = now; // read by the sender thread in init() to schedule the next beacon
+ return true;
+}
+
+/**
+ * Call this when there is a new MDSMap available
+ *
+ * Locking wrapper: takes `mutex` and forwards to _notify_mdsmap().
+ */
+void Beacon::notify_mdsmap(const MDSMap &mdsmap)
+{
+ std::unique_lock lock(mutex);
+
+ _notify_mdsmap(mdsmap);
+}
+
+// Adopt the epoch of a newer MDSMap and rebuild the cached CompatSet as the
+// default set merged with the map's. The epoch must never go backwards; a
+// map with an unchanged epoch leaves the cached state alone. Callers in
+// this file hold `mutex`.
+void Beacon::_notify_mdsmap(const MDSMap &mdsmap)
+{
+  ceph_assert(mdsmap.get_epoch() >= epoch);
+
+  if (mdsmap.get_epoch() == epoch)
+    return;
+
+  epoch = mdsmap.get_epoch();
+  compat = MDSMap::get_compat_set_default();
+  compat.merge(mdsmap.compat);
+}
+
+
+// Report whether more than mds_beacon_grace seconds have passed since the
+// send time of the last acked beacon. Reporting true also sets the laggy
+// flag (logging once on the transition); the flag is cleared again only in
+// handle_mds_beacon().
+bool Beacon::is_laggy()
+{
+  std::unique_lock lock(mutex);
+
+  const auto now = clock::now();
+  const auto since = std::chrono::duration<double>(now-last_acked_stamp).count();
+  if (!(since > g_conf()->mds_beacon_grace))
+    return false;
+
+  if (!laggy) {
+    dout(1) << "MDS connection to Monitors appears to be laggy; " << since
+            << "s since last acked beacon" << dendl;
+  }
+  laggy = true;
+  return true;
+}
+
+// Record the state this daemon wants to be in, together with the epoch of
+// the MDSMap passed alongside it.
+void Beacon::set_want_state(const MDSMap &mdsmap, MDSMap::DaemonState newstate)
+{
+  std::unique_lock lock(mutex);
+
+  // The epoch and want_state are updated under a single lock hold, so that
+  // a beacon carrying the new want_state also carries the latest epoch, and
+  // so that once we are on the latest epoch we never send out a stale
+  // want_state (i.e. one from before making it through MDSMap handling).
+  _notify_mdsmap(mdsmap);
+
+  if (want_state == newstate)
+    return;
+
+  dout(5) << __func__ << ": "
+          << ceph_mds_state_name(want_state) << " -> "
+          << ceph_mds_state_name(newstate) << dendl;
+  want_state = newstate;
+}
+
+
+/**
+ * We are 'shown' an MDS briefly in order to update
+ * some health metrics that we will send in the next
+ * beacon.
+ *
+ * With a non-null mds, the cached metric list is cleared and rebuilt from
+ * scratch on every call. The caller must hold mds->mds_lock (asserted
+ * below); this function additionally takes the beacon's own mutex.
+ */
+void Beacon::notify_health(MDSRank const *mds)
+{
+ std::unique_lock lock(mutex);
+ if (!mds) {
+ // No MDS rank held
+ return;
+ }
+
+ // I'm going to touch this MDS, so it must be locked
+ ceph_assert(mds->mds_lock.is_locked_by_me());
+
+ health.metrics.clear(); // start empty; each section below re-adds whatever currently applies
+
+ // Detect presence of entries in DamageTable
+ if (!mds->damage_table.empty()) {
+ MDSHealthMetric m(MDS_HEALTH_DAMAGE, HEALTH_ERR, std::string(
+ "Metadata damage detected"));
+ health.metrics.push_back(m);
+ }
+
+ // Detect MDS_HEALTH_TRIM condition
+ // Indicates MDS is not trimming promptly: the segment count exceeds mds_log_max_segments * mds_log_warn_factor
+ {
+ if (mds->mdlog->get_num_segments() > (size_t)(g_conf()->mds_log_max_segments * g_conf().get_val<double>("mds_log_warn_factor"))) {
+ std::ostringstream oss;
+ oss << "Behind on trimming (" << mds->mdlog->get_num_segments()
+ << "/" << g_conf()->mds_log_max_segments << ")";
+
+ MDSHealthMetric m(MDS_HEALTH_TRIM, HEALTH_WARN, oss.str());
+ m.metadata["num_segments"] = stringify(mds->mdlog->get_num_segments());
+ m.metadata["max_segments"] = stringify(g_conf()->mds_log_max_segments);
+ health.metrics.push_back(m);
+ }
+ }
+
+ // Detect clients failing to respond to modifications to capabilities in
+ // CLIENT_CAPS messages.
+ {
+ std::list<client_t> late_clients;
+ mds->locker->get_late_revoking_clients(&late_clients,
+ mds->mdsmap->get_session_timeout());
+ std::list<MDSHealthMetric> late_cap_metrics; // one metric per late client, collected before deciding how to report
+
+ for (std::list<client_t>::iterator i = late_clients.begin(); i != late_clients.end(); ++i) {
+
+ // client_t is equivalent to session.info.inst.name.num
+ // Construct an entity_name_t to lookup into SessionMap
+ entity_name_t ename(CEPH_ENTITY_TYPE_CLIENT, i->v);
+ Session const *s = mds->sessionmap.get_session(ename);
+ if (s == NULL) {
+ // Shouldn't happen, but not worth crashing if it does as this is
+ // just health-reporting code.
+ derr << "Client ID without session: " << i->v << dendl;
+ continue;
+ }
+
+ std::ostringstream oss;
+ oss << "Client " << s->get_human_name() << " failing to respond to capability release";
+ MDSHealthMetric m(MDS_HEALTH_CLIENT_LATE_RELEASE, HEALTH_WARN, oss.str());
+ m.metadata["client_id"] = stringify(i->v);
+ late_cap_metrics.push_back(m);
+ }
+
+ if (late_cap_metrics.size() <= (size_t)g_conf()->mds_health_summarize_threshold) { // few enough: report each client individually
+ health.metrics.splice(health.metrics.end(), late_cap_metrics);
+ } else { // too many: replace them with a single summary metric
+ std::ostringstream oss;
+ oss << "Many clients (" << late_cap_metrics.size()
+ << ") failing to respond to capability release";
+ MDSHealthMetric m(MDS_HEALTH_CLIENT_LATE_RELEASE_MANY, HEALTH_WARN, oss.str());
+ m.metadata["client_count"] = stringify(late_cap_metrics.size());
+ health.metrics.push_back(m);
+ late_cap_metrics.clear();
+ }
+ }
+
+ // Detect clients failing to generate cap releases from CEPH_SESSION_RECALL_STATE
+ // messages. May be due to buggy client or resource-hogging application.
+ //
+ // Detect clients failing to advance their old_client_tid
+ {
+ set<Session*> sessions;
+ mds->sessionmap.get_client_session_set(sessions);
+
+ const auto min_caps_working_set = g_conf().get_val<uint64_t>("mds_min_caps_working_set");
+ const auto recall_warning_threshold = g_conf().get_val<Option::size_t>("mds_recall_warning_threshold");
+ const auto max_completed_requests = g_conf()->mds_max_completed_requests;
+ const auto max_completed_flushes = g_conf()->mds_max_completed_flushes;
+ std::list<MDSHealthMetric> late_recall_metrics; // per-client MDS_HEALTH_CLIENT_RECALL metrics
+ std::list<MDSHealthMetric> large_completed_requests_metrics; // per-client MDS_HEALTH_CLIENT_OLDEST_TID metrics
+ for (auto& session : sessions) {
+ const uint64_t num_caps = session->get_num_caps();
+ const uint64_t recall_caps = session->get_recall_caps();
+ if (recall_caps > recall_warning_threshold && num_caps > min_caps_working_set) { // both thresholds must be exceeded
+ dout(2) << "Session " << *session <<
+ " is not releasing caps fast enough. Recalled caps at " << recall_caps
+ << " > " << recall_warning_threshold << " (mds_recall_warning_threshold)." << dendl;
+ std::ostringstream oss;
+ oss << "Client " << session->get_human_name() << " failing to respond to cache pressure";
+ MDSHealthMetric m(MDS_HEALTH_CLIENT_RECALL, HEALTH_WARN, oss.str());
+ m.metadata["client_id"] = stringify(session->get_client());
+ late_recall_metrics.push_back(m);
+ }
+ if ((session->get_num_trim_requests_warnings() > 0 &&
+ session->get_num_completed_requests() >= max_completed_requests) ||
+ (session->get_num_trim_flushes_warnings() > 0 &&
+ session->get_num_completed_flushes() >= max_completed_flushes)) { // either the request side or the flush side qualifies
+ std::ostringstream oss;
+ oss << "Client " << session->get_human_name() << " failing to advance its oldest client/flush tid. ";
+ MDSHealthMetric m(MDS_HEALTH_CLIENT_OLDEST_TID, HEALTH_WARN, oss.str());
+ m.metadata["client_id"] = stringify(session->get_client());
+ large_completed_requests_metrics.push_back(m);
+ }
+ }
+
+ if (late_recall_metrics.size() <= (size_t)g_conf()->mds_health_summarize_threshold) { // few enough: report individually
+ health.metrics.splice(health.metrics.end(), late_recall_metrics);
+ } else { // too many: single summary metric
+ std::ostringstream oss;
+ oss << "Many clients (" << late_recall_metrics.size()
+ << ") failing to respond to cache pressure";
+ MDSHealthMetric m(MDS_HEALTH_CLIENT_RECALL_MANY, HEALTH_WARN, oss.str());
+ m.metadata["client_count"] = stringify(late_recall_metrics.size());
+ health.metrics.push_back(m);
+ late_recall_metrics.clear();
+ }
+
+ if (large_completed_requests_metrics.size() <= (size_t)g_conf()->mds_health_summarize_threshold) { // few enough: report individually
+ health.metrics.splice(health.metrics.end(), large_completed_requests_metrics);
+ } else { // too many: single summary metric
+ std::ostringstream oss;
+ oss << "Many clients (" << large_completed_requests_metrics.size()
+ << ") failing to advance their oldest client/flush tid";
+ MDSHealthMetric m(MDS_HEALTH_CLIENT_OLDEST_TID_MANY, HEALTH_WARN, oss.str());
+ m.metadata["client_count"] = stringify(large_completed_requests_metrics.size());
+ health.metrics.push_back(m);
+ large_completed_requests_metrics.clear();
+ }
+ }
+
+ // Detect MDS_HEALTH_SLOW_REQUEST condition
+ {
+ int slow = mds->get_mds_slow_req_count();
+ if (slow) {
+ dout(20) << slow << " slow request found" << dendl;
+ std::ostringstream oss;
+ oss << slow << " slow requests are blocked > " << g_conf()->mds_op_complaint_time << " secs";
+
+ MDSHealthMetric m(MDS_HEALTH_SLOW_REQUEST, HEALTH_WARN, oss.str());
+ health.metrics.push_back(m);
+ }
+ }
+
+ { // Detect MDS_HEALTH_SLOW_METADATA_IO condition
+ auto complaint_time = g_conf()->osd_op_complaint_time; // note: the OSD complaint time option is the one read here
+ auto now = clock::now();
+ auto cutoff = now - ceph::make_timespan(complaint_time);
+
+ std::string count;
+ ceph::coarse_mono_time oldest;
+ if (MDSIOContextBase::check_ios_in_flight(cutoff, count, oldest)) { // count and oldest are filled in by the call
+ dout(20) << count << " slow metadata IOs found" << dendl;
+
+ auto oldest_secs = std::chrono::duration<double>(now - oldest).count();
+ std::ostringstream oss;
+ oss << count << " slow metadata IOs are blocked > " << complaint_time
+ << " secs, oldest blocked for " << (int64_t)oldest_secs << " secs";
+
+ MDSHealthMetric m(MDS_HEALTH_SLOW_METADATA_IO, HEALTH_WARN, oss.str());
+ health.metrics.push_back(m);
+ }
+ }
+
+ // Report a health warning if we are readonly
+ if (mds->mdcache->is_readonly()) {
+ MDSHealthMetric m(MDS_HEALTH_READ_ONLY, HEALTH_WARN,
+ "MDS in read-only mode");
+ health.metrics.push_back(m);
+ }
+
+ // Report if we have significantly exceeded our cache size limit
+ if (mds->mdcache->cache_overfull()) {
+ std::ostringstream oss;
+ oss << "MDS cache is too large (" << bytes2str(mds->mdcache->cache_size())
+ << "/" << bytes2str(mds->mdcache->cache_limit_memory()) << "); "
+ << mds->mdcache->num_inodes_with_caps << " inodes in use by clients, "
+ << mds->mdcache->get_num_strays() << " stray files";
+
+ MDSHealthMetric m(MDS_HEALTH_CACHE_OVERSIZED, HEALTH_WARN, oss.str());
+ health.metrics.push_back(m);
+ }
+}
+
+MDSMap::DaemonState Beacon::get_want_state() const
+{ // Return a copy of want_state, read under the beacon mutex.
+ std::unique_lock lock(mutex);
+ return want_state;
+}
+
diff --git a/src/mds/Beacon.h b/src/mds/Beacon.h
new file mode 100644
index 00000000..2e84aa6c
--- /dev/null
+++ b/src/mds/Beacon.h
@@ -0,0 +1,114 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2012 Red Hat
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+
+#ifndef BEACON_STATE_H
+#define BEACON_STATE_H
+
+#include <mutex>
+#include <string_view>
+#include <thread>
+
+#include "include/types.h"
+#include "include/Context.h"
+#include "msg/Dispatcher.h"
+
+#include "messages/MMDSBeacon.h"
+
+class MonClient;
+class MDSRank;
+
+
+/**
+ * One of these per MDS. Handle beacon logic in this separate class so
+ * that a busy MDS holding its own lock does not hold up sending beacon
+ * messages to the mon and cause false lagginess.
+ *
+ * So that we can continue to operate while the MDS is holding its own lock,
+ * we keep copies of the data needed to generate beacon messages. The MDS is
+ * responsible for calling Beacon::notify_* when things change.
+ */
+class Beacon : public Dispatcher
+{
+public:
+ using clock = ceph::coarse_mono_clock;
+ using time = ceph::coarse_mono_time;
+
+ Beacon(CephContext *cct, MonClient *monc, std::string_view name);
+ ~Beacon() override; // calls shutdown()
+
+ void init(const MDSMap &mdsmap); // records the map's epoch/compat and starts the sender thread
+ void shutdown(); // sets finished and joins the sender thread; later calls do nothing
+
+ bool ms_can_fast_dispatch_any() const override { return true; }
+ bool ms_can_fast_dispatch2(const Message::const_ref& m) const override; // true only for MSG_MDS_BEACON
+ void ms_fast_dispatch2(const Message::ref& m) override;
+ bool ms_dispatch2(const Message::ref &m) override; // true only for MSG_MDS_BEACON
+ void ms_handle_connect(Connection *c) override {}
+ bool ms_handle_reset(Connection *c) override {return false;}
+ void ms_handle_remote_reset(Connection *c) override {}
+ bool ms_handle_refused(Connection *c) override {return false;}
+
+ void notify_mdsmap(const MDSMap &mdsmap); // locking wrapper around _notify_mdsmap()
+ void notify_health(const MDSRank *mds); // rebuilds `health` from the given rank; a null rank is ignored
+
+ void handle_mds_beacon(const MMDSBeacon::const_ref &m); // processes a beacon reply (ack)
+ void send(); // locking wrapper around _send()
+
+ void set_want_state(const MDSMap &mdsmap, MDSMap::DaemonState newstate);
+ MDSMap::DaemonState get_want_state() const;
+
+ /**
+ * Send a beacon, and block until the ack is received from the mon
+ * or `duration` seconds pass, whichever happens sooner. Useful
+ * for emitting a last message on shutdown.
+ */
+ void send_and_wait(const double duration);
+
+ bool is_laggy(); // not const: sets the laggy flag when it returns true
+ double last_cleared_laggy() const { // seconds elapsed since last_laggy, i.e. since the laggy flag was last cleared
+ std::unique_lock lock(mutex);
+ return std::chrono::duration<double>(clock::now()-last_laggy).count();
+ }
+
+private:
+ void _notify_mdsmap(const MDSMap &mdsmap); // callers hold mutex
+ bool _send(); // callers hold mutex; false when the beacon was skipped
+
+ mutable std::mutex mutex; // mutable so const accessors can lock it
+ std::thread sender; // background thread started by init()
+ std::condition_variable cvar; // notified by handle_mds_beacon(); waited on by send_and_wait()
+ time last_send = clock::zero(); // when _send() last actually sent a beacon
+ double beacon_interval = 5.0; // seconds; overwritten from mds_beacon_interval in the constructor
+ bool finished = false; // set by shutdown(); ends the sender thread's loop
+ MonClient* monc;
+
+ // Items we duplicate from the MDS to have access under our own lock
+ std::string name;
+ version_t epoch = 0; // epoch of the newest MDSMap seen by _notify_mdsmap()
+ CompatSet compat;
+ MDSMap::DaemonState want_state = MDSMap::STATE_BOOT;
+
+ // Internal beacon state
+ version_t last_seq = 0; // last seq sent to monitor
+ std::map<version_t,time> seq_stamp; // seq # -> time sent
+ time last_acked_stamp = clock::zero(); // last time we sent a beacon that got acked
+ bool laggy = false; // set by is_laggy(), cleared by handle_mds_beacon()
+ time last_laggy = clock::zero(); // when the laggy flag was last cleared
+
+ // Health status to be copied into each beacon message
+ MDSHealth health;
+};
+
+#endif // BEACON_STATE_H
diff --git a/src/mds/CDentry.cc b/src/mds/CDentry.cc
new file mode 100644
index 00000000..b2a7db1e
--- /dev/null
+++ b/src/mds/CDentry.cc
@@ -0,0 +1,630 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+
+
+#include "CDentry.h"
+#include "CInode.h"
+#include "CDir.h"
+
+#include "MDSRank.h"
+#include "MDCache.h"
+#include "Locker.h"
+#include "LogSegment.h"
+
+#include "messages/MLock.h"
+
+#define dout_context g_ceph_context
+#define dout_subsys ceph_subsys_mds
+#undef dout_prefix
+#define dout_prefix *_dout << "mds." << dir->cache->mds->get_nodeid() << ".cache.den(" << dir->dirfrag() << " " << name << ") "
+
+
+ostream& CDentry::print_db_line_prefix(ostream& out)
+{ // Write the current time, MDS node id, directory ino and dentry name to out, and return out.
+ return out << ceph_clock_now() << " mds." << dir->cache->mds->get_nodeid() << ".cache.den(" << dir->ino() << " " << name << ") "; // note: uses dir->ino(), whereas the dout_prefix macro above uses dir->dirfrag()
+}
+
+LockType CDentry::lock_type(CEPH_LOCK_DN); // definition of the static member, constructed from CEPH_LOCK_DN
+LockType CDentry::versionlock_type(CEPH_LOCK_DVERSION); // definition of the static member, constructed from CEPH_LOCK_DVERSION
+
+
+// CDentry
+
+// Render a one-line, bracketed debug description of a dentry: path, snap
+// range, auth/replica status, linkage kind, locks that are not sync and
+// unlocked, versions, auth pins, linked inode number, state flags, pins and
+// the object's address.
+ostream& operator<<(ostream& out, const CDentry& dn)
+{
+  filepath path;
+  dn.make_path(path);
+
+  out << "[dentry " << path;
+
+  // The snap range is printed unconditionally.
+  out << " [" << dn.first << ",";
+  if (dn.last == CEPH_NOSNAP)
+    out << "head";
+  else
+    out << dn.last;
+  out << ']';
+
+  if (!dn.is_auth()) {
+    out << " rep@" << dn.authority();
+    out << "." << dn.get_replica_nonce();
+  } else {
+    out << " auth";
+    if (dn.is_replicated())
+      out << dn.get_replicas();
+  }
+
+  if (dn.get_linkage()->is_null())
+    out << " NULL";
+  if (dn.get_linkage()->is_remote()) {
+    out << " REMOTE(";
+    out << dn.get_linkage()->get_remote_d_type_string();
+    out << ")";
+  }
+
+  if (!dn.lock.is_sync_and_unlocked())
+    out << " " << dn.lock;
+  if (!dn.versionlock.is_sync_and_unlocked())
+    out << " " << dn.versionlock;
+
+  // The projected version is shown only when it differs from the version.
+  if (dn.get_projected_version() != dn.get_version())
+    out << " pv=" << dn.get_projected_version();
+  out << " v=" << dn.get_version();
+
+  if (dn.get_num_auth_pins()) {
+    out << " ap=" << dn.get_num_auth_pins();
+#ifdef MDS_AUTHPIN_SET
+    dn.print_authpin_set(out);
+#endif
+  }
+
+  out << " ino=";
+  if (const CInode *linked = dn.get_linkage()->get_inode())
+    out << linked->ino();
+  else
+    out << "(nil)";
+
+  out << " state=" << dn.get_state();
+  if (dn.is_new())
+    out << "|new";
+  if (dn.state_test(CDentry::STATE_BOTTOMLRU))
+    out << "|bottomlru";
+
+  if (dn.get_num_ref()) {
+    out << " |";
+    dn.print_pin_set(out);
+  }
+
+  out << " " << &dn;
+  out << "]";
+  return out;
+}
+
+
+// Order dentries by the ino of their directory, then by name, then by
+// their `last` member.
+bool operator<(const CDentry& l, const CDentry& r)
+{
+  if (l.get_dir()->ino() < r.get_dir()->ino())
+    return true;
+  if (!(l.get_dir()->ino() == r.get_dir()->ino()))
+    return false;
+
+  // same directory ino: compare names
+  if (l.get_name() < r.get_name())
+    return true;
+  if (!(l.get_name() == r.get_name()))
+    return false;
+
+  // same name: fall back to `last`
+  return l.last < r.last;
+}
+
+
+void CDentry::print(ostream& out)
+{ // Member-function form of operator<<(ostream&, const CDentry&).
+ out << *this;
+}
+
+
+/*
+inodeno_t CDentry::get_ino()
+{
+ if (get_inode())
+ return get_inode()->ino();
+ return inodeno_t();
+}
+*/
+
+mds_authority_t CDentry::authority() const
+{ // A dentry reports the authority of its containing directory.
+ return dir->authority();
+}
+
+
+// Register c to be woken for the events in tag. A tag that contains
+// WAIT_UNFREEZE or WAIT_SINGLEAUTH is handed to the containing directory;
+// every other tag is handled by the MDSCacheObject base class.
+void CDentry::add_waiter(uint64_t tag, MDSContext *c)
+{
+  const bool wait_on_dir = (tag & (WAIT_UNFREEZE|WAIT_SINGLEAUTH)) != 0;
+  if (wait_on_dir)
+    dir->add_waiter(tag, c);
+  else
+    MDSCacheObject::add_waiter(tag, c);
+}
+
+
+version_t CDentry::pre_dirty(version_t min)
+{ // Obtain a new projected version from the directory, store it and return it.
+ projected_version = dir->pre_dirty(min); // min is forwarded unchanged to the directory
+ dout(10) << __func__ << " " << *this << dendl;
+ return projected_version;
+}
+
+
+void CDentry::_mark_dirty(LogSegment *ls)
+{ // Flag this dentry dirty (first time only) and, if ls is given, queue it on that segment's dirty list.
+ // state+pin
+ if (!state_test(STATE_DIRTY)) {
+ state_set(STATE_DIRTY);
+ get(PIN_DIRTY); // released again by mark_clean()
+ dir->inc_num_dirty();
+ dir->dirty_dentries.push_back(&item_dir_dirty);
+ ceph_assert(ls); // a clean -> dirty transition requires a log segment
+ }
+ if (ls)
+ ls->dirty_dentries.push_back(&item_dirty);
+}
+
+void CDentry::mark_dirty(version_t pv, LogSegment *ls)
+{ // Set this dentry's version to pv, mark it dirty, and mark the containing directory dirty as well.
+ dout(10) << __func__ << " " << *this << dendl;
+
+ // i now live in this new dir version
+ ceph_assert(pv <= projected_version); // pv may not be ahead of what pre_dirty() projected
+ version = pv;
+ _mark_dirty(ls);
+
+ // mark dir too
+ dir->mark_dirty(pv, ls);
+}
+
+
+void CDentry::mark_clean()
+{ // Undo _mark_dirty(): clear the dirty/new flags, leave both dirty lists and drop the dirty pin.
+ dout(10) << __func__ << " " << *this << dendl;
+ ceph_assert(is_dirty()); // only valid on a dentry that is currently dirty
+
+ // not always true for recalc_auth_bits during resolve finish
+ //assert(dir->get_version() == 0 || version <= dir->get_version()); // hmm?
+
+ state_clear(STATE_DIRTY|STATE_NEW); // STATE_NEW is cleared together with STATE_DIRTY
+ dir->dec_num_dirty();
+
+ item_dir_dirty.remove_myself(); // leave the directory's dirty_dentries list
+ item_dirty.remove_myself(); // leave the log segment's dirty_dentries list, if queued on one
+
+ put(PIN_DIRTY); // matches get(PIN_DIRTY) in _mark_dirty()
+}
+
+void CDentry::mark_new()
+{ // Set STATE_NEW; the flag is cleared again by mark_clean().
+ dout(10) << __func__ << " " << *this << dendl;
+ state_set(STATE_NEW);
+}
+
+// Build "<path of the containing directory's inode>/<name>" in s. A dentry
+// without a directory gets "???" in place of the parent path.
+void CDentry::make_path_string(string& s, bool projected) const
+{
+  if (!dir)
+    s = "???";
+  else
+    dir->inode->make_path_string(s, projected);
+
+  s += "/";
+  s.append(name.data(), name.length());
+}
+
+void CDentry::make_path(filepath& fp, bool projected) const
+{ // Fill fp with the path of the containing directory's inode, then append this dentry's name.
+ ceph_assert(dir); // unlike make_path_string(), a missing directory is not tolerated here
+ dir->inode->make_path(fp, projected);
+ fp.push_dentry(get_name());
+}
+
+/*
+ * we only add ourselves to remote_parents when the linkage is
+ * active (no longer projected). if the passed dnl is projected,
+ * don't link in, and do that work later in pop_projected_linkage().
+ *
+ * dnl must be a remote linkage whose remote ino equals in->ino()
+ * (both are asserted).
+ */
+void CDentry::link_remote(CDentry::linkage_t *dnl, CInode *in)
+{
+ ceph_assert(dnl->is_remote());
+ ceph_assert(in->ino() == dnl->get_remote_ino());
+ dnl->inode = in;
+
+ if (dnl == &linkage) // only the active linkage registers with the inode
+ in->add_remote_parent(this);
+}
+
+// Detach the inode cached in remote linkage dnl. When dnl is the live
+// linkage (as opposed to a projected one) the inode's remote-parent
+// back-reference is dropped as well.
+void CDentry::unlink_remote(CDentry::linkage_t *dnl)
+{
+  ceph_assert(dnl->is_remote());
+  ceph_assert(dnl->inode);
+
+  const bool is_live_linkage = (dnl == &linkage);
+  if (is_live_linkage)
+    dnl->inode->remove_remote_parent(this);
+
+  dnl->inode = nullptr;
+}
+
+// Project a null linkage. If we are auth and the containing directory
+// inode is a stray directory, tell the cache that a stray was removed.
+void CDentry::push_projected_linkage()
+{
+  _project_linkage();
+
+  if (!is_auth())
+    return;
+
+  CInode *parent_in = dir->inode;
+  if (parent_in->is_stray())
+    parent_in->mdcache->notify_stray_removed();
+}
+
+
+// Project a primary linkage to inode and record this dentry as the inode's
+// projected parent. If we are auth and the containing directory inode is a
+// stray directory, tell the cache that a stray was created.
+void CDentry::push_projected_linkage(CInode *inode)
+{
+  // dirty rstat tracking is in the projected plane: drop the flag while
+  // the projected parent changes and restore it afterwards
+  const bool had_dirty_rstat = inode->is_dirty_rstat();
+  if (had_dirty_rstat)
+    inode->clear_dirty_rstat();
+
+  linkage_t *pl = _project_linkage();
+  pl->inode = inode;
+  inode->push_projected_parent(this);
+
+  if (had_dirty_rstat)
+    inode->mark_dirty_rstat();
+
+  if (!is_auth())
+    return;
+
+  CInode *parent_in = dir->inode;
+  if (parent_in->is_stray())
+    parent_in->mdcache->notify_stray_created();
+}
+
+CDentry::linkage_t *CDentry::pop_projected_linkage() // make the oldest projected linkage the live one and return the live linkage
+{
+ ceph_assert(projected.size());
+
+ linkage_t& n = projected.front(); // oldest projection; new projections are pushed at the back
+
+ /*
+ * the idea here is that the link_remote_inode(), link_primary_inode(),
+ * etc. calls should make linkage identical to &n (and we assert as
+ * much).
+ */
+
+ if (n.remote_ino) {
+ dir->link_remote_inode(this, n.remote_ino, n.remote_d_type);
+ if (n.inode) { // remote inode already known: cache it and register as its remote parent
+ linkage.inode = n.inode;
+ linkage.inode->add_remote_parent(this);
+ }
+ } else if (n.inode) {
+ dir->link_primary_inode(this, n.inode);
+ n.inode->pop_projected_parent();
+ } // a null projection (no remote_ino, no inode) takes neither branch
+
+ ceph_assert(n.inode == linkage.inode);
+ ceph_assert(n.remote_ino == linkage.remote_ino);
+ ceph_assert(n.remote_d_type == linkage.remote_d_type);
+
+ projected.pop_front(); // n dangles from here on
+
+ return &linkage;
+}
+
+
+
+// ----------------------------
+// auth pins
+
+// Auth pins this dentry contributes to its dirfrag: its own pins plus,
+// for a primary link, those of the linked inode. Must not be called
+// while a linkage is projected.
+int CDentry::get_num_dir_auth_pins() const
+{
+  ceph_assert(!is_projected());
+
+  const linkage_t *dnl = get_linkage();
+  int pins = auth_pins;
+  if (dnl->is_primary())
+    pins += dnl->get_inode()->get_num_auth_pins();
+  return pins;
+}
+
+bool CDentry::can_auth_pin(int *err_ret) const // delegated to the containing dirfrag, which also receives err_ret
+{
+ ceph_assert(dir);
+ return dir->can_auth_pin(err_ret);
+}
+
+void CDentry::auth_pin(void *by) // take one auth pin on behalf of `by` and pass the +1 on to the dirfrag
+{
+ if (auth_pins == 0)
+ get(PIN_AUTHPIN); // one PIN_AUTHPIN ref is held for as long as any auth pin exists
+ auth_pins++;
+
+#ifdef MDS_AUTHPIN_SET
+ auth_pin_set.insert(by);
+#endif
+
+ dout(10) << "auth_pin by " << by << " on " << *this << " now " << auth_pins << dendl;
+
+ dir->adjust_nested_auth_pins(1, by);
+}
+
+void CDentry::auth_unpin(void *by) // release one auth pin taken by `by` and pass the -1 on to the dirfrag
+{
+ auth_pins--;
+
+#ifdef MDS_AUTHPIN_SET
+ {
+ auto it = auth_pin_set.find(by);
+ ceph_assert(it != auth_pin_set.end()); // `by` must have pinned us before
+ auth_pin_set.erase(it);
+ }
+#endif
+
+ if (auth_pins == 0)
+ put(PIN_AUTHPIN); // last auth pin gone: drop the ref taken in auth_pin()
+
+ dout(10) << "auth_unpin by " << by << " on " << *this << " now " << auth_pins << dendl;
+ ceph_assert(auth_pins >= 0); // catches an unpin without a matching pin
+
+ dir->adjust_nested_auth_pins(-1, by);
+}
+
+void CDentry::adjust_nested_auth_pins(int diradj, void *by) // pure pass-through: the dentry keeps no nested count of its own here
+{
+ dir->adjust_nested_auth_pins(diradj, by);
+}
+
+bool CDentry::is_frozen() const // a dentry is frozen exactly when its containing dirfrag is
+{
+ return dir->is_frozen();
+}
+
+bool CDentry::is_freezing() const // a dentry is freezing exactly when its containing dirfrag is
+{
+ return dir->is_freezing();
+}
+
+// Decode replica state in the order encode_replica() writes it: nonce,
+// first snapid, remote ino, remote d_type, lock state, need-recover flag.
+// The remote link and the need-recover mark are applied only when the
+// replica is new.
+void CDentry::decode_replica(bufferlist::const_iterator& p, bool is_new)
+{
+  __u32 decoded_nonce;
+  decode(decoded_nonce, p);
+  replica_nonce = decoded_nonce;
+
+  decode(first, p);
+
+  inodeno_t rem_ino;
+  unsigned char rem_dtype;
+  decode(rem_ino, p);
+  decode(rem_dtype, p);
+  lock.decode_state(p, is_new);
+
+  bool need_recover;
+  decode(need_recover, p);
+
+  // an already-known replica keeps its current linkage
+  if (!is_new)
+    return;
+
+  if (rem_ino)
+    dir->link_remote_inode(this, rem_ino, rem_dtype);
+  if (need_recover)
+    lock.mark_need_recover();
+}
+
+// ----------------------------
+// locking
+
+void CDentry::set_object_info(MDSCacheObjectInfo &info) // identify this dentry by (containing dirfrag, name, last snapid)
+{
+ info.dirfrag = dir->dirfrag();
+ info.dname = name;
+ info.snapid = last;
+}
+
+// Encode the state carried with the dentry lock: the first snapid followed
+// by the linkage. Layout: first, then (1, ino) for a primary link,
+// (2, remote ino) for a remote link, or nothing further for a null dentry.
+// The type argument is not used.
+void CDentry::encode_lock_state(int type, bufferlist& bl)
+{
+  encode(first, bl);
+
+  if (linkage.is_primary()) {
+    char kind = 1;
+    encode(kind, bl);
+    encode(linkage.get_inode()->inode.ino, bl);
+    return;
+  }
+  if (linkage.is_remote()) {
+    char kind = 2;
+    encode(kind, bl);
+    encode(linkage.get_remote_ino(), bl);
+    return;
+  }
+
+  // a null dentry encodes nothing further; any other state is a bug
+  if (!linkage.is_null())
+    ceph_abort();
+}
+
+void CDentry::decode_lock_state(int type, const bufferlist& bl) // counterpart of encode_lock_state(); type is not used
+{
+ auto p = bl.cbegin();
+
+ snapid_t newfirst;
+ decode(newfirst, p);
+
+ if (!is_auth() && newfirst != first) { // only a replica adopts the decoded `first`, and it may only move forward
+ dout(10) << __func__ << " first " << first << " -> " << newfirst << dendl;
+ ceph_assert(newfirst > first);
+ first = newfirst;
+ }
+
+ if (p.end()) { // nothing after `first` means the encoded linkage was null
+ // null
+ ceph_assert(linkage.is_null());
+ return;
+ }
+
+ char c; // linkage tag: 1 = primary, 2 = remote (see encode_lock_state())
+ inodeno_t ino; // decoded to consume the payload; not otherwise used below
+ decode(c, p);
+
+ switch (c) {
+ case 1:
+ case 2:
+ decode(ino, p);
+ // newly linked?
+ if (linkage.is_null() && !is_auth()) {
+ // force trim from cache!
+ dout(10) << __func__ << " replica dentry null -> non-null, must trim" << dendl; // only logged here; no trim is performed in this function
+ //assert(get_num_ref() == 0);
+ } else {
+ // verify?
+
+ }
+ break;
+ default:
+ ceph_abort(); // unknown linkage tag
+ }
+}
+
+
+/*
+ * Return client c's lease on this dentry, creating it if it does not
+ * exist yet.
+ *
+ * An existing lease is returned untouched. A new lease gets the next
+ * sequence number of the session; creating the first lease on the
+ * dentry also takes PIN_CLIENTLEASE and notifies the dentry lock.
+ */
+ClientLease *CDentry::add_client_lease(client_t c, Session *session)
+{
+  // a single find() replaces the former count() + operator[] pair on
+  // the hit path
+  auto existing = client_lease_map.find(c);
+  if (existing != client_lease_map.end())
+    return existing->second;
+
+  dout(20) << __func__ << " client." << c << " on " << lock << dendl;
+  if (client_lease_map.empty()) {
+    // first lease on this dentry
+    get(PIN_CLIENTLEASE);
+    lock.get_client_lease();
+  }
+
+  ClientLease *l = new ClientLease(c, this);
+  client_lease_map[c] = l;
+  l->seq = ++session->lease_seq;
+  return l;
+}
+
+void CDentry::remove_client_lease(ClientLease *l, Locker *locker) // destroy lease l; if it was the last one, unpin and possibly kick the locker
+{
+ ceph_assert(l->parent == this);
+
+ bool gather = false;
+
+ dout(20) << __func__ << " client." << l->client << " on " << lock << dendl;
+
+ client_lease_map.erase(l->client);
+ l->item_lease.remove_myself();
+ l->item_session_lease.remove_myself();
+ delete l; // l must not be touched past this point
+
+ if (client_lease_map.empty()) {
+ gather = !lock.is_stable(); // sampled before put_client_lease()
+ lock.put_client_lease();
+ put(PIN_CLIENTLEASE);
+ }
+
+ if (gather)
+ locker->eval_gather(&lock);
+}
+
+void CDentry::remove_client_leases(Locker *locker) // drop every client lease, one at a time
+{
+ while (!client_lease_map.empty()) // each call erases the lease's map entry, so the map shrinks every iteration
+ remove_client_lease(client_lease_map.begin()->second, locker);
+}
+
+void CDentry::_put() // NOTE(review): presumably the per-put hook enabled by STATE_NOTIFYREF (aliased as STATE_STRAY) — confirm in MDSCacheObject
+{
+ if (get_num_ref() <= ((int)is_dirty() + 1)) { // at most one ref left besides the PIN_DIRTY ref held while dirty
+ CDentry::linkage_t *dnl = get_projected_linkage();
+ if (dnl->is_primary()) {
+ CInode *in = dnl->get_inode();
+ if (get_num_ref() == (int)is_dirty() + !!in->get_num_ref()) // NOTE(review): the remaining ref presumably is the inode pin, held only while the inode itself is referenced — confirm
+ in->mdcache->maybe_eval_stray(in, true);
+ }
+ }
+}
+
+void CDentry::dump(Formatter *f) const // write this dentry's path, snap range, linkage, versions, locks and state flags to f
+{
+ ceph_assert(f != NULL);
+
+ filepath path;
+ make_path(path); // asserts that the dentry is attached to a dirfrag
+
+ f->dump_string("path", path.get_path());
+ f->dump_unsigned("path_ino", path.get_ino().val);
+ f->dump_unsigned("snap_first", first);
+ f->dump_unsigned("snap_last", last);
+
+ f->dump_bool("is_primary", get_linkage()->is_primary());
+ f->dump_bool("is_remote", get_linkage()->is_remote());
+ f->dump_bool("is_null", get_linkage()->is_null());
+ f->dump_bool("is_new", is_new());
+ if (get_linkage()->get_inode()) {
+ f->dump_unsigned("inode", get_linkage()->get_inode()->ino());
+ } else {
+ f->dump_unsigned("inode", 0); // no inode pointer attached to the live linkage
+ }
+
+ if (linkage.is_remote()) {
+ f->dump_string("remote_type", linkage.get_remote_d_type_string()); // aborts on an unrecognised d_type
+ } else {
+ f->dump_string("remote_type", "");
+ }
+
+ f->dump_unsigned("version", get_version());
+ f->dump_unsigned("projected_version", get_projected_version());
+
+ f->dump_int("auth_pins", auth_pins);
+
+ MDSCacheObject::dump(f);
+
+ f->open_object_section("lock");
+ lock.dump(f);
+ f->close_section();
+
+ f->open_object_section("versionlock");
+ versionlock.dump(f);
+ f->close_section();
+
+ f->open_array_section("states");
+ MDSCacheObject::dump_states(f); // base-class states first, then the CDentry-specific ones below
+ if (state_test(STATE_NEW))
+ f->dump_string("state", "new");
+ if (state_test(STATE_FRAGMENTING))
+ f->dump_string("state", "fragmenting");
+ if (state_test(STATE_PURGING))
+ f->dump_string("state", "purging");
+ if (state_test(STATE_BADREMOTEINO))
+ f->dump_string("state", "badremoteino");
+ if (state_test(STATE_STRAY))
+ f->dump_string("state", "stray");
+ f->close_section();
+}
+
+std::string CDentry::linkage_t::get_remote_d_type_string() const // short name for remote_d_type; aborts on an unrecognised type
+{
+ switch (DTTOIF(remote_d_type)) { // convert the d_type to the matching S_IF* constant
+ case S_IFSOCK: return "sock";
+ case S_IFLNK: return "lnk";
+ case S_IFREG: return "reg";
+ case S_IFBLK: return "blk";
+ case S_IFDIR: return "dir";
+ case S_IFCHR: return "chr";
+ case S_IFIFO: return "fifo";
+ default: ceph_abort(); return ""; // NOTE(review): the return is presumably unreachable and only there to satisfy the compiler — confirm
+ }
+}
+
+MEMPOOL_DEFINE_OBJECT_FACTORY(CDentry, co_dentry, mds_co); // NOTE(review): presumably pairs with MEMPOOL_CLASS_HELPERS() in the class to allocate CDentry from the mds_co pool — confirm
diff --git a/src/mds/CDentry.h b/src/mds/CDentry.h
new file mode 100644
index 00000000..56aa58c5
--- /dev/null
+++ b/src/mds/CDentry.h
@@ -0,0 +1,381 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+
+
+#ifndef CEPH_CDENTRY_H
+#define CEPH_CDENTRY_H
+
+#include <string>
+#include <string_view>
+#include <set>
+
+#include "include/counter.h"
+#include "include/types.h"
+#include "include/buffer_fwd.h"
+#include "include/lru.h"
+#include "include/elist.h"
+#include "include/filepath.h"
+
+#include "MDSCacheObject.h"
+#include "MDSContext.h"
+#include "SimpleLock.h"
+#include "LocalLock.h"
+#include "ScrubHeader.h"
+
+class CInode;
+class CDir;
+class Locker;
+class CDentry;
+class LogSegment;
+
+class Session;
+
+
+
+// define an ordering
+bool operator<(const CDentry& l, const CDentry& r);
+
+// dentry
+class CDentry : public MDSCacheObject, public LRUObject, public Counter<CDentry> { // a named directory entry cached inside a CDir dirfrag
+public:
+ MEMPOOL_CLASS_HELPERS();
+ friend class CDir;
+
+ struct linkage_t { // what a dentry points at: a primary inode, a remote ino, or nothing
+ CInode *inode = nullptr;
+ inodeno_t remote_ino = 0;
+ unsigned char remote_d_type = 0;
+
+ linkage_t() {}
+
+ // dentry type is primary || remote || null
+ // inode ptr is required for primary, optional for remote, must be null for null
+ bool is_primary() const { return remote_ino == 0 && inode != 0; }
+ bool is_remote() const { return remote_ino > 0; }
+ bool is_null() const { return remote_ino == 0 && inode == 0; }
+
+ CInode *get_inode() { return inode; }
+ const CInode *get_inode() const { return inode; }
+ inodeno_t get_remote_ino() const { return remote_ino; }
+ unsigned char get_remote_d_type() const { return remote_d_type; }
+ std::string get_remote_d_type_string() const;
+
+ void set_remote(inodeno_t ino, unsigned char d_type) { // become a remote link; any cached inode pointer is dropped
+ remote_ino = ino;
+ remote_d_type = d_type;
+ inode = 0;
+ }
+ void link_remote(CInode *in);
+ };
+
+
+ // -- state --
+ static const int STATE_NEW = (1<<0);
+ static const int STATE_FRAGMENTING = (1<<1);
+ static const int STATE_PURGING = (1<<2);
+ static const int STATE_BADREMOTEINO = (1<<3);
+ static const int STATE_EVALUATINGSTRAY = (1<<4);
+ static const int STATE_PURGINGPINNED = (1<<5);
+ static const int STATE_BOTTOMLRU = (1<<6);
+ // stray dentry needs notification of releasing reference
+ static const int STATE_STRAY = STATE_NOTIFYREF;
+ static const int MASK_STATE_IMPORT_KEPT = STATE_BOTTOMLRU; // the only local state bit decode_import() preserves
+
+ // -- pins --
+ static const int PIN_INODEPIN = 1; // linked inode is pinned
+ static const int PIN_FRAGMENTING = -2; // containing dir is refragmenting
+ static const int PIN_PURGING = 3;
+ static const int PIN_SCRUBPARENT = 4;
+
+ static const unsigned EXPORT_NONCE = 1; // replica nonce assigned by finish_export()
+
+
+ CDentry(std::string_view n, __u32 h,
+ snapid_t f, snapid_t l) : // null dentry named n with hash h covering snaps [f, l]
+ hash(h),
+ first(f), last(l),
+ item_dirty(this),
+ lock(this, &lock_type),
+ versionlock(this, &versionlock_type),
+ name(n)
+ {}
+ CDentry(std::string_view n, __u32 h, inodeno_t ino, unsigned char dt,
+ snapid_t f, snapid_t l) : // same, but starting out as a remote link to ino with d_type dt
+ hash(h),
+ first(f), last(l),
+ item_dirty(this),
+ lock(this, &lock_type),
+ versionlock(this, &versionlock_type),
+ name(n)
+ {
+ linkage.remote_ino = ino;
+ linkage.remote_d_type = dt;
+ }
+
+ std::string_view pin_name(int p) const override { // names for the CDentry pins; everything else falls back to generic_pin_name()
+ switch (p) {
+ case PIN_INODEPIN: return "inodepin";
+ case PIN_FRAGMENTING: return "fragmenting";
+ case PIN_PURGING: return "purging";
+ case PIN_SCRUBPARENT: return "scrubparent";
+ default: return generic_pin_name(p);
+ }
+ }
+
+ // -- wait --
+ //static const int WAIT_LOCK_OFFSET = 8;
+
+ void add_waiter(uint64_t tag, MDSContext *c) override;
+
+ bool is_lt(const MDSCacheObject *r) const override { // assumes r really is a CDentry; the cast is unchecked
+ return *this < *static_cast<const CDentry*>(r);
+ }
+
+ dentry_key_t key() { // key built from (last, name, hash)
+ return dentry_key_t(last, name.c_str(), hash);
+ }
+
+ const CDir *get_dir() const { return dir; }
+ CDir *get_dir() { return dir; }
+ std::string_view get_name() const { return std::string_view(name); } // view into our own storage; valid only while the dentry lives
+
+ __u32 get_hash() const { return hash; }
+
+ // linkage
+ const linkage_t *get_linkage() const { return &linkage; }
+ linkage_t *get_linkage() { return &linkage; }
+
+ linkage_t *_project_linkage() { // append a fresh null projection and return it for the caller to fill in
+ projected.push_back(linkage_t());
+ return &projected.back();
+ }
+ void push_projected_linkage();
+ void push_projected_linkage(inodeno_t ino, char d_type) { // project a remote link to ino
+ linkage_t *p = _project_linkage();
+ p->remote_ino = ino;
+ p->remote_d_type = d_type;
+ }
+ void push_projected_linkage(CInode *inode);
+ linkage_t *pop_projected_linkage();
+
+ bool is_projected() const { return !projected.empty(); }
+
+ linkage_t *get_projected_linkage() { // newest projection if any, otherwise the live linkage
+ if (!projected.empty())
+ return &projected.back();
+ return &linkage;
+ }
+
+ const linkage_t *get_projected_linkage() const { // const flavour of the above
+ if (!projected.empty())
+ return &projected.back();
+ return &linkage;
+ }
+
+ CInode *get_projected_inode() {
+ return get_projected_linkage()->inode;
+ }
+
+ bool use_projected(client_t client, const MutationRef& mut) const { // true if the lock lets client read projected state, or mut holds the xlock
+ return lock.can_read_projected(client) ||
+ lock.get_xlock_by() == mut;
+ }
+ linkage_t *get_linkage(client_t client, const MutationRef& mut) { // projected or live linkage, as decided by use_projected()
+ return use_projected(client, mut) ? get_projected_linkage() : get_linkage();
+ }
+
+ // ref counts: pin ourselves in the LRU when we're pinned.
+ void first_get() override {
+ lru_pin();
+ }
+ void last_put() override {
+ lru_unpin();
+ }
+ void _put() override;
+
+ // auth pins
+ bool can_auth_pin(int *err_ret=nullptr) const override;
+ void auth_pin(void *by) override;
+ void auth_unpin(void *by) override;
+ void adjust_nested_auth_pins(int diradj, void *by);
+ bool is_frozen() const override;
+ bool is_freezing() const override;
+ int get_num_dir_auth_pins() const;
+
+ // remote links
+ void link_remote(linkage_t *dnl, CInode *in);
+ void unlink_remote(linkage_t *dnl);
+
+ // copy cons (NOTE(review): declared only; no definition is visible here, presumably to forbid copying — confirm)
+ CDentry(const CDentry& m);
+ const CDentry& operator= (const CDentry& right);
+
+ // misc
+ void make_path_string(std::string& s, bool projected=false) const;
+ void make_path(filepath& fp, bool projected=false) const;
+
+ // -- version --
+ version_t get_version() const { return version; }
+ void set_version(version_t v) { projected_version = version = v; } // resets the projected version as well
+ version_t get_projected_version() const { return projected_version; }
+ void set_projected_version(version_t v) { projected_version = v; }
+
+ mds_authority_t authority() const override;
+
+ version_t pre_dirty(version_t min=0);
+ void _mark_dirty(LogSegment *ls);
+ void mark_dirty(version_t projected_dirv, LogSegment *ls);
+ void mark_clean();
+
+ void mark_new();
+ bool is_new() const { return state_test(STATE_NEW); }
+ void clear_new() { state_clear(STATE_NEW); }
+
+ // -- replication
+ void encode_replica(mds_rank_t mds, bufferlist& bl, bool need_recover) { // registers mds as a replica; field order must match decode_replica()
+ __u32 nonce = add_replica(mds);
+ encode(nonce, bl);
+ encode(first, bl);
+ encode(linkage.remote_ino, bl);
+ encode(linkage.remote_d_type, bl);
+ lock.encode_state_for_replica(bl);
+ encode(need_recover, bl);
+ }
+ void decode_replica(bufferlist::const_iterator& p, bool is_new);
+
+ // -- exporting
+ // note: this assumes the dentry already exists.
+ // i.e., the name is already extracted... so we just need the other state.
+ void encode_export(bufferlist& bl) { // field order must match decode_import(); the pin is released by finish_export() or abort_export()
+ encode(first, bl);
+ encode(state, bl);
+ encode(version, bl);
+ encode(projected_version, bl);
+ encode(lock, bl);
+ encode(get_replicas(), bl);
+ get(PIN_TEMPEXPORTING);
+ }
+ void finish_export() { // the export went through: we are no longer auth
+ // twiddle
+ clear_replica_map();
+ replica_nonce = EXPORT_NONCE;
+ state_clear(CDentry::STATE_AUTH);
+ if (is_dirty())
+ mark_clean();
+ put(PIN_TEMPEXPORTING);
+ }
+ void abort_export() { // the export was cancelled: only the temporary pin is dropped
+ put(PIN_TEMPEXPORTING);
+ }
+ void decode_import(bufferlist::const_iterator& blp, LogSegment *ls) { // counterpart of encode_export(); makes us auth
+ decode(first, blp);
+ __u32 nstate;
+ decode(nstate, blp);
+ decode(version, blp);
+ decode(projected_version, blp);
+ decode(lock, blp);
+ decode(get_replicas(), blp);
+
+ // twiddle
+ state &= MASK_STATE_IMPORT_KEPT; // of our own state bits only those in the mask survive
+ state_set(CDentry::STATE_AUTH);
+ if (nstate & STATE_DIRTY) // of the exporter's state only the dirty bit is acted on
+ _mark_dirty(ls);
+ if (is_replicated())
+ get(PIN_REPLICATED);
+ replica_nonce = 0;
+ }
+
+ // -- locking --
+ SimpleLock* get_lock(int type) override { // CEPH_LOCK_DN is the only lock type accepted here
+ ceph_assert(type == CEPH_LOCK_DN);
+ return &lock;
+ }
+ void set_object_info(MDSCacheObjectInfo &info) override;
+ void encode_lock_state(int type, bufferlist& bl) override;
+ void decode_lock_state(int type, const bufferlist& bl) override;
+
+ // ---------------------------------------------
+ // replicas (on clients)
+
+ bool is_any_leases() const {
+ return !client_lease_map.empty();
+ }
+ const ClientLease *get_client_lease(client_t c) const { // lease held by client c, or 0 if there is none
+ if (client_lease_map.count(c))
+ return client_lease_map.find(c)->second;
+ return 0;
+ }
+ ClientLease *get_client_lease(client_t c) { // non-const flavour of the above
+ if (client_lease_map.count(c))
+ return client_lease_map.find(c)->second;
+ return 0;
+ }
+ bool have_client_lease(client_t c) const {
+ const ClientLease *l = get_client_lease(c);
+ if (l)
+ return true;
+ else
+ return false;
+ }
+
+ ClientLease *add_client_lease(client_t c, Session *session);
+ void remove_client_lease(ClientLease *r, Locker *locker); // deletes the lease; kicks locker eval_gather if it was the last lease and the lock is unstable
+ void remove_client_leases(Locker *locker);
+
+ ostream& print_db_line_prefix(ostream& out) override;
+ void print(ostream& out) override;
+ void dump(Formatter *f) const;
+
+
+ __u32 hash;
+ snapid_t first, last;
+
+ elist<CDentry*>::item item_dirty, item_dir_dirty; // links into the log segment's and the dirfrag's dirty_dentries lists
+ elist<CDentry*>::item item_stray;
+
+ // lock
+ static LockType lock_type;
+ static LockType versionlock_type;
+
+ SimpleLock lock; // FIXME referenced containers not in mempool
+ LocalLock versionlock; // FIXME referenced containers not in mempool
+
+ mempool::mds_co::map<client_t,ClientLease*> client_lease_map;
+
+
+protected:
+ friend class Migrator;
+ friend class Locker;
+ friend class MDCache;
+ friend class StrayManager;
+ friend class CInode;
+ friend class C_MDC_XlockRequest;
+
+ CDir *dir = nullptr; // containing dirfrag
+ linkage_t linkage; // the live (non-projected) linkage
+ mempool::mds_co::list<linkage_t> projected; // pending linkages, oldest at the front
+
+ version_t version = 0; // dir version when last touched.
+ version_t projected_version = 0; // what it will be when i unlock/commit.
+
+private:
+ mempool::mds_co::string name;
+};
+
+ostream& operator<<(ostream& out, const CDentry& dn);
+
+
+#endif
diff --git a/src/mds/CDir.cc b/src/mds/CDir.cc
new file mode 100755
index 00000000..e6576542
--- /dev/null
+++ b/src/mds/CDir.cc
@@ -0,0 +1,3520 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#include <string_view>
+
+#include "include/types.h"
+
+#include "CDir.h"
+#include "CDentry.h"
+#include "CInode.h"
+#include "Mutation.h"
+
+#include "MDSMap.h"
+#include "MDSRank.h"
+#include "MDCache.h"
+#include "Locker.h"
+#include "MDLog.h"
+#include "LogSegment.h"
+
+#include "common/bloom_filter.hpp"
+#include "include/Context.h"
+#include "common/Clock.h"
+
+#include "osdc/Objecter.h"
+
+#include "common/config.h"
+#include "include/ceph_assert.h"
+#include "include/compat.h"
+
+#define dout_context g_ceph_context
+#define dout_subsys ceph_subsys_mds
+#undef dout_prefix
+#define dout_prefix *_dout << "mds." << cache->mds->get_nodeid() << ".cache.dir(" << this->dirfrag() << ") "
+
+int CDir::num_frozen_trees = 0; // definition of the static counter; it is updated outside this excerpt
+int CDir::num_freezing_trees = 0; // definition of the static counter; it is updated outside this excerpt
+
+class CDirContext : public MDSContext // base for MDSContext callbacks bound to one CDir
+{
+protected:
+ CDir *dir;
+ MDSRank* get_mds() override {return dir->cache->mds;} // the rank is reached through the dirfrag's cache
+
+public:
+ explicit CDirContext(CDir *d) : dir(d) {
+ ceph_assert(dir != NULL);
+ }
+};
+
+
+class CDirIOContext : public MDSIOContextBase // base for MDSIOContextBase callbacks bound to one CDir
+{
+protected:
+ CDir *dir;
+ MDSRank* get_mds() override {return dir->cache->mds;} // the rank is reached through the dirfrag's cache
+
+public:
+ explicit CDirIOContext(CDir *d) : dir(d) {
+ ceph_assert(dir != NULL);
+ }
+};
+
+
+// PINS
+//int cdir_pins[CDIR_NUM_PINS] = { 0,0,0,0,0,0,0,0,0,0,0,0,0,0 };
+
+
+ostream& operator<<(ostream& out, const CDir& dir) // one-line "[dir ...]" description used in log output
+{
+ out << "[dir " << dir.dirfrag() << " " << dir.get_path() << "/"
+ << " [" << dir.first << ",head]";
+ if (dir.is_auth()) { // auth copy: replica set and version numbers
+ out << " auth";
+ if (dir.is_replicated())
+ out << dir.get_replicas();
+
+ if (dir.is_projected())
+ out << " pv=" << dir.get_projected_version();
+ out << " v=" << dir.get_version();
+ out << " cv=" << dir.get_committing_version();
+ out << "/" << dir.get_committed_version();
+ } else { // replica: who is auth, and our replica nonce
+ mds_authority_t a = dir.authority();
+ out << " rep@" << a.first;
+ if (a.second != CDIR_AUTH_UNKNOWN)
+ out << "," << a.second;
+ out << "." << dir.get_replica_nonce();
+ }
+
+ if (dir.is_rep()) out << " REP";
+
+ if (dir.get_dir_auth() != CDIR_AUTH_DEFAULT) { // dir_auth is printed only when it differs from the default
+ if (dir.get_dir_auth().second == CDIR_AUTH_UNKNOWN)
+ out << " dir_auth=" << dir.get_dir_auth().first;
+ else
+ out << " dir_auth=" << dir.get_dir_auth();
+ }
+
+ if (dir.get_auth_pins() || dir.get_dir_auth_pins()) {
+ out << " ap=" << dir.get_auth_pins()
+ << "+" << dir.get_dir_auth_pins();
+#ifdef MDS_AUTHPIN_SET
+ dir.print_authpin_set(out);
+#endif
+ }
+
+ out << " state=" << dir.get_state(); // raw state value, followed by the names of the set flags
+ if (dir.state_test(CDir::STATE_COMPLETE)) out << "|complete";
+ if (dir.state_test(CDir::STATE_FREEZINGTREE)) out << "|freezingtree";
+ if (dir.state_test(CDir::STATE_FROZENTREE)) out << "|frozentree";
+ if (dir.state_test(CDir::STATE_AUXSUBTREE)) out << "|auxsubtree";
+ if (dir.state_test(CDir::STATE_FROZENDIR)) out << "|frozendir";
+ if (dir.state_test(CDir::STATE_FREEZINGDIR)) out << "|freezingdir";
+ if (dir.state_test(CDir::STATE_EXPORTBOUND)) out << "|exportbound";
+ if (dir.state_test(CDir::STATE_IMPORTBOUND)) out << "|importbound";
+ if (dir.state_test(CDir::STATE_BADFRAG)) out << "|badfrag";
+ if (dir.state_test(CDir::STATE_FRAGMENTING)) out << "|fragmenting";
+ if (dir.state_test(CDir::STATE_CREATING)) out << "|creating";
+ if (dir.state_test(CDir::STATE_COMMITTING)) out << "|committing";
+ if (dir.state_test(CDir::STATE_FETCHING)) out << "|fetching";
+ if (dir.state_test(CDir::STATE_EXPORTING)) out << "|exporting";
+ if (dir.state_test(CDir::STATE_IMPORTING)) out << "|importing";
+ if (dir.state_test(CDir::STATE_STICKY)) out << "|sticky";
+ if (dir.state_test(CDir::STATE_DNPINNEDFRAG)) out << "|dnpinnedfrag";
+ if (dir.state_test(CDir::STATE_ASSIMRSTAT)) out << "|assimrstat";
+
+ // fragstat
+ out << " " << dir.fnode.fragstat;
+ if (!(dir.fnode.fragstat == dir.fnode.accounted_fragstat)) // accounted value is shown only when it differs
+ out << "/" << dir.fnode.accounted_fragstat;
+ if (g_conf()->mds_debug_scatterstat && dir.is_projected()) { // projected values only with mds_debug_scatterstat
+ const fnode_t *pf = dir.get_projected_fnode();
+ out << "->" << pf->fragstat;
+ if (!(pf->fragstat == pf->accounted_fragstat))
+ out << "/" << pf->accounted_fragstat;
+ }
+
+ // rstat
+ out << " " << dir.fnode.rstat;
+ if (!(dir.fnode.rstat == dir.fnode.accounted_rstat)) // accounted value is shown only when it differs
+ out << "/" << dir.fnode.accounted_rstat;
+ if (g_conf()->mds_debug_scatterstat && dir.is_projected()) { // projected values only with mds_debug_scatterstat
+ const fnode_t *pf = dir.get_projected_fnode();
+ out << "->" << pf->rstat;
+ if (!(pf->rstat == pf->accounted_rstat))
+ out << "/" << pf->accounted_rstat;
+ }
+
+ out << " hs=" << dir.get_num_head_items() << "+" << dir.get_num_head_null(); // head entries: linked + null
+ out << ",ss=" << dir.get_num_snap_items() << "+" << dir.get_num_snap_null(); // snapshotted entries: linked + null
+ if (dir.get_num_dirty())
+ out << " dirty=" << dir.get_num_dirty();
+
+ if (dir.get_num_ref()) {
+ out << " |";
+ dir.print_pin_set(out);
+ }
+
+ out << " " << &dir; // object address
+ return out << "]";
+}
+
+
+void CDir::print(ostream& out) // member entry point for operator<<(ostream&, const CDir&)
+{
+ out << *this;
+}
+
+
+
+
+ostream& CDir::print_db_line_prefix(ostream& out) // current time followed by the same text as this file's dout_prefix
+{
+ return out << ceph_clock_now() << " mds." << cache->mds->get_nodeid() << ".cache.dir(" << this->dirfrag() << ") ";
+}
+
+
+
+// -------------------------------------------------------------------
+// CDir
+
+CDir::CDir(CInode *in, frag_t fg, MDCache *mdcache, bool auth) : // empty dirfrag fg of directory inode in; auth selects STATE_AUTH
+ cache(mdcache), inode(in), frag(fg),
+ first(2),
+ dirty_rstat_inodes(member_offset(CInode, dirty_rstat_item)), // list threaded through CInode::dirty_rstat_item
+ projected_version(0),
+ dirty_dentries(member_offset(CDentry, item_dir_dirty)), // list threaded through CDentry::item_dir_dirty
+ item_dirty(this), item_new(this),
+ num_head_items(0), num_head_null(0),
+ num_snap_items(0), num_snap_null(0),
+ num_dirty(0), committing_version(0), committed_version(0),
+ dir_auth_pins(0),
+ dir_rep(REP_NONE),
+ pop_me(mdcache->decayrate), // all popularity counters use the cache's decay rate
+ pop_nested(mdcache->decayrate),
+ pop_auth_subtree(mdcache->decayrate),
+ pop_auth_subtree_nested(mdcache->decayrate),
+ pop_spread(mdcache->decayrate),
+ pop_lru_subdirs(member_offset(CInode, item_pop_lru)), // list threaded through CInode::item_pop_lru
+ num_dentries_nested(0), num_dentries_auth_subtree(0),
+ num_dentries_auth_subtree_nested(0),
+ dir_auth(CDIR_AUTH_DEFAULT)
+{
+ // auth
+ ceph_assert(in->is_dir()); // a dirfrag can only belong to a directory inode
+ if (auth) state_set(STATE_AUTH);
+}
+
+/**
+ * Check fnode fragstat and rstat against the head dentries for consistency.
+ * Outside of scrub (which needs mds_debug_scatterstat), a mismatch is
+ * logged and asserted on; during scrub it is logged and false is returned.
+ */
+bool CDir::check_rstats(bool scrub)
+{
+ if (!g_conf()->mds_debug_scatterstat && !scrub) // nothing to do unless debugging is enabled or we are scrubbing
+ return true;
+
+ dout(25) << "check_rstats on " << this << dendl;
+ if (!is_complete() || !is_auth() || is_frozen()) {
+ dout(3) << "check_rstats " << (scrub ? "(scrub) " : "")
+ << "bailing out -- incomplete or non-auth or frozen dir on "
+ << *this << dendl;
+ return !scrub; // a scrub reports the unchecked dir as a failure, the debug path as success
+ }
+
+ frag_info_t frag_info; // nfiles / nsubdirs recomputed from the dentries
+ nest_info_t nest_info; // sum of the children's accounted rstats
+ for (auto i = items.begin(); i != items.end(); ++i) {
+ if (i->second->last != CEPH_NOSNAP) // only head dentries are counted
+ continue;
+ CDentry::linkage_t *dnl = i->second->get_linkage();
+ if (dnl->is_primary()) {
+ CInode *in = dnl->get_inode();
+ nest_info.add(in->inode.accounted_rstat);
+ if (in->is_dir())
+ frag_info.nsubdirs++;
+ else
+ frag_info.nfiles++;
+ } else if (dnl->is_remote()) // a remote link counts as a file and adds nothing to the rstat sum
+ frag_info.nfiles++;
+ }
+
+ bool good = true;
+ // fragstat
+ if(!frag_info.same_sums(fnode.fragstat)) {
+ dout(1) << "mismatch between head items and fnode.fragstat! printing dentries" << dendl;
+ dout(1) << "get_num_head_items() = " << get_num_head_items()
+ << "; fnode.fragstat.nfiles=" << fnode.fragstat.nfiles
+ << " fnode.fragstat.nsubdirs=" << fnode.fragstat.nsubdirs << dendl;
+ good = false;
+ } else {
+ dout(20) << "get_num_head_items() = " << get_num_head_items()
+ << "; fnode.fragstat.nfiles=" << fnode.fragstat.nfiles
+ << " fnode.fragstat.nsubdirs=" << fnode.fragstat.nsubdirs << dendl;
+ }
+
+ // rstat
+ if (!nest_info.same_sums(fnode.rstat)) {
+ dout(1) << "mismatch between child accounted_rstats and my rstats!" << dendl;
+ dout(1) << "total of child dentrys: " << nest_info << dendl;
+ dout(1) << "my rstats: " << fnode.rstat << dendl;
+ good = false;
+ } else {
+ dout(20) << "total of child dentrys: " << nest_info << dendl;
+ dout(20) << "my rstats: " << fnode.rstat << dendl;
+ }
+
+ if (!good) {
+ if (!scrub) { // debug path: dump every dentry, then fail hard on the first differing field
+ for (auto i = items.begin(); i != items.end(); ++i) {
+ CDentry *dn = i->second;
+ if (dn->get_linkage()->is_primary()) {
+ CInode *in = dn->get_linkage()->inode;
+ dout(1) << *dn << " rstat " << in->inode.accounted_rstat << dendl;
+ } else {
+ dout(1) << *dn << dendl;
+ }
+ }
+
+ ceph_assert(frag_info.nfiles == fnode.fragstat.nfiles);
+ ceph_assert(frag_info.nsubdirs == fnode.fragstat.nsubdirs);
+ ceph_assert(nest_info.rbytes == fnode.rstat.rbytes);
+ ceph_assert(nest_info.rfiles == fnode.rstat.rfiles);
+ ceph_assert(nest_info.rsubdirs == fnode.rstat.rsubdirs);
+ }
+ }
+ dout(10) << "check_rstats complete on " << this << dendl;
+ return good;
+}
+
+void CDir::adjust_num_inodes_with_caps(int d) // add d to the count; leaving / reaching zero adds / removes this dirfrag in the open file table
+{
+ // FIXME: smarter way to decide if adding 'this' to open file table
+ if (num_inodes_with_caps == 0 && d > 0)
+ cache->open_file_table.add_dirfrag(this);
+ else if (num_inodes_with_caps > 0 && num_inodes_with_caps == -d) // this adjustment brings the count back to exactly zero
+ cache->open_file_table.remove_dirfrag(this);
+
+ num_inodes_with_caps += d;
+ ceph_assert(num_inodes_with_caps >= 0);
+}
+
+// Find the dentry called name that is visible at snapshot snap.
+// lower_bound() yields the first entry ordered at or after the
+// (snap, name) key; it is a hit only if that entry carries the same
+// name and snap lies within its [first, last] range. Returns 0 on a miss.
+CDentry *CDir::lookup(std::string_view name, snapid_t snap)
+{
+  dout(20) << "lookup (" << snap << ", '" << name << "')" << dendl;
+
+  const dentry_key_t probe(snap, name, inode->hash_dentry_name(name));
+  auto pos = items.lower_bound(probe);
+  if (pos == items.end())
+    return 0;
+
+  CDentry *cand = pos->second;
+  const bool covers = cand->get_name() == name &&
+                      cand->first <= snap &&
+                      cand->last >= snap;
+  if (!covers) {
+    dout(20) << " miss -> " << pos->first << dendl;
+    return 0;
+  }
+
+  dout(20) << " hit -> " << pos->first << dendl;
+  return cand;
+}
+
+// Find the dentry stored under exactly (last, name); unlike lookup(), no
+// snap-range matching is done. Returns a null pointer if there is none.
+CDentry *CDir::lookup_exact_snap(std::string_view name, snapid_t last) {
+  dout(20) << __func__ << " (" << last << ", '" << name << "')" << dendl;
+
+  const dentry_key_t wanted(last, name, inode->hash_dentry_name(name));
+  auto found = items.find(wanted);
+  return found != items.end() ? found->second : nullptr;
+}
+
+/***
+ * linking fun
+ */
+
+CDentry* CDir::add_null_dentry(std::string_view dname,
+ snapid_t first, snapid_t last) // create a null dentry dname covering [first, last] and insert it into this dirfrag
+{
+ // foreign
+ ceph_assert(lookup_exact_snap(dname, last) == 0); // no entry may exist under (last, dname) yet
+
+ // create dentry
+ CDentry* dn = new CDentry(dname, inode->hash_dentry_name(dname), first, last);
+ if (is_auth())
+ dn->state_set(CDentry::STATE_AUTH);
+
+ cache->bottom_lru.lru_insert_mid(dn); // a null dentry always goes to the bottom LRU
+ dn->state_set(CDentry::STATE_BOTTOMLRU);
+
+ dn->dir = this;
+ dn->version = get_projected_version();
+
+ // add to dir
+ ceph_assert(items.count(dn->key()) == 0);
+ //assert(null_items.count(dn->get_name()) == 0);
+
+ items[dn->key()] = dn;
+ if (last == CEPH_NOSNAP)
+ num_head_null++;
+ else
+ num_snap_null++;
+
+ if (state_test(CDir::STATE_DNPINNEDFRAG)) { // while this state is set, every dentry carries the fragmenting pin and flag
+ dn->get(CDentry::PIN_FRAGMENTING);
+ dn->state_set(CDentry::STATE_FRAGMENTING);
+ }
+
+ dout(12) << __func__ << " " << *dn << dendl;
+
+ // pin?
+ if (get_num_any() == 1) // first entry in this dirfrag
+ get(PIN_CHILD);
+
+ ceph_assert(get_num_any() == items.size());
+ return dn;
+}
+
+
+CDentry* CDir::add_primary_dentry(std::string_view dname, CInode *in,
+ snapid_t first, snapid_t last) // create dentry dname covering [first, last] as the primary link to in
+{
+ // primary
+ ceph_assert(lookup_exact_snap(dname, last) == 0); // no entry may exist under (last, dname) yet
+
+ // create dentry
+ CDentry* dn = new CDentry(dname, inode->hash_dentry_name(dname), first, last);
+ if (is_auth())
+ dn->state_set(CDentry::STATE_AUTH);
+ if (is_auth() || !inode->is_stray()) {
+ cache->lru.lru_insert_mid(dn);
+ } else { // non-auth dentry in a stray directory: bottom LRU
+ cache->bottom_lru.lru_insert_mid(dn);
+ dn->state_set(CDentry::STATE_BOTTOMLRU);
+ }
+
+ dn->dir = this;
+ dn->version = get_projected_version();
+
+ // add to dir
+ ceph_assert(items.count(dn->key()) == 0);
+ //assert(null_items.count(dn->get_name()) == 0);
+
+ items[dn->key()] = dn;
+
+ dn->get_linkage()->inode = in; // with remote_ino still 0 this makes the linkage primary
+
+ link_inode_work(dn, in);
+
+ if (dn->last == CEPH_NOSNAP)
+ num_head_items++;
+ else
+ num_snap_items++;
+
+ if (state_test(CDir::STATE_DNPINNEDFRAG)) { // while this state is set, every dentry carries the fragmenting pin and flag
+ dn->get(CDentry::PIN_FRAGMENTING);
+ dn->state_set(CDentry::STATE_FRAGMENTING);
+ }
+
+ dout(12) << __func__ << " " << *dn << dendl;
+
+ // pin?
+ if (get_num_any() == 1) // first entry in this dirfrag
+ get(PIN_CHILD);
+ ceph_assert(get_num_any() == items.size());
+ return dn;
+}
+
+CDentry* CDir::add_remote_dentry(std::string_view dname, inodeno_t ino, unsigned char d_type,
+ snapid_t first, snapid_t last) // create dentry dname covering [first, last] as a remote link to ino
+{
+ // foreign
+ ceph_assert(lookup_exact_snap(dname, last) == 0); // no entry may exist under (last, dname) yet
+
+ // create dentry
+ CDentry* dn = new CDentry(dname, inode->hash_dentry_name(dname), ino, d_type, first, last);
+ if (is_auth())
+ dn->state_set(CDentry::STATE_AUTH);
+ cache->lru.lru_insert_mid(dn); // a remote dentry always goes to the main LRU
+
+ dn->dir = this;
+ dn->version = get_projected_version();
+
+ // add to dir
+ ceph_assert(items.count(dn->key()) == 0);
+ //assert(null_items.count(dn->get_name()) == 0);
+
+ items[dn->key()] = dn;
+ if (last == CEPH_NOSNAP)
+ num_head_items++;
+ else
+ num_snap_items++;
+
+ if (state_test(CDir::STATE_DNPINNEDFRAG)) { // while this state is set, every dentry carries the fragmenting pin and flag
+ dn->get(CDentry::PIN_FRAGMENTING);
+ dn->state_set(CDentry::STATE_FRAGMENTING);
+ }
+
+ dout(12) << __func__ << " " << *dn << dendl;
+
+ // pin?
+ if (get_num_any() == 1) // first entry in this dirfrag
+ get(PIN_CHILD);
+
+ ceph_assert(get_num_any() == items.size());
+ return dn;
+}
+
+
+
+ /**
+  * Remove @p dn from this dirfrag and delete it.
+  *
+  * Updates the head/snap item/null counters, detaches any linked inode,
+  * drops the dentry from `items` and from whichever LRU it is on, and
+  * releases PIN_CHILD when the dirfrag becomes empty.  The dentry must not
+  * have client leases.
+  */
+ void CDir::remove_dentry(CDentry *dn)
+ {
+   dout(12) << __func__ << " " << *dn << dendl;
+
+   // there should be no client leases at this point!
+   ceph_assert(dn->client_lease_map.empty());
+
+   if (state_test(CDir::STATE_DNPINNEDFRAG)) {
+     dn->put(CDentry::PIN_FRAGMENTING);
+     dn->state_clear(CDentry::STATE_FRAGMENTING);
+   }
+
+   const bool head = (dn->last == CEPH_NOSNAP);
+   if (dn->get_linkage()->is_null()) {
+     --(head ? num_head_null : num_snap_null);
+   } else {
+     --(head ? num_head_items : num_snap_items);
+     // detach inode and dentry
+     unlink_inode_work(dn);
+   }
+
+   // remove from list
+   const auto key = dn->key();
+   ceph_assert(items.count(key) == 1);
+   items.erase(key);
+
+   // clean?
+   if (dn->is_dirty())
+     dn->mark_clean();
+
+   auto &owning_lru = dn->state_test(CDentry::STATE_BOTTOMLRU) ? cache->bottom_lru
+                                                                : cache->lru;
+   owning_lru.lru_remove(dn);
+   delete dn;
+
+   // unpin?
+   if (get_num_any() == 0)
+     put(PIN_CHILD);
+   ceph_assert(get_num_any() == items.size());
+ }
+
+ // Convenience overload: link @p dn remotely to @p in, deriving the inode
+ // number and the d_type (from the projected inode's mode) from the inode.
+ void CDir::link_remote_inode(CDentry *dn, CInode *in)
+ {
+   const inodeno_t ino = in->ino();
+   const unsigned char d_type = IFTODT(in->get_projected_inode()->mode);
+   link_remote_inode(dn, ino, d_type);
+ }
+
+ /**
+  * Turn the null dentry @p dn into a remote link to inode number @p ino
+  * with directory entry type @p d_type.
+  *
+  * Moves the dentry off the bottom LRU if it was there and shifts it from
+  * the null counters to the item counters.
+  */
+ void CDir::link_remote_inode(CDentry *dn, inodeno_t ino, unsigned char d_type)
+ {
+   dout(12) << __func__ << " " << *dn << " remote " << ino << dendl;
+
+   CDentry::linkage_t *dnl = dn->get_linkage();
+   ceph_assert(dnl->is_null());
+   dnl->set_remote(ino, d_type);
+
+   if (dn->state_test(CDentry::STATE_BOTTOMLRU)) {
+     cache->bottom_lru.lru_remove(dn);
+     cache->lru.lru_insert_mid(dn);
+     dn->state_clear(CDentry::STATE_BOTTOMLRU);
+   }
+
+   // null -> item
+   const bool head = (dn->last == CEPH_NOSNAP);
+   ++(head ? num_head_items : num_snap_items);
+   --(head ? num_head_null : num_snap_null);
+
+   ceph_assert(get_num_any() == items.size());
+ }
+
+ /**
+  * Turn the null dentry @p dn into the primary link of inode @p in.
+  *
+  * Performs the shared link bookkeeping (link_inode_work), promotes the
+  * dentry from the bottom LRU when this dirfrag is auth or is not inside a
+  * stray directory, and shifts it from the null to the item counters.
+  */
+ void CDir::link_primary_inode(CDentry *dn, CInode *in)
+ {
+   dout(12) << __func__ << " " << *dn << " " << *in << dendl;
+
+   CDentry::linkage_t *dnl = dn->get_linkage();
+   ceph_assert(dnl->is_null());
+   dnl->inode = in;
+
+   link_inode_work(dn, in);
+
+   if (dn->state_test(CDentry::STATE_BOTTOMLRU)) {
+     if (is_auth() || !inode->is_stray()) {
+       cache->bottom_lru.lru_remove(dn);
+       cache->lru.lru_insert_mid(dn);
+       dn->state_clear(CDentry::STATE_BOTTOMLRU);
+     }
+   }
+
+   // null -> item
+   const bool head = (dn->last == CEPH_NOSNAP);
+   ++(head ? num_head_items : num_snap_items);
+   --(head ? num_head_null : num_snap_null);
+
+   ceph_assert(get_num_any() == items.size());
+ }
+
+ // Shared bookkeeping for attaching inode `in` to dentry `dn` as its primary
+ // link.  The caller must already have stored `in` in dn's linkage (asserted
+ // below).  Does not touch the num_head and num_snap counters.
+ void CDir::link_inode_work( CDentry *dn, CInode *in)
+ {
+ ceph_assert(dn->get_linkage()->get_inode() == in);
+ in->set_primary_parent(dn);
+
+ // set inode version
+ //in->inode.version = dn->get_version();
+
+ // pin dentry?
+ // (only when the inode currently holds references)
+ if (in->get_num_ref())
+ dn->get(CDentry::PIN_INODEPIN);
+
+ // tell the open file table about the new link for inodes it tracks
+ if (in->state_test(CInode::STATE_TRACKEDBYOFT))
+ inode->mdcache->open_file_table.notify_link(in);
+ // keep this dirfrag's count of inodes with caps up to date
+ if (in->is_any_caps())
+ adjust_num_inodes_with_caps(1);
+
+ // adjust auth pin count
+ // (the inode's auth pins are passed on to the dentry as nested auth pins)
+ if (in->auth_pins)
+ dn->adjust_nested_auth_pins(in->auth_pins, NULL);
+
+ // verify open snaprealm parent
+ // an inode with its own snaprealm re-evaluates its parent; otherwise an
+ // inode with caps is moved to the realm found from this dirfrag's inode
+ if (in->snaprealm)
+ in->snaprealm->adjust_parent();
+ else if (in->is_any_caps())
+ in->move_to_realm(inode->find_snaprealm());
+ }
+
+ /**
+  * Detach whatever @p dn is linked to, leaving it a null dentry.
+  *
+  * @param adjust_lru  when true, a dentry not already on the bottom LRU is
+  *                    moved there.
+  * Shifts the dentry from the item counters to the null counters.
+  */
+ void CDir::unlink_inode(CDentry *dn, bool adjust_lru)
+ {
+   CDentry::linkage_t *dnl = dn->get_linkage();
+   if (dnl->is_primary()) {
+     dout(12) << __func__ << " " << *dn << " " << *dnl->get_inode() << dendl;
+   } else {
+     dout(12) << __func__ << " " << *dn << dendl;
+   }
+
+   unlink_inode_work(dn);
+
+   if (adjust_lru) {
+     if (!dn->state_test(CDentry::STATE_BOTTOMLRU)) {
+       cache->lru.lru_remove(dn);
+       cache->bottom_lru.lru_insert_mid(dn);
+       dn->state_set(CDentry::STATE_BOTTOMLRU);
+     }
+   }
+
+   // item -> null
+   const bool head = (dn->last == CEPH_NOSNAP);
+   --(head ? num_head_items : num_snap_items);
+   ++(head ? num_head_null : num_snap_null);
+
+   ceph_assert(get_num_any() == items.size());
+ }
+
+
+ /**
+  * Remove the null dentry @p dn if nothing but (optionally) its dirty pin
+  * references it and it is still marked new.  Otherwise do nothing.
+  */
+ void CDir::try_remove_unlinked_dn(CDentry *dn)
+ {
+   ceph_assert(dn->dir == this);
+   ceph_assert(dn->get_linkage()->is_null());
+
+   // any pins besides dirty?  then leave it alone
+   if (dn->get_num_ref() != dn->is_dirty())
+     return;
+
+   // only dentries that are still new can be dropped
+   if (!dn->is_new())
+     return;
+
+   dout(10) << __func__ << " " << *dn << " in " << *this << dendl;
+   if (dn->is_dirty())
+     dn->mark_clean();
+   remove_dentry(dn);
+
+   // NOTE: we may not have any more dirty dentries, but the fnode
+   // still changed, so the directory must remain dirty.
+ }
+
+
+ // Shared bookkeeping for detaching whatever `dn` links to.  Handles remote
+ // and primary linkages; asserts if the linkage is null.  Does not touch
+ // the num_head and num_snap counters or the LRUs.
+ void CDir::unlink_inode_work(CDentry *dn)
+ {
+ CInode *in = dn->get_linkage()->get_inode();
+
+ if (dn->get_linkage()->is_remote()) {
+ // remote
+ // only unlink when the linkage has an inode pointer attached
+ if (in)
+ dn->unlink_remote(dn->get_linkage());
+
+ // reset the remote ino and d_type to zero
+ dn->get_linkage()->set_remote(0, 0);
+ } else if (dn->get_linkage()->is_primary()) {
+ // primary
+ // unpin dentry?
+ if (in->get_num_ref())
+ dn->put(CDentry::PIN_INODEPIN);
+
+ // mirror of the notify_link and caps accounting done in link_inode_work
+ if (in->state_test(CInode::STATE_TRACKEDBYOFT))
+ inode->mdcache->open_file_table.notify_unlink(in);
+ if (in->is_any_caps())
+ adjust_num_inodes_with_caps(-1);
+
+ // unlink auth_pin count
+ if (in->auth_pins)
+ dn->adjust_nested_auth_pins(-in->auth_pins, nullptr);
+
+ // detach inode
+ in->remove_primary_parent(dn);
+ // directories are also taken off whatever list item_pop_lru is on
+ if (in->is_dir())
+ in->item_pop_lru.remove_myself();
+ dn->get_linkage()->inode = 0;
+ } else {
+ // neither remote nor primary: the caller passed a null dentry
+ ceph_assert(!dn->get_linkage()->is_null());
+ }
+ }
+
+ /**
+  * Record the name of head dentry @p dn in this dirfrag's bloom filter,
+  * creating the filter on first use.
+  *
+  * No filter is created (and nothing is recorded) when the dirfrag is
+  * incomplete or the MDS is in standby replay.
+  */
+ void CDir::add_to_bloom(CDentry *dn)
+ {
+   ceph_assert(dn->last == CEPH_NOSNAP);
+
+   if (!bloom) {
+     /* Skip incomplete dirs added by log replay, and skip standby replay
+      * entirely (saves cycles, and also avoids need to implement clearing
+      * it in EExport for #16924). */
+     if (!is_complete() || cache->mds->is_standby_replay())
+       return;
+
+     /* This size and false positive probability is completely random. */
+     unsigned size = get_num_head_items() + get_num_snap_items();
+     if (size < 100)
+       size = 100;
+     bloom.reset(new bloom_filter(size, 1.0 / size, 0));
+   }
+
+   const auto &name = dn->get_name();
+   bloom->insert(name.data(), name.size());
+ }
+
+ // Return true if a bloom filter exists and reports that @p name may have
+ // been added; false when there is no filter at all.
+ bool CDir::is_in_bloom(std::string_view name)
+ {
+   return bloom ? bloom->contains(name.data(), name.size()) : false;
+ }
+
+ // Remove every null dentry that is not projected, then assert that no
+ // null dentries are left in the counters.
+ void CDir::remove_null_dentries() {
+   dout(12) << __func__ << " " << *this << dendl;
+
+   for (auto it = items.begin(); it != items.end(); ) {
+     // advance first: remove_dentry() erases the current entry
+     CDentry *dn = (it++)->second;
+     if (!dn->get_linkage()->is_null())
+       continue;
+     if (dn->is_projected())
+       continue;
+     remove_dentry(dn);
+   }
+
+   ceph_assert(num_snap_null == 0);
+   ceph_assert(num_head_null == 0);
+   ceph_assert(get_num_any() == items.size());
+ }
+
+ /**
+  * Drop dentries of a directory that is being deleted (its parent dirfrag
+  * belongs to a stray inode).  The dirfrag will be deleted soon, so dirty
+  * dentries need not be committed.
+  *
+  * A prerequisite of deleting a directory is that its children have been
+  * unlinked, so head dentries are expected to be null and unprojected.
+  * Dirty state is only cleared when the directory has no snaprealm, i.e.
+  * was not snapshotted.
+  */
+ void CDir::try_remove_dentries_for_stray()
+ {
+   dout(10) << __func__ << dendl;
+   ceph_assert(get_parent_dir()->inode->is_stray());
+
+   // clear dirty only when the directory was not snapshotted
+   const bool clear_dirty = !inode->snaprealm;
+
+   for (auto it = items.begin(); it != items.end(); ) {
+     // advance first: remove_dentry() erases the current entry
+     CDentry *dn = (it++)->second;
+     ceph_assert(!dn->is_projected());
+
+     if (dn->last != CEPH_NOSNAP) {
+       // snapshotted dentry: may still carry a primary inode
+       CInode *in = NULL;
+       CDentry::linkage_t *dnl = dn->get_linkage();
+       if (dnl->is_primary()) {
+         in = dnl->get_inode();
+         if (clear_dirty && in->is_dirty())
+           in->mark_clean();
+       }
+       if (clear_dirty && dn->is_dirty())
+         dn->mark_clean();
+       if (dn->get_num_ref() == 0) {
+         remove_dentry(dn);
+         if (in)
+           cache->remove_inode(in);
+       }
+       continue;
+     }
+
+     // head dentry
+     ceph_assert(dn->get_linkage()->is_null());
+     if (clear_dirty && dn->is_dirty())
+       dn->mark_clean();
+     // It's OK to remove lease prematurely because we will never link
+     // the dentry to inode again.
+     if (dn->is_any_leases())
+       dn->remove_client_leases(cache->mds->locker);
+     if (dn->get_num_ref() == 0)
+       remove_dentry(dn);
+   }
+
+   if (clear_dirty && is_dirty())
+     mark_clean();
+ }
+
+ /**
+  * Purge the snapshotted dentry @p dn (and its primary inode, if any) when
+  * no snap in @p snaps falls inside [dn->first, dn->last] and neither the
+  * dentry nor the inode holds references other than its dirty pin.
+  *
+  * @return true if the dentry was removed, false if it was kept.
+  */
+ bool CDir::try_trim_snap_dentry(CDentry *dn, const set<snapid_t>& snaps)
+ {
+   ceph_assert(dn->last != CEPH_NOSNAP);
+
+   // first existing snap at or after the start of the dentry's range
+   auto p = snaps.lower_bound(dn->first);
+
+   CInode *in = 0;
+   CDentry::linkage_t *dnl = dn->get_linkage();
+   if (dnl->is_primary())
+     in = dnl->get_inode();
+
+   // still covered by a live snap?
+   if (p != snaps.end() && !(*p > dn->last))
+     return false;
+   // pinned by something other than dirty?
+   if (dn->get_num_ref() != dn->is_dirty())
+     return false;
+   if (in && in->get_num_ref() != in->is_dirty())
+     return false;
+
+   dout(10) << " purging snapped " << *dn << dendl;
+   if (in && in->is_dirty())
+     in->mark_clean();
+   remove_dentry(dn);
+   if (in) {
+     dout(10) << " purging snapped " << *in << dendl;
+     cache->remove_inode(in);
+   }
+   return true;
+ }
+
+
+ // Walk all snapshotted dentries and try to trim each one against the set
+ // of snaps in @p snaps.  Head dentries are left untouched.
+ void CDir::purge_stale_snap_data(const set<snapid_t>& snaps)
+ {
+   dout(10) << __func__ << " " << snaps << dendl;
+
+   for (auto it = items.begin(); it != items.end(); ) {
+     // advance first: trimming may erase the current entry
+     CDentry *dn = (it++)->second;
+     if (dn->last != CEPH_NOSNAP)
+       try_trim_snap_dentry(dn, snaps);
+   }
+ }
+
+
+/**
+ * steal_dentry -- semi-violently move a dentry from one CDir to another
+ * (*) violently, in that nitems, most pins, etc. are not correctly maintained
+ * on the old CDir corpse; must call finish_old_fragment() when finished.
+ */
+void CDir::steal_dentry(CDentry *dn)
+{
+ dout(15) << __func__ << " " << *dn << dendl;
+
+ items[dn->key()] = dn;
+
+ dn->dir->items.erase(dn->key());
+ if (dn->dir->items.empty())
+ dn->dir->put(PIN_CHILD);
+
+ if (get_num_any() == 0)
+ get(PIN_CHILD);
+ if (dn->get_linkage()->is_null()) {
+ if (dn->last == CEPH_NOSNAP)
+ num_head_null++;
+ else
+ num_snap_null++;
+ } else if (dn->last == CEPH_NOSNAP) {
+ num_head_items++;
+
+ if (dn->get_linkage()->is_primary()) {
+ CInode *in = dn->get_linkage()->get_inode();
+ auto pi = in->get_projected_inode();
+ if (in->is_dir()) {
+ fnode.fragstat.nsubdirs++;
+ if (in->item_pop_lru.is_on_list())
+ pop_lru_subdirs.push_back(&in->item_pop_lru);
+ } else {
+ fnode.fragstat.nfiles++;
+ }
+ fnode.rstat.rbytes += pi->accounted_rstat.rbytes;
+ fnode.rstat.rfiles += pi->accounted_rstat.rfiles;
+ fnode.rstat.rsubdirs += pi->accounted_rstat.rsubdirs;
+ fnode.rstat.rsnaps += pi->accounted_rstat.rsnaps;
+ if (pi->accounted_rstat.rctime > fnode.rstat.rctime)
+ fnode.rstat.rctime = pi->accounted_rstat.rctime;
+
+ if (in->is_any_caps())
+ adjust_num_inodes_with_caps(1);
+
+ // move dirty inode rstat to new dirfrag
+ if (in->is_dirty_rstat())
+ dirty_rstat_inodes.push_back(&in->dirty_rstat_item);
+ } else if (dn->get_linkage()->is_remote()) {
+ if (dn->get_linkage()->get_remote_d_type() == DT_DIR)
+ fnode.fragstat.nsubdirs++;
+ else
+ fnode.fragstat.nfiles++;
+ }
+ } else {
+ num_snap_items++;
+ if (dn->get_linkage()->is_primary()) {
+ CInode *in = dn->get_linkage()->get_inode();
+ if (in->is_dirty_rstat())
+ dirty_rstat_inodes.push_back(&in->dirty_rstat_item);
+ }
+ }
+
+ {
+ int dap = dn->get_num_dir_auth_pins();
+ if (dap) {
+ adjust_nested_auth_pins(dap, NULL);
+ dn->dir->adjust_nested_auth_pins(-dap, NULL);
+ }
+ }
+
+ if (dn->is_dirty()) {
+ dirty_dentries.push_back(&dn->item_dir_dirty);
+ num_dirty++;
+ }
+
+ dn->dir = this;
+}
+
+ /**
+  * Prepare this dirfrag for having its dentries moved away during a
+  * split or merge.
+  *
+  * Outside of replay, an auth dirfrag auth-pins itself; all dentry waiters
+  * are appended to @p dentry_waiters and removed from this dirfrag.
+  */
+ void CDir::prepare_old_fragment(map<string_snap_t, MDSContext::vec >& dentry_waiters, bool replay)
+ {
+   // auth_pin old fragment for duration so that any auth_pinning
+   // during the dentry migration doesn't trigger side effects
+   if (!replay && is_auth())
+     auth_pin(this);
+
+   if (waiting_on_dentry.empty())
+     return;
+
+   for (const auto &entry : waiting_on_dentry) {
+     auto &dst = dentry_waiters[entry.first];
+     dst.insert(dst.end(), entry.second.begin(), entry.second.end());
+   }
+   waiting_on_dentry.clear();
+   put(PIN_DNWAITER);
+ }
+
+ // Prepare a freshly created dirfrag that is the target of a split or
+ // merge: outside of replay an auth dirfrag is frozen and marked complete;
+ // in all cases the dirfrag is registered with its inode.
+ void CDir::prepare_new_fragment(bool replay)
+ {
+ if (!replay && is_auth()) {
+ _freeze_dir();
+ mark_complete();
+ }
+ inode->add_dirfrag(this);
+ }
+
+ // Tear down a dirfrag whose dentries have all been stolen by split() or
+ // merge().  Outside of replay, all waiters are moved into `waiters` and an
+ // auth dirfrag is auth-unpinned and unfrozen.  Counters are zeroed and the
+ // pins that init_fragment_pins() takes are dropped again, so that only an
+ // optional STATE_STICKY reference may remain (asserted at the end).
+ void CDir::finish_old_fragment(MDSContext::vec& waiters, bool replay)
+ {
+ // take waiters _before_ unfreeze...
+ if (!replay) {
+ take_waiting(WAIT_ANY_MASK, waiters);
+ if (is_auth()) {
+ auth_unpin(this); // pinned in prepare_old_fragment
+ ceph_assert(is_frozen_dir());
+ unfreeze_dir();
+ }
+ }
+
+ ceph_assert(dir_auth_pins == 0);
+ ceph_assert(auth_pins == 0);
+
+ // the dentries are gone (see steal_dentry), so reset the bookkeeping
+ num_head_items = num_head_null = 0;
+ num_snap_items = num_snap_null = 0;
+ adjust_num_inodes_with_caps(-num_inodes_with_caps);
+
+ // this mirrors init_fragment_pins()
+ if (is_auth())
+ clear_replica_map();
+ if (is_dirty())
+ mark_clean();
+ if (state_test(STATE_IMPORTBOUND))
+ put(PIN_IMPORTBOUND);
+ if (state_test(STATE_EXPORTBOUND))
+ put(PIN_EXPORTBOUND);
+ if (is_subtree_root())
+ put(PIN_SUBTREE);
+
+ // NOTE(review): auth_pins was asserted to be 0 above; this branch looks
+ // unreachable unless something in between changes auth_pins -- confirm.
+ if (auth_pins > 0)
+ put(PIN_AUTHPIN);
+
+ ceph_assert(get_num_ref() == (state_test(STATE_STICKY) ? 1:0));
+ }
+
+ // Take the reference pins implied by this dirfrag's current state
+ // (replicated, dirty, export/import bound, subtree root).
+ void CDir::init_fragment_pins()
+ {
+   auto pin_if = [this](bool wanted, int pin) {
+     if (wanted)
+       get(pin);
+   };
+
+   pin_if(is_replicated(), PIN_REPLICATED);
+   pin_if(state_test(STATE_DIRTY), PIN_DIRTY);
+   pin_if(state_test(STATE_EXPORTBOUND), PIN_EXPORTBOUND);
+   pin_if(state_test(STATE_IMPORTBOUND), PIN_IMPORTBOUND);
+   pin_if(is_subtree_root(), PIN_SUBTREE);
+ }
+
+ /**
+  * Split this dirfrag by @p bits bits into 2^bits new dirfrags.
+  *
+  * @param bits     number of additional frag bits
+  * @param subs     receives the newly created dirfrags, in frag order
+  * @param waiters  receives the waiters taken from this dirfrag
+  * @param replay   true during journal replay (skips auth pin / freeze work)
+  *
+  * All dentries and dentry waiters are moved to the new dirfrag chosen by
+  * inode->pick_dirfrag(); load vectors are scaled evenly across the new
+  * dirfrags.  This dirfrag is left as an empty shell (finish_old_fragment).
+  */
+ void CDir::split(int bits, list<CDir*>& subs, MDSContext::vec& waiters, bool replay)
+ {
+   dout(10) << "split by " << bits << " bits on " << *this << dendl;
+
+   ceph_assert(replay || is_complete() || !is_auth());
+
+   frag_vec_t frags;
+   frag.split(bits, frags);
+
+   vector<CDir*> subfrags(1 << bits);
+
+   double fac = 1.0 / (double)(1 << bits);  // for scaling load vecs
+
+   version_t rstat_version = inode->get_projected_inode()->rstat.version;
+   version_t dirstat_version = inode->get_projected_inode()->dirstat.version;
+
+   // stat deltas not yet accounted to the inode; only meaningful when the
+   // accounted versions match the inode's current versions
+   nest_info_t rstatdiff;
+   frag_info_t fragstatdiff;
+   if (fnode.accounted_rstat.version == rstat_version)
+     rstatdiff.add_delta(fnode.accounted_rstat, fnode.rstat);
+   if (fnode.accounted_fragstat.version == dirstat_version)
+     fragstatdiff.add_delta(fnode.accounted_fragstat, fnode.fragstat);
+   dout(10) << " rstatdiff " << rstatdiff << " fragstatdiff " << fragstatdiff << dendl;
+
+   map<string_snap_t, MDSContext::vec > dentry_waiters;
+   prepare_old_fragment(dentry_waiters, replay);
+
+   // create subfrag dirs
+   int n = 0;
+   for (const auto& fg : frags) {
+     CDir *f = new CDir(inode, fg, cache, is_auth());
+     f->state_set(state & (MASK_STATE_FRAGMENT_KEPT | STATE_COMPLETE));
+     f->get_replicas() = get_replicas();
+     f->set_version(get_version());
+     f->pop_me = pop_me;
+     f->pop_me.scale(fac);
+
+     // FIXME; this is an approximation
+     f->pop_nested = pop_nested;
+     f->pop_nested.scale(fac);
+     f->pop_auth_subtree = pop_auth_subtree;
+     f->pop_auth_subtree.scale(fac);
+     f->pop_auth_subtree_nested = pop_auth_subtree_nested;
+     f->pop_auth_subtree_nested.scale(fac);
+
+     dout(10) << " subfrag " << fg << " " << *f << dendl;
+     subfrags[n++] = f;
+     subs.push_back(f);
+
+     f->set_dir_auth(get_dir_auth());
+     f->freeze_tree_state = freeze_tree_state;
+     f->prepare_new_fragment(replay);
+     f->init_fragment_pins();
+   }
+
+   // repartition dentries
+   // (steal_dentry() erases the dentry from our items, so the loop ends)
+   while (!items.empty()) {
+     auto p = items.begin();
+
+     CDentry *dn = p->second;
+     frag_t subfrag = inode->pick_dirfrag(dn->get_name());
+     // index of the target subfrag; deliberately not named `n` so that it
+     // does not shadow the subfrag count used by the fix-up loop below
+     int idx = (subfrag.value() & (subfrag.mask() ^ frag.mask())) >> subfrag.mask_shift();
+     dout(15) << " subfrag " << subfrag << " n=" << idx << " for " << p->first << dendl;
+     CDir *f = subfrags[idx];
+     f->steal_dentry(dn);
+   }
+
+   // hand each dentry waiter to the subfrag that now owns its name
+   for (const auto &p : dentry_waiters) {
+     frag_t subfrag = inode->pick_dirfrag(p.first.name);
+     int idx = (subfrag.value() & (subfrag.mask() ^ frag.mask())) >> subfrag.mask_shift();
+     CDir *f = subfrags[idx];
+
+     if (f->waiting_on_dentry.empty())
+       f->get(PIN_DNWAITER);
+     auto &e = f->waiting_on_dentry[p.first];
+     for (const auto &waiter : p.second) {
+       e.push_back(waiter);
+     }
+   }
+
+   // FIXME: handle dirty old rstat
+
+   // fix up new frag fragstats
+   for (int i=0; i<n; i++) {
+     CDir *f = subfrags[i];
+     f->fnode.rstat.version = rstat_version;
+     f->fnode.accounted_rstat = f->fnode.rstat;
+     f->fnode.fragstat.version = dirstat_version;
+     f->fnode.accounted_fragstat = f->fnode.fragstat;
+     dout(10) << " rstat " << f->fnode.rstat << " fragstat " << f->fnode.fragstat
+              << " on " << *f << dendl;
+   }
+
+   // give any outstanding frag stat differential to first frag
+   dout(10) << " giving rstatdiff " << rstatdiff << " fragstatdiff " << fragstatdiff
+            << " to " << *subfrags[0] << dendl;
+   subfrags[0]->fnode.accounted_rstat.add(rstatdiff);
+   subfrags[0]->fnode.accounted_fragstat.add(fragstatdiff);
+
+   finish_old_fragment(waiters, replay);
+ }
+
+ /**
+  * Merge the dirfrags in @p subs into this (newly created) dirfrag.
+  *
+  * @param subs     dirfrags to absorb; all must share dir_auth and
+  *                 freeze_tree_state (asserted)
+  * @param waiters  receives the waiters taken from the old dirfrags
+  * @param replay   true during journal replay
+  *
+  * Dentries, dentry waiters, replica maps, versions and the kept state bits
+  * are folded into this dirfrag; each old dirfrag is finished and closed.
+  */
+ void CDir::merge(list<CDir*>& subs, MDSContext::vec& waiters, bool replay)
+ {
+   dout(10) << "merge " << subs << dendl;
+
+   set_dir_auth(subs.front()->get_dir_auth());
+   freeze_tree_state = subs.front()->freeze_tree_state;
+
+   for (auto dir : subs) {
+     ceph_assert(get_dir_auth() == dir->get_dir_auth());
+     ceph_assert(freeze_tree_state == dir->freeze_tree_state);
+   }
+
+   prepare_new_fragment(replay);
+
+   nest_info_t rstatdiff;
+   frag_info_t fragstatdiff;
+   // out-parameters for add_delta(); their values are not read afterwards
+   // in this function, but are initialized so no indeterminate value
+   // is ever handed out
+   bool touched_mtime = false, touched_chattr = false;
+   version_t rstat_version = inode->get_projected_inode()->rstat.version;
+   version_t dirstat_version = inode->get_projected_inode()->dirstat.version;
+
+   map<string_snap_t, MDSContext::vec > dentry_waiters;
+
+   for (auto dir : subs) {
+     dout(10) << " subfrag " << dir->get_frag() << " " << *dir << dendl;
+     ceph_assert(!dir->is_auth() || dir->is_complete() || replay);
+
+     // accumulate stat deltas not yet accounted to the inode
+     if (dir->fnode.accounted_rstat.version == rstat_version)
+       rstatdiff.add_delta(dir->fnode.accounted_rstat, dir->fnode.rstat);
+     if (dir->fnode.accounted_fragstat.version == dirstat_version)
+       fragstatdiff.add_delta(dir->fnode.accounted_fragstat, dir->fnode.fragstat,
+                              &touched_mtime, &touched_chattr);
+
+     dir->prepare_old_fragment(dentry_waiters, replay);
+
+     // steal dentries
+     while (!dir->items.empty())
+       steal_dentry(dir->items.begin()->second);
+
+     // merge replica map
+     // (keep the larger value per replica)
+     for (const auto &p : dir->get_replicas()) {
+       unsigned cur = get_replicas()[p.first];
+       if (p.second > cur)
+         get_replicas()[p.first] = p.second;
+     }
+
+     // merge version
+     if (dir->get_version() > get_version())
+       set_version(dir->get_version());
+
+     // merge state
+     state_set(dir->get_state() & MASK_STATE_FRAGMENT_KEPT);
+
+     dir->finish_old_fragment(waiters, replay);
+     inode->close_dirfrag(dir->get_frag());
+   }
+
+   if (!dentry_waiters.empty()) {
+     get(PIN_DNWAITER);
+     for (const auto &p : dentry_waiters) {
+       auto &e = waiting_on_dentry[p.first];
+       for (const auto &waiter : p.second) {
+         e.push_back(waiter);
+       }
+     }
+   }
+
+   if (is_auth() && !replay)
+     mark_complete();
+
+   // FIXME: merge dirty old rstat
+   fnode.rstat.version = rstat_version;
+   fnode.accounted_rstat = fnode.rstat;
+   fnode.accounted_rstat.add(rstatdiff);
+
+   fnode.fragstat.version = dirstat_version;
+   fnode.accounted_fragstat = fnode.fragstat;
+   fnode.accounted_fragstat.add(fragstatdiff);
+
+   init_fragment_pins();
+ }
+
+
+
+
+ /*
+  * resync fragstat and accounted_fragstat with inode: when the accounted
+  * version differs from the inode's dirstat version, adopt that version
+  * and treat the current fragstat as fully accounted.
+  */
+ void CDir::resync_accounted_fragstat()
+ {
+   fnode_t *pf = get_projected_fnode();
+   auto pi = inode->get_projected_inode();
+
+   if (pf->accounted_fragstat.version == pi->dirstat.version)
+     return;  // already in sync
+
+   pf->fragstat.version = pi->dirstat.version;
+   dout(10) << __func__ << " " << pf->accounted_fragstat << " -> " << pf->fragstat << dendl;
+   pf->accounted_fragstat = pf->fragstat;
+ }
+
+/*
+ * resync rstat and accounted_rstat with inode
+ */
+void CDir::resync_accounted_rstat()
+{
+ fnode_t *pf = get_projected_fnode();
+ auto pi = inode->get_projected_inode();
+
+ if (pf->accounted_rstat.version != pi->rstat.version) {
+ pf->rstat.version = pi->rstat.version;
+ dout(10) << __func__ << " " << pf->accounted_rstat << " -> " << pf->rstat << dendl;
+ pf->accounted_rstat = pf->rstat;
+ dirty_old_rstat.clear();
+ }
+}
+
+ /**
+  * Project every non-frozen inode on dirty_rstat_inodes and fold its rstat
+  * into this dirfrag, then set STATE_ASSIMRSTAT so that
+  * assimilate_dirty_rstat_inodes_finish() knows there is work to complete.
+  */
+ void CDir::assimilate_dirty_rstat_inodes()
+ {
+   dout(10) << __func__ << dendl;
+
+   auto p = dirty_rstat_inodes.begin_use_current();
+   for (; !p.end(); ++p) {
+     CInode *in = *p;
+     ceph_assert(in->is_auth());
+     if (!in->is_frozen()) {
+       auto &pi = in->project_inode();
+       pi.inode.version = in->pre_dirty();
+
+       inode->mdcache->project_rstat_inode_to_frag(in, this, 0, 0, NULL);
+     }
+   }
+
+   state_set(STATE_ASSIMRSTAT);
+   dout(10) << __func__ << " done" << dendl;
+ }
+
+ // Second half of assimilate_dirty_rstat_inodes(): a no-op unless
+ // STATE_ASSIMRSTAT is set.  For every non-frozen inode on
+ // dirty_rstat_inodes, auth-pin it and record the projected inode on `mut`,
+ // clear its dirty rstat flag and journal its primary dentry into `blob`.
+ void CDir::assimilate_dirty_rstat_inodes_finish(MutationRef& mut, EMetaBlob *blob)
+ {
+ if (!state_test(STATE_ASSIMRSTAT))
+ return;
+ state_clear(STATE_ASSIMRSTAT);
+ dout(10) << __func__ << dendl;
+ elist<CInode*>::iterator p = dirty_rstat_inodes.begin_use_current();
+ while (!p.end()) {
+ CInode *in = *p;
+ // advance before touching `in`
+ // NOTE(review): presumably because clear_dirty_rstat() below takes the
+ // inode off this list -- confirm
+ ++p;
+
+ // frozen inodes are skipped and stay on the list
+ if (in->is_frozen())
+ continue;
+
+ CDentry *dn = in->get_projected_parent_dn();
+
+ mut->auth_pin(in);
+ mut->add_projected_inode(in);
+
+ in->clear_dirty_rstat();
+ blob->add_primary_dentry(dn, in, true);
+ }
+
+ // anything left over (skipped above) keeps the nestlock marked updated
+ if (!dirty_rstat_inodes.empty())
+ inode->mdcache->mds->locker->mark_updated_scatterlock(&inode->nestlock);
+ }
+
+
+
+
+/****************************************
+ * WAITING
+ */
+
+ // Queue context @p c to wait on dentry (@p dname, @p snapid).  The first
+ // dentry waiter takes PIN_DNWAITER on the dirfrag.
+ void CDir::add_dentry_waiter(std::string_view dname, snapid_t snapid, MDSContext *c)
+ {
+   if (waiting_on_dentry.empty())
+     get(PIN_DNWAITER);
+
+   const string_snap_t key(dname, snapid);
+   waiting_on_dentry[key].push_back(c);
+
+   dout(10) << __func__ << " dentry " << dname
+            << " snap " << snapid
+            << " " << c << " on " << *this << dendl;
+ }
+
+ /**
+  * Move all waiters on dentry @p dname whose snapid lies within
+  * [first,last] into @p ls.  Drops PIN_DNWAITER when no dentry waiters
+  * remain afterwards.
+  */
+ void CDir::take_dentry_waiting(std::string_view dname, snapid_t first, snapid_t last,
+                                MDSContext::vec& ls)
+ {
+   if (waiting_on_dentry.empty())
+     return;
+
+   const string_snap_t lb(dname, first);
+   const string_snap_t ub(dname, last);
+
+   for (auto it = waiting_on_dentry.lower_bound(lb);
+        it != waiting_on_dentry.end() && !(ub < it->first); ) {
+     dout(10) << __func__ << " " << dname
+              << " [" << first << "," << last << "] found waiter on snap "
+              << it->first.snapid
+              << " on " << *this << dendl;
+     ls.insert(ls.end(), it->second.begin(), it->second.end());
+     waiting_on_dentry.erase(it++);
+   }
+
+   if (waiting_on_dentry.empty())
+     put(PIN_DNWAITER);
+ }
+
+ // Move every dentry waiter of this dirfrag into @p ls and release
+ // PIN_DNWAITER.  Does nothing when there are no dentry waiters.
+ void CDir::take_sub_waiting(MDSContext::vec& ls)
+ {
+   dout(10) << __func__ << dendl;
+   if (waiting_on_dentry.empty())
+     return;
+
+   for (const auto &entry : waiting_on_dentry)
+     ls.insert(ls.end(), entry.second.begin(), entry.second.end());
+
+   waiting_on_dentry.clear();
+   put(PIN_DNWAITER);
+ }
+
+
+
+ /**
+  * Register context @p c to wait for the events in @p tag.
+  *
+  * A waiter tagged WAIT_ATSUBTREEROOT is forwarded to the parent dirfrag
+  * until a subtree root is reached.  WAIT_CREATED may only be used while
+  * the dirfrag is in STATE_CREATING.
+  */
+ void CDir::add_waiter(uint64_t tag, MDSContext *c)
+ {
+   // hierarchical?
+
+   // at subtree root?
+   if ((tag & WAIT_ATSUBTREEROOT) && !is_subtree_root()) {
+     // try parent
+     dout(10) << "add_waiter " << std::hex << tag << std::dec << " " << c << " should be ATSUBTREEROOT, " << *this << " is not root, trying parent" << dendl;
+     inode->parent->dir->add_waiter(tag, c);
+     return;
+   }
+
+   ceph_assert(!(tag & WAIT_CREATED) || state_test(STATE_CREATING));
+
+   MDSCacheObject::add_waiter(tag, c);
+ }
+
+
+
+ /* NOTE: this checks dentry waiters too
+  *
+  * Move the waiters selected by @p mask into @p ls.  When the mask includes
+  * WAIT_DENTRY, all dentry waiters are taken as well and PIN_DNWAITER is
+  * released. */
+ void CDir::take_waiting(uint64_t mask, MDSContext::vec& ls)
+ {
+   const bool want_dentry_waiters = (mask & WAIT_DENTRY) && !waiting_on_dentry.empty();
+   if (want_dentry_waiters) {
+     // take all dentry waiters
+     for (const auto &entry : waiting_on_dentry) {
+       dout(10) << "take_waiting dentry " << entry.first.name
+                << " snap " << entry.first.snapid << " on " << *this << dendl;
+       ls.insert(ls.end(), entry.second.begin(), entry.second.end());
+     }
+     waiting_on_dentry.clear();
+     put(PIN_DNWAITER);
+   }
+
+   // waiting
+   MDSCacheObject::take_waiting(mask, ls);
+ }
+
+
+ // Take the waiters selected by @p mask and complete them: with a negative
+ // @p result they are finished immediately with that result, otherwise
+ // they are queued on the MDS.
+ void CDir::finish_waiting(uint64_t mask, int result)
+ {
+   dout(11) << __func__ << " mask " << hex << mask << dec << " result " << result << " on " << *this << dendl;
+
+   MDSContext::vec taken;
+   take_waiting(mask, taken);
+
+   if (result >= 0) {
+     cache->mds->queue_waiters(taken);
+   } else {
+     finish_contexts(g_ceph_context, taken, result);
+   }
+ }
+
+
+
+// dirty/clean
+
+ /**
+  * Push a copy of the current projected fnode onto projected_fnode and
+  * return a pointer to it.  Pending scrub stamps/versions are folded into
+  * the copy and the scrub dirty flag is cleared.
+  */
+ fnode_t *CDir::project_fnode()
+ {
+   ceph_assert(get_version() != 0);
+   auto &pf = projected_fnode.emplace_back(*get_projected_fnode());
+
+   const bool scrub_dirty = scrub_infop && scrub_infop->last_scrub_dirty;
+   if (scrub_dirty) {
+     pf.localized_scrub_stamp = scrub_infop->last_local.time;
+     pf.localized_scrub_version = scrub_infop->last_local.version;
+     pf.recursive_scrub_stamp = scrub_infop->last_recursive.time;
+     pf.recursive_scrub_version = scrub_infop->last_recursive.version;
+     scrub_infop->last_scrub_dirty = false;
+     scrub_maybe_delete_info();
+   }
+
+   dout(10) << __func__ << " " << &pf << dendl;
+   return &pf;
+ }
+
+ // Make the oldest projected fnode the live fnode, mark the dirfrag dirty
+ // in log segment @p ls and drop that projection from the queue.
+ void CDir::pop_and_dirty_projected_fnode(LogSegment *ls)
+ {
+   ceph_assert(!projected_fnode.empty());
+
+   auto &oldest = projected_fnode.front();
+   dout(15) << __func__ << " " << &oldest << " v" << oldest.version << dendl;
+
+   fnode = oldest;
+   _mark_dirty(ls);
+   projected_fnode.pop_front();
+ }
+
+
+ // Reserve and return the next projected version; the result is strictly
+ // greater than both the previous projected version and @p min.
+ version_t CDir::pre_dirty(version_t min)
+ {
+   projected_version = (min > projected_version ? min : projected_version) + 1;
+   dout(10) << __func__ << " " << projected_version << dendl;
+   return projected_version;
+ }
+
+ // Advance the fnode to the previously projected version @p pv and mark
+ // the dirfrag dirty in log segment @p ls.  @p pv must be newer than the
+ // current version and no newer than the projected version.
+ void CDir::mark_dirty(version_t pv, LogSegment *ls)
+ {
+   ceph_assert(get_version() < pv);
+   ceph_assert(pv <= projected_version);
+
+   fnode.version = pv;
+   _mark_dirty(ls);
+ }
+
+ /**
+  * Set the dirty flag (if not yet set) and attach the dirfrag to log
+  * segment @p ls.  A segment is mandatory when the dirfrag was clean; it
+  * may be null when the dirfrag is already dirty.
+  */
+ void CDir::_mark_dirty(LogSegment *ls)
+ {
+   const bool already_dirty = state_test(STATE_DIRTY);
+   if (already_dirty) {
+     dout(10) << __func__ << " (already dirty) " << *this << " version " << get_version() << dendl;
+   } else {
+     dout(10) << __func__ << " (was clean) " << *this << " version " << get_version() << dendl;
+     _set_dirty_flag();
+     ceph_assert(ls);
+   }
+
+   if (!ls)
+     return;
+
+   ls->dirty_dirfrags.push_back(&item_dirty);
+
+   // if i've never committed, i need to be before _any_ mention of me is trimmed from the journal.
+   if (committed_version == 0 && !item_new.is_on_list())
+     ls->new_dirfrags.push_back(&item_new);
+ }
+
+ // Record the dirfrag as new in log segment @p ls, leave STATE_CREATING
+ // and queue everything that was waiting for WAIT_CREATED.
+ void CDir::mark_new(LogSegment *ls)
+ {
+   ls->new_dirfrags.push_back(&item_new);
+   state_clear(STATE_CREATING);
+
+   MDSContext::vec created_waiters;
+   take_waiting(CDir::WAIT_CREATED, created_waiters);
+   cache->mds->queue_waiters(created_waiters);
+ }
+
+ // Clear the dirty state: take the dirfrag off the dirty/new lists, clear
+ // STATE_DIRTY and drop PIN_DIRTY.  No-op when it is not dirty.
+ void CDir::mark_clean()
+ {
+   dout(10) << __func__ << " " << *this << " version " << get_version() << dendl;
+   if (!state_test(STATE_DIRTY))
+     return;
+
+   item_dirty.remove_myself();
+   item_new.remove_myself();
+
+   state_clear(STATE_DIRTY);
+   put(PIN_DIRTY);
+ }
+
+ // caller should hold auth pin of this
+ //
+ // Mark the dirfrag dirty in the current log segment, unless it is already
+ // dirty or a projected version is pending (it will become dirty anyway).
+ void CDir::log_mark_dirty()
+ {
+   if (is_dirty())
+     return;  // noop if it is already dirty
+   if (projected_version > get_version())
+     return;  // noop if it will be dirty
+
+   const version_t pv = pre_dirty();
+   mark_dirty(pv, cache->mds->mdlog->get_current_segment());
+ }
+
+ // Flag the dirfrag as complete and discard the bloom filter.
+ void CDir::mark_complete() {
+ state_set(STATE_COMPLETE);
+ bloom.reset();
+ }
+
+ // Takes a PIN_DIRFRAG reference on the owning inode.
+ // NOTE(review): presumably invoked when this dirfrag gains its first
+ // reference -- confirm against MDSCacheObject.
+ void CDir::first_get()
+ {
+ inode->get(CInode::PIN_DIRFRAG);
+ }
+
+ // Drops the PIN_DIRFRAG reference on the owning inode (counterpart of
+ // first_get()).
+ // NOTE(review): presumably invoked when the last reference on this
+ // dirfrag is released -- confirm against MDSCacheObject.
+ void CDir::last_put()
+ {
+ inode->put(CInode::PIN_DIRFRAG);
+ }
+
+
+
+/******************************************************************************
+ * FETCH and COMMIT
+ */
+
+// -----------------------
+// FETCH
+ // Fetch the whole dirfrag without asking for a particular dentry;
+ // forwards to the (c, want_dn, ignore_authpinnability) overload with an
+ // empty name.
+ void CDir::fetch(MDSContext *c, bool ignore_authpinnability)
+ {
+   const string no_wanted_dn;
+   fetch(c, no_wanted_dn, ignore_authpinnability);
+ }
+
+ /**
+  * Start (or join) a full fetch of this dirfrag from the metadata pool.
+  *
+  * @param c        optional context completed when the dirfrag is complete
+  * @param want_dn  optional dentry name recorded in wanted_items
+  * @param ignore_authpinnability  fetch even if the dirfrag cannot
+  *                 currently be auth-pinned
+  *
+  * If the dirfrag cannot be auth-pinned (and that is not ignored), @p c is
+  * parked on WAIT_UNFREEZE, or the request is dropped when @p c is null.
+  * A dirfrag of an unlinked, un-snapshotted directory is marked complete
+  * immediately without any I/O.
+  */
+ void CDir::fetch(MDSContext *c, std::string_view want_dn, bool ignore_authpinnability)
+ {
+   dout(10) << "fetch on " << *this << dendl;
+
+   ceph_assert(is_auth());
+   ceph_assert(!is_complete());
+
+   if (!can_auth_pin() && !ignore_authpinnability) {
+     if (!c) {
+       dout(7) << "fetch not authpinnable and no context" << dendl;
+       return;
+     }
+     dout(7) << "fetch waiting for authpinnable" << dendl;
+     add_waiter(WAIT_UNFREEZE, c);
+     return;
+   }
+
+   // unlinked directory inode shouldn't have any entry
+   const bool unlinked_dir = !inode->is_base() &&
+                             get_parent_dir()->inode->is_stray() &&
+                             !inode->snaprealm;
+   if (unlinked_dir) {
+     dout(7) << "fetch dirfrag for unlinked directory, mark complete" << dendl;
+     if (get_version() == 0) {
+       ceph_assert(inode->is_auth());
+       set_version(1);
+
+       if (state_test(STATE_REJOINUNDEF)) {
+         ceph_assert(cache->mds->is_rejoin());
+         state_clear(STATE_REJOINUNDEF);
+         cache->opened_undef_dirfrag(this);
+       }
+     }
+     mark_complete();
+
+     if (c)
+       cache->mds->queue_waiter(c);
+     return;
+   }
+
+   if (c)
+     add_waiter(WAIT_COMPLETE, c);
+   if (!want_dn.empty())
+     wanted_items.insert(mempool::mds_co::string(want_dn));
+
+   // already fetching?
+   if (state_test(CDir::STATE_FETCHING)) {
+     dout(7) << "already fetching; waiting" << dendl;
+     return;
+   }
+
+   auth_pin(this);
+   state_set(CDir::STATE_FETCHING);
+
+   if (cache->mds->logger)
+     cache->mds->logger->inc(l_mds_dir_fetch);
+
+   // an empty key set asks _omap_fetch for the whole omap
+   std::set<dentry_key_t> all_keys;
+   _omap_fetch(NULL, all_keys);
+ }
+
+ /**
+  * Fetch only the dentries named by @p keys and complete @p c afterwards.
+  *
+  * If the dirfrag cannot be auth-pinned, @p c waits for WAIT_UNFREEZE; if
+  * a full fetch is already in flight, @p c waits for WAIT_COMPLETE.
+  */
+ void CDir::fetch(MDSContext *c, const std::set<dentry_key_t>& keys)
+ {
+   dout(10) << "fetch " << keys.size() << " keys on " << *this << dendl;
+
+   ceph_assert(is_auth());
+   ceph_assert(!is_complete());
+
+   if (!can_auth_pin()) {
+     dout(7) << "fetch keys waiting for authpinnable" << dendl;
+     add_waiter(WAIT_UNFREEZE, c);
+     return;
+   }
+
+   const bool full_fetch_in_flight = state_test(CDir::STATE_FETCHING);
+   if (full_fetch_in_flight) {
+     dout(7) << "fetch keys waiting for full fetch" << dendl;
+     add_waiter(WAIT_COMPLETE, c);
+     return;
+   }
+
+   auth_pin(this);
+   if (cache->mds->logger)
+     cache->mds->logger->inc(l_mds_dir_fetch);
+
+   _omap_fetch(c, keys);
+ }
+
+ /**
+  * Completion for a continuation read issued by CDir::_omap_fetch_more().
+  *
+  * Merges the new batch (omap_more) into the carried-over entries (omap)
+  * and either requests the next batch or hands everything to
+  * CDir::_omap_fetched().  `ret` receives the per-op result of the
+  * omap_get_vals op and is folded into the overall result, as
+  * C_IO_Dir_OMAP_Fetched does with ret1 and ret2.
+  */
+ class C_IO_Dir_OMAP_FetchedMore : public CDirIOContext {
+   MDSContext *fin;
+ public:
+   bufferlist hdrbl;
+   bool more = false;
+   map<string, bufferlist> omap;      ///< carry-over from before
+   map<string, bufferlist> omap_more; ///< new batch
+   int ret;
+   C_IO_Dir_OMAP_FetchedMore(CDir *d, MDSContext *f) :
+     CDirIOContext(d), fin(f), ret(0) { }
+   void finish(int r) override {
+     // surface a failure reported only through the per-op return value
+     if (r >= 0) r = ret;
+     // merge results
+     if (omap.empty()) {
+       omap.swap(omap_more);
+     } else {
+       omap.insert(omap_more.begin(), omap_more.end());
+     }
+     if (more) {
+       dir->_omap_fetch_more(hdrbl, omap, fin);
+     } else {
+       dir->_omap_fetched(hdrbl, omap, !fin, r);
+       if (fin)
+         fin->complete(r);
+     }
+   }
+   void print(ostream& out) const override {
+     out << "dirfrag_fetch_more(" << dir->dirfrag() << ")";
+   }
+ };
+
+ /**
+  * Completion for the first read issued by CDir::_omap_fetch().
+  *
+  * ret1/ret2/ret3 receive the per-op results of the header read, the omap
+  * values read and the optional "parent" xattr read.  ret3 is preset to
+  * -ECANCELED by _omap_fetch() when no backtrace check was requested.
+  */
+ class C_IO_Dir_OMAP_Fetched : public CDirIOContext {
+   MDSContext *fin;
+ public:
+   bufferlist hdrbl;
+   bool more = false;
+   map<string, bufferlist> omap;
+   bufferlist btbl;
+   int ret1, ret2, ret3;
+
+   C_IO_Dir_OMAP_Fetched(CDir *d, MDSContext *f) :
+     CDirIOContext(d), fin(f), ret1(0), ret2(0), ret3(0) { }
+   void finish(int r) override {
+     if (r >= 0) {
+       // check the correctness of backtrace
+       if (ret3 != -ECANCELED)
+         dir->inode->verify_diri_backtrace(btbl, ret3);
+       // the first failing per-op result becomes the overall result
+       r = ret1;
+       if (r >= 0)
+         r = ret2;
+     }
+     if (more) {
+       dir->_omap_fetch_more(hdrbl, omap, fin);
+       return;
+     }
+     dir->_omap_fetched(hdrbl, omap, !fin, r);
+     if (fin)
+       fin->complete(r);
+   }
+   void print(ostream& out) const override {
+     out << "dirfrag_fetch(" << dir->dirfrag() << ")";
+   }
+ };
+
+ /**
+  * Issue the object read for a dirfrag fetch.
+  *
+  * With an empty @p keys set the first batch of the whole omap is read and
+  * @p c must be null; otherwise only the given keys are read and @p c must
+  * be non-null.  The omap header is always read.  For the root frag the
+  * "parent" xattr is read as well when mds_verify_backtrace is enabled.
+  */
+ void CDir::_omap_fetch(MDSContext *c, const std::set<dentry_key_t>& keys)
+ {
+   C_IO_Dir_OMAP_Fetched *fin = new C_IO_Dir_OMAP_Fetched(this, c);
+   object_t oid = get_ondisk_object();
+   object_locator_t oloc(cache->mds->mdsmap->get_metadata_pool());
+
+   ObjectOperation rd;
+   rd.omap_get_header(&fin->hdrbl, &fin->ret1);
+
+   if (!keys.empty()) {
+     // partial fetch: translate dentry keys to their omap key strings
+     ceph_assert(c);
+     std::set<std::string> str_keys;
+     for (auto k : keys) {
+       string encoded;
+       k.encode(encoded);
+       str_keys.insert(encoded);
+     }
+     rd.omap_get_vals_by_keys(str_keys, &fin->omap, &fin->ret2);
+   } else {
+     // full fetch: start from the beginning, limited per op
+     ceph_assert(!c);
+     rd.omap_get_vals("", "", g_conf()->mds_dir_keys_per_op,
+                      &fin->omap, &fin->more, &fin->ret2);
+   }
+
+   // check the correctness of backtrace
+   const bool verify_backtrace = g_conf()->mds_verify_backtrace > 0 && frag == frag_t();
+   if (verify_backtrace) {
+     rd.getxattr("parent", &fin->btbl, &fin->ret3);
+     rd.set_last_op_flags(CEPH_OSD_OP_FLAG_FAILOK);
+   } else {
+     fin->ret3 = -ECANCELED;
+   }
+
+   cache->mds->objecter->read(oid, oloc, rd, CEPH_NOSNAP, NULL, 0,
+                              new C_OnFinisher(fin, cache->mds->finisher));
+ }
+
+ /**
+  * Continue a full fetch: read the next batch of omap entries, starting
+  * after the last key received so far.
+  *
+  * @param hdrbl  header read by the first op; its contents are claimed
+  * @param omap   entries received so far; swapped into the new completion
+  * @param c      context passed along to the completion
+  */
+ void CDir::_omap_fetch_more(
+   bufferlist& hdrbl,
+   map<string, bufferlist>& omap,
+   MDSContext *c)
+ {
+   // we have more omap keys to fetch!
+   object_t oid = get_ondisk_object();
+   object_locator_t oloc(cache->mds->mdsmap->get_metadata_pool());
+   C_IO_Dir_OMAP_FetchedMore *fin = new C_IO_Dir_OMAP_FetchedMore(this, c);
+   fin->hdrbl.claim(hdrbl);
+   fin->omap.swap(omap);
+   // the continuation key is the last key fetched so far; dereferencing
+   // rbegin() of an empty map would be undefined behaviour
+   ceph_assert(!fin->omap.empty());
+   ObjectOperation rd;
+   rd.omap_get_vals(fin->omap.rbegin()->first,
+                    "", /* filter prefix */
+                    g_conf()->mds_dir_keys_per_op,
+                    &fin->omap_more,
+                    &fin->more,
+                    &fin->ret);
+   cache->mds->objecter->read(oid, oloc, rd, CEPH_NOSNAP, NULL, 0,
+                              new C_OnFinisher(fin, cache->mds->finisher));
+ }
+
+/**
+ * Decode one on-disk dentry value and merge it into the cache.
+ *
+ * The value holds the dentry's first snapid, a one-character marker and a
+ * marker-specific payload: 'L' is a remote link (ino + d_type), 'I' is a
+ * primary link with the inode stored inline.
+ *
+ * @param key          raw omap key; remembered in stale_items when the entry
+ *                     is stale and has no dentry in cache
+ * @param dname        dentry name decoded from @p key
+ * @param last         last snapid decoded from @p key
+ * @param bl           encoded dentry value
+ * @param pos          position of the entry in the batch (diagnostics only)
+ * @param snaps        snapids still in use, or null when no snap purge applies
+ * @param force_dirty  set to true when a stale entry is queued for removal
+ * @return the dentry (pre-existing or newly added); null for a stale entry
+ *         that is not in cache or for a rejected duplicate inode
+ * @throws buffer::malformed_input if the marker is neither 'L' nor 'I'
+ */
+CDentry *CDir::_load_dentry(
+    std::string_view key,
+    std::string_view dname,
+    const snapid_t last,
+    bufferlist &bl,
+    const int pos,
+    const std::set<snapid_t> *snaps,
+    bool *force_dirty)
+{
+  auto q = bl.cbegin();
+
+  snapid_t first;
+  decode(first, q);
+
+  // marker
+  char type;
+  decode(type, q);
+
+  dout(20) << "_fetched pos " << pos << " marker '" << type << "' dname '" << dname
+           << "' [" << first << "," << last << "]"
+           << dendl;
+
+  // a snapshotted entry is stale when no live snapid falls in [first,last]
+  bool stale = false;
+  if (snaps && last != CEPH_NOSNAP) {
+    set<snapid_t>::const_iterator p = snaps->lower_bound(first);
+    if (p == snaps->end() || *p > last) {
+      dout(10) << " skipping stale dentry on [" << first << "," << last << "]" << dendl;
+      stale = true;
+    }
+  }
+
+  /*
+   * look for existing dentry for _last_ snap, because unlink +
+   * create may leave a "hole" (epochs during which the dentry
+   * doesn't exist) but for which no explicit negative dentry is in
+   * the cache.
+   */
+  CDentry *dn;
+  if (stale)
+    dn = lookup_exact_snap(dname, last);
+  else
+    dn = lookup(dname, last);
+
+  if (type == 'L') {
+    // hard link
+    inodeno_t ino;
+    unsigned char d_type;
+    decode(ino, q);
+    decode(d_type, q);
+
+    if (stale) {
+      // not in cache: remember the key so the next commit removes it
+      if (!dn) {
+        stale_items.insert(mempool::mds_co::string(key));
+        *force_dirty = true;
+      }
+      return dn;
+    }
+
+    if (dn) {
+      CDentry::linkage_t *dnl = dn->get_linkage();
+      dout(12) << "_fetched had " << (dnl->is_null() ? "NEG" : "") << " dentry " << *dn << dendl;
+      if (committed_version == 0 &&
+          dnl->is_remote() &&
+          dn->is_dirty() &&
+          ino == dnl->get_remote_ino() &&
+          d_type == dnl->get_remote_d_type()) {
+        // see comment below
+        dout(10) << "_fetched had underwater dentry " << *dn << ", marking clean" << dendl;
+        dn->mark_clean();
+      }
+    } else {
+      // (remote) link
+      dn = add_remote_dentry(dname, ino, d_type, first, last);
+
+      // link to inode?
+      CInode *in = cache->get_inode(ino);   // we may or may not have it.
+      if (in) {
+        dn->link_remote(dn->get_linkage(), in);
+        dout(12) << "_fetched got remote link " << ino << " which we have " << *in << dendl;
+      } else {
+        dout(12) << "_fetched got remote link " << ino << " (don't have it)" << dendl;
+      }
+    }
+  }
+  else if (type == 'I') {
+    // inode
+
+    // Load inode data before looking up or constructing CInode
+    InodeStore inode_data;
+    inode_data.decode_bare(q);
+
+    if (stale) {
+      // not in cache: remember the key so the next commit removes it
+      if (!dn) {
+        stale_items.insert(mempool::mds_co::string(key));
+        *force_dirty = true;
+      }
+      return dn;
+    }
+
+    bool undef_inode = false;
+    if (dn) {
+      CDentry::linkage_t *dnl = dn->get_linkage();
+      dout(12) << "_fetched had " << (dnl->is_null() ? "NEG" : "") << " dentry " << *dn << dendl;
+
+      if (dnl->is_primary()) {
+        CInode *in = dnl->get_inode();
+        if (in->state_test(CInode::STATE_REJOINUNDEF)) {
+          undef_inode = true;
+        } else if (committed_version == 0 &&
+                   dn->is_dirty() &&
+                   inode_data.inode.ino == in->ino() &&
+                   inode_data.inode.version == in->get_version()) {
+          /* clean underwater item?
+           * Underwater item is something that is dirty in our cache from
+           * journal replay, but was previously flushed to disk before the
+           * mds failed.
+           *
+           * We only do this is committed_version == 0. that implies either
+           * - this is a fetch after from a clean/empty CDir is created
+           *   (and has no effect, since the dn won't exist); or
+           * - this is a fetch after _recovery_, which is what we're worried
+           *   about.  Items that are marked dirty from the journal should be
+           *   marked clean if they appear on disk.
+           */
+          dout(10) << "_fetched had underwater dentry " << *dn << ", marking clean" << dendl;
+          dn->mark_clean();
+          dout(10) << "_fetched had underwater inode " << *dnl->get_inode() << ", marking clean" << dendl;
+          in->mark_clean();
+        }
+      }
+    }
+
+    if (!dn || undef_inode) {
+      // add inode
+      CInode *in = cache->get_inode(inode_data.inode.ino, last);
+      if (!in || undef_inode) {
+        // fill in the existing placeholder inode, or create a new one
+        if (undef_inode && in)
+          in->first = first;
+        else
+          in = new CInode(cache, true, first, last);
+
+        in->inode = inode_data.inode;
+        // symlink?
+        if (in->is_symlink())
+          in->symlink = inode_data.symlink;
+
+        in->dirfragtree.swap(inode_data.dirfragtree);
+        in->xattrs.swap(inode_data.xattrs);
+        in->old_inodes.swap(inode_data.old_inodes);
+        if (!in->old_inodes.empty()) {
+          // the head inode must start after the newest old_inode
+          snapid_t min_first = in->old_inodes.rbegin()->first + 1;
+          if (min_first > in->first)
+            in->first = min_first;
+        }
+
+        in->oldest_snap = inode_data.oldest_snap;
+        in->decode_snap_blob(inode_data.snap_blob);
+        if (snaps && !in->snaprealm)
+          in->purge_stale_snap_data(*snaps);
+
+        if (!undef_inode) {
+          cache->add_inode(in); // add
+          dn = add_primary_dentry(dname, in, first, last); // link
+        }
+        dout(12) << "_fetched  got " << *dn << " " << *in << dendl;
+
+        if (in->inode.is_dirty_rstat())
+          in->mark_dirty_rstat();
+
+        //in->hack_accessed = false;
+        //in->hack_load_stamp = ceph_clock_now();
+        //num_new_inodes_loaded++;
+      } else if (g_conf().get_val<bool>("mds_hack_allow_loading_invalid_metadata")) {
+        dout(20) << "hack: adding duplicate dentry for " << *in << dendl;
+        dn = add_primary_dentry(dname, in, first, last);
+      } else {
+        // the inode is already cached elsewhere: report it and do not link
+        dout(0) << "_fetched  badness: got (but i already had) " << *in
+                << " mode " << in->inode.mode
+                << " mtime " << in->inode.mtime << dendl;
+        string dirpath, inopath;
+        this->inode->make_path_string(dirpath);
+        in->make_path_string(inopath);
+        cache->mds->clog->error() << "loaded dup inode " << inode_data.inode.ino
+          << " [" << first << "," << last << "] v" << inode_data.inode.version
+          << " at " << dirpath << "/" << dname
+          << ", but inode " << in->vino() << " v" << in->inode.version
+          << " already exists at " << inopath;
+        return dn;
+      }
+    }
+  } else {
+    std::ostringstream oss;
+    oss << "Invalid tag char '" << type << "' pos " << pos;
+    throw buffer::malformed_input(oss.str());
+  }
+
+  return dn;
+}
+
+void CDir::_omap_fetched(bufferlist& hdrbl, map<string, bufferlist>& omap,
+ bool complete, int r)
+{ /*
+ * Consume the result of an omap read of this dirfrag's object.
+ *
+ * hdrbl    - omap header; must decode to exactly one fnode_t
+ * omap     - fetched entries, keyed by encoded dentry key
+ * complete - when true the dirfrag is marked complete, STATE_FETCHING is
+ *            cleared and WAIT_COMPLETE waiters are woken
+ * r        - read result; only 0, -ENOENT and -ENODATA are tolerated
+ *
+ * A missing or undecodable header reports the whole dirfrag via go_bad();
+ * an undecodable dentry is reported via go_bad_dentry() and skipped.
+ */
+ LogChannelRef clog = cache->mds->clog;
+ dout(10) << "_fetched header " << hdrbl.length() << " bytes "
+ << omap.size() << " keys for " << *this << dendl;
+
+ ceph_assert(r == 0 || r == -ENOENT || r == -ENODATA);
+ ceph_assert(is_auth());
+ ceph_assert(!is_frozen());
+
+ if (hdrbl.length() == 0) { // no header at all: treat the object as missing
+ dout(0) << "_fetched missing object for " << *this << dendl;
+
+ clog->error() << "dir " << dirfrag() << " object missing on disk; some "
+ "files may be lost (" << get_path() << ")";
+
+ go_bad(complete);
+ return;
+ }
+
+ fnode_t got_fnode;
+ {
+ auto p = hdrbl.cbegin();
+ try {
+ decode(got_fnode, p);
+ } catch (const buffer::error &err) {
+ derr << "Corrupt fnode in dirfrag " << dirfrag()
+ << ": " << err << dendl;
+ clog->warn() << "Corrupt fnode header in " << dirfrag() << ": "
+ << err << " (" << get_path() << ")";
+ go_bad(complete);
+ return;
+ }
+ if (!p.end()) { // trailing bytes after the fnode also count as corruption
+ clog->warn() << "header buffer of dir " << dirfrag() << " has "
+ << hdrbl.length() - p.get_off() << " extra bytes ("
+ << get_path() << ")";
+ go_bad(complete);
+ return;
+ }
+ }
+
+ dout(10) << "_fetched version " << got_fnode.version << dendl;
+
+ // take the loaded fnode?
+ // only if we are a fresh CDir* with no prior state.
+ if (get_version() == 0) {
+ ceph_assert(!is_projected());
+ ceph_assert(!state_test(STATE_COMMITTING));
+ fnode = got_fnode;
+ projected_version = committing_version = committed_version = got_fnode.version;
+
+ if (state_test(STATE_REJOINUNDEF)) {
+ ceph_assert(cache->mds->is_rejoin());
+ state_clear(STATE_REJOINUNDEF);
+ cache->opened_undef_dirfrag(this);
+ }
+ }
+
+ list<CInode*> undef_inodes; // inodes still flagged REJOINUNDEF after loading
+
+ // purge stale snaps?
+ // only if we have past_parents open!
+ bool force_dirty = false;
+ const set<snapid_t> *snaps = NULL; // stays null when no snap purge applies
+ SnapRealm *realm = inode->find_snaprealm();
+ if (!realm->have_past_parents_open()) {
+ dout(10) << " no snap purge, one or more past parents NOT open" << dendl;
+ } else if (fnode.snap_purged_thru < realm->get_last_destroyed()) {
+ snaps = &realm->get_snaps();
+ dout(10) << " snap_purged_thru " << fnode.snap_purged_thru
+ << " < " << realm->get_last_destroyed()
+ << ", snap purge based on " << *snaps << dendl;
+ if (get_num_snap_items() == 0) { // nothing snapshotted in cache: just advance the marker
+ fnode.snap_purged_thru = realm->get_last_destroyed();
+ force_dirty = true;
+ }
+ }
+
+ unsigned pos = omap.size() - 1; // index of the current entry; the loop below walks backwards
+ for (map<string, bufferlist>::reverse_iterator p = omap.rbegin();
+ p != omap.rend();
+ ++p, --pos) {
+ string dname;
+ snapid_t last;
+ dentry_key_t::decode_helper(p->first, dname, last);
+
+ CDentry *dn = NULL;
+ try {
+ dn = _load_dentry(
+ p->first, dname, last, p->second, pos, snaps,
+ &force_dirty);
+ } catch (const buffer::error &err) {
+ cache->mds->clog->warn() << "Corrupt dentry '" << dname << "' in "
+ "dir frag " << dirfrag() << ": "
+ << err << "(" << get_path() << ")";
+
+ // Remember that this dentry is damaged. Subsequent operations
+ // that try to act directly on it will get their EIOs, but this
+ // dirfrag as a whole will continue to look okay (minus the
+ // mysteriously-missing dentry)
+ go_bad_dentry(last, dname);
+
+ // Anyone who was WAIT_DENTRY for this guy will get kicked
+ // to RetryRequest, and hit the DamageTable-interrogating path.
+ // Stats will now be bogus because we will think we're complete,
+ // but have 1 or more missing dentries.
+ continue;
+ }
+
+ if (!dn) // _load_dentry produced no dentry for this entry
+ continue;
+
+ CDentry::linkage_t *dnl = dn->get_linkage();
+ if (dnl->is_primary() && dnl->get_inode()->state_test(CInode::STATE_REJOINUNDEF))
+ undef_inodes.push_back(dnl->get_inode());
+
+ if (wanted_items.count(mempool::mds_co::string(dname)) > 0 || !complete) {
+ dout(10) << " touching wanted dn " << *dn << dendl;
+ inode->mdcache->touch_dentry(dn);
+ }
+ }
+
+ //cache->mds->logger->inc("newin", num_new_inodes_loaded);
+
+ // mark complete, !fetching
+ if (complete) {
+ wanted_items.clear();
+ mark_complete();
+ state_clear(STATE_FETCHING);
+
+ if (scrub_infop && scrub_infop->need_scrub_local) { // run the local scrub that was waiting for this fetch
+ scrub_infop->need_scrub_local = false;
+ scrub_local();
+ }
+ }
+
+ // open & force frags
+ while (!undef_inodes.empty()) {
+ CInode *in = undef_inodes.front();
+ undef_inodes.pop_front();
+ in->state_clear(CInode::STATE_REJOINUNDEF);
+ cache->opened_undef_inode(in);
+ }
+
+ // dirty myself to remove stale snap dentries
+ if (force_dirty && !inode->mdcache->is_readonly())
+ log_mark_dirty();
+
+ auth_unpin(this); // NOTE(review): presumably pairs with an auth_pin taken when the fetch started - confirm
+
+ if (complete) {
+ // kick waiters
+ finish_waiting(WAIT_COMPLETE, 0);
+ }
+}
+
+/**
+ * Report a single damaged dentry to the damage table.
+ *
+ * @param last   last snapid of the damaged dentry
+ * @param dname  name of the damaged dentry
+ *
+ * If the damage table treats the damage as fatal, the MDS is marked
+ * damaged and this call does not return.
+ */
+void CDir::go_bad_dentry(snapid_t last, std::string_view dname)
+{
+  dout(10) << __func__ << " " << dname << dendl;
+
+  // full path of the damaged entry: "<dir path>/<dname>"
+  std::string dentry_path(get_path());
+  dentry_path.append("/");
+  dentry_path.append(dname);
+
+  if (!cache->mds->damage_table.notify_dentry(
+        inode->ino(), frag, last, dname, dentry_path))
+    return;
+
+  cache->mds->damaged();
+  ceph_abort();  // unreachable, damaged() respawns us
+}
+
+/**
+ * Report this whole dirfrag to the damage table and fail the fetch.
+ *
+ * @param complete  when true, the dirfrag is also flagged STATE_BADFRAG
+ *                  and marked complete
+ *
+ * WAIT_COMPLETE waiters are always finished with -EIO. If the damage
+ * table treats the damage as fatal, the MDS is marked damaged and this
+ * call does not return.
+ */
+void CDir::go_bad(bool complete)
+{
+  dout(10) << __func__ << " " << frag << dendl;
+
+  if (cache->mds->damage_table.notify_dirfrag(inode->ino(), frag, get_path())) {
+    cache->mds->damaged();
+    ceph_abort();  // unreachable, damaged() respawns us
+  }
+
+  if (complete) {
+    // give a never-loaded dirfrag a non-zero version
+    if (!get_version())
+      set_version(1);
+
+    state_set(STATE_BADFRAG);
+    mark_complete();
+  }
+
+  state_clear(STATE_FETCHING);
+  auth_unpin(this);
+  finish_waiting(WAIT_COMPLETE, -EIO);
+}
+
+// -----------------------
+// COMMIT
+
+/**
+ * Request that this dirfrag be written to disk.
+ *
+ * @param want  minimum version the caller wants committed; 0 means the
+ *              current version
+ * @param c     completion to queue for that version; may be null
+ * @param ignore_authpinnability  skip the can_auth_pin() precondition
+ * @param op_prio  priority passed through to _commit()
+ */
+void CDir::commit(version_t want, MDSContext *c, bool ignore_authpinnability, int op_prio)
+{
+  dout(10) << "commit want " << want << " on " << *this << dendl;
+  if (!want)
+    want = get_version();
+
+  // preconditions
+  ceph_assert(get_version() == 0 || want <= get_version());  // can't commit the future
+  ceph_assert(committed_version < want);                     // nothing to do otherwise
+  ceph_assert(is_auth());
+  ceph_assert(ignore_authpinnability || can_auth_pin());
+
+  // always queue something, even a noop, so that the auth_pin below
+  // is matched by a waiter.
+  if (c == nullptr)
+    c = new C_MDSInternalNoop;
+
+  // the first waiter takes the auth_pin
+  const bool first_waiter = waiting_for_commit.empty();
+  if (first_waiter)
+    auth_pin(this);
+  waiting_for_commit[want].push_back(c);
+
+  // ok.
+  _commit(want, op_prio);
+}
+
+/**
+ * I/O completion for a dirfrag commit: passes the result and the version
+ * captured at construction to CDir::_committed().
+ */
+class C_IO_Dir_Committed : public CDirIOContext {
+  version_t version;
+public:
+  C_IO_Dir_Committed(CDir *d, version_t v)
+    : CDirIOContext(d), version(v) {}
+
+  void finish(int r) override { dir->_committed(r, version); }
+
+  void print(ostream& out) const override {
+    out << "dirfrag_commit(" << dir->dirfrag() << ")";
+  }
+};
+
+/**
+ * Flush out the modified dentries in this dir.  Keep the bufferlist
+ * below max_write_size;
+ *
+ * Dentry updates are sent in as many omap mutations as needed to stay
+ * under cache->max_dir_commit_size. The final mutation also carries the
+ * fnode header. All mutations share one gather whose completion is a
+ * C_IO_Dir_Committed for the current version.
+ *
+ * @param op_prio  priority for the write ops; a negative value selects
+ *                 CEPH_MSG_PRIO_DEFAULT
+ */
+void CDir::_omap_commit(int op_prio)
+{
+  dout(10) << __func__ << dendl;
+
+  unsigned max_write_size = cache->max_dir_commit_size;
+  unsigned write_size = 0;
+
+  if (op_prio < 0)
+    op_prio = CEPH_MSG_PRIO_DEFAULT;
+
+  // snap purge?
+  const set<snapid_t> *snaps = NULL;
+  SnapRealm *realm = inode->find_snaprealm();
+  if (!realm->have_past_parents_open()) {
+    dout(10) << " no snap purge, one or more past parents NOT open" << dendl;
+  } else if (fnode.snap_purged_thru < realm->get_last_destroyed()) {
+    snaps = &realm->get_snaps();
+    dout(10) << " snap_purged_thru " << fnode.snap_purged_thru
+             << " < " << realm->get_last_destroyed()
+             << ", snap purge based on " << *snaps << dendl;
+    // fnode.snap_purged_thru = realm->get_last_destroyed();
+  }
+
+  // pending batch: keys to delete and key/value pairs to write
+  set<string> to_remove;
+  map<string, bufferlist> to_set;
+
+  C_GatherBuilder gather(g_ceph_context,
+                         new C_OnFinisher(new C_IO_Dir_Committed(this,
+                                                                 get_version()),
+                                          cache->mds->finisher));
+
+  SnapContext snapc;
+  object_t oid = get_ondisk_object();
+  object_locator_t oloc(cache->mds->mdsmap->get_metadata_pool());
+
+  // stale keys noticed while fetching are removed with this commit
+  if (!stale_items.empty()) {
+    for (const auto &p : stale_items) {
+      to_remove.insert(std::string(p));
+      write_size += p.length();
+    }
+    stale_items.clear();
+  }
+
+  // add one dentry to the pending batch, sending the batch when it is full
+  auto write_one = [&](CDentry *dn) {
+    string key;
+    dn->key().encode(key);
+
+    if (dn->last != CEPH_NOSNAP &&
+        snaps && try_trim_snap_dentry(dn, *snaps)) {
+      dout(10) << " rm " << key << dendl;
+      write_size += key.length();
+      to_remove.insert(key);
+      return;
+    }
+
+    if (dn->get_linkage()->is_null()) {
+      dout(10) << " rm " << dn->get_name() << " " << *dn << dendl;
+      write_size += key.length();
+      to_remove.insert(key);
+    } else {
+      dout(10) << " set " << dn->get_name() << " " << *dn << dendl;
+      bufferlist dnbl;
+      _encode_dentry(dn, dnbl, snaps);
+      write_size += key.length() + dnbl.length();
+      to_set[key].swap(dnbl);
+    }
+
+    if (write_size >= max_write_size) {
+      ObjectOperation op;
+      op.priority = op_prio;
+
+      // don't create new dirfrag blindly
+      if (!is_new())
+        op.stat(NULL, (ceph::real_time*) NULL, NULL);
+
+      if (!to_set.empty())
+        op.omap_set(to_set);
+      if (!to_remove.empty())
+        op.omap_rm_keys(to_remove);
+
+      cache->mds->objecter->mutate(oid, oloc, op, snapc,
+                                   ceph::real_clock::now(),
+                                   0, gather.new_sub());
+
+      write_size = 0;
+      to_set.clear();
+      to_remove.clear();
+    }
+  };
+
+  if (state_test(CDir::STATE_FRAGMENTING) && is_new()) {
+    // a new fragment has nothing on disk yet: write every non-null dentry
+    ceph_assert(committed_version == 0);
+    for (auto p = items.begin(); p != items.end(); ) {
+      CDentry *dn = p->second;
+      ++p;
+      if (dn->get_linkage()->is_null())
+        continue;
+      write_one(dn);
+    }
+  } else {
+    // advance the iterator before write_one() is called on the dentry
+    for (auto p = dirty_dentries.begin(); !p.end(); ) {
+      CDentry *dn = *p;
+      ++p;
+      write_one(dn);
+    }
+  }
+
+  ObjectOperation op;
+  op.priority = op_prio;
+
+  // don't create new dirfrag blindly
+  if (!is_new())
+    op.stat(NULL, (ceph::real_time*)NULL, NULL);
+
+  /*
+   * save the header at the last moment.. If we were to send it off before other
+   * updates, but die before sending them all, we'd think that the on-disk state
+   * was fully committed even though it wasn't! However, since the messages are
+   * strictly ordered between the MDS and the OSD, and since messages to a given
+   * PG are strictly ordered, if we simply send the message containing the header
+   * off last, we cannot get our header into an incorrect state.
+   */
+  bufferlist header;
+  encode(fnode, header);
+  op.omap_set_header(header);
+
+  if (!to_set.empty())
+    op.omap_set(to_set);
+  if (!to_remove.empty())
+    op.omap_rm_keys(to_remove);
+
+  cache->mds->objecter->mutate(oid, oloc, op, snapc,
+                               ceph::real_clock::now(),
+                               0, gather.new_sub());
+
+  gather.activate();
+}
+
+/**
+ * Append the on-disk encoding of one dentry to @p bl.
+ *
+ * Layout: the dentry's first snapid, then a one-character marker and its
+ * payload. 'L' is followed by the remote ino and d_type; 'I' is followed
+ * by the bare-encoded inode including its snap blob.
+ *
+ * @param dn     dentry to encode; its NEW flag is cleared
+ * @param bl     output buffer
+ * @param snaps  snapids used to purge stale snap data from an inode that
+ *               has no snaprealm of its own; may be null
+ */
+void CDir::_encode_dentry(CDentry *dn, bufferlist& bl,
+                          const set<snapid_t> *snaps)
+{
+  // clear dentry NEW flag, if any.  we can no longer silently drop it.
+  dn->clear_new();
+
+  encode(dn->first, bl);
+
+  auto &link = dn->linkage;
+
+  if (link.is_remote()) {
+    const inodeno_t ino = link.get_remote_ino();
+    const unsigned char d_type = link.get_remote_d_type();
+    dout(14) << " pos " << bl.length() << " dn '" << dn->get_name() << "' remote ino " << ino << dendl;
+
+    // marker, then ino and d_type
+    bl.append('L');         // remote link
+    encode(ino, bl);
+    encode(d_type, bl);
+    return;
+  }
+
+  if (!link.is_primary()) {
+    // neither remote nor primary: a null dentry must never get here
+    ceph_assert(!link.is_null());
+    return;
+  }
+
+  // primary link
+  CInode *in = link.get_inode();
+  ceph_assert(in);
+
+  dout(14) << " pos " << bl.length() << " dn '" << dn->get_name() << "' inode " << *in << dendl;
+
+  // marker, then the inode itself
+  bl.append('I');         // inode
+
+  if (in->is_multiversion()) {
+    if (in->snaprealm) {
+      if (in->snaprealm->have_past_parents_open())
+        in->purge_stale_snap_data(in->snaprealm->get_snaps());
+    } else if (snaps) {
+      in->purge_stale_snap_data(*snaps);
+    }
+  }
+
+  bufferlist snap_blob;
+  in->encode_snap_blob(snap_blob);
+  in->encode_bare(bl, cache->mds->mdsmap->get_up_features(), &snap_blob);
+}
+
+/**
+ * Start writing this dirfrag to disk unless a commit that already
+ * covers @p want has finished or is in flight.
+ *
+ * @param want     minimum version that must end up committed
+ * @param op_prio  priority passed through to _omap_commit()
+ */
+void CDir::_commit(version_t want, int op_prio)
+{
+  dout(10) << "_commit want " << want << " on " << *this << dendl;
+
+  // we can't commit things in the future.
+  // (even the projected future.)
+  ceph_assert(get_version() == 0 || want <= get_version());
+
+  // only the auth may commit.
+  ceph_assert(is_auth());
+
+  // nothing to do: that version is already on disk
+  if (want <= committed_version) {
+    dout(10) << "already committed " << committed_version << " >= " << want << dendl;
+    return;
+  }
+
+  // nothing to do: the in-flight commit already covers it
+  if (want <= committing_version) {
+    dout(10) << "already committing " << committing_version << " >= " << want << dendl;
+    ceph_assert(state_test(STATE_COMMITTING));
+    return;
+  }
+
+  // an older commit is still in flight; wait for it to finish first
+  if (committed_version < committing_version) {
+    dout(10) << "already committing older " << committing_version << ", waiting for that to finish" << dendl;
+    return;
+  }
+
+  // commit whatever the current version is.
+  committing_version = get_version();
+
+  // mark committing (if not already)
+  if (!state_test(STATE_COMMITTING)) {
+    dout(10) << "marking committing" << dendl;
+    state_set(STATE_COMMITTING);
+  }
+
+  if (cache->mds->logger)
+    cache->mds->logger->inc(l_mds_dir_commit);
+
+  _omap_commit(op_prio);
+}
+
+
+/**
+ * _committed
+ *
+ * Completion of a dirfrag commit: records the committed version, marks
+ * the dirfrag and the dentries/inodes it covers clean, and wakes the
+ * waiters whose version is now on disk.
+ *
+ * @param r result of the write; a negative value other than the tolerated
+ *          -ENOENT case below goes to handle_write_error()
+ * @param v version i just committed
+ */
+void CDir::_committed(int r, version_t v)
+{
+ if (r < 0) {
+ // the directory could be partly purged during MDS failover
+ if (r == -ENOENT && committed_version == 0 &&
+ !inode->is_base() && get_parent_dir()->inode->is_stray()) {
+ r = 0;
+ if (inode->snaprealm)
+ inode->state_set(CInode::STATE_MISSINGOBJS);
+ }
+ if (r < 0) {
+ dout(1) << "commit error " << r << " v " << v << dendl;
+ cache->mds->clog->error() << "failed to commit dir " << dirfrag() << " object,"
+ << " errno " << r;
+ cache->mds->handle_write_error(r);
+ return;
+ }
+ }
+
+ dout(10) << "_committed v " << v << " on " << *this << dendl;
+ ceph_assert(is_auth());
+
+ bool stray = inode->is_stray();
+
+ // take note.
+ ceph_assert(v > committed_version);
+ ceph_assert(v <= committing_version);
+ committed_version = v;
+
+ // _all_ commits done?
+ if (committing_version == committed_version)
+ state_clear(CDir::STATE_COMMITTING);
+
+ // _any_ commit, even if we've been redirtied, means we're no longer new.
+ item_new.remove_myself();
+
+ // dir clean?
+ if (committed_version == get_version())
+ mark_clean();
+
+ // dentries clean?
+ for (auto p = dirty_dentries.begin(); !p.end(); ) {
+ CDentry *dn = *p;
+ ++p; // advance first: dn may be marked clean or removed below
+
+ // inode?
+ if (dn->linkage.is_primary()) {
+ CInode *in = dn->linkage.get_inode();
+ ceph_assert(in);
+ ceph_assert(in->is_auth());
+
+ if (committed_version >= in->get_version()) {
+ if (in->is_dirty()) {
+ dout(15) << " dir " << committed_version << " >= inode " << in->get_version() << " now clean " << *in << dendl;
+ in->mark_clean();
+ }
+ } else {
+ dout(15) << " dir " << committed_version << " < inode " << in->get_version() << " still dirty " << *in << dendl;
+ ceph_assert(in->is_dirty() || in->last < CEPH_NOSNAP); // special case for cow snap items (not predirtied)
+ }
+ }
+
+ // dentry
+ if (committed_version >= dn->get_version()) {
+ dout(15) << " dir " << committed_version << " >= dn " << dn->get_version() << " now clean " << *dn << dendl;
+ dn->mark_clean();
+
+ // drop clean null stray dentries immediately
+ if (stray &&
+ dn->get_num_ref() == 0 &&
+ !dn->is_projected() &&
+ dn->get_linkage()->is_null())
+ remove_dentry(dn);
+ } else {
+ dout(15) << " dir " << committed_version << " < dn " << dn->get_version() << " still dirty " << *dn << dendl;
+ ceph_assert(dn->is_dirty());
+ }
+ }
+
+ // finishers?
+ bool were_waiters = !waiting_for_commit.empty();
+
+ auto it = waiting_for_commit.begin();
+ while (it != waiting_for_commit.end()) {
+ auto _it = it;
+ ++_it; // remember the successor, since `it` is erased below
+ if (it->first > committed_version) {
+ dout(10) << " there are waiters for " << it->first << ", committing again" << dendl;
+ _commit(it->first, -1); // -1: _omap_commit() maps a negative priority to the default
+ break;
+ }
+ MDSContext::vec t;
+ for (const auto &waiter : it->second)
+ t.push_back(waiter);
+ cache->mds->queue_waiters(t);
+ waiting_for_commit.erase(it);
+ it = _it;
+ }
+
+ // try drop dentries in this dirfrag if it's about to be purged
+ if (!inode->is_base() && get_parent_dir()->inode->is_stray() &&
+ inode->snaprealm)
+ cache->maybe_eval_stray(inode, true);
+
+ // unpin if we kicked the last waiter.
+ if (were_waiters &&
+ waiting_for_commit.empty())
+ auth_unpin(this);
+}
+
+
+
+
+// IMPORT/EXPORT
+
+void CDir::encode_export(bufferlist& bl)
+{ /*
+ * Serialize this dirfrag's state into bl for export.
+ * The field order below is the order in which decode_import() reads the
+ * fields back, so the two functions must be changed together.
+ * Also takes a PIN_TEMPEXPORTING reference, which finish_export() drops.
+ */
+ ceph_assert(!is_projected()); // must not be called while projected
+ encode(first, bl);
+ encode(fnode, bl);
+ encode(dirty_old_rstat, bl);
+ encode(committed_version, bl);
+
+ encode(state, bl);
+ encode(dir_rep, bl);
+
+ encode(pop_me, bl);
+ encode(pop_auth_subtree, bl);
+
+ encode(dir_rep_by, bl);
+ encode(get_replicas(), bl);
+
+ get(PIN_TEMPEXPORTING);
+}
+
+/**
+ * Finish an export: keep only the state bits in MASK_STATE_EXPORT_KEPT,
+ * remove this dirfrag's popularity from the nested counters and reset it,
+ * drop the PIN_TEMPEXPORTING reference and forget dirty_old_rstat.
+ */
+void CDir::finish_export()
+{
+  state = state & MASK_STATE_EXPORT_KEPT;
+
+  // subtract our share from the nested totals before zeroing it
+  pop_nested.sub(pop_auth_subtree);
+  pop_auth_subtree_nested.sub(pop_auth_subtree);
+  pop_auth_subtree.zero();
+  pop_me.zero();
+
+  put(PIN_TEMPEXPORTING);
+  dirty_old_rstat.clear();
+}
+
+void CDir::decode_import(bufferlist::const_iterator& blp, LogSegment *ls)
+{ /*
+ * Rebuild this dirfrag's state from the encoding written by
+ * encode_export() and make this MDS the auth for it.
+ *
+ * blp - iterator positioned at the start of the exported dirfrag state
+ * ls  - log segment that records the imported dirty state
+ */
+ decode(first, blp);
+ decode(fnode, blp);
+ decode(dirty_old_rstat, blp);
+ projected_version = fnode.version;
+ decode(committed_version, blp);
+ committing_version = committed_version;
+
+ unsigned s;
+ decode(s, blp);
+ state &= MASK_STATE_IMPORT_KEPT; // keep only the local bits that survive an import
+ state_set(STATE_AUTH | (s & MASK_STATE_EXPORTED)); // and adopt the exported bits
+
+ if (is_dirty()) {
+ get(PIN_DIRTY);
+ _mark_dirty(ls);
+ }
+
+ decode(dir_rep, blp);
+
+ decode(pop_me, blp);
+ decode(pop_auth_subtree, blp);
+ pop_nested.add(pop_auth_subtree);
+ pop_auth_subtree_nested.add(pop_auth_subtree);
+
+ decode(dir_rep_by, blp);
+ decode(get_replicas(), blp);
+ if (is_replicated()) get(PIN_REPLICATED);
+
+ replica_nonce = 0; // no longer defined
+
+ // did we import some dirty scatterlock data?
+ if (dirty_old_rstat.size() ||
+ !(fnode.rstat == fnode.accounted_rstat)) {
+ cache->mds->locker->mark_updated_scatterlock(&inode->nestlock);
+ ls->dirty_dirfrag_nest.push_back(&inode->item_dirty_dirfrag_nest);
+ }
+ if (!(fnode.fragstat == fnode.accounted_fragstat)) {
+ cache->mds->locker->mark_updated_scatterlock(&inode->filelock);
+ ls->dirty_dirfrag_dir.push_back(&inode->item_dirty_dirfrag_dir);
+ }
+ if (is_dirty_dft()) {
+ if (inode->dirfragtreelock.get_state() != LOCK_MIX &&
+ inode->dirfragtreelock.is_stable()) {
+ // clear stale dirtydft
+ state_clear(STATE_DIRTYDFT);
+ } else {
+ cache->mds->locker->mark_updated_scatterlock(&inode->dirfragtreelock);
+ ls->dirty_dirfrag_dirfragtree.push_back(&inode->item_dirty_dirfrag_dirfragtree);
+ }
+ }
+}
+
+/**
+ * Undo an import: give up authority, drop replica and bloom state, mark
+ * the dirfrag clean if it was dirty, and remove its popularity from the
+ * nested counters.
+ */
+void CDir::abort_import()
+{
+  ceph_assert(is_auth());
+
+  // we are no longer the auth
+  state_clear(STATE_AUTH);
+  remove_bloom();
+  clear_replica_map();
+  set_replica_nonce(EXPORT_NONCE);
+
+  if (is_dirty()) {
+    mark_clean();
+  }
+
+  // subtract our share from the nested totals before zeroing it
+  pop_nested.sub(pop_auth_subtree);
+  pop_auth_subtree_nested.sub(pop_auth_subtree);
+  pop_me.zero();
+  pop_auth_subtree.zero();
+}
+
+/**
+ * Encode a DirStat for a client reply.
+ *
+ * Sessions with CEPHFS_FEATURE_REPLY_ENCODING get the fields wrapped in a
+ * versioned ENCODE_START/ENCODE_FINISH envelope; other sessions get the
+ * bare fields.
+ *
+ * @param bl    output buffer
+ * @param info  session whose features select the format
+ * @param ds    frag, auth and dist to encode
+ */
+void CDir::encode_dirstat(bufferlist& bl, const session_info_t& info, const DirStat& ds) {
+  // the payload is the same in both formats
+  auto encode_fields = [&bl, &ds] {
+    encode(ds.frag, bl);
+    encode(ds.auth, bl);
+    encode(ds.dist, bl);
+  };
+
+  if (!info.has_feature(CEPHFS_FEATURE_REPLY_ENCODING)) {
+    encode_fields();
+    return;
+  }
+
+  ENCODE_START(1, 1, bl);
+  encode_fields();
+  ENCODE_FINISH(bl);
+}
+
+/********************************
+ * AUTHORITY
+ */
+
+/*
+ * Authority for this dirfrag.
+ *
+ * A subtree root answers with its own dir_auth; any other dirfrag
+ * answers with the authority of its inode.
+ */
+mds_authority_t CDir::authority() const
+{
+  return is_subtree_root() ? dir_auth : inode->authority();
+}
+
+/** is_subtree_root()
+ * true if this is an auth delegation point.
+ * that is, dir_auth != default (parent,unknown)
+ *
+ * some key observations:
+ * if i am auth:
+ * - any region bound will be an export, or frozen.
+ *
+ * note that this DOES heed dir_auth.pending
+ */
+/*
+bool CDir::is_subtree_root()
+{
+ if (dir_auth == CDIR_AUTH_DEFAULT) {
+ //dout(10) << "is_subtree_root false " << dir_auth << " != " << CDIR_AUTH_DEFAULT
+ //<< " on " << ino() << dendl;
+ return false;
+ } else {
+ //dout(10) << "is_subtree_root true " << dir_auth << " != " << CDIR_AUTH_DEFAULT
+ //<< " on " << ino() << dendl;
+ return true;
+ }
+}
+*/
+
+/** contains(x)
+ * true if we are x, or an ancestor of x.
+ *
+ * Walks upward from x through the projected parent dirfrags until it
+ * either reaches this dirfrag or runs out of parents.
+ */
+bool CDir::contains(CDir *x)
+{
+  do {
+    if (x == this)
+      return true;
+    x = x->get_inode()->get_projected_parent_dir();
+  } while (x != nullptr);
+  return false;
+}
+
+
+
+/** set_dir_auth
+ *
+ * Replace dir_auth with @p a and do the bookkeeping that follows:
+ *  - when this dirfrag starts or stops being a subtree root, adjust the
+ *    inode's num_subtree_roots and the auth pin a frozen dirfrag holds on
+ *    its inode;
+ *  - when the authority stops being ambiguous, wake WAIT_SINGLEAUTH waiters.
+ */
+void CDir::set_dir_auth(const mds_authority_t &a)
+{
+  dout(10) << "setting dir_auth=" << a
+           << " from " << dir_auth
+           << " on " << *this << dendl;
+
+  // sample the old state before overwriting it
+  const bool was_subtree = is_subtree_root();
+  const bool was_ambiguous = dir_auth.second >= 0;
+
+  // set it.
+  dir_auth = a;
+
+  const bool now_subtree = is_subtree_root();
+
+  if (now_subtree && !was_subtree) {
+    // new subtree root
+    dout(10) << " new subtree root, adjusting auth_pins" << dendl;
+
+    if (freeze_tree_state) {
+      // only by CDir::_freeze_tree()
+      ceph_assert(is_freezing_tree_root());
+    }
+
+    inode->num_subtree_roots++;
+
+    // unpin parent of frozen dir/tree?
+    if (inode->is_auth()) {
+      ceph_assert(!is_frozen_tree_root());
+      if (is_frozen_dir())
+        inode->auth_unpin(this);
+    }
+  } else if (was_subtree && !now_subtree) {
+    // no longer a subtree root
+    dout(10) << " old subtree root, adjusting auth_pins" << dendl;
+
+    inode->num_subtree_roots--;
+
+    // pin parent of frozen dir/tree?
+    if (inode->is_auth()) {
+      ceph_assert(!is_frozen_tree_root());
+      if (is_frozen_dir())
+        inode->auth_pin(this);
+    }
+  }
+
+  // newly single auth?
+  if (was_ambiguous && dir_auth.second == CDIR_AUTH_UNKNOWN) {
+    MDSContext::vec waiters;
+    take_waiting(WAIT_SINGLEAUTH, waiters);
+    cache->mds->queue_waiters(waiters);
+  }
+}
+
+/*****************************************
+ * AUTH PINS and FREEZING
+ *
+ * the basic plan is that auth_pins only exist in auth regions, and they
+ * prevent a freeze (and subsequent auth change).
+ *
+ * however, we also need to prevent a parent from freezing if a child is frozen.
+ * for that reason, the parent inode of a frozen directory is auth_pinned.
+ *
+ * the oddity is when the frozen directory is a subtree root. if that's the case,
+ * the parent inode isn't frozen. which means that when subtree authority is adjusted
+ * at the bounds, inodes for any frozen bound directories need to get auth_pins at that
+ * time.
+ *
+ */
+
+/**
+ * Take one auth pin on this dirfrag.
+ *
+ * The first pin also takes a PIN_AUTHPIN reference. While a tree freeze
+ * is tracked for this dirfrag, its pin counter is increased as well.
+ *
+ * @param by  identity of the pinner (logged; recorded when
+ *            MDS_AUTHPIN_SET is defined)
+ */
+void CDir::auth_pin(void *by)
+{
+  if (!auth_pins)
+    get(PIN_AUTHPIN);
+  ++auth_pins;
+
+#ifdef MDS_AUTHPIN_SET
+  auth_pin_set.insert(by);
+#endif
+
+  dout(10) << "auth_pin by " << by << " on " << *this << " count now " << auth_pins << dendl;
+
+  if (freeze_tree_state) {
+    ++freeze_tree_state->auth_pins;
+  }
+}
+
+/**
+ * Release one auth pin on this dirfrag.
+ *
+ * Dropping the last pin also drops the PIN_AUTHPIN reference. While a
+ * tree freeze is tracked for this dirfrag, its pin counter is decreased
+ * as well. Finally a pending freeze is given the chance to complete.
+ *
+ * @param by  identity of the pinner (logged; checked when
+ *            MDS_AUTHPIN_SET is defined)
+ */
+void CDir::auth_unpin(void *by)
+{
+  --auth_pins;
+
+#ifdef MDS_AUTHPIN_SET
+  {
+    auto pinner = auth_pin_set.find(by);
+    ceph_assert(pinner != auth_pin_set.end());
+    auth_pin_set.erase(pinner);
+  }
+#endif
+  if (!auth_pins)
+    put(PIN_AUTHPIN);
+
+  dout(10) << "auth_unpin by " << by << " on " << *this << " count now " << auth_pins << dendl;
+  ceph_assert(auth_pins >= 0);
+
+  if (freeze_tree_state) {
+    --freeze_tree_state->auth_pins;
+  }
+
+  maybe_finish_freeze();  // pending freeze?
+}
+
+/**
+ * Adjust dir_auth_pins by @p dirinc.
+ *
+ * @param dirinc  non-zero delta; a decrease may let a pending freeze finish
+ * @param by      identity of the caller (logged only)
+ */
+void CDir::adjust_nested_auth_pins(int dirinc, void *by)
+{
+  ceph_assert(dirinc);
+  dir_auth_pins += dirinc;
+
+  dout(15) << __func__ << " " << dirinc << " on " << *this
+           << " by " << by << " count now "
+           << auth_pins << "/" << dir_auth_pins << dendl;
+  ceph_assert(dir_auth_pins >= 0);
+
+  // a tracked tree freeze counts these pins too
+  if (freeze_tree_state) {
+    freeze_tree_state->auth_pins += dirinc;
+  }
+
+  const bool released = dirinc < 0;
+  if (released)
+    maybe_finish_freeze();  // pending freeze?
+}
+
+#ifdef MDS_VERIFY_FRAGSTAT
+void CDir::verify_fragstat()
+{ /*
+ * Debug check, compiled only when MDS_VERIFY_FRAGSTAT is defined.
+ * Recounts the files and subdirectories among the cached dentries and
+ * aborts if the counts disagree with fnode.fragstat. Stray directories
+ * are not checked.
+ * NOTE(review): this code calls accessors directly on CDentry
+ * (is_null(), is_primary(), inode, ...) - confirm it still compiles
+ * before enabling the macro.
+ */
+ ceph_assert(is_complete()); // the count is only meaningful with every dentry loaded
+ if (inode->is_stray())
+ return;
+
+ frag_info_t c; // locally recomputed counts
+ memset(&c, 0, sizeof(c));
+
+ for (auto it = items.begin();
+ it != items.end();
+ ++it) {
+ CDentry *dn = it->second;
+ if (dn->is_null())
+ continue;
+
+ dout(10) << " " << *dn << dendl;
+ if (dn->is_primary())
+ dout(10) << " " << *dn->inode << dendl;
+
+ if (dn->is_primary()) {
+ if (dn->inode->is_dir())
+ c.nsubdirs++;
+ else
+ c.nfiles++;
+ }
+ if (dn->is_remote()) { // remote links are classified by their recorded d_type
+ if (dn->get_remote_d_type() == DT_DIR)
+ c.nsubdirs++;
+ else
+ c.nfiles++;
+ }
+ }
+
+ if (c.nsubdirs != fnode.fragstat.nsubdirs ||
+ c.nfiles != fnode.fragstat.nfiles) {
+ dout(0) << "verify_fragstat failed " << fnode.fragstat << " on " << *this << dendl;
+ dout(0) << " i count " << c << dendl;
+ ceph_abort();
+ } else {
+ dout(0) << "verify_fragstat ok " << fnode.fragstat << " on " << *this << dendl;
+ }
+}
+#endif
+
+/*****************************************************************************
+ * FREEZING
+ */
+
+// FREEZE TREE
+
+/**
+ * Breadth-first walk over the dirfrags nested beneath this one.
+ *
+ * For every primary dentry that links a directory inode, @p callback is
+ * called on each dirfrag returned by get_nested_dirfrags(). The walk
+ * descends into a dirfrag only when the callback returns true for it.
+ * The callback is never called on this dirfrag itself.
+ */
+void CDir::_walk_tree(std::function<bool(CDir*)> callback)
+{
+  deque<CDir*> pending;
+  pending.push_back(this);
+
+  vector<CDir*> nested;  // scratch list, reused for every directory inode
+  do {
+    CDir *cur = pending.front();
+    pending.pop_front();
+
+    for (auto& entry : *cur) {
+      CDentry::linkage_t *dnl = entry.second->get_linkage();
+      if (!dnl->is_primary())
+        continue;
+      CInode *in = dnl->get_inode();
+      if (!in->is_dir())
+        continue;
+
+      in->get_nested_dirfrags(nested);
+      for (CDir *sub : nested) {
+        if (callback(sub))
+          pending.push_back(sub);
+      }
+      nested.clear();
+    }
+  } while (!pending.empty());
+}
+
+/**
+ * Begin freezing the subtree rooted at this dirfrag.
+ *
+ * @return true if the tree could be frozen immediately; false if the
+ *         dirfrag was left in STATE_FREEZINGTREE
+ */
+bool CDir::freeze_tree()
+{
+  ceph_assert(!is_frozen());
+  ceph_assert(!is_freezing());
+  ceph_assert(!freeze_tree_state);
+
+  auth_pin(this);
+
+  // Traverse the subtree to mark dirfrags as 'freezing' (set
+  // freeze_tree_state) and to accumulate their auth pins into a total kept
+  // in freeze_tree_state. Whenever a 'freezing' object is auth-unpinned,
+  // that counter is decreased too. The subtree becomes 'frozen' when the
+  // counter reaches zero.
+  freeze_tree_state = std::make_shared<freeze_tree_state_t>(this);
+  freeze_tree_state->auth_pins += get_auth_pins() + get_dir_auth_pins();
+
+  _walk_tree([this](CDir *sub) {
+      // already tracked by a freeze: neither claim it nor descend into it
+      if (sub->freeze_tree_state)
+        return false;
+      sub->freeze_tree_state = freeze_tree_state;
+      freeze_tree_state->auth_pins += sub->get_auth_pins() + sub->get_dir_auth_pins();
+      return true;
+    }
+  );
+
+  if (!is_freezeable(true)) {
+    state_set(STATE_FREEZINGTREE);
+    ++num_freezing_trees;
+    dout(10) << "freeze_tree waiting " << *this << dendl;
+    return false;
+  }
+
+  _freeze_tree();
+  auth_unpin(this);
+  return true;
+}
+
+/**
+ * Mark the subtree rooted at this dirfrag as frozen.
+ *
+ * Requires is_freezeable(true).  Sets the shared freeze state's `frozen`
+ * flag, adjusts subtree authority when this dirfrag is auth, moves the
+ * dirfrag from STATE_FREEZINGTREE (if set) to STATE_FROZENTREE and takes
+ * PIN_FROZEN.
+ */
+void CDir::_freeze_tree()
+{
+ dout(10) << __func__ << " " << *this << dendl;
+ ceph_assert(is_freezeable(true));
+
+ // an existing freeze state implies we are auth; without one we must be
+ // non-auth, and a fresh state is created here
+ if (freeze_tree_state) {
+ ceph_assert(is_auth());
+ } else {
+ ceph_assert(!is_auth());
+ freeze_tree_state = std::make_shared<freeze_tree_state_t>(this);
+ }
+ freeze_tree_state->frozen = true;
+
+ if (is_auth()) {
+ mds_authority_t auth;
+ bool was_subtree = is_subtree_root();
+ if (was_subtree) {
+ auth = get_dir_auth();
+ } else {
+ // temporarily prevent parent subtree from becoming frozen.
+ inode->auth_pin(this);
+ // create new subtree
+ auth = authority();
+ }
+
+ // dirfrags that do not share our freeze state get the pre-freeze authority
+ // applied via adjust_subtree_auth() and are not descended into
+ _walk_tree([this, &auth] (CDir *dir) {
+ if (dir->freeze_tree_state != freeze_tree_state) {
+ inode->mdcache->adjust_subtree_auth(dir, auth);
+ return false;
+ }
+ return true;
+ }
+ );
+
+ // set auth.second equal to auth.first for this dirfrag; unfreeze_tree()
+ // asserts exactly this shape before resetting it
+ ceph_assert(auth.first >= 0);
+ ceph_assert(auth.second == CDIR_AUTH_UNKNOWN);
+ auth.second = auth.first;
+ inode->mdcache->adjust_subtree_auth(this, auth);
+ if (!was_subtree)
+ inode->auth_unpin(this);
+ } else {
+ // importing subtree ?
+ // every nested dirfrag must be untagged; tag them all with our state
+ _walk_tree([this] (CDir *dir) {
+ ceph_assert(!dir->freeze_tree_state);
+ dir->freeze_tree_state = freeze_tree_state;
+ return true;
+ }
+ );
+ }
+
+ // twiddle state
+ if (state_test(STATE_FREEZINGTREE)) {
+ state_clear(STATE_FREEZINGTREE);
+ --num_freezing_trees;
+ }
+
+ state_set(STATE_FROZENTREE);
+ ++num_frozen_trees;
+ get(PIN_FROZEN);
+}
+
+/**
+ * Undo freeze_tree(): either unfreeze a frozen tree or cancel a freeze that
+ * is still in progress.
+ *
+ * WAIT_UNFREEZE waiters of this dirfrag and of every nested dirfrag sharing
+ * its freeze state are collected up front and queued only at the very end,
+ * after all state has been updated.
+ */
+void CDir::unfreeze_tree()
+{
+ dout(10) << __func__ << " " << *this << dendl;
+
+ MDSContext::vec unfreeze_waiters;
+ take_waiting(WAIT_UNFREEZE, unfreeze_waiters);
+
+ if (freeze_tree_state) {
+ // detach every nested dirfrag that shares our freeze state and gather its
+ // waiters; dirfrags with a different state are skipped and not descended into
+ _walk_tree([this, &unfreeze_waiters](CDir *dir) {
+ if (dir->freeze_tree_state != freeze_tree_state)
+ return false;
+ dir->freeze_tree_state.reset();
+ dir->take_waiting(WAIT_UNFREEZE, unfreeze_waiters);
+ return true;
+ }
+ );
+ }
+
+ if (state_test(STATE_FROZENTREE)) {
+ // frozen. unfreeze.
+ state_clear(STATE_FROZENTREE);
+ --num_frozen_trees;
+
+ put(PIN_FROZEN);
+
+ if (is_auth()) {
+ // must be subtree
+ ceph_assert(is_subtree_root());
+ // for debug purpose, caller should ensure 'dir_auth.second == dir_auth.first'
+ mds_authority_t auth = get_dir_auth();
+ ceph_assert(auth.first >= 0);
+ ceph_assert(auth.second == auth.first);
+ // reset auth.second to CDIR_AUTH_UNKNOWN (it was made equal to auth.first in _freeze_tree())
+ auth.second = CDIR_AUTH_UNKNOWN;
+ inode->mdcache->adjust_subtree_auth(this, auth);
+ }
+ freeze_tree_state.reset();
+ } else {
+ ceph_assert(state_test(STATE_FREEZINGTREE));
+
+ // freezing. stop it.
+ state_clear(STATE_FREEZINGTREE);
+ --num_freezing_trees;
+ freeze_tree_state.reset();
+
+ // WAIT_FROZEN waiters are finished with -1, then the auth pin that
+ // freeze_tree() kept is released
+ finish_waiting(WAIT_FROZEN, -1);
+ auth_unpin(this);
+ }
+
+ cache->mds->queue_waiters(unfreeze_waiters);
+}
+
+/**
+ * Detach `dir` (and the dirfrags below it) from this dirfrag's in-progress
+ * tree freeze when `dir` no longer sits under a dirfrag that shares the
+ * freeze state.
+ *
+ * Does nothing unless `dir` shares this dirfrag's freeze_tree_state while the
+ * parent dirfrag of `dir`'s inode is neither this dirfrag nor tagged with
+ * the same state.  For each detached dirfrag the auth pins it contributed are
+ * subtracted from the shared counter, its freeze state is cleared and its
+ * WAIT_UNFREEZE waiters are queued.
+ *
+ * @param dir dirfrag to re-evaluate
+ */
+void CDir::adjust_freeze_after_rename(CDir *dir)
+{
+ if (!freeze_tree_state || dir->freeze_tree_state != freeze_tree_state)
+ return;
+ CDir *newdir = dir->get_inode()->get_parent_dir();
+ if (newdir == this || newdir->freeze_tree_state == freeze_tree_state)
+ return;
+
+ // only a freeze that is still in progress can be adjusted
+ ceph_assert(!freeze_tree_state->frozen);
+ ceph_assert(get_dir_auth_pins() > 0);
+
+ MDSContext::vec unfreeze_waiters;
+
+ // returns false (stop descending) for dirfrags that do not share our state
+ auto unfreeze = [this, &unfreeze_waiters](CDir *dir) {
+ if (dir->freeze_tree_state != freeze_tree_state)
+ return false;
+ int dec = dir->get_auth_pins() + dir->get_dir_auth_pins();
+ // shouldn't become zero because srcdn of rename was auth pinned
+ ceph_assert(freeze_tree_state->auth_pins > dec);
+ freeze_tree_state->auth_pins -= dec;
+ dir->freeze_tree_state.reset();
+ dir->take_waiting(WAIT_UNFREEZE, unfreeze_waiters);
+ return true;
+ };
+
+ // _walk_tree() never visits its starting dirfrag, so handle `dir` explicitly
+ unfreeze(dir);
+ dir->_walk_tree(unfreeze);
+
+ cache->mds->queue_waiters(unfreeze_waiters);
+}
+
+/**
+ * Tell whether this dirfrag may currently be auth pinned.
+ *
+ * @param err_ret optional; written only when the answer is "no", with one of
+ *                ERR_NOT_AUTH, ERR_FRAGMENTING_DIR or ERR_EXPORTING_TREE
+ * @returns true if auth pinning is allowed
+ */
+bool CDir::can_auth_pin(int *err_ret) const
+{
+  int reason = 0;
+  if (!is_auth()) {
+    reason = ERR_NOT_AUTH;
+  } else if (is_freezing_dir() || is_frozen_dir()) {
+    reason = ERR_FRAGMENTING_DIR;
+  } else {
+    const auto tree_state = is_freezing_or_frozen_tree();
+    if (tree_state.first || tree_state.second)
+      reason = ERR_EXPORTING_TREE;
+  }
+
+  if (reason == 0)
+    return true;
+
+  if (err_ret)
+    *err_ret = reason;
+  return false;
+}
+
+/**
+ * Completion that releases the auth pin a dirfrag holds with its own inode
+ * as the pinning party (see the auth_pin(inode) call in
+ * CDir::maybe_finish_freeze()).
+ */
+class C_Dir_AuthUnpin : public CDirContext {
+public:
+  explicit C_Dir_AuthUnpin(CDir *target) : CDirContext(target) {}
+
+  // the completion code is not inspected
+  void finish(int r) override {
+    CInode *owner = dir->get_inode();
+    dir->auth_unpin(owner);
+  }
+};
+
+/**
+ * Complete a pending dir freeze and/or tree freeze if the remaining pins
+ * allow it.
+ *
+ * Nothing happens while dir_auth_pins is non-zero.  A freezing dir is
+ * completed once auth_pins is exactly 1; a freezing tree is completed once
+ * the shared freeze state's auth_pins counter is exactly 1.  In both cases
+ * that last pin is released right after freezing.
+ */
+void CDir::maybe_finish_freeze()
+{
+ if (dir_auth_pins != 0)
+ return;
+
+ // we can freeze the _dir_ even with nested pins...
+ if (state_test(STATE_FREEZINGDIR)) {
+ if (auth_pins == 1) {
+ _freeze_dir();
+ auth_unpin(this);
+ finish_waiting(WAIT_FROZEN);
+ }
+ }
+
+ if (freeze_tree_state) {
+ // already frozen, or pins other than the single remaining one are still held
+ if (freeze_tree_state->frozen ||
+ freeze_tree_state->auth_pins != 1)
+ return;
+
+ // the freeze is driven by the dirfrag recorded in the shared state;
+ // delegate to it when that is not us
+ if (freeze_tree_state->dir != this) {
+ freeze_tree_state->dir->maybe_finish_freeze();
+ return;
+ }
+
+ ceph_assert(state_test(STATE_FREEZINGTREE));
+
+ if (!is_subtree_root() && inode->is_frozen()) {
+ // NOTE(review): this streams the `inode` pointer itself, not `*inode` -- confirm that is intended
+ dout(10) << __func__ << " !subtree root and frozen inode, waiting for unfreeze on " << inode << dendl;
+ // retake an auth_pin...
+ auth_pin(inode);
+ // and release it when the parent inode unfreezes
+ inode->add_waiter(WAIT_UNFREEZE, new C_Dir_AuthUnpin(this));
+ return;
+ }
+
+ _freeze_tree();
+ auth_unpin(this);
+ finish_waiting(WAIT_FROZEN);
+ }
+}
+
+
+
+// FREEZE DIR
+
+/**
+ * Start freezing this single dirfrag.
+ *
+ * @returns true if the dirfrag was frozen immediately; false if it was put
+ *          into STATE_FREEZINGDIR, in which case the auth pin taken here is
+ *          kept.
+ */
+bool CDir::freeze_dir()
+{
+  ceph_assert(!is_frozen());
+  ceph_assert(!is_freezing());
+
+  auth_pin(this);
+
+  if (!is_freezeable_dir(true)) {
+    // not freezeable yet: mark as freezing and report "not done"
+    state_set(STATE_FREEZINGDIR);
+    dout(10) << "freeze_dir + wait " << *this << dendl;
+    return false;
+  }
+
+  _freeze_dir();
+  auth_unpin(this);
+  return true;
+}
+
+/**
+ * Move this dirfrag from "freezing dir" to "frozen dir" and take PIN_FROZEN.
+ *
+ * is_freezeable_dir(true) is deliberately not asserted here: it does not
+ * always hold during a split, because the original fragment may have frozen a
+ * while ago and is only now being broken up.
+ */
+void CDir::_freeze_dir()
+{
+  dout(10) << __func__ << " " << *this << dendl;
+
+  state_clear(STATE_FREEZINGDIR);
+  state_set(STATE_FROZENDIR);
+  get(PIN_FROZEN);
+
+  if (!is_auth() || is_subtree_root())
+    return;
+
+  // auth and not a subtree root: keep the inode auth pinned for the duration
+  // of the freeze (released again in unfreeze_dir())
+  inode->auth_pin(this);
+}
+
+
+/**
+ * Undo freeze_dir(): either unfreeze a frozen dirfrag or cancel a dir freeze
+ * that is still in progress.  WAIT_UNFREEZE waiters are finished in both
+ * cases.
+ */
+void CDir::unfreeze_dir()
+{
+ dout(10) << __func__ << " " << *this << dendl;
+
+ if (state_test(STATE_FROZENDIR)) {
+ state_clear(STATE_FROZENDIR);
+ put(PIN_FROZEN);
+
+ // unpin (may => FREEZEABLE) FIXME: is this order good?
+ // (mirrors the inode->auth_pin(this) taken in _freeze_dir())
+ if (is_auth() && !is_subtree_root())
+ inode->auth_unpin(this);
+
+ finish_waiting(WAIT_UNFREEZE);
+ } else {
+ // the freeze never completed: WAIT_FROZEN waiters are finished with -1
+ finish_waiting(WAIT_FROZEN, -1);
+
+ // still freezing. stop.
+ ceph_assert(state_test(STATE_FREEZINGDIR));
+ state_clear(STATE_FREEZINGDIR);
+ // release the auth pin that freeze_dir() kept
+ auth_unpin(this);
+
+ finish_waiting(WAIT_UNFREEZE);
+ }
+}
+
+/**
+ * Slightly less complete than operator<<, because this is intended
+ * for identifying a directory and its state rather than for dumping
+ * debug output.
+ */
+/**
+ * Write a description of this dirfrag to the formatter.
+ *
+ * @param f     formatter to write to; must not be null
+ * @param flags bitwise OR of DUMP_* values selecting which parts to emit
+ */
+void CDir::dump(Formatter *f, int flags) const
+{
+  ceph_assert(f != NULL);
+
+  const auto wants = [flags](int bit) { return (flags & bit) != 0; };
+
+  if (wants(DUMP_PATH))
+    f->dump_stream("path") << get_path();
+  if (wants(DUMP_DIRFRAG))
+    f->dump_stream("dirfrag") << dirfrag();
+  if (wants(DUMP_SNAPID_FIRST))
+    f->dump_int("snapid_first", first);
+
+  if (wants(DUMP_VERSIONS)) {
+    f->dump_stream("projected_version") << get_projected_version();
+    f->dump_stream("version") << get_version();
+    f->dump_stream("committing_version") << get_committing_version();
+    f->dump_stream("committed_version") << get_committed_version();
+  }
+
+  if (wants(DUMP_REP))
+    f->dump_bool("is_rep", is_rep());
+
+  if (wants(DUMP_DIR_AUTH)) {
+    const auto auth = get_dir_auth();
+    if (auth == CDIR_AUTH_DEFAULT) {
+      // default authority is reported as an empty string
+      f->dump_string("dir_auth", "");
+    } else if (auth.second == CDIR_AUTH_UNKNOWN) {
+      // only the first element is printed when the second is unknown
+      f->dump_stream("dir_auth") << auth.first;
+    } else {
+      f->dump_stream("dir_auth") << auth;
+    }
+  }
+
+  if (wants(DUMP_STATES)) {
+    // emits one "state" entry for a bit if it is set on this dirfrag
+    const auto emit_state = [this, f](unsigned bit, const char *label) {
+      if (state_test(bit))
+        f->dump_string("state", label);
+    };
+
+    f->open_array_section("states");
+    MDSCacheObject::dump_states(f);
+    emit_state(CDir::STATE_COMPLETE, "complete");
+    emit_state(CDir::STATE_FREEZINGTREE, "freezingtree");
+    emit_state(CDir::STATE_FROZENTREE, "frozentree");
+    emit_state(CDir::STATE_FROZENDIR, "frozendir");
+    emit_state(CDir::STATE_FREEZINGDIR, "freezingdir");
+    emit_state(CDir::STATE_EXPORTBOUND, "exportbound");
+    emit_state(CDir::STATE_IMPORTBOUND, "importbound");
+    emit_state(CDir::STATE_BADFRAG, "badfrag");
+    f->close_section();
+  }
+
+  if (wants(DUMP_MDS_CACHE_OBJECT))
+    MDSCacheObject::dump(f);
+
+  if (wants(DUMP_ITEMS)) {
+    f->open_array_section("dentries");
+    for (auto &entry : items) {
+      CDentry *dn = entry.second;
+      f->open_object_section("dentry");
+      dn->dump(f);
+      f->close_section();
+    }
+    f->close_section();
+  }
+}
+
+/**
+ * Write this dirfrag's path, dirfrag id and its four popularity vectors
+ * (each as its own object section) to the formatter.
+ */
+void CDir::dump_load(Formatter *f)
+{
+  f->dump_stream("path") << get_path();
+  f->dump_stream("dirfrag") << dirfrag();
+
+  // wraps one load vector in an object section of the given name
+  auto emit_load = [f](const char *section, dirfrag_load_vec_t &load) {
+    f->open_object_section(section);
+    load.dump(f);
+    f->close_section();
+  };
+
+  emit_load("pop_me", pop_me);
+  emit_load("pop_nested", pop_nested);
+  emit_load("pop_auth_subtree", pop_auth_subtree);
+  emit_load("pop_auth_subtree_nested", pop_auth_subtree_nested);
+}
+
+/****** Scrub Stuff *******/
+
+/**
+ * Lazily build scrub_infop, seeding its stamps from the projected fnode.
+ * Must only be called while scrub_infop is still empty.
+ */
+void CDir::scrub_info_create() const
+{
+  ceph_assert(!scrub_infop);
+
+  // break out of const-land to set up implicit initial state
+  CDir *self = const_cast<CDir*>(this);
+  const fnode_t *pf = self->get_projected_fnode();
+
+  auto info = std::make_unique<scrub_info_t>();
+
+  // the recursive "start" and "last" stamps begin out identical
+  info->recursive_start.version = pf->recursive_scrub_version;
+  info->last_recursive.version = info->recursive_start.version;
+  info->recursive_start.time = pf->recursive_scrub_stamp;
+  info->last_recursive.time = info->recursive_start.time;
+
+  info->last_local.version = pf->localized_scrub_version;
+  info->last_local.time = pf->localized_scrub_stamp;
+
+  self->scrub_infop = std::move(info);
+}
+
+/**
+ * Set up the scrub bookkeeping for a new scrub of this dirfrag.
+ *
+ * Requires the dirfrag to be complete, a non-null header, and no scrub
+ * already in progress.  Every head (CEPH_NOSNAP) dentry with a primary
+ * projected linkage is queued, directories and non-directories separately.
+ */
+void CDir::scrub_initialize(const ScrubHeaderRefConst& header)
+{
+  dout(20) << __func__ << dendl;
+  ceph_assert(is_complete());
+  ceph_assert(header != nullptr);
+
+  // FIXME: weird implicit construction, is someone else meant
+  // to be calling scrub_info_create first?
+  scrub_info();
+  ceph_assert(scrub_infop && !scrub_infop->directory_scrubbing);
+
+  scrub_info_t &si = *scrub_infop;
+
+  si.recursive_start.version = get_projected_version();
+  si.recursive_start.time = ceph_clock_now();
+
+  // start from empty work lists
+  si.directories_to_scrub.clear();
+  si.directories_scrubbing.clear();
+  si.directories_scrubbed.clear();
+  si.others_to_scrub.clear();
+  si.others_scrubbing.clear();
+  si.others_scrubbed.clear();
+
+  for (auto &entry : items) {
+    const dentry_key_t &key = entry.first;
+    // TODO: handle snapshot scrubbing
+    if (key.snapid != CEPH_NOSNAP)
+      continue;
+
+    CDentry::linkage_t *dnl = entry.second->get_projected_linkage();
+    if (dnl->is_primary()) {
+      dentry_key_set &bucket = dnl->get_inode()->is_dir()
+        ? si.directories_to_scrub
+        : si.others_to_scrub;
+      bucket.insert(key);
+    } else if (dnl->is_remote()) {
+      // TODO: check remote linkage
+    }
+  }
+
+  si.directory_scrubbing = true;
+  si.header = header;
+}
+
+/**
+ * Finalize a scrub of this dirfrag.  All "to scrub" and "scrubbing" sets must
+ * already be empty; the "scrubbed" sets are discarded and the recursive scrub
+ * stamp recorded at start becomes the last-finished stamp.
+ */
+void CDir::scrub_finished()
+{
+  dout(20) << __func__ << dendl;
+  ceph_assert(scrub_infop && scrub_infop->directory_scrubbing);
+
+  // nothing may be left queued or in flight
+  ceph_assert(scrub_infop->directories_to_scrub.empty());
+  ceph_assert(scrub_infop->directories_scrubbing.empty());
+  ceph_assert(scrub_infop->others_to_scrub.empty());
+  ceph_assert(scrub_infop->others_scrubbing.empty());
+
+  scrub_info_t &si = *scrub_infop;
+  si.directories_scrubbed.clear();
+  si.others_scrubbed.clear();
+  si.directory_scrubbing = false;
+
+  si.last_recursive = si.recursive_start;
+  si.last_scrub_dirty = true;
+}
+
+/**
+ * Pop keys off `dns` until one resolves to a dentry that should be scrubbed.
+ *
+ * @param dns           set of dentry keys still to be handed out; consumed
+ *                      keys are erased from it
+ * @param missing_okay  if true, a key whose dentry can no longer be found is
+ *                      silently dropped; if false, that situation aborts
+ * @param cb            passed to fetch() when the dirfrag has to be re-read
+ * @param dnout         receives the dentry on success and nullptr otherwise
+ * @returns 0 with *dnout set, EAGAIN after starting a fetch (the key stays in
+ *          `dns` for the retry), or ENOENT once `dns` is exhausted.  Note the
+ *          errno values are returned as positive numbers.
+ */
+int CDir::_next_dentry_on_set(dentry_key_set &dns, bool missing_okay,
+                              MDSContext *cb, CDentry **dnout)
+{
+  // Never leave the out-parameter indeterminate: the EAGAIN path below
+  // returns early, and scrub_dentry_next() streams *dnout afterwards.
+  *dnout = nullptr;
+
+  while (!dns.empty()) {
+    // `auto` rather than set<dentry_key_t>::iterator: dentry_key_set uses a
+    // mempool allocator, so the two iterator types need not be the same.
+    auto front = dns.begin();
+    const dentry_key_t dnkey = *front;
+    CDentry *dn = lookup(dnkey.name);
+    if (!dn) {
+      if (!is_complete() &&
+          (!has_bloom() || is_in_bloom(dnkey.name))) {
+        // need to re-read this dirfrag
+        fetch(cb);
+        return EAGAIN;
+      }
+      // okay, we lost it
+      if (missing_okay) {
+        dout(15) << " we no longer have directory dentry "
+                 << dnkey.name << ", assuming it got renamed" << dendl;
+        dns.erase(front);
+        continue;
+      } else {
+        dout(5) << " we lost dentry " << dnkey.name
+                << ", bailing out because that's impossible!" << dendl;
+        ceph_abort();
+      }
+    }
+    // okay, we got a dentry
+    dns.erase(front);
+
+    if (dn->get_projected_version() < scrub_infop->last_recursive.version &&
+        !(scrub_infop->header->get_force())) {
+      dout(15) << " skip dentry " << dnkey.name
+               << ", no change since last scrub" << dendl;
+      continue;
+    }
+
+    if (!dn->get_linkage()->is_primary()) {
+      dout(15) << " skip dentry " << dnkey.name
+               << ", no longer primary" << dendl;
+      continue;
+    }
+
+    *dnout = dn;
+    return 0;
+  }
+
+  // *dnout is still nullptr here
+  return ENOENT;
+}
+
+/**
+ * Hand out the next dentry to scrub: directory dentries first, then all
+ * others.  A dentry that is handed out is recorded in the matching
+ * "scrubbing" set.
+ *
+ * @param cb    forwarded to _next_dentry_on_set(), which passes it to fetch()
+ *              when it returns EAGAIN
+ * @param dnout receives the dentry to scrub, or nullptr when none is returned
+ * @returns 0, EAGAIN or ENOENT (positive errno values), as produced by
+ *          _next_dentry_on_set()
+ */
+int CDir::scrub_dentry_next(MDSContext *cb, CDentry **dnout)
+{
+  dout(20) << __func__ << dendl;
+  ceph_assert(scrub_infop && scrub_infop->directory_scrubbing);
+
+  // Give the out-parameter a defined value up front: it is streamed in the
+  // final debug line on every path, including EAGAIN, where no dentry is
+  // handed back.
+  *dnout = nullptr;
+
+  dout(20) << "trying to scrub directories underneath us" << dendl;
+  int rval = _next_dentry_on_set(scrub_infop->directories_to_scrub, true,
+                                 cb, dnout);
+  if (rval == 0) {
+    dout(20) << __func__ << " inserted to directories scrubbing: "
+             << *dnout << dendl;
+    scrub_infop->directories_scrubbing.insert((*dnout)->key());
+  } else if (rval == EAGAIN) {
+    // we don't need to do anything else
+  } else { // we emptied out the directory scrub set
+    ceph_assert(rval == ENOENT);
+    dout(20) << "no directories left, moving on to other kinds of dentries"
+             << dendl;
+
+    // unlike directories, a missing non-directory dentry is not tolerated
+    rval = _next_dentry_on_set(scrub_infop->others_to_scrub, false, cb, dnout);
+    if (rval == 0) {
+      dout(20) << __func__ << " inserted to others scrubbing: "
+               << *dnout << dendl;
+      scrub_infop->others_scrubbing.insert((*dnout)->key());
+    }
+  }
+  dout(20) << " returning " << rval << " with dn=" << *dnout << dendl;
+  return rval;
+}
+
+/**
+ * Append to `out_dentries` every dentry currently recorded as being
+ * scrubbed: directory dentries first, then the others.  Each recorded key
+ * must still resolve to a dentry.
+ */
+void CDir::scrub_dentries_scrubbing(list<CDentry*> *out_dentries)
+{
+  dout(20) << __func__ << dendl;
+  ceph_assert(scrub_infop && scrub_infop->directory_scrubbing);
+
+  for (const auto &key : scrub_infop->directories_scrubbing) {
+    CDentry *d = lookup(key.name, key.snapid);
+    ceph_assert(d);
+    out_dentries->push_back(d);
+  }
+
+  for (const auto &key : scrub_infop->others_scrubbing) {
+    CDentry *d = lookup(key.name, key.snapid);
+    ceph_assert(d);
+    out_dentries->push_back(d);
+  }
+}
+
+/**
+ * Record that `dn` has been scrubbed: its key moves from the matching
+ * "scrubbing" set to the matching "scrubbed" set.  The key must be present
+ * in one of the two "scrubbing" sets.
+ */
+void CDir::scrub_dentry_finished(CDentry *dn)
+{
+  dout(20) << __func__ << " on dn " << *dn << dendl;
+  ceph_assert(scrub_infop && scrub_infop->directory_scrubbing);
+
+  const dentry_key_t dn_key = dn->key();
+
+  // directory dentry?
+  if (scrub_infop->directories_scrubbing.erase(dn_key)) {
+    scrub_infop->directories_scrubbed.insert(dn_key);
+    return;
+  }
+
+  // otherwise it has to be one of the non-directory dentries
+  ceph_assert(scrub_infop->others_scrubbing.count(dn_key));
+  scrub_infop->others_scrubbing.erase(dn_key);
+  scrub_infop->others_scrubbed.insert(dn_key);
+}
+
+/**
+ * Drop scrub_infop when it carries nothing worth keeping: no scrub in
+ * progress, no pending local scrub, nothing dirty, no pending error and no
+ * dirty scrub stamps.
+ */
+void CDir::scrub_maybe_delete_info()
+{
+  if (!scrub_infop)
+    return;
+
+  const bool still_needed =
+    scrub_infop->directory_scrubbing ||
+    scrub_infop->need_scrub_local ||
+    scrub_infop->last_scrub_dirty ||
+    scrub_infop->pending_scrub_error ||
+    !scrub_infop->dirty_scrub_stamps.empty();
+
+  if (!still_needed)
+    scrub_infop.reset();
+}
+
+/**
+ * Run a local scrub of this (complete) dirfrag by checking its rstats.
+ *
+ * On success the last-local stamp is refreshed and any pending error is
+ * cleared.  On failure the error is flagged and, if the scrub header asks for
+ * repair, the dirfrag stats are handed to the cache for repair.
+ *
+ * @returns the result of check_rstats(true)
+ */
+bool CDir::scrub_local()
+{
+  ceph_assert(is_complete());
+  const bool rstats_ok = check_rstats(true);
+
+  // make sure scrub_infop exists
+  scrub_info();
+
+  if (!rstats_ok) {
+    scrub_infop->pending_scrub_error = true;
+    // presumably the header was installed by scrub_initialize() -- verify against callers
+    if (scrub_infop->header->get_repair())
+      cache->repair_dirfrag_stats(this);
+    return false;
+  }
+
+  scrub_infop->last_local.time = ceph_clock_now();
+  scrub_infop->last_local.version = get_projected_version();
+  scrub_infop->pending_scrub_error = false;
+  scrub_infop->last_scrub_dirty = true;
+  return true;
+}
+
+/**
+ * @returns the path string produced by the inode's make_path_string()
+ */
+std::string CDir::get_path() const
+{
+  std::string result;
+  get_inode()->make_path_string(result, true);
+  return result;
+}
+
+/**
+ * Decide whether this dirfrag has grown past the fast-split threshold
+ * (mds_bal_split_size * mds_bal_fragment_fast_factor).
+ *
+ * Two cheap checks settle most cases; only when they are inconclusive are the
+ * dentries with a non-null projected linkage actually counted.
+ */
+bool CDir::should_split_fast() const
+{
+  // Max size a fragment can be before trigger fast splitting
+  int fast_limit = g_conf()->mds_bal_split_size * g_conf()->mds_bal_fragment_fast_factor;
+
+  const int64_t accounted = get_frag_size();
+
+  // Even counting every null head dentry we stay within the limit:
+  // definitely not over it.
+  if (accounted + get_num_head_null() <= fast_limit)
+    return false;
+
+  // The accounted size alone already exceeds the limit: definitely over it.
+  if (accounted > fast_limit)
+    return true;
+
+  // Inconclusive: count the dentries whose projected linkage is not null.
+  int64_t live = 0;
+  for (const auto &entry : items) {
+    const CDentry *dn = entry.second;
+    if (dn->get_projected_linkage()->is_null())
+      continue;
+    ++live;
+  }
+
+  return live > fast_limit;
+}
+
+MEMPOOL_DEFINE_OBJECT_FACTORY(CDir, co_dir, mds_co);
diff --git a/src/mds/CDir.h b/src/mds/CDir.h
new file mode 100644
index 00000000..23c94c8b
--- /dev/null
+++ b/src/mds/CDir.h
@@ -0,0 +1,782 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+
+
+#ifndef CEPH_CDIR_H
+#define CEPH_CDIR_H
+
+#include <iosfwd>
+#include <list>
+#include <map>
+#include <set>
+#include <string>
+#include <string_view>
+
+#include "common/bloom_filter.hpp"
+#include "common/config.h"
+#include "include/buffer_fwd.h"
+#include "include/counter.h"
+#include "include/types.h"
+
+#include "CInode.h"
+#include "MDSCacheObject.h"
+#include "MDSContext.h"
+#include "cephfs_features.h"
+#include "SessionMap.h"
+#include "messages/MClientReply.h"
+
+class CDentry;
+class MDCache;
+
+struct ObjectOperation;
+
+ostream& operator<<(ostream& out, const class CDir& dir);
+class CDir : public MDSCacheObject, public Counter<CDir> {
+ using time = ceph::coarse_mono_time;
+ using clock = ceph::coarse_mono_clock;
+
+ friend ostream& operator<<(ostream& out, const class CDir& dir);
+
+public:
+ MEMPOOL_CLASS_HELPERS();
+ // -- pins --
+ static const int PIN_DNWAITER = 1;
+ static const int PIN_INOWAITER = 2;
+ static const int PIN_CHILD = 3;
+ static const int PIN_FROZEN = 4;
+ static const int PIN_SUBTREE = 5;
+ static const int PIN_IMPORTING = 7;
+ static const int PIN_IMPORTBOUND = 9;
+ static const int PIN_EXPORTBOUND = 10;
+ static const int PIN_STICKY = 11;
+ static const int PIN_SUBTREETEMP = 12; // used by MDCache::trim_non_auth()
+ /// Name of a pin: CDir-specific pins are named here, every other value is
+ /// delegated to generic_pin_name().
+ std::string_view pin_name(int p) const override {
+   if (p == PIN_DNWAITER) return "dnwaiter";
+   if (p == PIN_INOWAITER) return "inowaiter";
+   if (p == PIN_CHILD) return "child";
+   if (p == PIN_FROZEN) return "frozen";
+   if (p == PIN_SUBTREE) return "subtree";
+   if (p == PIN_IMPORTING) return "importing";
+   if (p == PIN_IMPORTBOUND) return "importbound";
+   if (p == PIN_EXPORTBOUND) return "exportbound";
+   if (p == PIN_STICKY) return "sticky";
+   if (p == PIN_SUBTREETEMP) return "subtreetemp";
+   return generic_pin_name(p);
+ }
+
+ // -- state --
+ // CDir state bits, one bit per flag (bits 0 through 19).
+ static const unsigned STATE_COMPLETE = (1<< 0); // the complete contents are in cache
+ static const unsigned STATE_FROZENTREE = (1<< 1); // root of tree (bounded by exports)
+ static const unsigned STATE_FREEZINGTREE = (1<< 2); // in process of freezing
+ static const unsigned STATE_FROZENDIR = (1<< 3);
+ static const unsigned STATE_FREEZINGDIR = (1<< 4);
+ static const unsigned STATE_COMMITTING = (1<< 5); // mid-commit
+ static const unsigned STATE_FETCHING = (1<< 6); // currently fetching
+ static const unsigned STATE_CREATING = (1<< 7);
+ static const unsigned STATE_IMPORTBOUND = (1<< 8);
+ static const unsigned STATE_EXPORTBOUND = (1<< 9);
+ static const unsigned STATE_EXPORTING = (1<<10);
+ static const unsigned STATE_IMPORTING = (1<<11);
+ static const unsigned STATE_FRAGMENTING = (1<<12);
+ static const unsigned STATE_STICKY = (1<<13); // sticky pin due to inode stickydirs
+ static const unsigned STATE_DNPINNEDFRAG = (1<<14); // dir is refragmenting
+ static const unsigned STATE_ASSIMRSTAT = (1<<15); // assimilating inode->frag rstats
+ static const unsigned STATE_DIRTYDFT = (1<<16); // dirty dirfragtree
+ static const unsigned STATE_BADFRAG = (1<<17); // bad dirfrag
+ static const unsigned STATE_TRACKEDBYOFT = (1<<18); // tracked by open file table
+ static const unsigned STATE_AUXSUBTREE = (1<<19); // no subtree merge
+
+ // common states
+ static const unsigned STATE_CLEAN = 0;
+
+ // these state bits are preserved by an import/export
+ // ...except if the directory is hashed, in which case none of them are!
+ static const unsigned MASK_STATE_EXPORTED =
+ (STATE_COMPLETE|STATE_DIRTY|STATE_DIRTYDFT|STATE_BADFRAG);
+ static const unsigned MASK_STATE_IMPORT_KEPT =
+ (
+ STATE_IMPORTING |
+ STATE_IMPORTBOUND |
+ STATE_EXPORTBOUND |
+ STATE_FROZENTREE |
+ STATE_STICKY |
+ STATE_TRACKEDBYOFT);
+ static const unsigned MASK_STATE_EXPORT_KEPT =
+ (STATE_EXPORTING |
+ STATE_IMPORTBOUND |
+ STATE_EXPORTBOUND |
+ STATE_FROZENTREE |
+ STATE_FROZENDIR |
+ STATE_STICKY |
+ STATE_TRACKEDBYOFT);
+ static const unsigned MASK_STATE_FRAGMENT_KEPT =
+ (STATE_DIRTY |
+ STATE_EXPORTBOUND |
+ STATE_IMPORTBOUND |
+ STATE_AUXSUBTREE |
+ STATE_REJOINUNDEF);
+
+ // -- rep spec --
+ static const int REP_NONE = 0;
+ static const int REP_ALL = 1;
+ static const int REP_LIST = 2;
+
+
+ static const unsigned EXPORT_NONCE = 1;
+
+
+ // -- wait masks --
+ static const uint64_t WAIT_DENTRY = (1<<0); // wait for item to be in cache
+ static const uint64_t WAIT_COMPLETE = (1<<1); // wait for complete dir contents
+ static const uint64_t WAIT_FROZEN = (1<<2); // auth pins removed
+ static const uint64_t WAIT_CREATED = (1<<3); // new dirfrag is logged
+
+ static const int WAIT_DNLOCK_OFFSET = 4;
+
+ static const uint64_t WAIT_ANY_MASK = (uint64_t)(-1);
+ static const uint64_t WAIT_ATSUBTREEROOT = (WAIT_SINGLEAUTH);
+
+ // -- dump flags --
+ static const int DUMP_PATH = (1 << 0);
+ static const int DUMP_DIRFRAG = (1 << 1);
+ static const int DUMP_SNAPID_FIRST = (1 << 2);
+ static const int DUMP_VERSIONS = (1 << 3);
+ static const int DUMP_REP = (1 << 4);
+ static const int DUMP_DIR_AUTH = (1 << 5);
+ static const int DUMP_STATES = (1 << 6);
+ static const int DUMP_MDS_CACHE_OBJECT = (1 << 7);
+ static const int DUMP_ITEMS = (1 << 8);
+ static const int DUMP_ALL = (-1);
+ static const int DUMP_DEFAULT = DUMP_ALL & (~DUMP_ITEMS);
+
+ public:
+ // context
+ MDCache *cache;
+
+ CInode *inode; // my inode
+ frag_t frag; // my frag
+
+ /// Orders dirfrags by their dirfrag id; `r` is treated as a CDir.
+ bool is_lt(const MDSCacheObject *r) const override {
+   const CDir *other = static_cast<const CDir*>(r);
+   return dirfrag() < other->dirfrag();
+ }
+
+ fnode_t fnode;
+ snapid_t first;
+ mempool::mds_co::compact_map<snapid_t,old_rstat_t> dirty_old_rstat; // [value.first,key]
+
+ // my inodes with dirty rstat data
+ elist<CInode*> dirty_rstat_inodes;
+
+ void resync_accounted_fragstat();
+ void resync_accounted_rstat();
+ void assimilate_dirty_rstat_inodes();
+ void assimilate_dirty_rstat_inodes_finish(MutationRef& mut, EMetaBlob *blob);
+
+ /// Flag this dirfrag as exporting and bump the inode's exporting-dirs count.
+ void mark_exporting() {
+   state_set(STATE_EXPORTING);
+   ++inode->num_exporting_dirs;
+ }
+ /// Clear the exporting flag and lower the inode's exporting-dirs count.
+ void clear_exporting() {
+   state_clear(STATE_EXPORTING);
+   --inode->num_exporting_dirs;
+ }
+
+protected:
+ version_t projected_version;
+ mempool::mds_co::list<fnode_t> projected_fnode;
+
+public:
+ elist<CDentry*> dirty_dentries;
+ elist<CDir*>::item item_dirty, item_new;
+
+public:
+ version_t get_version() const { return fnode.version; }
+ /// Set both the fnode version and the projected version to `v`; only valid
+ /// while no projected fnode is outstanding.
+ void set_version(version_t v) {
+   ceph_assert(projected_fnode.empty());
+   fnode.version = v;
+   projected_version = v;
+ }
+ version_t get_projected_version() const { return projected_version; }
+
+ /// Most recently projected fnode, or the current fnode when nothing is projected.
+ const fnode_t *get_projected_fnode() const {
+   return projected_fnode.empty() ? &fnode : &projected_fnode.back();
+ }
+
+ /// Mutable variant: most recently projected fnode, or the current fnode
+ /// when nothing is projected.
+ fnode_t *get_projected_fnode() {
+   return projected_fnode.empty() ? &fnode : &projected_fnode.back();
+ }
+ fnode_t *project_fnode();
+
+ void pop_and_dirty_projected_fnode(LogSegment *ls);
+ bool is_projected() const { return !projected_fnode.empty(); }
+ version_t pre_dirty(version_t min=0);
+ void _mark_dirty(LogSegment *ls);
+ /// Set STATE_DIRTY and take PIN_DIRTY, unless the dirfrag is already dirty.
+ void _set_dirty_flag() {
+   if (state_test(STATE_DIRTY))
+     return;
+   state_set(STATE_DIRTY);
+   get(PIN_DIRTY);
+ }
+ void mark_dirty(version_t pv, LogSegment *ls);
+ void mark_clean();
+
+ bool is_new() { return item_new.is_on_list(); }
+ void mark_new(LogSegment *ls);
+
+ bool is_bad() { return state_test(STATE_BADFRAG); }
+private:
+ void log_mark_dirty();
+
+public:
+ typedef mempool::mds_co::map<dentry_key_t, CDentry*> dentry_key_map;
+ typedef mempool::mds_co::set<dentry_key_t> dentry_key_set;
+
+ /**
+  * Scrub bookkeeping for one dirfrag: scrub stamps, status flags, and the
+  * sets of child dentry keys in each stage of scrubbing.
+  */
+ class scrub_info_t {
+ public:
+   /// inodes we contain with dirty scrub stamps
+   dentry_key_map dirty_scrub_stamps; // TODO: make use of this!
+
+   /// A (version, time) pair recording one scrub milestone.
+   struct scrub_stamps {
+     version_t version;
+     utime_t time;
+     scrub_stamps() : version(0) {}
+     // Copying is plain member-wise, so the implicitly generated copy
+     // operations are used.  The previous hand-written operator= returned
+     // void and, being user-provided, deprecated the implicit copy
+     // constructor.
+   };
+
+   scrub_stamps recursive_start; // when we last started a recursive scrub
+   scrub_stamps last_recursive; // when we last finished a recursive scrub
+   scrub_stamps last_local; // when we last did a local scrub
+
+   bool directory_scrubbing; /// safety check
+   bool need_scrub_local;
+   bool last_scrub_dirty; /// is scrub info dirty or is it flushed to fnode?
+   bool pending_scrub_error;
+
+   /// these are lists of children in each stage of scrubbing
+   dentry_key_set directories_to_scrub;
+   dentry_key_set directories_scrubbing;
+   dentry_key_set directories_scrubbed;
+   dentry_key_set others_to_scrub;
+   dentry_key_set others_scrubbing;
+   dentry_key_set others_scrubbed;
+
+   ScrubHeaderRefConst header;
+
+   /// All status flags start out false.
+   scrub_info_t() :
+     directory_scrubbing(false),
+     need_scrub_local(false),
+     last_scrub_dirty(false),
+     pending_scrub_error(false) {}
+ };
+ /**
+ * Call to start this CDir on a new scrub.
+ * @pre It is not currently scrubbing
+ * @pre The CDir is marked complete.
+ * @post It has set up its internal scrubbing state.
+ */
+ void scrub_initialize(const ScrubHeaderRefConst& header);
+ /**
+ * Get the next dentry to scrub. Gives you a CDentry* and its meaning. This
+ * function will give you all directory-representing dentries before any
+ * others.
+ * 0: success, you should scrub this CDentry right now
+ * EAGAIN: is currently fetching the next CDentry into memory for you.
+ * It will activate your callback when done; try again when it does!
+ * ENOENT: there are no remaining dentries to scrub
+ * <0: There was an unexpected error
+ *
+ * @param cb An MDSContext which will be activated only if
+ * we return EAGAIN via rcode, or else ignored
+ * @param dnout CDentry * which you should next scrub, or NULL
+ * @returns a value as described above
+ */
+ int scrub_dentry_next(MDSContext *cb, CDentry **dnout);
+ /**
+ * Get the currently scrubbing dentries. When returned, the passed-in
+ * list will be filled with all CDentry * which have been returned
+ * from scrub_dentry_next() but not sent back via scrub_dentry_finished().
+ */
+ void scrub_dentries_scrubbing(std::list<CDentry*> *out_dentries);
+ /**
+ * Report to the CDir that a CDentry has been scrubbed. Call this
+ * for every CDentry returned from scrub_dentry_next().
+ * @param dn The CDentry which has been scrubbed.
+ */
+ void scrub_dentry_finished(CDentry *dn);
+ /**
+ * Call this once all CDentries have been scrubbed, according to
+ * scrub_dentry_next's listing. It finalizes the scrub statistics.
+ */
+ void scrub_finished();
+ /**
+ * Tell the CDir to do a local scrub of itself.
+ * @pre The CDir is_complete().
+ * @returns true if the rstats and directory contents match, false otherwise.
+ */
+ bool scrub_local();
+private:
+ /**
+ * Create a scrub_info_t struct for the scrub_infop pointer.
+ */
+ void scrub_info_create() const;
+ /**
+ * Delete the scrub_infop if it's not got any useful data.
+ */
+ void scrub_maybe_delete_info();
+ /**
+ * Check the given set (presumably one of those in scrub_info_t) for the
+ * next key to scrub and look it up (or fail!).
+ */
+ int _next_dentry_on_set(dentry_key_set &dns, bool missing_okay,
+ MDSContext *cb, CDentry **dnout);
+
+
+protected:
+ std::unique_ptr<scrub_info_t> scrub_infop; // FIXME not in mempool
+
+ // contents of this directory
+ dentry_key_map items; // non-null AND null
+ unsigned num_head_items;
+ unsigned num_head_null;
+ unsigned num_snap_items;
+ unsigned num_snap_null;
+
+ int num_dirty;
+
+ int num_inodes_with_caps = 0;
+
+ // state
+ version_t committing_version;
+ version_t committed_version;
+
+ mempool::mds_co::compact_set<mempool::mds_co::string> stale_items;
+
+ // lock nesting, freeze
+ static int num_frozen_trees;
+ static int num_freezing_trees;
+
+ int dir_auth_pins;
+
+ // cache control (defined for authority; hints for replicas)
+ __s32 dir_rep;
+ mempool::mds_co::compact_set<__s32> dir_rep_by; // if dir_rep == REP_LIST
+
+ // popularity
+ dirfrag_load_vec_t pop_me;
+ dirfrag_load_vec_t pop_nested;
+ dirfrag_load_vec_t pop_auth_subtree;
+ dirfrag_load_vec_t pop_auth_subtree_nested;
+
+ time last_popularity_sample = clock::zero();
+
+ load_spread_t pop_spread;
+
+ elist<CInode*> pop_lru_subdirs;
+
+ // and to provide density
+ int num_dentries_nested;
+ int num_dentries_auth_subtree;
+ int num_dentries_auth_subtree_nested;
+
+
+ // friends
+ friend class Migrator;
+ friend class CInode;
+ friend class MDCache;
+ friend class MDiscover;
+ friend class MDBalancer;
+
+ friend class CDirDiscover;
+ friend class CDirExport;
+ friend class C_IO_Dir_TMAP_Fetched;
+ friend class C_IO_Dir_OMAP_Fetched;
+ friend class C_IO_Dir_OMAP_FetchedMore;
+ friend class C_IO_Dir_Committed;
+
+ std::unique_ptr<bloom_filter> bloom; // XXX not part of mempool::mds_co
+ /* If you set up the bloom filter, you must keep it accurate!
+ * It's deleted when you mark_complete() and is deliberately not serialized.*/
+
+ public:
+ CDir(CInode *in, frag_t fg, MDCache *mdcache, bool auth);
+
+ /// Scrub bookkeeping for this dirfrag, created on first use.
+ const scrub_info_t *scrub_info() const {
+   if (scrub_infop == nullptr)
+     scrub_info_create();
+   return scrub_infop.get();
+ }
+
+
+ // -- accessors --
+ inodeno_t ino() const { return inode->ino(); } // deprecate me?
+ frag_t get_frag() const { return frag; }
+ dirfrag_t dirfrag() const { return dirfrag_t(inode->ino(), frag); }
+
+ CInode *get_inode() { return inode; }
+ const CInode *get_inode() const { return inode; }
+ CDir *get_parent_dir() { return inode->get_parent_dir(); }
+
+ dentry_key_map::iterator begin() { return items.begin(); }
+ dentry_key_map::iterator end() { return items.end(); }
+ dentry_key_map::iterator lower_bound(dentry_key_t key) { return items.lower_bound(key); }
+
+ unsigned get_num_head_items() const { return num_head_items; }
+ unsigned get_num_head_null() const { return num_head_null; }
+ unsigned get_num_snap_items() const { return num_snap_items; }
+ unsigned get_num_snap_null() const { return num_snap_null; }
+ unsigned get_num_any() const { return num_head_items + num_head_null + num_snap_items + num_snap_null; }
+
+ bool check_rstats(bool scrub=false);
+
+ void inc_num_dirty() { num_dirty++; }
+ /// Decrement the dirty counter; it must be positive beforehand.
+ void dec_num_dirty() {
+   ceph_assert(num_dirty > 0);
+   --num_dirty;
+ }
+ int get_num_dirty() const {
+ return num_dirty;
+ }
+
+ void adjust_num_inodes_with_caps(int d);
+
+ /// Size of this fragment according to the projected fnode's fragstat.
+ int64_t get_frag_size() const {
+   const fnode_t *pf = get_projected_fnode();
+   return pf->fragstat.size();
+ }
+
+ // -- dentries and inodes --
+ public:
+ CDentry* lookup_exact_snap(std::string_view dname, snapid_t last);
+ CDentry* lookup(std::string_view n, snapid_t snap=CEPH_NOSNAP);
+
+ CDentry* add_null_dentry(std::string_view dname,
+ snapid_t first=2, snapid_t last=CEPH_NOSNAP);
+ CDentry* add_primary_dentry(std::string_view dname, CInode *in,
+ snapid_t first=2, snapid_t last=CEPH_NOSNAP);
+ CDentry* add_remote_dentry(std::string_view dname, inodeno_t ino, unsigned char d_type,
+ snapid_t first=2, snapid_t last=CEPH_NOSNAP);
+ void remove_dentry( CDentry *dn ); // delete dentry
+ void link_remote_inode( CDentry *dn, inodeno_t ino, unsigned char d_type);
+ void link_remote_inode( CDentry *dn, CInode *in );
+ void link_primary_inode( CDentry *dn, CInode *in );
+ void unlink_inode(CDentry *dn, bool adjust_lru=true);
+ void try_remove_unlinked_dn(CDentry *dn);
+
+ void add_to_bloom(CDentry *dn);
+ bool is_in_bloom(std::string_view name);
+ // True when a bloom filter is currently allocated.
+ bool has_bloom() {
+   if (bloom)
+     return true;
+   return false;
+ }
+ // Drop the bloom filter, if any.
+ void remove_bloom() { bloom.reset(); }
+private:
+ void link_inode_work( CDentry *dn, CInode *in );
+ void unlink_inode_work( CDentry *dn );
+ void remove_null_dentries();
+ void purge_stale_snap_data(const std::set<snapid_t>& snaps);
+public:
+ void try_remove_dentries_for_stray();
+ bool try_trim_snap_dentry(CDentry *dn, const std::set<snapid_t>& snaps);
+
+
+public:
+ void split(int bits, std::list<CDir*>& subs, MDSContext::vec& waiters, bool replay);
+ void merge(std::list<CDir*>& subs, MDSContext::vec& waiters, bool replay);
+
+ // Fragment is larger than the mds_bal_split_size setting.
+ bool should_split() const {
+   const int frag_size = (int)get_frag_size();
+   return frag_size > g_conf()->mds_bal_split_size;
+ }
+ bool should_split_fast() const;
+ // Fragment is smaller than the mds_bal_merge_size setting.
+ bool should_merge() const {
+   const int frag_size = (int)get_frag_size();
+   return frag_size < g_conf()->mds_bal_merge_size;
+ }
+
+private:
+ void prepare_new_fragment(bool replay);
+ void prepare_old_fragment(map<string_snap_t, MDSContext::vec >& dentry_waiters, bool replay);
+ void steal_dentry(CDentry *dn); // from another dir. used by merge/split.
+ void finish_old_fragment(MDSContext::vec& waiters, bool replay);
+ void init_fragment_pins();
+
+
+ // -- authority --
+ /*
+ * normal: <parent,unknown> !subtree_root
+ * delegation: <mds,unknown> subtree_root
+ * ambiguous: <mds1,mds2> subtree_root
+ * <parent,mds2> subtree_root
+ */
+ mds_authority_t dir_auth;
+
+ std::string get_path() const;
+
+ public:
+ mds_authority_t authority() const override;
+ // Raw dir_auth pair as stored on this dirfrag.
+ mds_authority_t get_dir_auth() const {
+   return dir_auth;
+ }
+ void set_dir_auth(const mds_authority_t &a);
+ // Convenience overload: a single rank, with the second slot left as
+ // CDIR_AUTH_UNKNOWN.
+ void set_dir_auth(mds_rank_t a) {
+   const mds_authority_t single(a, CDIR_AUTH_UNKNOWN);
+   set_dir_auth(single);
+ }
+ // Ambiguous means the second slot of dir_auth names an MDS as well.
+ bool is_ambiguous_dir_auth() const {
+   return dir_auth.second != CDIR_AUTH_UNKNOWN;
+ }
+ // Auth here, and unambiguously so.
+ bool is_full_dir_auth() const {
+   if (!is_auth())
+     return false;
+   return !is_ambiguous_dir_auth();
+ }
+ // Not auth here, and unambiguously so.
+ bool is_full_dir_nonauth() const {
+   if (is_auth())
+     return false;
+   return !is_ambiguous_dir_auth();
+ }
+
+ // Any dir_auth other than CDIR_AUTH_DEFAULT marks a subtree root.
+ bool is_subtree_root() const {
+   return dir_auth != CDIR_AUTH_DEFAULT;
+ }
+
+ bool contains(CDir *x); // true if we are x or an ancestor of x
+
+
+ // for giving to clients
+ // Fill `ls` with the replica ranks of this dirfrag; when there is at least
+ // one replica, `auth` is added too.  Does nothing on a non-auth dirfrag.
+ void get_dist_spec(std::set<mds_rank_t>& ls, mds_rank_t auth) {
+   if (!is_auth())
+     return;
+   list_replicas(ls);
+   if (ls.empty())
+     return;
+   ls.insert(auth);
+ }
+
+ static void encode_dirstat(bufferlist& bl, const session_info_t& info, const DirStat& ds);
+
+ void _encode_base(bufferlist& bl) { // append first, fnode, dir_rep and dir_rep_by to bl, in that order
+ encode(first, bl);
+ encode(fnode, bl);
+ encode(dir_rep, bl);
+ encode(dir_rep_by, bl);
+ }
+ void _decode_base(bufferlist::const_iterator& p) { // read the same four fields back, in the order _encode_base wrote them
+ decode(first, p);
+ decode(fnode, p);
+ decode(dir_rep, p);
+ decode(dir_rep_by, p);
+ }
+ void encode_replica(mds_rank_t who, bufferlist& bl) { // register `who` as a replica and encode what it needs: nonce first, then the base fields
+ __u32 nonce = add_replica(who); // nonce returned by add_replica() for this rank
+ encode(nonce, bl);
+ _encode_base(bl);
+ }
+ void decode_replica(bufferlist::const_iterator& p) { // inverse of encode_replica: nonce first, then the base fields
+ __u32 nonce;
+ decode(nonce, p);
+ replica_nonce = nonce; // remember the nonce that was encoded for us
+ _decode_base(p);
+ }
+
+
+
+ // -- state --
+ // Each predicate tests one bit of `state`.
+ bool is_complete() {
+   return state & STATE_COMPLETE;
+ }
+ bool is_exporting() {
+   return state & STATE_EXPORTING;
+ }
+ bool is_importing() {
+   return state & STATE_IMPORTING;
+ }
+ bool is_dirty_dft() {
+   return state & STATE_DIRTYDFT;
+ }
+
+ // Current dir_rep value.
+ int get_dir_rep() const {
+   return dir_rep;
+ }
+ // True for every dir_rep value except REP_NONE.
+ bool is_rep() const {
+   return dir_rep != REP_NONE;
+ }
+
+ // -- fetch --
+ // Object name derived from this dirfrag's inode number and fragment.
+ object_t get_ondisk_object()
+ {
+   return file_object_t(ino(), frag);
+ }
+ void fetch(MDSContext *c, bool ignore_authpinnability=false);
+ void fetch(MDSContext *c, std::string_view want_dn, bool ignore_authpinnability=false);
+ void fetch(MDSContext *c, const std::set<dentry_key_t>& keys);
+protected:
+ mempool::mds_co::compact_set<mempool::mds_co::string> wanted_items;
+
+ void _omap_fetch(MDSContext *fin, const std::set<dentry_key_t>& keys);
+ void _omap_fetch_more(
+ bufferlist& hdrbl, std::map<std::string, bufferlist>& omap,
+ MDSContext *fin);
+ CDentry *_load_dentry(
+ std::string_view key,
+ std::string_view dname,
+ snapid_t last,
+ bufferlist &bl,
+ int pos,
+ const std::set<snapid_t> *snaps,
+ bool *force_dirty);
+
+ /**
+ * Go bad due to a damaged dentry (register with damagetable and go BADFRAG)
+ */
+ void go_bad_dentry(snapid_t last, std::string_view dname);
+
+ /**
+ * Go bad due to a damaged header (register with damagetable and go BADFRAG)
+ */
+ void go_bad(bool complete);
+
+ void _omap_fetched(bufferlist& hdrbl, std::map<std::string, bufferlist>& omap,
+ bool complete, int r);
+
+ // -- commit --
+ mempool::mds_co::compact_map<version_t, MDSContext::vec_alloc<mempool::mds_co::pool_allocator> > waiting_for_commit;
+ void _commit(version_t want, int op_prio);
+ void _omap_commit(int op_prio);
+ void _encode_dentry(CDentry *dn, bufferlist& bl, const std::set<snapid_t> *snaps);
+ void _committed(int r, version_t v);
+public:
+#if 0 // unused?
+ void wait_for_commit(Context *c, version_t v=0);
+#endif
+ void commit_to(version_t want);
+ void commit(version_t want, MDSContext *c,
+ bool ignore_authpinnability=false, int op_prio=-1);
+
+ // -- dirtyness --
+ // Accessors for the committing / committed version fields.
+ version_t get_committing_version() const {
+   return committing_version;
+ }
+ version_t get_committed_version() const {
+   return committed_version;
+ }
+ void set_committed_version(version_t v) {
+   committed_version = v;
+ }
+
+ void mark_complete();
+
+
+ // -- reference counting --
+ void first_get() override;
+ void last_put() override;
+
+ // -- waiters --
+protected:
+ mempool::mds_co::compact_map< string_snap_t, MDSContext::vec_alloc<mempool::mds_co::pool_allocator> > waiting_on_dentry; // FIXME string_snap_t not in mempool
+
+public:
+ // True when waiting_on_dentry has an entry for exactly (dname, snap).
+ bool is_waiting_for_dentry(std::string_view dname, snapid_t snap) {
+   const string_snap_t key(dname, snap);
+   return waiting_on_dentry.count(key) != 0;
+ }
+ void add_dentry_waiter(std::string_view dentry, snapid_t snap, MDSContext *c);
+ void take_dentry_waiting(std::string_view dentry, snapid_t first, snapid_t last, MDSContext::vec& ls);
+ void take_sub_waiting(MDSContext::vec& ls); // dentry or ino
+
+ void add_waiter(uint64_t mask, MDSContext *c) override;
+ void take_waiting(uint64_t mask, MDSContext::vec& ls) override; // may include dentry waiters
+ void finish_waiting(uint64_t mask, int result = 0); // ditto
+
+
+ // -- import/export --
+ void encode_export(bufferlist& bl);
+ void finish_export();
+ // Abandon an export: release the PIN_TEMPEXPORTING reference.
+ void abort_export()
+ {
+   put(PIN_TEMPEXPORTING);
+ }
+ void decode_import(bufferlist::const_iterator& blp, LogSegment *ls);
+ void abort_import();
+
+ // -- auth pins --
+ bool can_auth_pin(int *err_ret=nullptr) const override;
+ // Current auth_pins / dir_auth_pins counters.
+ int get_auth_pins() const {
+   return auth_pins;
+ }
+ int get_dir_auth_pins() const {
+   return dir_auth_pins;
+ }
+ void auth_pin(void *who) override;
+ void auth_unpin(void *who) override;
+
+ void adjust_nested_auth_pins(int dirinc, void *by);
+ void verify_fragstat();
+
+ // -- freezing --
+ // State shared by the dirfrags of one freezing/frozen tree.
+ struct freeze_tree_state_t {
+   // freezing/frozen tree root
+   CDir *dir;
+   int auth_pins = 0;
+   // false while the tree is still freezing
+   bool frozen = false;
+
+   freeze_tree_state_t(CDir *d)
+     : dir(d)
+   {}
+ };
+ // all dirfrags within freezing/frozen tree reference the 'state'
+ std::shared_ptr<freeze_tree_state_t> freeze_tree_state;
+
+ void _walk_tree(std::function<bool(CDir*)> cb);
+
+ bool freeze_tree();
+ void _freeze_tree();
+ void unfreeze_tree();
+ void adjust_freeze_after_rename(CDir *dir);
+
+ bool freeze_dir();
+ void _freeze_dir();
+ void unfreeze_dir();
+
+ void maybe_finish_freeze();
+
+ // Returns (freezing, frozen) for the tree this dirfrag is part of.
+ // Both are false when no freeze_tree_state is attached; otherwise
+ // exactly one of them is true.
+ pair<bool,bool> is_freezing_or_frozen_tree() const {
+   if (!freeze_tree_state)
+     return make_pair(false, false);
+   const bool frozen = freeze_tree_state->frozen;
+   return make_pair(!frozen, frozen);
+ }
+
+ // Freezing either as a single dirfrag or as part of a tree.
+ bool is_freezing() const override {
+   if (is_freezing_dir())
+     return true;
+   return is_freezing_tree();
+ }
+ // The shared tree state is consulted only while num_freezing_trees is non-zero.
+ bool is_freezing_tree() const {
+   return num_freezing_trees && is_freezing_or_frozen_tree().first;
+ }
+ bool is_freezing_tree_root() const {
+   return state & STATE_FREEZINGTREE;
+ }
+ bool is_freezing_dir() const {
+   return state & STATE_FREEZINGDIR;
+ }
+
+ // Frozen either as a single dirfrag or as part of a tree.
+ bool is_frozen() const override {
+   if (is_frozen_dir())
+     return true;
+   return is_frozen_tree();
+ }
+ // The shared tree state is consulted only while num_frozen_trees is non-zero.
+ bool is_frozen_tree() const {
+   return num_frozen_trees && is_freezing_or_frozen_tree().second;
+ }
+ bool is_frozen_tree_root() const {
+   return state & STATE_FROZENTREE;
+ }
+ bool is_frozen_dir() const {
+   return state & STATE_FROZENDIR;
+ }
+
+ // Can the tree rooted here be frozen right now?  With freezing=true one
+ // auth pin is discounted before the check.
+ bool is_freezeable(bool freezing=false) const {
+   // no nested auth pins.
+   const int discount = freezing ? 1 : 0;
+   if (auth_pins - discount > 0)
+     return false;
+   if (freeze_tree_state && freeze_tree_state->auth_pins != auth_pins)
+     return false;
+
+   // inode must not be frozen.
+   const bool blocked_by_inode = !is_subtree_root() && inode->is_frozen();
+   return !blocked_by_inode;
+ }
+
+ // Can this single dirfrag be frozen right now?  With freezing=true one
+ // auth pin is discounted before the check.
+ bool is_freezeable_dir(bool freezing=false) const {
+   if ((auth_pins - freezing) > 0)
+     return false;
+   if (dir_auth_pins > 0)
+     return false;
+
+   // if not subtree root, inode must not be frozen (tree--frozen_dir is okay).
+   const bool blocked_by_inode =
+     !is_subtree_root() && inode->is_frozen() && !inode->is_frozen_dir();
+   return !blocked_by_inode;
+ }
+
+ ostream& print_db_line_prefix(ostream& out) override;
+ void print(ostream& out) override;
+ void dump(Formatter *f, int flags = DUMP_DEFAULT) const;
+ void dump_load(Formatter *f);
+};
+
+#endif
diff --git a/src/mds/CInode.cc b/src/mds/CInode.cc
new file mode 100644
index 00000000..e5491171
--- /dev/null
+++ b/src/mds/CInode.cc
@@ -0,0 +1,4959 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#include "include/int_types.h"
+#include "common/errno.h"
+
+#include <string>
+#include <stdio.h>
+
+#include "CInode.h"
+#include "CDir.h"
+#include "CDentry.h"
+
+#include "MDSRank.h"
+#include "MDCache.h"
+#include "MDLog.h"
+#include "Locker.h"
+#include "Mutation.h"
+
+#include "events/EUpdate.h"
+
+#include "osdc/Objecter.h"
+
+#include "snap.h"
+
+#include "LogSegment.h"
+
+#include "common/Clock.h"
+
+#include "common/config.h"
+#include "global/global_context.h"
+#include "include/ceph_assert.h"
+
+#include "mds/MDSContinuation.h"
+#include "mds/InoTable.h"
+#include "cephfs_features.h"
+
+#define dout_context g_ceph_context
+#define dout_subsys ceph_subsys_mds
+#undef dout_prefix
+#define dout_prefix *_dout << "mds." << mdcache->mds->get_nodeid() << ".cache.ino(" << inode.ino << ") "
+
+
+// I/O context tied to one CInode; the MDS rank is looked up through the
+// inode's cache.  The inode pointer must not be null.
+class CInodeIOContext : public MDSIOContextBase
+{
+public:
+  explicit CInodeIOContext(CInode *in_)
+    : in(in_)
+  {
+    ceph_assert(in != NULL);
+  }
+
+protected:
+  CInode *in;
+
+  MDSRank *get_mds() override {
+    return in->mdcache->mds;
+  }
+};
+
+sr_t* const CInode::projected_inode::UNDEF_SRNODE = (sr_t*)(unsigned long)-1; // sentinel pointer value; snapnode is compared against it to detect "no srnode projected"
+
+LockType CInode::versionlock_type(CEPH_LOCK_IVERSION); // one static LockType per inode lock; the CInode constructor passes each lock a pointer to its type
+LockType CInode::authlock_type(CEPH_LOCK_IAUTH);
+LockType CInode::linklock_type(CEPH_LOCK_ILINK);
+LockType CInode::dirfragtreelock_type(CEPH_LOCK_IDFT);
+LockType CInode::filelock_type(CEPH_LOCK_IFILE);
+LockType CInode::xattrlock_type(CEPH_LOCK_IXATTR);
+LockType CInode::snaplock_type(CEPH_LOCK_ISNAP);
+LockType CInode::nestlock_type(CEPH_LOCK_INEST);
+LockType CInode::flocklock_type(CEPH_LOCK_IFLOCK);
+LockType CInode::policylock_type(CEPH_LOCK_IPOLICY);
+
+//int cinode_pins[CINODE_NUM_PINS]; // counts
+// Write the prefix used for this inode's debug lines: the current time,
+// the MDS node id and the inode number.
+ostream& CInode::print_db_line_prefix(ostream& out)
+{
+  out << ceph_clock_now() << " mds." << mdcache->mds->get_nodeid()
+      << ".cache.ino(" << inode.ino << ") ";
+  return out;
+}
+
+/*
+ * write caps and lock ids
+ */
+struct cinode_lock_info_t cinode_lock_info[] = { // each row pairs a lock id with a client cap mask
+ { CEPH_LOCK_IFILE, CEPH_CAP_ANY_FILE_WR },
+ { CEPH_LOCK_IAUTH, CEPH_CAP_AUTH_EXCL },
+ { CEPH_LOCK_ILINK, CEPH_CAP_LINK_EXCL },
+ { CEPH_LOCK_IXATTR, CEPH_CAP_XATTR_EXCL },
+};
+int num_cinode_locks = sizeof(cinode_lock_info) / sizeof(cinode_lock_info[0]); // number of rows in the table above
+
+
+
+ostream& operator<<(ostream& out, const CInode& in) // debug-print an inode as one bracketed "[inode ...]" record
+{
+ string path;
+ in.make_path_string(path, true);
+
+ out << "[inode " << in.inode.ino;
+ out << " ["
+ << (in.is_multiversion() ? "...":"")
+ << in.first << "," << in.last << "]";
+ out << " " << path << (in.is_dir() ? "/":"");
+
+ if (in.is_auth()) { // auth copy: list replicas; otherwise show the authority pair and our replica nonce
+ out << " auth";
+ if (in.is_replicated())
+ out << in.get_replicas();
+ } else {
+ mds_authority_t a = in.authority();
+ out << " rep@" << a.first;
+ if (a.second != CDIR_AUTH_UNKNOWN)
+ out << "," << a.second;
+ out << "." << in.get_replica_nonce();
+ }
+
+ if (in.is_symlink())
+ out << " symlink='" << in.symlink << "'";
+ if (in.is_dir() && !in.dirfragtree.empty())
+ out << " " << in.dirfragtree;
+
+ out << " v" << in.get_version();
+ if (in.get_projected_version() > in.get_version()) // projected version is shown only when it is ahead
+ out << " pv" << in.get_projected_version();
+
+ if (in.get_num_auth_pins()) {
+ out << " ap=" << in.get_num_auth_pins();
+#ifdef MDS_AUTHPIN_SET
+ in.print_authpin_set(out);
+#endif
+ }
+
+ if (in.snaprealm)
+ out << " snaprealm=" << in.snaprealm;
+
+ if (in.state_test(CInode::STATE_AMBIGUOUSAUTH)) out << " AMBIGAUTH";
+ if (in.state_test(CInode::STATE_NEEDSRECOVER)) out << " needsrecover";
+ if (in.state_test(CInode::STATE_RECOVERING)) out << " recovering";
+ if (in.state_test(CInode::STATE_DIRTYPARENT)) out << " dirtyparent";
+ if (in.state_test(CInode::STATE_MISSINGOBJS)) out << " missingobjs";
+ if (in.is_freezing_inode()) out << " FREEZING=" << in.auth_pin_freeze_allowance;
+ if (in.is_frozen_inode()) out << " FROZEN";
+ if (in.is_frozen_auth_pin()) out << " FROZEN_AUTHPIN";
+
+ const CInode::mempool_inode *pi = in.get_projected_inode();
+ if (pi->is_truncating())
+ out << " truncating(" << pi->truncate_from << " to " << pi->truncate_size << ")";
+
+ if (in.inode.is_dir()) {
+ out << " " << in.inode.dirstat;
+ if (g_conf()->mds_debug_scatterstat && in.is_projected()) {
+ const CInode::mempool_inode *pi = in.get_projected_inode(); // NOTE(review): shadows the outer pi; same call on the same object
+ out << "->" << pi->dirstat;
+ }
+ } else {
+ out << " s=" << in.inode.size;
+ if (in.inode.nlink != 1)
+ out << " nl=" << in.inode.nlink;
+ }
+
+ // rstat
+ out << " " << in.inode.rstat;
+ if (!(in.inode.rstat == in.inode.accounted_rstat)) // accounted_rstat is printed only when it differs from rstat
+ out << "/" << in.inode.accounted_rstat;
+ if (g_conf()->mds_debug_scatterstat && in.is_projected()) {
+ const CInode::mempool_inode *pi = in.get_projected_inode(); // NOTE(review): shadows the outer pi again
+ out << "->" << pi->rstat;
+ if (!(pi->rstat == pi->accounted_rstat))
+ out << "/" << pi->accounted_rstat;
+ }
+
+ if (!in.client_need_snapflush.empty())
+ out << " need_snapflush=" << in.client_need_snapflush;
+
+
+ // locks
+ if (!in.authlock.is_sync_and_unlocked()) // every lock below is printed only when it is not sync-and-unlocked
+ out << " " << in.authlock;
+ if (!in.linklock.is_sync_and_unlocked())
+ out << " " << in.linklock;
+ if (in.inode.is_dir()) {
+ if (!in.dirfragtreelock.is_sync_and_unlocked())
+ out << " " << in.dirfragtreelock;
+ if (!in.snaplock.is_sync_and_unlocked())
+ out << " " << in.snaplock;
+ if (!in.nestlock.is_sync_and_unlocked())
+ out << " " << in.nestlock;
+ if (!in.policylock.is_sync_and_unlocked())
+ out << " " << in.policylock;
+ } else {
+ if (!in.flocklock.is_sync_and_unlocked())
+ out << " " << in.flocklock;
+ }
+ if (!in.filelock.is_sync_and_unlocked())
+ out << " " << in.filelock;
+ if (!in.xattrlock.is_sync_and_unlocked())
+ out << " " << in.xattrlock;
+ if (!in.versionlock.is_sync_and_unlocked())
+ out << " " << in.versionlock;
+
+ // hack: spit out crap on which clients have caps
+ if (in.inode.client_ranges.size())
+ out << " cr=" << in.inode.client_ranges;
+
+ if (!in.get_client_caps().empty()) {
+ out << " caps={";
+ bool first = true;
+ for (const auto &p : in.get_client_caps()) { // per client: pending[/issued]/wanted@last_seq
+ if (!first) out << ",";
+ out << p.first << "="
+ << ccap_string(p.second.pending());
+ if (p.second.issued() != p.second.pending())
+ out << "/" << ccap_string(p.second.issued());
+ out << "/" << ccap_string(p.second.wanted())
+ << "@" << p.second.get_last_seq();
+ first = false;
+ }
+ out << "}";
+ if (in.get_loner() >= 0 || in.get_wanted_loner() >= 0) {
+ out << ",l=" << in.get_loner();
+ if (in.get_loner() != in.get_wanted_loner())
+ out << "(" << in.get_wanted_loner() << ")";
+ }
+ }
+ if (!in.get_mds_caps_wanted().empty()) {
+ out << " mcw={";
+ bool first = true;
+ for (const auto &p : in.get_mds_caps_wanted()) {
+ if (!first)
+ out << ',';
+ out << p.first << '=' << ccap_string(p.second);
+ first = false;
+ }
+ out << '}';
+ }
+
+ if (in.get_num_ref()) {
+ out << " |";
+ in.print_pin_set(out);
+ }
+
+ if (in.inode.export_pin != MDS_RANK_NONE) {
+ out << " export_pin=" << in.inode.export_pin;
+ }
+
+ out << " " << &in; // address of the CInode object itself
+ out << "]";
+ return out;
+}
+
+// Print a scrub_stamp_info_t as a single brace-delimited record:
+//   {scrub_start_version: V, scrub_start_stamp: T, last_scrub_version: V, last_scrub_stamp: T}
+// Returns `out` so calls can be chained.
+ostream& operator<<(ostream& out, const CInode::scrub_stamp_info_t& si)
+{
+  out << "{scrub_start_version: " << si.scrub_start_version
+      << ", scrub_start_stamp: " << si.scrub_start_stamp
+      << ", last_scrub_version: " << si.last_scrub_version
+      << ", last_scrub_stamp: " << si.last_scrub_stamp
+      << "}";  // close the brace opened at the start; it was left unbalanced before
+  return out;
+}
+
+CInode::CInode(MDCache *c, bool auth, snapid_t f, snapid_t l) // build an inode owned by cache c with snap range [f,l]; auth=true sets STATE_AUTH
+ :
+ mdcache(c),
+ first(f), last(l),
+ item_dirty(this), // the item_* members are all constructed with a back-pointer to this inode
+ item_caps(this),
+ item_open_file(this),
+ item_dirty_parent(this),
+ item_dirty_dirfrag_dir(this),
+ item_dirty_dirfrag_nest(this),
+ item_dirty_dirfrag_dirfragtree(this),
+ pop(c->decayrate), // pop is initialised from the cache's decayrate
+ versionlock(this, &versionlock_type), // each lock gets this inode plus a pointer to its static LockType
+ authlock(this, &authlock_type),
+ linklock(this, &linklock_type),
+ dirfragtreelock(this, &dirfragtreelock_type),
+ filelock(this, &filelock_type),
+ xattrlock(this, &xattrlock_type),
+ snaplock(this, &snaplock_type),
+ nestlock(this, &nestlock_type),
+ flocklock(this, &flocklock_type),
+ policylock(this, &policylock_type)
+{
+ if (auth) state_set(STATE_AUTH);
+}
+
+// Stream this inode to `out` using the CInode operator<<.
+void CInode::print(ostream& out)
+{
+  const CInode &self = *this;
+  out << self;
+}
+
+// Record that `client` still owes a snap flush for `snapid`.
+// The first pending entry overall takes PIN_NEEDSNAPFLUSH and auth-pins this
+// (head) inode; the first client for a given snapid auth-pins `snapin`.
+void CInode::add_need_snapflush(CInode *snapin, snapid_t snapid, client_t client)
+{
+  dout(10) << __func__ << " client." << client << " snapid " << snapid << " on " << snapin << dendl;
+
+  const bool nothing_pending = client_need_snapflush.empty();
+  if (nothing_pending) {
+    get(CInode::PIN_NEEDSNAPFLUSH);
+
+    // FIXME: this is non-optimal, as we'll block freezes/migrations for potentially
+    // long periods waiting for clients to flush their snaps.
+    auth_pin(this);   // pin head inode...
+  }
+
+  auto &waiting_clients = client_need_snapflush[snapid];
+  if (waiting_clients.empty()) {
+    snapin->auth_pin(this);  // ...and pin snapped/old inode!
+  }
+  waiting_clients.insert(client);
+}
+
+// Undo add_need_snapflush() for (snapid, client).  Unknown snapids or
+// clients are logged and ignored.  When the last client for a snapid goes
+// away `snapin` is unpinned; when nothing is pending at all any more, the
+// head inode drops PIN_NEEDSNAPFLUSH and its own auth pin.
+void CInode::remove_need_snapflush(CInode *snapin, snapid_t snapid, client_t client)
+{
+  dout(10) << __func__ << " client." << client << " snapid " << snapid << " on " << snapin << dendl;
+
+  auto entry = client_need_snapflush.find(snapid);
+  if (entry == client_need_snapflush.end()) {
+    dout(10) << " snapid not found" << dendl;
+    return;
+  }
+
+  if (entry->second.erase(client) == 0) {
+    dout(10) << " client not found" << dendl;
+    return;
+  }
+
+  if (!entry->second.empty())
+    return;
+
+  client_need_snapflush.erase(entry);
+  snapin->auth_unpin(this);
+
+  if (!client_need_snapflush.empty())
+    return;
+
+  put(CInode::PIN_NEEDSNAPFLUSH);
+  auth_unpin(this);
+}
+
+pair<bool,bool> CInode::split_need_snapflush(CInode *cowin, CInode *in) // redistribute snapflush auth pins between cowin and in; returns (cowin_need_flush, orig_need_flush)
+{
+ dout(10) << __func__ << " [" << cowin->first << "," << cowin->last << "] for " << *cowin << dendl;
+ bool cowin_need_flush = false;
+ bool orig_need_flush = false;
+ auto it = client_need_snapflush.lower_bound(cowin->first);
+ while (it != client_need_snapflush.end() && it->first < in->first) { // walk pending snapids in [cowin->first, in->first)
+ ceph_assert(!it->second.empty());
+ if (cowin->last >= it->first) { // snapid is covered by cowin: pin cowin and keep the entry
+ cowin->auth_pin(this);
+ cowin_need_flush = true;
+ ++it;
+ } else { // snapid is past cowin->last: drop the entry
+ it = client_need_snapflush.erase(it);
+ }
+ in->auth_unpin(this); // `in` gives up one pin for every entry visited
+ }
+
+ if (it != client_need_snapflush.end() && it->first <= in->last) // the next remaining entry still falls inside in's range
+ orig_need_flush = true;
+
+ return make_pair(cowin_need_flush, orig_need_flush);
+}
+
+// Flag this inode as having dirty rstat (no-op if already flagged).  When
+// the projected parent dentry is auth, the inode is queued on the parent
+// dir's dirty_rstat_inodes list and the parent's nestlock is marked updated.
+void CInode::mark_dirty_rstat()
+{
+  if (state_test(STATE_DIRTYRSTAT))
+    return;
+
+  dout(10) << __func__ << dendl;
+  state_set(STATE_DIRTYRSTAT);
+  get(PIN_DIRTYRSTAT);
+
+  CDentry *pdn = get_projected_parent_dn();
+  if (!pdn->is_auth()) {
+    // under cross-MDS rename.
+    // DIRTYRSTAT flag will get cleared when rename finishes
+    ceph_assert(state_test(STATE_AMBIGUOUSAUTH));
+    return;
+  }
+
+  CDir *pdir = pdn->dir;
+  pdir->dirty_rstat_inodes.push_back(&dirty_rstat_item);
+  mdcache->mds->locker->mark_updated_scatterlock(&pdir->inode->nestlock);
+}
+// Reverse of mark_dirty_rstat(): clear the flag, drop the pin and unlink
+// dirty_rstat_item from whatever list it is on.  No-op if not flagged.
+void CInode::clear_dirty_rstat()
+{
+  if (!state_test(STATE_DIRTYRSTAT))
+    return;
+
+  dout(10) << __func__ << dendl;
+  state_clear(STATE_DIRTYRSTAT);
+  put(PIN_DIRTYRSTAT);
+  dirty_rstat_item.remove_myself();
+}
+
+// Append a new projected inode, copied from the newest existing projection
+// (or from the live inode when nothing is projected yet), and return it.
+//   xattr - also project a copy of the current projected xattrs
+//   snap  - also project a snaprealm
+// Pending scrub stamps (last_scrub_dirty) are folded into the new projection.
+CInode::projected_inode &CInode::project_inode(bool xattr, bool snap)
+{
+  auto &base = projected_nodes.empty() ? inode : projected_nodes.back().inode;
+  auto &pi = projected_nodes.emplace_back(base);
+
+  const bool scrub_pending = scrub_infop && scrub_infop->last_scrub_dirty;
+  if (scrub_pending) {
+    pi.inode.last_scrub_stamp = scrub_infop->last_scrub_stamp;
+    pi.inode.last_scrub_version = scrub_infop->last_scrub_version;
+    scrub_infop->last_scrub_dirty = false;
+    scrub_maybe_delete_info();
+  }
+
+  if (xattr) {
+    pi.xattrs.reset(new mempool_xattr_map(*get_projected_xattrs()));
+    ++num_projected_xattrs;
+  }
+
+  if (snap)
+    project_snaprealm();
+
+  dout(15) << __func__ << " " << pi.inode.ino << dendl;
+  return pi;
+}
+
+void CInode::pop_and_dirty_projected_inode(LogSegment *ls) // apply the oldest projection (inode, plus xattrs/srnode if projected) to the live inode, mark it dirty in ls, then drop it
+{
+ ceph_assert(!projected_nodes.empty());
+ auto &front = projected_nodes.front();
+ dout(15) << __func__ << " " << front.inode.ino
+ << " v" << front.inode.version << dendl;
+ int64_t old_pool = inode.layout.pool_id; // remembered before `inode` is overwritten below
+
+ mark_dirty(front.inode.version, ls);
+ bool new_export_pin = inode.export_pin != front.inode.export_pin; // compared before the overwrite as well
+ inode = front.inode;
+ if (new_export_pin)
+ maybe_export_pin(true);
+
+ if (inode.is_backtrace_updated())
+ mark_dirty_parent(ls, old_pool != inode.layout.pool_id); // second argument is true when the layout pool changed
+
+ if (front.xattrs) {
+ --num_projected_xattrs;
+ xattrs = *front.xattrs;
+ }
+
+ if (projected_nodes.front().snapnode != projected_inode::UNDEF_SRNODE) { // UNDEF_SRNODE: no snaprealm was projected with this inode
+ pop_projected_snaprealm(projected_nodes.front().snapnode, false);
+ --num_projected_srnodes;
+ }
+
+ projected_nodes.pop_front(); // `front` is dangling from here on
+}
+
+sr_t *CInode::prepare_new_srnode(snapid_t snapid) // return a heap-allocated sr_t to project next: a copy of the current projected srnode if there is one, else a fresh one seeded from snapid
+{
+ const sr_t *cur_srnode = get_projected_srnode();
+ sr_t *new_srnode;
+
+ if (cur_srnode) {
+ new_srnode = new sr_t(*cur_srnode);
+ if (!new_srnode->past_parents.empty()) {
+ // convert past_parents to past_parent_snaps
+ ceph_assert(snaprealm);
+ auto& snaps = snaprealm->get_snaps();
+ for (auto p : snaps) {
+ if (p >= new_srnode->current_parent_since) // stop at the first snap that is not older than current_parent_since
+ break;
+ if (!new_srnode->snaps.count(p)) // skip snaps the srnode already lists itself
+ new_srnode->past_parent_snaps.insert(p);
+ }
+ new_srnode->seq = snaprealm->get_newest_seq();
+ new_srnode->past_parents.clear();
+ }
+ if (snaprealm)
+ snaprealm->past_parents_dirty = false;
+ } else {
+ if (snapid == 0) // 0 means: use the global snaprealm's newest seq
+ snapid = mdcache->get_global_snaprealm()->get_newest_seq();
+ new_srnode = new sr_t();
+ new_srnode->seq = snapid;
+ new_srnode->created = snapid;
+ new_srnode->current_parent_since = get_oldest_snap();
+ }
+ return new_srnode;
+}
+
+// Attach `new_srnode` to the newest projected inode, which must not have a
+// projected srnode yet, and count it.
+void CInode::project_snaprealm(sr_t *new_srnode)
+{
+  dout(10) << __func__ << " " << new_srnode << dendl;
+  ceph_assert(projected_nodes.back().snapnode == projected_inode::UNDEF_SRNODE);
+
+  auto &newest = projected_nodes.back();
+  newest.snapnode = new_srnode;
+  ++num_projected_srnodes;
+}
+
+// Switch `new_srnode` to the global parent.  Only valid for non-directories.
+// The previous current_parent_since is stashed in last_destroyed so that
+// clear_snaprealm_global() can restore it.
+void CInode::mark_snaprealm_global(sr_t *new_srnode)
+{
+  ceph_assert(!is_dir());
+
+  // 'last_destroyed' is no longer used, use it to store origin 'current_parent_since'
+  const auto origin_since = new_srnode->current_parent_since;
+  new_srnode->last_destroyed = origin_since;
+
+  new_srnode->current_parent_since =
+    mdcache->get_global_snaprealm()->get_newest_seq() + 1;
+  new_srnode->mark_parent_global();
+}
+
+// Inverse of mark_snaprealm_global(): bring back the stashed
+// current_parent_since, reset last_destroyed, refresh seq from the global
+// snaprealm and clear the global-parent marker.
+void CInode::clear_snaprealm_global(sr_t *new_srnode)
+{
+  // restore 'current_parent_since'
+  const auto stashed_since = new_srnode->last_destroyed;
+  new_srnode->current_parent_since = stashed_since;
+  new_srnode->last_destroyed = 0;
+
+  new_srnode->seq = mdcache->get_global_snaprealm()->get_newest_seq();
+  new_srnode->clear_parent_global();
+}
+
+// True when a projected srnode exists and it is marked parent-global.
+bool CInode::is_projected_snaprealm_global() const
+{
+  const sr_t *projected = get_projected_srnode();
+  return projected && projected->is_parent_global();
+}
+
+// Project a new srnode and record the move to `newparent` in it.
+void CInode::project_snaprealm_past_parent(SnapRealm *newparent)
+{
+  record_snaprealm_past_parent(project_snaprealm(), newparent);
+}
+
+
+/* If newparent differs from the current parent realm, fold the old parent's
+   snaps (from current_parent_since onward) into new_snap->past_parent_snaps
+   and restart current_parent_since after the global realm's newest seq.
+   If this inode has no realm of its own, the current parent is looked up
+   with find_snaprealm(). */
+void CInode::record_snaprealm_past_parent(sr_t *new_snap, SnapRealm *newparent)
+{
+  ceph_assert(!new_snap->is_parent_global());
+
+  SnapRealm *oldparent = nullptr;
+  if (snaprealm)
+    oldparent = snaprealm->parent;
+  else
+    oldparent = find_snaprealm();
+
+  if (newparent == oldparent)
+    return;
+
+  snapid_t oldparentseq = oldparent->get_newest_seq();
+  if (oldparentseq + 1 > new_snap->current_parent_since) {
+    // copy old parent's snaps
+    const set<snapid_t>& snaps = oldparent->get_snaps();
+    auto from = snaps.lower_bound(new_snap->current_parent_since);
+    if (from != snaps.end())
+      new_snap->past_parent_snaps.insert(from, snaps.end());
+    if (oldparentseq > new_snap->seq)
+      new_snap->seq = oldparentseq;
+  }
+  new_snap->current_parent_since = mdcache->get_global_snaprealm()->get_newest_seq() + 1;
+}
+
+// For a parent-global srnode: add the old parent's snaps to
+// new_snap->past_parent_snaps.  For a remote dentry the range starts at
+// dn->first; for the primary dentry it starts at last_destroyed, which is
+// then moved past the global realm's newest seq.  When `oldparent` is null
+// it is looked up from dn's directory inode.
+void CInode::record_snaprealm_parent_dentry(sr_t *new_snap, SnapRealm *oldparent,
+                                            CDentry *dn, bool primary_dn)
+{
+  ceph_assert(new_snap->is_parent_global());
+
+  if (!oldparent)
+    oldparent = dn->get_dir()->inode->find_snaprealm();
+  auto& snaps = oldparent->get_snaps();
+
+  // 'last_destroyed' is used as 'current_parent_since'
+  const snapid_t since = primary_dn ? new_snap->last_destroyed : dn->first;
+
+  auto from = snaps.lower_bound(since);
+  if (from != snaps.end())
+    new_snap->past_parent_snaps.insert(from, snaps.end());
+
+  if (primary_dn)
+    new_snap->last_destroyed = mdcache->get_global_snaprealm()->get_newest_seq() + 1;
+}
+
+// Apply the oldest projection's srnode ahead of the inode itself, leaving
+// the projection in place with its snapnode reset to UNDEF_SRNODE.
+void CInode::early_pop_projected_snaprealm()
+{
+  ceph_assert(!projected_nodes.empty());
+
+  auto &oldest = projected_nodes.front();
+  if (oldest.snapnode == projected_inode::UNDEF_SRNODE)
+    return;
+
+  pop_projected_snaprealm(oldest.snapnode, true);
+  oldest.snapnode = projected_inode::UNDEF_SRNODE;
+  --num_projected_srnodes;
+}
+
+void CInode::pop_projected_snaprealm(sr_t *next_snaprealm, bool early) // install next_snaprealm as this inode's srnode (it is copied, then deleted); a null pointer merges the realm away instead
+{
+ if (next_snaprealm) {
+ dout(10) << __func__ << (early ? " (early) " : " ")
+ << next_snaprealm << " seq " << next_snaprealm->seq << dendl;
+ bool invalidate_cached_snaps = false;
+ if (!snaprealm) { // no realm yet: open one to receive the srnode
+ open_snaprealm();
+ } else if (!snaprealm->srnode.past_parents.empty()) {
+ invalidate_cached_snaps = true;
+ // re-open past parents
+ snaprealm->close_parents();
+
+ dout(10) << " realm " << *snaprealm << " past_parents " << snaprealm->srnode.past_parents
+ << " -> " << next_snaprealm->past_parents << dendl;
+ }
+ auto old_flags = snaprealm->srnode.flags; // kept to detect a PARENT_GLOBAL flip below
+ snaprealm->srnode = *next_snaprealm;
+ delete next_snaprealm; // contents were copied on the line above
+
+ if ((snaprealm->srnode.flags ^ old_flags) & sr_t::PARENT_GLOBAL) { // the PARENT_GLOBAL bit changed
+ snaprealm->close_parents();
+ snaprealm->adjust_parent();
+ }
+
+ // we should be able to open these up (or have them already be open).
+ bool ok = snaprealm->_open_parents(NULL);
+ ceph_assert(ok);
+
+ if (invalidate_cached_snaps)
+ snaprealm->invalidate_cached_snaps();
+
+ if (snaprealm->parent)
+ dout(10) << " realm " << *snaprealm << " parent " << *snaprealm->parent << dendl;
+ } else {
+ dout(10) << __func__ << (early ? " (early) null" : " null") << dendl;
+ ceph_assert(snaprealm);
+ snaprealm->merge_to(NULL); // NOTE(review): presumably folds this realm into its parent; confirm in SnapRealm::merge_to
+ }
+}
+
+
+// ====== CInode =======
+
+// dirfrags
+
+// Hash a dentry name with the function selected by the inode's dir layout,
+// using CEPH_STR_HASH_LINUX when the layout leaves it as 0.
+__u32 InodeStoreBase::hash_dentry_name(std::string_view dn)
+{
+  const int configured = inode.dir_layout.dl_dir_hash;
+  int which = configured ? configured : CEPH_STR_HASH_LINUX;
+  ceph_assert(ceph_str_hash_valid(which));
+  return ceph_str_hash(which, dn.data(), dn.length());
+}
+
+// Choose the fragment that dentry name `dn` maps to: the root fragment when
+// the fragtree is empty, otherwise the fragtree lookup of the name's hash.
+frag_t InodeStoreBase::pick_dirfrag(std::string_view dn)
+{
+  if (!dirfragtree.empty()) {
+    __u32 h = hash_dentry_name(dn);
+    return dirfragtree[h];
+  }
+  return frag_t();  // avoid the string hash if we can.
+}
+
+bool CInode::get_dirfrags_under(frag_t fg, list<CDir*>& ls) // append the open dirfrags under fg to ls; returns true when the open frags cover every leaf under fg
+{
+ bool all = true;
+ {
+ frag_vec_t leaves;
+ dirfragtree.get_leaves_under(fg, leaves); // first pass: the leaves dirfragtree says exist under fg
+ for (const auto &leaf : leaves) {
+ if (auto it = dirfrags.find(leaf); it != dirfrags.end()) {
+ ls.push_back(it->second);
+ } else {
+ all = false;
+ }
+ }
+ }
+
+ if (all)
+ return all;
+
+ fragtree_t tmpdft; // second pass: build a tree from fg plus the frags that are actually open
+ tmpdft.force_to_leaf(g_ceph_context, fg);
+ for (auto &p : dirfrags) {
+ tmpdft.force_to_leaf(g_ceph_context, p.first);
+ if (fg.contains(p.first) && !dirfragtree.is_leaf(p.first)) // open frags that are not dirfragtree leaves were not added by the first pass
+ ls.push_back(p.second);
+ }
+
+ all = true;
+ {
+ frag_vec_t leaves;
+ tmpdft.get_leaves_under(fg, leaves);
+ for (const auto& leaf : leaves) {
+ if (!dirfrags.count(leaf)) { // a leaf of the rebuilt tree that is not open: coverage is incomplete
+ all = false;
+ break;
+ }
+ }
+ }
+
+ return all;
+}
+
+// Assert that every open dirfrag is a leaf of dirfragtree, logging each
+// offender first.
+void CInode::verify_dirfrags()
+{
+  bool bad = false;
+  for (const auto &p : dirfrags) {
+    if (dirfragtree.is_leaf(p.first))
+      continue;
+    dout(0) << "have open dirfrag " << p.first << " but not leaf in " << dirfragtree
+            << ": " << *p.second << dendl;
+    bad = true;
+  }
+  ceph_assert(!bad);
+}
+
+// If any open dirfrag is not a leaf of dirfragtree, ask the cache to force
+// a dirfrag for every leaf of the tree, then verify the result.
+void CInode::force_dirfrags()
+{
+  bool bad = false;
+  for (auto &p : dirfrags) {
+    if (dirfragtree.is_leaf(p.first))
+      continue;
+    dout(0) << "have open dirfrag " << p.first << " but not leaf in " << dirfragtree
+            << ": " << *p.second << dendl;
+    bad = true;
+  }
+
+  if (bad) {
+    frag_vec_t leaves;
+    dirfragtree.get_leaves(leaves);
+    for (const auto& leaf : leaves)
+      mdcache->get_force_dirfrag(dirfrag_t(ino(), leaf), true);
+  }
+
+  verify_dirfrags();
+}
+
+// Best-effort lookup of an open dirfrag for `fg`: the exact fragment if
+// open, else the first open fragment under it, else the nearest open
+// ancestor fragment.  Returns NULL when none of these exist.
+CDir *CInode::get_approx_dirfrag(frag_t fg)
+{
+  if (CDir *exact = get_dirfrag(fg))
+    return exact;
+
+  // find a child?
+  list<CDir*> ls;
+  get_dirfrags_under(fg, ls);
+  if (!ls.empty())
+    return ls.front();
+
+  // try parents?
+  while (fg.bits() > 0) {
+    fg = fg.parent();
+    if (CDir *ancestor = get_dirfrag(fg))
+      return ancestor;
+  }
+  return NULL;
+}
+
+// Return the open dirfrag `fg`, creating and registering it when it is not
+// open yet.  Creation requires this inode to be auth or the MDS to be in
+// some replay state.  Only valid on directories.
+CDir *CInode::get_or_open_dirfrag(MDCache *mdcache, frag_t fg)
+{
+  ceph_assert(is_dir());
+
+  // have it?
+  if (CDir *existing = get_dirfrag(fg))
+    return existing;
+
+  // create it.
+  ceph_assert(is_auth() || mdcache->mds->is_any_replay());
+  CDir *dir = new CDir(this, fg, mdcache, is_auth());
+  add_dirfrag(dir);
+  return dir;
+}
+
+// Register `dir` under its fragment (which must not be registered yet).
+// While sticky references are held the new dirfrag is made sticky too.
+// Returns `dir`.
+CDir *CInode::add_dirfrag(CDir *dir)
+{
+  const auto fg = dir->dirfrag().frag;
+  auto em = dirfrags.emplace(std::piecewise_construct,
+                             std::forward_as_tuple(fg),
+                             std::forward_as_tuple(dir));
+  ceph_assert(em.second);
+
+  const bool sticky = stickydir_ref > 0;
+  if (sticky) {
+    dir->state_set(CDir::STATE_STICKY);
+    dir->get(CDir::PIN_STICKY);
+  }
+
+  maybe_export_pin();
+
+  return dir;
+}
+
+void CInode::close_dirfrag(frag_t fg) // tear down and delete the open dirfrag fg; it must be open and end up with no references
+{
+ dout(14) << __func__ << " " << fg << dendl;
+ ceph_assert(dirfrags.count(fg));
+
+ CDir *dir = dirfrags[fg]; // presence was asserted above, so operator[] does not insert here
+ dir->remove_null_dentries();
+
+ // clear dirty flag
+ if (dir->is_dirty())
+ dir->mark_clean();
+
+ if (stickydir_ref > 0) { // undo the sticky state/pin given while sticky references are held
+ dir->state_clear(CDir::STATE_STICKY);
+ dir->put(CDir::PIN_STICKY);
+ }
+
+ if (dir->is_subtree_root())
+ num_subtree_roots--;
+
+ // dump any remaining dentries, for debugging purposes
+ for (const auto &p : dir->items)
+ dout(14) << __func__ << " LEFTOVER dn " << *p.second << dendl;
+
+ ceph_assert(dir->get_num_ref() == 0); // nothing may still reference the dirfrag when it is deleted
+ delete dir;
+ dirfrags.erase(fg);
+}
+
+// Close every open dirfrag, always taking the first remaining entry.
+void CInode::close_dirfrags()
+{
+  while (!dirfrags.empty()) {
+    auto first_open = dirfrags.begin();
+    close_dirfrag(first_open->first);
+  }
+}
+
+// Does this inode have a dirfrag that is a subtree root?  auth == -1
+// accepts any subtree root; otherwise its dir_auth.first must equal auth.
+bool CInode::has_subtree_root_dirfrag(int auth)
+{
+  if (!(num_subtree_roots > 0))
+    return false;
+  if (auth == -1)
+    return true;
+
+  for (const auto &p : dirfrags) {
+    CDir *dir = p.second;
+    if (dir->is_subtree_root() && dir->dir_auth.first == auth)
+      return true;
+  }
+  return false;
+}
+
+// True if any dirfrag is counted as a subtree root or as exporting.
+bool CInode::has_subtree_or_exporting_dirfrag()
+{
+  return num_subtree_roots > 0 || num_exporting_dirs > 0;
+}
+
+// Take one sticky-dirs reference.  The first reference pins the inode and
+// marks and pins every currently open dirfrag as sticky.
+void CInode::get_stickydirs()
+{
+  const bool first_ref = (stickydir_ref == 0);
+  if (first_ref) {
+    get(PIN_STICKYDIRS);
+    for (const auto &entry : dirfrags) {
+      CDir *dir = entry.second;
+      dir->state_set(CDir::STATE_STICKY);
+      dir->get(CDir::PIN_STICKY);
+    }
+  }
+  ++stickydir_ref;
+}
+
+// Drop one sticky-dirs reference.  When the last one goes away, unpin the
+// inode and clear the sticky state and pin on every open dirfrag.
+void CInode::put_stickydirs()
+{
+  ceph_assert(stickydir_ref > 0);
+  --stickydir_ref;
+  if (stickydir_ref != 0)
+    return;
+
+  put(PIN_STICKYDIRS);
+  for (const auto &entry : dirfrags) {
+    CDir *dir = entry.second;
+    dir->state_clear(CDir::STATE_STICKY);
+    dir->put(CDir::PIN_STICKY);
+  }
+}
+
+
+
+
+
+// pins
+
+// Pin my parent dentry, if I am linked to one.
+void CInode::first_get()
+{
+  if (!parent)
+    return;
+  parent->get(CDentry::PIN_INODEPIN);
+}
+
+// Unpin my parent dentry, if I am linked to one.
+void CInode::last_put()
+{
+  if (!parent)
+    return;
+  parent->put(CDentry::PIN_INODEPIN);
+}
+
+// If the only references left are those accounted to the dirty and
+// dirty-parent states, ask the cache to evaluate this inode as a stray.
+void CInode::_put()
+{
+  const int dirty_refs =
+    static_cast<int>(is_dirty()) + static_cast<int>(is_dirty_parent());
+  if (get_num_ref() == dirty_refs) {
+    mdcache->maybe_eval_stray(this, true);
+  }
+}
+
+// Record p as a remote parent dentry; the first one pins the inode.
+void CInode::add_remote_parent(CDentry *p)
+{
+  const bool was_empty = remote_parents.empty();
+  if (was_empty) {
+    get(PIN_REMOTEPARENT);
+  }
+  remote_parents.insert(p);
+}
+// Forget remote parent dentry p; drop the pin once none are left.
+void CInode::remove_remote_parent(CDentry *p)
+{
+  remote_parents.erase(p);
+  if (!remote_parents.empty())
+    return;
+  put(PIN_REMOTEPARENT);
+}
+
+
+
+
+// Directory holding my parent dentry, or NULL when I have no parent.
+CDir *CInode::get_parent_dir()
+{
+  if (!parent)
+    return NULL;
+  return parent->dir;
+}
+// Directory holding my projected parent dentry, or NULL when there is none.
+CDir *CInode::get_projected_parent_dir()
+{
+  CDentry *pdn = get_projected_parent_dn();
+  if (!pdn)
+    return NULL;
+  return pdn->dir;
+}
+// Inode of the directory holding my parent dentry, or NULL without a parent.
+CInode *CInode::get_parent_inode()
+{
+  if (!parent)
+    return NULL;
+  return parent->dir->inode;
+}
+
+// Walk upwards from other through each inode's oldest parent dentry and
+// report whether this inode is reached.  An inode is its own ancestor.  The
+// walk ends at an inode without a parent, which must be a base inode.
+bool CInode::is_ancestor_of(const CInode *other) const
+{
+  while (other) {
+    if (this == other)
+      return true;
+
+    const CDentry *up = other->get_oldest_parent_dn();
+    if (up) {
+      other = up->get_dir()->get_inode();
+      continue;
+    }
+
+    ceph_assert(other->is_base());
+    break;
+  }
+  return false;
+}
+
+// Same walk as is_ancestor_of(), but following each inode's projected parent
+// dentry instead of its oldest parent dentry.
+bool CInode::is_projected_ancestor_of(const CInode *other) const
+{
+  while (other) {
+    if (this == other)
+      return true;
+
+    const CDentry *up = other->get_projected_parent_dn();
+    if (up) {
+      other = up->get_dir()->get_inode();
+      continue;
+    }
+
+    ceph_assert(other->is_base());
+    break;
+  }
+  return false;
+}
+
+/*
+ * Because a non-directory inode may have multiple links, the use_parent
+ * argument allows selecting which parent to use for path construction. This
+ * argument is only meaningful for the final component (i.e. the first of the
+ * nested calls) because directories cannot have multiple hard links. If
+ * use_parent is NULL and projected is true, the primary parent's projected
+ * inode is used all the way up the path chain. Otherwise the primary parent
+ * stable inode is used.
+ *
+ * Without any parent the result depends on the inode: the root yields an
+ * empty string, an mdsdir yields "~mds<N>" (N = ino minus
+ * MDS_INO_MDSDIR_OFFSET), and anything else appends "#<ino in hex>" to s.
+ */
+void CInode::make_path_string(string& s, bool projected, const CDentry *use_parent) const
+{
+  if (!use_parent) {
+    use_parent = projected ? get_projected_parent_dn() : parent;
+  }
+
+  if (use_parent) {
+    use_parent->make_path_string(s, projected);
+  } else if (is_root()) {
+    s = "";
+  } else if (is_mdsdir()) {
+    char t[40];
+    uint64_t eino(ino());
+    eino -= MDS_INO_MDSDIR_OFFSET;
+    // eino is unsigned, so it needs the unsigned conversion (was PRId64)
+    snprintf(t, sizeof(t), "~mds%" PRIu64, eino);
+    s = t;
+  } else {
+    char n[40];
+    uint64_t eino(ino());
+    snprintf(n, sizeof(n), "#%" PRIx64, eino);
+    s += n;
+  }
+}
+
+// Build this inode's filepath in fp.  With a (projected or stable) parent
+// dentry the path is built by that dentry; otherwise fp is just the ino.
+void CInode::make_path(filepath& fp, bool projected) const
+{
+  const CDentry *use_parent = parent;
+  if (projected) {
+    use_parent = get_projected_parent_dn();
+  }
+
+  if (!use_parent) {
+    fp = filepath(ino());
+    return;
+  }
+
+  ceph_assert(!is_base());
+  use_parent->make_path(fp, projected);
+}
+
+// Set dname to this inode's number written in lowercase hex.
+void CInode::name_stray_dentry(string& dname)
+{
+  const unsigned long long inum = (unsigned long long)inode.ino.val;
+  char hexname[20];
+  snprintf(hexname, sizeof(hexname), "%llx", inum);
+  dname = hexname;
+}
+
+// Reserve and return the next version for this inode.  A linked inode gets
+// it from its projected parent dentry; a base inode simply uses projected
+// version + 1.
+version_t CInode::pre_dirty()
+{
+  version_t pv;
+  if (CDentry *pdn = get_projected_parent_dn()) {
+    pv = pdn->pre_dirty(get_projected_version());
+    dout(10) << "pre_dirty " << pv << " (current v " << inode.version << ")" << dendl;
+  } else {
+    ceph_assert(is_base());
+    pv = get_projected_version() + 1;
+  }
+
+  // force update backtrace for old format inode (see mempool_inode::decode)
+  const bool no_backtrace_version = (inode.backtrace_version == 0);
+  if (no_backtrace_version && !projected_nodes.empty()) {
+    mempool_inode &latest = projected_nodes.back().inode;
+    if (latest.backtrace_version == 0) {
+      latest.update_backtrace(pv);
+    }
+  }
+  return pv;
+}
+
+// Set the dirty state (taking the dirty pin the first time, which requires a
+// log segment) and, when ls is given, queue the inode on that segment's
+// dirty list.
+void CInode::_mark_dirty(LogSegment *ls)
+{
+  const bool already_dirty = state_test(STATE_DIRTY);
+  if (!already_dirty) {
+    state_set(STATE_DIRTY);
+    get(PIN_DIRTY);
+    ceph_assert(ls);
+  }
+
+  // move myself to this segment's dirty list
+  if (ls) {
+    ls->dirty_inodes.push_back(&item_dirty);
+  }
+}
+
+// Advance the inode to version pv and mark it, plus its parent dentry if it
+// has one, dirty in log segment ls.  pv must be newer than the current
+// version.
+void CInode::mark_dirty(version_t pv, LogSegment *ls)
+{
+  dout(10) << __func__ << " " << *this << dendl;
+
+  // NOTE: I may already be dirty, but this fn _still_ needs to be called so
+  // that the directory is (perhaps newly) dirtied, and so that
+  // parent_dir_version is updated below.
+
+  // only auth can get dirty.  "dirty" async data in replicas is relative to
+  // filelock state, not the dirty flag.
+  ceph_assert(is_auth());
+
+  // touch my private version
+  ceph_assert(inode.version < pv);
+  inode.version = pv;
+  _mark_dirty(ls);
+
+  // mark dentry too
+  if (!parent)
+    return;
+  parent->mark_dirty(pv, ls);
+}
+
+
+// Clear the dirty state, if set: drop the dirty pin and take the inode off
+// its log segment's dirty list.
+void CInode::mark_clean()
+{
+  dout(10) << __func__ << " " << *this << dendl;
+  if (!state_test(STATE_DIRTY))
+    return;
+
+  state_clear(STATE_DIRTY);
+  put(PIN_DIRTY);
+
+  // remove myself from ls dirty list
+  item_dirty.remove_myself();
+}
+
+
+// --------------
+// per-inode storage
+// (currently for root inode only)
+
+// IO completion for CInode::store(): hands the result, the version that was
+// written and the caller's context to CInode::_stored().
+struct C_IO_Inode_Stored : public CInodeIOContext {
+  version_t version;
+  Context *fin;
+
+  C_IO_Inode_Stored(CInode *i, version_t v, Context *f)
+    : CInodeIOContext(i),
+      version(v),
+      fin(f)
+  {}
+
+  void finish(int r) override {
+    in->_stored(r, version, fin);
+  }
+
+  void print(ostream& out) const override {
+    out << "inode_store(" << in->ino() << ")";
+  }
+};
+
+// Build the object name "<ino hex>.<frag as 8 hex digits><suffix>".  The
+// result must fit the 60-byte scratch buffer (asserted).
+object_t InodeStoreBase::get_object_name(inodeno_t ino, frag_t fg, std::string_view suffix)
+{
+  const unsigned long long ino_num = (long long unsigned)ino;
+  const unsigned long long frag_num = (long long unsigned)fg;
+
+  char n[60];
+  snprintf(n, sizeof(n), "%llx.%08llx", ino_num, frag_num);
+
+  // suffix is a string_view and may not be NUL-terminated, so append by length
+  ceph_assert(strlen(n) + suffix.size() < sizeof n);
+  strncat(n, suffix.data(), suffix.size());
+  return object_t(n);
+}
+
+// Write this (base) inode to its ".inode" object in the metadata pool.  The
+// payload is the on-disk magic followed by the encoded inode.  Completion is
+// delivered to _stored() through the MDS finisher, which then completes fin.
+void CInode::store(MDSContext *fin)
+{
+  dout(10) << __func__ << " " << get_version() << dendl;
+  ceph_assert(is_base());
+
+  if (snaprealm) {
+    purge_stale_snap_data(snaprealm->get_snaps());
+  }
+
+  // encode
+  using ceph::encode;
+  bufferlist payload;
+  string magic = CEPH_FS_ONDISK_MAGIC;
+  encode(magic, payload);
+  encode_store(payload, mdcache->mds->mdsmap->get_up_features());
+
+  // write it.
+  ObjectOperation wr;
+  wr.write_full(payload);
+
+  SnapContext snapc;
+  object_t oid = CInode::get_object_name(ino(), frag_t(), ".inode");
+  object_locator_t oloc(mdcache->mds->mdsmap->get_metadata_pool());
+
+  Context *onfinish = new C_OnFinisher(
+    new C_IO_Inode_Stored(this, get_version(), fin),
+    mdcache->mds->finisher);
+  mdcache->mds->objecter->mutate(oid, oloc, wr, snapc,
+                                 ceph::real_clock::now(), 0,
+                                 onfinish);
+}
+
+void CInode::_stored(int r, version_t v, Context *fin)
+{
+ if (r < 0) {
+ dout(1) << "store error " << r << " v " << v << " on " << *this << dendl;
+ mdcache->mds->clog->error() << "failed to store inode " << ino()
+ << " object: " << cpp_strerror(r);
+ mdcache->mds->handle_write_error(r);
+ fin->complete(r);
+ return;
+ }
+
+ dout(10) << __func__ << " " << v << " on " << *this << dendl;
+ if (v == get_projected_version())
+ mark_clean();
+
+ fin->complete(0);
+}
+
+// Write out whatever is dirty on this inode: the backtrace if the parent is
+// dirty, and the inode itself (store() for a base inode, otherwise a commit
+// of the parent dirfrag).  fin completes after all started writes, or
+// immediately with 0 when nothing had to be written.
+void CInode::flush(MDSContext *fin)
+{
+  dout(10) << __func__ << " " << *this << dendl;
+  ceph_assert(is_auth() && can_auth_pin());
+
+  MDSGatherBuilder gather(g_ceph_context);
+
+  if (is_dirty_parent())
+    store_backtrace(gather.new_sub());
+
+  if (is_dirty()) {
+    if (is_base())
+      store(gather.new_sub());
+    else
+      parent->dir->commit(0, gather.new_sub());
+  }
+
+  if (!gather.has_subs()) {
+    // nothing was dirty
+    fin->complete(0);
+    return;
+  }
+
+  gather.set_finisher(fin);
+  gather.activate();
+}
+
+// IO completion for CInode::fetch().  Owns the two read buffers (bl for the
+// xattr read, bl2 for the .inode object read) and passes them to
+// CInode::_fetched().
+struct C_IO_Inode_Fetched : public CInodeIOContext {
+  bufferlist bl, bl2;
+  Context *fin;
+
+  C_IO_Inode_Fetched(CInode *i, Context *f)
+    : CInodeIOContext(i),
+      fin(f)
+  {}
+
+  void finish(int r) override {
+    // Ignore 'r', because we fetch from two places, so r is usually ENOENT
+    in->_fetched(bl, bl2, fin);
+  }
+
+  void print(ostream& out) const override {
+    out << "inode_fetch(" << in->ino() << ")";
+  }
+};
+
+// Read this inode from the metadata pool.  Two reads are issued: the "inode"
+// xattr of the dirfrag object (old format) and the ".inode" object (current
+// format).  Once both finish, _fetched() runs via the MDS finisher.
+void CInode::fetch(MDSContext *fin)
+{
+  dout(10) << __func__ << dendl;
+
+  C_IO_Inode_Fetched *c = new C_IO_Inode_Fetched(this, fin);
+  C_GatherBuilder gather(g_ceph_context,
+                         new C_OnFinisher(c, mdcache->mds->finisher));
+
+  object_locator_t oloc(mdcache->mds->mdsmap->get_metadata_pool());
+
+  // Old on-disk format: inode stored in xattr of a dirfrag
+  object_t oid = CInode::get_object_name(ino(), frag_t(), "");
+  ObjectOperation rd;
+  rd.getxattr("inode", &c->bl, NULL);
+  mdcache->mds->objecter->read(oid, oloc, rd, CEPH_NOSNAP,
+                               (bufferlist*)NULL, 0, gather.new_sub());
+
+  // Current on-disk format: inode stored in a .inode object
+  object_t oid2 = CInode::get_object_name(ino(), frag_t(), ".inode");
+  mdcache->mds->objecter->read(oid2, oloc, 0, 0, CEPH_NOSNAP,
+                               &c->bl2, 0, gather.new_sub());
+
+  gather.activate();
+}
+
+// Completion of fetch().  bl2 (the .inode object) is preferred over bl (the
+// xattr).  fin is completed with -ENOENT when both are empty, -EINVAL on a
+// magic mismatch or decode error, and 0 after a successful decode.
+void CInode::_fetched(bufferlist& bl, bufferlist& bl2, Context *fin)
+{
+  dout(10) << __func__ << " got " << bl.length() << " and " << bl2.length() << dendl;
+
+  bufferlist::const_iterator p;
+  if (bl2.length()) {
+    p = bl2.cbegin();
+  } else if (bl.length()) {
+    p = bl.cbegin();
+  } else {
+    derr << "No data while reading inode " << ino() << dendl;
+    fin->complete(-ENOENT);
+    return;
+  }
+
+  using ceph::decode;
+  // Attempt decode
+  try {
+    string magic;
+    decode(magic, p);
+    dout(10) << " magic is '" << magic << "' (expecting '"
+             << CEPH_FS_ONDISK_MAGIC << "')" << dendl;
+    if (magic == CEPH_FS_ONDISK_MAGIC) {
+      decode_store(p);
+      dout(10) << "_fetched " << *this << dendl;
+      fin->complete(0);
+    } else {
+      dout(0) << "on disk magic '" << magic << "' != my magic '" << CEPH_FS_ONDISK_MAGIC
+              << "'" << dendl;
+      fin->complete(-EINVAL);
+    }
+  } catch (buffer::error &err) {
+    derr << "Corrupt inode " << ino() << ": " << err << dendl;
+    fin->complete(-EINVAL);
+    return;
+  }
+}
+
+// Fill bt with this inode's backtrace for the given pool: one backpointer
+// (directory ino, dentry name, child inode version) per parent dentry up the
+// tree, plus every old pool other than pool itself.
+void CInode::build_backtrace(int64_t pool, inode_backtrace_t& bt)
+{
+  bt.ino = inode.ino;
+  bt.ancestors.clear();
+  bt.pool = pool;
+
+  CInode *in = this;
+  for (CDentry *pdn = get_parent_dn(); pdn; pdn = in->get_parent_dn()) {
+    CInode *diri = pdn->get_dir()->get_inode();
+    bt.ancestors.push_back(
+      inode_backpointer_t(diri->ino(), pdn->get_name(), in->inode.version));
+    in = diri;
+  }
+
+  for (auto &old_pool : inode.old_pools) {
+    // don't add our own pool id to old_pools to avoid looping (e.g. setlayout 0, 1, 0)
+    if (old_pool == pool)
+      continue;
+    bt.old_pools.insert(old_pool);
+  }
+}
+
+// IO completion for CInode::store_backtrace(): forwards the result, the
+// backtrace version that was written and the caller's context to
+// CInode::_stored_backtrace().
+struct C_IO_Inode_StoredBacktrace : public CInodeIOContext {
+  version_t version;
+  Context *fin;
+
+  C_IO_Inode_StoredBacktrace(CInode *i, version_t v, Context *f)
+    : CInodeIOContext(i),
+      version(v),
+      fin(f)
+  {}
+
+  void finish(int r) override {
+    in->_stored_backtrace(r, version, fin);
+  }
+
+  void print(ostream& out) const override {
+    out << "backtrace_store(" << in->ino() << ")";
+  }
+};
+
+// Write this inode's backtrace ("parent" xattr) and layout ("layout" xattr)
+// to its first object in the backtrace pool.  If the pool is dirty and old
+// pools exist, the "parent" xattr is also written to the same object in each
+// old pool.  The inode is auth-pinned here; _stored_backtrace() runs through
+// the MDS finisher once all writes are done.  A negative op_prio selects
+// CEPH_MSG_PRIO_DEFAULT.
+void CInode::store_backtrace(MDSContext *fin, int op_prio)
+{
+  dout(10) << __func__ << " on " << *this << dendl;
+  ceph_assert(is_dirty_parent());
+
+  if (op_prio < 0) {
+    op_prio = CEPH_MSG_PRIO_DEFAULT;
+  }
+
+  auth_pin(this);
+
+  using ceph::encode;
+  const int64_t pool = get_backtrace_pool();
+  inode_backtrace_t bt;
+  build_backtrace(pool, bt);
+  bufferlist parent_bl;
+  encode(bt, parent_bl);
+
+  ObjectOperation op;
+  op.priority = op_prio;
+  op.create(false);
+  op.setxattr("parent", parent_bl);
+
+  bufferlist layout_bl;
+  encode(inode.layout, layout_bl, mdcache->mds->mdsmap->get_up_features());
+  op.setxattr("layout", layout_bl);
+
+  SnapContext snapc;
+  object_t oid = get_object_name(ino(), frag_t(), "");
+  object_locator_t oloc(pool);
+  Context *fin2 = new C_OnFinisher(
+    new C_IO_Inode_StoredBacktrace(this, inode.backtrace_version, fin),
+    mdcache->mds->finisher);
+
+  const bool update_old_pools =
+    state_test(STATE_DIRTYPOOL) && !inode.old_pools.empty();
+  if (!update_old_pools) {
+    dout(20) << __func__ << ": no dirtypool or no old pools" << dendl;
+    mdcache->mds->objecter->mutate(oid, oloc, op, snapc,
+                                   ceph::real_clock::now(),
+                                   0, fin2);
+    return;
+  }
+
+  C_GatherBuilder gather(g_ceph_context, fin2);
+  mdcache->mds->objecter->mutate(oid, oloc, op, snapc,
+                                 ceph::real_clock::now(),
+                                 0, gather.new_sub());
+
+  // In the case where DIRTYPOOL is set, we update all old pools backtraces
+  // such that anyone reading them will see the new pool ID in
+  // inode_backtrace_t::pool and go read everything else from there.
+  for (const auto &old_pool : inode.old_pools) {
+    if (old_pool == pool)
+      continue;
+
+    dout(20) << __func__ << ": updating old pool " << old_pool << dendl;
+
+    ObjectOperation old_op;
+    old_op.priority = op_prio;
+    old_op.create(false);
+    old_op.setxattr("parent", parent_bl);
+
+    object_locator_t old_oloc(old_pool);
+    mdcache->mds->objecter->mutate(oid, old_oloc, old_op, snapc,
+                                   ceph::real_clock::now(),
+                                   0, gather.new_sub());
+  }
+  gather.activate();
+}
+
+// Completion of store_backtrace().  An ENOENT caused by a deleted backtrace
+// pool is treated as success.  Any other error is logged, reported as a
+// write error and passed to fin; note that this path returns without the
+// auth_unpin() done on success.  On success the inode is unpinned, and the
+// dirty-parent state is cleared if v is still the current backtrace version.
+void CInode::_stored_backtrace(int r, version_t v, Context *fin)
+{
+  if (r == -ENOENT) {
+    const int64_t pool = get_backtrace_pool();
+    const bool exists = mdcache->mds->objecter->with_osdmap(
+      [pool](const OSDMap &osd_map) {
+        return osd_map.have_pg_pool(pool);
+      });
+
+    // This ENOENT is because the pool doesn't exist (the user deleted it
+    // out from under us), so the backtrace can never be written, so pretend
+    // to succeed so that the user can proceed to e.g. delete the file.
+    if (!exists) {
+      dout(4) << __func__ << " got ENOENT: a data pool was deleted "
+                 "beneath us!" << dendl;
+      r = 0;
+    }
+  }
+
+  const bool failed = r < 0;
+  if (failed) {
+    dout(1) << "store backtrace error " << r << " v " << v << dendl;
+    mdcache->mds->clog->error() << "failed to store backtrace on ino "
+                                << ino() << " object"
+                                << ", pool " << get_backtrace_pool()
+                                << ", errno " << r;
+    mdcache->mds->handle_write_error(r);
+    if (fin) {
+      fin->complete(r);
+    }
+    return;
+  }
+
+  dout(10) << __func__ << " v " << v << dendl;
+
+  auth_unpin(this);
+  if (v == inode.backtrace_version) {
+    clear_dirty_parent();
+  }
+  if (fin) {
+    fin->complete(0);
+  }
+}
+
+// Ask the cache to read this inode's backtrace from its backtrace pool into
+// *backtrace; fin is handed on to MDCache::fetch_backtrace().
+void CInode::fetch_backtrace(Context *fin, bufferlist *backtrace)
+{
+  const int64_t pool = get_backtrace_pool();
+  mdcache->fetch_backtrace(inode.ino, pool, *backtrace, fin);
+}
+
+// Set the dirty-parent state (taking its pin the first time, which requires
+// a log segment), optionally flag the pool as dirty too, and queue the inode
+// on ls's dirty_parent_inodes list when ls is given.
+void CInode::mark_dirty_parent(LogSegment *ls, bool dirty_pool)
+{
+  const bool already_dirty = state_test(STATE_DIRTYPARENT);
+  if (!already_dirty) {
+    dout(10) << __func__ << dendl;
+    state_set(STATE_DIRTYPARENT);
+    get(PIN_DIRTYPARENT);
+    ceph_assert(ls);
+  }
+
+  if (dirty_pool) {
+    state_set(STATE_DIRTYPOOL);
+  }
+
+  if (ls) {
+    ls->dirty_parent_inodes.push_back(&item_dirty_parent);
+  }
+}
+
+// Undo mark_dirty_parent(): clear the dirty-parent and dirty-pool states,
+// drop the pin and leave the log segment list.  No-op when not dirty.
+void CInode::clear_dirty_parent()
+{
+  if (!state_test(STATE_DIRTYPARENT))
+    return;
+
+  dout(10) << __func__ << dendl;
+  state_clear(STATE_DIRTYPARENT);
+  state_clear(STATE_DIRTYPOOL);
+  put(PIN_DIRTYPARENT);
+  item_dirty_parent.remove_myself();
+}
+
+// Check a backtrace read from disk (bl, with read result err) against the
+// current parent dentry.  Skipped for base, dirty-parent and non-auth
+// inodes.  A read error, or a first ancestor whose name or directory ino
+// does not match, is logged as a bad backtrace; the inode is then marked
+// dirty-parent and the log is flushed.  The assert fires when
+// mds_verify_backtrace > 1.
+void CInode::verify_diri_backtrace(bufferlist &bl, int err)
+{
+  if (is_base() || is_dirty_parent() || !is_auth())
+    return;
+
+  dout(10) << __func__ << dendl;
+
+  if (err == 0) {
+    using ceph::decode;
+    inode_backtrace_t backtrace;
+    decode(backtrace, bl);
+    CDentry *pdn = get_parent_dn();
+    const bool mismatch =
+      backtrace.ancestors.empty() ||
+      backtrace.ancestors[0].dname != pdn->get_name() ||
+      backtrace.ancestors[0].dirino != pdn->get_dir()->ino();
+    if (mismatch) {
+      err = -EINVAL;
+    }
+  }
+
+  if (!err)
+    return;
+
+  MDSRank *mds = mdcache->mds;
+  mds->clog->error() << "bad backtrace on directory inode " << ino();
+  ceph_assert(!"bad backtrace" == (g_conf()->mds_verify_backtrace > 1));
+
+  mark_dirty_parent(mds->mdlog->get_current_segment(), false);
+  mds->mdlog->flush();
+}
+
+// ------------------
+// parent dir
+
+
+// Encode the inode store fields into bl without a version envelope.  Field
+// order: inode, symlink target (symlinks only), dirfragtree, xattrs, snap
+// blob (an empty bufferlist when snap_blob is NULL), old_inodes,
+// oldest_snap, damage_flags.
+void InodeStoreBase::encode_bare(bufferlist &bl, uint64_t features,
+                                 const bufferlist *snap_blob) const
+{
+  using ceph::encode;
+
+  encode(inode, bl, features);
+  if (is_symlink()) {
+    encode(symlink, bl);
+  }
+  encode(dirfragtree, bl);
+  encode(xattrs, bl);
+
+  if (snap_blob) {
+    encode(*snap_blob, bl);
+  } else {
+    encode(bufferlist(), bl);
+  }
+
+  encode(old_inodes, bl, features);
+  encode(oldest_snap, bl);
+  encode(damage_flags, bl);
+}
+
+// Versioned encoding: encode_bare() wrapped in an ENCODE_START(6, 4) /
+// ENCODE_FINISH envelope.
+void InodeStoreBase::encode(bufferlist &bl, uint64_t features,
+                            const bufferlist *snap_blob) const
+{
+  ENCODE_START(6, 4, bl);
+  {
+    encode_bare(bl, features, snap_blob);
+  }
+  ENCODE_FINISH(bl);
+}
+
+// Encode this inode, including its snap blob, in the versioned on-disk
+// format.
+// NOTE(review): the features argument is not used here; the MDS map's "up"
+// features are always passed to the encoder instead.
+void CInode::encode_store(bufferlist& bl, uint64_t features)
+{
+  bufferlist snap_blob;
+  encode_snap_blob(snap_blob);
+
+  const bufferlist *blob = &snap_blob;
+  InodeStoreBase::encode(bl, mdcache->mds->mdsmap->get_up_features(), blob);
+}
+
+// Decode the fields written by encode_bare().  struct_v is the version of
+// the enclosing envelope: version 2 directories may carry an extra default
+// layout, and from version 5 on oldest_snap and damage_flags follow if any
+// data is left in the buffer.
+void InodeStoreBase::decode_bare(bufferlist::const_iterator &bl,
+                                 bufferlist& snap_blob, __u8 struct_v)
+{
+  using ceph::decode;
+
+  decode(inode, bl);
+  if (is_symlink()) {
+    std::string target;
+    decode(target, bl);
+    symlink = std::string_view(target);
+  }
+  decode(dirfragtree, bl);
+  decode_noshare(xattrs, bl);
+  decode(snap_blob, bl);
+  decode(old_inodes, bl);
+
+  const bool legacy_dir_layout = (struct_v == 2) && inode.is_dir();
+  if (legacy_dir_layout) {
+    bool default_layout_exists;
+    decode(default_layout_exists, bl);
+    if (default_layout_exists) {
+      decode(struct_v, bl); // this was a default_file_layout
+      decode(inode.layout, bl); // but we only care about the layout portion
+    }
+  }
+
+  if (struct_v < 5)
+    return;
+
+  // InodeStore is embedded in dentries without proper versioning, so
+  // we consume up to the end of the buffer
+  if (!bl.end()) {
+    decode(oldest_snap, bl);
+  }
+  if (!bl.end()) {
+    decode(damage_flags, bl);
+  }
+}
+
+
+// Versioned decoding: opens the legacy-compatible envelope and passes the
+// struct_v it provides on to decode_bare().
+void InodeStoreBase::decode(bufferlist::const_iterator &bl, bufferlist& snap_blob)
+{
+  DECODE_START_LEGACY_COMPAT_LEN(5, 4, 4, bl);
+  {
+    decode_bare(bl, snap_blob, struct_v);
+  }
+  DECODE_FINISH(bl);
+}
+
+// Decode the versioned on-disk inode from bl, then decode the snap blob it
+// carried.
+void CInode::decode_store(bufferlist::const_iterator& bl)
+{
+  bufferlist blob;
+  InodeStoreBase::decode(bl, blob);
+  decode_snap_blob(blob);
+}
+
+// ------------------
+// locking
+
+// Identify this inode in info by its ino and its `last` snapid.
+void CInode::set_object_info(MDSCacheObjectInfo &info)
+{
+  info.snapid = last;
+  info.ino = ino();
+}
+
+// Encode the inode state covered by lock `type` into bl.
+//
+// Every encoding starts with `first`, followed by the parent dentry's
+// `first` for non-base inodes.  What follows depends on the lock type and,
+// for the scatter locks (IDFT, IFILE, INEST), on whether this is the auth
+// copy: the auth encodes the inode fields, a replica encodes only a flag
+// saying its lock is dirty or flushing.  decode_lock_state() must read the
+// fields in exactly this order.  Unknown lock types abort.
+void CInode::encode_lock_state(int type, bufferlist& bl)
+{
+  using ceph::encode;
+  encode(first, bl);
+  if (!is_base())
+    encode(parent->first, bl);
+
+  switch (type) {
+  case CEPH_LOCK_IAUTH:
+    encode(inode.version, bl);
+    encode(inode.ctime, bl);
+    encode(inode.mode, bl);
+    encode(inode.uid, bl);
+    encode(inode.gid, bl);
+    break;
+
+  case CEPH_LOCK_ILINK:
+    encode(inode.version, bl);
+    encode(inode.ctime, bl);
+    encode(inode.nlink, bl);
+    break;
+
+  case CEPH_LOCK_IDFT:
+    if (is_auth()) {
+      encode(inode.version, bl);
+    } else {
+      // treat flushing as dirty when rejoining cache
+      bool dirty = dirfragtreelock.is_dirty_or_flushing();
+      encode(dirty, bl);
+    }
+    {
+      // encode the raw tree
+      encode(dirfragtree, bl);
+
+      // also specify which frags are mine
+      set<frag_t> myfrags;
+      list<CDir*> dfls;
+      get_dirfrags(dfls);
+      for (CDir *dir : dfls) {
+        if (dir->is_auth()) {
+          frag_t fg = dir->get_frag();
+          myfrags.insert(fg);
+        }
+      }
+      encode(myfrags, bl);
+    }
+    break;
+
+  case CEPH_LOCK_IFILE:
+    if (is_auth()) {
+      encode(inode.version, bl);
+      encode(inode.ctime, bl);
+      encode(inode.mtime, bl);
+      encode(inode.atime, bl);
+      encode(inode.time_warp_seq, bl);
+      if (!is_dir()) {
+        encode(inode.layout, bl, mdcache->mds->mdsmap->get_up_features());
+        encode(inode.size, bl);
+        encode(inode.truncate_seq, bl);
+        encode(inode.truncate_size, bl);
+        encode(inode.client_ranges, bl);
+        encode(inode.inline_data, bl);
+      }
+    } else {
+      // treat flushing as dirty when rejoining cache
+      bool dirty = filelock.is_dirty_or_flushing();
+      encode(dirty, bl);
+    }
+
+    {
+      dout(15) << __func__ << " inode.dirstat is " << inode.dirstat << dendl;
+      encode(inode.dirstat, bl);  // only meaningful if i am auth.
+      // per-frag records go to tmp first so the count can precede them
+      bufferlist tmp;
+      __u32 n = 0;
+      for (const auto &p : dirfrags) {
+        frag_t fg = p.first;
+        CDir *dir = p.second;
+        if (is_auth() || dir->is_auth()) {
+          fnode_t *pf = dir->get_projected_fnode();
+          dout(15) << fg << " " << *dir << dendl;
+          dout(20) << fg << "           fragstat " << pf->fragstat << dendl;
+          dout(20) << fg << " accounted_fragstat " << pf->accounted_fragstat << dendl;
+          encode(fg, tmp);
+          encode(dir->first, tmp);
+          encode(pf->fragstat, tmp);
+          encode(pf->accounted_fragstat, tmp);
+          n++;
+        }
+      }
+      encode(n, bl);
+      bl.claim_append(tmp);
+    }
+    break;
+
+  case CEPH_LOCK_INEST:
+    if (is_auth()) {
+      encode(inode.version, bl);
+    } else {
+      // treat flushing as dirty when rejoining cache
+      bool dirty = nestlock.is_dirty_or_flushing();
+      encode(dirty, bl);
+    }
+    {
+      dout(15) << __func__ << " inode.rstat is " << inode.rstat << dendl;
+      encode(inode.rstat, bl);  // only meaningful if i am auth.
+      // per-frag records go to tmp first so the count can precede them
+      bufferlist tmp;
+      __u32 n = 0;
+      for (const auto &p : dirfrags) {
+        frag_t fg = p.first;
+        CDir *dir = p.second;
+        if (is_auth() || dir->is_auth()) {
+          fnode_t *pf = dir->get_projected_fnode();
+          // log the three values encoded below: rstat, accounted_rstat and
+          // dirty_old_rstat (the second line used to repeat rstat)
+          dout(10) << fg << " " << *dir << dendl;
+          dout(10) << fg << " " << pf->rstat << dendl;
+          dout(10) << fg << " " << pf->accounted_rstat << dendl;
+          dout(10) << fg << " " << dir->dirty_old_rstat << dendl;
+          encode(fg, tmp);
+          encode(dir->first, tmp);
+          encode(pf->rstat, tmp);
+          encode(pf->accounted_rstat, tmp);
+          encode(dir->dirty_old_rstat, tmp);
+          n++;
+        }
+      }
+      encode(n, bl);
+      bl.claim_append(tmp);
+    }
+    break;
+
+  case CEPH_LOCK_IXATTR:
+    encode(inode.version, bl);
+    encode(inode.ctime, bl);
+    encode(xattrs, bl);
+    break;
+
+  case CEPH_LOCK_ISNAP:
+    encode(inode.version, bl);
+    encode(inode.ctime, bl);
+    encode_snap(bl);
+    break;
+
+  case CEPH_LOCK_IFLOCK:
+    encode(inode.version, bl);
+    _encode_file_locks(bl);
+    break;
+
+  case CEPH_LOCK_IPOLICY:
+    // policy state is only encoded for directories
+    if (inode.is_dir()) {
+      encode(inode.version, bl);
+      encode(inode.ctime, bl);
+      encode(inode.layout, bl, mdcache->mds->mdsmap->get_up_features());
+      encode(inode.quota, bl);
+      encode(inode.export_pin, bl);
+    }
+    break;
+
+  default:
+    ceph_abort();
+  }
+}
+
+
+/* for more info on scatterlocks, see comments by Locker::scatter_writebehind */
+
+// Apply lock state produced by encode_lock_state() for lock `type`.
+//
+// The leading `first` values are taken only where this side is not auth for
+// the inode / parent dentry.  ctime only ever moves forward.  For the
+// scatter locks (IDFT, IFILE, INEST) the auth side reads the replica's dirty
+// flag plus per-frag data, while a replica takes the auth's inode fields.
+// Unknown lock types abort.
+void CInode::decode_lock_state(int type, const bufferlist& bl)
+{
+  auto p = bl.cbegin();
+  utime_t tm;
+
+  snapid_t newfirst;
+  using ceph::decode;
+  decode(newfirst, p);
+  if (!is_auth() && newfirst != first) {
+    dout(10) << __func__ << " first " << first << " -> " << newfirst << dendl;
+    first = newfirst;
+  }
+  if (!is_base()) {
+    decode(newfirst, p);
+    if (!parent->is_auth() && newfirst != parent->first) {
+      // log the value actually being replaced (the parent's, not the inode's)
+      dout(10) << __func__ << " parent first " << parent->first << " -> " << newfirst << dendl;
+      parent->first = newfirst;
+    }
+  }
+
+  switch (type) {
+  case CEPH_LOCK_IAUTH:
+    decode(inode.version, p);
+    decode(tm, p);
+    if (inode.ctime < tm) inode.ctime = tm;
+    decode(inode.mode, p);
+    decode(inode.uid, p);
+    decode(inode.gid, p);
+    break;
+
+  case CEPH_LOCK_ILINK:
+    decode(inode.version, p);
+    decode(tm, p);
+    if (inode.ctime < tm) inode.ctime = tm;
+    decode(inode.nlink, p);
+    break;
+
+  case CEPH_LOCK_IDFT:
+    if (is_auth()) {
+      bool replica_dirty;
+      decode(replica_dirty, p);
+      if (replica_dirty) {
+        dout(10) << __func__ << " setting dftlock dirty flag" << dendl;
+        dirfragtreelock.mark_dirty();  // ok bc we're auth and caller will handle
+      }
+    } else {
+      decode(inode.version, p);
+    }
+    {
+      fragtree_t temp;
+      decode(temp, p);
+      set<frag_t> authfrags;
+      decode(authfrags, p);
+      if (is_auth()) {
+        // auth.  believe replica's auth frags only.
+        for (const frag_t &fg : authfrags) {
+          if (!dirfragtree.is_leaf(fg)) {
+            dout(10) << " forcing frag " << fg << " to leaf (split|merge)" << dendl;
+            dirfragtree.force_to_leaf(g_ceph_context, fg);
+            dirfragtreelock.mark_dirty();  // ok bc we're auth and caller will handle
+          }
+        }
+      } else {
+        // replica.  take the new tree, BUT make sure any open
+        //  dirfrags remain leaves (they may have split _after_ this
+        //  dft was scattered, or we may still be waiting on the
+        //  notify from the auth)
+        dirfragtree.swap(temp);
+        for (const auto &df : dirfrags) {
+          if (!dirfragtree.is_leaf(df.first)) {
+            dout(10) << " forcing open dirfrag " << df.first << " to leaf (racing with split|merge)" << dendl;
+            dirfragtree.force_to_leaf(g_ceph_context, df.first);
+          }
+          if (df.second->is_auth())
+            df.second->state_clear(CDir::STATE_DIRTYDFT);
+        }
+      }
+      if (g_conf()->mds_debug_frag)
+        verify_dirfrags();
+    }
+    break;
+
+  case CEPH_LOCK_IFILE:
+    if (!is_auth()) {
+      decode(inode.version, p);
+      decode(tm, p);
+      if (inode.ctime < tm) inode.ctime = tm;
+      decode(inode.mtime, p);
+      decode(inode.atime, p);
+      decode(inode.time_warp_seq, p);
+      if (!is_dir()) {
+        decode(inode.layout, p);
+        decode(inode.size, p);
+        decode(inode.truncate_seq, p);
+        decode(inode.truncate_size, p);
+        decode(inode.client_ranges, p);
+        decode(inode.inline_data, p);
+      }
+    } else {
+      bool replica_dirty;
+      decode(replica_dirty, p);
+      if (replica_dirty) {
+        dout(10) << __func__ << " setting filelock dirty flag" << dendl;
+        filelock.mark_dirty();  // ok bc we're auth and caller will handle
+      }
+    }
+    {
+      frag_info_t dirstat;
+      decode(dirstat, p);
+      if (!is_auth()) {
+        dout(10) << " taking inode dirstat " << dirstat << " for " << *this << dendl;
+        inode.dirstat = dirstat;    // take inode summation if replica
+      }
+      __u32 n;
+      decode(n, p);
+      dout(10) << " ...got " << n << " fragstats on " << *this << dendl;
+      while (n--) {
+        frag_t fg;
+        snapid_t fgfirst;
+        frag_info_t fragstat;
+        frag_info_t accounted_fragstat;
+        decode(fg, p);
+        decode(fgfirst, p);
+        decode(fragstat, p);
+        decode(accounted_fragstat, p);
+        dout(10) << fg << " [" << fgfirst << ",head] " << dendl;
+        dout(10) << fg << "           fragstat " << fragstat << dendl;
+        dout(20) << fg << " accounted_fragstat " << accounted_fragstat << dendl;
+
+        CDir *dir = get_dirfrag(fg);
+        if (is_auth()) {
+          ceph_assert(dir);                // i am auth; i had better have this dir open
+          dout(10) << fg << " first " << dir->first << " -> " << fgfirst
+                   << " on " << *dir << dendl;
+          dir->first = fgfirst;
+          dir->fnode.fragstat = fragstat;
+          dir->fnode.accounted_fragstat = accounted_fragstat;
+          if (!(fragstat == accounted_fragstat)) {
+            dout(10) << fg << " setting filelock updated flag" << dendl;
+            filelock.mark_dirty();  // ok bc we're auth and caller will handle
+          }
+        } else {
+          if (dir && dir->is_auth()) {
+            dout(10) << fg << " first " << dir->first << " -> " << fgfirst
+                     << " on " << *dir << dendl;
+            dir->first = fgfirst;
+            fnode_t *pf = dir->get_projected_fnode();
+            finish_scatter_update(&filelock, dir,
+                                  inode.dirstat.version, pf->accounted_fragstat.version);
+          }
+        }
+      }
+    }
+    break;
+
+  case CEPH_LOCK_INEST:
+    if (is_auth()) {
+      bool replica_dirty;
+      decode(replica_dirty, p);
+      if (replica_dirty) {
+        dout(10) << __func__ << " setting nestlock dirty flag" << dendl;
+        nestlock.mark_dirty();  // ok bc we're auth and caller will handle
+      }
+    } else {
+      decode(inode.version, p);
+    }
+    {
+      nest_info_t rstat;
+      decode(rstat, p);
+      if (!is_auth()) {
+        dout(10) << " taking inode rstat " << rstat << " for " << *this << dendl;
+        inode.rstat = rstat;    // take inode summation if replica
+      }
+      __u32 n;
+      decode(n, p);
+      while (n--) {
+        frag_t fg;
+        snapid_t fgfirst;
+        nest_info_t frag_rstat;
+        nest_info_t accounted_rstat;
+        decltype(CDir::dirty_old_rstat) dirty_old_rstat;
+        decode(fg, p);
+        decode(fgfirst, p);
+        decode(frag_rstat, p);
+        decode(accounted_rstat, p);
+        decode(dirty_old_rstat, p);
+        dout(10) << fg << " [" << fgfirst << ",head]" << dendl;
+        dout(10) << fg << "               rstat " << frag_rstat << dendl;
+        dout(10) << fg << "     accounted_rstat " << accounted_rstat << dendl;
+        dout(10) << fg << "     dirty_old_rstat " << dirty_old_rstat << dendl;
+
+        CDir *dir = get_dirfrag(fg);
+        if (is_auth()) {
+          ceph_assert(dir);                // i am auth; i had better have this dir open
+          dout(10) << fg << " first " << dir->first << " -> " << fgfirst
+                   << " on " << *dir << dendl;
+          dir->first = fgfirst;
+          dir->fnode.rstat = frag_rstat;
+          dir->fnode.accounted_rstat = accounted_rstat;
+          dir->dirty_old_rstat.swap(dirty_old_rstat);
+          if (!(frag_rstat == accounted_rstat) || !dir->dirty_old_rstat.empty()) {
+            dout(10) << fg << " setting nestlock updated flag" << dendl;
+            nestlock.mark_dirty();  // ok bc we're auth and caller will handle
+          }
+        } else {
+          if (dir && dir->is_auth()) {
+            dout(10) << fg << " first " << dir->first << " -> " << fgfirst
+                     << " on " << *dir << dendl;
+            dir->first = fgfirst;
+            fnode_t *pf = dir->get_projected_fnode();
+            finish_scatter_update(&nestlock, dir,
+                                  inode.rstat.version, pf->accounted_rstat.version);
+          }
+        }
+      }
+    }
+    break;
+
+  case CEPH_LOCK_IXATTR:
+    decode(inode.version, p);
+    decode(tm, p);
+    if (inode.ctime < tm) inode.ctime = tm;
+    decode(xattrs, p);
+    break;
+
+  case CEPH_LOCK_ISNAP:
+    {
+      decode(inode.version, p);
+      decode(tm, p);
+      if (inode.ctime < tm) inode.ctime = tm;
+      decode_snap(p);
+    }
+    break;
+
+  case CEPH_LOCK_IFLOCK:
+    decode(inode.version, p);
+    _decode_file_locks(p);
+    break;
+
+  case CEPH_LOCK_IPOLICY:
+    // policy state is only present for directories
+    if (inode.is_dir()) {
+      decode(inode.version, p);
+      decode(tm, p);
+      if (inode.ctime < tm) inode.ctime = tm;
+      decode(inode.layout, p);
+      decode(inode.quota, p);
+      mds_rank_t old_pin = inode.export_pin;
+      decode(inode.export_pin, p);
+      maybe_export_pin(old_pin != inode.export_pin);
+    }
+    break;
+
+  default:
+    ceph_abort();
+  }
+}
+
+
+// True if any of the file, nest or dirfragtree locks is dirty or flushing.
+bool CInode::is_dirty_scattered()
+{
+  if (filelock.is_dirty_or_flushing())
+    return true;
+  if (nestlock.is_dirty_or_flushing())
+    return true;
+  return dirfragtreelock.is_dirty_or_flushing();
+}
+
+// Remove the dirty state from the file, nest and dirfragtree locks, in that
+// order.
+void CInode::clear_scatter_dirty()
+{
+  filelock.remove_dirty();
+  nestlock.remove_dirty();
+  dirfragtreelock.remove_dirty();
+}
+
+// Take this directory inode off the dirty-dirfrag list that belongs to the
+// given scatter lock type (IFILE, INEST or IDFT).  Any other type aborts.
+void CInode::clear_dirty_scattered(int type)
+{
+  dout(10) << __func__ << " " << type << " on " << *this << dendl;
+  ceph_assert(is_dir());
+
+  if (type == CEPH_LOCK_IFILE) {
+    item_dirty_dirfrag_dir.remove_myself();
+  } else if (type == CEPH_LOCK_INEST) {
+    item_dirty_dirfrag_nest.remove_myself();
+  } else if (type == CEPH_LOCK_IDFT) {
+    item_dirty_dirfrag_dirfragtree.remove_myself();
+  } else {
+    ceph_abort();
+  }
+}
+
+
+/*
+ * when we initially scatter a lock, we need to check if any of the dirfrags
+ * have out of date accounted_rstat/fragstat.  if so, mark the lock stale.
+ *
+ * only auth dirfrags are considered.  for the file and nest locks the work
+ * is done by finish_scatter_update(); for the dirfragtree lock the frag's
+ * DIRTYDFT state is cleared.
+ */
+/* for more info on scatterlocks, see comments by Locker::scatter_writebehind */
+void CInode::start_scatter(ScatterLock *lock)
+{
+  dout(10) << __func__ << " " << *lock << " on " << *this << dendl;
+  ceph_assert(is_auth());
+  mempool_inode *pi = get_projected_inode();
+
+  for (const auto &entry : dirfrags) {
+    frag_t fg = entry.first;
+    CDir *dir = entry.second;
+    fnode_t *pf = dir->get_projected_fnode();
+    dout(20) << fg << " " << *dir << dendl;
+
+    if (dir->is_auth()) {
+      switch (lock->get_type()) {
+      case CEPH_LOCK_IFILE:
+        finish_scatter_update(lock, dir, pi->dirstat.version,
+                              pf->accounted_fragstat.version);
+        break;
+
+      case CEPH_LOCK_INEST:
+        finish_scatter_update(lock, dir, pi->rstat.version,
+                              pf->accounted_rstat.version);
+        break;
+
+      case CEPH_LOCK_IDFT:
+        dir->state_clear(CDir::STATE_DIRTYDFT);
+        break;
+      }
+    }
+  }
+}
+
+
+// Log completion for the fnode update journaled by
+// CInode::finish_scatter_update(); calls CInode::_finish_frag_update() with
+// the dirfrag and mutation it was created with.
+class C_Inode_FragUpdate : public MDSLogContextBase {
+public:
+  C_Inode_FragUpdate(CInode *i, CDir *d, MutationRef& m)
+    : in(i),
+      dir(d),
+      mut(m)
+  {}
+
+protected:
+  CInode *in;
+  CDir *dir;
+  MutationRef mut;
+
+  MDSRank *get_mds() override {
+    return in->mdcache->mds;
+  }
+
+  void finish(int r) override {
+    in->_finish_frag_update(dir, mut);
+  }
+};
+
+// Bring an auth dirfrag's accounted scatter stat version in line with the
+// inode's.
+//
+// lock must be the file or nest lock (anything else aborts once an update is
+// needed).  Nothing is changed when dir is frozen, has version 0 (not
+// loaded), or already has dir_accounted_version == inode_version; those
+// cases are only logged.  Otherwise the fnode is projected, its
+// fragstat/rstat version is set to the projected inode's and copied to the
+// accounted value, and the change is journaled as an EUpdate whose
+// completion runs _finish_frag_update().  For the nest lock on a non-auth
+// inode in LOCK_MIX state, dirty rstat inodes are assimilated as well.
+void CInode::finish_scatter_update(ScatterLock *lock, CDir *dir,
+                                   version_t inode_version, version_t dir_accounted_version)
+{
+  frag_t fg = dir->get_frag();
+  ceph_assert(dir->is_auth());
+
+  if (dir->is_frozen()) {
+    dout(10) << __func__ << " " << fg << " frozen, marking " << *lock << " stale " << *dir << dendl;
+    return;
+  }
+  if (dir->get_version() == 0) {
+    dout(10) << __func__ << " " << fg << " not loaded, marking " << *lock << " stale " << *dir << dendl;
+    return;
+  }
+  if (dir_accounted_version == inode_version) {
+    dout(10) << __func__ << " " << fg << " accounted " << *lock
+             << " scatter stat unchanged at v" << dir_accounted_version << dendl;
+    return;
+  }
+
+  dout(10) << __func__ << " " << fg << " journaling accounted scatterstat update v" << inode_version << dendl;
+
+  MDLog *mdlog = mdcache->mds->mdlog;
+  MutationRef mut(new MutationImpl());
+  mut->ls = mdlog->get_current_segment();
+
+  mempool_inode *pi = get_projected_inode();
+  fnode_t *pf = dir->project_fnode();
+
+  // default-constructed (empty); building a string_view from 0 would pass a
+  // null pointer to the const char* constructor, which is undefined behavior
+  std::string_view ename;
+  switch (lock->get_type()) {
+  case CEPH_LOCK_IFILE:
+    pf->fragstat.version = pi->dirstat.version;
+    pf->accounted_fragstat = pf->fragstat;
+    ename = "lock ifile accounted scatter stat update";
+    break;
+  case CEPH_LOCK_INEST:
+    pf->rstat.version = pi->rstat.version;
+    pf->accounted_rstat = pf->rstat;
+    ename = "lock inest accounted scatter stat update";
+
+    if (!is_auth() && lock->get_state() == LOCK_MIX) {
+      dout(10) << __func__ << " try to assimilate dirty rstat on "
+               << *dir << dendl;
+      dir->assimilate_dirty_rstat_inodes();
+    }
+
+    break;
+  default:
+    ceph_abort();
+  }
+
+  pf->version = dir->pre_dirty();
+  mut->add_projected_fnode(dir);
+
+  EUpdate *le = new EUpdate(mdlog, ename);
+  mdlog->start_entry(le);
+  le->metablob.add_dir_context(dir);
+  le->metablob.add_dir(dir, true);
+
+  ceph_assert(!dir->is_frozen());
+  mut->auth_pin(dir);
+
+  if (lock->get_type() == CEPH_LOCK_INEST &&
+      !is_auth() && lock->get_state() == LOCK_MIX) {
+    dout(10) << __func__ << " finish assimilating dirty rstat on "
+             << *dir << dendl;
+    dir->assimilate_dirty_rstat_inodes_finish(mut, &le->metablob);
+
+    // assimilation left rstat ahead of accounted_rstat: keep the nest lock
+    // marked as updated so the difference is not lost
+    if (!(pf->rstat == pf->accounted_rstat)) {
+      if (!mut->is_wrlocked(&nestlock)) {
+        mdcache->mds->locker->wrlock_force(&nestlock, mut);
+      }
+
+      mdcache->mds->locker->mark_updated_scatterlock(&nestlock);
+      mut->ls->dirty_dirfrag_nest.push_back(&item_dirty_dirfrag_nest);
+    }
+  }
+
+  mdlog->submit_entry(le, new C_Inode_FragUpdate(this, dir, mut));
+}
+
+/*
+ * Completion half of finish_scatter_update(): apply the mutation, drop the
+ * locks it holds, then clean it up.  `dir` is only used for logging.
+ */
+void CInode::_finish_frag_update(CDir *dir, MutationRef& mut)
+{
+  dout(10) << __func__ << " on " << *dir << dendl;
+
+  auto *mutation = mut.get();
+  mutation->apply();
+  mdcache->mds->locker->drop_locks(mutation);
+  mutation->cleanup();
+}
+
+
+/*
+ * when we gather a lock, we need to assimilate dirfrag changes into the inode
+ * state. it's possible we can't update the dirfrag accounted_rstat/fragstat
+ * because the frag is auth and frozen, or that the replica couldn't for the same
+ * reason. hopefully it will get updated the next time the lock cycles.
+ *
+ * we have two dimensions of behavior:
+ * - we may be (auth and !frozen), and able to update, or not.
+ * - the frag may be stale, or not.
+ *
+ * if the frag is non-stale, we want to assimilate the diff into the
+ * inode, regardless of whether it's auth or updateable.
+ *
+ * if we update the frag, we want to set accounted_fragstat = frag,
+ * both if we took the diff or it was stale and we are making it
+ * un-stale.
+ */
+/* for more info on scatterlocks, see comments by Locker::scatter_writebehind */
+/*
+ * Fold the dirfrags' scattered stats back into the projected inode for the
+ * given lock type.  Must be called on the auth inode.
+ *
+ * CEPH_LOCK_IFILE: dirstat / fragstat.
+ * CEPH_LOCK_INEST: rstat, followed by a quota broadcast.
+ * CEPH_LOCK_IDFT:  nothing to do.
+ * Any other type aborts.
+ */
+void CInode::finish_scatter_gather_update(int type)
+{
+ LogChannelRef clog = mdcache->mds->clog;
+
+ dout(10) << __func__ << " " << type << " on " << *this << dendl;
+ ceph_assert(is_auth());
+
+ switch (type) {
+ case CEPH_LOCK_IFILE:
+ {
+ // tmpdft/dirstat accumulate the frags we actually see, for the
+ // cross-check against the inode's dirstat below
+ fragtree_t tmpdft = dirfragtree;
+ struct frag_info_t dirstat;
+ bool dirstat_valid = true;
+
+ // adjust summation
+ ceph_assert(is_auth());
+ mempool_inode *pi = get_projected_inode();
+
+ bool touched_mtime = false, touched_chattr = false;
+ dout(20) << " orig dirstat " << pi->dirstat << dendl;
+ // bump the version; a frag is current only if its accounted version
+ // equals the previous value (checked as version - 1 below)
+ pi->dirstat.version++;
+ for (const auto &p : dirfrags) {
+ frag_t fg = p.first;
+ CDir *dir = p.second;
+ dout(20) << fg << " " << *dir << dendl;
+
+ // only a loaded, auth, unfrozen frag gets its fnode projected and
+ // updated; an unloaded frag (version 0) also disables the cross-check
+ bool update;
+ if (dir->get_version() != 0) {
+ update = dir->is_auth() && !dir->is_frozen();
+ } else {
+ update = false;
+ dirstat_valid = false;
+ }
+
+ fnode_t *pf = dir->get_projected_fnode();
+ if (update)
+ pf = dir->project_fnode();
+
+ if (pf->accounted_fragstat.version == pi->dirstat.version - 1) {
+ dout(20) << fg << " fragstat " << pf->fragstat << dendl;
+ dout(20) << fg << " accounted_fragstat " << pf->accounted_fragstat << dendl;
+ pi->dirstat.add_delta(pf->fragstat, pf->accounted_fragstat, &touched_mtime, &touched_chattr);
+ } else {
+ dout(20) << fg << " skipping STALE accounted_fragstat " << pf->accounted_fragstat << dendl;
+ }
+
+ if (pf->fragstat.nfiles < 0 ||
+ pf->fragstat.nsubdirs < 0) {
+ clog->error() << "bad/negative dir size on "
+ << dir->dirfrag() << " " << pf->fragstat;
+ // !"..." is always false, so this assert only holds (and execution
+ // only continues) when mds_verify_scatter is off
+ ceph_assert(!"bad/negative fragstat" == g_conf()->mds_verify_scatter);
+
+ // clamp negative counts to zero
+ if (pf->fragstat.nfiles < 0)
+ pf->fragstat.nfiles = 0;
+ if (pf->fragstat.nsubdirs < 0)
+ pf->fragstat.nsubdirs = 0;
+ }
+
+ if (update) {
+ pf->accounted_fragstat = pf->fragstat;
+ pf->fragstat.version = pf->accounted_fragstat.version = pi->dirstat.version;
+ dout(10) << fg << " updated accounted_fragstat " << pf->fragstat << " on " << *dir << dendl;
+ }
+
+ tmpdft.force_to_leaf(g_ceph_context, fg);
+ dirstat.add(pf->fragstat);
+ }
+ if (touched_mtime)
+ pi->mtime = pi->ctime = pi->dirstat.mtime;
+ if (touched_chattr)
+ pi->change_attr = pi->dirstat.change_attr;
+ dout(20) << " final dirstat " << pi->dirstat << dendl;
+
+ if (dirstat_valid && !dirstat.same_sums(pi->dirstat)) {
+ // the sums only count as a mismatch if every leaf of tmpdft is
+ // actually present in dirfrags
+ frag_vec_t leaves;
+ tmpdft.get_leaves_under(frag_t(), leaves);
+ for (const auto& leaf : leaves) {
+ if (!dirfrags.count(leaf)) {
+ dirstat_valid = false;
+ break;
+ }
+ }
+ if (dirstat_valid) {
+ if (state_test(CInode::STATE_REPAIRSTATS)) {
+ dout(20) << " dirstat mismatch, fixing" << dendl;
+ } else {
+ clog->error() << "unmatched fragstat on " << ino() << ", inode has "
+ << pi->dirstat << ", dirfrags have " << dirstat;
+ ceph_assert(!"unmatched fragstat" == g_conf()->mds_verify_scatter);
+ }
+ // trust the dirfrags for now
+ // (but keep the inode's version and the later mtime/change_attr)
+ version_t v = pi->dirstat.version;
+ if (pi->dirstat.mtime > dirstat.mtime)
+ dirstat.mtime = pi->dirstat.mtime;
+ if (pi->dirstat.change_attr > dirstat.change_attr)
+ dirstat.change_attr = pi->dirstat.change_attr;
+ pi->dirstat = dirstat;
+ pi->dirstat.version = v;
+ }
+ }
+
+ if (pi->dirstat.nfiles < 0 || pi->dirstat.nsubdirs < 0)
+ {
+ std::string path;
+ make_path_string(path);
+ clog->error() << "Inconsistent statistics detected: fragstat on inode "
+ << ino() << " (" << path << "), inode has " << pi->dirstat;
+ ceph_assert(!"bad/negative fragstat" == g_conf()->mds_verify_scatter);
+
+ // clamp negative counts to zero
+ if (pi->dirstat.nfiles < 0)
+ pi->dirstat.nfiles = 0;
+ if (pi->dirstat.nsubdirs < 0)
+ pi->dirstat.nsubdirs = 0;
+ }
+ }
+ break;
+
+ case CEPH_LOCK_INEST:
+ {
+ // adjust summation
+ ceph_assert(is_auth());
+
+ fragtree_t tmpdft = dirfragtree;
+ nest_info_t rstat;
+ bool rstat_valid = true;
+
+ // the comparison sum starts at one subdir plus the projected
+ // srnode's snap count, if there is one
+ rstat.rsubdirs = 1;
+ if (const sr_t *srnode = get_projected_srnode(); srnode)
+ rstat.rsnaps = srnode->snaps.size();
+
+ mempool_inode *pi = get_projected_inode();
+ dout(20) << " orig rstat " << pi->rstat << dendl;
+ // bump the version; a frag is current only if its accounted version
+ // equals the previous value (checked as version-1 below)
+ pi->rstat.version++;
+ for (const auto &p : dirfrags) {
+ frag_t fg = p.first;
+ CDir *dir = p.second;
+ dout(20) << fg << " " << *dir << dendl;
+
+ // only a loaded, auth, unfrozen frag gets its fnode projected and
+ // updated; an unloaded frag (version 0) also disables the cross-check
+ bool update;
+ if (dir->get_version() != 0) {
+ update = dir->is_auth() && !dir->is_frozen();
+ } else {
+ update = false;
+ rstat_valid = false;
+ }
+
+ fnode_t *pf = dir->get_projected_fnode();
+ if (update)
+ pf = dir->project_fnode();
+
+ if (pf->accounted_rstat.version == pi->rstat.version-1) {
+ // only pull this frag's dirty rstat inodes into the frag if
+ // the frag is non-stale and updateable. if it's stale,
+ // that info will just get thrown out!
+ if (update)
+ dir->assimilate_dirty_rstat_inodes();
+
+ dout(20) << fg << " rstat " << pf->rstat << dendl;
+ dout(20) << fg << " accounted_rstat " << pf->accounted_rstat << dendl;
+ dout(20) << fg << " dirty_old_rstat " << dir->dirty_old_rstat << dendl;
+ mdcache->project_rstat_frag_to_inode(pf->rstat, pf->accounted_rstat,
+ dir->first, CEPH_NOSNAP, this, true);
+ // NOTE: this p shadows the outer loop variable of the same name
+ for (auto &p : dir->dirty_old_rstat) {
+ mdcache->project_rstat_frag_to_inode(p.second.rstat, p.second.accounted_rstat,
+ p.second.first, p.first, this, true);
+ }
+ if (update) // dir contents not valid if frozen or non-auth
+ dir->check_rstats();
+ } else {
+ dout(20) << fg << " skipping STALE accounted_rstat " << pf->accounted_rstat << dendl;
+ }
+ if (update) {
+ pf->accounted_rstat = pf->rstat;
+ dir->dirty_old_rstat.clear();
+ pf->rstat.version = pf->accounted_rstat.version = pi->rstat.version;
+ dir->check_rstats();
+ dout(10) << fg << " updated accounted_rstat " << pf->rstat << " on " << *dir << dendl;
+ }
+
+ tmpdft.force_to_leaf(g_ceph_context, fg);
+ rstat.add(pf->rstat);
+ }
+ dout(20) << " final rstat " << pi->rstat << dendl;
+
+ if (rstat_valid && !rstat.same_sums(pi->rstat)) {
+ // the sums only count as a mismatch if every leaf of tmpdft is
+ // actually present in dirfrags
+ frag_vec_t leaves;
+ tmpdft.get_leaves_under(frag_t(), leaves);
+ for (const auto& leaf : leaves) {
+ if (!dirfrags.count(leaf)) {
+ rstat_valid = false;
+ break;
+ }
+ }
+ if (rstat_valid) {
+ if (state_test(CInode::STATE_REPAIRSTATS)) {
+ dout(20) << " rstat mismatch, fixing" << dendl;
+ } else {
+ clog->error() << "inconsistent rstat on inode " << ino()
+ << ", inode has " << pi->rstat
+ << ", directory fragments have " << rstat;
+ // !"..." is always false, so this assert only holds when
+ // mds_verify_scatter is off
+ ceph_assert(!"unmatched rstat" == g_conf()->mds_verify_scatter);
+ }
+ // trust the dirfrag for now
+ // (but keep the inode's version and the later rctime)
+ version_t v = pi->rstat.version;
+ if (pi->rstat.rctime > rstat.rctime)
+ rstat.rctime = pi->rstat.rctime;
+ pi->rstat = rstat;
+ pi->rstat.version = v;
+ }
+ }
+
+ mdcache->broadcast_quota_to_client(this);
+ }
+ break;
+
+ case CEPH_LOCK_IDFT:
+ break;
+
+ default:
+ ceph_abort();
+ }
+}
+
+/*
+ * Journal the projected fnodes of every dirfrag that is auth, loaded and not
+ * frozen.  Must be called on the auth inode.
+ *
+ * For each such dirfrag (which must already be projected) the fnode gets a
+ * new version from pre_dirty(), is added to `mut` and `metablob`, and the
+ * dirfrag is auth-pinned by the mutation.  For INEST the dirty-rstat
+ * assimilation is finished as well.  IDFT needs nothing.
+ */
+void CInode::finish_scatter_gather_update_accounted(int type, MutationRef& mut, EMetaBlob *metablob)
+{
+  dout(10) << __func__ << " " << type << " on " << *this << dendl;
+  ceph_assert(is_auth());
+
+  for (const auto &entry : dirfrags) {
+    CDir *frag_dir = entry.second;
+
+    const bool updatable =
+      frag_dir->is_auth() && frag_dir->get_version() != 0 && !frag_dir->is_frozen();
+    if (!updatable || type == CEPH_LOCK_IDFT)
+      continue;
+
+    dout(10) << " journaling updated frag accounted_ on " << *frag_dir << dendl;
+    ceph_assert(frag_dir->is_projected());
+
+    fnode_t *frag_node = frag_dir->get_projected_fnode();
+    frag_node->version = frag_dir->pre_dirty();
+    mut->add_projected_fnode(frag_dir);
+    metablob->add_dir(frag_dir, true);
+    mut->auth_pin(frag_dir);
+
+    if (type == CEPH_LOCK_INEST) {
+      frag_dir->assimilate_dirty_rstat_inodes_finish(mut, metablob);
+    }
+  }
+}
+
+// waiting
+
+/* True if this inode itself is frozen, or if its parent dentry's dir is. */
+bool CInode::is_frozen() const
+{
+  return is_frozen_inode() || (parent && parent->dir->is_frozen());
+}
+
+/* True if there is a parent dentry and its dir reports is_frozen_dir(). */
+bool CInode::is_frozen_dir() const
+{
+  return parent && parent->dir->is_frozen_dir();
+}
+
+/* True if this inode itself is freezing, or if its parent dentry's dir is. */
+bool CInode::is_freezing() const
+{
+  return is_freezing_inode() || (parent && parent->dir->is_freezing());
+}
+
+/*
+ * Queue context `c` to wait on dirfrag `fg`.  The first waiter overall takes
+ * the PIN_DIRWAITER reference.
+ */
+void CInode::add_dir_waiter(frag_t fg, MDSContext *c)
+{
+  const bool first_waiter = waiting_on_dir.empty();
+  if (first_waiter) {
+    get(PIN_DIRWAITER);
+  }
+
+  auto &queue = waiting_on_dir[fg];
+  queue.push_back(c);
+  dout(10) << __func__ << " frag " << fg << " " << c << " on " << *this << dendl;
+}
+
+/*
+ * Move every context waiting on dirfrag `fg` to the end of `ls`.  When that
+ * empties the waiter map, the PIN_DIRWAITER reference is released.
+ */
+void CInode::take_dir_waiting(frag_t fg, MDSContext::vec& ls)
+{
+  if (waiting_on_dir.empty())
+    return;
+
+  auto found = waiting_on_dir.find(fg);
+  if (found == waiting_on_dir.end())
+    return;
+
+  dout(10) << __func__ << " frag " << fg << " on " << *this << dendl;
+  auto& queued = found->second;
+  ls.insert(ls.end(), queued.begin(), queued.end());
+  waiting_on_dir.erase(found);
+
+  if (waiting_on_dir.empty()) {
+    put(PIN_DIRWAITER);
+  }
+}
+
+/*
+ * Register waiter `c` for the events in `tag`.
+ *
+ * A WAIT_SINGLEAUTH or WAIT_UNFREEZE wait is passed to the parent dentry's
+ * dir when the inode itself is not the ambiguous / freezing / frozen object;
+ * everything else is queued on this inode.
+ */
+void CInode::add_waiter(uint64_t tag, MDSContext *c)
+{
+  dout(10) << __func__ << " tag " << std::hex << tag << std::dec << " " << c
+           << " !ambig " << !state_test(STATE_AMBIGUOUSAUTH)
+           << " !frozen " << !is_frozen_inode()
+           << " !freezing " << !is_freezing_inode()
+           << dendl;
+
+  // is the condition being waited for caused by something above us?
+  bool wait_on_parent = false;
+  if ((tag & WAIT_SINGLEAUTH) && !state_test(STATE_AMBIGUOUSAUTH)) {
+    wait_on_parent = true;
+  } else if ((tag & WAIT_UNFREEZE) &&
+             !(is_frozen_inode() || is_freezing_inode() || is_frozen_auth_pin())) {
+    wait_on_parent = true;
+  }
+
+  if (!wait_on_parent) {
+    dout(15) << "taking waiter here" << dendl;
+    MDSCacheObject::add_waiter(tag, c);
+    return;
+  }
+
+  dout(15) << "passing waiter up tree" << dendl;
+  parent->dir->add_waiter(tag, c);
+}
+
+/*
+ * Collect waiters matching `mask` into `ls`.  With WAIT_DIR set, every
+ * per-dirfrag waiter queue is drained first and PIN_DIRWAITER is released;
+ * the generic waiters are then taken by MDSCacheObject.
+ */
+void CInode::take_waiting(uint64_t mask, MDSContext::vec& ls)
+{
+  const bool drain_dir_waiters = (mask & WAIT_DIR) && !waiting_on_dir.empty();
+  if (drain_dir_waiters) {
+    // pop the queues one dirfrag at a time, in map order
+    do {
+      auto head = waiting_on_dir.begin();
+      dout(10) << __func__ << " dirfrag " << head->first << " on " << *this << dendl;
+      auto& queued = head->second;
+      ls.insert(ls.end(), queued.begin(), queued.end());
+      waiting_on_dir.erase(head);
+    } while (!waiting_on_dir.empty());
+    put(PIN_DIRWAITER);
+  }
+
+  // everything else
+  MDSCacheObject::take_waiting(mask, ls);
+}
+
+/*
+ * Try to freeze the inode while tolerating `auth_pin_allowance` auth pins.
+ *
+ * Returns true if the inode is frozen on return.  Returns false if more auth
+ * pins than allowed are held; the inode is then marked freezing and the
+ * allowance is remembered.
+ */
+bool CInode::freeze_inode(int auth_pin_allowance)
+{
+  ceph_assert(auth_pin_allowance > 0); // otherwise we need to adjust parent's nested_auth_pins
+  ceph_assert(auth_pins >= auth_pin_allowance);
+
+  const bool must_wait = auth_pins > auth_pin_allowance;
+  if (!must_wait) {
+    dout(10) << "freeze_inode - frozen" << dendl;
+    ceph_assert(auth_pins == auth_pin_allowance);
+    if (!state_test(STATE_FROZEN)) {
+      get(PIN_FROZEN);
+      state_set(STATE_FROZEN);
+    }
+    return true;
+  }
+
+  dout(10) << "freeze_inode - waiting for auth_pins to drop to " << auth_pin_allowance << dendl;
+  auth_pin_freeze_allowance = auth_pin_allowance;
+  get(PIN_FREEZING);
+  state_set(STATE_FREEZING);
+  return false;
+}
+
+/*
+ * Leave the freezing or frozen state (freezing is checked first) and collect
+ * the WAIT_UNFREEZE waiters into `finished`.  Aborts if the inode is in
+ * neither state.
+ */
+void CInode::unfreeze_inode(MDSContext::vec& finished)
+{
+  dout(10) << __func__ << dendl;
+
+  const bool was_freezing = state_test(STATE_FREEZING);
+  if (!was_freezing && !state_test(STATE_FROZEN)) {
+    ceph_abort();
+  }
+
+  if (was_freezing) {
+    state_clear(STATE_FREEZING);
+    put(PIN_FREEZING);
+  } else {
+    state_clear(STATE_FROZEN);
+    put(PIN_FROZEN);
+  }
+
+  take_waiting(WAIT_UNFREEZE, finished);
+}
+
+/* Unfreeze and hand the released waiters straight to the MDS queue. */
+void CInode::unfreeze_inode()
+{
+  MDSContext::vec released;
+  unfreeze_inode(released);
+  mdcache->mds->queue_waiters(released);
+}
+
+/* Set STATE_FROZENAUTHPIN; the inode must already be frozen. */
+void CInode::freeze_auth_pin()
+{
+  ceph_assert(state_test(STATE_FROZEN));
+  state_set(STATE_FROZENAUTHPIN);
+}
+
+/*
+ * Clear STATE_FROZENAUTHPIN.  If the inode is neither freezing nor frozen
+ * afterwards, the WAIT_UNFREEZE waiters are queued.
+ */
+void CInode::unfreeze_auth_pin()
+{
+  ceph_assert(state_test(STATE_FROZENAUTHPIN));
+  state_clear(STATE_FROZENAUTHPIN);
+
+  if (state_test(STATE_FREEZING|STATE_FROZEN))
+    return;
+
+  MDSContext::vec released;
+  take_waiting(WAIT_UNFREEZE, released);
+  mdcache->mds->queue_waiters(released);
+}
+
+/*
+ * Clear STATE_AMBIGUOUSAUTH (which must be set) and collect the
+ * WAIT_SINGLEAUTH waiters into `finished`.
+ */
+void CInode::clear_ambiguous_auth(MDSContext::vec& finished)
+{
+  ceph_assert(state_test(STATE_AMBIGUOUSAUTH));
+  state_clear(STATE_AMBIGUOUSAUTH);
+  take_waiting(WAIT_SINGLEAUTH, finished);
+}
+
+/* Clear the ambiguous-auth state and queue the released waiters. */
+void CInode::clear_ambiguous_auth()
+{
+  MDSContext::vec released;
+  clear_ambiguous_auth(released);
+  mdcache->mds->queue_waiters(released);
+}
+
+// auth_pins
+/*
+ * Report whether this inode can currently be auth-pinned.
+ *
+ * On failure caused by this inode, *err_ret (if given) receives
+ * ERR_NOT_AUTH or ERR_EXPORTING_INODE.  When the inode itself is fine, the
+ * decision is delegated to the parent dentry, if there is one.
+ */
+bool CInode::can_auth_pin(int *err_ret) const {
+  int err = 0;
+
+  if (!is_auth()) {
+    err = ERR_NOT_AUTH;
+  } else if (is_freezing_inode() || is_frozen_inode() || is_frozen_auth_pin()) {
+    err = ERR_EXPORTING_INODE;
+  } else if (parent) {
+    return parent->can_auth_pin(err_ret);
+  }
+
+  if (err_ret && err) {
+    *err_ret = err;
+  }
+  return err == 0;
+}
+
+/*
+ * Take one auth pin on behalf of `by`.  The first pin takes the PIN_AUTHPIN
+ * reference; the parent dentry, if any, gets its nested auth pin count
+ * raised by one.
+ */
+void CInode::auth_pin(void *by)
+{
+  const bool first_pin = (auth_pins == 0);
+  if (first_pin) {
+    get(PIN_AUTHPIN);
+  }
+  ++auth_pins;
+
+#ifdef MDS_AUTHPIN_SET
+  auth_pin_set.insert(by);
+#endif
+
+  dout(10) << "auth_pin by " << by << " on " << *this << " now " << auth_pins << dendl;
+
+  if (parent) {
+    parent->adjust_nested_auth_pins(1, this);
+  }
+}
+
+/*
+ * Drop one auth pin held by `by`.  The last pin releases the PIN_AUTHPIN
+ * reference and the parent dentry, if any, gets its nested auth pin count
+ * lowered by one.  If the inode was freezing and the pin count has reached
+ * the freeze allowance, it becomes frozen and WAIT_FROZEN waiters are run.
+ */
+void CInode::auth_unpin(void *by)
+{
+  --auth_pins;
+
+#ifdef MDS_AUTHPIN_SET
+  {
+    auto pinned = auth_pin_set.find(by);
+    ceph_assert(pinned != auth_pin_set.end());
+    auth_pin_set.erase(pinned);
+  }
+#endif
+
+  if (auth_pins == 0) {
+    put(PIN_AUTHPIN);
+  }
+
+  dout(10) << "auth_unpin by " << by << " on " << *this << " now " << auth_pins << dendl;
+
+  ceph_assert(auth_pins >= 0);
+
+  // NOTE(review): auth_pin() passes `this` as the second argument here,
+  // auth_unpin() passes `by` -- confirm the asymmetry is intended.
+  if (parent) {
+    parent->adjust_nested_auth_pins(-1, by);
+  }
+
+  const bool freeze_completes =
+    is_freezing_inode() && auth_pins == auth_pin_freeze_allowance;
+  if (freeze_completes) {
+    dout(10) << "auth_unpin freezing!" << dendl;
+    get(PIN_FROZEN);
+    put(PIN_FREEZING);
+    state_clear(STATE_FREEZING);
+    state_set(STATE_FROZEN);
+    finish_waiting(WAIT_FROZEN);
+  }
+}
+
+// authority
+
+/*
+ * Resolve this inode's authority: an explicit inode_auth wins, then the
+ * parent dentry's dir, then the dir of the first projected parent;
+ * otherwise the authority is undefined.
+ */
+mds_authority_t CInode::authority() const
+{
+  if (inode_auth.first < 0) {
+    auto linkage = parent;
+
+    // new items that are not yet linked in (in the committed plane) belong
+    // to their first parent.
+    if (!linkage && !projected_parent.empty())
+      linkage = projected_parent.front();
+
+    if (linkage)
+      return linkage->dir->authority();
+    return CDIR_AUTH_UNDEF;
+  }
+
+  return inode_auth;
+}
+
+
+// SNAP
+
+/*
+ * Smaller of oldest_snap and the start of the earliest known version: the
+ * first old_inodes entry's `first` if any exist, otherwise this inode's
+ * `first`.
+ */
+snapid_t CInode::get_oldest_snap()
+{
+  snapid_t earliest = old_inodes.empty() ? first
+                                         : old_inodes.begin()->second.first;
+  return std::min(earliest, oldest_snap);
+}
+
+/*
+ * Copy the head inode into old_inodes[follows], covering [first, follows],
+ * and advance `first` to follows+1.
+ *
+ * cow_head selects the source: the projected inode/xattrs when true, the
+ * previous projected ones when false.  Returns the new old_inode entry.
+ */
+CInode::mempool_old_inode& CInode::cow_old_inode(snapid_t follows, bool cow_head)
+{
+  ceph_assert(follows >= first);
+
+  mempool_inode *src_inode;
+  mempool_xattr_map *src_xattrs;
+  if (cow_head) {
+    src_inode = get_projected_inode();
+    src_xattrs = get_projected_xattrs();
+  } else {
+    src_inode = get_previous_projected_inode();
+    src_xattrs = get_previous_projected_xattrs();
+  }
+
+  mempool_old_inode &snapshot = old_inodes[follows];
+  snapshot.first = first;
+  snapshot.inode = *src_inode;
+  snapshot.xattrs = *src_xattrs;
+
+  oldest_snap = std::min(oldest_snap, first);
+
+  dout(10) << " " << src_xattrs->size() << " xattrs cowed, " << *src_xattrs << dendl;
+
+  snapshot.inode.trim_client_ranges(follows);
+
+  // remember copies whose rstat has not been fully accounted yet
+  const bool rstat_dirty = !(snapshot.inode.rstat == snapshot.inode.accounted_rstat);
+  if (g_conf()->mds_snap_rstat && rstat_dirty) {
+    dirty_old_rstats.insert(follows);
+  }
+
+  first = follows+1;
+
+  dout(10) << __func__ << " " << (cow_head ? "head" : "previous_head" )
+           << " to [" << snapshot.first << "," << follows << "] on "
+           << *this << dendl;
+
+  return snapshot;
+}
+
+/*
+ * Split the old_inode whose range contains `snap` (and starts before it)
+ * into two entries: a copy keyed at snap-1 that keeps the original start,
+ * and the existing entry, which now starts at `snap`.
+ */
+void CInode::split_old_inode(snapid_t snap)
+{
+  auto upper = old_inodes.lower_bound(snap);
+  ceph_assert(upper != old_inodes.end() && upper->second.first < snap);
+
+  mempool_old_inode &lower = old_inodes[snap - 1];
+  lower = upper->second;
+
+  upper->second.first = snap;
+  dout(10) << __func__ << " " << "[" << lower.first << "," << upper->first
+           << "] to [" << snap << "," << upper->first << "] on " << *this << dendl;
+}
+
+/*
+ * COW the head up to the global snaprealm's newest seq, unless `first` is
+ * already past it.
+ */
+void CInode::pre_cow_old_inode()
+{
+  snapid_t newest = mdcache->get_global_snaprealm()->get_newest_seq();
+  if (first > newest)
+    return;
+  cow_old_inode(newest, true);
+}
+
+/*
+ * True if `snapid` falls inside [first, last], or, for a multiversion inode,
+ * inside the range of one of the old_inodes.
+ */
+bool CInode::has_snap_data(snapid_t snapid)
+{
+  if (snapid >= first && snapid <= last)
+    return true;
+  if (!is_multiversion())
+    return false;
+
+  auto candidate = old_inodes.lower_bound(snapid);
+  if (candidate == old_inodes.end())
+    return false;
+
+  // the entry found starts after snapid: try the one before it instead
+  if (candidate->second.first > snapid && candidate != old_inodes.begin())
+    --candidate;
+
+  return candidate->second.first <= snapid && snapid <= candidate->first;
+}
+
+/*
+ * Drop every old_inode whose [first, key] range contains none of the snap
+ * ids in `snaps`.
+ */
+void CInode::purge_stale_snap_data(const set<snapid_t>& snaps)
+{
+  dout(10) << __func__ << " " << snaps << dendl;
+
+  auto it = old_inodes.begin();
+  while (it != old_inodes.end()) {
+    const snapid_t &range_last = it->first;
+    // first snap at or after the start of this range
+    const auto covering = snaps.lower_bound(it->second.first);
+    const bool still_needed = covering != snaps.end() && !(*covering > range_last);
+    if (still_needed) {
+      ++it;
+      continue;
+    }
+    dout(10) << " purging old_inode [" << it->second.first << "," << range_last << "]" << dendl;
+    it = old_inodes.erase(it);
+  }
+}
+
+/*
+ * pick/create an old_inode
+ */
+CInode::mempool_old_inode * CInode::pick_old_inode(snapid_t snap)
+{
+  // first entry whose key is >= snap; it covers snap only if its range
+  // also starts at or before snap
+  auto found = old_inodes.lower_bound(snap);
+  const bool covers = found != old_inodes.end() && found->second.first <= snap;
+  if (!covers) {
+    dout(10) << __func__ << " snap " << snap << " -> nothing" << dendl;
+    return nullptr;
+  }
+
+  dout(10) << __func__ << " snap " << snap << " -> [" << found->second.first << "," << found->first << "]" << dendl;
+  return &found->second;
+}
+
+/*
+ * Create this inode's SnapRealm if it has none.  The enclosing realm, if
+ * any, becomes its parent; unless `nosplit` is set the parent is split at
+ * the new realm before the new realm joins its open children.
+ */
+void CInode::open_snaprealm(bool nosplit)
+{
+  if (snaprealm)
+    return;
+
+  // look the enclosing realm up before our own exists, since
+  // find_snaprealm() starts its search at this inode
+  SnapRealm *enclosing = find_snaprealm();
+  snaprealm = new SnapRealm(mdcache, this);
+  if (!enclosing)
+    return;
+
+  dout(10) << __func__ << " " << snaprealm
+           << " parent is " << enclosing
+           << dendl;
+  dout(30) << " siblings are " << enclosing->open_children << dendl;
+  snaprealm->parent = enclosing;
+  if (!nosplit) {
+    enclosing->split_at(snaprealm);
+  }
+  enclosing->open_children.insert(snaprealm);
+}
+/*
+ * Destroy this inode's SnapRealm, if it has one: close its parents, remove
+ * it from its parent's open children, and delete it.  `nojoin` is currently
+ * unused (the join step is disabled).
+ */
+void CInode::close_snaprealm(bool nojoin)
+{
+  if (!snaprealm)
+    return;
+
+  dout(15) << __func__ << " " << *snaprealm << dendl;
+  snaprealm->close_parents();
+  if (snaprealm->parent) {
+    snaprealm->parent->open_children.erase(snaprealm);
+    //if (!nojoin)
+    //snaprealm->parent->join(snaprealm);
+  }
+  delete snaprealm;
+  snaprealm = nullptr;
+}
+
+/*
+ * Walk up through the oldest parent dentries, starting at this inode, and
+ * return the first SnapRealm found; null if the chain ends without one.
+ */
+SnapRealm *CInode::find_snaprealm() const
+{
+  const CInode *cur = this;
+  for (;;) {
+    if (cur->snaprealm)
+      break;
+    const CDentry *pdn = cur->get_oldest_parent_dn();
+    if (!pdn)
+      break;
+    cur = pdn->get_dir()->get_inode();
+  }
+  return cur->snaprealm;
+}
+
+/* Encode the snaprealm's srnode into `snapbl`; no-op without a realm. */
+void CInode::encode_snap_blob(bufferlist &snapbl)
+{
+  if (!snaprealm)
+    return;
+
+  using ceph::encode;
+  encode(snaprealm->srnode, snapbl);
+  dout(20) << __func__ << " " << *snaprealm << dendl;
+}
+/*
+ * Load the srnode from `snapbl`.
+ *
+ * Non-empty blob: open the realm if needed and decode into it.  A base inode
+ * then opens its parents; any other inode re-resolves its parent when the
+ * PARENT_GLOBAL flag changed.
+ * Empty blob: an existing realm on a non-root, non-mdsdir inode is merged
+ * away, which is only expected during replay.
+ */
+void CInode::decode_snap_blob(const bufferlist& snapbl)
+{
+  using ceph::decode;
+
+  if (!snapbl.length()) {
+    // see https://tracker.ceph.com/issues/42675
+    if (snaprealm && !is_root() && !is_mdsdir()) {
+      ceph_assert(mdcache->mds->is_any_replay());
+      snaprealm->merge_to(NULL);
+    }
+    return;
+  }
+
+  open_snaprealm();
+  auto flags_before = snaprealm->srnode.flags;
+  auto cursor = snapbl.cbegin();
+  decode(snaprealm->srnode, cursor);
+
+  if (is_base()) {
+    bool ok = snaprealm->_open_parents(NULL);
+    ceph_assert(ok);
+  } else if ((snaprealm->srnode.flags ^ flags_before) & sr_t::PARENT_GLOBAL) {
+    snaprealm->close_parents();
+    snaprealm->adjust_parent();
+  }
+  dout(20) << __func__ << " " << *snaprealm << dendl;
+}
+
+/* Encode the snap blob (as a nested bufferlist) followed by oldest_snap. */
+void CInode::encode_snap(bufferlist& bl)
+{
+  using ceph::encode;
+
+  bufferlist blob;
+  encode_snap_blob(blob);
+
+  encode(blob, bl);
+  encode(oldest_snap, bl);
+}
+
+/*
+ * Decode the nested snap blob and oldest_snap from `p`, then apply the blob
+ * via decode_snap_blob().
+ */
+void CInode::decode_snap(bufferlist::const_iterator& p)
+{
+  using ceph::decode;
+
+  bufferlist blob;
+  decode(blob, p);
+  decode(oldest_snap, p);
+
+  decode_snap_blob(blob);
+}
+
+// =============================================
+
+/*
+ * Pick the client that should be the loner, or -1 for none.
+ *
+ * There is no loner when the cache is read-only or another MDS wants caps.
+ * Otherwise a client qualifies when its cap is not stale and it either wants
+ * one of the write/file read/write caps, or this is a directory without a
+ * subtree-root dirfrag.  Exactly one qualifying client is required.
+ */
+client_t CInode::calc_ideal_loner()
+{
+  if (mdcache->is_readonly() || !get_mds_caps_wanted().empty())
+    return -1;
+
+  bool have_candidate = false;
+  client_t loner = -1;
+  for (const auto &entry : client_caps) {
+    const auto &cap = entry.second;
+    if (cap.is_stale())
+      continue;
+
+    const bool qualifies =
+      (cap.wanted() & (CEPH_CAP_ANY_WR|CEPH_CAP_FILE_WR|CEPH_CAP_FILE_RD)) ||
+      (inode.is_dir() && !has_subtree_root_dirfrag());
+    if (!qualifies)
+      continue;
+
+    // a second qualifying client means nobody can be the loner
+    if (have_candidate)
+      return -1;
+    have_candidate = true;
+    loner = entry.first;
+  }
+  return loner;
+}
+
+/*
+ * Recompute the wanted loner and try to make it the actual loner.
+ *
+ * Returns true if loner_cap changed.  Returns false if nothing changed, or
+ * if the current loner differs from the wanted one but could not be dropped.
+ */
+bool CInode::choose_ideal_loner()
+{
+  want_loner_cap = calc_ideal_loner();
+  // a plain flag: declared bool to match its use and the return type
+  bool changed = false;
+  if (loner_cap >= 0 && loner_cap != want_loner_cap) {
+    if (!try_drop_loner())
+      return false;
+    changed = true;
+  }
+
+  if (want_loner_cap >= 0) {
+    if (loner_cap < 0) {
+      set_loner_cap(want_loner_cap);
+      changed = true;
+    } else
+      ceph_assert(loner_cap == want_loner_cap);
+  }
+  return changed;
+}
+
+/*
+ * Make the wanted loner the actual loner.  Fails (returns false) when a
+ * different client is currently the loner.
+ */
+bool CInode::try_set_loner()
+{
+  ceph_assert(want_loner_cap >= 0);
+
+  const bool other_loner = loner_cap >= 0 && loner_cap != want_loner_cap;
+  if (other_loner)
+    return false;
+
+  set_loner_cap(want_loner_cap);
+  return true;
+}
+
+/*
+ * Record `l` as the loner and make it the exclusive client of the auth,
+ * file, link and xattr locks.
+ */
+void CInode::set_loner_cap(client_t l)
+{
+  loner_cap = l;
+
+  authlock.set_excl_client(l);
+  filelock.set_excl_client(l);
+  linklock.set_excl_client(l);
+  xattrlock.set_excl_client(l);
+}
+
+/*
+ * Try to clear the loner.  Succeeds when there is no loner, the loner holds
+ * no cap, or everything it has been issued is also allowed to any client.
+ */
+bool CInode::try_drop_loner()
+{
+  if (loner_cap < 0)
+    return true;
+
+  int allowed_to_anyone = get_caps_allowed_by_type(CAP_ANY);
+  Capability *loner = get_client_cap(loner_cap);
+
+  // the loner still holds caps that only a loner may have
+  if (loner && (loner->issued() & ~allowed_to_anyone) != 0)
+    return false;
+
+  set_loner_cap(-1);
+  return true;
+}
+
+
+// choose new lock state during recovery, based on issued caps
+/*
+ * Pick a state for `lock` from the caps issued on it (`allissued` shifted
+ * and masked for this lock).
+ *
+ * Non-auth: nothing is chosen; an xlocked lock must be in LOCK_LOCK.
+ * Auth: xlocked locks and locks already in LOCK_MIX are left alone;
+ * otherwise the state follows from the issued caps, then from whether the
+ * lock is dirty, defaulting to LOCK_SYNC.
+ */
+void CInode::choose_lock_state(SimpleLock *lock, int allissued)
+{
+  int shift = lock->get_cap_shift();
+  int issued = (allissued >> shift) & lock->get_cap_mask();
+
+  if (!is_auth()) {
+    // our states have already been chosen during rejoin.
+    if (lock->is_xlocked())
+      ceph_assert(lock->get_state() == LOCK_LOCK);
+    return;
+  }
+
+  if (lock->is_xlocked())
+    return;  // do nothing here
+  if (lock->get_state() == LOCK_MIX)
+    return;
+
+  if (issued & (CEPH_CAP_GEXCL | CEPH_CAP_GBUFFER)) {
+    lock->set_state(LOCK_EXCL);
+    return;
+  }
+
+  if (issued & CEPH_CAP_GWR) {
+    if (issued & (CEPH_CAP_GCACHE | CEPH_CAP_GSHARED))
+      lock->set_state(LOCK_EXCL);
+    else
+      lock->set_state(LOCK_MIX);
+    return;
+  }
+
+  if (lock->is_dirty()) {
+    if (is_replicated())
+      lock->set_state(LOCK_MIX);
+    else
+      lock->set_state(LOCK_LOCK);
+    return;
+  }
+
+  lock->set_state(LOCK_SYNC);
+}
+
+/*
+ * Choose states for the inode's locks from the issued caps plus
+ * `dirty_caps`.  On the auth, a loner is chosen first if any exclusive or
+ * write cap is involved.
+ */
+void CInode::choose_lock_states(int dirty_caps)
+{
+  const int issued = get_caps_issued() | dirty_caps;
+  if (is_auth() && (issued & (CEPH_CAP_ANY_EXCL|CEPH_CAP_ANY_WR)))
+    choose_ideal_loner();
+
+  SimpleLock *const locks[] = {
+    &filelock, &nestlock, &dirfragtreelock, &authlock, &xattrlock, &linklock,
+  };
+  for (SimpleLock *lock : locks) {
+    choose_lock_state(lock, issued);
+  }
+}
+
+/*
+ * Swap the per-MDS wanted-caps map with `m` (so `m` receives the old
+ * contents).  num_caps_wanted is adjusted by one when the map switches
+ * between empty and non-empty.
+ */
+void CInode::set_mds_caps_wanted(mempool::mds_co::compact_map<int32_t,int32_t>& m)
+{
+  const bool was_empty = mds_caps_wanted.empty();
+  mds_caps_wanted.swap(m);
+  const bool now_empty = mds_caps_wanted.empty();
+
+  if (was_empty == now_empty)
+    return;
+  adjust_num_caps_wanted(was_empty ? 1 : -1);
+}
+
+/*
+ * Record (or, for wanted == 0, erase) the caps wanted by one MDS rank.
+ * num_caps_wanted is adjusted by one when the map switches between empty and
+ * non-empty.
+ */
+void CInode::set_mds_caps_wanted(mds_rank_t mds, int32_t wanted)
+{
+  const bool was_empty = mds_caps_wanted.empty();
+
+  if (wanted) {
+    mds_caps_wanted[mds] = wanted;
+    if (was_empty)
+      adjust_num_caps_wanted(1);
+    return;
+  }
+
+  if (was_empty)
+    return;
+
+  mds_caps_wanted.erase(mds);
+  if (mds_caps_wanted.empty())
+    adjust_num_caps_wanted(-1);
+}
+
+/*
+ * Add `d` to num_caps_wanted.  The inode is added to the open file table
+ * when the count leaves zero, and removed when it returns to zero.
+ */
+void CInode::adjust_num_caps_wanted(int d)
+{
+  if (num_caps_wanted == 0) {
+    if (d > 0)
+      mdcache->open_file_table.add_inode(this);
+  } else if (num_caps_wanted > 0 && num_caps_wanted == -d) {
+    mdcache->open_file_table.remove_inode(this);
+  }
+
+  num_caps_wanted += d;
+  ceph_assert(num_caps_wanted >= 0);
+}
+
+/*
+ * Create a capability for `client` on this (head) inode and return it.
+ *
+ * The first cap pins the inode, joins it to `conrealm` (or, if that is
+ * null, the realm found by find_snaprealm()) and bumps the
+ * inodes-with-caps counters.  The client must not already hold a cap here.
+ */
+Capability *CInode::add_client_cap(client_t client, Session *session, SnapRealm *conrealm)
+{
+  ceph_assert(last == CEPH_NOSNAP);
+
+  const bool first_cap = client_caps.empty();
+  if (first_cap) {
+    get(PIN_CAPS);
+    containing_realm = conrealm ? conrealm : find_snaprealm();
+    containing_realm->inodes_with_caps.push_back(&item_caps);
+    dout(10) << __func__ << " first cap, joining realm " << *containing_realm << dendl;
+
+    mdcache->num_inodes_with_caps++;
+    if (parent) {
+      parent->dir->adjust_num_inodes_with_caps(1);
+    }
+  }
+
+  uint64_t new_cap_id = ++mdcache->last_cap_id;
+  auto inserted = client_caps.emplace(std::piecewise_construct,
+                                      std::forward_as_tuple(client),
+                                      std::forward_as_tuple(this, session, new_cap_id));
+  ceph_assert(inserted.second == true);
+
+  Capability *new_cap = &inserted.first->second;
+  new_cap->client_follows = first-1;
+  containing_realm->add_cap(client, new_cap);
+
+  return new_cap;
+}
+
+/*
+ * Remove `client`'s capability (which must exist).
+ *
+ * The cap is unlinked from its lists and from the containing realm, the
+ * loner is cleared if it was this client, and the wanted count is adjusted.
+ * Removing the last cap unpins the inode and leaves the realm.  Finally the
+ * client's advisory locks are dropped and, if any were, WAIT_FLOCK waiters
+ * are queued.
+ */
+void CInode::remove_client_cap(client_t client)
+{
+  auto found = client_caps.find(client);
+  ceph_assert(found != client_caps.end());
+  Capability *doomed = &found->second;
+
+  doomed->item_session_caps.remove_myself();
+  doomed->item_revoking_caps.remove_myself();
+  doomed->item_client_revoking_caps.remove_myself();
+  containing_realm->remove_cap(client, doomed);
+
+  if (client == loner_cap) {
+    loner_cap = -1;
+  }
+
+  if (doomed->wanted()) {
+    adjust_num_caps_wanted(-1);
+  }
+
+  client_caps.erase(found);
+  if (client_caps.empty()) {
+    dout(10) << __func__ << " last cap, leaving realm " << *containing_realm << dendl;
+    put(PIN_CAPS);
+    item_caps.remove_myself();
+    containing_realm = nullptr;
+    mdcache->num_inodes_with_caps--;
+    if (parent) {
+      parent->dir->adjust_num_inodes_with_caps(-1);
+    }
+  }
+
+  // clean up advisory locks
+  bool fcntl_removed = false;
+  if (fcntl_locks)
+    fcntl_removed = fcntl_locks->remove_all_from(client);
+  bool flock_removed = false;
+  if (flock_locks)
+    flock_removed = flock_locks->remove_all_from(client);
+
+  if (fcntl_removed || flock_removed) {
+    MDSContext::vec lock_waiters;
+    take_waiting(CInode::WAIT_FLOCK, lock_waiters);
+    mdcache->mds->queue_waiters(lock_waiters);
+  }
+}
+
+/*
+ * Move this inode and all of its client caps from the current containing
+ * realm to `realm`.
+ */
+void CInode::move_to_realm(SnapRealm *realm)
+{
+  dout(10) << __func__ << " joining realm " << *realm
+           << ", leaving realm " << *containing_realm << dendl;
+
+  for (auto& entry : client_caps) {
+    Capability *cap = &entry.second;
+    containing_realm->remove_cap(entry.first, cap);
+    realm->add_cap(entry.first, cap);
+  }
+
+  item_caps.remove_myself();
+  realm->inodes_with_caps.push_back(&item_caps);
+  containing_realm = realm;
+}
+
+/*
+ * Restore `client`'s capability from reconnect record `icr`.
+ *
+ * A missing cap is created and seeded with the reported id, wanted and
+ * issued bits; an existing one has the reported wanted/issued bits merged
+ * in.  Either way the last-issue stamp is set to now.
+ */
+Capability *CInode::reconnect_cap(client_t client, const cap_reconnect_t& icr, Session *session)
+{
+  Capability *cap = get_client_cap(client);
+  if (!cap) {
+    cap = add_client_cap(client, session);
+    cap->set_cap_id(icr.capinfo.cap_id);
+    cap->set_wanted(icr.capinfo.wanted);
+    cap->issue_norevoke(icr.capinfo.issued);
+    cap->reset_seq();
+  } else {
+    // FIXME?
+    cap->merge(icr.capinfo.wanted, icr.capinfo.issued);
+  }
+
+  cap->set_last_issue_stamp(ceph_clock_now());
+  return cap;
+}
+
+/*
+ * Remove every client cap, reset both loner fields, and clear the per-MDS
+ * wanted-caps map if it is not already empty.
+ */
+void CInode::clear_client_caps_after_export()
+{
+  for (; !client_caps.empty(); ) {
+    remove_client_cap(client_caps.begin()->first);
+  }
+
+  loner_cap = -1;
+  want_loner_cap = -1;
+
+  if (get_mds_caps_wanted().empty())
+    return;
+
+  mempool::mds_co::compact_map<int32_t,int32_t> none;
+  set_mds_caps_wanted(none);
+}
+
+/* Store an export record for every client cap in `cl`, keyed by client. */
+void CInode::export_client_caps(map<client_t,Capability::Export>& cl)
+{
+  for (auto it = client_caps.begin(); it != client_caps.end(); ++it) {
+    const auto &cap = it->second;
+    cl[it->first] = cap.make_export();
+  }
+}
+
+ // caps allowed
+/*
+ * Caps this inode likes to issue: for directories PIN plus the exclusive and
+ * shared caps (but not, say, FILE_RD|WR|WRBUFFER); for everything else all
+ * caps except FILE_LAZYIO.
+ */
+int CInode::get_caps_liked() const
+{
+  return is_dir()
+    ? (CEPH_CAP_PIN | CEPH_CAP_ANY_EXCL | CEPH_CAP_ANY_SHARED)
+    : (CEPH_CAP_ANY & ~CEPH_CAP_FILE_LAZYIO);
+}
+
+/*
+ * Caps that could ever be allowed: the type-based set (directories get PIN
+ * plus exclusive and shared caps, everything else gets all caps)
+ * intersected with what the file, auth, xattr and link locks report via
+ * gcaps_allowed_ever().
+ */
+int CInode::get_caps_allowed_ever() const
+{
+  const int by_type = is_dir()
+    ? (CEPH_CAP_PIN | CEPH_CAP_ANY_EXCL | CEPH_CAP_ANY_SHARED)
+    : CEPH_CAP_ANY;
+
+  const int by_locks =
+    CEPH_CAP_PIN |
+    (filelock.gcaps_allowed_ever() << filelock.get_cap_shift()) |
+    (authlock.gcaps_allowed_ever() << authlock.get_cap_shift()) |
+    (xattrlock.gcaps_allowed_ever() << xattrlock.get_cap_shift()) |
+    (linklock.gcaps_allowed_ever() << linklock.get_cap_shift());
+
+  return by_type & by_locks;
+}
+
+/*
+ * Caps the file, auth, xattr and link locks currently allow for the given
+ * client type, each shifted into its position, plus PIN.
+ */
+int CInode::get_caps_allowed_by_type(int type) const
+{
+  int allowed = CEPH_CAP_PIN;
+  allowed |= filelock.gcaps_allowed(type) << filelock.get_cap_shift();
+  allowed |= authlock.gcaps_allowed(type) << authlock.get_cap_shift();
+  allowed |= xattrlock.gcaps_allowed(type) << xattrlock.get_cap_shift();
+  allowed |= linklock.gcaps_allowed(type) << linklock.get_cap_shift();
+  return allowed;
+}
+
+/*
+ * Combined gcaps_careful() bits of the file, auth, xattr and link locks,
+ * each shifted into its position.
+ */
+int CInode::get_caps_careful() const
+{
+  int careful = 0;
+  careful |= filelock.gcaps_careful() << filelock.get_cap_shift();
+  careful |= authlock.gcaps_careful() << authlock.get_cap_shift();
+  careful |= xattrlock.gcaps_careful() << xattrlock.get_cap_shift();
+  careful |= linklock.gcaps_careful() << linklock.get_cap_shift();
+  return careful;
+}
+
+/*
+ * Combined gcaps_xlocker_mask() bits for `client` across the file, auth,
+ * xattr and link locks, each shifted into its position.
+ */
+int CInode::get_xlocker_mask(client_t client) const
+{
+  int mask = 0;
+  mask |= filelock.gcaps_xlocker_mask(client) << filelock.get_cap_shift();
+  mask |= authlock.gcaps_xlocker_mask(client) << authlock.get_cap_shift();
+  mask |= xattrlock.gcaps_xlocker_mask(client) << xattrlock.get_cap_shift();
+  mask |= linklock.gcaps_xlocker_mask(client) << linklock.get_cap_shift();
+  return mask;
+}
+
+/*
+ * Caps that may be issued to the session's client.
+ *
+ * The loner gets the loner caps plus the xlocker caps for locks it has
+ * xlocked; any other client gets the CAP_ANY set.  For non-directories,
+ * FILE_RD and FILE_WR are withheld when the file uses inline data or a pool
+ * namespace that the client cannot handle, judged by `cap`'s flags when a
+ * cap is given and by the connection's feature bits otherwise.
+ */
+int CInode::get_caps_allowed_for_client(Session *session, Capability *cap,
+                                        mempool_inode *file_i) const
+{
+  client_t client = session->get_client();
+
+  int allowed;
+  if (client != get_loner()) {
+    allowed = get_caps_allowed_by_type(CAP_ANY);
+  } else {
+    // as the loner, we get the loner_caps AND any xlocker_caps for things we have xlocked
+    allowed =
+      get_caps_allowed_by_type(CAP_LONER) |
+      (get_caps_allowed_by_type(CAP_XLOCKER) & get_xlocker_mask(client));
+  }
+
+  if (is_dir())
+    return allowed;
+
+  const bool has_inline = file_i->inline_data.version != CEPH_INLINE_NONE;
+  const bool has_pool_ns = !file_i->layout.pool_ns.empty();
+  if (!has_inline && !has_pool_ns)
+    return allowed;  // nothing the client could fail to understand
+
+  bool unsupported;
+  if (cap) {
+    unsupported = (has_inline && cap->is_noinline()) ||
+                  (has_pool_ns && cap->is_nopoolns());
+  } else {
+    auto& conn = session->get_connection();
+    unsupported = (has_inline && !conn->has_feature(CEPH_FEATURE_MDS_INLINE_DATA)) ||
+                  (has_pool_ns && !conn->has_feature(CEPH_FEATURE_FS_FILE_LAYOUT_V2));
+  }
+
+  if (unsupported)
+    allowed &= ~(CEPH_CAP_FILE_RD | CEPH_CAP_FILE_WR);
+  return allowed;
+}
+
+// caps issued, wanted
+/*
+ * OR together the caps issued to all clients and return them shifted right
+ * by `shift` and masked with `mask`.
+ *
+ * Optional outputs (same shift/mask): *ploner the loner's caps, *pother
+ * everyone else's, *pxlocker the issued caps that fall inside each client's
+ * xlocker mask.  On a non-auth inode loner_cap is reset to -1 first.
+ */
+int CInode::get_caps_issued(int *ploner, int *pother, int *pxlocker,
+                            int shift, int mask)
+{
+  if (!is_auth()) {
+    loner_cap = -1;
+  }
+
+  int all = 0;
+  int from_loner = 0;
+  int from_others = 0;
+  int from_xlockers = 0;
+  for (const auto &entry : client_caps) {
+    const int held = entry.second.issued();
+    all |= held;
+    if (entry.first == loner_cap) {
+      from_loner |= held;
+    } else {
+      from_others |= held;
+    }
+    from_xlockers |= get_xlocker_mask(entry.first) & held;
+  }
+
+  if (ploner)
+    *ploner = (from_loner >> shift) & mask;
+  if (pother)
+    *pother = (from_others >> shift) & mask;
+  if (pxlocker)
+    *pxlocker = (from_xlockers >> shift) & mask;
+  return (all >> shift) & mask;
+}
+
+/* True if at least one client cap has a non-zero wanted set. */
+bool CInode::is_any_caps_wanted() const
+{
+  auto it = client_caps.begin();
+  while (it != client_caps.end()) {
+    if (it->second.wanted())
+      return true;
+    ++it;
+  }
+  return false;
+}
+
+/**
+ * Aggregate the caps wanted on this inode.
+ *
+ * Stale client caps are ignored.  On the auth, the entries of
+ * mds_caps_wanted are folded into both the total and the "other" bucket.
+ * Each aggregate is shifted right by @p shift and masked with @p mask.
+ *
+ * @param ploner  if non-null, receives what the loner client wants
+ * @param pother  if non-null, receives what everybody else wants
+ * @return the union of all wanted caps, shifted and masked
+ */
+int CInode::get_caps_wanted(int *ploner, int *pother, int shift, int mask) const
+{
+  int all_wanted = 0;
+  int loner_wanted = 0;
+  int other_wanted = 0;
+
+  for (const auto &entry : client_caps) {
+    if (entry.second.is_stale())
+      continue;
+    const int wanted = entry.second.wanted();
+    all_wanted |= wanted;
+    if (entry.first == loner_cap) {
+      loner_wanted |= wanted;
+    } else {
+      other_wanted |= wanted;
+    }
+  }
+
+  if (is_auth()) {
+    for (const auto &entry : mds_caps_wanted) {
+      all_wanted |= entry.second;
+      other_wanted |= entry.second;
+    }
+  }
+
+  if (ploner)
+    *ploner = (loner_wanted >> shift) & mask;
+  if (pother)
+    *pother = (other_wanted >> shift) & mask;
+  return (all_wanted >> shift) & mask;
+}
+
+/**
+ * Check whether any client holds caps, within @p lock's cap bits, that the
+ * lock does not currently allow.
+ *
+ * @param lock  lock whose cap shift/mask and allowed sets are consulted
+ * @return true if the loner, other clients, or xlockers have been issued
+ *         bits outside the respective gcaps_allowed() set
+ */
+bool CInode::issued_caps_need_gather(SimpleLock *lock)
+{
+  int from_loner, from_others, from_xlockers;
+  get_caps_issued(&from_loner, &from_others, &from_xlockers,
+                  lock->get_cap_shift(), lock->get_cap_mask());
+
+  return (from_loner & ~lock->gcaps_allowed(CAP_LONER)) ||
+         (from_others & ~lock->gcaps_allowed(CAP_ANY)) ||
+         (from_xlockers & ~lock->gcaps_allowed(CAP_XLOCKER));
+}
+
+
+// =============================================
+
+/**
+ * Encode this inode's stat record for a client reply and, for the head
+ * inode, issue caps to the client as a side effect.
+ *
+ * The field order matches MClientReply::InodeStat.  Clients advertising
+ * CEPHFS_FEATURE_REPLY_ENCODING get a versioned encoding; all others get the
+ * legacy layout whose trailing fields depend on connection feature bits.
+ *
+ * @param bl            buffer the record is appended to
+ * @param session       session of the client the reply is for
+ * @param dir_realm     if non-null and different from this inode's snaprealm,
+ *                      no caps are issued
+ * @param snapid        snapshot being described (CEPH_NOSNAP for the head);
+ *                      must be non-zero
+ * @param max_bytes     if non-zero, upper bound for the estimated record size
+ * @param getattr_caps  caps named by the request; FILE_RD / XATTR_SHARED
+ *                      force inline data / xattrs to be included
+ * @return -ENOSPC if the size estimate exceeds max_bytes (nothing is
+ *         appended to bl in that case); otherwise 1, or 0 when a snapshot
+ *         was requested on a non-auth inode (the "valid" flag)
+ */
+int CInode::encode_inodestat(bufferlist& bl, Session *session,
+                             SnapRealm *dir_realm,
+                             snapid_t snapid,
+                             unsigned max_bytes,
+                             int getattr_caps)
+{
+  client_t client = session->get_client();
+  ceph_assert(snapid);
+
+  bool valid = true;
+
+  // pick a version!
+  mempool_inode *oi = &inode;
+  mempool_inode *pi = get_projected_inode();
+
+  CInode::mempool_xattr_map *pxattrs = nullptr;
+
+  if (snapid != CEPH_NOSNAP) {
+
+    // for now at least, old_inodes is only defined/valid on the auth
+    if (!is_auth())
+      valid = false;
+
+    if (is_multiversion()) {
+      // old_inodes is keyed by the LAST snapid of each interval; find the
+      // interval [second.first, first] that contains snapid.
+      auto it = old_inodes.lower_bound(snapid);
+      if (it != old_inodes.end()) {
+        if (it->second.first > snapid) {
+          if (it != old_inodes.begin())
+            --it;
+        }
+        if (it->second.first <= snapid && snapid <= it->first) {
+          dout(15) << __func__ << " snapid " << snapid
+                   << " to old_inode [" << it->second.first << "," << it->first << "]"
+                   << " " << it->second.inode.rstat
+                   << dendl;
+          auto &p = it->second;
+          pi = oi = &p.inode;
+          pxattrs = &p.xattrs;
+        } else {
+          // a snapshotted remote dentry can result in this
+          dout(0) << __func__ << " old_inode for snapid " << snapid
+                  << " not found" << dendl;
+        }
+      }
+    } else if (snapid < first || snapid > last) {
+      // a snapshotted remote dentry can result in this
+      dout(0) << __func__ << " [" << first << "," << last << "]"
+              << " not match snapid " << snapid << dendl;
+    }
+  }
+
+  utime_t snap_btime;
+  SnapRealm *realm = find_snaprealm();
+  if (snapid != CEPH_NOSNAP && realm) {
+    // add snapshot timestamp vxattr
+    map<snapid_t,const SnapInfo*> infomap;
+    realm->get_snap_info(infomap,
+                         snapid,  // min
+                         snapid); // max
+    if (!infomap.empty()) {
+      ceph_assert(infomap.size() == 1);
+      const SnapInfo *si = infomap.begin()->second;
+      snap_btime = si->stamp;
+    }
+  }
+
+
+  bool no_caps = !valid ||
+                 session->is_stale() ||
+                 (dir_realm && realm != dir_realm) ||
+                 is_frozen() ||
+                 state_test(CInode::STATE_EXPORTINGCAPS);
+  if (no_caps)
+    dout(20) << __func__ << " no caps"
+             << (!valid?", !valid":"")
+             << (session->is_stale()?", session stale ":"")
+             << ((dir_realm && realm != dir_realm)?", snaprealm differs ":"")
+             << (is_frozen()?", frozen inode":"")
+             << (state_test(CInode::STATE_EXPORTINGCAPS)?", exporting caps":"")
+             << dendl;
+
+
+  // "fake" a version that is old (stable) version, +1 if projected.
+  version_t version = (oi->version * 2) + is_projected();
+
+  // For each lock domain, the client sees the projected inode (pi) if it
+  // holds the xlock or is the loner; otherwise the stable one (oi).
+  Capability *cap = get_client_cap(client);
+  bool pfile = filelock.is_xlocked_by_client(client) || get_loner() == client;
+  //(cap && (cap->issued() & CEPH_CAP_FILE_EXCL));
+  bool pauth = authlock.is_xlocked_by_client(client) || get_loner() == client;
+  bool plink = linklock.is_xlocked_by_client(client) || get_loner() == client;
+  bool pxattr = xattrlock.is_xlocked_by_client(client) || get_loner() == client;
+
+  bool plocal = versionlock.get_last_wrlock_client() == client;
+  bool ppolicy = policylock.is_xlocked_by_client(client) || get_loner()==client;
+
+  mempool_inode *any_i = (pfile|pauth|plink|pxattr|plocal) ? pi : oi;
+
+  dout(20) << " pfile " << pfile << " pauth " << pauth
+           << " plink " << plink << " pxattr " << pxattr
+           << " plocal " << plocal
+           << " ctime " << any_i->ctime
+           << " valid=" << valid << dendl;
+
+  // file
+  mempool_inode *file_i = pfile ? pi:oi;
+  file_layout_t layout;
+  if (is_dir()) {
+    layout = (ppolicy ? pi : oi)->layout;
+  } else {
+    layout = file_i->layout;
+  }
+
+  // max_size is min of projected, actual
+  uint64_t max_size =
+    std::min(oi->client_ranges.count(client) ?
+             oi->client_ranges[client].range.last : 0,
+             pi->client_ranges.count(client) ?
+             pi->client_ranges[client].range.last : 0);
+
+  // inline data
+  version_t inline_version = 0;
+  bufferlist inline_data;
+  if (file_i->inline_data.version == CEPH_INLINE_NONE) {
+    inline_version = CEPH_INLINE_NONE;
+  } else if ((!cap && !no_caps) ||
+             (cap && cap->client_inline_version < file_i->inline_data.version) ||
+             (getattr_caps & CEPH_CAP_FILE_RD)) { // client requests inline data
+    inline_version = file_i->inline_data.version;
+    if (file_i->inline_data.length() > 0)
+      inline_data = file_i->inline_data.get_data();
+  }
+
+  // nest (do same as file... :/)
+  if (cap) {
+    cap->last_rbytes = file_i->rstat.rbytes;
+    cap->last_rsize = file_i->rstat.rsize();
+  }
+
+  // auth
+  mempool_inode *auth_i = pauth ? pi:oi;
+
+  // link
+  mempool_inode *link_i = plink ? pi:oi;
+
+  // xattr
+  mempool_inode *xattr_i = pxattr ? pi:oi;
+
+  using ceph::encode;
+  // xattr: a zero xattr_version means "no xattrs included in this reply"
+  version_t xattr_version;
+  if ((!cap && !no_caps) ||
+      (cap && cap->client_xattr_version < xattr_i->xattr_version) ||
+      (getattr_caps & CEPH_CAP_XATTR_SHARED)) { // client requests xattrs
+    if (!pxattrs)
+      pxattrs = pxattr ? get_projected_xattrs() : &xattrs;
+    xattr_version = xattr_i->xattr_version;
+  } else {
+    xattr_version = 0;
+  }
+
+  // do we have room?  (must be decided before any state is modified below)
+  if (max_bytes) {
+    unsigned bytes =
+      8 + 8 + 4 + 8 + 8 + sizeof(ceph_mds_reply_cap) +
+      sizeof(struct ceph_file_layout) +
+      sizeof(struct ceph_timespec) * 3 + 4 + // ctime ~ time_warp_seq
+      8 + 8 + 8 + 4 + 4 + 4 + 4 + 4 + // size ~ nlink
+      8 + 8 + 8 + 8 + 8 + sizeof(struct ceph_timespec) + // dirstat.nfiles ~ rstat.rctime
+      sizeof(__u32) + sizeof(__u32) * 2 * dirfragtree._splits.size() + // dirfragtree
+      sizeof(__u32) + symlink.length() + // symlink
+      sizeof(struct ceph_dir_layout); // dir_layout
+
+    if (xattr_version) {
+      bytes += sizeof(__u32) + sizeof(__u32); // xattr buffer len + number entries
+      if (pxattrs) {
+        for (const auto &p : *pxattrs)
+          bytes += sizeof(__u32) * 2 + p.first.length() + p.second.length();
+      }
+    } else {
+      bytes += sizeof(__u32); // xattr buffer len
+    }
+    bytes +=
+      sizeof(version_t) + sizeof(__u32) + inline_data.length() + // inline data
+      1 + 1 + 8 + 8 + 4 + // quota
+      4 + layout.pool_ns.size() + // pool ns
+      sizeof(struct ceph_timespec) + 8; // btime + change_attr
+
+    if (bytes > max_bytes)
+      return -ENOSPC;
+  }
+
+
+  // encode caps
+  struct ceph_mds_reply_cap ecap;
+  if (snapid != CEPH_NOSNAP) {
+    /*
+     * snapped inodes (files or dirs) only get read-only caps.  always
+     * issue everything possible, since it is read only.
+     *
+     * if a snapped inode has caps, limit issued caps based on the
+     * lock state.
+     *
+     * if it is a live inode, limit issued caps based on the lock
+     * state.
+     *
+     * do NOT adjust cap issued state, because the client always
+     * tracks caps per-snap and the mds does either per-interval or
+     * multiversion.
+     */
+    ecap.caps = valid ? get_caps_allowed_by_type(CAP_ANY) : CEPH_STAT_CAP_INODE;
+    if (last == CEPH_NOSNAP || is_any_caps())
+      ecap.caps = ecap.caps & get_caps_allowed_for_client(session, nullptr, file_i);
+    ecap.seq = 0;
+    ecap.mseq = 0;
+    ecap.realm = 0;
+    // ecap is encoded as a whole below; zero the remaining fields so no
+    // indeterminate stack bytes end up in the reply.
+    ecap.cap_id = 0;
+    ecap.wanted = 0;
+  } else {
+    if (!no_caps && !cap) {
+      // add a new cap
+      cap = add_client_cap(client, session, realm);
+      if (is_auth())
+        choose_ideal_loner();
+    }
+
+    int issue = 0;
+    if (!no_caps && cap) {
+      int likes = get_caps_liked();
+      int allowed = get_caps_allowed_for_client(session, cap, file_i);
+      issue = (cap->wanted() | likes) & allowed;
+      cap->issue_norevoke(issue, true);
+      issue = cap->pending();
+      dout(10) << "encode_inodestat issuing " << ccap_string(issue)
+               << " seq " << cap->get_last_seq() << dendl;
+    } else if (cap && cap->is_new() && !dir_realm) {
+      // always issue new caps to client, otherwise the caps get lost
+      ceph_assert(cap->is_stale());
+      ceph_assert(!cap->pending());
+      issue = CEPH_CAP_PIN;
+      cap->issue_norevoke(issue, true);
+      dout(10) << "encode_inodestat issuing " << ccap_string(issue)
+               << " seq " << cap->get_last_seq()
+               << "(stale&new caps)" << dendl;
+    }
+
+    if (issue) {
+      cap->set_last_issue();
+      cap->set_last_issue_stamp(ceph_clock_now());
+      ecap.caps = issue;
+      ecap.wanted = cap->wanted();
+      ecap.cap_id = cap->get_cap_id();
+      ecap.seq = cap->get_last_seq();
+      ecap.mseq = cap->get_mseq();
+      ecap.realm = realm->inode->ino();
+    } else {
+      ecap.cap_id = 0;
+      ecap.caps = 0;
+      ecap.seq = 0;
+      ecap.mseq = 0;
+      ecap.realm = 0;
+      ecap.wanted = 0;
+    }
+  }
+  ecap.flags = is_auth() ? CEPH_CAP_FLAG_AUTH : 0;
+  dout(10) << "encode_inodestat caps " << ccap_string(ecap.caps)
+           << " seq " << ecap.seq << " mseq " << ecap.mseq
+           << " xattrv " << xattr_version << dendl;
+
+  // include the inline data?  only if the client will hold (or asked for)
+  // FILE_SHARED; remember on the cap which version was sent.
+  if (inline_data.length() && cap) {
+    if ((cap->pending() | getattr_caps) & CEPH_CAP_FILE_SHARED) {
+      dout(10) << "including inline version " << inline_version << dendl;
+      cap->client_inline_version = inline_version;
+    } else {
+      dout(10) << "dropping inline version " << inline_version << dendl;
+      inline_version = 0;
+      inline_data.clear();
+    }
+  }
+
+  // include those xattrs?
+  if (xattr_version && cap) {
+    if ((cap->pending() | getattr_caps) & CEPH_CAP_XATTR_SHARED) {
+      dout(10) << "including xattrs version " << xattr_version << dendl;
+      cap->client_xattr_version = xattr_version;
+    } else {
+      dout(10) << "dropping xattrs version " << xattr_version << dendl;
+      xattr_version = 0;
+    }
+  }
+
+  // The end result of encode_xattrs() is equivalent to:
+  // {
+  //   bufferlist xbl;
+  //   if (xattr_version) {
+  //     if (pxattrs)
+  //       encode(*pxattrs, xbl);
+  //     else
+  //       encode((__u32)0, xbl);
+  //   }
+  //   encode(xbl, bl);
+  // }
+  //
+  // But encoding xattrs into the 'xbl' requires a memory allocation.
+  // The 'bl' should have enough pre-allocated memory in most cases.
+  // Encoding xattrs directly into it can avoid the extra allocation.
+  auto encode_xattrs = [xattr_version, pxattrs, &bl]() {
+    using ceph::encode;
+    if (xattr_version) {
+      // reserve room for the length prefix, encode, then back-fill it
+      ceph_le32 xbl_len;
+      auto filler = bl.append_hole(sizeof(xbl_len));
+      const auto starting_bl_len = bl.length();
+      if (pxattrs)
+        encode(*pxattrs, bl);
+      else
+        encode((__u32)0, bl);
+      xbl_len = bl.length() - starting_bl_len;
+      filler.copy_in(sizeof(xbl_len), (char *)&xbl_len);
+    } else {
+      encode((__u32)0, bl);
+    }
+  };
+
+  /*
+   * note: encoding matches MClientReply::InodeStat
+   */
+  if (session->info.has_feature(CEPHFS_FEATURE_REPLY_ENCODING)) {
+    ENCODE_START(3, 1, bl);
+    encode(oi->ino, bl);
+    encode(snapid, bl);
+    encode(oi->rdev, bl);
+    encode(version, bl);
+    encode(xattr_version, bl);
+    encode(ecap, bl);
+    {
+      ceph_file_layout legacy_layout;
+      layout.to_legacy(&legacy_layout);
+      encode(legacy_layout, bl);
+    }
+    encode(any_i->ctime, bl);
+    encode(file_i->mtime, bl);
+    encode(file_i->atime, bl);
+    encode(file_i->time_warp_seq, bl);
+    encode(file_i->size, bl);
+    encode(max_size, bl);
+    encode(file_i->truncate_size, bl);
+    encode(file_i->truncate_seq, bl);
+    encode(auth_i->mode, bl);
+    encode((uint32_t)auth_i->uid, bl);
+    encode((uint32_t)auth_i->gid, bl);
+    encode(link_i->nlink, bl);
+    encode(file_i->dirstat.nfiles, bl);
+    encode(file_i->dirstat.nsubdirs, bl);
+    encode(file_i->rstat.rbytes, bl);
+    encode(file_i->rstat.rfiles, bl);
+    encode(file_i->rstat.rsubdirs, bl);
+    encode(file_i->rstat.rctime, bl);
+    dirfragtree.encode(bl);
+    encode(symlink, bl);
+    encode(file_i->dir_layout, bl);
+    encode_xattrs();
+    encode(inline_version, bl);
+    encode(inline_data, bl);
+    mempool_inode *policy_i = ppolicy ? pi : oi;
+    encode(policy_i->quota, bl);
+    encode(layout.pool_ns, bl);
+    encode(any_i->btime, bl);
+    encode(any_i->change_attr, bl);
+    encode(file_i->export_pin, bl);
+    encode(snap_btime, bl);
+    ENCODE_FINISH(bl);
+  }
+  else {
+    // legacy encoding: optional trailing fields are gated on the
+    // connection's feature bits
+    ceph_assert(session->get_connection());
+
+    encode(oi->ino, bl);
+    encode(snapid, bl);
+    encode(oi->rdev, bl);
+    encode(version, bl);
+    encode(xattr_version, bl);
+    encode(ecap, bl);
+    {
+      ceph_file_layout legacy_layout;
+      layout.to_legacy(&legacy_layout);
+      encode(legacy_layout, bl);
+    }
+    encode(any_i->ctime, bl);
+    encode(file_i->mtime, bl);
+    encode(file_i->atime, bl);
+    encode(file_i->time_warp_seq, bl);
+    encode(file_i->size, bl);
+    encode(max_size, bl);
+    encode(file_i->truncate_size, bl);
+    encode(file_i->truncate_seq, bl);
+    encode(auth_i->mode, bl);
+    encode((uint32_t)auth_i->uid, bl);
+    encode((uint32_t)auth_i->gid, bl);
+    encode(link_i->nlink, bl);
+    encode(file_i->dirstat.nfiles, bl);
+    encode(file_i->dirstat.nsubdirs, bl);
+    encode(file_i->rstat.rbytes, bl);
+    encode(file_i->rstat.rfiles, bl);
+    encode(file_i->rstat.rsubdirs, bl);
+    encode(file_i->rstat.rctime, bl);
+    dirfragtree.encode(bl);
+    encode(symlink, bl);
+    auto& conn = session->get_connection();
+    if (conn->has_feature(CEPH_FEATURE_DIRLAYOUTHASH)) {
+      encode(file_i->dir_layout, bl);
+    }
+    encode_xattrs();
+    if (conn->has_feature(CEPH_FEATURE_MDS_INLINE_DATA)) {
+      encode(inline_version, bl);
+      encode(inline_data, bl);
+    }
+    if (conn->has_feature(CEPH_FEATURE_MDS_QUOTA)) {
+      mempool_inode *policy_i = ppolicy ? pi : oi;
+      encode(policy_i->quota, bl);
+    }
+    if (conn->has_feature(CEPH_FEATURE_FS_FILE_LAYOUT_V2)) {
+      encode(layout.pool_ns, bl);
+    }
+    if (conn->has_feature(CEPH_FEATURE_FS_BTIME)) {
+      encode(any_i->btime, bl);
+      encode(any_i->change_attr, bl);
+    }
+  }
+
+  return valid;
+}
+
+/**
+ * Populate a client caps message with this inode's current attributes.
+ *
+ * For each group of fields the projected inode is used when the client
+ * holds the corresponding xlock (or, for file fields, has FILE_EXCL issued);
+ * otherwise the stable inode is used.  Inline data and xattrs are only
+ * attached when newer than what the cap records the client as having, and
+ * the cap's recorded versions are advanced accordingly.
+ *
+ * @param m    message to fill in
+ * @param cap  the client's capability on this inode; must be non-null
+ */
+void CInode::encode_cap_message(const MClientCaps::ref &m, Capability *cap)
+{
+  ceph_assert(cap);
+
+  client_t client = cap->get_client();
+
+  const bool pfile = filelock.is_xlocked_by_client(client) ||
+                     (cap->issued() & CEPH_CAP_FILE_EXCL);
+  const bool pauth = authlock.is_xlocked_by_client(client);
+  const bool plink = linklock.is_xlocked_by_client(client);
+  const bool pxattr = xattrlock.is_xlocked_by_client(client);
+
+  mempool_inode *oi = &inode;
+  mempool_inode *pi = get_projected_inode();
+  mempool_inode *any_i = (pfile|pauth|plink|pxattr) ? pi : oi;
+
+  dout(20) << __func__ << " pfile " << pfile
+           << " pauth " << pauth << " plink " << plink << " pxattr " << pxattr
+           << " ctime " << any_i->ctime << dendl;
+
+  // --- file-lock protected fields ---
+  mempool_inode *file_i = pfile ? pi : oi;
+  m->set_layout(file_i->layout);
+  m->size = file_i->size;
+  m->truncate_seq = file_i->truncate_seq;
+  m->truncate_size = file_i->truncate_size;
+  m->mtime = file_i->mtime;
+  m->atime = file_i->atime;
+  m->ctime = file_i->ctime;
+  m->change_attr = file_i->change_attr;
+  m->time_warp_seq = file_i->time_warp_seq;
+  m->nfiles = file_i->dirstat.nfiles;
+  m->nsubdirs = file_i->dirstat.nsubdirs;
+
+  if (cap->client_inline_version < file_i->inline_data.version) {
+    cap->client_inline_version = file_i->inline_data.version;
+    m->inline_version = cap->client_inline_version;
+    if (file_i->inline_data.length() > 0)
+      m->inline_data = file_i->inline_data.get_data();
+  } else {
+    m->inline_version = 0;
+  }
+
+  // max_size is min of projected, actual.
+  uint64_t stable_max = oi->client_ranges.count(client) ? oi->client_ranges[client].range.last : 0;
+  uint64_t projected_max = pi->client_ranges.count(client) ? pi->client_ranges[client].range.last : 0;
+  m->max_size = std::min(stable_max, projected_max);
+
+  // --- auth-lock protected fields ---
+  mempool_inode *auth_i = pauth ? pi : oi;
+  m->head.mode = auth_i->mode;
+  m->head.uid = auth_i->uid;
+  m->head.gid = auth_i->gid;
+
+  // --- link-lock protected fields ---
+  mempool_inode *link_i = plink ? pi : oi;
+  m->head.nlink = link_i->nlink;
+
+  // --- xattrs ---
+  using ceph::encode;
+  mempool_inode *xattr_i = pxattr ? pi : oi;
+  auto ix = pxattr ? get_projected_xattrs() : &xattrs;
+  const bool send_xattrs =
+    (cap->pending() & CEPH_CAP_XATTR_SHARED) &&
+    xattr_i->xattr_version > cap->client_xattr_version;
+  if (send_xattrs) {
+    dout(10) << " including xattrs v " << xattr_i->xattr_version << dendl;
+    encode(*ix, m->xattrbl);
+    m->head.xattr_version = xattr_i->xattr_version;
+    cap->client_xattr_version = xattr_i->xattr_version;
+  }
+}
+
+
+/* Encode the inode's base state into bl.
+ *
+ * Fields are written in a fixed order that _decode_base() reads back:
+ * first, inode, symlink, dirfragtree, xattrs, old_inodes, damage_flags and
+ * finally the snap blob written by encode_snap().
+ *
+ * features is forwarded to the encoders of inode and old_inodes.
+ */
+void CInode::_encode_base(bufferlist& bl, uint64_t features)
+{
+ using ceph::encode;
+ encode(first, bl);
+ encode(inode, bl, features); // feature-aware encoder
+ encode(symlink, bl);
+ encode(dirfragtree, bl);
+ encode(xattrs, bl);
+ encode(old_inodes, bl, features); // feature-aware encoder
+ encode(damage_flags, bl);
+ encode_snap(bl);
+}
+/**
+ * Decode the inode's base state from @p p.
+ *
+ * Reads the fields in the order _encode_base() wrote them: first, inode,
+ * symlink, dirfragtree, xattrs, old_inodes, damage_flags, snap blob.
+ */
+void CInode::_decode_base(bufferlist::const_iterator& p)
+{
+  using ceph::decode;
+
+  decode(first, p);
+  decode(inode, p);
+
+  // the symlink target is decoded into a temporary std::string and then
+  // assigned to the member through a string_view
+  std::string symlink_target;
+  decode(symlink_target, p);
+  symlink = std::string_view(symlink_target);
+
+  decode(dirfragtree, p);
+  decode_noshare(xattrs, p);
+  decode(old_inodes, p);
+  decode(damage_flags, p);
+  decode_snap(p);
+}
+/* Encode the full state of every inode lock, followed by loner_cap.
+ *
+ * Order: auth, link, dirfragtree, file, xattr, snap, nest, flock, policy.
+ * _decode_locks_full() reads the same sequence back.
+ */
+void CInode::_encode_locks_full(bufferlist& bl)
+{
+ using ceph::encode;
+ encode(authlock, bl);
+ encode(linklock, bl);
+ encode(dirfragtreelock, bl);
+ encode(filelock, bl);
+ encode(xattrlock, bl);
+ encode(snaplock, bl);
+ encode(nestlock, bl);
+ encode(flocklock, bl);
+ encode(policylock, bl);
+
+ encode(loner_cap, bl); // trailing field: the current loner client
+}
+void CInode::_decode_locks_full(bufferlist::const_iterator& p) // inverse of _encode_locks_full(): same lock order, then loner_cap
+{
+ using ceph::decode;
+ decode(authlock, p);
+ decode(linklock, p);
+ decode(dirfragtreelock, p);
+ decode(filelock, p);
+ decode(xattrlock, p);
+ decode(snaplock, p);
+ decode(nestlock, p);
+ decode(flocklock, p);
+ decode(policylock, p);
+
+ decode(loner_cap, p); // read straight into the member ...
+ set_loner_cap(loner_cap); // ... then passed through the setter as well
+ want_loner_cap = loner_cap; // for now, we'll eval() shortly.
+}
+/* Encode the replica-visible state of every inode lock, then need_recover.
+ *
+ * Order: auth, link, dirfragtree, file, nest, xattr, snap, flock, policy
+ * (note: differs from the order used by _encode_locks_full()).
+ * _decode_locks_state() consumes exactly this layout.
+ */
+void CInode::_encode_locks_state_for_replica(bufferlist& bl, bool need_recover)
+{
+ authlock.encode_state_for_replica(bl);
+ linklock.encode_state_for_replica(bl);
+ dirfragtreelock.encode_state_for_replica(bl);
+ filelock.encode_state_for_replica(bl);
+ nestlock.encode_state_for_replica(bl);
+ xattrlock.encode_state_for_replica(bl);
+ snaplock.encode_state_for_replica(bl);
+ flocklock.encode_state_for_replica(bl);
+ policylock.encode_state_for_replica(bl);
+ using ceph::encode;
+ encode(need_recover, bl); // trailing flag read by _decode_locks_state()
+}
+/* Encode lock state for a rejoin, in the same lock order as
+ * _encode_locks_state_for_replica() but without a trailing flag.
+ *
+ * dirfragtreelock, filelock and nestlock use encode_state_for_rejoin() and
+ * receive rep; the remaining locks use the plain replica encoding.
+ */
+void CInode::_encode_locks_state_for_rejoin(bufferlist& bl, int rep)
+{
+ authlock.encode_state_for_replica(bl);
+ linklock.encode_state_for_replica(bl);
+ dirfragtreelock.encode_state_for_rejoin(bl, rep);
+ filelock.encode_state_for_rejoin(bl, rep);
+ nestlock.encode_state_for_rejoin(bl, rep);
+ xattrlock.encode_state_for_replica(bl);
+ snaplock.encode_state_for_replica(bl);
+ flocklock.encode_state_for_replica(bl);
+ policylock.encode_state_for_replica(bl);
+}
+/* Decode the layout written by _encode_locks_state_for_replica():
+ * per-lock state (auth, link, dirfragtree, file, nest, xattr, snap, flock,
+ * policy) followed by a need_recover flag.
+ *
+ * is_new is forwarded to every lock's decode_state(); when it is set
+ * together with the decoded need_recover flag, every lock is marked as
+ * needing recovery.
+ */
+void CInode::_decode_locks_state(bufferlist::const_iterator& p, bool is_new)
+{
+ authlock.decode_state(p, is_new);
+ linklock.decode_state(p, is_new);
+ dirfragtreelock.decode_state(p, is_new);
+ filelock.decode_state(p, is_new);
+ nestlock.decode_state(p, is_new);
+ xattrlock.decode_state(p, is_new);
+ snaplock.decode_state(p, is_new);
+ flocklock.decode_state(p, is_new);
+ policylock.decode_state(p, is_new);
+
+ using ceph::decode;
+ bool need_recover;
+ decode(need_recover, p);
+ if (need_recover && is_new) {
+ // The auth mds replicated this inode while it was recovering. The auth mds may take an xlock
+ // on the lock and change the object when replaying unsafe requests.
+ authlock.mark_need_recover();
+ linklock.mark_need_recover();
+ dirfragtreelock.mark_need_recover();
+ filelock.mark_need_recover();
+ nestlock.mark_need_recover();
+ xattrlock.mark_need_recover();
+ snaplock.mark_need_recover();
+ flocklock.mark_need_recover();
+ policylock.mark_need_recover();
+ }
+}
+/**
+ * Decode per-lock rejoin state and collect locks that need evaluation.
+ *
+ * Locks are decoded in the order auth, link, dirfragtree, file, nest,
+ * xattr, snap, flock, policy.  Afterwards dirfragtreelock, filelock and
+ * nestlock are appended to @p eval_locks (in that order) if they are
+ * neither stable nor wrlocked.
+ *
+ * @param p           decode position
+ * @param waiters     forwarded to each lock's decode_state_rejoin()
+ * @param eval_locks  output list of locks to evaluate later
+ * @param survivor    forwarded to each lock's decode_state_rejoin()
+ */
+void CInode::_decode_locks_rejoin(bufferlist::const_iterator& p, MDSContext::vec& waiters,
+                                  list<SimpleLock*>& eval_locks, bool survivor)
+{
+  authlock.decode_state_rejoin(p, waiters, survivor);
+  linklock.decode_state_rejoin(p, waiters, survivor);
+  dirfragtreelock.decode_state_rejoin(p, waiters, survivor);
+  filelock.decode_state_rejoin(p, waiters, survivor);
+  nestlock.decode_state_rejoin(p, waiters, survivor);
+  xattrlock.decode_state_rejoin(p, waiters, survivor);
+  snaplock.decode_state_rejoin(p, waiters, survivor);
+  flocklock.decode_state_rejoin(p, waiters, survivor);
+  policylock.decode_state_rejoin(p, waiters, survivor);
+
+  // queue a lock for evaluation when it is unstable and not wrlocked
+  const auto queue_if_unsettled = [&eval_locks](auto &lock) {
+    if (lock.is_stable() || lock.is_wrlocked())
+      return;
+    eval_locks.push_back(&lock);
+  };
+  queue_if_unsettled(dirfragtreelock);
+  queue_if_unsettled(filelock);
+  queue_if_unsettled(nestlock);
+}
+
+
+// IMPORT/EXPORT
+
+/**
+ * Encode everything needed to hand this inode to an importing MDS.
+ *
+ * Layout (struct version 5, compat 4): base state, state flags, pop,
+ * replica map, a nested buffer holding fragstat/rstat for every dirfrag in
+ * STATE_EXPORTBOUND, full lock state, file locks.  decode_import() reads
+ * the same sequence.
+ *
+ * Takes PIN_TEMPEXPORTING on the inode; finish_export() drops it.
+ */
+void CInode::encode_export(bufferlist& bl)
+{
+  ENCODE_START(5, 4, bl);
+  _encode_base(bl, mdcache->mds->mdsmap->get_up_features());
+
+  encode(state, bl);
+  encode(pop, bl);
+  encode(get_replicas(), bl);
+
+  // include scatterlock info for any bounding CDirs
+  bufferlist bound_stats;
+  if (inode.is_dir()) {
+    for (const auto &entry : dirfrags) {
+      CDir *dir = entry.second;
+      if (!dir->state_test(CDir::STATE_EXPORTBOUND))
+        continue;
+      encode(entry.first, bound_stats);
+      encode(dir->fnode.fragstat, bound_stats);
+      encode(dir->fnode.accounted_fragstat, bound_stats);
+      encode(dir->fnode.rstat, bound_stats);
+      encode(dir->fnode.accounted_rstat, bound_stats);
+      dout(10) << " encoded fragstat/rstat info for " << *dir << dendl;
+    }
+  }
+  encode(bound_stats, bl);
+
+  _encode_locks_full(bl);
+  _encode_file_locks(bl);
+
+  ENCODE_FINISH(bl);
+
+  get(PIN_TEMPEXPORTING);
+}
+/* Clean up local state once an export of this inode has completed:
+ * keep only the state bits in MASK_STATE_EXPORT_KEPT, zero pop, forget the
+ * loner, and drop the PIN_TEMPEXPORTING reference taken by encode_export().
+ */
+void CInode::finish_export()
+{
+ state &= MASK_STATE_EXPORT_KEPT; // clear every state bit outside the kept mask
+
+ pop.zero();
+
+ // just in case!
+ //dirlock.clear_updated();
+
+ loner_cap = -1; // no loner is recorded after the export
+
+ put(PIN_TEMPEXPORTING); // balances the get() in encode_export()
+}
+/* Decode the payload produced by encode_export() and take over the inode.
+ *
+ * Reads, in order: base state, state flags, pop, replica map, the nested
+ * "bounding" buffer with per-dirfrag fragstat/rstat, full lock state and
+ * file locks.
+ *
+ * ls is the log segment passed to _mark_dirty() / mark_dirty_parent() when
+ * the imported state flags say the inode is dirty.
+ */
+void CInode::decode_import(bufferlist::const_iterator& p,
+ LogSegment *ls)
+{
+ DECODE_START(5, p);
+
+ _decode_base(p);
+
+ unsigned s;
+ decode(s, p);
+ state_set(STATE_AUTH | (s & MASK_STATE_EXPORTED)); // set STATE_AUTH plus the exporter's bits that fall inside MASK_STATE_EXPORTED
+
+ if (is_dirty()) {
+ get(PIN_DIRTY);
+ _mark_dirty(ls);
+ }
+ if (is_dirty_parent()) {
+ get(PIN_DIRTYPARENT);
+ mark_dirty_parent(ls);
+ }
+
+ decode(pop, p);
+
+ decode(get_replicas(), p); // decoded directly into the replica map
+ if (is_replicated())
+ get(PIN_REPLICATED);
+ replica_nonce = 0;
+
+ // decode fragstat info on bounding cdirs: the nested buffer holds, per
+ // dirfrag, frag id, fragstat, accounted_fragstat, rstat, accounted_rstat
+ bufferlist bounding;
+ decode(bounding, p);
+ auto q = bounding.cbegin();
+ while (!q.end()) {
+ frag_t fg;
+ decode(fg, q);
+ CDir *dir = get_dirfrag(fg);
+ ceph_assert(dir); // we should have all bounds open
+
+ // Only take the remote's fragstat/rstat if we are non-auth for
+ // this dirfrag AND the lock is NOT in a scattered (MIX) state.
+ // We know lock is stable, and MIX is the only state in which
+ // the inode auth (who sent us this data) may not have the best
+ // info.
+
+ // HMM: Are there cases where dir->is_auth() is an insufficient
+ // check because the dirfrag is under migration? That implies
+ // it is frozen (and in a SYNC or LOCK state). FIXME.
+
+ if (dir->is_auth() ||
+ filelock.get_state() == LOCK_MIX) {
+ dout(10) << " skipped fragstat info for " << *dir << dendl;
+ frag_info_t f; // scratch target: the two values must still be consumed from q
+ decode(f, q);
+ decode(f, q);
+ } else {
+ decode(dir->fnode.fragstat, q);
+ decode(dir->fnode.accounted_fragstat, q);
+ dout(10) << " took fragstat info for " << *dir << dendl;
+ }
+ if (dir->is_auth() ||
+ nestlock.get_state() == LOCK_MIX) {
+ dout(10) << " skipped rstat info for " << *dir << dendl;
+ nest_info_t n; // scratch target: the two values must still be consumed from q
+ decode(n, q);
+ decode(n, q);
+ } else {
+ decode(dir->fnode.rstat, q);
+ decode(dir->fnode.accounted_rstat, q);
+ dout(10) << " took rstat info for " << *dir << dendl;
+ }
+ }
+
+ _decode_locks_full(p);
+
+ _decode_file_locks(p);
+
+ DECODE_FINISH(p);
+}
+
+
+/**
+ * Dump the stored inode through a Formatter: the inode's own fields, the
+ * symlink target, an "old_inodes" array and a "dirfragtree" object.
+ */
+void InodeStoreBase::dump(Formatter *f) const
+{
+  inode.dump(f);
+  f->dump_string("symlink", symlink);
+
+  f->open_array_section("old_inodes");
+  for (auto it = old_inodes.begin(); it != old_inodes.end(); ++it) {
+    f->open_object_section("old_inode");
+    // The map key is the last snapid; the first snapid lives inside the
+    // mempool_old_inode value and is emitted by its own dump().
+    f->dump_int("last", it->first);
+    it->second.dump(f);
+    f->close_section();  // old_inode
+  }
+  f->close_section();  // old_inodes
+
+  f->open_object_section("dirfragtree");
+  dirfragtree.dump(f);
+  f->close_section();  // dirfragtree
+}
+
+
+/**
+ * Append one heap-allocated sample InodeStore to @p ls
+ * (ino 0xdeadbeef, symlink "rhubarb").
+ */
+void InodeStore::generate_test_instances(list<InodeStore*> &ls)
+{
+  InodeStore *sample = new InodeStore;
+  sample->symlink = "rhubarb";
+  sample->inode.ino = 0xdeadbeef;
+  ls.push_back(sample);
+}
+
+/**
+ * Append one heap-allocated sample InodeStoreBare to @p ls
+ * (ino 0xdeadbeef, symlink "rhubarb").
+ */
+void InodeStoreBare::generate_test_instances(list<InodeStoreBare*> &ls)
+{
+  InodeStoreBare *sample = new InodeStoreBare;
+  sample->symlink = "rhubarb";
+  sample->inode.ino = 0xdeadbeef;
+  ls.push_back(sample);
+}
+
+void CInode::validate_disk_state(CInode::validated_data *results,
+ MDSContext *fin)
+{
+ class ValidationContinuation : public MDSContinuation {
+ public:
+ MDSContext *fin;
+ CInode *in;
+ CInode::validated_data *results;
+ bufferlist bl;
+ CInode *shadow_in;
+
+ enum {
+ START = 0,
+ BACKTRACE,
+ INODE,
+ DIRFRAGS,
+ SNAPREALM,
+ };
+
+ ValidationContinuation(CInode *i,
+ CInode::validated_data *data_r,
+ MDSContext *fin_) :
+ MDSContinuation(i->mdcache->mds->server),
+ fin(fin_),
+ in(i),
+ results(data_r),
+ shadow_in(NULL) {
+ set_callback(START, static_cast<Continuation::stagePtr>(&ValidationContinuation::_start));
+ set_callback(BACKTRACE, static_cast<Continuation::stagePtr>(&ValidationContinuation::_backtrace));
+ set_callback(INODE, static_cast<Continuation::stagePtr>(&ValidationContinuation::_inode_disk));
+ set_callback(DIRFRAGS, static_cast<Continuation::stagePtr>(&ValidationContinuation::_dirfrags));
+ set_callback(SNAPREALM, static_cast<Continuation::stagePtr>(&ValidationContinuation::_snaprealm));
+ }
+
+ ~ValidationContinuation() override {
+ if (shadow_in) {
+ delete shadow_in;
+ in->mdcache->num_shadow_inodes--;
+ }
+ }
+
+ /**
+ * Fetch backtrace and set tag if tag is non-empty
+ */
+ void fetch_backtrace_and_tag(CInode *in,
+ std::string_view tag, bool is_internal,
+ Context *fin, int *bt_r, bufferlist *bt)
+ {
+ const int64_t pool = in->get_backtrace_pool();
+ object_t oid = CInode::get_object_name(in->ino(), frag_t(), "");
+
+ ObjectOperation fetch;
+ fetch.getxattr("parent", bt, bt_r);
+ in->mdcache->mds->objecter->read(oid, object_locator_t(pool), fetch, CEPH_NOSNAP,
+ NULL, 0, fin);
+ using ceph::encode;
+ if (!is_internal) {
+ ObjectOperation scrub_tag;
+ bufferlist tag_bl;
+ encode(tag, tag_bl);
+ scrub_tag.setxattr("scrub_tag", tag_bl);
+ SnapContext snapc;
+ in->mdcache->mds->objecter->mutate(oid, object_locator_t(pool), scrub_tag, snapc,
+ ceph::real_clock::now(),
+ 0, NULL);
+ }
+ }
+
+ bool _start(int rval) {
+ if (in->is_dirty()) {
+ MDCache *mdcache = in->mdcache;
+ mempool_inode& inode = in->inode;
+ dout(20) << "validating a dirty CInode; results will be inconclusive"
+ << dendl;
+ }
+ if (in->is_symlink()) {
+ // there's nothing to do for symlinks!
+ return true;
+ }
+
+ // prefetch snaprealm's past parents
+ if (in->snaprealm && !in->snaprealm->have_past_parents_open())
+ in->snaprealm->open_parents(nullptr);
+
+ C_OnFinisher *conf = new C_OnFinisher(get_io_callback(BACKTRACE),
+ in->mdcache->mds->finisher);
+
+ std::string_view tag = in->scrub_infop->header->get_tag();
+ bool is_internal = in->scrub_infop->header->is_internal_tag();
+ // Rather than using the usual CInode::fetch_backtrace,
+ // use a special variant that optionally writes a tag in the same
+ // operation.
+ fetch_backtrace_and_tag(in, tag, is_internal, conf, &results->backtrace.ondisk_read_retval, &bl);
+ return false;
+ }
+
+ bool _backtrace(int rval) {
+ // set up basic result reporting and make sure we got the data
+ results->performed_validation = true; // at least, some of it!
+ results->backtrace.checked = true;
+
+ const int64_t pool = in->get_backtrace_pool();
+ inode_backtrace_t& memory_backtrace = results->backtrace.memory_value;
+ in->build_backtrace(pool, memory_backtrace);
+ bool equivalent, divergent;
+ int memory_newer;
+
+ MDCache *mdcache = in->mdcache; // For the benefit of dout
+ const mempool_inode& inode = in->inode; // For the benefit of dout
+
+ // Ignore rval because it's the result of a FAILOK operation
+ // from fetch_backtrace_and_tag: the real result is in
+ // backtrace.ondisk_read_retval
+ dout(20) << "ondisk_read_retval: " << results->backtrace.ondisk_read_retval << dendl;
+ if (results->backtrace.ondisk_read_retval != 0) {
+ results->backtrace.error_str << "failed to read off disk; see retval";
+ // we probably have a new unwritten file!
+ // so skip the backtrace scrub for this entry and say that all's well
+ if (in->is_dirty_parent())
+ results->backtrace.passed = true;
+ goto next;
+ }
+
+ // extract the backtrace, and compare it to a newly-constructed one
+ try {
+ auto p = bl.cbegin();
+ using ceph::decode;
+ decode(results->backtrace.ondisk_value, p);
+ dout(10) << "decoded " << bl.length() << " bytes of backtrace successfully" << dendl;
+ } catch (buffer::error&) {
+ if (results->backtrace.ondisk_read_retval == 0 && rval != 0) {
+ // Cases where something has clearly gone wrong with the overall
+ // fetch op, though we didn't get a nonzero rc from the getxattr
+ // operation. e.g. object missing.
+ results->backtrace.ondisk_read_retval = rval;
+ }
+ results->backtrace.error_str << "failed to decode on-disk backtrace ("
+ << bl.length() << " bytes)!";
+ // we probably have a new unwritten file!
+ // so skip the backtrace scrub for this entry and say that all's well
+ if (in->is_dirty_parent())
+ results->backtrace.passed = true;
+
+ goto next;
+ }
+
+ memory_newer = memory_backtrace.compare(results->backtrace.ondisk_value,
+ &equivalent, &divergent);
+
+ if (divergent || memory_newer < 0) {
+ // we're divergent, or on-disk version is newer
+ results->backtrace.error_str << "On-disk backtrace is divergent or newer";
+ // we probably have a new unwritten file!
+ // so skip the backtrace scrub for this entry and say that all's well
+ if (divergent && in->is_dirty_parent())
+ results->backtrace.passed = true;
+ } else {
+ results->backtrace.passed = true;
+ }
+next:
+
+ if (!results->backtrace.passed && in->scrub_infop->header->get_repair()) {
+ std::string path;
+ in->make_path_string(path);
+ in->mdcache->mds->clog->warn() << "bad backtrace on inode " << in->ino()
+ << "(" << path << "), rewriting it";
+ in->mark_dirty_parent(in->mdcache->mds->mdlog->get_current_segment(),
+ false);
+ // Flag that we repaired this BT so that it won't go into damagetable
+ results->backtrace.repaired = true;
+ }
+
+ // If the inode's number was free in the InoTable, fix that
+ // (#15619)
+ {
+ InoTable *inotable = mdcache->mds->inotable;
+
+ dout(10) << "scrub: inotable ino = " << inode.ino << dendl;
+ dout(10) << "scrub: inotable free says "
+ << inotable->is_marked_free(inode.ino) << dendl;
+
+ if (inotable->is_marked_free(inode.ino)) {
+ LogChannelRef clog = in->mdcache->mds->clog;
+ clog->error() << "scrub: inode wrongly marked free: " << inode.ino;
+
+ if (in->scrub_infop->header->get_repair()) {
+ bool repaired = inotable->repair(inode.ino);
+ if (repaired) {
+ clog->error() << "inode table repaired for inode: " << inode.ino;
+
+ inotable->save();
+ } else {
+ clog->error() << "Cannot repair inotable while other operations"
+ " are in progress";
+ }
+ }
+ }
+ }
+
+
+ if (in->is_dir()) {
+ return validate_directory_data();
+ } else {
+ // TODO: validate on-disk inode for normal files
+ return check_inode_snaprealm();
+ }
+ }
+
+ bool validate_directory_data() {
+ ceph_assert(in->is_dir());
+
+ if (in->is_base()) {
+ if (!shadow_in) {
+ shadow_in = new CInode(in->mdcache);
+ in->mdcache->create_unlinked_system_inode(shadow_in, in->inode.ino, in->inode.mode);
+ in->mdcache->num_shadow_inodes++;
+ }
+ shadow_in->fetch(get_internal_callback(INODE));
+ return false;
+ } else {
+ // TODO: validate on-disk inode for non-base directories
+ results->inode.passed = true;
+ return check_dirfrag_rstats();
+ }
+ }
+
+ /**
+  * INODE stage: compare the inode fetched into shadow_in with the
+  * in-memory inode and record the outcome in results->inode.
+  *
+  * @param rval result of the shadow inode fetch; recorded, not acted on
+  * @returns whatever check_dirfrag_rstats() returns
+  */
+ bool _inode_disk(int rval) {
+   auto &report = results->inode;
+   report.checked = true;
+   report.ondisk_read_retval = rval;
+   report.ondisk_value = shadow_in->inode;
+   report.memory_value = in->inode;
+
+   mempool_inode& ondisk = shadow_in->inode;
+   mempool_inode& cached = in->inode;
+   if (ondisk.version > cached.version) {
+     // uh, what?
+     report.error_str << "On-disk inode is newer than in-memory one; ";
+   } else {
+     bool divergent = false;
+     const int r = cached.compare(ondisk, &divergent);
+     report.passed = !divergent && r >= 0;
+     if (!report.passed) {
+       report.error_str <<
+         "On-disk inode is divergent or newer than in-memory one; ";
+     }
+   }
+   // Every path continues with the dirfrag statistics.
+   return check_dirfrag_rstats();
+ }
+
+ /**
+  * Run scrub_local() on every leaf dirfrag of the inode. Complete
+  * dirfrags are scrubbed right away; incomplete ones are flagged with
+  * need_scrub_local and fetched first.
+  *
+  * @returns false if at least one fetch is outstanding (the DIRFRAGS
+  *          callback is installed as the gather finisher), otherwise the
+  *          result of running the DIRFRAGS stage immediately
+  */
+ bool check_dirfrag_rstats() {
+   MDSGatherBuilder gather(g_ceph_context);
+   frag_vec_t leaves;
+   in->dirfragtree.get_leaves(leaves);
+   for (const auto& fg : leaves) {
+     CDir *frag_dir = in->get_or_open_dirfrag(in->mdcache, fg);
+     frag_dir->scrub_info();
+     // Dirfrags without a scrub header of their own share the inode's.
+     if (!frag_dir->scrub_infop->header)
+       frag_dir->scrub_infop->header = in->scrub_infop->header;
+     if (!frag_dir->is_complete()) {
+       frag_dir->scrub_infop->need_scrub_local = true;
+       frag_dir->fetch(gather.new_sub(), false);
+       continue;
+     }
+     frag_dir->scrub_local();
+   }
+
+   if (!gather.has_subs())
+     return immediate(DIRFRAGS, 0);
+
+   gather.set_finisher(get_internal_callback(DIRFRAGS));
+   gather.activate();
+   return false;
+ }
+
+ /**
+  * DIRFRAGS stage: add up the accounted fragstat/rstat of every dirfrag
+  * and compare the sums with the inode's dirstat/rstat, recording the
+  * outcome in results->raw_stats.
+  *
+  * @param rval result of loading the dirfrags; non-zero skips the check
+  * @returns whatever check_inode_snaprealm() returns (it runs on every path)
+  */
+ bool _dirfrags(int rval) {
+   bool frag_error_seen = false;
+   // basic reporting setup
+   results->raw_stats.checked = true;
+   results->raw_stats.ondisk_read_retval = rval;
+
+   results->raw_stats.memory_value.dirstat = in->inode.dirstat;
+   results->raw_stats.memory_value.rstat = in->inode.rstat;
+   frag_info_t& dir_info = results->raw_stats.ondisk_value.dirstat;
+   nest_info_t& nest_info = results->raw_stats.ondisk_value.rstat;
+
+   if (rval != 0) {
+     results->raw_stats.error_str << "Failed to read dirfrags off disk";
+     return check_inode_snaprealm();
+   }
+
+   // check each dirfrag...
+   for (const auto &entry : in->dirfrags) {
+     CDir *dir = entry.second;
+     ceph_assert(dir->get_version() > 0);
+     nest_info.add(dir->fnode.accounted_rstat);
+     dir_info.add(dir->fnode.accounted_fragstat);
+     if (!dir->scrub_infop->pending_scrub_error)
+       continue;
+
+     // consume the error flagged on this dirfrag
+     dir->scrub_infop->pending_scrub_error = false;
+     if (dir->scrub_infop->header->get_repair()) {
+       results->raw_stats.repaired = true;
+       results->raw_stats.error_str
+         << "dirfrag(" << entry.first << ") has bad stats (will be fixed); ";
+     } else {
+       results->raw_stats.error_str
+         << "dirfrag(" << entry.first << ") has bad stats; ";
+     }
+     frag_error_seen = true;
+   }
+   nest_info.rsubdirs++; // it gets one to account for self
+   if (const sr_t *srnode = in->get_projected_srnode(); srnode)
+     nest_info.rsnaps += srnode->snaps.size();
+
+   // ...and that their sum matches our inode settings
+   const bool sums_match = dir_info.same_sums(in->inode.dirstat) &&
+                           nest_info.same_sums(in->inode.rstat);
+   if (!sums_match) {
+     if (in->scrub_infop->header->get_repair()) {
+       results->raw_stats.error_str
+         << "freshly-calculated rstats don't match existing ones (will be fixed)";
+       in->mdcache->repair_inode_stats(in);
+       results->raw_stats.repaired = true;
+     } else {
+       results->raw_stats.error_str
+         << "freshly-calculated rstats don't match existing ones";
+     }
+   } else if (!frag_error_seen) {
+     results->raw_stats.passed = true;
+   }
+
+   // snaprealm
+   return check_inode_snaprealm();
+ }
+
+ /**
+  * Enter the SNAPREALM stage if the inode has a snaprealm.
+  *
+  * @returns true when there is no snaprealm, false when past parents
+  *          must be opened first, otherwise the result of running the
+  *          SNAPREALM stage immediately
+  */
+ bool check_inode_snaprealm() {
+   SnapRealm *realm = in->snaprealm;
+   if (realm == nullptr)
+     return true;
+
+   if (realm->have_past_parents_open())
+     return immediate(SNAPREALM, 0);
+
+   realm->open_parents(get_internal_callback(SNAPREALM));
+   return false;
+ }
+
+ /**
+  * SNAPREALM stage: report (and, when repair is enabled, upgrade) a
+  * snaprealm whose past parents are dirty or non-empty.
+  *
+  * @param rval not used
+  * @returns true always
+  */
+ bool _snaprealm(int rval) {
+   const bool old_format =
+     in->snaprealm->past_parents_dirty ||
+     !in->get_projected_srnode()->past_parents.empty();
+   if (!old_format)
+     return true;
+
+   // for now the error is stored in the on-disk inode validation field
+   results->inode.checked = true;
+   results->inode.passed = false;
+   if (in->scrub_infop->header->get_repair()) {
+     results->inode.error_str << "Inode has old format snaprealm (will upgrade)";
+     results->inode.repaired = true;
+     in->mdcache->upgrade_inode_snaprealm(in);
+   } else {
+     results->inode.error_str << "Inode has old format snaprealm";
+   }
+   return true;
+ }
+
+ /**
+  * Final step: derive passed_validation, tell the scrub header when any
+  * repair happened, and complete the caller's context with get_rval().
+  */
+ void _done() override {
+   // A section counts as fine if it was never checked or it passed.
+   const auto fine = [](const auto &section) {
+     return !section.checked || section.passed;
+   };
+   if (fine(results->raw_stats) &&
+       fine(results->backtrace) &&
+       fine(results->inode)) {
+     results->passed_validation = true;
+   }
+
+   // Flag that we did some repair work so that our repair operation
+   // can be flushed at end of scrub
+   const bool did_repair = results->backtrace.repaired ||
+                           results->inode.repaired ||
+                           results->raw_stats.repaired;
+   if (did_repair)
+     in->scrub_infop->header->set_repaired();
+
+   if (fin)
+     fin->complete(get_rval());
+ }
+ };
+
+
+ dout(10) << "scrub starting validate_disk_state on " << *this << dendl;
+ ValidationContinuation *vc = new ValidationContinuation(this,
+ results,
+ fin);
+ vc->begin();
+}
+
+/**
+ * Write the validation results to a Formatter as a "results" object
+ * holding the overall flags, the "backtrace" and "raw_stats" sections and
+ * a "return_code".
+ *
+ * NOTE(review): the keys "memoryvalue" and "memory_value.dirrstat" look
+ * like typos, and no "inode" section is emitted. They are left untouched
+ * because consumers may depend on the current output.
+ */
+void CInode::validated_data::dump(Formatter *f) const
+{
+  f->open_object_section("results");
+
+  f->dump_bool("performed_validation", performed_validation);
+  f->dump_bool("passed_validation", passed_validation);
+
+  f->open_object_section("backtrace");
+  f->dump_bool("checked", backtrace.checked);
+  f->dump_bool("passed", backtrace.passed);
+  f->dump_int("read_ret_val", backtrace.ondisk_read_retval);
+  f->dump_stream("ondisk_value") << backtrace.ondisk_value;
+  f->dump_stream("memoryvalue") << backtrace.memory_value;
+  f->dump_string("error_str", backtrace.error_str.str());
+  f->close_section(); // backtrace
+
+  f->open_object_section("raw_stats");
+  f->dump_bool("checked", raw_stats.checked);
+  f->dump_bool("passed", raw_stats.passed);
+  f->dump_int("read_ret_val", raw_stats.ondisk_read_retval);
+  f->dump_stream("ondisk_value.dirstat") << raw_stats.ondisk_value.dirstat;
+  f->dump_stream("ondisk_value.rstat") << raw_stats.ondisk_value.rstat;
+  f->dump_stream("memory_value.dirrstat") << raw_stats.memory_value.dirstat;
+  f->dump_stream("memory_value.rstat") << raw_stats.memory_value.rstat;
+  f->dump_string("error_str", raw_stats.error_str.str());
+  f->close_section(); // raw_stats
+
+  // dump failure return code; a later failing section overrides an earlier one
+  int rc = 0;
+  const auto note_failure = [&rc](bool checked, int retval) {
+    if (checked && retval)
+      rc = retval;
+  };
+  note_failure(backtrace.checked, backtrace.ondisk_read_retval);
+  note_failure(inode.checked, inode.ondisk_read_retval);
+  note_failure(raw_stats.checked, raw_stats.ondisk_read_retval);
+  f->dump_int("return_code", rc);
+
+  f->close_section(); // results
+}
+
+/**
+ * @returns true when every section that was checked either passed or was
+ *          repaired
+ */
+bool CInode::validated_data::all_damage_repaired() const
+{
+  // A section is settled unless it was checked, failed and was not repaired.
+  const auto settled = [](const auto &section) {
+    return !section.checked || section.passed || section.repaired;
+  };
+
+  return settled(raw_stats) && settled(backtrace) && settled(inode);
+}
+
+/**
+ * Dump this inode to a Formatter.
+ *
+ * @param f     output formatter
+ * @param flags bitwise OR of the DUMP_* flags choosing the sections to emit
+ *
+ * NOTE(review): the "states" array does not report every STATE_* flag
+ * (e.g. STATE_RECOVERING, STATE_REPAIRSTATS); confirm whether that is
+ * intentional.
+ */
+void CInode::dump(Formatter *f, int flags) const
+{
+  if (flags & DUMP_PATH) {
+    std::string path;
+    make_path_string(path, true);
+    if (path.empty())
+      path = "/";
+    f->dump_string("path", path);
+  }
+
+  if (flags & DUMP_INODE_STORE_BASE)
+    InodeStoreBase::dump(f);
+
+  if (flags & DUMP_MDS_CACHE_OBJECT)
+    MDSCacheObject::dump(f);
+
+  if (flags & DUMP_LOCKS) {
+    // Each lock goes into its own object section named after the lock.
+    const auto dump_lock = [f](const char *section, const auto &lock) {
+      f->open_object_section(section);
+      lock.dump(f);
+      f->close_section();
+    };
+    dump_lock("versionlock", versionlock);
+    dump_lock("authlock", authlock);
+    dump_lock("linklock", linklock);
+    dump_lock("dirfragtreelock", dirfragtreelock);
+    dump_lock("filelock", filelock);
+    dump_lock("xattrlock", xattrlock);
+    dump_lock("snaplock", snaplock);
+    dump_lock("nestlock", nestlock);
+    dump_lock("flocklock", flocklock);
+    dump_lock("policylock", policylock);
+  }
+
+  if (flags & DUMP_STATE) {
+    const struct {
+      int bit;
+      const char *label;
+    } state_labels[] = {
+      {STATE_EXPORTING, "exporting"},
+      {STATE_OPENINGDIR, "openingdir"},
+      {STATE_FREEZING, "freezing"},
+      {STATE_FROZEN, "frozen"},
+      {STATE_AMBIGUOUSAUTH, "ambiguousauth"},
+      {STATE_EXPORTINGCAPS, "exportingcaps"},
+      {STATE_NEEDSRECOVER, "needsrecover"},
+      {STATE_PURGING, "purging"},
+      {STATE_DIRTYPARENT, "dirtyparent"},
+      {STATE_DIRTYRSTAT, "dirtyrstat"},
+      {STATE_STRAYPINNED, "straypinned"},
+      {STATE_FROZENAUTHPIN, "frozenauthpin"},
+      {STATE_DIRTYPOOL, "dirtypool"},
+      {STATE_ORPHAN, "orphan"},
+      {STATE_MISSINGOBJS, "missingobjs"},
+    };
+
+    f->open_array_section("states");
+    MDSCacheObject::dump_states(f);
+    for (const auto &entry : state_labels) {
+      if (state_test(entry.bit))
+        f->dump_string("state", entry.label);
+    }
+    f->close_section();
+  }
+
+  if (flags & DUMP_CAPS) {
+    f->open_array_section("client_caps");
+    for (const auto &entry : client_caps) {
+      const auto &client = entry.first;
+      const auto &cap = entry.second;
+      f->open_object_section("client_cap");
+      f->dump_int("client_id", client.v);
+      f->dump_string("pending", ccap_string(cap.pending()));
+      f->dump_string("issued", ccap_string(cap.issued()));
+      f->dump_string("wanted", ccap_string(cap.wanted()));
+      f->dump_int("last_sent", cap.get_last_seq());
+      f->close_section();
+    }
+    f->close_section();
+
+    f->dump_int("loner", loner_cap.v);
+    f->dump_int("want_loner", want_loner_cap.v);
+
+    f->open_array_section("mds_caps_wanted");
+    for (const auto &wanted : mds_caps_wanted) {
+      f->open_object_section("mds_cap_wanted");
+      f->dump_int("rank", wanted.first);
+      f->dump_string("cap", ccap_string(wanted.second));
+      f->close_section();
+    }
+    f->close_section();
+  }
+
+  if (flags & DUMP_DIRFRAGS) {
+    f->open_array_section("dirfrags");
+    list<CDir*> frags;
+    get_dirfrags(frags);
+    for (const auto &frag : frags) {
+      f->open_object_section("dir");
+      frag->dump(f, CDir::DUMP_DEFAULT | CDir::DUMP_ITEMS);
+      frag->check_rstats();
+      f->close_section();
+    }
+    f->close_section();
+  }
+}
+
+/****** Scrub Stuff *****/
+/**
+ * Allocate scrub_infop and seed its start/last stamps and versions from
+ * the projected inode. Must only be called while scrub_infop is null.
+ */
+void CInode::scrub_info_create() const
+{
+  dout(25) << __func__ << dendl;
+  ceph_assert(!scrub_infop);
+
+  // break out of const-land to set up implicit initial state
+  CInode *self = const_cast<CInode*>(this);
+  mempool_inode *pi = self->get_projected_inode();
+
+  scrub_info_t *info = new scrub_info_t();
+  info->last_scrub_stamp = pi->last_scrub_stamp;
+  info->scrub_start_stamp = info->last_scrub_stamp;
+  info->last_scrub_version = pi->last_scrub_version;
+  info->scrub_start_version = info->last_scrub_version;
+
+  self->scrub_infop = info;
+}
+
+/**
+ * Free scrub_infop unless a scrub is in progress or its stamps are dirty.
+ */
+void CInode::scrub_maybe_delete_info()
+{
+  if (!scrub_infop)
+    return;
+  if (scrub_infop->scrub_in_progress || scrub_infop->last_scrub_dirty)
+    return;
+
+  delete scrub_infop;
+  scrub_infop = nullptr;
+}
+
+/**
+ * Set up scrub state for this inode.
+ *
+ * If a scrub is already in progress, the previous scrub parent dentry is
+ * unpinned and removed from its directory's scrubbing sets before the
+ * state is set up again.
+ *
+ * @param scrub_parent dentry the scrub came through; pinned with
+ *                     PIN_SCRUBPARENT when non-null
+ * @param header       scrub header stored in the scrub info
+ * @param f            context stored as the scrub's on_finish
+ */
+void CInode::scrub_initialize(CDentry *scrub_parent,
+                              ScrubHeaderRef& header,
+                              MDSContext *f)
+{
+  dout(20) << __func__ << " with scrub_version " << get_version() << dendl;
+  if (scrub_is_in_progress()) {
+    dout(20) << __func__ << " inode moved during scrub, reinitializing "
+             << dendl;
+    ceph_assert(scrub_infop->scrub_parent);
+    CDentry *dn = scrub_infop->scrub_parent;
+    CDir *dir = dn->dir;
+    dn->put(CDentry::PIN_SCRUBPARENT);
+    ceph_assert(dir->scrub_infop && dir->scrub_infop->directory_scrubbing);
+    dir->scrub_infop->directories_scrubbing.erase(dn->key());
+    dir->scrub_infop->others_scrubbing.erase(dn->key());
+  }
+
+  // scrub_info() allocates scrub_infop when it is null, so the pointer is
+  // valid from here on and no separate allocation is needed.
+  scrub_info();
+
+  if (get_projected_inode()->is_dir()) {
+    // fill in dirfrag_stamps with initial state
+    frag_vec_t leaves;
+    dirfragtree.get_leaves(leaves);
+    for (const auto& leaf : leaves) {
+      if (header->get_force())
+        scrub_infop->dirfrag_stamps[leaf].reset();
+      else
+        scrub_infop->dirfrag_stamps[leaf];  // default-insert if absent
+    }
+  }
+
+  if (scrub_parent)
+    scrub_parent->get(CDentry::PIN_SCRUBPARENT);
+  scrub_infop->scrub_parent = scrub_parent;
+  scrub_infop->on_finish = f;
+  scrub_infop->scrub_in_progress = true;
+  scrub_infop->children_scrubbed = false;
+  scrub_infop->header = header;
+
+  scrub_infop->scrub_start_version = get_version();
+  scrub_infop->scrub_start_stamp = ceph_clock_now();
+  // right now we don't handle remote inodes
+}
+
+/**
+ * Pick the next dirfrag whose scrub has not started for the current
+ * inode scrub and stamp it as started.
+ *
+ * @param out_dirfrag receives the chosen frag when 0 is returned
+ * @returns 0 when a frag was chosen, -ENOTDIR when this is not a
+ *          directory, positive ENOENT when no frag is left
+ */
+int CInode::scrub_dirfrag_next(frag_t* out_dirfrag)
+{
+  dout(20) << __func__ << dendl;
+  ceph_assert(scrub_is_in_progress());
+
+  if (!is_dir()) {
+    return -ENOTDIR;
+  }
+
+  for (auto &entry : scrub_infop->dirfrag_stamps) {
+    scrub_stamp_info_t &stamps = entry.second;
+    if (stamps.scrub_start_version >= scrub_infop->scrub_start_version)
+      continue;  // already started for this scrub
+
+    stamps.scrub_start_version = get_projected_version();
+    stamps.scrub_start_stamp = ceph_clock_now();
+    *out_dirfrag = entry.first;
+    dout(20) << " return frag " << *out_dirfrag << dendl;
+    return 0;
+  }
+
+  dout(20) << " no frags left, ENOENT " << dendl;
+  return ENOENT;
+}
+
+/**
+ * Collect the dirfrags whose scrub has started but not finished.
+ *
+ * Walks dirfrag_stamps in map order and stops at the first frag whose
+ * scrub has not started for the current inode scrub.
+ *
+ * @param out_dirfrags cleared, then filled with the in-flight frags
+ */
+void CInode::scrub_dirfrags_scrubbing(frag_vec_t* out_dirfrags)
+{
+  ceph_assert(out_dirfrags != NULL);
+  ceph_assert(scrub_infop != NULL);
+
+  out_dirfrags->clear();
+  const version_t current = scrub_infop->scrub_start_version;
+  for (const auto &entry : scrub_infop->dirfrag_stamps) {
+    const scrub_stamp_info_t &stamps = entry.second;
+    if (stamps.scrub_start_version < current)
+      return;
+    if (stamps.last_scrub_version < current)
+      out_dirfrags->push_back(entry.first);
+  }
+}
+
+/**
+ * Record that the given dirfrag finished scrubbing by copying its start
+ * stamp and version into its last-scrub fields.
+ *
+ * @param dirfrag frag that was scrubbed; must be present in dirfrag_stamps
+ */
+void CInode::scrub_dirfrag_finished(frag_t dirfrag)
+{
+  dout(20) << __func__ << " on frag " << dirfrag << dendl;
+  ceph_assert(scrub_is_in_progress());
+
+  auto found = scrub_infop->dirfrag_stamps.find(dirfrag);
+  ceph_assert(found != scrub_infop->dirfrag_stamps.end());
+
+  scrub_stamp_info_t &stamps = found->second;
+  stamps.last_scrub_stamp = stamps.scrub_start_stamp;
+  stamps.last_scrub_version = stamps.scrub_start_version;
+}
+
+/**
+ * Abort the in-progress scrub: hand the finish context to the caller,
+ * release the scrub parent dentry and free the scrub info.
+ *
+ * @param c out param receiving the stored on_finish context (may be null)
+ */
+void CInode::scrub_aborted(MDSContext **c) {
+  dout(20) << __func__ << dendl;
+  ceph_assert(scrub_is_in_progress());
+
+  *c = scrub_infop->on_finish;
+  scrub_infop->on_finish = nullptr;
+
+  if (CDentry *parent = scrub_infop->scrub_parent; parent) {
+    scrub_infop->scrub_parent = NULL;
+    parent->dir->scrub_dentry_finished(parent);
+    parent->put(CDentry::PIN_SCRUBPARENT);
+  }
+
+  delete scrub_infop;
+  scrub_infop = nullptr;
+}
+
+/**
+ * Complete the in-progress scrub: check that every dirfrag finished,
+ * promote the start stamps to last-scrub stamps, release the scrub parent
+ * dentry and hand the finish context to the caller.
+ *
+ * @param c out param receiving the stored on_finish context
+ */
+void CInode::scrub_finished(MDSContext **c) {
+  dout(20) << __func__ << dendl;
+  ceph_assert(scrub_is_in_progress());
+  for (const auto &entry : scrub_infop->dirfrag_stamps) {
+    const scrub_stamp_info_t &stamps = entry.second;
+    if (stamps.last_scrub_version != stamps.scrub_start_version) {
+      derr << stamps.last_scrub_version << " != "
+           << stamps.scrub_start_version << dendl;
+    }
+    ceph_assert(stamps.last_scrub_version == stamps.scrub_start_version);
+  }
+
+  scrub_infop->last_scrub_version = scrub_infop->scrub_start_version;
+  scrub_infop->last_scrub_stamp = scrub_infop->scrub_start_stamp;
+  scrub_infop->last_scrub_dirty = true;
+  scrub_infop->scrub_in_progress = false;
+
+  if (CDentry *parent = scrub_infop->scrub_parent; parent) {
+    scrub_infop->scrub_parent = NULL;
+    parent->dir->scrub_dentry_finished(parent);
+    parent->put(CDentry::PIN_SCRUBPARENT);
+  }
+
+  *c = scrub_infop->on_finish;
+  scrub_infop->on_finish = NULL;
+
+  if (scrub_infop->header->get_origin() == this) {
+    // We are at the point that a tagging scrub was initiated
+    LogChannelRef clog = mdcache->mds->clog;
+    clog->info() << "scrub complete with tag '"
+                 << scrub_infop->header->get_tag() << "'";
+  }
+}
+
+/**
+ * @returns the metadata pool for directories, otherwise the pool id from
+ *          the inode's layout (asserted to be set)
+ */
+int64_t CInode::get_backtrace_pool() const
+{
+  if (is_dir())
+    return mdcache->mds->mdsmap->get_metadata_pool();
+
+  // Files are required to have an explicit layout that specifies
+  // a pool
+  ceph_assert(inode.layout.pool_id != -1);
+  return inode.layout.pool_id;
+}
+
+/**
+ * Queue this directory inode on mdcache->export_pin_queue when one of its
+ * auth dirfrags needs export-pin handling.
+ *
+ * Does nothing when mds_bal_export_pin is off, when the inode is not a
+ * normal directory, or when it is already queued.
+ *
+ * @param update when false, an inode without its own export pin is skipped
+ */
+void CInode::maybe_export_pin(bool update)
+{
+  if (!g_conf()->mds_bal_export_pin)
+    return;
+  if (!is_dir() || !is_normal())
+    return;
+
+  const mds_rank_t export_pin = get_export_pin(false);
+  if (export_pin == MDS_RANK_NONE && !update)
+    return;
+
+  if (state_test(CInode::STATE_QUEUEDEXPORTPIN))
+    return;
+
+  for (const auto &entry : dirfrags) {
+    CDir *dir = entry.second;
+    if (!dir->is_auth())
+      continue;
+
+    bool needs_queue;
+    if (export_pin == MDS_RANK_NONE) {
+      // clear aux subtrees ?
+      needs_queue = dir->state_test(CDir::STATE_AUXSUBTREE);
+    } else if (dir->is_subtree_root()) {
+      // set auxsubtree bit or export it
+      needs_queue = !dir->state_test(CDir::STATE_AUXSUBTREE) ||
+                    export_pin != dir->get_dir_auth().first;
+    } else {
+      // create aux subtree or export it
+      needs_queue = true;
+    }
+
+    if (needs_queue) {
+      state_set(CInode::STATE_QUEUEDEXPORTPIN);
+      mdcache->export_pin_queue.insert(this);
+      break;
+    }
+  }
+}
+
+/**
+ * Store an export pin rank in the projected inode. The inode must be a
+ * directory and must already be projected.
+ */
+void CInode::set_export_pin(mds_rank_t rank)
+{
+  ceph_assert(is_dir());
+  ceph_assert(is_projected());
+  mempool_inode *pi = get_projected_inode();
+  pi->export_pin = rank;
+}
+
+/**
+ * Find the export pin that applies to this inode.
+ *
+ * @param inherit when true, walk up through parent directories until a
+ *                pin is found; when false, only this inode is considered
+ * @returns the pinned rank, or MDS_RANK_NONE
+ */
+mds_rank_t CInode::get_export_pin(bool inherit) const
+{
+  /* An inode that is export pinned may not necessarily be a subtree root, we
+   * need to traverse the parents. A base or system inode cannot be pinned.
+   * N.B. inodes not yet linked into a dir (i.e. anonymous inodes) will not
+   * have a parent yet.
+   */
+  for (const CInode *cur = this; !cur->is_system(); ) {
+    const CDentry *pdn = cur->get_parent_dn();
+    if (!pdn)
+      break;
+
+    const auto &ci = cur->get_inode();
+    // ignore export pin for unlinked directory
+    if (ci.nlink == 0)
+      break;
+    if (ci.export_pin >= 0)
+      return ci.export_pin;
+
+    if (!inherit)
+      break;
+    cur = pdn->get_dir()->inode;
+  }
+  return MDS_RANK_NONE;
+}
+
+/**
+ * @returns true when the effective export pin is unset (negative) or
+ *          equals dest
+ */
+bool CInode::is_exportable(mds_rank_t dest) const
+{
+  const mds_rank_t pin = get_export_pin();
+  return pin == dest || pin < 0;
+}
+
+MEMPOOL_DEFINE_OBJECT_FACTORY(CInode, co_inode, mds_co);
diff --git a/src/mds/CInode.h b/src/mds/CInode.h
new file mode 100644
index 00000000..cbe8779a
--- /dev/null
+++ b/src/mds/CInode.h
@@ -0,0 +1,1227 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+
+
+#ifndef CEPH_CINODE_H
+#define CEPH_CINODE_H
+
+#include <list>
+#include <map>
+#include <set>
+#include <string_view>
+
+#include "common/config.h"
+#include "include/counter.h"
+#include "include/elist.h"
+#include "include/types.h"
+#include "include/lru.h"
+#include "include/compact_set.h"
+
+#include "MDSCacheObject.h"
+#include "MDSContext.h"
+#include "flock.h"
+
+#include "CDentry.h"
+#include "SimpleLock.h"
+#include "ScatterLock.h"
+#include "LocalLock.h"
+#include "Capability.h"
+#include "SnapRealm.h"
+#include "Mutation.h"
+
+#include "messages/MClientCaps.h"
+
+#define dout_context g_ceph_context
+
+class Context;
+class CDentry;
+class CDir;
+class CInode;
+class MDCache;
+class LogSegment;
+struct SnapRealm;
+class Session;
+struct ObjectOperation;
+class EMetaBlob;
+
+
+ostream& operator<<(ostream& out, const CInode& in);
+
+// One entry of the cinode_lock_info[] table declared below.
+// NOTE(review): presumably pairs a lock type with the caps that imply
+// writing under it -- confirm against the table's definition.
+struct cinode_lock_info_t {
+ int lock;
+ int wr_caps;
+};
+
+extern cinode_lock_info_t cinode_lock_info[];
+extern int num_cinode_locks;
+
+
+/**
+ * Base class for CInode, containing the backing store data and
+ * serialization methods. This exists so that we can read and
+ * handle CInodes from the backing store without hitting all
+ * the business logic in CInode proper.
+ */
+class InodeStoreBase {
+public:
+  using mempool_inode = inode_t<mempool::mds_co::pool_allocator>;
+  using mempool_old_inode = old_inode_t<mempool::mds_co::pool_allocator>;
+  using mempool_old_inode_map = mempool::mds_co::compact_map<snapid_t, mempool_old_inode>;
+  using mempool_xattr_map = xattr_map<mempool::mds_co::pool_allocator>; // FIXME bufferptr not in mempool
+
+  mempool_inode inode;               // the inode itself
+  mempool::mds_co::string symlink;   // symlink dest, if symlink
+  mempool_xattr_map xattrs;
+  fragtree_t dirfragtree;            // dir frag tree, if any. always consistent with our dirfrag map.
+  mempool_old_inode_map old_inodes;  // key = last, value.first = first
+  snapid_t oldest_snap = CEPH_NOSNAP;
+  damage_flags_t damage_flags = 0;
+
+  InodeStoreBase() {}
+
+  /* Helpers: type queries forwarded to the stored inode */
+  bool is_file() const { return inode.is_file(); }
+  bool is_symlink() const { return inode.is_symlink(); }
+  bool is_dir() const { return inode.is_dir(); }
+  static object_t get_object_name(inodeno_t ino, frag_t fg, std::string_view suffix);
+
+  /* Full serialization for use in ".inode" root inode objects */
+  void encode(bufferlist &bl, uint64_t features, const bufferlist *snap_blob = nullptr) const;
+  void decode(bufferlist::const_iterator &bl, bufferlist& snap_blob);
+
+  /* Serialization without ENCODE_START/FINISH blocks for use embedded in dentry */
+  void encode_bare(bufferlist &bl, uint64_t features, const bufferlist *snap_blob = nullptr) const;
+  void decode_bare(bufferlist::const_iterator &bl, bufferlist &snap_blob, __u8 struct_v = 5);
+
+  /* For test/debug output */
+  void dump(Formatter *f) const;
+
+  /* For use by offline tools */
+  __u32 hash_dentry_name(std::string_view dn);
+  frag_t pick_dirfrag(std::string_view dn);
+};
+
+// Overload for the mds_co mempool xattr map; forwards to the templated
+// decode_noshare with the mds_co pool allocator spelled out.
+inline void decode_noshare(InodeStoreBase::mempool_xattr_map& xattrs,
+ ceph::buffer::list::const_iterator &p)
+{
+ decode_noshare<mempool::mds_co::pool_allocator>(xattrs, p);
+}
+
+// InodeStoreBase plus an owned, still-encoded snap blob. Each
+// encode/decode wrapper forwards to the base class with snap_blob.
+class InodeStore : public InodeStoreBase {
+public:
+ // FIXME bufferlist not part of mempool
+ bufferlist snap_blob; // Encoded copy of SnapRealm, because we can't
+ // rehydrate it without full MDCache
+ // Full encoding, including snap_blob.
+ void encode(bufferlist &bl, uint64_t features) const {
+ InodeStoreBase::encode(bl, features, &snap_blob);
+ }
+ // Full decoding; fills snap_blob.
+ void decode(bufferlist::const_iterator &bl) {
+ InodeStoreBase::decode(bl, snap_blob);
+ }
+ // Bare encoding, including snap_blob.
+ void encode_bare(bufferlist &bl, uint64_t features) const {
+ InodeStoreBase::encode_bare(bl, features, &snap_blob);
+ }
+ // Bare decoding with the base class's default struct_v; fills snap_blob.
+ void decode_bare(bufferlist::const_iterator &bl) {
+ InodeStoreBase::decode_bare(bl, snap_blob);
+ }
+
+ static void generate_test_instances(std::list<InodeStore*>& ls);
+};
+WRITE_CLASS_ENCODER_FEATURES(InodeStore)
+
+// just for ceph-dencoder
+// Variant of InodeStore whose encode()/decode() use the bare format.
+class InodeStoreBare : public InodeStore {
+public:
+ // Encode via InodeStore::encode_bare.
+ void encode(bufferlist &bl, uint64_t features) const {
+ InodeStore::encode_bare(bl, features);
+ }
+ // Decode via InodeStore::decode_bare.
+ void decode(bufferlist::const_iterator &bl) {
+ InodeStore::decode_bare(bl);
+ }
+ static void generate_test_instances(std::list<InodeStoreBare*>& ls);
+};
+WRITE_CLASS_ENCODER_FEATURES(InodeStoreBare)
+
+// cached inode wrapper
+class CInode : public MDSCacheObject, public InodeStoreBase, public Counter<CInode> {
+ public:
+ MEMPOOL_CLASS_HELPERS();
+ // -- pins --
+ static const int PIN_DIRFRAG = -1;
+ static const int PIN_CAPS = 2; // client caps
+ static const int PIN_IMPORTING = -4; // importing
+ static const int PIN_OPENINGDIR = 7;
+ static const int PIN_REMOTEPARENT = 8;
+ static const int PIN_BATCHOPENJOURNAL = 9;
+ static const int PIN_SCATTERED = 10;
+ static const int PIN_STICKYDIRS = 11;
+ //static const int PIN_PURGING = -12;
+ static const int PIN_FREEZING = 13;
+ static const int PIN_FROZEN = 14;
+ static const int PIN_IMPORTINGCAPS = -15;
+ static const int PIN_PASTSNAPPARENT = -16;
+ static const int PIN_OPENINGSNAPPARENTS = 17;
+ static const int PIN_TRUNCATING = 18;
+ static const int PIN_STRAY = 19; // we pin our stray inode while active
+ static const int PIN_NEEDSNAPFLUSH = 20;
+ static const int PIN_DIRTYRSTAT = 21;
+ static const int PIN_EXPORTINGCAPS = 22;
+ static const int PIN_DIRTYPARENT = 23;
+ static const int PIN_DIRWAITER = 24;
+ static const int PIN_SCRUBQUEUE = 25;
+
+ /**
+  * Map a CInode pin id to its name. Ids not declared by CInode are
+  * passed to generic_pin_name().
+  */
+ std::string_view pin_name(int p) const override {
+   switch (p) {
+   // listed in declaration order
+   case PIN_DIRFRAG:            return "dirfrag";
+   case PIN_CAPS:               return "caps";
+   case PIN_IMPORTING:          return "importing";
+   case PIN_OPENINGDIR:         return "openingdir";
+   case PIN_REMOTEPARENT:       return "remoteparent";
+   case PIN_BATCHOPENJOURNAL:   return "batchopenjournal";
+   case PIN_SCATTERED:          return "scattered";
+   case PIN_STICKYDIRS:         return "stickydirs";
+   //case PIN_PURGING: return "purging";
+   case PIN_FREEZING:           return "freezing";
+   case PIN_FROZEN:             return "frozen";
+   case PIN_IMPORTINGCAPS:      return "importingcaps";
+   case PIN_PASTSNAPPARENT:     return "pastsnapparent";
+   case PIN_OPENINGSNAPPARENTS: return "openingsnapparents";
+   case PIN_TRUNCATING:         return "truncating";
+   case PIN_STRAY:              return "stray";
+   case PIN_NEEDSNAPFLUSH:      return "needsnapflush";
+   case PIN_DIRTYRSTAT:         return "dirtyrstat";
+   case PIN_EXPORTINGCAPS:      return "exportingcaps";
+   case PIN_DIRTYPARENT:        return "dirtyparent";
+   case PIN_DIRWAITER:          return "dirwaiter";
+   case PIN_SCRUBQUEUE:         return "scrubqueue";
+   default:
+     return generic_pin_name(p);
+   }
+ }
+
+ // -- dump flags --
+ // Bit flags choosing which sections CInode::dump() emits; combine with |.
+ static const int DUMP_INODE_STORE_BASE = (1 << 0);
+ static const int DUMP_MDS_CACHE_OBJECT = (1 << 1);
+ static const int DUMP_LOCKS = (1 << 2);
+ static const int DUMP_STATE = (1 << 3);
+ static const int DUMP_CAPS = (1 << 4);
+ static const int DUMP_PATH = (1 << 5);
+ static const int DUMP_DIRFRAGS = (1 << 6);
+ // every bit set
+ static const int DUMP_ALL = (-1);
+ // everything except the path and the dirfrags
+ static const int DUMP_DEFAULT = DUMP_ALL & (~DUMP_PATH) & (~DUMP_DIRFRAGS);
+
+ // -- state --
+ static const int STATE_EXPORTING = (1<<0); // on nonauth bystander.
+ static const int STATE_OPENINGDIR = (1<<1);
+ static const int STATE_FREEZING = (1<<2);
+ static const int STATE_FROZEN = (1<<3);
+ static const int STATE_AMBIGUOUSAUTH = (1<<4);
+ static const int STATE_EXPORTINGCAPS = (1<<5);
+ static const int STATE_NEEDSRECOVER = (1<<6);
+ static const int STATE_RECOVERING = (1<<7);
+ static const int STATE_PURGING = (1<<8);
+ static const int STATE_DIRTYPARENT = (1<<9);
+ static const int STATE_DIRTYRSTAT = (1<<10);
+ static const int STATE_STRAYPINNED = (1<<11);
+ static const int STATE_FROZENAUTHPIN = (1<<12);
+ static const int STATE_DIRTYPOOL = (1<<13);
+ static const int STATE_REPAIRSTATS = (1<<14);
+ static const int STATE_MISSINGOBJS = (1<<15);
+ static const int STATE_EVALSTALECAPS = (1<<16);
+ static const int STATE_QUEUEDEXPORTPIN = (1<<17);
+ static const int STATE_TRACKEDBYOFT = (1<<18); // tracked by open file table
+ static const int STATE_DELAYEDEXPORTPIN = (1<<19);
+ // orphan inode needs notification of releasing reference
+ static const int STATE_ORPHAN = STATE_NOTIFYREF;
+
+ static const int MASK_STATE_EXPORTED =
+ (STATE_DIRTY|STATE_NEEDSRECOVER|STATE_DIRTYPARENT|STATE_DIRTYPOOL);
+ static const int MASK_STATE_EXPORT_KEPT =
+ (STATE_FROZEN|STATE_AMBIGUOUSAUTH|STATE_EXPORTINGCAPS|
+ STATE_QUEUEDEXPORTPIN|STATE_TRACKEDBYOFT|STATE_DELAYEDEXPORTPIN);
+
+ // -- waiters --
+ static const uint64_t WAIT_DIR = (1<<0);
+ static const uint64_t WAIT_FROZEN = (1<<1);
+ static const uint64_t WAIT_TRUNC = (1<<2);
+ static const uint64_t WAIT_FLOCK = (1<<3);
+
+ static const uint64_t WAIT_ANY_MASK = (uint64_t)(-1);
+
+ // misc
+ static const unsigned EXPORT_NONCE = 1; // nonce given to replicas created by export
+
+ ostream& print_db_line_prefix(ostream& out) override;
+
+ public:
+ MDCache *mdcache;
+
+ SnapRealm *snaprealm = nullptr;
+ SnapRealm *containing_realm = nullptr;
+ snapid_t first, last;
+ mempool::mds_co::compact_set<snapid_t> dirty_old_rstats;
+
+ /// Start and last-finished stamps/versions of a scrub.
+ class scrub_stamp_info_t {
+ public:
+   /// version we started our latest scrub (whether in-progress or finished)
+   version_t scrub_start_version = 0;
+   /// time we started our latest scrub (whether in-progress or finished)
+   utime_t scrub_start_stamp;
+   /// version we started our most recent finished scrub
+   version_t last_scrub_version = 0;
+   /// time we started our most recent finished scrub
+   utime_t last_scrub_stamp;
+
+   scrub_stamp_info_t() {}
+
+   /// Zero both versions and reset both stamps to a default utime_t.
+   void reset() {
+     last_scrub_version = 0;
+     scrub_start_version = 0;
+     last_scrub_stamp = utime_t();
+     scrub_start_stamp = utime_t();
+   }
+ };
+
+ /// Per-inode scrub state: the inode's own stamps (inherited) plus the
+ /// bookkeeping used while a scrub is running.
+ class scrub_info_t : public scrub_stamp_info_t {
+ public:
+ /// dentry the scrub came through; pinned with PIN_SCRUBPARENT while set
+ CDentry *scrub_parent = nullptr;
+ /// context handed back to the caller by scrub_finished()/scrub_aborted()
+ MDSContext *on_finish = nullptr;
+
+ bool last_scrub_dirty = false; /// are our stamps dirty with respect to disk state?
+ bool scrub_in_progress = false; /// are we currently scrubbing?
+ bool children_scrubbed = false; /// set by scrub_children_finished()
+
+ /// my own (temporary) stamps and versions for each dirfrag we have
+ std::map<frag_t, scrub_stamp_info_t> dirfrag_stamps; // XXX not part of mempool
+
+ ScrubHeaderRef header;
+
+ scrub_info_t() {}
+ };
+
+ /// Return the scrub info, creating it via scrub_info_create() if absent.
+ const scrub_info_t *scrub_info() const {
+   if (scrub_infop == nullptr)
+     scrub_info_create();
+   return scrub_infop;
+ }
+
+ /// Return the scrub header, or nullptr when there is no scrub info.
+ ScrubHeaderRef get_scrub_header() {
+   if (scrub_infop != nullptr)
+     return scrub_infop->header;
+   return nullptr;
+ }
+
+ /// True when scrub info exists and is flagged as in progress.
+ bool scrub_is_in_progress() const {
+   return scrub_infop != nullptr && scrub_infop->scrub_in_progress;
+ }
+ /**
+ * Start scrubbing on this inode. That could be very short if it's
+ * a file, or take a long time if we're recursively scrubbing a directory.
+ * If a scrub is already in progress the state is re-initialized.
+ * @post it has set up internal scrubbing state
+ * @param scrub_parent The dentry the scrub came through (may be null);
+ * it is pinned while the scrub is in progress
+ * @param header The scrub header to record for this scrub
+ * @param f Context stored as the scrub's finisher
+ */
+ void scrub_initialize(CDentry *scrub_parent,
+ ScrubHeaderRef& header,
+ MDSContext *f);
+ /**
+ * Get the next dirfrag to scrub. Gives you a frag_t in output param which
+ * you must convert to a CDir (and possibly load off disk).
+ * @param out_dirfrag A pointer to frag_t, will be filled in with the next
+ * dirfrag to scrub if there is one.
+ * @returns 0 on success, you should scrub the passed-out frag_t right now;
+ * ENOENT (positive): There are no remaining dirfrags to scrub
+ * <0 There was some other error (It will return -ENOTDIR if not a directory)
+ */
+ int scrub_dirfrag_next(frag_t* out_dirfrag);
+ /**
+ * Get the currently scrubbing dirfrags. When returned, the
+ * passed-in list will be filled in with all frag_ts which have
+ * been returned from scrub_dirfrag_next but not sent back
+ * via scrub_dirfrag_finished.
+ */
+ void scrub_dirfrags_scrubbing(frag_vec_t *out_dirfrags);
+ /**
+ * Report to the CInode that a dirfrag it owns has been scrubbed. Call
+ * this for every frag_t returned from scrub_dirfrag_next().
+ * @param dirfrag The frag_t that was scrubbed
+ */
+ void scrub_dirfrag_finished(frag_t dirfrag);
+ /**
+ * Call this once the scrub has been completed, whether it's a full
+ * recursive scrub on a directory or simply the data on a file (or
+ * anything in between).
+ * @param c An out param which is filled in with a Context* that must
+ * be complete()ed.
+ */
+ void scrub_finished(MDSContext **c);
+
+ void scrub_aborted(MDSContext **c);
+
+ /**
+ * Report to the CInode that all dirfrags it owns have been scrubbed.
+ * Requires scrub_infop to be set.
+ */
+ void scrub_children_finished() {
+ scrub_infop->children_scrubbed = true;
+ }
+ /**
+ * Install the context to hand back when the scrub ends. Asserts that no
+ * finisher is currently set; requires scrub_infop to be set.
+ */
+ void scrub_set_finisher(MDSContext *c) {
+ ceph_assert(!scrub_infop->on_finish);
+ scrub_infop->on_finish = c;
+ }
+
+private:
+ /**
+ * Create a scrub_info_t struct for the scrub_infop pointer.
+ */
+ void scrub_info_create() const;
+ /**
+ * Delete the scrub_info_t struct if it's not got any useful data
+ */
+ void scrub_maybe_delete_info();
+public:
+
+ /// Whether this inode may need to be found from more than one snapshot.
+ bool is_multiversion() const {
+   if (snaprealm)
+     return true;   // other snaprealms will link to me
+   if (inode.is_dir())
+     return true;   // links to me in other snaps
+   if (inode.nlink > 1)
+     return true;   // there are remote links, possibly snapped, that will need to find me
+   // once multiversion, always multiversion. until old_inodes gets cleaned out.
+   return !old_inodes.empty();
+ }
+ snapid_t get_oldest_snap();
+
+ uint64_t last_journaled = 0; // log offset for the last time i was journaled
+ //loff_t last_open_journaled; // log offset for the last journaled EOpen
+ utime_t last_dirstat_prop;
+
+
+ // list item node for when we have unpropagated rstat data
+ elist<CInode*>::item dirty_rstat_item;
+
+ /// True when STATE_DIRTYRSTAT is set on this inode.
+ bool is_dirty_rstat() {
+ return state_test(STATE_DIRTYRSTAT);
+ }
+ void mark_dirty_rstat();
+ void clear_dirty_rstat();
+
+ //bool hack_accessed = false;
+ //utime_t hack_load_stamp;
+
+ /**
+ * Projection methods, used to store inode changes until they have been journaled,
+ * at which point they are popped.
+ * Usage:
+ * project_inode as needed. If you're changing xattrs or sr_t, then pass true
+ * as needed then change the xattrs/snapnode member as needed. (Dirty
+ * exception: project_snaprealm_past_parent allows you to project the
+ * snapnode after doing project_inode, i.e. you don't need to pass
+ * snap=true.)
+ *
+ * Then, journal. Once journaling is done, pop_and_dirty_projected_inode.
+ * This function will take care of the inode itself, the xattrs, and the snaprealm.
+ */
+
+ // One entry of CInode::projected_nodes: a copy of the inode plus optional
+ // projected xattrs and snapnode.
+ class projected_inode {
+ public:
+ // Sentinel for `snapnode`: get_projected_srnode() skips entries whose
+ // snapnode still equals this value.
+ static sr_t* const UNDEF_SRNODE;
+
+ mempool_inode inode; // copy of the inode passed to the constructor
+ std::unique_ptr<mempool_xattr_map> xattrs; // null entries are skipped by get_projected_xattrs()
+ sr_t *snapnode = UNDEF_SRNODE;
+
+ // Must always be constructed from an existing inode value.
+ projected_inode() = delete;
+ explicit projected_inode(const mempool_inode &in) : inode(in) {}
+ };
+
+private:
+ mempool::mds_co::list<projected_inode> projected_nodes; // projected values (only defined while dirty)
+ size_t num_projected_xattrs = 0;
+ size_t num_projected_srnodes = 0;
+
+public:
+ CInode::projected_inode &project_inode(bool xattr = false, bool snap = false);
+ void pop_and_dirty_projected_inode(LogSegment *ls);
+
+ // Newest projected entry, or a null pointer when nothing is projected.
+ projected_inode *get_projected_node() {
+ return projected_nodes.empty() ? nullptr : &projected_nodes.back();
+ }
+
+ // Version of the newest projected inode, falling back to the current inode.
+ version_t get_projected_version() const {
+ return projected_nodes.empty() ? inode.version
+ : projected_nodes.back().inode.version;
+ }
+ bool is_projected() const {
+ return !projected_nodes.empty();
+ }
+
+ // Newest projected inode, or the current inode when nothing is projected.
+ const mempool_inode *get_projected_inode() const {
+ return projected_nodes.empty() ? &inode : &projected_nodes.back().inode;
+ }
+ // Mutable variant: newest projected inode, or the current inode when
+ // nothing is projected.
+ mempool_inode *get_projected_inode() {
+ return projected_nodes.empty() ? &inode : &projected_nodes.back().inode;
+ }
+ // Inode one step older than the newest projection: the second-newest
+ // projected entry if there is one, otherwise the current inode.
+ // Asserts that at least one projection exists.
+ mempool_inode *get_previous_projected_inode() {
+ ceph_assert(!projected_nodes.empty());
+ auto prev = ++projected_nodes.rbegin();
+ if (prev == projected_nodes.rend())
+ return &inode;
+ return &prev->inode;
+ }
+
+ // Xattrs of the newest projected entry that carries any; the inode's own
+ // xattrs when no projected entry has them.
+ mempool_xattr_map *get_projected_xattrs() {
+ if (num_projected_xattrs == 0)
+ return &xattrs;
+ auto node = projected_nodes.rbegin();
+ while (node != projected_nodes.rend()) {
+ if (node->xattrs)
+ return node->xattrs.get();
+ ++node;
+ }
+ return &xattrs;
+ }
+ // Same search as get_projected_xattrs(), but starting one entry before the
+ // newest projection.
+ mempool_xattr_map *get_previous_projected_xattrs() {
+ if (num_projected_xattrs == 0)
+ return &xattrs;
+ auto node = ++projected_nodes.rbegin();
+ while (node != projected_nodes.rend()) {
+ if (node->xattrs)
+ return node->xattrs.get();
+ ++node;
+ }
+ return &xattrs;
+ }
+
+ sr_t *prepare_new_srnode(snapid_t snapid);
+ void project_snaprealm(sr_t *new_srnode);
+ // Convenience overload: build a new srnode with prepare_new_srnode(),
+ // project it, and hand it back to the caller.
+ sr_t *project_snaprealm(snapid_t snapid=0) {
+ sr_t* const prepared = prepare_new_srnode(snapid);
+ project_snaprealm(prepared);
+ return prepared;
+ }
+ // Newest projected snapnode (skipping entries left at UNDEF_SRNODE); if
+ // none is projected, the open snaprealm's srnode, or a null pointer when
+ // there is no snaprealm.
+ const sr_t *get_projected_srnode() const {
+ if (num_projected_srnodes > 0) {
+ auto node = projected_nodes.rbegin();
+ for (; node != projected_nodes.rend(); ++node) {
+ if (node->snapnode == projected_inode::UNDEF_SRNODE)
+ continue;
+ return node->snapnode;
+ }
+ }
+ return snaprealm ? &snaprealm->srnode : nullptr;
+ }
+
+ void mark_snaprealm_global(sr_t *new_srnode);
+ void clear_snaprealm_global(sr_t *new_srnode);
+ bool is_projected_snaprealm_global() const;
+
+ void record_snaprealm_past_parent(sr_t *new_snap, SnapRealm *newparent);
+ void record_snaprealm_parent_dentry(sr_t *new_snap, SnapRealm *newparent,
+ CDentry *dn, bool primary_dn);
+ void project_snaprealm_past_parent(SnapRealm *newparent);
+ void early_pop_projected_snaprealm();
+
+private:
+ void pop_projected_snaprealm(sr_t *next_snaprealm, bool early);
+
+public:
+ mempool_old_inode& cow_old_inode(snapid_t follows, bool cow_head);
+ void split_old_inode(snapid_t snap);
+ mempool_old_inode *pick_old_inode(snapid_t last);
+ void pre_cow_old_inode();
+ bool has_snap_data(snapid_t s);
+ void purge_stale_snap_data(const std::set<snapid_t>& snaps);
+
+ // -- cache infrastructure --
+private:
+ mempool::mds_co::compact_map<frag_t,CDir*> dirfrags; // cached dir fragments under this Inode
+
+ //for the purpose of quickly determining whether there's a subtree root or exporting dir
+ int num_subtree_roots = 0;
+ int num_exporting_dirs = 0;
+
+ int stickydir_ref = 0;
+ scrub_info_t *scrub_infop = nullptr;
+
+public:
+ bool has_dirfrags() { return !dirfrags.empty(); }
+ // Cached CDir for exactly this frag, or a null pointer if it is not cached.
+ CDir* get_dirfrag(frag_t fg) {
+ auto found = dirfrags.find(fg);
+ if (found == dirfrags.end())
+ return nullptr;
+ //assert(g_conf()->debug_mds < 2 || dirfragtree.is_leaf(fg)); // performance hack FIXME
+ return found->second;
+ }
+ bool get_dirfrags_under(frag_t fg, std::list<CDir*>& ls);
+ CDir* get_approx_dirfrag(frag_t fg);
+
+ // Append every cached dirfrag to ls. Capacity is reserved up front only
+ // when the container is exactly std::vector<CDir*>.
+ template<typename Container>
+ void get_dirfrags(Container& ls) const {
+ constexpr bool is_cdir_vector = std::is_same_v<Container, std::vector<CDir*>>;
+ if constexpr (is_cdir_vector) {
+ ls.reserve(ls.size() + dirfrags.size());
+ }
+ for (const auto &entry : dirfrags) {
+ ls.push_back(entry.second);
+ }
+ }
+ // Append the cached dirfrags that are NOT subtree roots, i.e. those in the
+ // same subtree as this inode.
+ template<typename Container>
+ void get_nested_dirfrags(Container& ls) const {
+ constexpr bool is_cdir_vector = std::is_same_v<Container, std::vector<CDir*>>;
+ if constexpr (is_cdir_vector) {
+ ls.reserve(ls.size() + dirfrags.size() - num_subtree_roots);
+ }
+ for (const auto &entry : dirfrags) {
+ typename Container::value_type dir = entry.second;
+ if (dir->is_subtree_root())
+ continue;
+ ls.push_back(dir);
+ }
+ }
+ /**
+ * Append the cached dirfrags that are roots of new subtrees to ls.
+ *
+ * Const-qualified to match get_dirfrags() and get_nested_dirfrags(): the
+ * method only reads dirfrags and num_subtree_roots, so it can be called on
+ * a const CInode as well.
+ *
+ * @param ls output container; capacity is reserved up front only when it
+ * is exactly std::vector<CDir*>.
+ */
+ template<typename Container>
+ void get_subtree_dirfrags(Container& ls) const {
+ // dirfrags that are roots of new subtrees
+ if constexpr (std::is_same_v<Container, std::vector<CDir*>>)
+ ls.reserve(ls.size() + num_subtree_roots);
+ for (const auto &p : dirfrags) {
+ typename Container::value_type dir = p.second;
+ if (dir->is_subtree_root())
+ ls.push_back(dir);
+ }
+ }
+ int get_num_subtree_roots() const {
+ return num_subtree_roots;
+ }
+
+ CDir *get_or_open_dirfrag(MDCache *mdcache, frag_t fg);
+ CDir *add_dirfrag(CDir *dir);
+ void close_dirfrag(frag_t fg);
+ void close_dirfrags();
+ bool has_subtree_root_dirfrag(int auth=-1);
+ bool has_subtree_or_exporting_dirfrag();
+
+ void force_dirfrags();
+ void verify_dirfrags();
+
+ void get_stickydirs();
+ void put_stickydirs();
+
+ protected:
+ // parent dentries in cache
+ CDentry *parent = nullptr; // primary link
+ mempool::mds_co::compact_set<CDentry*> remote_parents; // if hard linked
+
+ mempool::mds_co::list<CDentry*> projected_parent; // for in-progress rename, (un)link, etc.
+
+ mds_authority_t inode_auth = CDIR_AUTH_DEFAULT;
+
+ // -- distributed state --
+protected:
+ // file capabilities
+ using mempool_cap_map = mempool::mds_co::map<client_t, Capability>;
+ mempool_cap_map client_caps; // client -> caps
+ mempool::mds_co::compact_map<int32_t, int32_t> mds_caps_wanted; // [auth] mds -> caps wanted
+ int replica_caps_wanted = 0; // [replica] what i've requested from auth
+ int num_caps_wanted = 0;
+
+public:
+ mempool::mds_co::set<client_t> client_snap_caps;
+ mempool::mds_co::compact_map<snapid_t, mempool::mds_co::set<client_t> > client_need_snapflush;
+
+ void add_need_snapflush(CInode *snapin, snapid_t snapid, client_t client);
+ void remove_need_snapflush(CInode *snapin, snapid_t snapid, client_t client);
+ pair<bool,bool> split_need_snapflush(CInode *cowin, CInode *in);
+
+protected:
+
+ ceph_lock_state_t *fcntl_locks = nullptr;
+ ceph_lock_state_t *flock_locks = nullptr;
+
+ // Return the fcntl lock state, allocating it on first use.
+ ceph_lock_state_t *get_fcntl_lock_state() {
+ if (fcntl_locks)
+ return fcntl_locks;
+ fcntl_locks = new ceph_lock_state_t(g_ceph_context, CEPH_LOCK_FCNTL);
+ return fcntl_locks;
+ }
+ void clear_fcntl_lock_state() {
+ delete fcntl_locks;
+ fcntl_locks = NULL;
+ }
+ // Return the flock lock state, allocating it on first use.
+ ceph_lock_state_t *get_flock_lock_state() {
+ if (flock_locks)
+ return flock_locks;
+ flock_locks = new ceph_lock_state_t(g_ceph_context, CEPH_LOCK_FLOCK);
+ return flock_locks;
+ }
+ void clear_flock_lock_state() {
+ delete flock_locks;
+ flock_locks = NULL;
+ }
+ void clear_file_locks() {
+ clear_fcntl_lock_state();
+ clear_flock_lock_state();
+ }
+ // Encode the fcntl table, then the flock table. Each is preceded by a bool
+ // flag, and the table itself is written only when it is allocated and
+ // non-empty. _decode_file_locks() reads the same layout back.
+ void _encode_file_locks(bufferlist& bl) const {
+ using ceph::encode;
+ bool has_fcntl_locks = fcntl_locks && !fcntl_locks->empty();
+ encode(has_fcntl_locks, bl);
+ if (has_fcntl_locks)
+ encode(*fcntl_locks, bl);
+ bool has_flock_locks = flock_locks && !flock_locks->empty();
+ encode(has_flock_locks, bl);
+ if (has_flock_locks)
+ encode(*flock_locks, bl);
+ }
+ // Counterpart of _encode_file_locks(). For each table (fcntl, then flock)
+ // read the presence flag; when set, decode into the state object (created
+ // on demand), otherwise delete any state currently held.
+ void _decode_file_locks(bufferlist::const_iterator& p) {
+ using ceph::decode;
+ bool has_fcntl_locks;
+ decode(has_fcntl_locks, p);
+ if (has_fcntl_locks)
+ decode(*get_fcntl_lock_state(), p);
+ else
+ clear_fcntl_lock_state();
+ bool has_flock_locks;
+ decode(has_flock_locks, p);
+ if (has_flock_locks)
+ decode(*get_flock_lock_state(), p);
+ else
+ clear_flock_lock_state();
+ }
+
+ // LogSegment lists i (may) belong to
+public:
+ elist<CInode*>::item item_dirty;
+ elist<CInode*>::item item_caps;
+ elist<CInode*>::item item_open_file;
+ elist<CInode*>::item item_dirty_parent;
+ elist<CInode*>::item item_dirty_dirfrag_dir;
+ elist<CInode*>::item item_dirty_dirfrag_nest;
+ elist<CInode*>::item item_dirty_dirfrag_dirfragtree;
+ elist<CInode*>::item item_scrub;
+
+ // also update RecoveryQueue::RecoveryQueue() if you change this
+ elist<CInode*>::item& item_recover_queue = item_dirty_dirfrag_dir;
+ elist<CInode*>::item& item_recover_queue_front = item_dirty_dirfrag_nest;
+
+public:
+ int auth_pin_freeze_allowance = 0;
+
+ inode_load_vec_t pop;
+ elist<CInode*>::item item_pop_lru;
+
+ // friends
+ friend class Server;
+ friend class Locker;
+ friend class Migrator;
+ friend class MDCache;
+ friend class StrayManager;
+ friend class CDir;
+ friend class CInodeExport;
+
+ // ---------------------------
+ CInode() = delete;
+ CInode(MDCache *c, bool auth=true, snapid_t f=2, snapid_t l=CEPH_NOSNAP);
+ // Closes dirfrags and the snaprealm and frees both file-lock tables, then
+ // asserts that every bookkeeping counter checked below is back to zero.
+ ~CInode() override {
+ close_dirfrags();
+ close_snaprealm();
+ clear_file_locks();
+ ceph_assert(num_projected_xattrs == 0);
+ ceph_assert(num_projected_srnodes == 0);
+ ceph_assert(num_caps_wanted == 0);
+ ceph_assert(num_subtree_roots == 0);
+ ceph_assert(num_exporting_dirs == 0);
+ }
+
+
+ // -- accessors --
+ bool is_root() const { return inode.ino == MDS_INO_ROOT; }
+ bool is_stray() const { return MDS_INO_IS_STRAY(inode.ino); }
+ mds_rank_t get_stray_owner() const {
+ return (mds_rank_t)MDS_INO_STRAY_OWNER(inode.ino);
+ }
+ bool is_mdsdir() const { return MDS_INO_IS_MDSDIR(inode.ino); }
+ bool is_base() const { return MDS_INO_IS_BASE(inode.ino); }
+ bool is_system() const { return inode.ino < MDS_INO_SYSTEM_BASE; }
+ bool is_normal() const { return !(is_base() || is_system() || is_stray()); }
+
+ bool is_head() const { return last == CEPH_NOSNAP; }
+
+ // note: this overloads MDSCacheObject
+ bool is_ambiguous_auth() const {
+ return state_test(STATE_AMBIGUOUSAUTH) ||
+ MDSCacheObject::is_ambiguous_auth();
+ }
+ void set_ambiguous_auth() {
+ state_set(STATE_AMBIGUOUSAUTH);
+ }
+ void clear_ambiguous_auth(MDSContext::vec& finished);
+ void clear_ambiguous_auth();
+
+ inodeno_t ino() const { return inode.ino; }
+ vinodeno_t vino() const { return vinodeno_t(inode.ino, last); }
+ int d_type() const { return IFTODT(inode.mode); }
+
+ mempool_inode& get_inode() { return inode; }
+ const mempool_inode& get_inode() const { return inode; }
+ CDentry* get_parent_dn() { return parent; }
+ const CDentry* get_parent_dn() const { return parent; }
+ CDentry* get_projected_parent_dn() { return !projected_parent.empty() ? projected_parent.back() : parent; }
+ const CDentry* get_projected_parent_dn() const { return !projected_parent.empty() ? projected_parent.back() : parent; }
+ // The primary parent if set; otherwise the oldest projected parent (front
+ // of the list), or a null pointer when neither exists.
+ const CDentry* get_oldest_parent_dn() const {
+ if (parent)
+ return parent;
+ if (projected_parent.empty())
+ return nullptr;
+ return projected_parent.front();
+ }
+ CDir *get_parent_dir();
+ const CDir *get_projected_parent_dir() const;
+ CDir *get_projected_parent_dir();
+ CInode *get_parent_inode();
+
+ // Ordering: by inode number first, then by `last` when the inode numbers
+ // are equal. r is cast to CInode without a runtime check.
+ bool is_lt(const MDSCacheObject *r) const override {
+ const CInode *other = static_cast<const CInode*>(r);
+ if (ino() == other->ino())
+ return last < other->last;
+ return ino() < other->ino();
+ }
+
+ // -- misc --
+ bool is_ancestor_of(const CInode *other) const;
+ bool is_projected_ancestor_of(const CInode *other) const;
+
+ void make_path_string(std::string& s, bool projected=false, const CDentry *use_parent=NULL) const;
+ void make_path(filepath& s, bool projected=false) const;
+ void name_stray_dentry(std::string& dname);
+
+ // -- dirtyness --
+ version_t get_version() const { return inode.version; }
+
+ version_t pre_dirty();
+ void _mark_dirty(LogSegment *ls);
+ void mark_dirty(version_t projected_dirv, LogSegment *ls);
+ void mark_clean();
+
+ void store(MDSContext *fin);
+ void _stored(int r, version_t cv, Context *fin);
+ /**
+ * Flush a CInode to disk. This includes the backtrace, the parent
+ * directory's link, and the Inode object itself (if a base directory).
+ * @pre is_auth() on both the inode and its containing directory
+ * @pre can_auth_pin()
+ * @param fin The Context to call when the flush is completed.
+ */
+ void flush(MDSContext *fin);
+ void fetch(MDSContext *fin);
+ void _fetched(bufferlist& bl, bufferlist& bl2, Context *fin);
+
+
+ void build_backtrace(int64_t pool, inode_backtrace_t& bt);
+ void store_backtrace(MDSContext *fin, int op_prio=-1);
+ void _stored_backtrace(int r, version_t v, Context *fin);
+ void fetch_backtrace(Context *fin, bufferlist *backtrace);
+protected:
+ /**
+ * Return the pool ID where we currently write backtraces for
+ * this inode (in addition to inode.old_pools)
+ *
+ * @returns a pool ID >=0
+ */
+ int64_t get_backtrace_pool() const;
+public:
+ void mark_dirty_parent(LogSegment *ls, bool dirty_pool=false);
+ void clear_dirty_parent();
+ void verify_diri_backtrace(bufferlist &bl, int err);
+ bool is_dirty_parent() { return state_test(STATE_DIRTYPARENT); }
+ bool is_dirty_pool() { return state_test(STATE_DIRTYPOOL); }
+
+ void encode_snap_blob(bufferlist &bl);
+ void decode_snap_blob(const bufferlist &bl);
+ void encode_store(bufferlist& bl, uint64_t features);
+ void decode_store(bufferlist::const_iterator& bl);
+
+ // Encode this inode for replication to rank `rep`. Asserts is_auth().
+ // Layout: the nonce returned by add_replica(rep), then the base fields,
+ // then the lock state for a replica. decode_replica() reads it back in the
+ // same order.
+ void encode_replica(mds_rank_t rep, bufferlist& bl, uint64_t features, bool need_recover) {
+ ceph_assert(is_auth());
+
+ __u32 nonce = add_replica(rep);
+ using ceph::encode;
+ encode(nonce, bl);
+
+ _encode_base(bl, features);
+ _encode_locks_state_for_replica(bl, need_recover);
+ }
+ // Counterpart of encode_replica(): read the nonce into replica_nonce, then
+ // the base fields, then the lock state (is_new is forwarded to
+ // _decode_locks_state()).
+ void decode_replica(bufferlist::const_iterator& p, bool is_new) {
+ using ceph::decode;
+ __u32 nonce;
+ decode(nonce, p);
+ replica_nonce = nonce;
+
+ _decode_base(p);
+ _decode_locks_state(p, is_new);
+ }
+
+ // -- waiting --
+protected:
+ mempool::mds_co::compact_map<frag_t, MDSContext::vec > waiting_on_dir;
+public:
+ void add_dir_waiter(frag_t fg, MDSContext *c);
+ void take_dir_waiting(frag_t fg, MDSContext::vec& ls);
+ bool is_waiting_for_dir(frag_t fg) {
+ return waiting_on_dir.count(fg);
+ }
+ void add_waiter(uint64_t tag, MDSContext *c) override;
+ void take_waiting(uint64_t tag, MDSContext::vec& ls) override;
+
+ // -- encode/decode helpers --
+ void _encode_base(bufferlist& bl, uint64_t features);
+ void _decode_base(bufferlist::const_iterator& p);
+ void _encode_locks_full(bufferlist& bl);
+ void _decode_locks_full(bufferlist::const_iterator& p);
+ void _encode_locks_state_for_replica(bufferlist& bl, bool need_recover);
+ void _encode_locks_state_for_rejoin(bufferlist& bl, int rep);
+ void _decode_locks_state(bufferlist::const_iterator& p, bool is_new);
+ void _decode_locks_rejoin(bufferlist::const_iterator& p, MDSContext::vec& waiters,
+ std::list<SimpleLock*>& eval_locks, bool survivor);
+
+ // -- import/export --
+ void encode_export(bufferlist& bl);
+ void finish_export();
+ // Undo export preparation: drop PIN_TEMPEXPORTING, assert that
+ // STATE_EXPORTINGCAPS is set, clear it, and drop PIN_EXPORTINGCAPS.
+ void abort_export() {
+ put(PIN_TEMPEXPORTING);
+ ceph_assert(state_test(STATE_EXPORTINGCAPS));
+ state_clear(STATE_EXPORTINGCAPS);
+ put(PIN_EXPORTINGCAPS);
+ }
+ void decode_import(bufferlist::const_iterator& p, LogSegment *ls);
+
+
+ // for giving to clients
+ int encode_inodestat(bufferlist& bl, Session *session, SnapRealm *realm,
+ snapid_t snapid=CEPH_NOSNAP, unsigned max_bytes=0,
+ int getattr_wants=0);
+ void encode_cap_message(const MClientCaps::ref &m, Capability *cap);
+
+
+ // -- locks --
+public:
+ static LockType versionlock_type;
+ static LockType authlock_type;
+ static LockType linklock_type;
+ static LockType dirfragtreelock_type;
+ static LockType filelock_type;
+ static LockType xattrlock_type;
+ static LockType snaplock_type;
+ static LockType nestlock_type;
+ static LockType flocklock_type;
+ static LockType policylock_type;
+
+ // FIXME not part of mempool
+ LocalLock versionlock;
+ SimpleLock authlock;
+ SimpleLock linklock;
+ ScatterLock dirfragtreelock;
+ ScatterLock filelock;
+ SimpleLock xattrlock;
+ SimpleLock snaplock;
+ ScatterLock nestlock;
+ SimpleLock flocklock;
+ SimpleLock policylock;
+
+ // Map a CEPH_LOCK_I* lock type to the matching lock member of this inode.
+ // Returns a null pointer (0) for any type not listed; note that no case
+ // here returns versionlock.
+ SimpleLock* get_lock(int type) override {
+ switch (type) {
+ case CEPH_LOCK_IFILE: return &filelock;
+ case CEPH_LOCK_IAUTH: return &authlock;
+ case CEPH_LOCK_ILINK: return &linklock;
+ case CEPH_LOCK_IDFT: return &dirfragtreelock;
+ case CEPH_LOCK_IXATTR: return &xattrlock;
+ case CEPH_LOCK_ISNAP: return &snaplock;
+ case CEPH_LOCK_INEST: return &nestlock;
+ case CEPH_LOCK_IFLOCK: return &flocklock;
+ case CEPH_LOCK_IPOLICY: return &policylock;
+ }
+ return 0;
+ }
+
+ void set_object_info(MDSCacheObjectInfo &info) override;
+ void encode_lock_state(int type, bufferlist& bl) override;
+ void decode_lock_state(int type, const bufferlist& bl) override;
+
+ void _finish_frag_update(CDir *dir, MutationRef& mut);
+
+ void clear_dirty_scattered(int type) override;
+ bool is_dirty_scattered();
+ void clear_scatter_dirty(); // on rejoin ack
+
+ void start_scatter(ScatterLock *lock);
+ void finish_scatter_update(ScatterLock *lock, CDir *dir,
+ version_t inode_version, version_t dir_accounted_version);
+ void finish_scatter_gather_update(int type);
+ void finish_scatter_gather_update_accounted(int type, MutationRef& mut, EMetaBlob *metablob);
+
+ // -- snap --
+ void open_snaprealm(bool no_split=false);
+ void close_snaprealm(bool no_join=false);
+ SnapRealm *find_snaprealm() const;
+ void encode_snap(bufferlist& bl);
+ void decode_snap(bufferlist::const_iterator& p);
+
+ // -- caps -- (new)
+ // client caps
+ client_t loner_cap = -1, want_loner_cap = -1;
+
+ client_t get_loner() const { return loner_cap; }
+ client_t get_wanted_loner() const { return want_loner_cap; }
+
+ // this is the loner state our locks should aim for
+ // The loner our locks should aim for: the current loner when it matches
+ // the wanted loner, otherwise -1.
+ client_t get_target_loner() const {
+ return loner_cap == want_loner_cap ? loner_cap : client_t(-1);
+ }
+
+ client_t calc_ideal_loner();
+ void set_loner_cap(client_t l);
+ bool choose_ideal_loner();
+ bool try_set_loner();
+ bool try_drop_loner();
+
+ // choose new lock state during recovery, based on issued caps
+ void choose_lock_state(SimpleLock *lock, int allissued);
+ void choose_lock_states(int dirty_caps);
+
+ // Number of client caps whose is_stale() is false.
+ int count_nonstale_caps() {
+ int nonstale = 0;
+ for (auto it = client_caps.begin(); it != client_caps.end(); ++it) {
+ if (it->second.is_stale())
+ continue;
+ ++nonstale;
+ }
+ return nonstale;
+ }
+ // True once a second non-stale client cap is found; stops scanning there.
+ bool multiple_nonstale_caps() {
+ bool seen_one = false;
+ for (const auto &entry : client_caps) {
+ if (entry.second.is_stale())
+ continue;
+ if (seen_one)
+ return true;
+ seen_one = true;
+ }
+ return false;
+ }
+
+ bool is_any_caps() { return !client_caps.empty(); }
+ // True if at least one client cap is not stale. Returns at the first match
+ // instead of counting every cap; the result is the same as
+ // count_nonstale_caps() != 0.
+ bool is_any_nonstale_caps() {
+ for (const auto &p : client_caps) {
+ if (!p.second.is_stale())
+ return true;
+ }
+ return false;
+ }
+
+ const mempool::mds_co::compact_map<int32_t,int32_t>& get_mds_caps_wanted() const { return mds_caps_wanted; }
+ void set_mds_caps_wanted(mempool::mds_co::compact_map<int32_t,int32_t>& m);
+ void set_mds_caps_wanted(mds_rank_t mds, int32_t wanted);
+
+ const mempool_cap_map& get_client_caps() const { return client_caps; }
+ // Capability held for `client`, or a null pointer if there is none.
+ Capability *get_client_cap(client_t client) {
+ auto it = client_caps.find(client);
+ return it == client_caps.end() ? nullptr : &it->second;
+ }
+ // pending() of the client's capability, or 0 when the client has none.
+ int get_client_cap_pending(client_t client) const {
+ auto it = client_caps.find(client);
+ if (it == client_caps.end())
+ return 0;
+ return it->second.pending();
+ }
+
+ int get_num_caps_wanted() const { return num_caps_wanted; }
+ void adjust_num_caps_wanted(int d);
+
+ Capability *add_client_cap(client_t client, Session *session, SnapRealm *conrealm=0);
+ void remove_client_cap(client_t client);
+ void move_to_realm(SnapRealm *realm);
+
+ Capability *reconnect_cap(client_t client, const cap_reconnect_t& icr, Session *session);
+ void clear_client_caps_after_export();
+ void export_client_caps(std::map<client_t,Capability::Export>& cl);
+
+ // caps allowed
+ int get_caps_liked() const;
+ int get_caps_allowed_ever() const;
+ int get_caps_allowed_by_type(int type) const;
+ int get_caps_careful() const;
+ int get_xlocker_mask(client_t client) const;
+ int get_caps_allowed_for_client(Session *s, Capability *cap, mempool_inode *file_i) const;
+
+ // caps issued, wanted
+ int get_caps_issued(int *ploner = 0, int *pother = 0, int *pxlocker = 0,
+ int shift = 0, int mask = -1);
+ bool is_any_caps_wanted() const;
+ int get_caps_wanted(int *ploner = 0, int *pother = 0, int shift = 0, int mask = -1) const;
+ bool issued_caps_need_gather(SimpleLock *lock);
+
+ // -- authority --
+ mds_authority_t authority() const override;
+
+ // -- auth pins --
+ bool can_auth_pin(int *err_ret=nullptr) const override;
+ void auth_pin(void *by) override;
+ void auth_unpin(void *by) override;
+
+ // -- freeze --
+ bool is_freezing_inode() const { return state_test(STATE_FREEZING); }
+ bool is_frozen_inode() const { return state_test(STATE_FROZEN); }
+ bool is_frozen_auth_pin() const { return state_test(STATE_FROZENAUTHPIN); }
+ bool is_frozen() const override;
+ bool is_frozen_dir() const;
+ bool is_freezing() const override;
+
+ /* Freeze the inode. auth_pin_allowance lets the caller account for any
+ * auth_pins it is itself holding/responsible for. */
+ bool freeze_inode(int auth_pin_allowance=0);
+ void unfreeze_inode(MDSContext::vec& finished);
+ void unfreeze_inode();
+
+ void freeze_auth_pin();
+ void unfreeze_auth_pin();
+
+ // -- reference counting --
+ // Log the offending pin `by` (with ref_map when MDS_REF_SET is defined),
+ // then assert that the per-pin count (MDS_REF_SET only) and the total ref
+ // count are positive.
+ void bad_put(int by) override {
+ generic_dout(0) << " bad put " << *this << " by " << by << " " << pin_name(by) << " was " << ref
+#ifdef MDS_REF_SET
+ << " (" << ref_map << ")"
+#endif
+ << dendl;
+#ifdef MDS_REF_SET
+ ceph_assert(ref_map[by] > 0);
+#endif
+ ceph_assert(ref > 0);
+ }
+ // Log the offending pin `by` (with ref_map when MDS_REF_SET is defined).
+ // Only the MDS_REF_SET build asserts anything: the per-pin count must not
+ // be negative.
+ void bad_get(int by) override {
+ generic_dout(0) << " bad get " << *this << " by " << by << " " << pin_name(by) << " was " << ref
+#ifdef MDS_REF_SET
+ << " (" << ref_map << ")"
+#endif
+ << dendl;
+#ifdef MDS_REF_SET
+ ceph_assert(ref_map[by] >= 0);
+#endif
+ }
+ void first_get() override;
+ void last_put() override;
+ void _put() override;
+
+
+ // -- hierarchy stuff --
+public:
+ // Set the primary parent dentry. Asserts that none is set yet, unless the
+ // mds_hack_allow_loading_invalid_metadata option is true.
+ void set_primary_parent(CDentry *p) {
+ ceph_assert(parent == 0 ||
+ g_conf().get_val<bool>("mds_hack_allow_loading_invalid_metadata"));
+ parent = p;
+ }
+ void remove_primary_parent(CDentry *dn) {
+ ceph_assert(dn == parent);
+ parent = 0;
+ }
+ void add_remote_parent(CDentry *p);
+ void remove_remote_parent(CDentry *p);
+ int num_remote_parents() {
+ return remote_parents.size();
+ }
+
+ void push_projected_parent(CDentry *dn) {
+ projected_parent.push_back(dn);
+ }
+ // Make the oldest projected parent (front of the list) the primary parent
+ // and remove it from the list. Asserts that the list is not empty.
+ void pop_projected_parent() {
+ ceph_assert(projected_parent.size());
+ parent = projected_parent.front();
+ projected_parent.pop_front();
+ }
+
+public:
+ void maybe_export_pin(bool update=false);
+ void set_export_pin(mds_rank_t rank);
+ mds_rank_t get_export_pin(bool inherit=true) const;
+ bool is_exportable(mds_rank_t dest) const;
+
+ void print(ostream& out) override;
+ void dump(Formatter *f, int flags = DUMP_DEFAULT) const;
+
+ /**
+ * @defgroup Scrubbing and fsck
+ * @{
+ */
+
+ /**
+ * Report the results of validation against a particular inode.
+ * Each member is a member_status<T>.
+ * <member>.checked represents if validation was performed against the member.
+ * <member>.passed represents if the member passed validation.
+ * performed_validation is set to true if the validation was actually
+ * run. It might not be run if, for instance, the inode is marked as dirty.
+ * passed_validation is set to true if everything that was checked
+ * passed its validation.
+ */
+ // Result container filled in by validate_disk_state().
+ struct validated_data {
+ // Per-item validation record; all flags start false and the read return
+ // value starts at 0.
+ template<typename T>struct member_status {
+ bool checked = false;
+ bool passed = false;
+ bool repaired = false;
+ int ondisk_read_retval = 0;
+ T ondisk_value; // presumably the value read from disk -- confirm in CInode.cc
+ T memory_value; // presumably the in-memory value it is compared with -- confirm
+ std::stringstream error_str;
+ };
+
+ bool performed_validation = false;
+ bool passed_validation = false;
+
+ // dirstat and rstat bundled so they can be tracked as one member_status.
+ struct raw_stats_t {
+ frag_info_t dirstat;
+ nest_info_t rstat;
+ };
+
+ member_status<inode_backtrace_t> backtrace;
+ member_status<mempool_inode> inode; // XXX should not be in mempool; wait for pmr
+ member_status<raw_stats_t> raw_stats;
+
+ validated_data() {}
+
+ void dump(Formatter *f) const;
+
+ bool all_damage_repaired() const;
+ };
+
+ /**
+ * Validate that the on-disk state of an inode matches what
+ * we expect from our memory state. Currently this checks that:
+ * 1) The backtrace associated with the file data exists and is correct
+ * 2) For directories, the actual inode metadata matches our memory state,
+ * 3) For directories, the rstats match
+ *
+ * @param results A freshly-created validated_data struct, with values set
+ * as described in the struct documentation.
+ * @param mdr The request to be responded to upon the completion of the
+ * validation (or NULL). NOTE(review): the declaration below takes no mdr
+ * parameter.
+ * @param fin Context to call back on completion (or NULL)
+ */
+ void validate_disk_state(validated_data *results,
+ MDSContext *fin);
+ static void dump_validation_results(const validated_data& results,
+ Formatter *f);
+private:
+ bool _validate_disk_state(class ValidationContinuation *c,
+ int rval, int stage);
+ friend class ValidationContinuation;
+ /** @} Scrubbing and fsck */
+};
+
+ostream& operator<<(ostream& out, const CInode::scrub_stamp_info_t& si);
+
+#undef dout_context
+#endif
diff --git a/src/mds/CMakeLists.txt b/src/mds/CMakeLists.txt
new file mode 100644
index 00000000..025dbdd7
--- /dev/null
+++ b/src/mds/CMakeLists.txt
@@ -0,0 +1,47 @@
+# Source files compiled into the `mds` library. The last three entries are
+# taken from outside this directory (src/common and src/osdc).
+set(mds_srcs
+ Capability.cc
+ MDSDaemon.cc
+ MDSRank.cc
+ Beacon.cc
+ flock.cc
+ locks.c
+ journal.cc
+ Server.cc
+ Mutation.cc
+ MDCache.cc
+ RecoveryQueue.cc
+ StrayManager.cc
+ PurgeQueue.cc
+ Locker.cc
+ Migrator.cc
+ MDBalancer.cc
+ CDentry.cc
+ CDir.cc
+ CInode.cc
+ LogEvent.cc
+ MDSTable.cc
+ InoTable.cc
+ JournalPointer.cc
+ MDSTableClient.cc
+ MDSTableServer.cc
+ ScrubStack.cc
+ DamageTable.cc
+ SimpleLock.cc
+ SnapRealm.cc
+ SnapServer.cc
+ SnapClient.cc
+ snap.cc
+ SessionMap.cc
+ MDSContext.cc
+ MDSAuthCaps.cc
+ MDLog.cc
+ MDSCacheObject.cc
+ Mantle.cc
+ Anchor.cc
+ OpenFileTable.cc
+ ${CMAKE_SOURCE_DIR}/src/common/TrackedOp.cc
+ ${CMAKE_SOURCE_DIR}/src/common/MemoryModel.cc
+ ${CMAKE_SOURCE_DIR}/src/osdc/Journaler.cc)
+# Build the sources above as a static library and link its private
+# dependencies.
+add_library(mds STATIC ${mds_srcs})
+target_link_libraries(mds PRIVATE
+ heap_profiler cpu_profiler osdc liblua)
diff --git a/src/mds/Capability.cc b/src/mds/Capability.cc
new file mode 100644
index 00000000..1ce1803b
--- /dev/null
+++ b/src/mds/Capability.cc
@@ -0,0 +1,299 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#include "Capability.h"
+#include "CInode.h"
+#include "SessionMap.h"
+
+#include "common/Formatter.h"
+
+
+/*
+ * Capability::Export
+ */
+
+// Serialize the exported capability state. The field order is the wire
+// format and must match Export::decode(); `state` is the last field, and
+// decode() reads it only for struct_v >= 3.
+void Capability::Export::encode(bufferlist &bl) const
+{
+ ENCODE_START(3, 2, bl);
+ encode(cap_id, bl);
+ encode(wanted, bl);
+ encode(issued, bl);
+ encode(pending, bl);
+ encode(client_follows, bl);
+ encode(seq, bl);
+ encode(mseq, bl);
+ encode(last_issue_stamp, bl);
+ encode(state, bl);
+ ENCODE_FINISH(bl);
+}
+
+// Deserialize in the same field order as Export::encode(). `state` is read
+// only when the encoded struct version is at least 3; otherwise it is left
+// untouched.
+void Capability::Export::decode(bufferlist::const_iterator &p)
+{
+ DECODE_START_LEGACY_COMPAT_LEN(3, 2, 2, p);
+ decode(cap_id, p);
+ decode(wanted, p);
+ decode(issued, p);
+ decode(pending, p);
+ decode(client_follows, p);
+ decode(seq, p);
+ decode(mseq, p);
+ decode(last_issue_stamp, p);
+ if (struct_v >= 3)
+ decode(state, p);
+ DECODE_FINISH(p);
+}
+
+// Write the exported fields to the Formatter under the keys shown below.
+// NOTE(review): `state` is encoded/decoded by this struct but is not dumped
+// here -- confirm whether that is intentional.
+void Capability::Export::dump(Formatter *f) const
+{
+ f->dump_unsigned("cap_id", cap_id);
+ f->dump_unsigned("wanted", wanted);
+ f->dump_unsigned("issued", issued);
+ f->dump_unsigned("pending", pending);
+ f->dump_unsigned("client_follows", client_follows);
+ f->dump_unsigned("seq", seq);
+ f->dump_unsigned("migrate_seq", mseq);
+ f->dump_stream("last_issue_stamp") << last_issue_stamp;
+}
+
+// Append two heap-allocated instances to ls: a default-constructed one and
+// one with distinct field values. The list receives raw pointers.
+void Capability::Export::generate_test_instances(list<Capability::Export*>& ls)
+{
+ // default instance
+ ls.push_back(new Export);
+
+ // populated instance
+ Export *sample = new Export;
+ ls.push_back(sample);
+ sample->wanted = 1;
+ sample->issued = 2;
+ sample->pending = 3;
+ sample->client_follows = 4;
+ sample->mseq = 5;
+ sample->last_issue_stamp = utime_t(6, 7);
+}
+
+// Serialize cap_id, issue_seq and mseq, in that order; Import::decode()
+// reads them back in the same order.
+void Capability::Import::encode(bufferlist &bl) const
+{
+ ENCODE_START(1, 1, bl);
+ encode(cap_id, bl);
+ encode(issue_seq, bl);
+ encode(mseq, bl);
+ ENCODE_FINISH(bl);
+}
+
+// Deserialize cap_id, issue_seq and mseq in the order Import::encode()
+// wrote them.
+void Capability::Import::decode(bufferlist::const_iterator &bl)
+{
+ DECODE_START(1, bl);
+ decode(cap_id, bl);
+ decode(issue_seq, bl);
+ decode(mseq, bl);
+ DECODE_FINISH(bl);
+}
+
+// Write the three fields to the Formatter; mseq appears under the key
+// "migrate_seq".
+void Capability::Import::dump(Formatter *f) const
+{
+ f->dump_unsigned("cap_id", cap_id);
+ f->dump_unsigned("issue_seq", issue_seq);
+ f->dump_unsigned("migrate_seq", mseq);
+}
+
+/*
+ * Capability::revoke_info
+ */
+
+// Serialize before, seq and last_issue, in that order; revoke_info::decode()
+// reads them back in the same order.
+void Capability::revoke_info::encode(bufferlist& bl) const
+{
+ ENCODE_START(2, 2, bl)
+ encode(before, bl);
+ encode(seq, bl);
+ encode(last_issue, bl);
+ ENCODE_FINISH(bl);
+}
+
+// Deserialize before, seq and last_issue in the order revoke_info::encode()
+// wrote them.
+void Capability::revoke_info::decode(bufferlist::const_iterator& bl)
+{
+ DECODE_START_LEGACY_COMPAT_LEN(2, 2, 2, bl);
+ decode(before, bl);
+ decode(seq, bl);
+ decode(last_issue, bl);
+ DECODE_FINISH(bl);
+}
+
+// Write before, seq and last_issue to the Formatter as unsigned values.
+void Capability::revoke_info::dump(Formatter *f) const
+{
+ f->dump_unsigned("before", before);
+ f->dump_unsigned("seq", seq);
+ f->dump_unsigned("last_issue", last_issue);
+}
+
+// Append two heap-allocated instances to ls: a default-constructed one and
+// one with before/seq/last_issue set to 1/2/3.
+void Capability::revoke_info::generate_test_instances(list<Capability::revoke_info*>& ls)
+{
+ // default instance
+ ls.push_back(new revoke_info);
+
+ // populated instance
+ revoke_info *sample = new revoke_info;
+ ls.push_back(sample);
+ sample->before = 1;
+ sample->seq = 2;
+ sample->last_issue = 3;
+}
+
+
+/*
+ * Capability
+ */
+/**
+ * Construct a capability on inode @p i for session @p s with id @p id.
+ * All counters and masks listed in the initializer list start at zero.
+ *
+ * With a non-null session the cap is placed via touch_cap_bottom(), takes
+ * the session's current cap generation (decremented by one for a stale
+ * session so that is_valid() fails), and records in `state` which client
+ * features the connection lacks. With a null session none of that runs;
+ * cap_gen is not assigned in this constructor in that case.
+ */
+Capability::Capability(CInode *i, Session *s, uint64_t id) :
+ client_follows(0),
+ client_xattr_version(0), client_inline_version(0),
+ last_rbytes(0), last_rsize(0),
+ item_session_caps(this), item_snaprealm_caps(this),
+ item_revoking_caps(this), item_client_revoking_caps(this),
+ inode(i), session(s),
+ cap_id(id), _wanted(0), num_revoke_warnings(0),
+ _pending(0), _issued(0), last_sent(0), last_issue(0), mseq(0),
+ suppress(0), state(0)
+{
+ if (session) {
+ session->touch_cap_bottom(this);
+ cap_gen = session->get_cap_gen();
+ if (session->is_stale())
+ --cap_gen; // not valid
+
+ auto& conn = session->get_connection();
+ if (conn) {
+ // each STATE_NO* bit is set when the connection lacks the feature
+ if (!conn->has_feature(CEPH_FEATURE_MDS_INLINE_DATA))
+ state |= STATE_NOINLINE;
+ if (!conn->has_feature(CEPH_FEATURE_FS_FILE_LAYOUT_V2))
+ state |= STATE_NOPOOLNS;
+ if (!conn->has_feature(CEPH_FEATURE_MDS_QUOTA))
+ state |= STATE_NOQUOTA;
+ }
+ }
+}
+
+// Client owning this cap, or client_t(-1) when there is no session.
+client_t Capability::get_client() const
+{
+ if (!session)
+ return client_t(-1);
+ return session->get_client();
+}
+
+// Whether the owning session is stale; false when there is no session.
+bool Capability::is_stale() const
+{
+ if (!session)
+ return false;
+ return session->is_stale();
+}
+
+// Valid when there is no session, or when the session's cap generation
+// equals the generation recorded in this cap.
+bool Capability::is_valid() const
+{
+ if (!session)
+ return true;
+ return session->get_cap_gen() == cap_gen;
+}
+
+// Bring cap_gen up to the session's current generation if it is out of
+// date. is_valid() is true for a null session, so session is non-null
+// whenever the assignment runs.
+void Capability::revalidate()
+{
+ if (is_valid())
+ return;
+ cap_gen = session->get_cap_gen();
+}
+
+// Set STATE_NOTABLE and notify the session via touch_cap().
+// NOTE(review): session is dereferenced unconditionally here, unlike
+// get_client()/is_stale()/is_valid(), which tolerate a null session.
+void Capability::mark_notable()
+{
+ state |= STATE_NOTABLE;
+ session->touch_cap(this);
+}
+
+// Clear STATE_NOTABLE, but only when nothing keeps the cap notable: issued
+// equals pending, the cap is not client-writeable, and the current _wanted
+// is not notable. The checks run in that order and stop at the first that
+// fails.
+void Capability::maybe_clear_notable()
+{
+ if (_issued != _pending)
+ return;
+ if (is_clientwriteable())
+ return;
+ if (is_wanted_notable(_wanted))
+ return;
+
+ ceph_assert(is_notable());
+ state &= ~STATE_NOTABLE;
+ session->touch_cap_bottom(this);
+}
+
+// Update the wanted mask to w. When the cap has an inode, first adjust the
+// inode's wanted-caps count on a zero <-> non-zero transition and update the
+// notable state on a notable <-> not-notable transition. _wanted itself is
+// assigned last.
+void Capability::set_wanted(int w) {
+ CInode *in = get_inode();
+ if (in) {
+ if (!_wanted && w) {
+ in->adjust_num_caps_wanted(1);
+ } else if (_wanted && !w) {
+ in->adjust_num_caps_wanted(-1);
+ }
+ if (!is_wanted_notable(_wanted) && is_wanted_notable(w)) {
+ if (!is_notable())
+ mark_notable();
+ } else if (is_wanted_notable(_wanted) && !is_wanted_notable(w)) {
+ // NOTE(review): _wanted still holds the old (notable) value here, and
+ // maybe_clear_notable() tests is_wanted_notable(_wanted), so this call
+ // looks unable to clear the flag -- confirm whether that is intended.
+ maybe_clear_notable();
+ }
+ }
+ _wanted = w;
+}
+
+// Serialize last_sent, last_issue_stamp, _wanted, _pending and _revokes, in
+// that order; Capability::decode() reads them back in the same order.
+void Capability::encode(bufferlist& bl) const
+{
+ ENCODE_START(2, 2, bl)
+ encode(last_sent, bl);
+ encode(last_issue_stamp, bl);
+
+ encode(_wanted, bl);
+ encode(_pending, bl);
+ encode(_revokes, bl);
+ ENCODE_FINISH(bl);
+}
+
+// Deserialize in the order Capability::encode() wrote. The wanted mask goes
+// through set_wanted() rather than direct assignment, so its side effects
+// run before _pending and _revokes are read. calc_issued() is called once
+// all fields are decoded.
+void Capability::decode(bufferlist::const_iterator &bl)
+{
+ DECODE_START_LEGACY_COMPAT_LEN(2, 2, 2, bl)
+ decode(last_sent, bl);
+ decode(last_issue_stamp, bl);
+
+ __u32 tmp_wanted;
+ decode(tmp_wanted, bl);
+ set_wanted(tmp_wanted);
+ decode(_pending, bl);
+ decode(_revokes, bl);
+ DECODE_FINISH(bl);
+
+ calc_issued();
+}
+
+/**
+ * Write this capability's fields to a Formatter: last_sent,
+ * last_issue_stamp, wanted, pending, and a "revokes" array with one
+ * "revoke" object per entry of _revokes.
+ *
+ * last_issue_stamp is a utime_t, so it is streamed with dump_stream, the
+ * same way Capability::Export::dump emits it, instead of being forced
+ * through dump_unsigned's integer parameter.
+ *
+ * @param f Formatter that receives the output.
+ */
+void Capability::dump(Formatter *f) const
+{
+ f->dump_unsigned("last_sent", last_sent);
+ f->dump_stream("last_issue_stamp") << last_issue_stamp;
+ f->dump_unsigned("wanted", _wanted);
+ f->dump_unsigned("pending", _pending);
+
+ f->open_array_section("revokes");
+ for (const auto &r : _revokes) {
+ f->open_object_section("revoke");
+ r.dump(f);
+ f->close_section();
+ }
+ f->close_section();
+}
+
+// Append two heap-allocated instances to ls: a default-constructed one and
+// one with distinct field values plus two revoke entries.
+void Capability::generate_test_instances(list<Capability*>& ls)
+{
+ // default instance
+ ls.push_back(new Capability);
+
+ // populated instance
+ Capability *sample = new Capability;
+ ls.push_back(sample);
+ sample->last_sent = 11;
+ sample->last_issue_stamp = utime_t(12, 13);
+ sample->set_wanted(14);
+ sample->_pending = 15;
+
+ auto &first_revoke = sample->_revokes.emplace_back();
+ first_revoke.before = 16;
+ first_revoke.seq = 17;
+ first_revoke.last_issue = 18;
+
+ auto &second_revoke = sample->_revokes.emplace_back();
+ second_revoke.before = 19;
+ second_revoke.seq = 20;
+ second_revoke.last_issue = 21;
+}
+
+MEMPOOL_DEFINE_OBJECT_FACTORY(Capability, co_cap, mds_co);
diff --git a/src/mds/Capability.h b/src/mds/Capability.h
new file mode 100644
index 00000000..a54f013c
--- /dev/null
+++ b/src/mds/Capability.h
@@ -0,0 +1,406 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+
+#ifndef CEPH_CAPABILITY_H
+#define CEPH_CAPABILITY_H
+
+#include "include/buffer_fwd.h"
+#include "include/counter.h"
+#include "include/mempool.h"
+#include "include/xlist.h"
+
+#include "common/config.h"
+
+#include "mdstypes.h"
+
+
+/*
+
+ Capability protocol notes.
+
+- two types of cap events from mds -> client:
+ - cap "issue" in a MClientReply, or an MClientCaps IMPORT op.
+ - cap "update" (revocation or grant) .. an MClientCaps message.
+- if client has cap, the mds should have it too.
+
+- if client has no dirty data, it can release it without waiting for an mds ack.
+ - client may thus get a cap _update_ and not have the cap. ignore it.
+
+- mds should track seq of last issue. any release
+ attempt will only succeed if the client has seen the latest.
+
+- an UPDATE updates the client's issued caps, wanted, etc. It may also flush dirty metadata.
+ - 'caps' are which caps the client retains.
+ - if 0, client wishes to release the cap
+ - 'wanted' is which caps the client wants.
+ - 'dirty' is which metadata is to be written.
+ - client gets a FLUSH_ACK with matching dirty flags indicating which caps were written.
+
+- a FLUSH_ACK acks a FLUSH.
+ - 'dirty' is the _original_ FLUSH's dirty (i.e., which metadata was written back)
+ - 'seq' is the _original_ FLUSH's seq.
+ - 'caps' is the _original_ FLUSH's caps (not actually important)
+ - client can conclude that (dirty & ~caps) bits were successfully cleaned.
+
+- a FLUSHSNAP flushes snapshot metadata.
+ - 'dirty' indicates which caps were dirty, if any.
+ - mds writes metadata. if dirty!=0, replies with FLUSHSNAP_ACK.
+
+ */
+
+class CInode;
+class Session;
+
+namespace ceph {
+ class Formatter;
+}
+
+// Per-(inode, session) capability record: tracks which cap bits are wanted,
+// pending and issued, plus a history of not-yet-acknowledged revocations
+// and the sequence numbers used to match client acknowledgements.
+// Copying is disabled.
+class Capability : public Counter<Capability> {
+public:
+ MEMPOOL_CLASS_HELPERS();
+
+ // Snapshot of cap state produced by make_export() and consumed by
+ // merge(const Export&, bool).
+ struct Export {
+ int64_t cap_id = 0;
+ int32_t wanted = 0;
+ int32_t issued = 0;
+ int32_t pending = 0;
+ snapid_t client_follows;
+ ceph_seq_t seq = 0;
+ ceph_seq_t mseq = 0;
+ utime_t last_issue_stamp;
+ uint32_t state = 0;
+ Export() {}
+ Export(int64_t id, int w, int i, int p, snapid_t cf,
+ ceph_seq_t s, ceph_seq_t m, utime_t lis, unsigned st) :
+ cap_id(id), wanted(w), issued(i), pending(p), client_follows(cf),
+ seq(s), mseq(m), last_issue_stamp(lis), state(st) {}
+ void encode(bufferlist &bl) const;
+ void decode(bufferlist::const_iterator &p);
+ void dump(Formatter *f) const;
+ static void generate_test_instances(list<Export*>& ls);
+ };
+ // Encodable (cap_id, issue_seq, mseq) triple.
+ struct Import {
+ int64_t cap_id;
+ ceph_seq_t issue_seq;
+ ceph_seq_t mseq;
+ Import() : cap_id(0), issue_seq(0), mseq(0) {}
+ Import(int64_t i, ceph_seq_t s, ceph_seq_t m) : cap_id(i), issue_seq(s), mseq(m) {}
+ void encode(bufferlist &bl) const;
+ void decode(bufferlist::const_iterator &p);
+ void dump(Formatter *f) const;
+ };
+ // One entry of the revocation history: the pending bits before the
+ // revocation ('before') and the last_sent / last_issue values at that time.
+ struct revoke_info {
+ __u32 before;
+ ceph_seq_t seq, last_issue;
+ revoke_info() : before(0), seq(0), last_issue(0) {}
+ revoke_info(__u32 b, ceph_seq_t s, ceph_seq_t li) : before(b), seq(s), last_issue(li) {}
+ void encode(bufferlist& bl) const;
+ void decode(bufferlist::const_iterator& bl);
+ void dump(Formatter *f) const;
+ static void generate_test_instances(list<revoke_info*>& ls);
+ };
+
+ // Bit flags stored in 'state'.
+ const static unsigned STATE_NOTABLE = (1<<0);
+ const static unsigned STATE_NEW = (1<<1);
+ const static unsigned STATE_IMPORTING = (1<<2);
+ const static unsigned STATE_NEEDSNAPFLUSH = (1<<3);
+ const static unsigned STATE_CLIENTWRITEABLE = (1<<4);
+ const static unsigned STATE_NOINLINE = (1<<5);
+ const static unsigned STATE_NOPOOLNS = (1<<6);
+ const static unsigned STATE_NOQUOTA = (1<<7);
+
+ // State bits that merge(const Export&, bool) copies over from an Export.
+ const static unsigned MASK_STATE_EXPORTED =
+ (STATE_CLIENTWRITEABLE | STATE_NOINLINE | STATE_NOPOOLNS | STATE_NOQUOTA);
+
+ Capability(CInode *i=nullptr, Session *s=nullptr, uint64_t id=0);
+ Capability(const Capability& other) = delete;
+
+ const Capability& operator=(const Capability& other) = delete;
+
+ int pending() const {
+ return _pending;
+ }
+ int issued() const {
+ return _issued;
+ }
+ // Bits that are still issued but no longer pending.
+ int revoking() const {
+ return _issued & ~_pending;
+ }
+ // Make 'c' the pending cap set.  If that drops bits from _pending, the
+ // previous pending set is remembered in _revokes.  Always bumps last_sent
+ // and returns the new value.  If reval is set, revalidate() runs first.
+ ceph_seq_t issue(unsigned c, bool reval=false) {
+ if (reval)
+ revalidate();
+
+ if (_pending & ~c) {
+ // revoking (and maybe adding) bits. note caps prior to this revocation
+ _revokes.emplace_back(_pending, last_sent, last_issue);
+ _pending = c;
+ _issued |= c;
+ if (!is_notable())
+ mark_notable();
+ } else if (~_pending & c) {
+ // adding bits only. remove obsolete revocations?
+ _pending |= c;
+ _issued |= c;
+ // drop old _revokes with no bits we don't have
+ while (!_revokes.empty() &&
+ (_revokes.back().before & ~_pending) == 0)
+ _revokes.pop_back();
+ } else {
+ // no change.
+ ceph_assert(_pending == c);
+ }
+ //last_issue =
+ inc_last_seq();
+ return last_sent;
+ }
+ // Add the bits in 'c' to both _pending and _issued without recording a
+ // revocation, clear STATE_NEW, bump last_sent and return it.
+ ceph_seq_t issue_norevoke(unsigned c, bool reval=false) {
+ if (reval)
+ revalidate();
+
+ _pending |= c;
+ _issued |= c;
+ clear_new();
+
+ inc_last_seq();
+ return last_sent;
+ }
+ // Process an acknowledgement carrying sequence 'seq' and cap set 'caps'.
+ // If seq matches last_sent the revocation history is dropped and _issued
+ // becomes 'caps'; otherwise only history entries older than seq are
+ // dropped and _issued is recomputed.  When a revocation completes
+ // (_issued == _pending) the cap is removed from the revoking lists.
+ void confirm_receipt(ceph_seq_t seq, unsigned caps) {
+ bool was_revoking = (_issued & ~_pending);
+ if (seq == last_sent) {
+ _revokes.clear();
+ _issued = caps;
+ // don't add bits
+ _pending &= caps;
+ } else {
+ // can i forget any revocations?
+ while (!_revokes.empty() && _revokes.front().seq < seq)
+ _revokes.pop_front();
+ if (!_revokes.empty()) {
+ if (_revokes.front().seq == seq)
+ _revokes.begin()->before = caps;
+ calc_issued();
+ } else {
+ // seq < last_sent
+ _issued = caps | _pending;
+ }
+ }
+
+ if (was_revoking && _issued == _pending) {
+ item_revoking_caps.remove_myself();
+ item_client_revoking_caps.remove_myself();
+ maybe_clear_notable();
+ }
+ //check_rdcaps_list();
+ }
+ // we may get a release racing with revocations, which means our revokes will be ignored
+ // by the client. clean them out of our _revokes history so we don't wait on them.
+ void clean_revoke_from(ceph_seq_t li) {
+ bool changed = false;
+ while (!_revokes.empty() && _revokes.front().last_issue <= li) {
+ _revokes.pop_front();
+ changed = true;
+ }
+ if (changed) {
+ bool was_revoking = (_issued & ~_pending);
+ calc_issued();
+ if (was_revoking && _issued == _pending) {
+ item_revoking_caps.remove_myself();
+ item_client_revoking_caps.remove_myself();
+ maybe_clear_notable();
+ }
+ }
+ }
+ ceph_seq_t get_mseq() const { return mseq; }
+ void inc_mseq() { mseq++; }
+
+ utime_t get_last_issue_stamp() const { return last_issue_stamp; }
+ utime_t get_last_revoke_stamp() const { return last_revoke_stamp; }
+
+ // Record the current last_sent as the sequence of the last issue.
+ void set_last_issue() { last_issue = last_sent; }
+ void set_last_issue_stamp(utime_t t) { last_issue_stamp = t; }
+ void set_last_revoke_stamp(utime_t t) { last_revoke_stamp = t; }
+ void reset_num_revoke_warnings() { num_revoke_warnings = 0; }
+ void inc_num_revoke_warnings() { ++num_revoke_warnings; }
+ unsigned get_num_revoke_warnings() const { return num_revoke_warnings; }
+
+ void set_cap_id(uint64_t i) { cap_id = i; }
+ uint64_t get_cap_id() const { return cap_id; }
+
+ //ceph_seq_t get_last_issue() { return last_issue; }
+
+ // 'suppress' is a plain counter; is_suppress() is true while it is > 0.
+ bool is_suppress() const { return suppress > 0; }
+ void inc_suppress() { suppress++; }
+ void dec_suppress() { suppress--; }
+
+ // True if 'wanted' contains any write cap, FILE_WR or FILE_RD bit.
+ static bool is_wanted_notable(int wanted) {
+ return wanted & (CEPH_CAP_ANY_WR|CEPH_CAP_FILE_WR|CEPH_CAP_FILE_RD);
+ }
+ bool is_notable() const { return state & STATE_NOTABLE; }
+
+ bool is_stale() const;
+ bool is_valid() const;
+ bool is_new() const { return state & STATE_NEW; }
+ void mark_new() { state |= STATE_NEW; }
+ void clear_new() { state &= ~STATE_NEW; }
+ bool is_importing() const { return state & STATE_IMPORTING; }
+ void mark_importing() { state |= STATE_IMPORTING; }
+ void clear_importing() { state &= ~STATE_IMPORTING; }
+ bool need_snapflush() const { return state & STATE_NEEDSNAPFLUSH; }
+ void mark_needsnapflush() { state |= STATE_NEEDSNAPFLUSH; }
+ void clear_needsnapflush() { state &= ~STATE_NEEDSNAPFLUSH; }
+
+ bool is_clientwriteable() const { return state & STATE_CLIENTWRITEABLE; }
+ // Setting CLIENTWRITEABLE also marks the cap notable.
+ void mark_clientwriteable() {
+ if (!is_clientwriteable()) {
+ state |= STATE_CLIENTWRITEABLE;
+ if (!is_notable())
+ mark_notable();
+ }
+ }
+ // Clearing CLIENTWRITEABLE gives maybe_clear_notable() a chance to drop
+ // the notable flag.
+ void clear_clientwriteable() {
+ if (is_clientwriteable()) {
+ state &= ~STATE_CLIENTWRITEABLE;
+ maybe_clear_notable();
+ }
+ }
+
+ bool is_noinline() const { return state & STATE_NOINLINE; }
+ bool is_nopoolns() const { return state & STATE_NOPOOLNS; }
+ bool is_noquota() const { return state & STATE_NOQUOTA; }
+
+ CInode *get_inode() const { return inode; }
+ Session *get_session() const { return session; }
+ client_t get_client() const;
+
+ // caps this client wants to hold
+ int wanted() const { return _wanted; }
+ void set_wanted(int w);
+
+ void inc_last_seq() { last_sent++; }
+ ceph_seq_t get_last_seq() const {
+ return last_sent;
+ }
+ ceph_seq_t get_last_issue() const { return last_issue; }
+
+ void reset_seq() {
+ last_sent = 0;
+ last_issue = 0;
+ }
+
+ // -- exports --
+ // Build an Export from the current state; note the exported mseq is
+ // mseq+1, not mseq.
+ Export make_export() const {
+ return Export(cap_id, wanted(), issued(), pending(), client_follows, get_last_seq(), mseq+1, last_issue_stamp, state);
+ }
+ // Fold an Export into this cap: union the pending/issued bits via
+ // issue(), take over last_issue_stamp and client_follows, OR in the
+ // exported state bits and the wanted bits.  mseq is adopted only when
+ // auth_cap is true.
+ void merge(const Export& other, bool auth_cap) {
+ // issued + pending
+ int newpending = other.pending | pending();
+ if (other.issued & ~newpending)
+ issue(other.issued | newpending);
+ else
+ issue(newpending);
+ last_issue_stamp = other.last_issue_stamp;
+
+ client_follows = other.client_follows;
+
+ state |= other.state & MASK_STATE_EXPORTED;
+ if ((other.state & STATE_CLIENTWRITEABLE) && !is_notable())
+ mark_notable();
+
+ // wanted
+ set_wanted(wanted() | other.wanted);
+ if (auth_cap)
+ mseq = other.mseq;
+ }
+ // Variant of merge() taking only wanted/issued bit sets.
+ void merge(int otherwanted, int otherissued) {
+ // issued + pending
+ int newpending = pending();
+ if (otherissued & ~newpending)
+ issue(otherissued | newpending);
+ else
+ issue(newpending);
+
+ // wanted
+ set_wanted(wanted() | otherwanted);
+ }
+
+ // If a revocation is outstanding, act as if the latest message had been
+ // acknowledged with exactly the pending caps.
+ void revoke() {
+ if (revoking())
+ confirm_receipt(last_sent, pending());
+ }
+
+ // serializers
+ void encode(bufferlist &bl) const;
+ void decode(bufferlist::const_iterator &bl);
+ void dump(Formatter *f) const;
+ static void generate_test_instances(list<Capability*>& ls);
+
+ snapid_t client_follows;
+ version_t client_xattr_version;
+ version_t client_inline_version;
+ int64_t last_rbytes;
+ int64_t last_rsize;
+
+ // Intrusive list hooks; the two revoking hooks are unlinked when a
+ // revocation completes (see confirm_receipt / clean_revoke_from).
+ xlist<Capability*>::item item_session_caps;
+ xlist<Capability*>::item item_snaprealm_caps;
+ xlist<Capability*>::item item_revoking_caps;
+ xlist<Capability*>::item item_client_revoking_caps;
+
+private:
+ CInode *inode;
+ Session *session;
+
+ uint64_t cap_id;
+ uint32_t cap_gen;
+
+ __u32 _wanted; // what the client wants (ideally)
+
+ utime_t last_issue_stamp;
+ utime_t last_revoke_stamp;
+ unsigned num_revoke_warnings;
+
+ // track in-flight caps --------------
+ // - add new caps to _pending
+ // - track revocations in _revokes list
+ __u32 _pending, _issued;
+ mempool::mds_co::list<revoke_info> _revokes;
+
+ ceph_seq_t last_sent;
+ ceph_seq_t last_issue;
+ ceph_seq_t mseq;
+
+ int suppress;
+ unsigned state;
+
+ // Recompute _issued as _pending plus every 'before' set still present
+ // in the revocation history.
+ void calc_issued() {
+ _issued = _pending;
+ for (const auto &r : _revokes) {
+ _issued |= r.before;
+ }
+ }
+
+ void revalidate();
+
+ void mark_notable();
+ void maybe_clear_notable();
+};
+
+WRITE_CLASS_ENCODER(Capability::Export)
+WRITE_CLASS_ENCODER(Capability::Import)
+WRITE_CLASS_ENCODER(Capability::revoke_info)
+WRITE_CLASS_ENCODER(Capability)
+
+
+
+#endif
diff --git a/src/mds/DamageTable.cc b/src/mds/DamageTable.cc
new file mode 100644
index 00000000..c474b078
--- /dev/null
+++ b/src/mds/DamageTable.cc
@@ -0,0 +1,280 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#include "common/debug.h"
+
+#include "mds/CDir.h"
+
+#include "DamageTable.h"
+
+#define dout_context g_ceph_context
+#define dout_subsys ceph_subsys_mds
+#undef dout_prefix
+#define dout_prefix *_dout << "mds." << rank << ".damage " << __func__ << " "
+
+namespace {
+/**
+ * Damage entry for a whole dirfrag, identified by (ino, frag).
+ * Damage to a dirfrag implicitly covers every dentry inside it.
+ */
+class DirFragDamage : public DamageEntry
+{
+public:
+  inodeno_t ino;
+  frag_t frag;
+
+  DirFragDamage(inodeno_t damaged_ino, frag_t damaged_frag)
+    : ino(damaged_ino), frag(damaged_frag) {}
+
+  damage_entry_type_t get_type() const override { return DAMAGE_ENTRY_DIRFRAG; }
+
+  // Emit this entry as one "dir_frag_damage" object.
+  void dump(Formatter *f) const override
+  {
+    f->open_object_section("dir_frag_damage");
+    f->dump_string("damage_type", "dir_frag");
+    f->dump_int("id", id);
+    f->dump_int("ino", ino);
+    f->dump_stream("frag") << frag;
+    f->dump_string("path", path);
+    f->close_section();
+  }
+};
+
+
+/**
+ * Damage entry for a single dentry: the (dname, snap_id) pair inside the
+ * dirfrag identified by (ino, frag).
+ */
+class DentryDamage : public DamageEntry
+{
+public:
+  inodeno_t ino;
+  frag_t frag;
+  std::string dname;
+  snapid_t snap_id;
+
+  DentryDamage(inodeno_t damaged_ino, frag_t damaged_frag,
+               std::string_view name, snapid_t snap)
+    : ino(damaged_ino), frag(damaged_frag), dname(name), snap_id(snap) {}
+
+  damage_entry_type_t get_type() const override { return DAMAGE_ENTRY_DENTRY; }
+
+  // Emit this entry as one "dentry_damage" object.
+  void dump(Formatter *f) const override
+  {
+    f->open_object_section("dentry_damage");
+    f->dump_string("damage_type", "dentry");
+    f->dump_int("id", id);
+    f->dump_int("ino", ino);
+    f->dump_stream("frag") << frag;
+    f->dump_string("dname", dname);
+    f->dump_stream("snap_id") << snap_id;
+    f->dump_string("path", path);
+    f->close_section();
+  }
+};
+
+
+/**
+ * Damage entry recording that an inode could not be looked up by number.
+ */
+class BacktraceDamage : public DamageEntry
+{
+public:
+  inodeno_t ino;
+
+  BacktraceDamage(inodeno_t damaged_ino) : ino(damaged_ino) {}
+
+  damage_entry_type_t get_type() const override { return DAMAGE_ENTRY_BACKTRACE; }
+
+  // Emit this entry as one "backtrace_damage" object.
+  void dump(Formatter *f) const override
+  {
+    f->open_object_section("backtrace_damage");
+    f->dump_string("damage_type", "backtrace");
+    f->dump_int("id", id);
+    f->dump_int("ino", ino);
+    f->dump_string("path", path);
+    f->close_section();
+  }
+};
+}
+
+// Out-of-line definition of the virtual destructor; nothing to release.
+DamageEntry::~DamageEntry() = default;
+
+/**
+ * Record that dentry (dname, snap_id) in dirfrag (ino, frag) is damaged.
+ *
+ * Each distinct (dname, snap_id) within a dirfrag gets its own entry, so
+ * that is_dentry_damaged() can answer for every dentry that was reported,
+ * not only the first one seen in a dirfrag.  Reporting the same dentry
+ * again is a no-op.
+ *
+ * @param path advisory path stored on the new entry
+ * @return true if the condition is fatal (table oversized, or the dirfrag
+ *         belongs to this rank's mdsdir/stray directories), else false
+ */
+bool DamageTable::notify_dentry(
+    inodeno_t ino, frag_t frag,
+    snapid_t snap_id, std::string_view dname, std::string_view path)
+{
+  if (oversized()) {
+    return true;
+  }
+
+  // Special cases: damage to these dirfrags is considered fatal to
+  // the MDS rank that owns them.
+  if (
+      (MDS_INO_IS_MDSDIR(ino) && MDS_INO_MDSDIR_OWNER(ino) == rank)
+      ||
+      (MDS_INO_IS_STRAY(ino) && MDS_INO_STRAY_OWNER(ino) == rank)
+     ) {
+    derr << "Damage to dentries in fragment " << frag << " of ino " << ino
+         << " is fatal because it is a system directory for this rank" << dendl;
+    return true;
+  }
+
+  // Key the duplicate check on the dentry itself, not on the dirfrag:
+  // several dentries of the same dirfrag may be damaged independently.
+  auto &frag_dentries = dentries[DirFragIdent(ino, frag)];
+  const DentryIdent ident(dname, snap_id);
+  if (frag_dentries.find(ident) == frag_dentries.end()) {
+    DamageEntryRef entry = std::make_shared<DentryDamage>(
+        ino, frag, dname, snap_id);
+    entry->path = path;
+    frag_dentries.emplace(ident, entry);
+    by_id[entry->id] = std::move(entry);
+  }
+
+  return false;
+}
+
+// Record that dirfrag (ino, frag) is damaged.  Returns true when the
+// condition is fatal (this rank's stray directory, the root inode, or an
+// oversized table), false otherwise.  A dirfrag that is already recorded
+// is left untouched.
+bool DamageTable::notify_dirfrag(inodeno_t ino, frag_t frag,
+                                 std::string_view path)
+{
+  // Damage to these dirfrags is considered fatal to the MDS rank that
+  // owns them.
+  const bool is_my_stray =
+    MDS_INO_IS_STRAY(ino) && MDS_INO_STRAY_OWNER(ino) == rank;
+  if (is_my_stray || ino == MDS_INO_ROOT) {
+    derr << "Damage to fragment " << frag << " of ino " << ino
+         << " is fatal because it is a system directory for this rank" << dendl;
+    return true;
+  }
+
+  if (oversized())
+    return true;
+
+  const DirFragIdent ident(ino, frag);
+  if (dirfrags.find(ident) != dirfrags.end()) {
+    // Already known; nothing to add.
+    return false;
+  }
+
+  DamageEntryRef entry = std::make_shared<DirFragDamage>(ino, frag);
+  entry->path = path;
+  dirfrags[ident] = entry;
+  by_id[entry->id] = std::move(entry);
+  return false;
+}
+
+// Record that inode 'ino' could not be resolved.  Returns true only when
+// the table is oversized; an inode that is already recorded is left as is.
+bool DamageTable::notify_remote_damaged(inodeno_t ino, std::string_view path)
+{
+  if (oversized())
+    return true;
+
+  if (remotes.find(ino) != remotes.end()) {
+    // Already known; nothing to add.
+    return false;
+  }
+
+  auto entry = std::make_shared<BacktraceDamage>(ino);
+  entry->path = path;
+  remotes[ino] = entry;
+  by_id[entry->id] = std::move(entry);
+  return false;
+}
+
+// True once the number of recorded entries exceeds the configured
+// mds_damage_table_max_entries.
+bool DamageTable::oversized() const
+{
+  const auto max_entries =
+    static_cast<size_t>(g_conf()->mds_damage_table_max_entries);
+  return by_id.size() > max_entries;
+}
+
+// True if (dname, snap_id) is recorded as damaged within dir_frag.
+bool DamageTable::is_dentry_damaged(
+    const CDir *dir_frag,
+    std::string_view dname,
+    const snapid_t snap_id) const
+{
+  // Locate the per-dirfrag dentry map first.
+  const auto frag_it = dentries.find(
+      DirFragIdent(dir_frag->inode->ino(), dir_frag->frag));
+  if (frag_it == dentries.end()) {
+    return false;
+  }
+
+  // Then look for the specific dentry within it.
+  const std::map<DentryIdent, DamageEntryRef> &frag_dentries = frag_it->second;
+  return frag_dentries.count(DentryIdent(dname, snap_id)) > 0;
+}
+
+// True if dir_frag itself is recorded as damaged.
+bool DamageTable::is_dirfrag_damaged(
+    const CDir *dir_frag) const
+{
+  const DirFragIdent ident(dir_frag->inode->ino(), dir_frag->frag);
+  return dirfrags.find(ident) != dirfrags.end();
+}
+
+// True if 'ino' is recorded in the remotes map.
+bool DamageTable::is_remote_damaged(
+    const inodeno_t ino) const
+{
+  return remotes.find(ino) != remotes.end();
+}
+
+// Dump every recorded entry, in by_id order, as a "damage_table" array.
+void DamageTable::dump(Formatter *f) const
+{
+  f->open_array_section("damage_table");
+  for (auto it = by_id.begin(); it != by_id.end(); ++it) {
+    it->second->dump(f);
+  }
+  f->close_section();
+}
+
+/**
+ * Remove the entry with the given ID from by_id and from the type-specific
+ * index it lives in.  Unknown IDs are ignored.
+ *
+ * For dentry damage only the matching (dname, snap_id) is removed from the
+ * per-dirfrag map; other damaged dentries of the same dirfrag stay indexed.
+ * The per-dirfrag map is dropped once it becomes empty.
+ */
+void DamageTable::erase(damage_entry_id_t damage_id)
+{
+  auto by_id_entry = by_id.find(damage_id);
+  if (by_id_entry == by_id.end()) {
+    return;
+  }
+
+  DamageEntryRef entry = by_id_entry->second;
+  ceph_assert(entry->id == damage_id);  // Sanity
+
+  const auto type = entry->get_type();
+  if (type == DAMAGE_ENTRY_DIRFRAG) {
+    auto dirfrag_entry = std::static_pointer_cast<DirFragDamage>(entry);
+    dirfrags.erase(DirFragIdent(dirfrag_entry->ino, dirfrag_entry->frag));
+  } else if (type == DAMAGE_ENTRY_DENTRY) {
+    auto dentry_entry = std::static_pointer_cast<DentryDamage>(entry);
+    auto frag_it = dentries.find(
+        DirFragIdent(dentry_entry->ino, dentry_entry->frag));
+    if (frag_it != dentries.end()) {
+      // Erase just this dentry, not every dentry recorded for the dirfrag.
+      frag_it->second.erase(
+          DentryIdent(dentry_entry->dname, dentry_entry->snap_id));
+      if (frag_it->second.empty()) {
+        dentries.erase(frag_it);
+      }
+    }
+  } else if (type == DAMAGE_ENTRY_BACKTRACE) {
+    auto backtrace_entry = std::static_pointer_cast<BacktraceDamage>(entry);
+    remotes.erase(backtrace_entry->ino);
+  } else {
+    derr << "Invalid type " << type << dendl;
+    ceph_abort();
+  }
+
+  by_id.erase(by_id_entry);
+}
+
diff --git a/src/mds/DamageTable.h b/src/mds/DamageTable.h
new file mode 100644
index 00000000..a408036c
--- /dev/null
+++ b/src/mds/DamageTable.h
@@ -0,0 +1,207 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+
+#ifndef DAMAGE_TABLE_H_
+#define DAMAGE_TABLE_H_
+
+#include <string_view>
+
+#include "mdstypes.h"
+#include "include/random.h"
+
+class CDir;
+
+typedef uint64_t damage_entry_id_t;
+
+// Discriminator returned by DamageEntry::get_type(); one value per
+// concrete damage entry kind.
+typedef enum
+{
+ DAMAGE_ENTRY_DIRFRAG,
+ DAMAGE_ENTRY_DENTRY,
+ DAMAGE_ENTRY_BACKTRACE
+
+} damage_entry_type_t;
+
+// Abstract base for one recorded piece of damage.  Construction assigns a
+// random id in [0, 0xffffffff] and stamps the current time.
+class DamageEntry
+{
+public:
+  damage_entry_id_t id;
+  utime_t reported_at;
+
+  // Optional and advisory only: gives the admin an idea of which part of
+  // the tree the damage affects.
+  std::string path;
+
+  DamageEntry()
+    : id(ceph::util::generate_random_number<damage_entry_id_t>(0, 0xffffffff)),
+      reported_at(ceph_clock_now())
+  {}
+
+  virtual damage_entry_type_t get_type() const = 0;
+
+  virtual ~DamageEntry();
+
+  virtual void dump(Formatter *f) const = 0;
+};
+
+
+typedef std::shared_ptr<DamageEntry> DamageEntryRef;
+
+
+// Map key identifying a dirfrag; ordered by ino first, then by frag.
+class DirFragIdent
+{
+public:
+  inodeno_t ino;
+  frag_t frag;
+
+  DirFragIdent(inodeno_t ino_, frag_t frag_)
+    : ino(ino_), frag(frag_)
+  {}
+
+  bool operator<(const DirFragIdent &rhs) const
+  {
+    // frag only breaks ties between equal inode numbers.
+    return (ino == rhs.ino) ? (frag < rhs.frag) : (ino < rhs.ino);
+  }
+};
+
+// Map key identifying a dentry within a dirfrag; ordered by dname first,
+// then by snap_id.
+class DentryIdent
+{
+public:
+  std::string dname;
+  snapid_t snap_id;
+
+  DentryIdent(std::string_view dname_, snapid_t snap_id_)
+    : dname(dname_), snap_id(snap_id_)
+  {}
+
+  bool operator<(const DentryIdent &rhs) const
+  {
+    // snap_id only breaks ties between equal names.
+    return (dname == rhs.dname) ? (snap_id < rhs.snap_id) : (dname < rhs.dname);
+  }
+};
+
+/**
+ * Registry of in-RADOS metadata damage identified
+ * during forward scrub or during normal fetches.
+ *
+ * Used to indicate damage to the administrator, and
+ * to cache known-bad paths so that we don't hit them
+ * repeatedly.
+ *
+ * Callers notifying damage must check the return code; if
+ * a fatal condition is indicated then they should mark the MDS
+ * rank damaged.
+ *
+ * An artificial limit on the number of damage entries
+ * is imposed to avoid this structure growing indefinitely. If
+ * a notification causes the limit to be exceeded, the fatal
+ * condition will be indicated in the return code and the MDS
+ * rank should be marked damaged.
+ *
+ * Protected by MDS::mds_lock
+ */
+class DamageTable
+{
+protected:
+
+ // Map of all dirfrags reported damaged
+ std::map<DirFragIdent, DamageEntryRef> dirfrags;
+
+ // Store dentries in a map per dirfrag, so that we can
+ // readily look up all the bad dentries in a particular
+ // dirfrag
+ std::map<DirFragIdent, std::map<DentryIdent, DamageEntryRef> > dentries;
+
+ // Map of all inodes which could not be resolved remotely
+ // (i.e. have probably/possibly missing backtraces)
+ std::map<inodeno_t, DamageEntryRef> remotes;
+
+ // All damage, by ID. This is a secondary index
+ // to the dirfrag, dentry, remote maps. It exists
+ // to enable external tools to unambiguously operate
+ // on particular entries.
+ std::map<damage_entry_id_t, DamageEntryRef> by_id;
+
+ // I need to know my MDS rank so that I can check if
+ // metadata items are part of my mydir.
+ const mds_rank_t rank;
+
+ // True once by_id holds more entries than the configured maximum.
+ bool oversized() const;
+
+public:
+
+ /**
+ * Return true if no damage entries exist
+ */
+ bool empty() const
+ {
+ return by_id.empty();
+ }
+
+ /**
+ * Indicate that a dirfrag cannot be loaded.
+ *
+ * @return true if fatal
+ */
+ bool notify_dirfrag(inodeno_t ino, frag_t frag, std::string_view path);
+
+ /**
+ * Indicate that a particular dentry cannot be loaded.
+ *
+ * @return true if fatal
+ */
+ bool notify_dentry(
+ inodeno_t ino, frag_t frag,
+ snapid_t snap_id, std::string_view dname, std::string_view path);
+
+ /**
+ * Indicate that a particular Inode could not be loaded by number
+ *
+ * @return true if fatal (the table is oversized)
+ */
+ bool notify_remote_damaged(
+ inodeno_t ino, std::string_view path);
+
+ // Lookups against the three indexes above; none of them modifies
+ // the table.
+ bool is_dentry_damaged(
+ const CDir *dir_frag,
+ std::string_view dname,
+ const snapid_t snap_id) const;
+
+ bool is_dirfrag_damaged(
+ const CDir *dir_frag) const;
+
+ bool is_remote_damaged(
+ const inodeno_t ino) const;
+
+
+ // rank_ must be a real rank; MDS_RANK_NONE is rejected by assertion.
+ explicit DamageTable(const mds_rank_t rank_)
+ : rank(rank_)
+ {
+ ceph_assert(rank_ != MDS_RANK_NONE);
+ }
+
+ // Dump every entry as a "damage_table" array.
+ void dump(Formatter *f) const;
+
+ // Remove the entry with the given ID; unknown IDs are ignored.
+ void erase(damage_entry_id_t damage_id);
+};
+
+#endif // DAMAGE_TABLE_H_
+
diff --git a/src/mds/FSMap.cc b/src/mds/FSMap.cc
new file mode 100644
index 00000000..623e1748
--- /dev/null
+++ b/src/mds/FSMap.cc
@@ -0,0 +1,1029 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+
+#include "FSMap.h"
+
+#include "common/StackStringStream.h"
+
+#include <sstream>
+#ifdef WITH_SEASTAR
+#include "crimson/common/config_proxy.h"
+#else
+#include "common/config_proxy.h"
+#endif
+#include "global/global_context.h"
+#include "mon/health_check.h"
+
+using std::stringstream;
+
+// Dump this filesystem: the MDSMap as a nested "mdsmap" object, followed
+// by the filesystem's fscid under the key "id".
+void Filesystem::dump(Formatter *f) const
+{
+ f->open_object_section("mdsmap");
+ mds_map.dump(f);
+ f->close_section();
+ f->dump_int("id", fscid);
+}
+
+// Dump the whole FSMap: epoch, default fscid, compat set, feature flags,
+// the standby daemons (each with its epoch) and every filesystem.
+void FSMap::dump(Formatter *f) const
+{
+  f->dump_int("epoch", epoch);
+  // Use 'default' naming to match 'set-default' CLI
+  f->dump_int("default_fscid", legacy_client_fscid);
+
+  f->open_object_section("compat");
+  compat.dump(f);
+  f->close_section();
+
+  f->open_object_section("feature_flags");
+  f->dump_bool("enable_multiple", enable_multiple);
+  f->dump_bool("ever_enabled_multiple", ever_enabled_multiple);
+  f->close_section();
+
+  // Standby daemons, each annotated with the epoch stored for its gid.
+  f->open_array_section("standbys");
+  for (const auto& standby : standby_daemons) {
+    const auto& gid = standby.first;
+    const auto& info = standby.second;
+    f->open_object_section("info");
+    info.dump(f);
+    f->dump_int("epoch", standby_epochs.at(gid));
+    f->close_section();
+  }
+  f->close_section();
+
+  f->open_array_section("filesystems");
+  for (auto it = filesystems.cbegin(); it != filesystems.cend(); ++it) {
+    f->open_object_section("filesystem");
+    it->second->dump(f);
+    f->close_section();
+  }
+  f->close_section();
+}
+
+// Append one FSMap to ls that holds a filesystem per MDSMap test instance,
+// with fscids counting up from 20.
+void FSMap::generate_test_instances(list<FSMap*>& ls)
+{
+  FSMap *fsmap = new FSMap();
+
+  std::list<MDSMap*> mds_maps;
+  MDSMap::generate_test_instances(mds_maps);
+
+  int next_fscid = 20;
+  // Consume the generated MDSMaps one by one; each is copied into a new
+  // Filesystem and then freed.
+  while (!mds_maps.empty()) {
+    MDSMap *src = mds_maps.front();
+    mds_maps.pop_front();
+
+    auto fs = Filesystem::create();
+    fs->fscid = next_fscid++;
+    fs->mds_map = *src;
+    delete src;
+    fsmap->filesystems[fs->fscid] = fs;
+  }
+
+  ls.push_back(fsmap);
+}
+
+// Human-readable multi-line rendering: header fields, then every
+// filesystem, then the standby daemons (if any).
+void FSMap::print(ostream& out) const
+{
+  out << "e" << epoch << std::endl;
+  out << "enable_multiple, ever_enabled_multiple: " << enable_multiple << ","
+      << ever_enabled_multiple << std::endl;
+  out << "compat: " << compat << std::endl;
+  out << "legacy client fscid: " << legacy_client_fscid << std::endl;
+  out << " " << std::endl;
+
+  if (filesystems.empty()) {
+    out << "No filesystems configured" << std::endl;
+  }
+
+  for (auto it = filesystems.begin(); it != filesystems.end(); ++it) {
+    it->second->print(out);
+    out << " " << std::endl << " " << std::endl;  // Space out a bit
+  }
+
+  // Header and entries are only produced when there is at least one standby.
+  if (!standby_daemons.empty()) {
+    out << "Standby daemons:" << std::endl << " " << std::endl;
+    for (const auto& standby : standby_daemons) {
+      out << standby.second << std::endl;
+    }
+  }
+}
+
+
+
+/**
+ * Emit a short status summary, either structured (when f is non-null) or
+ * as a single line of text written to *out (when f is null).
+ *
+ * Text form: with up to three filesystems each is listed as "name:in" or,
+ * when degraded, "name:up/in".  With more filesystems only the count is
+ * printed, followed by up to three degraded filesystems (comma separated)
+ * or just the number of degraded filesystems.  Then come the ranks (when
+ * fewer than five), per-state daemon counts and failed/damaged totals.
+ *
+ * @param f   formatter for structured output, or nullptr
+ * @param out stream for plain output; dereferenced only when f is null
+ */
+void FSMap::print_summary(Formatter *f, ostream *out) const
+{
+  if (f) {
+    f->dump_unsigned("epoch", get_epoch());
+    for (const auto &p : filesystems) {
+      auto& fs = p.second;
+      f->dump_unsigned("id", fs->fscid);
+      f->dump_unsigned("up", fs->mds_map.up.size());
+      f->dump_unsigned("in", fs->mds_map.in.size());
+      f->dump_unsigned("max", fs->mds_map.max_mds);
+    }
+  } else {
+    auto count = filesystems.size();
+    if (count <= 3) {
+      bool first = true;
+      for (const auto& p : filesystems) {
+        const auto& fs = p.second;
+        if (!first) {
+          *out << " ";
+        }
+        if (fs->mds_map.is_degraded()) {
+          *out << fs->mds_map.fs_name << ":" << fs->mds_map.up.size() << "/" << fs->mds_map.in.size();
+        } else {
+          *out << fs->mds_map.fs_name << ":" << fs->mds_map.in.size();
+        }
+        first = false;
+      }
+    } else {
+      *out << count << " fs";
+      unsigned degraded = 0;
+      CachedStackStringStream css;
+      *css << " (degraded: ";
+      for (const auto& p : filesystems) {
+        const auto& fs = p.second;
+        if (fs->mds_map.is_degraded()) {
+          degraded++;
+          if (degraded <= 3) {
+            // Separate consecutive entries; without this the names run
+            // together (e.g. "a:1/2b:0/1").
+            if (degraded > 1) {
+              *css << ", ";
+            }
+            *css << fs->mds_map.fs_name << ":" << fs->mds_map.up.size() << "/" << fs->mds_map.in.size();
+          }
+        }
+      }
+      if (degraded > 0) {
+        if (degraded <= 3) {
+          *css << ")";
+          *out << css->strv();
+        } else {
+          *out << " (degraded: " << degraded << " fs)";
+        }
+      }
+    }
+  }
+
+  if (f) {
+    f->open_array_section("by_rank");
+  }
+
+  std::map<MDSMap::DaemonState,unsigned> by_state;
+  std::map<mds_role_t, std::pair<MDSMap::DaemonState, std::string>> by_rank;
+  by_state[MDSMap::DaemonState::STATE_STANDBY] = standby_daemons.size();
+  for (const auto& [gid, fscid] : mds_roles) {
+    // Daemons without a filesystem are the standbys counted above.
+    if (fscid == FS_CLUSTER_ID_NONE)
+      continue;
+
+    const auto& info = filesystems.at(fscid)->mds_map.get_info_gid(gid);
+    auto s = std::string(ceph_mds_state_name(info.state));
+    if (info.laggy()) {
+      s += "(laggy or crashed)";
+    }
+
+    if (f) {
+      f->open_object_section("mds");
+      f->dump_unsigned("filesystem_id", fscid);
+      f->dump_unsigned("rank", info.rank);
+      f->dump_string("name", info.name);
+      f->dump_string("status", s);
+      f->dump_unsigned("gid", gid);
+      f->close_section();
+    } else if (info.state != MDSMap::DaemonState::STATE_STANDBY_REPLAY) {
+      by_rank[mds_role_t(fscid, info.rank)] = std::make_pair(info.state, info.name + "=" + s);
+    }
+    by_state[info.state]++;
+  }
+
+  if (f) {
+    f->close_section();
+  } else {
+    if (0 < by_rank.size() && by_rank.size() < 5) {
+      if (filesystems.size() > 1) {
+        // Disambiguate filesystems
+        std::map<std::string, std::string> pretty;
+        for (const auto& [role,status] : by_rank) {
+          const auto &fs_name = filesystems.at(role.fscid)->mds_map.fs_name;
+          CachedStackStringStream css;
+          *css << fs_name << ":" << role.rank;
+          pretty.emplace(std::piecewise_construct, std::forward_as_tuple(css->strv()), std::forward_as_tuple(status.second));
+          --by_state[status.first]; /* already printed! */
+        }
+        *out << " " << pretty;
+      } else {
+        // Omit FSCID in output when only one filesystem exists
+        std::map<mds_rank_t, std::string> shortened;
+        for (const auto& [role,status] : by_rank) {
+          shortened[role.rank] = status.second;
+          --by_state[status.first]; /* already printed! */
+        }
+        *out << " " << shortened;
+      }
+    }
+    for (const auto& [state, count] : by_state) {
+      if (count > 0) {
+        auto s = std::string_view(ceph_mds_state_name(state));
+        *out << " " << count << " " << s;
+      }
+    }
+  }
+
+  if (f) {
+    const auto state = MDSMap::DaemonState::STATE_STANDBY;
+    auto&& name = ceph_mds_state_name(state);
+    auto count = standby_daemons.size();
+    f->dump_unsigned(name, count);
+  }
+
+  // Totals across all filesystems.
+  size_t failed = 0;
+  size_t damaged = 0;
+  for (const auto& p : filesystems) {
+    auto& fs = p.second;
+    failed += fs->mds_map.failed.size();
+    damaged += fs->mds_map.damaged.size();
+  }
+
+  if (failed > 0) {
+    if (f) {
+      f->dump_unsigned("failed", failed);
+    } else {
+      *out << ", " << failed << " failed";
+    }
+  }
+
+  if (damaged > 0) {
+    if (f) {
+      f->dump_unsigned("damaged", damaged);
+    } else {
+      *out << ", " << damaged << " damaged";
+    }
+  }
+  //if (stopped.size())
+  //out << ", " << stopped.size() << " stopped";
+}
+
+
+// Create a new, enabled filesystem named 'name' on the given pools,
+// register it in 'filesystems' and return it.  The fscid comes from
+// next_filesystem_id when 'features' includes SERVER_JEWEL, otherwise the
+// anonymous fscid is used.
+Filesystem::ref FSMap::create_filesystem(std::string_view name,
+    int64_t metadata_pool, int64_t data_pool, uint64_t features)
+{
+  auto fs = Filesystem::create();
+  auto& mds_map = fs->mds_map;
+
+  mds_map.epoch = epoch;
+  mds_map.fs_name = name;
+  mds_map.data_pools.push_back(data_pool);
+  mds_map.metadata_pool = metadata_pool;
+  mds_map.cas_pool = -1;
+  mds_map.compat = compat;
+  mds_map.created = ceph_clock_now();
+  mds_map.modified = ceph_clock_now();
+  mds_map.enabled = true;
+
+  const bool jewel_or_later = (features & CEPH_FEATURE_SERVER_JEWEL);
+  if (!jewel_or_later) {
+    // Use anon fscid because this will get thrown away when encoding
+    // as legacy MDSMap for legacy mons.
+    ceph_assert(filesystems.empty());
+    fs->fscid = FS_CLUSTER_ID_ANONYMOUS;
+  } else {
+    fs->fscid = next_filesystem_id++;
+    // ANONYMOUS is only for upgrades from legacy mdsmaps, we should
+    // have initialized next_filesystem_id such that it's never used here.
+    ceph_assert(fs->fscid != FS_CLUSTER_ID_ANONYMOUS);
+  }
+  filesystems[fs->fscid] = fs;
+
+  // The first filesystem ever created becomes the one legacy clients use.
+  if (filesystems.size() == 1) {
+    legacy_client_fscid = fs->fscid;
+  }
+
+  return fs;
+}
+
+// Replace the filesystem 'fscid' with a fresh one that keeps the identity
+// and pool configuration of the old one but starts with only rank 0,
+// marked failed.
+void FSMap::reset_filesystem(fs_cluster_id_t fscid)
+{
+  auto fs = get_filesystem(fscid);
+  auto new_fs = Filesystem::create();
+
+  const auto& old_map = fs->mds_map;
+  auto& new_map = new_fs->mds_map;
+
+  // Populate rank 0 as existing (so don't go into CREATING)
+  // but failed (so that next available MDS is assigned the rank)
+  new_map.in.insert(mds_rank_t(0));
+  new_map.failed.insert(mds_rank_t(0));
+
+  // Carry forward what makes sense
+  new_fs->fscid = fs->fscid;
+  new_map.inline_data_enabled = old_map.inline_data_enabled;
+  new_map.data_pools = old_map.data_pools;
+  new_map.metadata_pool = old_map.metadata_pool;
+  new_map.cas_pool = old_map.cas_pool;
+  new_map.fs_name = old_map.fs_name;
+  new_map.compat = compat;
+  new_map.created = ceph_clock_now();
+  new_map.modified = ceph_clock_now();
+  new_map.standby_count_wanted = old_map.standby_count_wanted;
+  new_map.enabled = true;
+
+  // Remember mds ranks that have ever started. (They should load old inotable
+  // instead of creating new one if they start again.)
+  new_map.stopped.insert(old_map.in.begin(), old_map.in.end());
+  new_map.stopped.insert(old_map.stopped.begin(), old_map.stopped.end());
+  new_map.stopped.erase(mds_rank_t(0));
+
+  // Install the replacement under the same fscid.
+  filesystems[new_fs->fscid] = new_fs;
+}
+
+void FSMap::get_health(list<pair<health_status_t,string> >& summary,
+			list<pair<health_status_t,string> > *detail) const
+{
+  // Largest number of additional standbys wanted by any filesystem
+  mds_rank_t standby_count_wanted = 0;
+  for (const auto &entry : filesystems) {
+    // TODO: move get_health up into here so that we can qualify
+    // all the messages with what filesystem they're talking about
+    entry.second->mds_map.get_health(summary, detail);
+
+    const mds_rank_t wanted_here =
+      entry.second->mds_map.get_standby_count_wanted(
+	(mds_rank_t)standby_daemons.size());
+    standby_count_wanted = std::max(standby_count_wanted, wanted_here);
+  }
+
+  if (!standby_count_wanted) {
+    return;
+  }
+
+  std::ostringstream msg;
+  msg << "insufficient standby daemons available: have " << standby_daemons.size() << "; want " << standby_count_wanted << " more";
+  summary.push_back(make_pair(HEALTH_WARN, msg.str()));
+}
+
+bool FSMap::check_health(void)
+{
+  // Run the per-filesystem check on every MDSMap (no short-circuit) and
+  // report whether any of them returned true.
+  bool any_changed = false;
+  for (auto &entry : filesystems) {
+    if (entry.second->mds_map.check_health((mds_rank_t)standby_daemons.size())) {
+      any_changed = true;
+    }
+  }
+  return any_changed;
+}
+
+void FSMap::get_health_checks(health_check_map_t *checks) const
+{
+  // Largest number of additional standbys wanted by any filesystem
+  mds_rank_t standby_count_wanted = 0;
+
+  for (const auto &entry : filesystems) {
+    const auto &fs = entry.second;
+
+    health_check_map_t fschecks;
+    fs->mds_map.get_health_checks(&fschecks);
+
+    // A failed rank may be transient (a standby is ready to take over).
+    // Only "stuck" failed ranks are reported: failed ranks for which no
+    // replacement is available.
+    std::set<mds_rank_t> stuck_failed;
+    for (const auto &rank : fs->mds_map.failed) {
+      const mds_gid_t replacement = find_replacement_for({fs->fscid, rank}, {});
+      if (replacement == MDS_GID_NONE) {
+	stuck_failed.insert(rank);
+      }
+    }
+
+    // FS_WITH_FAILED_MDS
+    if (!stuck_failed.empty()) {
+      health_check_t& fscheck = checks->get_or_add(
+	"FS_WITH_FAILED_MDS", HEALTH_WARN,
+	"%num% filesystem%plurals% %hasorhave% a failed mds daemon");
+      const char *plural = (stuck_failed.size() > 1) ? "s" : "";
+      ostringstream ss;
+      ss << "fs " << fs->mds_map.fs_name << " has " << stuck_failed.size()
+	 << " failed mds" << plural;
+      fscheck.detail.push_back(ss.str());
+    }
+
+    checks->merge(fschecks);
+
+    const mds_rank_t wanted_here = fs->mds_map.get_standby_count_wanted(
+      (mds_rank_t)standby_daemons.size());
+    standby_count_wanted = std::max(standby_count_wanted, wanted_here);
+  }
+
+  // MDS_INSUFFICIENT_STANDBY
+  if (!standby_count_wanted) {
+    return;
+  }
+  std::ostringstream summary_ss, detail_ss;
+  summary_ss << "insufficient standby MDS daemons available";
+  auto& check = checks->get_or_add("MDS_INSUFFICIENT_STANDBY", HEALTH_WARN, summary_ss.str());
+  detail_ss << "have " << standby_daemons.size() << "; want " << standby_count_wanted
+	    << " more";
+  check.detail.push_back(detail_ss.str());
+}
+
+void FSMap::encode(bufferlist& bl, uint64_t features) const
+{
+  const bool have_jewel = (features & CEPH_FEATURE_SERVER_JEWEL) != 0;
+
+  if (!have_jewel) {
+    // Peer without the Jewel feature: emit a pre-jewel-style MDSMap.
+    if (filesystems.empty()) {
+      MDSMap disabled_map;
+      disabled_map.epoch = epoch;
+      disabled_map.encode(bl, features);
+      return;
+    }
+
+    // MDSMonitor should never have created multiple filesystems
+    // until the quorum features indicated Jewel
+    ceph_assert(filesystems.size() == 1);
+    auto fs = filesystems.begin()->second;
+
+    // Take the MDSMap for the enabled filesystem, and populate its
+    // mds_info with the standbys to get a pre-jewel-style mon MDSMap.
+    MDSMap full_mdsmap = fs->mds_map;
+    full_mdsmap.epoch = epoch;
+    for (const auto &standby : standby_daemons) {
+      full_mdsmap.mds_info[standby.first] = standby.second;
+    }
+
+    // Old MDSMaps don't set rank on standby replay daemons
+    for (auto &entry : full_mdsmap.mds_info) {
+      if (entry.second.state == MDSMap::STATE_STANDBY_REPLAY) {
+	entry.second.rank = MDS_RANK_NONE;
+      }
+    }
+
+    full_mdsmap.encode(bl, features);
+    return;
+  }
+
+  // Native FSMap encoding
+  ENCODE_START(7, 6, bl);
+  encode(epoch, bl);
+  encode(next_filesystem_id, bl);
+  encode(legacy_client_fscid, bl);
+  encode(compat, bl);
+  encode(enable_multiple, bl);
+  {
+    // Filesystems go on the wire as a vector, in map (fscid) order
+    std::vector<Filesystem::ref> fs_vec;
+    fs_vec.reserve(filesystems.size());
+    for (auto& entry : filesystems) {
+      fs_vec.emplace_back(entry.second);
+    }
+    encode(fs_vec, bl, features);
+  }
+  encode(mds_roles, bl);
+  encode(standby_daemons, bl, features);
+  encode(standby_epochs, bl);
+  encode(ever_enabled_multiple, bl);
+  ENCODE_FINISH(bl);
+}
+
+void FSMap::decode(bufferlist::const_iterator& p) // Decode either a legacy MDSMap (struct_v < 6) or a native FSMap from p
+{
+ // The highest MDSMap encoding version before we changed the
+ // MDSMonitor to store an FSMap instead of an MDSMap was
+ // 5, so anything older than 6 is decoded as an MDSMap,
+ // and anything newer is decoded as an FSMap.
+ DECODE_START_LEGACY_COMPAT_LEN_16(7, 4, 4, p); // presumably provides struct_v, which is tested below
+ if (struct_v < 6) {
+ // Because the mon used to store an MDSMap where we now
+ // store an FSMap, FSMap knows how to decode the legacy
+ // MDSMap format (it never needs to encode it though).
+ MDSMap legacy_mds_map;
+
+ // Decoding an MDSMap (upgrade)
+ decode(epoch, p);
+ decode(legacy_mds_map.flags, p);
+ decode(legacy_mds_map.last_failure, p);
+ decode(legacy_mds_map.root, p);
+ decode(legacy_mds_map.session_timeout, p);
+ decode(legacy_mds_map.session_autoclose, p);
+ decode(legacy_mds_map.max_file_size, p);
+ decode(legacy_mds_map.max_mds, p);
+ decode(legacy_mds_map.mds_info, p);
+ if (struct_v < 3) { // oldest layout: pools are read as 32-bit values and widened
+ __u32 n;
+ decode(n, p);
+ while (n--) {
+ __u32 m;
+ decode(m, p);
+ legacy_mds_map.data_pools.push_back(m);
+ }
+ __s32 s;
+ decode(s, p);
+ legacy_mds_map.cas_pool = s;
+ } else {
+ decode(legacy_mds_map.data_pools, p);
+ decode(legacy_mds_map.cas_pool, p);
+ }
+
+ // kclient ignores everything from here
+ __u16 ev = 1; // version of the extended section; stays 1 when struct_v < 2
+ if (struct_v >= 2)
+ decode(ev, p);
+ if (ev >= 3)
+ decode(legacy_mds_map.compat, p);
+ else
+ legacy_mds_map.compat = MDSMap::get_compat_set_base();
+ if (ev < 5) { // metadata pool is read as a 32-bit value before ev 5
+ __u32 n;
+ decode(n, p);
+ legacy_mds_map.metadata_pool = n;
+ } else {
+ decode(legacy_mds_map.metadata_pool, p);
+ }
+ decode(legacy_mds_map.created, p);
+ decode(legacy_mds_map.modified, p);
+ decode(legacy_mds_map.tableserver, p);
+ decode(legacy_mds_map.in, p);
+ std::map<mds_rank_t,int32_t> inc; // Legacy field, parse and drop
+ decode(inc, p);
+ decode(legacy_mds_map.up, p);
+ decode(legacy_mds_map.failed, p);
+ decode(legacy_mds_map.stopped, p);
+ if (ev >= 4)
+ decode(legacy_mds_map.last_failure_osd_epoch, p);
+ if (ev >= 6) {
+ if (ev < 10) {
+ // previously this was a bool about snaps, not a flag map
+ bool flag;
+ decode(flag, p);
+ legacy_mds_map.ever_allowed_features = flag ?
+ CEPH_MDSMAP_ALLOW_SNAPS : 0;
+ decode(flag, p);
+ legacy_mds_map.explicitly_allowed_features = flag ?
+ CEPH_MDSMAP_ALLOW_SNAPS : 0;
+ } else {
+ decode(legacy_mds_map.ever_allowed_features, p);
+ decode(legacy_mds_map.explicitly_allowed_features, p);
+ }
+ } else {
+ legacy_mds_map.ever_allowed_features = 0;
+ legacy_mds_map.explicitly_allowed_features = 0;
+ }
+ if (ev >= 7)
+ decode(legacy_mds_map.inline_data_enabled, p);
+
+ if (ev >= 8) {
+ ceph_assert(struct_v >= 5);
+ decode(legacy_mds_map.enabled, p);
+ decode(legacy_mds_map.fs_name, p);
+ } else {
+ legacy_mds_map.fs_name = "default";
+ if (epoch > 1) {
+ // If an MDS has ever been started, epoch will be greater than 1,
+ // assume filesystem is enabled.
+ legacy_mds_map.enabled = true;
+ } else {
+ // Upgrading from a cluster that never used an MDS, switch off
+ // filesystem until it's explicitly enabled.
+ legacy_mds_map.enabled = false;
+ }
+ }
+
+ if (ev >= 9) {
+ decode(legacy_mds_map.damaged, p);
+ }
+
+ // We're upgrading, populate filesystems from the legacy fields
+ filesystems.clear();
+ standby_daemons.clear();
+ standby_epochs.clear();
+ mds_roles.clear();
+ compat = legacy_mds_map.compat;
+ enable_multiple = false;
+
+ // Synthesise a Filesystem from legacy_mds_map, if enabled
+ if (legacy_mds_map.enabled) {
+ // Construct a Filesystem from the legacy MDSMap
+ auto migrate_fs = Filesystem::create();
+ migrate_fs->fscid = FS_CLUSTER_ID_ANONYMOUS; // the migrated legacy filesystem gets the anonymous id
+ migrate_fs->mds_map = legacy_mds_map;
+ migrate_fs->mds_map.epoch = epoch;
+ filesystems[migrate_fs->fscid] = migrate_fs;
+
+ // List of GIDs that had invalid states
+ std::set<mds_gid_t> drop_gids;
+
+ // Construct mds_roles, standby_daemons, and remove
+ // standbys from the MDSMap in the Filesystem.
+ for (const auto& [gid, info] : migrate_fs->mds_map.mds_info) { // read-only pass; erasures are deferred to the loops below
+ if (info.state == MDSMap::STATE_STANDBY_REPLAY) {
+ /* drop any legacy standby-replay daemons */
+ drop_gids.insert(gid);
+ } else if (info.rank == MDS_RANK_NONE) {
+ if (info.state != MDSMap::STATE_STANDBY) {
+ // Old MDSMaps can have down:dne here, which
+ // is invalid in an FSMap (#17837)
+ drop_gids.insert(gid);
+ } else {
+ insert(info); // into standby_daemons
+ }
+ } else {
+ mds_roles[gid] = migrate_fs->fscid;
+ }
+ }
+ for (const auto &p : standby_daemons) { // NOTE: this loop variable shadows the decode iterator p
+ // Erase from this Filesystem's MDSMap, because it has
+ // been copied into FSMap::standby_daemons above
+ migrate_fs->mds_map.mds_info.erase(p.first);
+ }
+ for (const auto &gid : drop_gids) {
+ // Throw away all info for this MDS because it was identified
+ // as having invalid state above.
+ migrate_fs->mds_map.mds_info.erase(gid);
+ }
+
+ legacy_client_fscid = migrate_fs->fscid;
+ } else {
+ legacy_client_fscid = FS_CLUSTER_ID_NONE;
+ }
+ } else {
+ decode(epoch, p); // native FSMap layout: same field order as FSMap::encode
+ decode(next_filesystem_id, p);
+ decode(legacy_client_fscid, p);
+ decode(compat, p);
+ decode(enable_multiple, p);
+ {
+ std::vector<Filesystem::ref> v;
+ decode(v, p);
+ filesystems.clear();
+ for (auto& ref : v) {
+ auto em = filesystems.emplace(std::piecewise_construct, std::forward_as_tuple(ref->fscid), std::forward_as_tuple(std::move(ref)));
+ ceph_assert(em.second); // a duplicate fscid in the decoded vector is fatal
+ }
+ }
+ decode(mds_roles, p);
+ decode(standby_daemons, p);
+ decode(standby_epochs, p);
+ if (struct_v >= 7) { // field only present from struct_v 7 on
+ decode(ever_enabled_multiple, p);
+ }
+ }
+
+ DECODE_FINISH(p);
+}
+
+void FSMap::sanitize(const std::function<bool(int64_t pool)>& pool_exists)
+{
+  // Forward the pool-existence predicate to every filesystem's MDSMap.
+  for (auto it = filesystems.begin(); it != filesystems.end(); ++it) {
+    it->second->mds_map.sanitize(pool_exists);
+  }
+}
+
+void Filesystem::encode(bufferlist& bl, uint64_t features) const
+{
+  // The MDSMap is encoded into its own bufferlist first, and that
+  // bufferlist is then encoded after the fscid.
+  ENCODE_START(1, 1, bl);
+  encode(fscid, bl);
+  bufferlist nested_map_bl;
+  mds_map.encode(nested_map_bl, features);
+  encode(nested_map_bl, bl);
+  ENCODE_FINISH(bl);
+}
+
+void Filesystem::decode(bufferlist::const_iterator& p)
+{
+  // Mirror of Filesystem::encode: fscid, then a nested bufferlist that
+  // holds the encoded MDSMap.
+  DECODE_START(1, p);
+  decode(fscid, p);
+  bufferlist nested_map_bl;
+  decode(nested_map_bl, p);
+  auto nested_it = nested_map_bl.cbegin();
+  mds_map.decode(nested_it);
+  DECODE_FINISH(p);
+}
+
+int FSMap::parse_filesystem(
+      std::string_view ns_str,
+      Filesystem::const_ref* result
+      ) const
+{
+  std::string parse_err;
+  std::string wanted(ns_str);
+  fs_cluster_id_t fscid = strict_strtol(wanted.c_str(), 10, &parse_err);
+
+  // A string that parses as a number AND names an existing fscid is
+  // resolved by id.
+  if (parse_err.empty() && filesystems.count(fscid) != 0) {
+    *result = get_filesystem(fscid);
+    return 0;
+  }
+
+  // Otherwise fall back to matching the string against filesystem names.
+  for (auto &entry : filesystems) {
+    if (entry.second->mds_map.fs_name == wanted) {
+      *result = std::const_pointer_cast<const Filesystem>(entry.second);
+      return 0;
+    }
+  }
+  return -ENOENT;
+}
+
+void Filesystem::print(std::ostream &out) const
+{
+  // One header line with name and id, followed by the MDSMap's own dump.
+  out << "Filesystem '";
+  out << mds_map.fs_name;
+  out << "' (" << fscid << ")";
+  out << std::endl;
+  mds_map.print(out);
+}
+
+mds_gid_t FSMap::get_available_standby() const
+{
+  // Return the first standby (in gid order) that is neither laggy nor
+  // frozen, or MDS_GID_NONE when there is none.
+  for (const auto& entry : standby_daemons) {
+    const auto& standby = entry.second;
+    ceph_assert(standby.rank == MDS_RANK_NONE);
+    ceph_assert(standby.state == MDSMap::STATE_STANDBY);
+
+    const bool usable = !(standby.laggy() || standby.is_frozen());
+    if (usable) {
+      return entry.first;
+    }
+  }
+  return MDS_GID_NONE;
+}
+
+mds_gid_t FSMap::find_replacement_for(mds_role_t role, std::string_view name) const
+{
+  auto&& fs = get_filesystem(role.fscid);
+
+  // A STANDBY_REPLAY daemon following this rank takes precedence over
+  // the general standby pool.
+  for (const auto& [gid, info] : fs->mds_map.mds_info) {
+    const bool follows_rank =
+      info.rank == role.rank && info.state == MDSMap::STATE_STANDBY_REPLAY;
+    if (!follows_rank) {
+      continue;
+    }
+    /* a frozen standby-replay means: do nothing! */
+    return info.is_frozen() ? MDS_GID_NONE : gid;
+  }
+
+  return get_available_standby();
+}
+
+void FSMap::sanity() const
+{
+  // The filesystem for legacy clients, when set, must exist.
+  if (legacy_client_fscid != FS_CLUSTER_ID_NONE) {
+    ceph_assert(filesystems.count(legacy_client_fscid) == 1);
+  }
+
+  for (const auto &[fscid, fs] : filesystems) {
+    MDSMap &map = fs->mds_map;
+    ceph_assert(map.compat.compare(compat) == 0);
+    ceph_assert(fs->fscid == fscid);
+
+    // Every daemon stored in a filesystem holds a rank, is tracked in
+    // mds_roles under this fscid, and is not also listed as a standby.
+    for (const auto &[gid, info] : map.mds_info) {
+      ceph_assert(info.rank != MDS_RANK_NONE);
+      ceph_assert(mds_roles.count(gid) == 1);
+      ceph_assert(standby_daemons.count(gid) == 0);
+      ceph_assert(standby_epochs.count(gid) == 0);
+      ceph_assert(mds_roles.at(gid) == fscid);
+      if (info.state != MDSMap::STATE_STANDBY_REPLAY) {
+	ceph_assert(map.up.at(info.rank) == gid);
+	ceph_assert(map.failed.count(info.rank) == 0);
+	ceph_assert(map.damaged.count(info.rank) == 0);
+      }
+    }
+
+    // Every up rank is "in" and points at a known daemon.
+    for (const auto &[rank, gid] : map.up) {
+      ceph_assert(map.in.count(rank) == 1);
+      ceph_assert(map.mds_info.count(gid) == 1);
+    }
+  }
+
+  // Standbys: correct state, no rank, and matching bookkeeping entries.
+  for (const auto &[gid, info] : standby_daemons) {
+    ceph_assert(info.state == MDSMap::STATE_STANDBY);
+    ceph_assert(info.rank == MDS_RANK_NONE);
+    ceph_assert(info.global_id == gid);
+    ceph_assert(standby_epochs.count(gid) == 1);
+    ceph_assert(mds_roles.count(gid) == 1);
+    ceph_assert(mds_roles.at(gid) == FS_CLUSTER_ID_NONE);
+  }
+
+  for (const auto &entry : standby_epochs) {
+    ceph_assert(standby_daemons.count(entry.first) == 1);
+  }
+
+  // Reverse direction: each role entry resolves to a stored daemon.
+  for (const auto &[gid, fscid] : mds_roles) {
+    if (fscid == FS_CLUSTER_ID_NONE) {
+      ceph_assert(standby_daemons.count(gid) == 1);
+    } else {
+      ceph_assert(filesystems.count(fscid) == 1);
+      ceph_assert(filesystems.at(fscid)->mds_map.mds_info.count(gid) == 1);
+    }
+  }
+}
+
+void FSMap::promote(
+    mds_gid_t standby_gid,
+    Filesystem& filesystem,
+    mds_rank_t assigned_rank)
+{
+  ceph_assert(gid_exists(standby_gid));
+  // A gid that already has a filesystem role is a standby-replay daemon;
+  // otherwise it comes from the standby pool.
+  const bool from_standby_pool =
+    mds_roles.at(standby_gid) == FS_CLUSTER_ID_NONE;
+
+  MDSMap &mds_map = filesystem.mds_map;
+
+  if (from_standby_pool) {
+    ceph_assert(standby_daemons.count(standby_gid));
+    ceph_assert(standby_daemons.at(standby_gid).state == MDSMap::STATE_STANDBY);
+    // Copy the daemon state into the Filesystem
+    mds_map.mds_info[standby_gid] = standby_daemons.at(standby_gid);
+  } else {
+    // Already stored in the Filesystem: must be following this very rank
+    ceph_assert(mds_map.mds_info.count(standby_gid));
+    ceph_assert(mds_map.mds_info.at(standby_gid).state == MDSMap::STATE_STANDBY_REPLAY);
+    ceph_assert(mds_map.mds_info.at(standby_gid).rank == assigned_rank);
+  }
+  MDSMap::mds_info_t &daemon = mds_map.mds_info[standby_gid];
+
+  if (mds_map.stopped.erase(assigned_rank)) {
+    // The cluster is being expanded with a stopped rank
+    daemon.state = MDSMap::STATE_STARTING;
+  } else if (!mds_map.is_in(assigned_rank)) {
+    // The cluster is being expanded with a new rank
+    daemon.state = MDSMap::STATE_CREATING;
+  } else {
+    // An existing rank is being assigned to a replacement
+    daemon.state = MDSMap::STATE_REPLAY;
+    mds_map.failed.erase(assigned_rank);
+  }
+  daemon.rank = assigned_rank;
+  daemon.inc = epoch;
+  mds_roles[standby_gid] = filesystem.fscid;
+
+  // Record the rank as in and up in the Filesystem
+  mds_map.in.insert(assigned_rank);
+  mds_map.up[assigned_rank] = standby_gid;
+
+  // A daemon taken from the pool is no longer a standby
+  if (from_standby_pool) {
+    standby_daemons.erase(standby_gid);
+    standby_epochs.erase(standby_gid);
+  }
+
+  // Indicate that Filesystem has been modified
+  mds_map.epoch = epoch;
+}
+
+void FSMap::assign_standby_replay(
+    const mds_gid_t standby_gid,
+    const fs_cluster_id_t leader_ns,
+    const mds_rank_t leader_rank)
+{
+  // Check existence first: mds_roles.at() throws std::out_of_range for an
+  // unknown gid, which would pre-empt the more informative assertion.
+  ceph_assert(gid_exists(standby_gid));
+  ceph_assert(mds_roles.at(standby_gid) == FS_CLUSTER_ID_NONE);
+  ceph_assert(!gid_has_rank(standby_gid));
+  ceph_assert(standby_daemons.count(standby_gid));
+
+  // Insert to the filesystem (single lookup of the new entry), copying the
+  // standby's info and turning it into a standby-replay for leader_rank.
+  auto fs = filesystems.at(leader_ns);
+  MDSMap::mds_info_t &info = fs->mds_map.mds_info[standby_gid];
+  info = standby_daemons.at(standby_gid);
+  info.rank = leader_rank;
+  info.state = MDSMap::STATE_STANDBY_REPLAY;
+  mds_roles[standby_gid] = leader_ns;
+
+  // Remove from the list of standbys
+  standby_daemons.erase(standby_gid);
+  standby_epochs.erase(standby_gid);
+
+  // Indicate that Filesystem has been modified
+  fs->mds_map.epoch = epoch;
+}
+
+void FSMap::erase(mds_gid_t who, epoch_t blacklist_epoch)
+{
+  const bool is_standby = mds_roles.at(who) == FS_CLUSTER_ID_NONE;
+  if (is_standby) {
+    // Plain standbys only live in the standby bookkeeping maps.
+    standby_daemons.erase(who);
+    standby_epochs.erase(who);
+  } else {
+    auto &fs = filesystems.at(mds_roles.at(who));
+    MDSMap &map = fs->mds_map;
+    const auto &info = map.mds_info.at(who);
+    // Standby-replay daemons do not hold the rank, so the rank state is
+    // only touched for the others.
+    if (info.state != MDSMap::STATE_STANDBY_REPLAY) {
+      if (info.state != MDSMap::STATE_CREATING) {
+	// Put this rank into the failed list so that the next available
+	// STANDBY will pick it up.
+	map.failed.insert(info.rank);
+      } else {
+	// If this gid didn't make it past CREATING, then forget
+	// the rank ever existed so that next time it's handed out
+	// to a gid it'll go back into CREATING.
+	map.in.erase(info.rank);
+      }
+      ceph_assert(map.up.at(info.rank) == info.global_id);
+      map.up.erase(info.rank);
+    }
+    map.mds_info.erase(who);
+    map.last_failure_osd_epoch = blacklist_epoch;
+    map.epoch = epoch;
+  }
+
+  mds_roles.erase(who);
+}
+
+void FSMap::damaged(mds_gid_t who, epoch_t blacklist_epoch)
+{
+  ceph_assert(mds_roles.at(who) != FS_CLUSTER_ID_NONE);
+  auto fs = filesystems.at(mds_roles.at(who));
+  // Use at() rather than operator[]: operator[] would silently insert a
+  // default-constructed mds_info_t for a gid missing from this filesystem,
+  // and the rank read from it would be meaningless.
+  const mds_rank_t rank = fs->mds_map.mds_info.at(who).rank;
+
+  // erase() puts the rank into the failed set for most daemon states;
+  // move it to the damaged set instead.
+  erase(who, blacklist_epoch);
+  fs->mds_map.failed.erase(rank);
+  fs->mds_map.damaged.insert(rank);
+
+  // erase() has already stamped the filesystem with the current epoch
+  ceph_assert(fs->mds_map.epoch == epoch);
+}
+
+/**
+ * Update to indicate that the rank `rank` is to be removed
+ * from the damaged list of the filesystem `fscid`
+ */
+bool FSMap::undamaged(const fs_cluster_id_t fscid, const mds_rank_t rank)
+{
+  auto fs = filesystems.at(fscid);
+
+  // Nothing to do (and no epoch bump) when the rank was not damaged.
+  if (!fs->mds_map.damaged.erase(rank)) {
+    return false;
+  }
+
+  // A formerly damaged rank becomes failed.
+  fs->mds_map.failed.insert(rank);
+  fs->mds_map.epoch = epoch;
+  return true;
+}
+
+void FSMap::insert(const MDSMap::mds_info_t &new_info)
+{
+  // Only rank-less daemons in STATE_STANDBY may be inserted.
+  ceph_assert(new_info.state == MDSMap::STATE_STANDBY);
+  ceph_assert(new_info.rank == MDS_RANK_NONE);
+
+  const auto gid = new_info.global_id;
+  mds_roles[gid] = FS_CLUSTER_ID_NONE;
+  standby_daemons[gid] = new_info;
+  standby_epochs[gid] = epoch;
+}
+
+std::list<mds_gid_t> FSMap::stop(mds_gid_t who)
+{
+  ceph_assert(mds_roles.at(who) != FS_CLUSTER_ID_NONE);
+  auto fs = filesystems.at(mds_roles.at(who));
+  // Copy the rank so it stays usable while entries are being erased.
+  const mds_rank_t rank = fs->mds_map.mds_info.at(who).rank;
+  fs->mds_map.up.erase(rank);
+  fs->mds_map.in.erase(rank);
+  fs->mds_map.stopped.insert(rank);
+
+  // Also drop any standby replays that were following this rank.
+  // Collect them first: erase() removes entries from mds_info, and erasing
+  // the element a range-for is currently visiting invalidates its iterator.
+  std::list<mds_gid_t> standbys;
+  for (const auto &[other_gid, other_info] : fs->mds_map.mds_info) {
+    if (other_info.rank == rank
+        && other_info.state == MDSMap::STATE_STANDBY_REPLAY) {
+      standbys.push_back(other_gid);
+    }
+  }
+  for (const auto &other_gid : standbys) {
+    erase(other_gid, 0);
+  }
+
+  fs->mds_map.mds_info.erase(who);
+  mds_roles.erase(who);
+
+  fs->mds_map.epoch = epoch;
+
+  return standbys;
+}
+
+
+/**
+ * Given one of the following forms:
+ * <fs name>:<rank>
+ * <fs id>:<rank>
+ * <rank>
+ *
+ * Parse into a mds_role_t. The rank-only form is only valid
+ * if legacy_client_fscid is set (i.e. is not FS_CLUSTER_ID_NONE).
+ */
+int FSMap::parse_role(
+    std::string_view role_str,
+    mds_role_t *role,
+    std::ostream &ss) const
+{
+  Filesystem::const_ref fs;
+  size_t rank_begin = 0;
+
+  const size_t sep = role_str.find(":");
+  if (sep != std::string::npos) {
+    // "<fs name or id>:<rank>"
+    if (parse_filesystem(role_str.substr(0, sep), &fs) < 0) {
+      ss << "Invalid filesystem";
+      return -ENOENT;
+    }
+    rank_begin = sep + 1;
+  } else {
+    // Bare "<rank>": resolved against the legacy client filesystem
+    if (legacy_client_fscid == FS_CLUSTER_ID_NONE) {
+      ss << "No filesystem selected";
+      return -ENOENT;
+    }
+    fs = get_filesystem(legacy_client_fscid);
+  }
+
+  std::string parse_err;
+  std::string rank_str(role_str.substr(rank_begin));
+  long parsed_rank = strict_strtol(rank_str.c_str(), 10, &parse_err);
+  if (parsed_rank < 0 || !parse_err.empty()) {
+    ss << "Invalid rank '" << rank_str << "'";
+    return -EINVAL;
+  }
+  mds_rank_t rank = parsed_rank;
+
+  // The rank must currently be "in" for the selected filesystem
+  if (fs->mds_map.in.count(rank) == 0) {
+    ss << "Rank '" << rank << "' not found";
+    return -ENOENT;
+  }
+
+  *role = {fs->fscid, rank};
+
+  return 0;
+}
diff --git a/src/mds/FSMap.h b/src/mds/FSMap.h
new file mode 100644
index 00000000..e02a3d72
--- /dev/null
+++ b/src/mds/FSMap.h
@@ -0,0 +1,532 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+
+#ifndef CEPH_FSMAP_H
+#define CEPH_FSMAP_H
+
+#include <map>
+#include <memory>
+#include <set>
+#include <string>
+#include <string_view>
+
+#include <errno.h>
+
+#include "include/types.h"
+#include "common/Clock.h"
+#include "mds/MDSMap.h"
+
+#include "include/CompatSet.h"
+#include "include/ceph_features.h"
+#include "common/Formatter.h"
+#include "mds/mdstypes.h"
+
+class CephContext;
+class health_check_map_t;
+
+#define MDS_FS_NAME_DEFAULT "cephfs"
+
+/**
+ * The MDSMap and any additional fields describing a particular
+ * filesystem (a unique fs_cluster_id_t).
+ */
+class Filesystem
+{
+public:
+  using ref = std::shared_ptr<Filesystem>;
+  using const_ref = std::shared_ptr<Filesystem const>;
+
+  /// Construct a shared Filesystem, forwarding any constructor arguments.
+  template<typename... Args>
+  static ref create(Args&&... args)
+  {
+    return std::make_shared<Filesystem>(std::forward<Args>(args)...);
+  }
+
+  void encode(bufferlist& bl, uint64_t features) const;
+  void decode(bufferlist::const_iterator& p);
+
+  void dump(Formatter *f) const;
+  void print(std::ostream& out) const;
+
+  /**
+   * Return true if some daemon is in STANDBY_REPLAY for the rank
+   * held by the gid `who`
+   */
+  bool has_standby_replay(mds_gid_t who) const
+  {
+    const mds_gid_t follower = get_standby_replay(who);
+    return follower != MDS_GID_NONE;
+  }
+
+  /**
+   * Return the gid of the first STANDBY_REPLAY daemon whose rank matches
+   * the rank of `who`, or MDS_GID_NONE if there is none.
+   */
+  mds_gid_t get_standby_replay(mds_gid_t who) const
+  {
+    for (const auto &[gid, info] : mds_map.mds_info) {
+      if (info.state != MDSMap::STATE_STANDBY_REPLAY) {
+	continue;
+      }
+      if (info.rank == mds_map.mds_info.at(who).rank) {
+	return info.global_id;
+      }
+    }
+    return MDS_GID_NONE;
+  }
+
+  /**
+   * Return true if `who` is stored in this filesystem and is in
+   * STATE_STANDBY_REPLAY.
+   */
+  bool is_standby_replay(mds_gid_t who) const
+  {
+    const auto it = mds_map.mds_info.find(who);
+    if (it == mds_map.mds_info.end()) {
+      return false;
+    }
+    return it->second.state == MDSMap::STATE_STANDBY_REPLAY;
+  }
+
+  fs_cluster_id_t fscid = FS_CLUSTER_ID_NONE;
+  MDSMap mds_map;
+};
+WRITE_CLASS_ENCODER_FEATURES(Filesystem)
+
+/**
+ * The set of all filesystems, the standby daemons that are not yet
+ * assigned to any of them, and the bookkeeping that ties each daemon
+ * GID to the place where its info is stored.
+ */
+class FSMap {
+protected:
+  epoch_t epoch = 0;
+  uint64_t next_filesystem_id = FS_CLUSTER_ID_ANONYMOUS + 1;
+  fs_cluster_id_t legacy_client_fscid = FS_CLUSTER_ID_NONE;
+  CompatSet compat;
+  bool enable_multiple = false;
+  bool ever_enabled_multiple = false; // < enable_multiple has been set to true at least once
+
+  std::map<fs_cluster_id_t, Filesystem::ref> filesystems;
+
+  // Remember which Filesystem an MDS daemon's info is stored in
+  // (or in standby_daemons for FS_CLUSTER_ID_NONE)
+  std::map<mds_gid_t, fs_cluster_id_t> mds_roles;
+
+  // For MDS daemons not yet assigned to a Filesystem
+  std::map<mds_gid_t, MDSMap::mds_info_t> standby_daemons;
+  std::map<mds_gid_t, epoch_t> standby_epochs;
+
+public:
+
+  friend class MDSMonitor;
+  friend class PaxosFSMap;
+
+  FSMap() : compat(MDSMap::get_compat_set_default()) {}
+
+  /**
+   * Deep copy: every Filesystem is duplicated, so the copy shares no
+   * Filesystem objects with `rhs`.
+   */
+  FSMap(const FSMap &rhs)
+    :
+      epoch(rhs.epoch),
+      next_filesystem_id(rhs.next_filesystem_id),
+      legacy_client_fscid(rhs.legacy_client_fscid),
+      compat(rhs.compat),
+      enable_multiple(rhs.enable_multiple),
+      ever_enabled_multiple(rhs.ever_enabled_multiple),
+      mds_roles(rhs.mds_roles),
+      standby_daemons(rhs.standby_daemons),
+      standby_epochs(rhs.standby_epochs)
+  {
+    for (const auto &i : rhs.filesystems) {
+      const auto &fs = i.second;
+      filesystems[fs->fscid] = std::make_shared<Filesystem>(*fs);
+    }
+  }
+
+  /**
+   * Deep-copy assignment, mirroring the copy constructor.
+   */
+  FSMap &operator=(const FSMap &rhs)
+  {
+    // Self-assignment must be a no-op: clearing `filesystems` below would
+    // otherwise also empty the map we are about to copy from.
+    if (this == &rhs) {
+      return *this;
+    }
+
+    epoch = rhs.epoch;
+    next_filesystem_id = rhs.next_filesystem_id;
+    legacy_client_fscid = rhs.legacy_client_fscid;
+    compat = rhs.compat;
+    enable_multiple = rhs.enable_multiple;
+    // Copied by the copy constructor as well; keep the two in agreement.
+    ever_enabled_multiple = rhs.ever_enabled_multiple;
+    mds_roles = rhs.mds_roles;
+    standby_daemons = rhs.standby_daemons;
+    standby_epochs = rhs.standby_epochs;
+
+    filesystems.clear();
+    for (const auto &i : rhs.filesystems) {
+      const auto &fs = i.second;
+      filesystems[fs->fscid] = std::make_shared<Filesystem>(*fs);
+    }
+
+    return *this;
+  }
+
+  const CompatSet &get_compat() const {return compat;}
+
+  /**
+   * Set enable_multiple; enabling it also latches ever_enabled_multiple.
+   */
+  void set_enable_multiple(const bool v)
+  {
+    enable_multiple = v;
+    if (v) {
+      ever_enabled_multiple = true;
+    }
+  }
+
+  bool get_enable_multiple() const
+  {
+    return enable_multiple;
+  }
+
+  /**
+   * Select the filesystem used by legacy clients. `fscid` must be
+   * FS_CLUSTER_ID_NONE or identify an existing filesystem.
+   */
+  void set_legacy_client_fscid(fs_cluster_id_t fscid)
+  {
+    ceph_assert(fscid == FS_CLUSTER_ID_NONE || filesystems.count(fscid));
+    legacy_client_fscid = fscid;
+  }
+
+  fs_cluster_id_t get_legacy_client_fscid() const
+  {
+    return legacy_client_fscid;
+  }
+
+  size_t get_num_standby() const {
+    return standby_daemons.size();
+  }
+
+  /**
+   * Return true if the MDSMap of any filesystem reports is_degraded()
+   */
+  bool is_any_degraded() const {
+    for (auto& i : filesystems) {
+      if (i.second->mds_map.is_degraded()) {
+	return true;
+      }
+    }
+    return false;
+  }
+
+  /**
+   * Get state of all daemons (for all filesystems, including all standbys)
+   */
+  std::map<mds_gid_t, MDSMap::mds_info_t> get_mds_info() const
+  {
+    std::map<mds_gid_t, MDSMap::mds_info_t> result;
+    for (const auto &i : standby_daemons) {
+      result[i.first] = i.second;
+    }
+
+    for (const auto &i : filesystems) {
+      const auto &fs_info = i.second->mds_map.get_mds_info();
+      for (const auto &j : fs_info) {
+        result[j.first] = j.second;
+      }
+    }
+
+    return result;
+  }
+
+  mds_gid_t get_available_standby() const;
+
+  /**
+   * Resolve daemon name to GID
+   *
+   * @returns the lowest matching GID, or MDS_GID_NONE if no daemon
+   *          has that name
+   */
+  mds_gid_t find_mds_gid_by_name(std::string_view s) const
+  {
+    const auto info = get_mds_info();
+    for (const auto &p : info) {
+      if (p.second.name == s) {
+	return p.first;
+      }
+    }
+    return MDS_GID_NONE;
+  }
+
+  /**
+   * Resolve daemon name to status
+   *
+   * Standbys are searched first, then each filesystem.
+   *
+   * @returns a pointer to the stored info, or nullptr if no daemon
+   *          has that name
+   */
+  const MDSMap::mds_info_t* find_by_name(std::string_view name) const
+  {
+    for (const auto &i : standby_daemons) {
+      if (i.second.name == name) {
+	return &(i.second);
+      }
+    }
+
+    for (const auto &i : filesystems) {
+      const auto &fs_info = i.second->mds_map.get_mds_info();
+      for (const auto &j : fs_info) {
+        if (j.second.name == name) {
+          return &(j.second);
+        }
+      }
+    }
+
+    return nullptr;
+  }
+
+  /**
+   * Does a daemon exist with this GID?
+   */
+  bool gid_exists(mds_gid_t gid) const
+  {
+    return mds_roles.count(gid) > 0;
+  }
+
+  /**
+   * Does a daemon with this GID exist, *and* have an MDS rank assigned?
+   */
+  bool gid_has_rank(mds_gid_t gid) const
+  {
+    return gid_exists(gid) && mds_roles.at(gid) != FS_CLUSTER_ID_NONE;
+  }
+
+  /**
+   * Insert a new MDS daemon, as a standby
+   */
+  void insert(const MDSMap::mds_info_t &new_info);
+
+  /**
+   * Assign an MDS cluster standby replay rank to a standby daemon
+   */
+  void assign_standby_replay(
+      const mds_gid_t standby_gid,
+      const fs_cluster_id_t leader_ns,
+      const mds_rank_t leader_rank);
+
+  /**
+   * Assign an MDS cluster rank to a standby daemon
+   */
+  void promote(
+      mds_gid_t standby_gid,
+      Filesystem& filesystem,
+      mds_rank_t assigned_rank);
+
+  /**
+   * A daemon reports that it is STATE_STOPPED: remove it,
+   * and the rank it held.
+   *
+   * @returns a list of any additional GIDs that were removed from the map
+   * as a side effect (like standby replays)
+   */
+  std::list<mds_gid_t> stop(mds_gid_t who);
+
+  /**
+   * The rank held by 'who', if any, is to be relinquished, and
+   * the state for the daemon GID is to be forgotten.
+   */
+  void erase(mds_gid_t who, epoch_t blacklist_epoch);
+
+  /**
+   * Update to indicate that the rank held by 'who' is damaged
+   */
+  void damaged(mds_gid_t who, epoch_t blacklist_epoch);
+
+  /**
+   * Update to indicate that the rank `rank` is to be removed
+   * from the damaged list of the filesystem `fscid`
+   */
+  bool undamaged(const fs_cluster_id_t fscid, const mds_rank_t rank);
+
+  /**
+   * Initialize a Filesystem and assign a fscid.  Update legacy_client_fscid
+   * to point to the new filesystem if it's the only one.
+   *
+   * Caller must already have validated all arguments vs. the existing
+   * FSMap and OSDMap contents.
+   */
+  Filesystem::ref create_filesystem(
+      std::string_view name, int64_t metadata_pool,
+      int64_t data_pool, uint64_t features);
+
+  /**
+   * Remove the filesystem (it must exist).  Caller should already
+   * have failed out any MDSs that were assigned to the filesystem.
+   */
+  void erase_filesystem(fs_cluster_id_t fscid)
+  {
+    filesystems.erase(fscid);
+  }
+
+  /**
+   * Reset all the state information (not configuration information)
+   * in a particular filesystem.  Caller must have verified that
+   * the filesystem already exists.
+   */
+  void reset_filesystem(fs_cluster_id_t fscid);
+
+  /**
+   * Mutator helper for Filesystem objects: expose a non-const
+   * Filesystem pointer to `fn` and update epochs appropriately.
+   */
+  template<typename T>
+  void modify_filesystem(fs_cluster_id_t fscid, T&& fn)
+  {
+    auto& fs = filesystems.at(fscid);
+    fn(fs);
+    fs->mds_map.epoch = epoch;
+  }
+
+  /**
+   * Apply a mutation to the mds_info_t structure for a particular
+   * daemon (identified by GID), and make appropriate updates to epochs.
+   */
+  template<typename T>
+  void modify_daemon(mds_gid_t who, T&& fn)
+  {
+    const auto& fscid = mds_roles.at(who);
+    if (fscid == FS_CLUSTER_ID_NONE) {
+      auto& info = standby_daemons.at(who);
+      fn(info);
+      // A standby must still be a standby after the mutation
+      ceph_assert(info.state == MDSMap::STATE_STANDBY);
+      standby_epochs[who] = epoch;
+    } else {
+      auto& fs = filesystems.at(fscid);
+      auto& info = fs->mds_map.mds_info.at(who);
+      fn(info);
+      fs->mds_map.epoch = epoch;
+    }
+  }
+
+  /**
+   * Given that gid exists in a filesystem or as a standby, return
+   * a reference to its info.
+   */
+  const MDSMap::mds_info_t& get_info_gid(mds_gid_t gid) const
+  {
+    auto fscid = mds_roles.at(gid);
+    if (fscid == FS_CLUSTER_ID_NONE) {
+      return standby_daemons.at(gid);
+    } else {
+      return filesystems.at(fscid)->mds_map.mds_info.at(gid);
+    }
+  }
+
+  /**
+   * Forward to Filesystem::is_standby_replay for the filesystem `who`
+   * belongs to.  Both lookups use at(), so `who` must have a filesystem
+   * role (a plain standby maps to FS_CLUSTER_ID_NONE, which is not a key
+   * of `filesystems`).
+   */
+  bool is_standby_replay(mds_gid_t who) const
+  {
+    return filesystems.at(mds_roles.at(who))->is_standby_replay(who);
+  }
+
+  /**
+   * Forward to Filesystem::get_standby_replay for the filesystem `who`
+   * belongs to.  Same precondition as is_standby_replay().
+   */
+  mds_gid_t get_standby_replay(mds_gid_t who) const
+  {
+    return filesystems.at(mds_roles.at(who))->get_standby_replay(who);
+  }
+
+  /**
+   * A daemon has told us it's compat, and it's too new
+   * for the one we had previously.  Impose the new one
+   * on all filesystems.
+   */
+  void update_compat(const CompatSet &c)
+  {
+    // We could do something more complicated here to enable
+    // different filesystems to be served by different MDS versions,
+    // but this is a lot simpler because it doesn't require us to
+    // track the compat versions for standby daemons.
+    compat = c;
+    for (const auto &i : filesystems) {
+      MDSMap &mds_map = i.second->mds_map;
+      mds_map.compat = c;
+      mds_map.epoch = epoch;
+    }
+  }
+
+  /**
+   * Return the filesystem used by legacy clients, or nullptr if
+   * none is selected.
+   */
+  Filesystem::const_ref get_legacy_filesystem()
+  {
+    if (legacy_client_fscid == FS_CLUSTER_ID_NONE) {
+      return nullptr;
+    } else {
+      return filesystems.at(legacy_client_fscid);
+    }
+  }
+
+  /**
+   * A daemon has informed us of its offload targets
+   */
+  void update_export_targets(mds_gid_t who, const std::set<mds_rank_t> &targets)
+  {
+    auto fscid = mds_roles.at(who);
+    modify_filesystem(fscid, [who, &targets](auto&& fs) {
+      fs->mds_map.mds_info.at(who).export_targets = targets;
+    });
+  }
+
+  epoch_t get_epoch() const { return epoch; }
+  void inc_epoch() { epoch++; }
+
+  size_t filesystem_count() const {return filesystems.size();}
+  bool filesystem_exists(fs_cluster_id_t fscid) const {return filesystems.count(fscid) > 0;}
+  Filesystem::const_ref get_filesystem(fs_cluster_id_t fscid) const {return std::const_pointer_cast<const Filesystem>(filesystems.at(fscid));}
+  Filesystem::ref get_filesystem(fs_cluster_id_t fscid) {return filesystems.at(fscid);}
+  // NOTE: dereferences filesystems.begin(); only valid when at least one
+  // filesystem exists.
+  Filesystem::const_ref get_filesystem(void) const {return std::const_pointer_cast<const Filesystem>(filesystems.begin()->second);}
+  /**
+   * Look a filesystem up by name; returns nullptr if there is no match.
+   */
+  Filesystem::const_ref get_filesystem(std::string_view name) const
+  {
+    for (const auto& p : filesystems) {
+      if (p.second->mds_map.fs_name == name) {
+        return p.second;
+      }
+    }
+    return nullptr;
+  }
+  /**
+   * Return all filesystems, in fscid order.
+   */
+  std::vector<Filesystem::const_ref> get_filesystems(void) const
+  {
+    std::vector<Filesystem::const_ref> ret;
+    for (const auto& p : filesystems) {
+      ret.push_back(p.second);
+    }
+    return ret;
+  }
+
+  int parse_filesystem(
+      std::string_view ns_str,
+      Filesystem::const_ref *result
+      ) const;
+
+  int parse_role(
+      std::string_view role_str,
+      mds_role_t *role,
+      std::ostream &ss) const;
+
+  /**
+   * Return true if this pool is in use by any of the filesystems
+   */
+  bool pool_in_use(int64_t poolid) const {
+    for (auto const &i : filesystems) {
+      if (i.second->mds_map.is_data_pool(poolid)
+          || i.second->mds_map.metadata_pool == poolid) {
+        return true;
+      }
+    }
+    return false;
+  }
+
+  mds_gid_t find_replacement_for(mds_role_t mds, std::string_view name) const;
+
+  void get_health(list<pair<health_status_t,std::string> >& summary,
+		  list<pair<health_status_t,std::string> > *detail) const;
+
+  void get_health_checks(health_check_map_t *checks) const;
+
+  bool check_health(void);
+
+  /**
+   * Assert that the FSMap, Filesystem, MDSMap, mds_info_t relations are
+   * all self-consistent.
+   */
+  void sanity() const;
+
+  void encode(bufferlist& bl, uint64_t features) const;
+  void decode(bufferlist::const_iterator& p);
+  void decode(bufferlist& bl) {
+    auto p = bl.cbegin();
+    decode(p);
+  }
+  void sanitize(const std::function<bool(int64_t pool)>& pool_exists);
+
+  void print(ostream& out) const;
+  void print_summary(Formatter *f, ostream *out) const;
+
+  void dump(Formatter *f) const;
+  static void generate_test_instances(list<FSMap*>& ls);
+};
+WRITE_CLASS_ENCODER_FEATURES(FSMap)
+
+// Stream an FSMap as its plain-text summary (no Formatter).
+inline ostream& operator<<(ostream& out, const FSMap& m) {
+  Formatter *no_formatter = nullptr;
+  m.print_summary(no_formatter, &out);
+  return out;
+}
+
+#endif
diff --git a/src/mds/FSMapUser.cc b/src/mds/FSMapUser.cc
new file mode 100644
index 00000000..47d5f19c
--- /dev/null
+++ b/src/mds/FSMapUser.cc
@@ -0,0 +1,81 @@
+#include "FSMapUser.h"
+
+// Encode as: epoch, legacy_client_fscid, then the filesystems flattened
+// into a vector of fs_info_t in map order.
+void FSMapUser::encode(bufferlist& bl, uint64_t features) const
+{
+  ENCODE_START(1, 1, bl);
+  encode(epoch, bl);
+  encode(legacy_client_fscid, bl);
+  std::vector<fs_info_t> infos;
+  for (const auto& entry : filesystems) {
+    infos.push_back(entry.second);
+  }
+  encode(infos, bl, features);
+  ENCODE_FINISH(bl);
+}
+
+// Decode the layout written by encode(): epoch, legacy_client_fscid and a
+// vector of fs_info_t, which replaces the current filesystems map keyed by cid.
+void FSMapUser::decode(bufferlist::const_iterator& p)
+{
+  DECODE_START(1, p);
+  decode(epoch, p);
+  decode(legacy_client_fscid, p);
+  std::vector<fs_info_t> infos;
+  decode(infos, p);
+  filesystems.clear();
+  // distinct loop variable name: the bufferlist iterator parameter is also called p
+  for (const auto& info : infos) {
+    filesystems[info.cid] = info;
+  }
+  DECODE_FINISH(p);
+}
+
+// Encode a single filesystem entry: cid followed by name.
+// The features argument is not consulted by this body.
+void FSMapUser::fs_info_t::encode(bufferlist& bl, uint64_t features) const
+{
+ ENCODE_START(1, 1, bl);
+ encode(cid, bl);
+ encode(name, bl);
+ ENCODE_FINISH(bl);
+}
+
+// Decode a single filesystem entry in the order written by encode(): cid, then name.
+void FSMapUser::fs_info_t::decode(bufferlist::const_iterator& p)
+{
+ DECODE_START(1, p);
+ decode(cid, p);
+ decode(name, p);
+ DECODE_FINISH(p);
+}
+
+// Append one sample map (epoch 2, two filesystems "cephfs1"/"cephfs2") to ls.
+// The instance is heap-allocated and handed to the list as a raw pointer.
+void FSMapUser::generate_test_instances(list<FSMapUser*>& ls)
+{
+  FSMapUser *sample = new FSMapUser();
+  sample->epoch = 2;
+  sample->legacy_client_fscid = 1;
+
+  fs_info_t &first = sample->filesystems[1];
+  first.cid = 1;
+  first.name = "cephfs1";
+
+  fs_info_t &second = sample->filesystems[2];
+  second.cid = 2;
+  second.name = "cephfs2";
+
+  ls.push_back(sample);
+}
+
+
+// Write a multi-line description: epoch, legacy_client_fscid, then one
+// " id <cid> name <name>" line per filesystem.
+void FSMapUser::print(ostream& out) const
+{
+  out << "e" << epoch << std::endl;
+  out << "legacy_client_fscid: " << legacy_client_fscid << std::endl;
+  for (auto it = filesystems.begin(); it != filesystems.end(); ++it) {
+    const fs_info_t &fs = it->second;
+    out << " id " << fs.cid << " name " << fs.name << std::endl;
+  }
+}
+
+/**
+ * Emit a short summary of the map.
+ *
+ * @param f   if non-null, receives "epoch" plus an "id"/"name" pair for
+ *            every filesystem
+ * @param out used only when f is null; receives "e<epoch>:" followed by
+ *            " <name>(<cid>)" for every filesystem. Ignored if null.
+ */
+void FSMapUser::print_summary(Formatter *f, ostream *out)
+{
+  if (f) {
+    f->dump_unsigned("epoch", get_epoch());
+    for (auto &p : filesystems) {
+      f->dump_unsigned("id", p.second.cid);
+      f->dump_string("name", p.second.name);
+    }
+  } else if (out) {
+    // guard against a null stream instead of dereferencing it blindly
+    *out << "e" << get_epoch() << ":";
+    for (auto &p : filesystems)
+      *out << " " << p.second.name << "(" << p.second.cid << ")";
+  }
+}
diff --git a/src/mds/FSMapUser.h b/src/mds/FSMapUser.h
new file mode 100644
index 00000000..23af8473
--- /dev/null
+++ b/src/mds/FSMapUser.h
@@ -0,0 +1,65 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ */
+
+#ifndef CEPH_FSMAPCOMPACT_H
+#define CEPH_FSMAPCOMPACT_H
+
+#include <map>
+#include <string>
+#include <string_view>
+
+#include "mds/mdstypes.h"
+
+// Reduced view of the filesystem map: epoch, the legacy client filesystem id,
+// and an id/name pair per filesystem.
+class FSMapUser {
+public:
+ // One filesystem entry: cluster id plus name.
+ struct fs_info_t {
+ fs_cluster_id_t cid;
+ std::string name;
+ // cid starts out as FS_CLUSTER_ID_NONE; name starts out empty
+ fs_info_t() : cid(FS_CLUSTER_ID_NONE) {}
+ void encode(bufferlist& bl, uint64_t features) const;
+ void decode(bufferlist::const_iterator &bl);
+ };
+
+ epoch_t epoch;
+ fs_cluster_id_t legacy_client_fscid;
+ // keyed by filesystem cluster id
+ std::map<fs_cluster_id_t, fs_info_t> filesystems;
+
+ // Default state: epoch 0, no legacy client filesystem, no filesystems.
+ FSMapUser()
+ : epoch(0), legacy_client_fscid(FS_CLUSTER_ID_NONE) { }
+
+ epoch_t get_epoch() const { return epoch; }
+
+ // Return the map key of the first filesystem whose name equals `name`,
+ // or FS_CLUSTER_ID_NONE when no entry matches.
+ fs_cluster_id_t get_fs_cid(std::string_view name) const {
+ for (auto &p : filesystems) {
+ if (p.second.name == name)
+ return p.first;
+ }
+ return FS_CLUSTER_ID_NONE;
+ }
+
+ void encode(bufferlist& bl, uint64_t features) const;
+ void decode(bufferlist::const_iterator& bl);
+
+ void print(ostream& out) const;
+ // NOTE: declared non-const, which is why operator<< below takes a non-const reference
+ void print_summary(Formatter *f, ostream *out);
+
+ static void generate_test_instances(list<FSMapUser*>& ls);
+};
+WRITE_CLASS_ENCODER_FEATURES(FSMapUser::fs_info_t)
+WRITE_CLASS_ENCODER_FEATURES(FSMapUser)
+
+// Stream an FSMapUser as its plain-text summary (no Formatter is passed).
+inline ostream& operator<<(ostream& out, FSMapUser& m) {
+  ostream *target = &out;
+  m.print_summary(nullptr, target);
+  return out;
+}
+#endif
diff --git a/src/mds/InoTable.cc b/src/mds/InoTable.cc
new file mode 100644
index 00000000..dfb6a41d
--- /dev/null
+++ b/src/mds/InoTable.cc
@@ -0,0 +1,235 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#include "InoTable.h"
+#include "MDSRank.h"
+
+#include "include/types.h"
+
+#include "common/config.h"
+
+#define dout_context g_ceph_context
+#define dout_subsys ceph_subsys_mds
+#undef dout_prefix
+#define dout_prefix *_dout << "mds." << rank << "." << table_name << ": "
+
+// Reset the table to its initial contents: the free set becomes a single
+// range of length 2^40 starting at (rank+1) << 40, and projected_free is
+// made identical to it.
+void InoTable::reset_state()
+{
+ // use generic range. FIXME THIS IS CRAP
+ free.clear();
+ //#ifdef __LP64__
+ uint64_t start = (uint64_t)(rank+1) << 40;
+ uint64_t len = (uint64_t)1 << 40;
+ //#else
+ //# warning this looks like a 32-bit system, using small inode numbers.
+ // uint64_t start = (uint64_t)(mds->get_nodeid()+1) << 25;
+ // uint64_t end = ((uint64_t)(mds->get_nodeid()+2) << 25) - 1;
+ //#endif
+ free.insert(start, len);
+
+ projected_free = free;
+}
+
+// Remove one id from projected_free and bump projected_version.
+// When id is 0 the id at projected_free.range_start() is chosen.
+// Asserts is_active(). Returns the id that was taken.
+inodeno_t InoTable::project_alloc_id(inodeno_t id)
+{
+ dout(10) << "project_alloc_id " << id << " to " << projected_free << "/" << free << dendl;
+ ceph_assert(is_active());
+ if (!id)
+ id = projected_free.range_start();
+ projected_free.erase(id);
+ ++projected_version;
+ return id;
+}
+// Remove id from the free set and bump version.
+void InoTable::apply_alloc_id(inodeno_t id)
+{
+ dout(10) << "apply_alloc_id " << id << " to " << projected_free << "/" << free << dendl;
+ free.erase(id);
+ ++version;
+}
+
+// Move `want` ids from the front of projected_free into ids, one range at a
+// time, then bump projected_version once. Asserts is_active().
+void InoTable::project_alloc_ids(interval_set<inodeno_t>& ids, int want)
+{
+ ceph_assert(is_active());
+ while (want > 0) {
+ inodeno_t start = projected_free.range_start();
+ // presumably the end of the interval that begins at start -- confirm against interval_set
+ inodeno_t end = projected_free.end_after(start);
+ inodeno_t num = end - start;
+ // never take more than is still wanted
+ if (num > (inodeno_t)want)
+ num = want;
+ projected_free.erase(start, num);
+ ids.insert(start, num);
+ want -= num;
+ }
+ dout(10) << "project_alloc_ids " << ids << " to " << projected_free << "/" << free << dendl;
+ ++projected_version;
+}
+// Subtract ids from the free set and bump version.
+void InoTable::apply_alloc_ids(interval_set<inodeno_t>& ids)
+{
+ dout(10) << "apply_alloc_ids " << ids << " to " << projected_free << "/" << free << dendl;
+ free.subtract(ids);
+ ++version;
+}
+
+
+// Insert ids into projected_free and bump projected_version.
+void InoTable::project_release_ids(interval_set<inodeno_t>& ids)
+{
+ dout(10) << "project_release_ids " << ids << " to " << projected_free << "/" << free << dendl;
+ projected_free.insert(ids);
+ ++projected_version;
+}
+// Insert ids into the free set and bump version.
+void InoTable::apply_release_ids(interval_set<inodeno_t>& ids)
+{
+ dout(10) << "apply_release_ids " << ids << " to " << projected_free << "/" << free << dendl;
+ free.insert(ids);
+ ++version;
+}
+
+
+//
+
+// Replay an allocation of a single id: erase it from both free and
+// projected_free if it is currently free, otherwise report an error to the
+// cluster log. In either case version is bumped and projected_version is
+// set equal to it.
+void InoTable::replay_alloc_id(inodeno_t id)
+{
+ ceph_assert(mds); // Only usable in online mode
+
+ dout(10) << "replay_alloc_id " << id << dendl;
+ if (free.contains(id)) {
+ free.erase(id);
+ projected_free.erase(id);
+ } else {
+ mds->clog->error() << "journal replay alloc " << id
+ << " not in free " << free;
+ }
+ projected_version = ++version;
+}
+// Replay an allocation of a set of ids. Only the part of ids that is
+// currently free is subtracted from free and projected_free; if that part
+// differs from ids an error is sent to the cluster log. version is bumped
+// and projected_version is set equal to it.
+void InoTable::replay_alloc_ids(interval_set<inodeno_t>& ids)
+{
+ ceph_assert(mds); // Only usable in online mode
+
+ dout(10) << "replay_alloc_ids " << ids << dendl;
+ interval_set<inodeno_t> is;
+ is.intersection_of(free, ids);
+ if (!(is==ids)) {
+ mds->clog->error() << "journal replay alloc " << ids << ", only "
+ << is << " is in free " << free;
+ }
+ free.subtract(is);
+ projected_free.subtract(is);
+
+ projected_version = ++version;
+}
+// Replay a release: insert ids into both free and projected_free, bump
+// version and set projected_version equal to it.
+void InoTable::replay_release_ids(interval_set<inodeno_t>& ids)
+{
+ dout(10) << "replay_release_ids " << ids << dendl;
+ free.insert(ids);
+ projected_free.insert(ids);
+ projected_version = ++version;
+}
+
+
+// Replay a reset by skipping 10000000 ids via skip_inos().
+// skip_inos() already copies free into projected_free and bumps version, so
+// the two statements after the call repeat the copy and advance version a
+// second time.
+void InoTable::replay_reset()
+{
+ dout(10) << "replay_reset " << free << dendl;
+ skip_inos(inodeno_t(10000000)); // a lot!
+ projected_free = free;
+ projected_version = ++version;
+}
+
+
+// Consume ids from the front of the free set: build a range of length i
+// starting at free.range_start(), keep only the part of it that is actually
+// free, and subtract that from free. projected_free is then overwritten with
+// free, version is bumped and projected_version set equal to it.
+void InoTable::skip_inos(inodeno_t i)
+{
+ dout(10) << "skip_inos was " << free << dendl;
+ inodeno_t first = free.range_start();
+ interval_set<inodeno_t> s;
+ s.insert(first, i);
+ s.intersection_of(free);
+ free.subtract(s);
+ projected_free = free;
+ projected_version = ++version;
+ dout(10) << "skip_inos now " << free << dendl;
+}
+
+// Dump the table as an "inotable" object holding two arrays,
+// "projected_free" and "free", each a list of {start, len} ranges.
+void InoTable::dump(Formatter *f) const
+{
+  // both sets are emitted with the same layout
+  auto dump_ranges = [f](const char *section, const interval_set<inodeno_t>& ranges) {
+    f->open_array_section(section);
+    for (interval_set<inodeno_t>::const_iterator r = ranges.begin(); r != ranges.end(); ++r) {
+      f->open_object_section("range");
+      f->dump_int("start", (*r).first);
+      f->dump_int("len", (*r).second);
+      f->close_section();
+    }
+    f->close_section();
+  };
+
+  f->open_object_section("inotable");
+  dump_ranges("projected_free", projected_free);
+  dump_ranges("free", free);
+  f->close_section();
+}
+
+
+// Append one default-constructed, heap-allocated InoTable to ls.
+void InoTable::generate_test_instances(list<InoTable*>& ls)
+{
+ ls.push_back(new InoTable());
+}
+
+
+// True when id appears in the free set or in the projected free set.
+bool InoTable::is_marked_free(inodeno_t id) const
+{
+  if (free.contains(id))
+    return true;
+  return projected_free.contains(id);
+}
+
+// Compute the overlap between `other` and the free set.
+// If intersection is non-null the overlap is copied into it.
+// Returns true when the overlap is non-empty.
+bool InoTable::intersects_free(
+  const interval_set<inodeno_t> &other,
+  interval_set<inodeno_t> *intersection)
+{
+  interval_set<inodeno_t> overlap;
+  overlap.intersection_of(free, other);
+  if (intersection != nullptr)
+    *intersection = overlap;
+  const bool has_overlap = !overlap.empty();
+  return has_overlap;
+}
+
+// Force id out of both free sets.
+// Returns false without changing anything when projected_version differs
+// from version; otherwise asserts that id is marked free, erases it from
+// free and projected_free, bumps version (projected_version follows) and
+// returns true.
+bool InoTable::repair(inodeno_t id)
+{
+ if (projected_version != version) {
+ // Can't do the repair while other things are in flight
+ return false;
+ }
+
+ ceph_assert(is_marked_free(id));
+ dout(10) << "repair: before status. ino = " << id << " pver =" << projected_version << " ver= " << version << dendl;
+ free.erase(id);
+ projected_free.erase(id);
+ projected_version = ++version;
+ dout(10) << "repair: after status. ino = " << id << " pver =" << projected_version << " ver= " << version << dendl;
+ return true;
+}
+
+// Consume every id from the start of the free set up to and including ino.
+// Returns false, leaving the table untouched, when the free set already
+// starts above ino.
+bool InoTable::force_consume_to(inodeno_t ino)
+{
+  const inodeno_t first = free.range_start();
+  if (first > ino) {
+    return false;
+  }
+
+  // number of ids in [first, ino]
+  const inodeno_t count(ino + 1 - first);
+  skip_inos(count);
+  return true;
+}
diff --git a/src/mds/InoTable.h b/src/mds/InoTable.h
new file mode 100644
index 00000000..0e26e1e9
--- /dev/null
+++ b/src/mds/InoTable.h
@@ -0,0 +1,103 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+
+#ifndef CEPH_INOTABLE_H
+#define CEPH_INOTABLE_H
+
+#include "MDSTable.h"
+#include "include/interval_set.h"
+
+class MDSRank;
+
+// MDSTable that tracks which inode numbers are unused. Two sets are kept:
+// `free` and `projected_free`; the project_* methods modify only the
+// projected set, the apply_* methods only `free`, and the replay_* methods both.
+class InoTable : public MDSTable {
+ interval_set<inodeno_t> free; // unused ids
+ interval_set<inodeno_t> projected_free;
+
+ public:
+ explicit InoTable(MDSRank *m) : MDSTable(m, "inotable", true) { }
+
+ // id == 0 (the default) lets the table pick the id
+ inodeno_t project_alloc_id(inodeno_t id=0);
+ void apply_alloc_id(inodeno_t id);
+
+ void project_alloc_ids(interval_set<inodeno_t>& inos, int want);
+ void apply_alloc_ids(interval_set<inodeno_t>& inos);
+
+ void project_release_ids(interval_set<inodeno_t>& inos);
+ void apply_release_ids(interval_set<inodeno_t>& inos);
+
+ void replay_alloc_id(inodeno_t ino);
+ void replay_alloc_ids(interval_set<inodeno_t>& inos);
+ void replay_release_ids(interval_set<inodeno_t>& inos);
+ void replay_reset();
+ bool repair(inodeno_t id);
+ bool is_marked_free(inodeno_t id) const;
+ bool intersects_free(
+ const interval_set<inodeno_t> &other,
+ interval_set<inodeno_t> *intersection);
+
+ void reset_state() override;
+ // Only `free` is encoded; projected_free is not part of the encoding.
+ void encode_state(bufferlist& bl) const override {
+ ENCODE_START(2, 2, bl);
+ encode(free, bl);
+ ENCODE_FINISH(bl);
+ }
+ // After decoding, projected_free is reset to a copy of free.
+ void decode_state(bufferlist::const_iterator& bl) override {
+ DECODE_START_LEGACY_COMPAT_LEN(2, 2, 2, bl);
+ decode(free, bl);
+ projected_free = free;
+ DECODE_FINISH(bl);
+ }
+
+ // To permit enc/decoding in isolation in dencoder
+ InoTable() : MDSTable(NULL, "inotable", true) {}
+ void encode(bufferlist& bl) const {
+ encode_state(bl);
+ }
+ void decode(bufferlist::const_iterator& bl) {
+ decode_state(bl);
+ }
+ void dump(Formatter *f) const;
+ static void generate_test_instances(list<InoTable*>& ls);
+
+ void skip_inos(inodeno_t i);
+
+ /**
+ * If the specified inode is marked as free, mark it as used.
+ * For use in tools, not normal operations.
+ *
+ * Note: only `free` is modified here; projected_free and the version
+ * counters are left untouched.
+ *
+ * @returns true if the inode was previously marked as free
+ */
+ bool force_consume(inodeno_t ino)
+ {
+ if (free.contains(ino)) {
+ free.erase(ino);
+ return true;
+ } else {
+ return false;
+ }
+ }
+
+ /**
+ * If this ino is in this rank's range, consume up to and including it.
+ * For use in tools, when we know the max ino in use and want to make
+ * sure we're only allocating new inodes from above it.
+ *
+ * @return true if the table was modified
+ */
+ bool force_consume_to(inodeno_t ino);
+};
+WRITE_CLASS_ENCODER(InoTable)
+
+#endif
diff --git a/src/mds/JournalPointer.cc b/src/mds/JournalPointer.cc
new file mode 100644
index 00000000..797798aa
--- /dev/null
+++ b/src/mds/JournalPointer.cc
@@ -0,0 +1,122 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+
+#include "common/debug.h"
+#include "common/errno.h"
+#include "common/Cond.h"
+#include "osdc/Objecter.h"
+#include "mds/mdstypes.h"
+#include "msg/Messenger.h"
+
+#include "mds/JournalPointer.h"
+
+
+#define dout_context g_ceph_context
+#define dout_subsys ceph_subsys_journaler
+#undef dout_prefix
+#define dout_prefix *_dout << objecter->messenger->get_myname() << ".journalpointer "
+
+
+// Build the name of the object holding this pointer:
+// "<hex of MDS_INO_LOG_POINTER_OFFSET + node_id>.00000000".
+std::string JournalPointer::get_object_id() const
+{
+  char name[32];
+  const inodeno_t pointer_ino = MDS_INO_LOG_POINTER_OFFSET + node_id;
+  snprintf(name, sizeof(name), "%llx.%08llx",
+           (long long unsigned)pointer_ino, (long long unsigned)0);
+  return std::string(name);
+}
+
+
+/**
+ * Blocking read of JournalPointer for this MDS
+ *
+ * @return 0 on success, -EINVAL if the object was read but could not be
+ *         decoded, otherwise the status reported by the read
+ */
+int JournalPointer::load(Objecter *objecter)
+{
+  ceph_assert(objecter != NULL);
+
+  // Blocking read of data
+  std::string const object_id = get_object_id();
+  dout(4) << "Reading journal pointer '" << object_id << "'" << dendl;
+  bufferlist data;
+  C_SaferCond waiter;
+  objecter->read_full(object_t(object_id), object_locator_t(pool_id),
+                      CEPH_NOSNAP, &data, 0, &waiter);
+  int r = waiter.wait();
+
+  // Construct JournalPointer result, null or decoded data
+  if (r == 0) {
+    auto q = data.cbegin();
+    try {
+      decode(q);
+    } catch (const buffer::error &e) {
+      // report the corruption instead of failing silently
+      derr << "Journal pointer '" << object_id << "' decode failed: " << e.what() << dendl;
+      return -EINVAL;
+    }
+  } else {
+    dout(1) << "Journal pointer '" << object_id << "' read failed: " << cpp_strerror(r) << dendl;
+  }
+  return r;
+}
+
+
+/**
+ * Blocking write of JournalPointer for this MDS
+ *
+ * Asserts that the pointer is not null (see is_null()), encodes it and
+ * writes the whole object, waiting for the write to complete. A negative
+ * result is logged before being returned.
+ *
+ * @return objecter write op status code
+ */
+int JournalPointer::save(Objecter *objecter) const
+{
+ ceph_assert(objecter != NULL);
+ // It is not valid to persist a null pointer
+ ceph_assert(!is_null());
+
+ // Serialize JournalPointer object
+ bufferlist data;
+ encode(data);
+
+ // Write to RADOS and wait for durability
+ std::string const object_id = get_object_id();
+ dout(4) << "Writing pointer object '" << object_id << "': 0x"
+ << std::hex << front << ":0x" << back << std::dec << dendl;
+
+ C_SaferCond waiter;
+ objecter->write_full(object_t(object_id), object_locator_t(pool_id),
+ SnapContext(), data,
+ ceph::real_clock::now(), 0,
+ &waiter);
+ int write_result = waiter.wait();
+ if (write_result < 0) {
+ derr << "Error writing pointer object '" << object_id << "': " << cpp_strerror(write_result) << dendl;
+ }
+ return write_result;
+}
+
+
+/**
+ * Non-blocking variant of save() that assumes objecter lock already held by
+ * caller
+ *
+ * The write is issued with `completion` as its callback and this function
+ * returns without waiting. Unlike the blocking variant, there is no
+ * !is_null() assertion here.
+ */
+void JournalPointer::save(Objecter *objecter, Context *completion) const
+{
+ ceph_assert(objecter != NULL);
+
+ bufferlist data;
+ encode(data);
+
+ objecter->write_full(object_t(get_object_id()), object_locator_t(pool_id),
+ SnapContext(), data,
+ ceph::real_clock::now(), 0,
+ completion);
+}
+
diff --git a/src/mds/JournalPointer.h b/src/mds/JournalPointer.h
new file mode 100644
index 00000000..0f423266
--- /dev/null
+++ b/src/mds/JournalPointer.h
@@ -0,0 +1,87 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+
+#ifndef JOURNAL_POINTER_H
+#define JOURNAL_POINTER_H
+
+#include "include/encoding.h"
+#include "mdstypes.h"
+
+class Objecter;
+class Mutex;
+
+// This always lives in the same location for a given MDS
+// instance, it tells the daemon where to look for the journal.
+class JournalPointer {
+ // MDS rank
+ int node_id;
+ // Metadata pool ID
+ int64_t pool_id;
+
+ // Name of the object that stores this pointer (derived from node_id)
+ std::string get_object_id() const;
+
+ public:
+ // The currently active journal
+ inodeno_t front;
+ // The backup journal, if any (may be 0)
+ inodeno_t back;
+
+ // Only front and back are encoded; node_id and pool_id are not.
+ void encode(bufferlist &bl) const {
+ ENCODE_START(1, 1, bl);
+ encode(front, bl);
+ encode(back, bl);
+ ENCODE_FINISH(bl);
+ }
+
+ void decode(bufferlist::const_iterator &bl) {
+ DECODE_START(1, bl);
+ decode(front, bl);
+ decode(back, bl);
+ DECODE_FINISH(bl);
+ }
+
+ // Pointer for a given rank and metadata pool; starts out null (front == back == 0)
+ JournalPointer(int node_id_, int64_t pool_id_) : node_id(node_id_), pool_id(pool_id_),
+ front(0), back(0) {}
+
+ // Default instance: node_id and pool_id are -1, front and back are 0
+ JournalPointer() : node_id(-1), pool_id(-1), front(0), back(0) {}
+
+ int load(Objecter *objecter);
+ int save(Objecter *objecter) const;
+ void save(Objecter *objecter, Context *completion) const;
+
+ // A pointer is null when neither a front nor a back journal is set
+ bool is_null() const {
+ return front == 0 && back == 0;
+ }
+
+ void dump(Formatter *f) const {
+ f->open_object_section("journal_pointer");
+ {
+ f->dump_unsigned("front", front);
+ f->dump_unsigned("back", back);
+ }
+ f->close_section(); // journal_pointer
+ }
+
+ // Appends two heap-allocated instances: a default one, and one with
+ // front/back set to recognisable test values.
+ static void generate_test_instances(std::list<JournalPointer*> &ls)
+ {
+ ls.push_back(new JournalPointer());
+ ls.push_back(new JournalPointer());
+ ls.back()->front = 0xdeadbeef;
+ ls.back()->back = 0xfeedbead;
+ }
+};
+WRITE_CLASS_ENCODER(JournalPointer)
+
+#endif // JOURNAL_POINTER_H
diff --git a/src/mds/LocalLock.h b/src/mds/LocalLock.h
new file mode 100644
index 00000000..d405a6b3
--- /dev/null
+++ b/src/mds/LocalLock.h
@@ -0,0 +1,65 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+
+#ifndef CEPH_LOCALLOCK_H
+#define CEPH_LOCALLOCK_H
+
+#include "SimpleLock.h"
+
+// SimpleLock variant whose state is set to LOCK_LOCK at construction and
+// which remembers the client passed to the most recent get_wrlock() call.
+class LocalLock : public SimpleLock {
+public:
+ // client of the most recent get_wrlock(); reset to client_t() once no wrlocks remain
+ client_t last_wrlock_client;
+
+ LocalLock(MDSCacheObject *o, LockType *t) :
+ SimpleLock(o, t) {
+ set_state(LOCK_LOCK); // always.
+ }
+
+ bool is_locallock() const override {
+ return true;
+ }
+
+ // xlock is possible only when nothing is wrlocked and no mutation holds the xlock
+ bool can_xlock_local() const {
+ return !is_wrlocked() && (get_xlock_by() == MutationRef());
+ }
+
+ // wrlock is possible whenever the lock is not xlocked
+ bool can_wrlock() const {
+ return !is_xlocked();
+ }
+ // Take a wrlock (asserting can_wrlock()) and record the client.
+ void get_wrlock(client_t client) {
+ ceph_assert(can_wrlock());
+ SimpleLock::get_wrlock();
+ last_wrlock_client = client;
+ }
+ // Drop a wrlock; forget the recorded client once the count reaches zero.
+ void put_wrlock() {
+ SimpleLock::put_wrlock();
+ if (get_num_wrlocks() == 0)
+ last_wrlock_client = client_t();
+ }
+ client_t get_last_wrlock_client() const {
+ return last_wrlock_client;
+ }
+
+ // Print "(<base lock state>[ last_client=<client>])"; the client is shown
+ // only when last_wrlock_client >= 0.
+ void print(ostream& out) const override {
+ out << "(";
+ _print(out);
+ if (last_wrlock_client >= 0)
+ out << " last_client=" << last_wrlock_client;
+ out << ")";
+ }
+};
+
+
+#endif
diff --git a/src/mds/Locker.cc b/src/mds/Locker.cc
new file mode 100644
index 00000000..284cb254
--- /dev/null
+++ b/src/mds/Locker.cc
@@ -0,0 +1,5479 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#include <string_view>
+
+#include "MDSRank.h"
+#include "MDCache.h"
+#include "Locker.h"
+#include "MDBalancer.h"
+#include "Migrator.h"
+#include "CInode.h"
+#include "CDir.h"
+#include "CDentry.h"
+#include "Mutation.h"
+#include "MDSContext.h"
+
+#include "MDLog.h"
+#include "MDSMap.h"
+
+#include "events/EUpdate.h"
+#include "events/EOpen.h"
+
+#include "msg/Messenger.h"
+#include "osdc/Objecter.h"
+
+#include "messages/MInodeFileCaps.h"
+#include "messages/MLock.h"
+#include "messages/MClientLease.h"
+#include "messages/MClientReply.h"
+#include "messages/MClientCaps.h"
+#include "messages/MClientCapRelease.h"
+
+#include "messages/MMDSSlaveRequest.h"
+
+#include <errno.h>
+
+#include "common/config.h"
+
+
+#define dout_subsys ceph_subsys_mds
+#undef dout_prefix
+#define dout_context g_ceph_context
+#define dout_prefix _prefix(_dout, mds)
+// Log-line prefix used by dout_prefix in this file: "mds.<nodeid>.locker ".
+static ostream& _prefix(std::ostream *_dout, MDSRank *mds) {
+ return *_dout << "mds." << mds->get_nodeid() << ".locker ";
+}
+
+
+// MDSContext base for Locker callbacks; the MDSRank is obtained through the
+// owning Locker, which must be non-null.
+class LockerContext : public MDSContext {
+public:
+  explicit LockerContext(Locker *locker_) : locker(locker_) {
+    ceph_assert(locker != nullptr);
+  }
+
+protected:
+  Locker *locker;
+  MDSRank *get_mds() override
+  {
+    return locker->mds;
+  }
+};
+
+// MDSLogContextBase counterpart of LockerContext; the MDSRank is obtained
+// through the owning Locker, which must be non-null.
+class LockerLogContext : public MDSLogContextBase {
+public:
+  explicit LockerLogContext(Locker *locker_) : locker(locker_) {
+    ceph_assert(locker != nullptr);
+  }
+
+protected:
+  Locker *locker;
+  MDSRank *get_mds() override
+  {
+    return locker->mds;
+  }
+};
+
+// Store the rank and cache pointers; need_snapflush_inodes is set up with the
+// offset of CInode::item_caps.
+Locker::Locker(MDSRank *m, MDCache *c) :
+ mds(m), mdcache(c), need_snapflush_inodes(member_offset(CInode, item_caps)) {}
+
+
+// Route an incoming message to the matching handler based on its type.
+// An unrecognised type is logged and then aborts the process.
+void Locker::dispatch(const Message::const_ref &m)
+{
+
+ switch (m->get_type()) {
+ // inter-mds locking
+ case MSG_MDS_LOCK:
+ handle_lock(MLock::msgref_cast(m));
+ break;
+ // inter-mds caps
+ case MSG_MDS_INODEFILECAPS:
+ handle_inode_file_caps(MInodeFileCaps::msgref_cast(m));
+ break;
+ // client sync
+ case CEPH_MSG_CLIENT_CAPS:
+ handle_client_caps(MClientCaps::msgref_cast(m));
+ break;
+ case CEPH_MSG_CLIENT_CAPRELEASE:
+ handle_client_cap_release(MClientCapRelease::msgref_cast(m));
+ break;
+ case CEPH_MSG_CLIENT_LEASE:
+ handle_client_lease(MClientLease::msgref_cast(m));
+ break;
+ default:
+ derr << "locker unknown message " << m->get_type() << dendl;
+ ceph_abort_msg("locker unknown message");
+ }
+}
+
+// Periodic work: run scatter_tick() followed by caps_tick().
+void Locker::tick()
+{
+ scatter_tick();
+ caps_tick();
+}
+
+/*
+ * locks vs rejoin
+ *
+ *
+ *
+ */
+
+// Send an MLock carrying `msg` for `lock` to every replica of the lock's
+// parent object. While the cluster is degraded, replicas whose MDS state is
+// below STATE_REJOIN are skipped.
+void Locker::send_lock_message(SimpleLock *lock, int msg)
+{
+  const auto &replicas = lock->get_parent()->get_replicas();
+  for (const auto &replica : replicas) {
+    const auto peer = replica.first;
+    const bool skip_peer = mds->is_cluster_degraded() &&
+      mds->mdsmap->get_state(peer) < MDSMap::STATE_REJOIN;
+    if (!skip_peer) {
+      auto m = MLock::create(lock, msg, mds->get_nodeid());
+      mds->send_message_mds(m, peer);
+    }
+  }
+}
+
+// Same as the two-argument overload, but each MLock additionally carries
+// `data` as its payload.
+void Locker::send_lock_message(SimpleLock *lock, int msg, const bufferlist &data)
+{
+  const auto &replicas = lock->get_parent()->get_replicas();
+  for (const auto &replica : replicas) {
+    const auto peer = replica.first;
+    const bool skip_peer = mds->is_cluster_degraded() &&
+      mds->mdsmap->get_state(peer) < MDSMap::STATE_REJOIN;
+    if (!skip_peer) {
+      auto m = MLock::create(lock, msg, mds->get_nodeid());
+      m->set_data(data);
+      mds->send_message_mds(m, peer);
+    }
+  }
+}
+
+
+
+
+// Add rdlocks on the snaplock of every ancestor of `in` (walking projected
+// parent dentries upwards) and finally on the snaplock of `in` itself.
+void Locker::include_snap_rdlocks(CInode *in, MutationImpl::LockOpVec& lov)
+{
+  // rdlock ancestor snaps
+  for (CInode *cur = in; cur->get_projected_parent_dn(); ) {
+    cur = cur->get_projected_parent_dn()->get_dir()->get_inode();
+    lov.add_rdlock(&cur->snaplock);
+  }
+  lov.add_rdlock(&in->snaplock);
+}
+
+// Add rdlocks on the snaplock of `in` and of each ancestor, and on the
+// policylock of each inode from `in` upwards until the first one whose
+// projected inode has a layout. *layout is set to point at that layout;
+// if no inode on the path has one, *layout is left untouched.
+void Locker::include_snap_rdlocks_wlayout(CInode *in, MutationImpl::LockOpVec& lov,
+ file_layout_t **layout)
+{
+ //rdlock ancestor snaps
+ CInode *t = in;
+ // the loop below starts at t == in, so these two locks get added a second
+ // time by its first iteration
+ lov.add_rdlock(&in->snaplock);
+ lov.add_rdlock(&in->policylock);
+ bool found_layout = false;
+ while (t) {
+ lov.add_rdlock(&t->snaplock);
+ if (!found_layout) {
+ lov.add_rdlock(&t->policylock);
+ if (t->get_projected_inode()->has_layout()) {
+ *layout = &t->get_projected_inode()->layout;
+ found_layout = true;
+ }
+ }
+ // step to the parent directory's inode; stop when there is no parent dentry or dir
+ if (t->get_projected_parent_dn() &&
+ t->get_projected_parent_dn()->get_dir())
+ t = t->get_projected_parent_dn()->get_dir()->get_inode();
+ else t = NULL;
+ }
+}
+
+// Scope helper: on destruction, calls mdr->mark_event(message) unless
+// mark_event has been cleared. Both `message` and `mark_event` are public so
+// the owner can change the text or cancel the event before the scope ends.
+// `mdr` is held by reference and `message` is a non-owning view.
+struct MarkEventOnDestruct {
+ MDRequestRef& mdr;
+ std::string_view message;
+ bool mark_event;
+ MarkEventOnDestruct(MDRequestRef& _mdr, std::string_view _message) :
+ mdr(_mdr),
+ message(_message),
+ mark_event(true) {}
+ ~MarkEventOnDestruct() {
+ if (mark_event)
+ mdr->mark_event(message);
+ }
+};
+
+/* If this function returns false, the mdr has been placed
+ * on the appropriate wait list */
+bool Locker::acquire_locks(MDRequestRef& mdr,
+ MutationImpl::LockOpVec& lov,
+ CInode *auth_pin_freeze,
+ bool auth_pin_nonblock)
+{
+ if (mdr->done_locking &&
+ !mdr->is_slave()) { // not on slaves! master requests locks piecemeal.
+ dout(10) << "acquire_locks " << *mdr << " - done locking" << dendl;
+ return true; // at least we had better be!
+ }
+ dout(10) << "acquire_locks " << *mdr << dendl;
+
+ MarkEventOnDestruct marker(mdr, "failed to acquire_locks");
+
+ client_t client = mdr->get_client();
+
+ set<MDSCacheObject*> mustpin; // items to authpin
+
+ // xlocks
+ for (int i = 0, size = lov.size(); i < size; ++i) {
+ auto& p = lov[i];
+ SimpleLock *lock = p.lock;
+ MDSCacheObject *object = lock->get_parent();
+
+ if (p.is_xlock()) {
+ if ((lock->get_type() == CEPH_LOCK_ISNAP ||
+ lock->get_type() == CEPH_LOCK_IPOLICY) &&
+ mds->is_cluster_degraded() &&
+ mdr->is_master() &&
+ !mdr->is_queued_for_replay()) {
+ // waiting for recovering mds, to guarantee replayed requests and mksnap/setlayout
+ // get processed in proper order.
+ bool wait = false;
+ if (object->is_auth()) {
+ if (!mdr->locks.count(lock)) {
+ set<mds_rank_t> ls;
+ object->list_replicas(ls);
+ for (auto m : ls) {
+ if (mds->mdsmap->get_state(m) < MDSMap::STATE_ACTIVE) {
+ wait = true;
+ break;
+ }
+ }
+ }
+ } else {
+ // if the lock is the latest locked one, it's possible that slave mds got the lock
+ // while there are recovering mds.
+ if (!mdr->locks.count(lock) || lock == *mdr->locks.rbegin())
+ wait = true;
+ }
+ if (wait) {
+ dout(10) << " must xlock " << *lock << " " << *object
+ << ", waiting for cluster recovered" << dendl;
+ mds->locker->drop_locks(mdr.get(), NULL);
+ mdr->drop_local_auth_pins();
+ mds->wait_for_cluster_recovered(new C_MDS_RetryRequest(mdcache, mdr));
+ return false;
+ }
+ }
+
+ dout(20) << " must xlock " << *lock << " " << *object << dendl;
+
+ mustpin.insert(object);
+
+ // augment xlock with a versionlock?
+ if (lock->get_type() == CEPH_LOCK_DN) {
+ CDentry *dn = static_cast<CDentry*>(object);
+ if (!dn->is_auth())
+ continue;
+ if (mdr->is_master()) {
+ // master. wrlock versionlock so we can pipeline dentry updates to journal.
+ lov.add_wrlock(&dn->versionlock);
+ } else {
+ // slave. exclusively lock the dentry version (i.e. block other journal updates).
+ // this makes rollback safe.
+ lov.add_xlock(&dn->versionlock);
+ }
+ }
+ if (lock->get_type() > CEPH_LOCK_IVERSION) {
+ // inode version lock?
+ CInode *in = static_cast<CInode*>(object);
+ if (!in->is_auth())
+ continue;
+ if (mdr->is_master()) {
+ // master. wrlock versionlock so we can pipeline inode updates to journal.
+ lov.add_wrlock(&in->versionlock);
+ } else {
+ // slave. exclusively lock the inode version (i.e. block other journal updates).
+ // this makes rollback safe.
+ lov.add_xlock(&in->versionlock);
+ }
+ }
+ } else if (p.is_wrlock()) {
+ dout(20) << " must wrlock " << *lock << " " << *object << dendl;
+ if (object->is_auth()) {
+ mustpin.insert(object);
+ } else if (!object->is_auth() &&
+ !lock->can_wrlock(client) && // we might have to request a scatter
+ !mdr->is_slave()) { // if we are slave (remote_wrlock), the master already authpinned
+ dout(15) << " will also auth_pin " << *object
+ << " in case we need to request a scatter" << dendl;
+ mustpin.insert(object);
+ }
+ } else if (p.is_remote_wrlock()) {
+ dout(20) << " must remote_wrlock on mds." << p.wrlock_target << " "
+ << *lock << " " << *object << dendl;
+ mustpin.insert(object);
+ } else if (p.is_rdlock()) {
+
+ dout(20) << " must rdlock " << *lock << " " << *object << dendl;
+ if (object->is_auth()) {
+ mustpin.insert(object);
+ } else if (!object->is_auth() &&
+ !lock->can_rdlock(client)) { // we might have to request an rdlock
+ dout(15) << " will also auth_pin " << *object
+ << " in case we need to request a rdlock" << dendl;
+ mustpin.insert(object);
+ }
+ } else {
+ ceph_assert(0 == "locker unknown lock operation");
+ }
+ }
+
+ lov.sort_and_merge();
+
+ // AUTH PINS
+ map<mds_rank_t, set<MDSCacheObject*> > mustpin_remote; // mds -> (object set)
+
+ // can i auth pin them all now?
+ marker.message = "failed to authpin local pins";
+ for (const auto &p : mustpin) {
+ MDSCacheObject *object = p;
+
+ dout(10) << " must authpin " << *object << dendl;
+
+ if (mdr->is_auth_pinned(object)) {
+ if (object != (MDSCacheObject*)auth_pin_freeze)
+ continue;
+ if (mdr->more()->is_remote_frozen_authpin) {
+ if (mdr->more()->rename_inode == auth_pin_freeze)
+ continue;
+ // unfreeze auth pin for the wrong inode
+ mustpin_remote[mdr->more()->rename_inode->authority().first].size();
+ }
+ }
+
+ if (!object->is_auth()) {
+ if (!mdr->locks.empty())
+ drop_locks(mdr.get());
+ if (object->is_ambiguous_auth()) {
+ // wait
+ marker.message = "waiting for single auth, object is being migrated";
+ dout(10) << " ambiguous auth, waiting to authpin " << *object << dendl;
+ object->add_waiter(MDSCacheObject::WAIT_SINGLEAUTH, new C_MDS_RetryRequest(mdcache, mdr));
+ mdr->drop_local_auth_pins();
+ return false;
+ }
+ mustpin_remote[object->authority().first].insert(object);
+ continue;
+ }
+ int err = 0;
+ if (!object->can_auth_pin(&err)) {
+ // wait
+ drop_locks(mdr.get());
+ mdr->drop_local_auth_pins();
+ if (auth_pin_nonblock) {
+ dout(10) << " can't auth_pin (freezing?) " << *object << ", nonblocking" << dendl;
+ mdr->aborted = true;
+ return false;
+ }
+ if (err == MDSCacheObject::ERR_EXPORTING_TREE) {
+ marker.message = "failed to authpin, subtree is being exported";
+ } else if (err == MDSCacheObject::ERR_FRAGMENTING_DIR) {
+ marker.message = "failed to authpin, dir is being fragmented";
+ } else if (err == MDSCacheObject::ERR_EXPORTING_INODE) {
+ marker.message = "failed to authpin, inode is being exported";
+ }
+ dout(10) << " can't auth_pin (freezing?), waiting to authpin " << *object << dendl;
+ object->add_waiter(MDSCacheObject::WAIT_UNFREEZE, new C_MDS_RetryRequest(mdcache, mdr));
+
+ if (!mdr->remote_auth_pins.empty())
+ notify_freeze_waiter(object);
+
+ return false;
+ }
+ }
+
+ // ok, grab local auth pins
+ for (const auto& p : mustpin) {
+ MDSCacheObject *object = p;
+ if (mdr->is_auth_pinned(object)) {
+ dout(10) << " already auth_pinned " << *object << dendl;
+ } else if (object->is_auth()) {
+ dout(10) << " auth_pinning " << *object << dendl;
+ mdr->auth_pin(object);
+ }
+ }
+
+ // request remote auth_pins
+ if (!mustpin_remote.empty()) {
+ marker.message = "requesting remote authpins";
+ for (const auto& p : mdr->remote_auth_pins) {
+ if (mustpin.count(p.first)) {
+ ceph_assert(p.second == p.first->authority().first);
+ map<mds_rank_t, set<MDSCacheObject*> >::iterator q = mustpin_remote.find(p.second);
+ if (q != mustpin_remote.end())
+ q->second.insert(p.first);
+ }
+ }
+ for (map<mds_rank_t, set<MDSCacheObject*> >::iterator p = mustpin_remote.begin();
+ p != mustpin_remote.end();
+ ++p) {
+ dout(10) << "requesting remote auth_pins from mds." << p->first << dendl;
+
+ // wait for active auth
+ if (mds->is_cluster_degraded() &&
+ !mds->mdsmap->is_clientreplay_or_active_or_stopping(p->first)) {
+ dout(10) << " mds." << p->first << " is not active" << dendl;
+ if (mdr->more()->waiting_on_slave.empty())
+ mds->wait_for_active_peer(p->first, new C_MDS_RetryRequest(mdcache, mdr));
+ return false;
+ }
+
+ auto req = MMDSSlaveRequest::create(mdr->reqid, mdr->attempt, MMDSSlaveRequest::OP_AUTHPIN);
+ for (set<MDSCacheObject*>::iterator q = p->second.begin();
+ q != p->second.end();
+ ++q) {
+ dout(10) << " req remote auth_pin of " << **q << dendl;
+ MDSCacheObjectInfo info;
+ (*q)->set_object_info(info);
+ req->get_authpins().push_back(info);
+ if (*q == auth_pin_freeze)
+ (*q)->set_object_info(req->get_authpin_freeze());
+ mdr->pin(*q);
+ }
+ if (auth_pin_nonblock)
+ req->mark_nonblock();
+ mds->send_message_mds(req, p->first);
+
+ // put in waiting list
+ ceph_assert(mdr->more()->waiting_on_slave.count(p->first) == 0);
+ mdr->more()->waiting_on_slave.insert(p->first);
+ }
+ return false;
+ }
+
+ // caps i'll need to issue
+ set<CInode*> issue_set;
+ bool result = false;
+
+ // acquire locks.
+ // make sure they match currently acquired locks.
+ auto existing = mdr->locks.begin();
+ for (const auto& p : lov) {
+ bool need_wrlock = p.is_wrlock();
+ bool need_remote_wrlock = p.is_remote_wrlock();
+
+ // already locked?
+ if (existing != mdr->locks.end() && existing->lock == p.lock) {
+ // right kind?
+ auto it = existing++;
+ auto have = *it; // don't reference
+
+ if (have.is_xlock() && p.is_xlock()) {
+ dout(10) << " already xlocked " << *have.lock << " " << *have.lock->get_parent() << dendl;
+ continue;
+ }
+
+ if (have.is_remote_wrlock() &&
+ (!need_remote_wrlock || have.wrlock_target != p.wrlock_target)) {
+ dout(10) << " unlocking remote_wrlock on wrong mds." << have.wrlock_target
+ << " " << *have.lock << " " << *have.lock->get_parent() << dendl;
+ remote_wrlock_finish(it, mdr.get());
+ have.clear_remote_wrlock();
+ }
+
+ if (need_wrlock || need_remote_wrlock) {
+ if (need_wrlock == have.is_wrlock() &&
+ need_remote_wrlock == have.is_remote_wrlock()) {
+ if (need_wrlock)
+ dout(10) << " already wrlocked " << *have.lock << " " << *have.lock->get_parent() << dendl;
+ if (need_remote_wrlock)
+ dout(10) << " already remote_wrlocked " << *have.lock << " " << *have.lock->get_parent() << dendl;
+ continue;
+ }
+
+ if (have.is_wrlock()) {
+ if (!need_wrlock)
+ dout(10) << " unlocking extra " << *have.lock << " " << *have.lock->get_parent() << dendl;
+ else if (need_remote_wrlock) // acquire remote_wrlock first
+ dout(10) << " unlocking out-of-order " << *have.lock << " " << *have.lock->get_parent() << dendl;
+ bool need_issue = false;
+ wrlock_finish(it, mdr.get(), &need_issue);
+ if (need_issue)
+ issue_set.insert(static_cast<CInode*>(have.lock->get_parent()));
+ }
+ } else if (have.is_rdlock() && p.is_rdlock()) {
+ dout(10) << " already rdlocked " << *have.lock << " " << *have.lock->get_parent() << dendl;
+ continue;
+ }
+ }
+
+ // hose any stray locks
+ while (existing != mdr->locks.end()) {
+ auto it = existing++;
+ auto stray = *it; // don't reference
+ dout(10) << " unlocking out-of-order " << *stray.lock << " " << *stray.lock->get_parent() << dendl;
+ bool need_issue = false;
+ if (stray.is_xlock()) {
+ xlock_finish(it, mdr.get(), &need_issue);
+ } else if (stray.is_rdlock()) {
+ rdlock_finish(it, mdr.get(), &need_issue);
+ } else {
+ // may have acquired both wrlock and remore wrlock
+ if (stray.is_wrlock())
+ wrlock_finish(it, mdr.get(), &need_issue);
+ if (stray.is_remote_wrlock())
+ remote_wrlock_finish(it, mdr.get());
+ }
+ if (need_issue)
+ issue_set.insert(static_cast<CInode*>(stray.lock->get_parent()));
+ }
+
+ // lock
+ if (mdr->locking && p.lock != mdr->locking) {
+ cancel_locking(mdr.get(), &issue_set);
+ }
+ if (p.is_xlock()) {
+ marker.message = "failed to xlock, waiting";
+ if (!xlock_start(p.lock, mdr))
+ goto out;
+ dout(10) << " got xlock on " << *p.lock << " " << *p.lock->get_parent() << dendl;
+ } else if (need_wrlock || need_remote_wrlock) {
+ if (need_remote_wrlock && !mdr->is_remote_wrlocked(p)) {
+ marker.message = "waiting for remote wrlocks";
+ remote_wrlock_start(p, p.wrlock_target, mdr);
+ goto out;
+ }
+ if (need_wrlock) {
+ marker.message = "failed to wrlock, waiting";
+ if (need_remote_wrlock && !p.lock->can_wrlock(mdr->get_client())) {
+ marker.message = "failed to wrlock, dropping remote wrlock and waiting";
+ // can't take the wrlock because the scatter lock is gathering. need to
+ // release the remote wrlock, so that the gathering process can finish.
+ auto it = mdr->locks.end();
+ ++it;
+ remote_wrlock_finish(it, mdr.get());
+ remote_wrlock_start(p, p.wrlock_target, mdr);
+ goto out;
+ }
+ // nowait if we have already gotten remote wrlock
+ if (!wrlock_start(p, mdr, need_remote_wrlock))
+ goto out;
+ dout(10) << " got wrlock on " << *p.lock << " " << *p.lock->get_parent() << dendl;
+ }
+ } else {
+ ceph_assert(mdr->is_master());
+ if (p.lock->needs_recover()) {
+ if (mds->is_cluster_degraded()) {
+ if (!mdr->is_queued_for_replay()) {
+ // see comments in SimpleLock::set_state_rejoin() and
+ // ScatterLock::encode_state_for_rejoin()
+ drop_locks(mdr.get());
+ mds->wait_for_cluster_recovered(new C_MDS_RetryRequest(mdcache, mdr));
+ dout(10) << " rejoin recovering " << *p.lock << " " << *p.lock->get_parent()
+ << ", waiting for cluster recovered" << dendl;
+ marker.message = "rejoin recovering lock, waiting for cluster recovered";
+ return false;
+ }
+ } else {
+ p.lock->clear_need_recover();
+ }
+ }
+
+ marker.message = "failed to rdlock, waiting";
+ if (!rdlock_start(p, mdr))
+ goto out;
+ dout(10) << " got rdlock on " << *p.lock << " " << *p.lock->get_parent() << dendl;
+ }
+ }
+
+ // any extra unneeded locks?
+ while (existing != mdr->locks.end()) {
+ auto it = existing++;
+ auto stray = *it;
+ dout(10) << " unlocking extra " << *stray.lock << " " << *stray.lock->get_parent() << dendl;
+ bool need_issue = false;
+ if (stray.is_xlock()) {
+ xlock_finish(it, mdr.get(), &need_issue);
+ } else if (stray.is_rdlock()) {
+ rdlock_finish(it, mdr.get(), &need_issue);
+ } else {
+ // may have acquired both wrlock and remore wrlock
+ if (stray.is_wrlock())
+ wrlock_finish(it, mdr.get(), &need_issue);
+ if (stray.is_remote_wrlock())
+ remote_wrlock_finish(it, mdr.get());
+ }
+ if (need_issue)
+ issue_set.insert(static_cast<CInode*>(stray.lock->get_parent()));
+ }
+
+ mdr->done_locking = true;
+ mdr->set_mds_stamp(ceph_clock_now());
+ result = true;
+ marker.message = "acquired locks";
+
+ out:
+ issue_caps_set(issue_set);
+ return result;
+}
+
+ // Bump the freeze-waiter counters for the dirfrag associated with `o`.
+ // `o` may be an inode (its parent dirfrag is used, unless it is the root
+ // inode), a dentry (its dirfrag is used) or a dirfrag itself.
+ void Locker::notify_freeze_waiter(MDSCacheObject *o)
+ {
+   CDir *target = nullptr;
+   CInode *inode = dynamic_cast<CInode*>(o);
+   if (inode) {
+     if (!inode->is_root())
+       target = inode->get_parent_dir();
+   } else {
+     CDentry *dentry = dynamic_cast<CDentry*>(o);
+     if (dentry) {
+       target = dentry->get_dir();
+     } else {
+       // neither inode nor dentry: it has to be a dirfrag
+       target = dynamic_cast<CDir*>(o);
+       ceph_assert(target);
+     }
+   }
+   if (!target)
+     return;
+
+   if (target->is_freezing_dir())
+     mdcache->fragment_freeze_inc_num_waiters(target);
+   if (!target->is_freezing_tree())
+     return;
+
+   // walk up to the dirfrag that is the root of the freezing tree
+   CDir *root = target;
+   while (!root->is_freezing_tree_root())
+     root = root->get_parent_dir();
+   mdcache->migrator->export_freeze_inc_num_waiters(root);
+ }
+
+ // Call set_xlock_done() on every xlock held by `mut`. The parent object of
+ // each xlock must be auth (asserted). When skip_dentry is set, locks of type
+ // CEPH_LOCK_DN and CEPH_LOCK_DVERSION are left untouched.
+ void Locker::set_xlocks_done(MutationImpl *mut, bool skip_dentry)
+ {
+   for (const auto &op : mut->locks) {
+     if (op.is_xlock()) {
+       MDSCacheObject *parent = op.lock->get_parent();
+       ceph_assert(parent->is_auth());
+       const bool skipped = skip_dentry &&
+         (op.lock->get_type() == CEPH_LOCK_DN ||
+          op.lock->get_type() == CEPH_LOCK_DVERSION);
+       if (!skipped) {
+         dout(10) << "set_xlocks_done on " << *op.lock << " " << *parent << dendl;
+         op.lock->set_xlock_done();
+       }
+     }
+   }
+ }
+
+ // Release locks held by `mut`: every xlock, wrlock and remote wrlock, plus
+ // the rdlocks when drop_rdlocks is set. Inodes reported as needing a cap
+ // issue by the *_finish helpers are added to *pneed_issue. Each rank that
+ // held a remote xlock/wrlock for us is sent OP_DROPLOCKS at the end.
+ void Locker::_drop_locks(MutationImpl *mut, set<CInode*> *pneed_issue,
+                          bool drop_rdlocks)
+ {
+   set<mds_rank_t> peers;
+
+   auto it = mut->locks.begin();
+   while (it != mut->locks.end()) {
+     SimpleLock *lock = it->lock;
+     MDSCacheObject *parent = lock->get_parent();
+     bool issue = false;
+
+     // NB: the iterator is always advanced (it++) before the entry it points
+     // at can be erased by the callee.
+     if (it->is_xlock()) {
+       if (!parent->is_auth()) {
+         // xlock on a non-auth object: drop our ref and notify the auth below
+         ceph_assert(lock->get_sm()->can_remote_xlock);
+         peers.insert(parent->authority().first);
+         lock->put_xlock();
+         mut->locks.erase(it++);
+         continue;
+       }
+       xlock_finish(it++, mut, &issue);
+     } else if (it->is_wrlock() || it->is_remote_wrlock()) {
+       if (it->is_remote_wrlock()) {
+         peers.insert(it->wrlock_target);
+         it->clear_remote_wrlock();
+       }
+       if (!it->is_wrlock()) {
+         // it was only a remote wrlock; nothing local left to finish
+         mut->locks.erase(it++);
+         continue;
+       }
+       wrlock_finish(it++, mut, &issue);
+     } else if (drop_rdlocks && it->is_rdlock()) {
+       rdlock_finish(it++, mut, &issue);
+     } else {
+       ++it;
+       continue;
+     }
+
+     if (issue)
+       pneed_issue->insert(static_cast<CInode*>(parent));
+   }
+
+   for (const auto& rank : peers) {
+     // while the cluster is degraded, skip ranks that have not reached rejoin
+     if (mds->is_cluster_degraded() &&
+         mds->mdsmap->get_state(rank) < MDSMap::STATE_REJOIN)
+       continue;
+     dout(10) << "_drop_non_rdlocks dropping remote locks on mds." << rank << dendl;
+     auto slavereq = MMDSSlaveRequest::create(mut->reqid, mut->attempt, MMDSSlaveRequest::OP_DROPLOCKS);
+     mds->send_message_mds(slavereq, rank);
+   }
+ }
+
+ // Abandon the lock acquisition that `mut` has in flight (mut->locking, which
+ // must be set). On the auth, a lock in LOCK_PREXLOCK or LOCK_LOCK_XLOCK is
+ // rolled forward; the parent inode is added to *pneed_issue if that reports
+ // a cap issue is needed.
+ void Locker::cancel_locking(MutationImpl *mut, set<CInode*> *pneed_issue)
+ {
+   SimpleLock *pending = mut->locking;
+   ceph_assert(pending);
+   dout(10) << "cancel_locking " << *pending << " on " << *mut << dendl;
+
+   if (pending->get_parent()->is_auth()) {
+     bool issue = false;
+     switch (pending->get_state()) {
+     case LOCK_PREXLOCK:
+       _finish_xlock(pending, -1, &issue);
+       break;
+     case LOCK_LOCK_XLOCK:
+       pending->set_state(LOCK_XLOCKDONE);
+       eval_gather(pending, true, &issue);
+       break;
+     default:
+       break;
+     }
+     if (issue)
+       pneed_issue->insert(static_cast<CInode *>(pending->get_parent()));
+   }
+   mut->finish_locking(pending);
+ }
+
+ // Drop every lock held by `mut` (rdlocks included), cancelling an in-flight
+ // acquisition first, and clear mut->done_locking.
+ // If pneed_issue is null, caps are issued here for the affected inodes;
+ // otherwise the inodes are reported through *pneed_issue and the caller
+ // is left to issue them.
+ void Locker::drop_locks(MutationImpl *mut, set<CInode*> *pneed_issue)
+ {
+   set<CInode*> local_issue;
+   const bool issue_here = !pneed_issue;
+   set<CInode*> *issue = issue_here ? &local_issue : pneed_issue;
+
+   if (mut->locking)
+     cancel_locking(mut, issue);
+   _drop_locks(mut, issue, true);
+
+   if (issue_here)
+     issue_caps_set(*issue);
+   mut->done_locking = false;
+ }
+
+ // Drop the xlocks, wrlocks and remote wrlocks held by `mut`, keeping rdlocks.
+ // If pneed_issue is null, caps are issued here for the affected inodes;
+ // otherwise the inodes are reported through *pneed_issue.
+ void Locker::drop_non_rdlocks(MutationImpl *mut, set<CInode*> *pneed_issue)
+ {
+   set<CInode*> local_issue;
+   const bool issue_here = !pneed_issue;
+   set<CInode*> *issue = issue_here ? &local_issue : pneed_issue;
+
+   _drop_locks(mut, issue, false);
+
+   if (issue_here)
+     issue_caps_set(*issue);
+ }
+
+ // Release the rdlocks held by `mut`, except those on CEPH_LOCK_ISNAP and
+ // CEPH_LOCK_IPOLICY, then issue caps for every inode that needs it.
+ void Locker::drop_rdlocks_for_early_reply(MutationImpl *mut)
+ {
+   set<CInode*> issue;
+
+   auto it = mut->locks.begin();
+   while (it != mut->locks.end()) {
+     // advance first: rdlock_finish() erases the entry it is handed
+     const auto cur = it++;
+     if (!cur->is_rdlock())
+       continue;
+
+     SimpleLock *lock = cur->lock;
+     // keep snap/policy rdlocks so that a later mksnap/setlayout (at another
+     // mds) has to wait for this unsafe request
+     const auto type = lock->get_type();
+     if (type == CEPH_LOCK_ISNAP || type == CEPH_LOCK_IPOLICY)
+       continue;
+
+     bool ni = false;
+     rdlock_finish(cur, mut, &ni);
+     if (ni)
+       issue.insert(static_cast<CInode*>(lock->get_parent()));
+   }
+
+   issue_caps_set(issue);
+ }
+
+ // Finish (as wrlocks) every lock held by `mut` other than CEPH_LOCK_IDFT
+ // ones, then issue caps for every inode that needs it.
+ // NOTE(review): wrlock_finish() is applied to all non-IDFT entries, so the
+ // caller presumably only holds wrlocks here -- confirm against callers.
+ void Locker::drop_locks_for_fragment_unfreeze(MutationImpl *mut)
+ {
+   set<CInode*> issue;
+
+   auto it = mut->locks.begin();
+   while (it != mut->locks.end()) {
+     // advance first: the entry may be erased by wrlock_finish()
+     const auto cur = it++;
+     SimpleLock *lock = cur->lock;
+     if (lock->get_type() == CEPH_LOCK_IDFT)
+       continue;
+
+     bool ni = false;
+     wrlock_finish(cur, mut, &ni);
+     if (ni)
+       issue.insert(static_cast<CInode*>(lock->get_parent()));
+   }
+   issue_caps_set(issue);
+ }
+
+// generics
+
+ void Locker::eval_gather(SimpleLock *lock, bool first, bool *pneed_issue, MDSContext::vec *pfinishers)
+ { /*
+    * Try to complete the pending state transition of an unstable lock.
+    *
+    * lock        - lock to examine; must not be stable (asserted below).
+    * first       - when set, caps that are issued but not allowed by the next
+    *               state mark the inode as needing a cap issue, before the
+    *               gather conditions are evaluated.
+    * pneed_issue - if non-null, set to true when caps need issuing; if null,
+    *               issue_caps() is called here (head inodes only).
+    * pfinishers  - if non-null, woken waiters are moved into it instead of
+    *               being finished immediately.
+    *
+    * When the gather conditions hold, a replica acks to the auth and the auth
+    * notifies replicas as required by the current state; the lock is then
+    * moved to the next state and its waiters are woken.
+    */
+ dout(10) << "eval_gather " << *lock << " on " << *lock->get_parent() << dendl;
+ ceph_assert(!lock->is_stable());
+
+ int next = lock->get_next_state(); // target state; overridden below for some replica transitions
+
+ CInode *in = 0;
+ bool caps = lock->get_cap_shift(); // true when the lock has a non-zero cap shift
+ if (lock->get_type() != CEPH_LOCK_DN) // parent is treated as an inode for every type but CEPH_LOCK_DN
+ in = static_cast<CInode *>(lock->get_parent());
+
+ bool need_issue = false;
+
+ int loner_issued = 0, other_issued = 0, xlocker_issued = 0;
+ ceph_assert(!caps || in != NULL);
+ if (caps && in->is_head()) {
+ in->get_caps_issued(&loner_issued, &other_issued, &xlocker_issued,
+ lock->get_cap_shift(), lock->get_cap_mask());
+ dout(10) << " next state is " << lock->get_state_name(next)
+ << " issued/allows loner " << gcap_string(loner_issued)
+ << "/" << gcap_string(lock->gcaps_allowed(CAP_LONER, next))
+ << " xlocker " << gcap_string(xlocker_issued)
+ << "/" << gcap_string(lock->gcaps_allowed(CAP_XLOCKER, next))
+ << " other " << gcap_string(other_issued)
+ << "/" << gcap_string(lock->gcaps_allowed(CAP_ANY, next))
+ << dendl;
+
+ if (first && ((~lock->gcaps_allowed(CAP_ANY, next) & other_issued) ||
+ (~lock->gcaps_allowed(CAP_LONER, next) & loner_issued) ||
+ (~lock->gcaps_allowed(CAP_XLOCKER, next) & xlocker_issued)))
+ need_issue = true; // some issued caps are not allowed in the next state
+ }
+
+ #define IS_TRUE_AND_LT_AUTH(x, auth) (x && ((auth && x <= AUTH) || (!auth && x < AUTH)))
+ bool auth = lock->get_parent()->is_auth(); // macro above: x is non-zero and x <= AUTH on the auth copy, x < AUTH on a replica
+ if (!lock->is_gathering() &&
+ (IS_TRUE_AND_LT_AUTH(lock->get_sm()->states[next].can_rdlock, auth) || !lock->is_rdlocked()) &&
+ (IS_TRUE_AND_LT_AUTH(lock->get_sm()->states[next].can_wrlock, auth) || !lock->is_wrlocked()) &&
+ (IS_TRUE_AND_LT_AUTH(lock->get_sm()->states[next].can_xlock, auth) || !lock->is_xlocked()) &&
+ (IS_TRUE_AND_LT_AUTH(lock->get_sm()->states[next].can_lease, auth) || !lock->is_leased()) &&
+ !(lock->get_parent()->is_auth() && lock->is_flushing()) && // i.e. wait for scatter_writebehind!
+ (!caps || ((~lock->gcaps_allowed(CAP_ANY, next) & other_issued) == 0 &&
+ (~lock->gcaps_allowed(CAP_LONER, next) & loner_issued) == 0 &&
+ (~lock->gcaps_allowed(CAP_XLOCKER, next) & xlocker_issued) == 0)) &&
+ lock->get_state() != LOCK_SYNC_MIX2 && // these states need an explicit trigger from the auth mds
+ lock->get_state() != LOCK_MIX_SYNC2
+ ) {
+ dout(7) << "eval_gather finished gather on " << *lock
+ << " on " << *lock->get_parent() << dendl;
+
+ if (lock->get_sm() == &sm_filelock) {
+ ceph_assert(in);
+ if (in->state_test(CInode::STATE_RECOVERING)) {
+ dout(7) << "eval_gather finished gather, but still recovering" << dendl;
+ return; // state is left unchanged while the file is recovering
+ } else if (in->state_test(CInode::STATE_NEEDSRECOVER)) {
+ dout(7) << "eval_gather finished gather, but need to recover" << dendl;
+ mds->mdcache->queue_file_recover(in);
+ mds->mdcache->do_file_recover();
+ return;
+ }
+ }
+
+ if (!lock->get_parent()->is_auth()) {
+ // replica: tell auth
+ mds_rank_t auth = lock->get_parent()->authority().first; // NB: shadows the bool 'auth' above
+
+ if (lock->get_parent()->is_rejoining() &&
+ mds->mdsmap->get_state(auth) == MDSMap::STATE_REJOIN) {
+ dout(7) << "eval_gather finished gather, but still rejoining "
+ << *lock->get_parent() << dendl;
+ return;
+ }
+
+ if (!mds->is_cluster_degraded() ||
+ mds->mdsmap->get_state(auth) >= MDSMap::STATE_REJOIN) {
+ switch (lock->get_state()) {
+ case LOCK_SYNC_LOCK:
+ mds->send_message_mds(MLock::create(lock, LOCK_AC_LOCKACK, mds->get_nodeid()), auth);
+ break;
+
+ case LOCK_MIX_SYNC:
+ {
+ auto reply = MLock::create(lock, LOCK_AC_SYNCACK, mds->get_nodeid());
+ lock->encode_locked_state(reply->get_data());
+ mds->send_message_mds(reply, auth);
+ next = LOCK_MIX_SYNC2; // move to MIX_SYNC2 instead of the lock's own next state
+ (static_cast<ScatterLock *>(lock))->start_flush();
+ }
+ break;
+
+ case LOCK_MIX_SYNC2:
+ (static_cast<ScatterLock *>(lock))->finish_flush();
+ (static_cast<ScatterLock *>(lock))->clear_flushed(); // fall-thru (no break): continues into LOCK_SYNC_MIX2
+
+ case LOCK_SYNC_MIX2:
+ // do nothing, we already acked
+ break;
+
+ case LOCK_SYNC_MIX:
+ {
+ auto reply = MLock::create(lock, LOCK_AC_MIXACK, mds->get_nodeid());
+ mds->send_message_mds(reply, auth);
+ next = LOCK_SYNC_MIX2; // move to SYNC_MIX2 instead of the lock's own next state
+ }
+ break;
+
+ case LOCK_MIX_LOCK:
+ {
+ bufferlist data;
+ lock->encode_locked_state(data);
+ mds->send_message_mds(MLock::create(lock, LOCK_AC_LOCKACK, mds->get_nodeid(), data), auth);
+ (static_cast<ScatterLock *>(lock))->start_flush();
+ // we'll get an AC_LOCKFLUSHED to complete
+ }
+ break;
+
+ default:
+ ceph_abort();
+ }
+ }
+ } else {
+ // auth
+
+ // once the first (local) stage of mix->lock gather complete we can
+ // gather from replicas
+ if (lock->get_state() == LOCK_MIX_LOCK &&
+ lock->get_parent()->is_replicated()) {
+ dout(10) << " finished (local) gather for mix->lock, now gathering from replicas" << dendl;
+ send_lock_message(lock, LOCK_AC_LOCK);
+ lock->init_gather();
+ lock->set_state(LOCK_MIX_LOCK2);
+ return; // a second gather was started; no waiters are woken on this pass
+ }
+
+ if (lock->is_dirty() && !lock->is_flushed()) {
+ scatter_writebehind(static_cast<ScatterLock *>(lock));
+ mds->mdlog->flush();
+ return; // returns without changing state after starting the writebehind
+ }
+ lock->clear_flushed();
+
+ switch (lock->get_state()) {
+ // to mixed
+ case LOCK_TSYN_MIX:
+ case LOCK_SYNC_MIX:
+ case LOCK_EXCL_MIX:
+ case LOCK_XSYN_MIX:
+ in->start_scatter(static_cast<ScatterLock *>(lock));
+ if (lock->get_parent()->is_replicated()) {
+ bufferlist softdata;
+ lock->encode_locked_state(softdata);
+ send_lock_message(lock, LOCK_AC_MIX, softdata);
+ }
+ (static_cast<ScatterLock *>(lock))->clear_scatter_wanted();
+ break;
+
+ case LOCK_XLOCK:
+ case LOCK_XLOCKDONE:
+ if (next != LOCK_SYNC)
+ break;
+ // fall-thru
+
+ // to sync
+ case LOCK_EXCL_SYNC:
+ case LOCK_LOCK_SYNC:
+ case LOCK_MIX_SYNC:
+ case LOCK_XSYN_SYNC:
+ if (lock->get_parent()->is_replicated()) {
+ bufferlist softdata;
+ lock->encode_locked_state(softdata);
+ send_lock_message(lock, LOCK_AC_SYNC, softdata);
+ }
+ break;
+ }
+
+ }
+
+ lock->set_state(next); // commit the transition (possibly to an intermediate *2 state chosen above)
+
+ if (lock->get_parent()->is_auth() &&
+ lock->is_stable())
+ lock->get_parent()->auth_unpin(lock);
+
+ // drop loner before doing waiters
+ if (caps &&
+ in->is_head() &&
+ in->is_auth() &&
+ in->get_wanted_loner() != in->get_loner()) {
+ dout(10) << " trying to drop loner" << dendl;
+ if (in->try_drop_loner()) {
+ dout(10) << " dropped loner" << dendl;
+ need_issue = true;
+ }
+ }
+
+ if (pfinishers) // hand the waiters to the caller rather than finishing them here
+ lock->take_waiting(SimpleLock::WAIT_STABLE|SimpleLock::WAIT_WR|SimpleLock::WAIT_RD|SimpleLock::WAIT_XLOCK,
+ *pfinishers);
+ else
+ lock->finish_waiters(SimpleLock::WAIT_STABLE|SimpleLock::WAIT_WR|SimpleLock::WAIT_RD|SimpleLock::WAIT_XLOCK);
+
+ if (caps && in->is_head())
+ need_issue = true;
+
+ if (lock->get_parent()->is_auth() &&
+ lock->is_stable())
+ try_eval(lock, &need_issue);
+ }
+
+ if (need_issue) {
+ if (pneed_issue)
+ *pneed_issue = true; // caller takes responsibility for issuing caps
+ else if (in->is_head())
+ issue_caps(in);
+ }
+
+ }
+
+ // Evaluate the inode locks selected by `mask` (CEPH_LOCK_I* bits) on `in`.
+ // On an auth head inode the loner is (re)chosen first and, if the wanted
+ // loner still differs afterwards, the current loner is dropped and all locks
+ // are evaluated again. Collected waiters are finished and caps are issued
+ // before returning.
+ // Returns whether a cap issue was needed (starts out as caps_imported).
+ bool Locker::eval(CInode *in, int mask, bool caps_imported)
+ {
+   MDSContext::vec finishers;
+   bool need_issue = caps_imported;
+
+   dout(10) << "eval " << mask << " " << *in << dendl;
+
+   // choose loner?
+   if (in->is_auth() && in->is_head()) {
+     const client_t prev_loner = in->get_loner();
+     if (in->choose_ideal_loner()) {
+       dout(10) << "eval set loner: client." << prev_loner << " -> client." << in->get_loner() << dendl;
+       need_issue = true;
+       mask = -1;
+     } else if (in->get_wanted_loner() != in->get_loner()) {
+       dout(10) << "eval want loner: client." << in->get_wanted_loner() << " but failed to set it" << dendl;
+       mask = -1;
+     }
+   }
+
+   bool again = true;
+   while (again) {
+     again = false;
+
+     if (mask & CEPH_LOCK_IFILE)
+       eval_any(&in->filelock, &need_issue, &finishers, caps_imported);
+     if (mask & CEPH_LOCK_IAUTH)
+       eval_any(&in->authlock, &need_issue, &finishers, caps_imported);
+     if (mask & CEPH_LOCK_ILINK)
+       eval_any(&in->linklock, &need_issue, &finishers, caps_imported);
+     if (mask & CEPH_LOCK_IXATTR)
+       eval_any(&in->xattrlock, &need_issue, &finishers, caps_imported);
+     if (mask & CEPH_LOCK_INEST)
+       eval_any(&in->nestlock, &need_issue, &finishers, caps_imported);
+     if (mask & CEPH_LOCK_IFLOCK)
+       eval_any(&in->flocklock, &need_issue, &finishers, caps_imported);
+     if (mask & CEPH_LOCK_IPOLICY)
+       eval_any(&in->policylock, &need_issue, &finishers, caps_imported);
+
+     // drop loner?
+     if (in->is_auth() && in->is_head() &&
+         in->get_wanted_loner() != in->get_loner() &&
+         in->try_drop_loner()) {
+       need_issue = true;
+       if (in->get_wanted_loner() >= 0) {
+         dout(10) << "eval end set loner to client." << in->get_loner() << dendl;
+         bool ok = in->try_set_loner();
+         ceph_assert(ok);
+         // a new loner was installed: re-evaluate every lock
+         mask = -1;
+         again = true;
+       }
+     }
+   }
+
+   finish_contexts(g_ceph_context, finishers);
+
+   if (need_issue && in->is_head())
+     issue_caps(in);
+
+   dout(10) << "eval done" << dendl;
+   return need_issue;
+ }
+
+ // Waiter context that re-runs Locker::try_eval() on a cache object.
+ // The object is pinned with PIN_PTRWAITER from construction until finish()
+ // has run.
+ class C_Locker_Eval : public LockerContext {
+   MDSCacheObject *obj;
+   int lock_mask;
+ public:
+   C_Locker_Eval(Locker *l, MDSCacheObject *o, int m)
+     : LockerContext(l), obj(o), lock_mask(m)
+   {
+     // This context is queued as an MDSCacheObject waiter, so whoever
+     // creates it must already hold the big mds lock.
+     ceph_assert(locker->mds->mds_lock.is_locked_by_me());
+     obj->get(MDSCacheObject::PIN_PTRWAITER);
+   }
+   void finish(int r) override {
+     locker->try_eval(obj, lock_mask);
+     obj->put(MDSCacheObject::PIN_PTRWAITER);
+   }
+ };
+
+ // Evaluate the locks selected by `mask` on cache object `p`, unless the
+ // object has ambiguous auth or is auth and frozen; in those cases a
+ // C_Locker_Eval waiter is queued to retry later.
+ // A mask containing CEPH_LOCK_DN must be exactly CEPH_LOCK_DN and `p` is then
+ // treated as a dentry; otherwise `p` is treated as an inode.
+ void Locker::try_eval(MDSCacheObject *p, int mask)
+ {
+   // unstable and ambiguous auth?
+   if (p->is_ambiguous_auth()) {
+     dout(7) << "try_eval ambiguous auth, waiting on " << *p << dendl;
+     p->add_waiter(MDSCacheObject::WAIT_SINGLEAUTH, new C_Locker_Eval(this, p, mask));
+     return;
+   }
+
+   if (p->is_auth() && p->is_frozen()) {
+     dout(7) << "try_eval frozen, waiting on " << *p << dendl;
+     p->add_waiter(MDSCacheObject::WAIT_UNFREEZE, new C_Locker_Eval(this, p, mask));
+     return;
+   }
+
+   if (!(mask & CEPH_LOCK_DN)) {
+     eval(static_cast<CInode *>(p), mask);
+     return;
+   }
+
+   ceph_assert(mask == CEPH_LOCK_DN);
+   CDentry *dentry = static_cast<CDentry *>(p);
+   bool unused_need_issue = false;  // dentries carry no caps, result is ignored
+   eval_any(&dentry->lock, &unused_need_issue);
+ }
+
+ // Evaluate a single lock if its parent object currently allows it.
+ // Nothing is evaluated when the parent has ambiguous auth (retry queued),
+ // is not auth, or is frozen (retry queued). A pending scatter/unscatter
+ // request on a scatterlock is honored before the freezing check; for other
+ // cases a freezing parent defers the evaluation (except for CEPH_LOCK_DN and
+ // CEPH_LOCK_ISNAP locks).
+ void Locker::try_eval(SimpleLock *lock, bool *pneed_issue)
+ {
+   MDSCacheObject *parent = lock->get_parent();
+
+   // unstable and ambiguous auth?
+   if (parent->is_ambiguous_auth()) {
+     dout(7) << "try_eval " << *lock << " ambiguousauth, waiting on " << *parent << dendl;
+     parent->add_waiter(MDSCacheObject::WAIT_SINGLEAUTH, new C_Locker_Eval(this, parent, lock->get_type()));
+     return;
+   }
+
+   if (!parent->is_auth()) {
+     dout(7) << "try_eval " << *lock << " not auth for " << *parent << dendl;
+     return;
+   }
+
+   if (parent->is_frozen()) {
+     dout(7) << "try_eval " << *lock << " frozen, waiting on " << *parent << dendl;
+     parent->add_waiter(MDSCacheObject::WAIT_UNFREEZE, new C_Locker_Eval(this, parent, lock->get_type()));
+     return;
+   }
+
+   /*
+    * Scenario this guards against:
+    *
+    *  - mds A authpins an item on mds B
+    *  - mds B starts freezing the tree containing the item
+    *  - mds A tries wrlock_start on A and sends REQSCATTER to B
+    *  - the lock on mds B is unstable, so scatter_wanted is set
+    *  - the lock on mds B stabilizes and try_eval is called
+    *
+    * Deferring while freezing is fine in general, but the scatter_wanted
+    * flag has to be honored here to avoid a deadlock. The checks above never
+    * defer in this case because the master holds an auth_pin.
+    */
+   if (lock->is_scatterlock()) {
+     ScatterLock *slock = static_cast<ScatterLock *>(lock);
+     bool kicked = true;
+     if (slock->get_scatter_wanted() &&
+         slock->get_state() != LOCK_MIX)
+       scatter_mix(slock, pneed_issue);
+     else if (slock->get_unscatter_wanted() &&
+              slock->get_state() != LOCK_LOCK)
+       simple_lock(slock, pneed_issue);
+     else
+       kicked = false;
+
+     // a transition was started and is still in progress: nothing more to do
+     if (kicked && !lock->is_stable())
+       return;
+   }
+
+   const auto type = lock->get_type();
+   const bool ignores_freezing = (type == CEPH_LOCK_DN || type == CEPH_LOCK_ISNAP);
+   if (!ignores_freezing && parent->is_freezing()) {
+     dout(7) << "try_eval " << *lock << " freezing, waiting on " << *parent << dendl;
+     parent->add_waiter(MDSCacheObject::WAIT_UNFREEZE, new C_Locker_Eval(this, parent, lock->get_type()));
+     return;
+   }
+
+   eval(lock, pneed_issue);
+ }
+
+ // Run eval_gather() on each of the file/auth/link/xattr locks of `in` that is
+ // currently unstable. If caps need issuing on a head inode, `in` is either
+ // added to *issue_set (when given) or has its caps issued right away.
+ // Collected waiters are finished last.
+ void Locker::eval_cap_gather(CInode *in, set<CInode*> *issue_set)
+ {
+   MDSContext::vec finishers;
+   bool need_issue = false;
+
+   // kick locks now
+   if (!in->filelock.is_stable())
+     eval_gather(&in->filelock, false, &need_issue, &finishers);
+   if (!in->authlock.is_stable())
+     eval_gather(&in->authlock, false, &need_issue, &finishers);
+   if (!in->linklock.is_stable())
+     eval_gather(&in->linklock, false, &need_issue, &finishers);
+   if (!in->xattrlock.is_stable())
+     eval_gather(&in->xattrlock, false, &need_issue, &finishers);
+
+   const bool issue_wanted = need_issue && in->is_head();
+   if (issue_wanted && issue_set)
+     issue_set->insert(in);
+   else if (issue_wanted)
+     issue_caps(in);
+
+   finish_contexts(g_ceph_context, finishers);
+ }
+
+ // Run eval_gather() on each of the file/nest/dirfragtree locks of `in` that
+ // is currently unstable, issue caps if that turned out to be necessary on a
+ // head inode, then finish the collected waiters.
+ void Locker::eval_scatter_gathers(CInode *in)
+ {
+   MDSContext::vec finishers;
+   bool need_issue = false;
+
+   dout(10) << "eval_scatter_gathers " << *in << dendl;
+
+   // kick locks now
+   if (!in->filelock.is_stable())
+     eval_gather(&in->filelock, false, &need_issue, &finishers);
+   if (!in->nestlock.is_stable())
+     eval_gather(&in->nestlock, false, &need_issue, &finishers);
+   if (!in->dirfragtreelock.is_stable())
+     eval_gather(&in->dirfragtreelock, false, &need_issue, &finishers);
+
+   if (need_issue) {
+     if (in->is_head())
+       issue_caps(in);
+   }
+
+   finish_contexts(g_ceph_context, finishers);
+ }
+
+ // Dispatch to the evaluator matching the lock's type: file_eval() for
+ // CEPH_LOCK_IFILE, scatter_eval() for CEPH_LOCK_IDFT / CEPH_LOCK_INEST and
+ // simple_eval() for everything else.
+ void Locker::eval(SimpleLock *lock, bool *need_issue)
+ {
+   const auto type = lock->get_type();
+   if (type == CEPH_LOCK_IFILE) {
+     file_eval(static_cast<ScatterLock*>(lock), need_issue);
+     return;
+   }
+   if (type == CEPH_LOCK_IDFT || type == CEPH_LOCK_INEST) {
+     scatter_eval(static_cast<ScatterLock*>(lock), need_issue);
+     return;
+   }
+   simple_eval(lock, need_issue);
+ }
+
+
+// ------------------
+// rdlock
+
+ // Nudge a lock towards a readable state.
+ // Returns true only when the lock was stable on the auth and a state change
+ // was started here; the caller may then re-check can_rdlock(). Returns false
+ // when the lock is unstable, or when we are a replica (in which case the
+ // auth is asked via LOCK_AC_REQRDLOCK if it is reachable).
+ bool Locker::_rdlock_kick(SimpleLock *lock, bool as_anon)
+ {
+   if (!lock->is_stable()) {
+     // can't kick an unstable lock; at least speed up a pending file recovery
+     if (lock->get_type() == CEPH_LOCK_IFILE) {
+       CInode *inode = static_cast<CInode *>(lock->get_parent());
+       if (inode->state_test(CInode::STATE_RECOVERING)) {
+         mds->mdcache->recovery_queue.prioritize(inode);
+       }
+     }
+     return false;
+   }
+
+   if (!lock->get_parent()->is_auth()) {
+     // request rdlock state change from auth
+     mds_rank_t auth = lock->get_parent()->authority().first;
+     if (!mds->is_cluster_degraded() ||
+         mds->mdsmap->is_clientreplay_or_active_or_stopping(auth)) {
+       dout(10) << "requesting rdlock from auth on "
+                << *lock << " on " << *lock->get_parent() << dendl;
+       mds->send_message_mds(MLock::create(lock, LOCK_AC_REQRDLOCK, mds->get_nodeid()), auth);
+     }
+     return false;
+   }
+
+   // stable and auth: kick the lock.
+   // (sm_scatterlock locks are simply synced too; scatter_tempsync is not
+   // used until tempsync is fully implemented.)
+   bool use_xsyn = false;
+   if (lock->get_sm() == &sm_filelock) {
+     CInode *inode = static_cast<CInode*>(lock->get_parent());
+     use_xsyn = lock->get_state() == LOCK_EXCL &&
+                inode->get_target_loner() >= 0 &&
+                !inode->is_dir() && !as_anon;  // as_anon => caller wants SYNC, not XSYN
+   }
+   if (use_xsyn)
+     file_xsyn(lock);
+   else
+     simple_sync(lock);
+   return true;
+ }
+
+ // Check whether `client` could rdlock `lock`, kicking the lock once if it
+ // cannot. No rdlock reference is taken here.
+ // Returns true if the lock is readable; otherwise, when `con` is non-null it
+ // is queued as a WAIT_STABLE|WAIT_RD waiter, and false is returned.
+ bool Locker::rdlock_try(SimpleLock *lock, client_t client, MDSContext *con)
+ {
+   dout(7) << "rdlock_try on " << *lock << " on " << *lock->get_parent() << dendl;
+
+   bool readable = lock->can_rdlock(client);
+   if (!readable) {
+     _rdlock_kick(lock, false);
+     readable = lock->can_rdlock(client);
+   }
+   if (readable)
+     return true;
+
+   // wait!
+   if (con) {
+     dout(7) << "rdlock_try waiting on " << *lock << " on " << *lock->get_parent() << dendl;
+     lock->add_waiter(SimpleLock::WAIT_STABLE|SimpleLock::WAIT_RD, con);
+   }
+   return false;
+ }
+
+ // Try to take an rdlock on `lock` for request `mut`.
+ // Returns true once the rdlock is held and recorded in mut->locks. Otherwise
+ // a C_MDS_RetryRequest waiter is queued on the lock (or, via the recursive
+ // call, on the head inode's lock) and false is returned.
+ // as_anon: rdlock as an anonymous client (-1) rather than as the request's
+ // client; forced on when the request targets a snapshot.
+ bool Locker::rdlock_start(SimpleLock *lock, MDRequestRef& mut, bool as_anon)
+ {
+   dout(7) << "rdlock_start on " << *lock << " on " << *lock->get_parent() << dendl;
+
+   // A client may be allowed to rdlock the very item it has xlocked, UNLESS
+   // the caller asked for as_anon or a snapped version is being read.
+   if (mut->snapid != CEPH_NOSNAP)
+     as_anon = true;
+   client_t client = as_anon ? -1 : mut->get_client();
+
+   CInode *inode = nullptr;
+   if (lock->get_type() != CEPH_LOCK_DN)
+     inode = static_cast<CInode *>(lock->get_parent());
+
+   /* disabled: forward to auth instead of rdlocking on a replica
+   if (!lock->get_parent()->is_auth() &&
+       lock->fw_rdlock_to_auth()) {
+     mdcache->request_forward(mut, lock->get_parent()->authority().first);
+     return false;
+   }
+   */
+
+   do {
+     // can read? grab ref.
+     if (lock->can_rdlock(client)) {
+       lock->get_rdlock();
+       mut->locks.emplace_hint(mut->locks.end(), lock, MutationImpl::LockOp::RDLOCK);
+       return true;
+     }
+
+     // A snapped auth inode stuck in SNAP_SYNC depends on the head inode's
+     // lock reaching SYNC, so kick that one first.
+     const bool needs_head_sync =
+       inode && !inode->is_head() && inode->is_auth() &&
+       lock->get_state() == LOCK_SNAP_SYNC;
+     if (needs_head_sync) {
+       CInode *head = mdcache->get_inode(inode->ino());
+       ceph_assert(head);
+       SimpleLock *head_lock = head->get_lock(CEPH_LOCK_IFILE);
+       if (head_lock->get_state() == LOCK_SYNC)
+         head_lock = head->get_lock(lock->get_type());
+
+       if (head_lock->get_state() != LOCK_SYNC) {
+         dout(10) << "rdlock_start trying head inode " << *head << dendl;
+         if (!rdlock_start(head_lock, mut, true)) // ** as_anon, no rdlock on EXCL **
+           return false;
+         // got it; our own lock is re-checked on the next pass
+       }
+     }
+   } while (_rdlock_kick(lock, as_anon));
+
+   // wait!
+   // REQRDLOCK is ignored if the lock is unstable, so in every case other than
+   // "auth and stable" we wait for stability and retry.
+   int wait_on = SimpleLock::WAIT_STABLE;
+   if (lock->get_parent()->is_auth() && lock->is_stable())
+     wait_on = SimpleLock::WAIT_RD;
+   dout(7) << "rdlock_start waiting on " << *lock << " on " << *lock->get_parent() << dendl;
+   lock->add_waiter(wait_on, new C_MDS_RetryRequest(mdcache, mut));
+   nudge_log(lock);
+   return false;
+ }
+
+ // Flush the MDS log when `lock` belongs to an auth object and is unstable
+ // while locked (as with xlockdone, or a cap flush).
+ void Locker::nudge_log(SimpleLock *lock)
+ {
+   dout(10) << "nudge_log " << *lock << " on " << *lock->get_parent() << dendl;
+   if (!lock->get_parent()->is_auth())
+     return;
+   if (lock->is_unstable_and_locked())
+     mds->mdlog->flush();
+ }
+
+ // Release the rdlock referenced by `it` and, when `mut` is non-null, remove
+ // the entry from mut->locks. If that was the last rdlock, the lock is
+ // re-evaluated: eval_gather() when unstable, try_eval() when stable on the
+ // auth. *pneed_issue is passed through to those calls.
+ void Locker::rdlock_finish(const MutationImpl::lock_iterator& it, MutationImpl *mut, bool *pneed_issue)
+ {
+   ceph_assert(it->is_rdlock());
+   SimpleLock *released = it->lock;
+
+   // drop ref
+   released->put_rdlock();
+   if (mut)
+     mut->locks.erase(it);
+
+   dout(7) << "rdlock_finish on " << *released << " on " << *released->get_parent() << dendl;
+
+   // other readers remain: nothing to re-evaluate
+   if (released->is_rdlocked())
+     return;
+
+   if (!released->is_stable()) {
+     eval_gather(released, false, pneed_issue);
+   } else if (released->get_parent()->is_auth()) {
+     try_eval(released, pneed_issue);
+   }
+ }
+
+
+ // Return true if every lock in `lov` can currently be rdlocked by an
+ // anonymous client (-1). Every entry must be an rdlock op (asserted).
+ bool Locker::can_rdlock_set(MutationImpl::LockOpVec& lov)
+ {
+   dout(10) << "can_rdlock_set " << dendl;
+   for (const auto& op : lov) {
+     ceph_assert(op.is_rdlock());
+     if (op.lock->can_rdlock(-1))
+       continue;
+     dout(10) << "can_rdlock_set can't rdlock " << *op << " on " << *op.lock->get_parent() << dendl;
+     return false;
+   }
+   return true;
+ }
+
+
+ // Take an rdlock on every lock in `lov` without checking whether it is
+ // allowed, and record each one in mut->locks. Every entry must be an rdlock
+ // op (asserted).
+ void Locker::rdlock_take_set(MutationImpl::LockOpVec& lov, MutationRef& mut)
+ {
+   dout(10) << "rdlock_take_set " << dendl;
+   for (const auto& op : lov) {
+     ceph_assert(op.is_rdlock());
+     SimpleLock *target = op.lock;
+     target->get_rdlock();
+     mut->locks.emplace(target, MutationImpl::LockOp::RDLOCK);
+   }
+ }
+
+// ------------------
+// wrlock
+
+ // Take a wrlock on `lock` unconditionally (get_wrlock(true)) and record it in
+ // mut->locks. Version locks (CEPH_LOCK_IVERSION / CEPH_LOCK_DVERSION) are
+ // handed to local_wrlock_grab() instead.
+ void Locker::wrlock_force(SimpleLock *lock, MutationRef& mut)
+ {
+   const auto type = lock->get_type();
+   if (type == CEPH_LOCK_IVERSION || type == CEPH_LOCK_DVERSION) {
+     local_wrlock_grab(static_cast<LocalLock*>(lock), mut);
+     return;
+   }
+
+   dout(7) << "wrlock_force on " << *lock
+           << " on " << *lock->get_parent() << dendl;
+   lock->get_wrlock(true);
+   mut->locks.emplace(lock, MutationImpl::LockOp::WRLOCK);
+ }
+
+ // Try to take a wrlock on `lock` for request `mut`.
+ // Returns true once the wrlock is held and recorded in mut->locks.
+ // Otherwise returns false; unless `nowait` is set, a C_MDS_RetryRequest
+ // waiter is queued on the lock first.
+ // Version locks (CEPH_LOCK_IVERSION / CEPH_LOCK_DVERSION) are handed to
+ // local_wrlock_start() instead.
+ bool Locker::wrlock_start(SimpleLock *lock, MDRequestRef& mut, bool nowait)
+ {
+   const auto type = lock->get_type();
+   if (type == CEPH_LOCK_IVERSION || type == CEPH_LOCK_DVERSION)
+     return local_wrlock_start(static_cast<LocalLock*>(lock), mut);
+
+   dout(10) << "wrlock_start " << *lock << " on " << *lock->get_parent() << dendl;
+
+   CInode *inode = static_cast<CInode *>(lock->get_parent());
+   client_t client = mut->get_client();
+
+   // On the auth, and only when we are allowed to wait, prefer the MIX state
+   // if the inode has subtree/exporting dirfrags or a scatter was requested.
+   bool want_scatter = false;
+   if (!nowait && lock->get_parent()->is_auth()) {
+     want_scatter = inode->has_subtree_or_exporting_dirfrag() ||
+                    static_cast<ScatterLock*>(lock)->get_scatter_wanted();
+   }
+
+   for (;;) {
+     // wrlock?
+     if (lock->can_wrlock(client) &&
+         (!want_scatter || lock->get_state() == LOCK_MIX)) {
+       lock->get_wrlock();
+       auto held = mut->locks.emplace_hint(mut->locks.end(), lock, MutationImpl::LockOp::WRLOCK);
+       held->flags |= MutationImpl::LockOp::WRLOCK; // may already remote_wrlocked
+       return true;
+     }
+
+     if (type == CEPH_LOCK_IFILE &&
+         inode->state_test(CInode::STATE_RECOVERING)) {
+       mds->mdcache->recovery_queue.prioritize(inode);
+     }
+
+     if (!lock->is_stable())
+       break;
+
+     if (!inode->is_auth()) {
+       // replica.
+       // auth should be auth_pinned (see acquire_locks wrlock weird mustpin case).
+       mds_rank_t auth = lock->get_parent()->authority().first;
+       if (!mds->is_cluster_degraded() ||
+           mds->mdsmap->is_clientreplay_or_active_or_stopping(auth)) {
+         dout(10) << "requesting scatter from auth on "
+                  << *lock << " on " << *lock->get_parent() << dendl;
+         mds->send_message_mds(MLock::create(lock, LOCK_AC_REQSCATTER, mds->get_nodeid()), auth);
+       }
+       break;
+     }
+
+     // auth: no nested lock state change when there is dirty scatterdata and
+     // we might scatter_writebehind or start_scatter, because nowait==true
+     // implies that the caller already has a log entry open!
+     if (nowait && lock->is_dirty())
+       return false;
+
+     if (want_scatter)
+       scatter_mix(static_cast<ScatterLock*>(lock));
+     else
+       simple_lock(lock);
+
+     if (nowait && !lock->can_wrlock(client))
+       return false;
+   }
+
+   if (!nowait) {
+     dout(7) << "wrlock_start waiting on " << *lock << " on " << *lock->get_parent() << dendl;
+     lock->add_waiter(SimpleLock::WAIT_STABLE, new C_MDS_RetryRequest(mdcache, mut));
+     nudge_log(lock);
+   }
+
+   return false;
+ }
+
+// Drop the wrlock referenced by it. The entry is erased from mut->locks
+// unless it also carries a remote wrlock, in which case only the local
+// wrlock flag is cleared. Once no wrlocks remain the lock is re-evaluated.
+// IVERSION/DVERSION locks go to local_wrlock_finish() instead.
+void Locker::wrlock_finish(const MutationImpl::lock_iterator& it, MutationImpl *mut, bool *pneed_issue)
+{
+  ceph_assert(it->is_wrlock());
+  SimpleLock *lock = it->lock;
+
+  const auto type = lock->get_type();
+  if (type == CEPH_LOCK_IVERSION || type == CEPH_LOCK_DVERSION)
+    return local_wrlock_finish(it, mut);
+
+  dout(7) << "wrlock_finish on " << *lock << " on " << *lock->get_parent() << dendl;
+  lock->put_wrlock();
+
+  if (!it->is_remote_wrlock())
+    mut->locks.erase(it);
+  else
+    it->clear_wrlock();
+
+  // someone else still holds a wrlock: nothing to evaluate yet
+  if (lock->is_wrlocked())
+    return;
+
+  if (!lock->is_stable()) {
+    eval_gather(lock, false, pneed_issue);
+  } else if (lock->get_parent()->is_auth()) {
+    try_eval(lock, pneed_issue);
+  }
+}
+
+
+// remote wrlock
+
+// Ask mds.target for a wrlock on lock on behalf of request mut.
+// If the cluster is degraded and target is not yet usable, the request is
+// parked until the peer becomes active; otherwise an OP_WRLOCK slave request
+// is sent and target is added to mut's waiting_on_slave set.
+void Locker::remote_wrlock_start(SimpleLock *lock, mds_rank_t target, MDRequestRef& mut)
+{
+  dout(7) << "remote_wrlock_start mds." << target << " on " << *lock << " on " << *lock->get_parent() << dendl;
+
+  // wait for active target
+  const bool target_unavailable =
+    mds->is_cluster_degraded() &&
+    !mds->mdsmap->is_clientreplay_or_active_or_stopping(target);
+  if (target_unavailable) {
+    dout(7) << " mds." << target << " is not active" << dendl;
+    if (mut->more()->waiting_on_slave.empty())
+      mds->wait_for_active_peer(target, new C_MDS_RetryRequest(mdcache, mut));
+    return;
+  }
+
+  // build and send the lock request
+  mut->start_locking(lock, target);
+  mut->more()->slaves.insert(target);
+  auto req = MMDSSlaveRequest::create(mut->reqid, mut->attempt, MMDSSlaveRequest::OP_WRLOCK);
+  req->set_lock_type(lock->get_type());
+  lock->get_parent()->set_object_info(req->get_object_info());
+  mds->send_message_mds(req, target);
+
+  // a reply from target is now outstanding
+  auto& waiting = mut->more()->waiting_on_slave;
+  ceph_assert(waiting.count(target) == 0);
+  waiting.insert(target);
+}
+
+// Release the remote wrlock referenced by it. The entry is erased from
+// mut->locks unless it also carries a local wrlock, in which case only the
+// remote flag is cleared. An OP_UNWRLOCK is sent to the target unless the
+// cluster is degraded and the target has not reached STATE_REJOIN.
+void Locker::remote_wrlock_finish(const MutationImpl::lock_iterator& it, MutationImpl *mut)
+{
+  ceph_assert(it->is_remote_wrlock());
+  // copy these out first: `it` may be erased just below
+  SimpleLock *lock = it->lock;
+  const mds_rank_t target = it->wrlock_target;
+
+  if (!it->is_wrlock())
+    mut->locks.erase(it);
+  else
+    it->clear_remote_wrlock();
+
+  dout(7) << "remote_wrlock_finish releasing remote wrlock on mds." << target
+          << " " << *lock->get_parent() << dendl;
+
+  const bool target_reachable =
+    !mds->is_cluster_degraded() ||
+    mds->mdsmap->get_state(target) >= MDSMap::STATE_REJOIN;
+  if (!target_reachable)
+    return;
+
+  auto slavereq = MMDSSlaveRequest::create(mut->reqid, mut->attempt, MMDSSlaveRequest::OP_UNWRLOCK);
+  slavereq->set_lock_type(lock->get_type());
+  lock->get_parent()->set_object_info(slavereq->get_object_info());
+  mds->send_message_mds(slavereq, target);
+}
+
+
+// ------------------
+// xlock
+
+// Try to take an exclusive lock on lock for request mut.
+//
+// Returns true only on the auth, once the lock has been moved to LOCK_XLOCK
+// and recorded in mut->locks. In every other case it returns false after
+// arranging for the request to be retried (a waiter on the lock or object,
+// a wait for the peer to become active, or an OP_XLOCK slave request sent to
+// the auth). IVERSION/DVERSION locks are delegated to local_xlock_start().
+bool Locker::xlock_start(SimpleLock *lock, MDRequestRef& mut)
+{
+ if (lock->get_type() == CEPH_LOCK_IVERSION ||
+ lock->get_type() == CEPH_LOCK_DVERSION)
+ return local_xlock_start(static_cast<LocalLock*>(lock), mut);
+
+ dout(7) << "xlock_start on " << *lock << " on " << *lock->get_parent() << dendl;
+ client_t client = mut->get_client();
+
+ // `in` is only set for locks that have a cap shift
+ CInode *in = nullptr;
+ if (lock->get_cap_shift())
+ in = static_cast<CInode *>(lock->get_parent());
+
+ // auth?
+ if (lock->get_parent()->is_auth()) {
+ // auth
+ while (1) {
+ if (mut->locking && // started xlock (not preempt other request)
+ lock->can_xlock(client) &&
+ !(lock->get_state() == LOCK_LOCK_XLOCK && // client is not xlocker or
+ in && in->issued_caps_need_gather(lock))) { // xlocker does not hold shared cap
+ lock->set_state(LOCK_XLOCK);
+ lock->get_xlock(mut, client);
+ mut->locks.emplace_hint(mut->locks.end(), lock, MutationImpl::LockOp::XLOCK);
+ mut->finish_locking(lock);
+ return true;
+ }
+
+ // NOTE(review): `in` is dereferenced here without a null check; this
+ // presumably relies on IFILE locks always having a cap shift -- confirm.
+ if (lock->get_type() == CEPH_LOCK_IFILE &&
+ in->state_test(CInode::STATE_RECOVERING)) {
+ mds->mdcache->recovery_queue.prioritize(in);
+ }
+
+ // stop and wait if the lock is unstable, except when it sits in
+ // XLOCKDONE for this same client and nobody is waiting for it to stabilize
+ if (!lock->is_stable() && (lock->get_state() != LOCK_XLOCKDONE ||
+ lock->get_xlock_by_client() != client ||
+ lock->is_waiter_for(SimpleLock::WAIT_STABLE)))
+ break;
+
+ if (lock->get_state() == LOCK_LOCK || lock->get_state() == LOCK_XLOCKDONE) {
+ mut->start_locking(lock);
+ simple_xlock(lock);
+ } else {
+ simple_lock(lock);
+ }
+ }
+
+ // retry the request once the lock becomes writeable/stable
+ lock->add_waiter(SimpleLock::WAIT_WR|SimpleLock::WAIT_STABLE, new C_MDS_RetryRequest(mdcache, mut));
+ nudge_log(lock);
+ return false;
+ } else {
+ // replica
+ ceph_assert(lock->get_sm()->can_remote_xlock);
+ ceph_assert(!mut->slave_request);
+
+ // wait for single auth
+ if (lock->get_parent()->is_ambiguous_auth()) {
+ lock->get_parent()->add_waiter(MDSCacheObject::WAIT_SINGLEAUTH,
+ new C_MDS_RetryRequest(mdcache, mut));
+ return false;
+ }
+
+ // wait for active auth
+ mds_rank_t auth = lock->get_parent()->authority().first;
+ if (mds->is_cluster_degraded() &&
+ !mds->mdsmap->is_clientreplay_or_active_or_stopping(auth)) {
+ dout(7) << " mds." << auth << " is not active" << dendl;
+ if (mut->more()->waiting_on_slave.empty())
+ mds->wait_for_active_peer(auth, new C_MDS_RetryRequest(mdcache, mut));
+ return false;
+ }
+
+ // send lock request
+ mut->more()->slaves.insert(auth);
+ mut->start_locking(lock, auth);
+ auto r = MMDSSlaveRequest::create(mut->reqid, mut->attempt, MMDSSlaveRequest::OP_XLOCK);
+ r->set_lock_type(lock->get_type());
+ lock->get_parent()->set_object_info(r->get_object_info());
+ mds->send_message_mds(r, auth);
+
+ // a reply from the auth is now outstanding
+ ceph_assert(mut->more()->waiting_on_slave.count(auth) == 0);
+ mut->more()->waiting_on_slave.insert(auth);
+
+ return false;
+ }
+}
+
+// Move an (unstable) lock out of its xlock state.
+//
+// For locks other than DN/ISNAP that have no rdlocks, wrlocks or leases and
+// are not in LOCK_XLOCKSNAP: if the inode has a target loner and xlocker is
+// either negative or that loner, the lock goes straight to LOCK_EXCL.
+// Otherwise the decision is left to eval_gather().
+// pneed_issue is dereferenced unconditionally on the LOCK_EXCL path, so
+// callers must pass a valid pointer.
+void Locker::_finish_xlock(SimpleLock *lock, client_t xlocker, bool *pneed_issue)
+{
+ ceph_assert(!lock->is_stable());
+ if (lock->get_type() != CEPH_LOCK_DN &&
+ lock->get_type() != CEPH_LOCK_ISNAP &&
+ lock->get_num_rdlocks() == 0 &&
+ lock->get_num_wrlocks() == 0 &&
+ !lock->is_leased() &&
+ lock->get_state() != LOCK_XLOCKSNAP) {
+ CInode *in = static_cast<CInode*>(lock->get_parent());
+ client_t loner = in->get_target_loner();
+ if (loner >= 0 && (xlocker < 0 || xlocker == loner)) {
+ lock->set_state(LOCK_EXCL);
+ lock->get_parent()->auth_unpin(lock);
+ lock->finish_waiters(SimpleLock::WAIT_STABLE|SimpleLock::WAIT_WR|SimpleLock::WAIT_RD);
+ if (lock->get_cap_shift())
+ *pneed_issue = true;
+ // re-check stability: the lock state is read again after finish_waiters()
+ if (lock->get_parent()->is_auth() &&
+ lock->is_stable())
+ try_eval(lock, pneed_issue);
+ return;
+ }
+ }
+ // the xlocker may have CEPH_CAP_GSHARED, need to revoke it if next state is LOCK_LOCK
+ eval_gather(lock, lock->get_state() != LOCK_XLOCKSNAP, pneed_issue);
+}
+
+// Drop the xlock referenced by it and remove it from mut->locks.
+//
+// On a replica the auth is told via OP_UNXLOCK (unless the cluster is
+// degraded and the auth has not reached STATE_REJOIN) and local waiters are
+// woken. On the auth, when this was the last xlock and nobody is in the
+// middle of taking one, _finish_xlock() picks the next state.
+// If caps need issuing on a head inode, either *pneed_issue is set (when the
+// pointer is given) or issue_caps() is called directly.
+// IVERSION/DVERSION locks are delegated to local_xlock_finish().
+void Locker::xlock_finish(const MutationImpl::lock_iterator& it, MutationImpl *mut, bool *pneed_issue)
+{
+ ceph_assert(it->is_xlock());
+ SimpleLock *lock = it->lock;
+
+ if (lock->get_type() == CEPH_LOCK_IVERSION ||
+ lock->get_type() == CEPH_LOCK_DVERSION)
+ return local_xlock_finish(it, mut);
+
+ dout(10) << "xlock_finish on " << *lock << " " << *lock->get_parent() << dendl;
+
+ // read the xlocking client before put_xlock() below
+ client_t xlocker = lock->get_xlock_by_client();
+
+ // drop ref
+ lock->put_xlock();
+ ceph_assert(mut);
+ mut->locks.erase(it);
+
+ bool do_issue = false;
+
+ // remote xlock?
+ if (!lock->get_parent()->is_auth()) {
+ ceph_assert(lock->get_sm()->can_remote_xlock);
+
+ // tell auth
+ dout(7) << "xlock_finish releasing remote xlock on " << *lock->get_parent() << dendl;
+ mds_rank_t auth = lock->get_parent()->authority().first;
+ if (!mds->is_cluster_degraded() ||
+ mds->mdsmap->get_state(auth) >= MDSMap::STATE_REJOIN) {
+ auto slavereq = MMDSSlaveRequest::create(mut->reqid, mut->attempt, MMDSSlaveRequest::OP_UNXLOCK);
+ slavereq->set_lock_type(lock->get_type());
+ lock->get_parent()->set_object_info(slavereq->get_object_info());
+ mds->send_message_mds(slavereq, auth);
+ }
+ // others waiting?
+ lock->finish_waiters(SimpleLock::WAIT_STABLE |
+ SimpleLock::WAIT_WR |
+ SimpleLock::WAIT_RD, 0);
+ } else {
+ if (lock->get_num_xlocks() == 0 &&
+ lock->get_state() != LOCK_LOCK_XLOCK) { // no one is taking xlock
+ _finish_xlock(lock, xlocker, &do_issue);
+ }
+ }
+
+ // caps are only issued on head inodes; defer to the caller when it asked
+ // to be told via pneed_issue
+ if (do_issue) {
+ CInode *in = static_cast<CInode*>(lock->get_parent());
+ if (in->is_head()) {
+ if (pneed_issue)
+ *pneed_issue = true;
+ else
+ issue_caps(in);
+ }
+ }
+}
+
+// Drop a local xlock while its object is being exported: release the xlock,
+// forget it in mut->locks, auth_unpin the parent if the lock was unstable,
+// and leave the lock in LOCK_LOCK.
+void Locker::xlock_export(const MutationImpl::lock_iterator& it, MutationImpl *mut)
+{
+  ceph_assert(it->is_xlock());
+  SimpleLock *lock = it->lock;
+  dout(10) << "xlock_export on " << *lock << " " << *lock->get_parent() << dendl;
+
+  lock->put_xlock();
+  mut->locks.erase(it);
+
+  MDSCacheObject *parent = lock->get_parent();
+  ceph_assert(parent->state_test(CInode::STATE_AMBIGUOUSAUTH)); // we are exporting this (inode)
+
+  const bool unstable = !lock->is_stable();
+  if (unstable)
+    parent->auth_unpin(lock);
+
+  lock->set_state(LOCK_LOCK);
+}
+
+// Import side of an xlock: auth_pin the lock's parent object on behalf of
+// the lock.
+void Locker::xlock_import(SimpleLock *lock)
+{
+  MDSCacheObject *parent = lock->get_parent();
+  dout(10) << "xlock_import on " << *lock << " " << *parent << dendl;
+  parent->auth_pin(lock);
+}
+
+
+
+// file i/o -----------------------------------------
+
+// Log the request and return the inode's current file_data_version.
+version_t Locker::issue_file_data_version(CInode *in)
+{
+  dout(7) << "issue_file_data_version on " << *in << dendl;
+  const version_t current = in->inode.file_data_version;
+  return current;
+}
+
+// Log-completion context that calls Locker::file_update_finish() with the
+// arguments captured at construction. The inode is pinned (PIN_PTRWAITER)
+// from construction until finish() has run.
+class C_Locker_FileUpdate_finish : public LockerLogContext {
+  CInode *in;
+  MutationRef mut;
+  unsigned flags;           // UPDATE_* bits
+  client_t client;
+  MClientCaps::ref ack;     // optional ack message; may be empty
+
+public:
+  C_Locker_FileUpdate_finish(Locker *l, CInode *i, MutationRef& m, unsigned f,
+                             const MClientCaps::ref &ack, client_t c=-1)
+    : LockerLogContext(l),
+      in(i),
+      mut(m),
+      flags(f),
+      client(c),
+      ack(ack)
+  {
+    in->get(CInode::PIN_PTRWAITER);
+  }
+
+  void finish(int r) override
+  {
+    locker->file_update_finish(in, mut, flags, client, ack);
+    in->put(CInode::PIN_PTRWAITER);
+  }
+};
+
+// Flag bits passed to Locker::file_update_finish().
+enum {
+ UPDATE_SHAREMAX = 1, // call share_inode_max_size() afterwards (auth head inode only)
+ UPDATE_NEEDSISSUE = 2, // issue caps to the client's cap if it wants caps that are not pending
+ UPDATE_SNAPFLUSH = 4, // on a non-head inode, check for snap writeback completion
+};
+
+// Completion step for a journaled file/cap update on in.
+//
+// Pops and dirties the projected inode, applies mut, sends the optional ack
+// to the client, drops mut's locks, then acts on the UPDATE_* bits in flags:
+//  - head inode: optionally issue caps to the client (UPDATE_NEEDSISSUE) and
+//    share max_size (UPDATE_SHAREMAX);
+//  - non-head inode with UPDATE_SNAPFLUSH: finish snap writeback for client.
+// Finally issues caps for every inode collected while dropping locks and
+// cleans up mut.
+void Locker::file_update_finish(CInode *in, MutationRef& mut, unsigned flags,
+ client_t client, const MClientCaps::ref &ack)
+{
+ dout(10) << "file_update_finish on " << *in << dendl;
+ in->pop_and_dirty_projected_inode(mut->ls);
+
+ mut->apply();
+
+ if (ack) {
+ Session *session = mds->get_session(client);
+ if (session && !session->is_closed()) {
+ // "oldest flush tid" > 0 means client uses unique TID for each flush
+ if (ack->get_oldest_flush_tid() > 0)
+ session->add_completed_flush(ack->get_client_tid());
+ mds->send_message_client_counted(ack, session);
+ } else {
+ dout(10) << " no session for client." << client << " " << *ack << dendl;
+ }
+ }
+
+ // inodes that need caps (re)issued as a result of dropping the locks
+ set<CInode*> need_issue;
+ drop_locks(mut.get(), &need_issue);
+
+ if (in->is_head()) {
+ // skip the targeted issue if `in` is already going to be handled by
+ // issue_caps_set() below
+ if ((flags & UPDATE_NEEDSISSUE) && need_issue.count(in) == 0) {
+ Capability *cap = in->get_client_cap(client);
+ if (cap && (cap->wanted() & ~cap->pending()))
+ issue_caps(in, cap);
+ }
+
+ if ((flags & UPDATE_SHAREMAX) && in->is_auth() &&
+ (in->filelock.gcaps_allowed(CAP_LONER) & (CEPH_CAP_GWR|CEPH_CAP_GBUFFER)))
+ share_inode_max_size(in);
+
+ } else if ((flags & UPDATE_SNAPFLUSH) && !in->client_snap_caps.empty()) {
+ dout(10) << " client_snap_caps " << in->client_snap_caps << dendl;
+ // check for snap writeback completion
+ in->client_snap_caps.erase(client);
+ if (in->client_snap_caps.empty()) {
+ // last client done: drop one wrlock on every lock listed in cinode_lock_info
+ for (int i = 0; i < num_cinode_locks; i++) {
+ SimpleLock *lock = in->get_lock(cinode_lock_info[i].lock);
+ ceph_assert(lock);
+ lock->put_wrlock();
+ }
+ in->item_open_file.remove_myself();
+ in->item_caps.remove_myself();
+ eval_cap_gather(in, &need_issue);
+ }
+ }
+ issue_caps_set(need_issue);
+
+ mds->balancer->hit_inode(in, META_POP_IWR);
+
+ // auth unpin after issuing caps
+ mut->cleanup();
+}
+
+// Make sure session's client has a capability on in that wants at least the
+// caps implied by the open mode, and return that capability.
+//
+// During replay nothing is created: the result of try_reconnect_cap() is
+// returned as-is. Otherwise a missing cap is added (marked new, with cap
+// messages suppressed for the duration of this call) or an existing one has
+// its wanted set widened. On the auth the inode's locks are evaluated and
+// the log may be flushed; on a replica the auth is told about the new wants.
+Capability* Locker::issue_new_caps(CInode *in,
+ int mode,
+ Session *session,
+ SnapRealm *realm,
+ bool is_replay)
+{
+ dout(7) << "issue_new_caps for mode " << mode << " on " << *in << dendl;
+ bool is_new;
+
+ // if replay, try to reconnect cap, and otherwise do nothing.
+ if (is_replay)
+ return mds->mdcache->try_reconnect_cap(in, session);
+
+
+ // my needs
+ ceph_assert(session->info.inst.name.is_client());
+ client_t my_client = session->get_client();
+ int my_want = ceph_caps_for_mode(mode);
+
+ // register a capability
+ Capability *cap = in->get_client_cap(my_client);
+ if (!cap) {
+ // new cap
+ cap = in->add_client_cap(my_client, session, realm);
+ cap->set_wanted(my_want);
+ cap->mark_new();
+ cap->inc_suppress(); // suppress file cap messages for new cap (we'll bundle with the open() reply)
+ is_new = true;
+ } else {
+ is_new = false;
+ // make sure it wants sufficient caps
+ if (my_want & ~cap->wanted()) {
+ // augment wanted caps for this client
+ cap->set_wanted(cap->wanted() | my_want);
+ }
+ }
+
+ if (in->is_auth()) {
+ // [auth] twiddle mode?
+ eval(in, CEPH_CAP_LOCKS);
+
+ if (_need_flush_mdlog(in, my_want))
+ mds->mdlog->flush();
+
+ } else {
+ // [replica] tell auth about any new caps wanted
+ request_inode_file_caps(in);
+ }
+
+ // issue caps (pot. incl new one)
+ //issue_caps(in); // note: _eval above may have done this already...
+
+ // re-issue whatever we can
+ //cap->issue(cap->pending());
+
+ // undo the inc_suppress() done above for a freshly created cap
+ if (is_new)
+ cap->dec_suppress();
+
+ return cap;
+}
+
+
+// Call issue_caps() on every inode in inset, in the set's iteration order.
+void Locker::issue_caps_set(set<CInode*>& inset)
+{
+  for (CInode *in : inset)
+    issue_caps(in);
+}
+
+// Deferred call to Locker::revoke_stale_cap(in, client). The inode is pinned
+// (PIN_PTRWAITER) from construction until finish() has run.
+class C_Locker_RevokeStaleCap : public LockerContext {
+  CInode *in;
+  client_t client;
+
+public:
+  C_Locker_RevokeStaleCap(Locker *l, CInode *i, client_t c)
+    : LockerContext(l),
+      in(i),
+      client(c)
+  {
+    in->get(CInode::PIN_PTRWAITER);
+  }
+
+  void finish(int r) override
+  {
+    locker->revoke_stale_cap(in, client);
+    in->put(CInode::PIN_PTRWAITER);
+  }
+};
+
+// Bring the caps pending for clients of head inode in into line with what
+// the current lock states allow, sending GRANT or REVOKE messages as needed.
+//
+// If only_cap is given, iteration starts at that client's cap and stops after
+// the first cap that is processed to the end of the loop body.
+// Returns the number of caps for which an issue/revoke message was sent by
+// the main issue path (the stale-resume re-issue is not counted).
+//
+// NOTE(review): the `continue` statements below skip the trailing
+// `if (only_cap) break;`, so with only_cap set a skipped cap lets the loop
+// move on to later clients -- confirm whether that is intended.
+int Locker::issue_caps(CInode *in, Capability *only_cap)
+{
+ // allowed caps are determined by the lock mode.
+ int all_allowed = in->get_caps_allowed_by_type(CAP_ANY);
+ int loner_allowed = in->get_caps_allowed_by_type(CAP_LONER);
+ int xlocker_allowed = in->get_caps_allowed_by_type(CAP_XLOCKER);
+
+ client_t loner = in->get_loner();
+ if (loner >= 0) {
+ dout(7) << "issue_caps loner client." << loner
+ << " allowed=" << ccap_string(loner_allowed)
+ << ", xlocker allowed=" << ccap_string(xlocker_allowed)
+ << ", others allowed=" << ccap_string(all_allowed)
+ << " on " << *in << dendl;
+ } else {
+ dout(7) << "issue_caps allowed=" << ccap_string(all_allowed)
+ << ", xlocker allowed=" << ccap_string(xlocker_allowed)
+ << " on " << *in << dendl;
+ }
+
+ ceph_assert(in->is_head());
+
+ // count conflicts with
+ int nissued = 0;
+
+ // client caps
+ map<client_t, Capability>::iterator it;
+ if (only_cap)
+ it = in->client_caps.find(only_cap->get_client());
+ else
+ it = in->client_caps.begin();
+ for (; it != in->client_caps.end(); ++it) {
+ Capability *cap = &it->second;
+
+ // do not issue _new_ bits when size|mtime is projected
+ int allowed;
+ if (loner == it->first)
+ allowed = loner_allowed;
+ else
+ allowed = all_allowed;
+
+ // add in any xlocker-only caps (for locks this client is the xlocker for)
+ allowed |= xlocker_allowed & in->get_xlocker_mask(it->first);
+
+ // caps flagged noinline/nopoolns get no file read/write caps while the
+ // inode has inline data or a pool namespace, respectively
+ if ((in->inode.inline_data.version != CEPH_INLINE_NONE &&
+ cap->is_noinline()) ||
+ (!in->inode.layout.pool_ns.empty() &&
+ cap->is_nopoolns()))
+ allowed &= ~(CEPH_CAP_FILE_RD | CEPH_CAP_FILE_WR);
+
+ int pending = cap->pending();
+ int wanted = cap->wanted();
+
+ dout(20) << " client." << it->first
+ << " pending " << ccap_string(pending)
+ << " allowed " << ccap_string(allowed)
+ << " wanted " << ccap_string(wanted)
+ << dendl;
+
+ if (!(pending & ~allowed)) {
+ // skip if suppress or new, and not revocation
+ if (cap->is_new() || cap->is_suppress() || cap->is_stale()) {
+ dout(20) << " !revoke and new|suppressed|stale, skipping client." << it->first << dendl;
+ continue;
+ }
+ } else {
+ // something pending is no longer allowed: this is a revocation
+ ceph_assert(!cap->is_new());
+ if (cap->is_stale()) {
+ dout(20) << " revoke stale cap from client." << it->first << dendl;
+ ceph_assert(!cap->is_valid());
+ cap->issue(allowed & pending, false);
+ // the actual stale-cap revocation is deferred to a queued context
+ mds->queue_waiter_front(new C_Locker_RevokeStaleCap(this, in, it->first));
+ continue;
+ }
+
+ if (!cap->is_valid() && (pending & ~CEPH_CAP_PIN)) {
+ // After a stale->resume cycle, the client thinks it only has CEPH_CAP_PIN.
+ // mds needs to re-issue caps, then do revocation.
+ long seq = cap->issue(pending, true);
+
+ dout(7) << " sending MClientCaps to client." << it->first
+ << " seq " << seq << " re-issue " << ccap_string(pending) << dendl;
+
+ auto m = MClientCaps::create(CEPH_CAP_OP_GRANT, in->ino(),
+ in->find_snaprealm()->inode->ino(),
+ cap->get_cap_id(), cap->get_last_seq(),
+ pending, wanted, 0, cap->get_mseq(),
+ mds->get_osd_epoch_barrier());
+ in->encode_cap_message(m, cap);
+
+ mds->send_message_client_counted(m, cap->get_session());
+ }
+ }
+
+ // notify clients about deleted inode, to make sure they release caps ASAP.
+ if (in->inode.nlink == 0)
+ wanted |= CEPH_CAP_LINK_SHARED;
+
+ // are there caps that the client _wants_ and can have, but aren't pending?
+ // or do we need to revoke?
+ if ((pending & ~allowed) || // need to revoke ~allowed caps.
+ ((wanted & allowed) & ~pending) || // missing wanted+allowed caps
+ !cap->is_valid()) { // after a stale->resume cycle
+ // issue
+ nissued++;
+
+ // include caps that clients generally like, while we're at it.
+ int likes = in->get_caps_liked();
+ int before = pending;
+ long seq;
+ if (pending & ~allowed)
+ seq = cap->issue((wanted|likes) & allowed & pending, true); // if revoking, don't issue anything new.
+ else
+ seq = cap->issue((wanted|likes) & allowed, true);
+ int after = cap->pending();
+
+ dout(7) << " sending MClientCaps to client." << it->first
+ << " seq " << seq << " new pending " << ccap_string(after)
+ << " was " << ccap_string(before) << dendl;
+
+ // any bit that was pending before and is not pending now makes this a revoke
+ int op = (before & ~after) ? CEPH_CAP_OP_REVOKE : CEPH_CAP_OP_GRANT;
+ if (op == CEPH_CAP_OP_REVOKE) {
+ // track the cap on the revoking lists and restart its revoke bookkeeping
+ revoking_caps.push_back(&cap->item_revoking_caps);
+ revoking_caps_by_client[cap->get_client()].push_back(&cap->item_client_revoking_caps);
+ cap->set_last_revoke_stamp(ceph_clock_now());
+ cap->reset_num_revoke_warnings();
+ }
+
+ auto m = MClientCaps::create(op, in->ino(),
+ in->find_snaprealm()->inode->ino(),
+ cap->get_cap_id(), cap->get_last_seq(),
+ after, wanted, 0, cap->get_mseq(),
+ mds->get_osd_epoch_barrier());
+ in->encode_cap_message(m, cap);
+
+ mds->send_message_client_counted(m, cap->get_session());
+ }
+
+ if (only_cap)
+ break;
+ }
+
+ return nissued;
+}
+
+// Send a CEPH_CAP_OP_TRUNC message to every client holding a cap on in,
+// then re-check max_size if we are auth for a regular file.
+void Locker::issue_truncate(CInode *in)
+{
+  dout(7) << "issue_truncate on " << *in << dendl;
+
+  for (auto &entry : in->client_caps) {
+    Capability *cap = &entry.second;
+    auto msg = MClientCaps::create(CEPH_CAP_OP_TRUNC,
+                                   in->ino(),
+                                   in->find_snaprealm()->inode->ino(),
+                                   cap->get_cap_id(), cap->get_last_seq(),
+                                   cap->pending(), cap->wanted(), 0,
+                                   cap->get_mseq(),
+                                   mds->get_osd_epoch_barrier());
+    in->encode_cap_message(msg, cap);
+    mds->send_message_client_counted(msg, entry.first);
+  }
+
+  // should we increase max_size?
+  const bool auth_file = in->is_auth() && in->is_file();
+  if (auth_file)
+    check_inode_max_size(in);
+}
+
+
+// Revoke the (stale) cap that client holds on in, if it still has one.
+//
+// If the cap is in the middle of revoking any write caps, the client is
+// evicted instead and nothing else is done. Otherwise the cap is revoked,
+// the inode is flagged STATE_NEEDSRECOVER when we are auth and the client
+// has an entry in client_ranges, and -- unless caps are being exported --
+// the four cap-related locks and the inode are re-evaluated.
+void Locker::revoke_stale_cap(CInode *in, client_t client)
+{
+ dout(7) << __func__ << " client." << client << " on " << *in << dendl;
+ Capability *cap = in->get_client_cap(client);
+ if (!cap)
+ return;
+
+ if (cap->revoking() & CEPH_CAP_ANY_WR) {
+ // ss receives evict_client()'s text output, which is not used here
+ std::stringstream ss;
+ mds->evict_client(client.v, false, g_conf()->mds_session_blacklist_on_timeout, ss, nullptr);
+ return;
+ }
+
+ cap->revoke();
+
+ if (in->is_auth() && in->inode.client_ranges.count(cap->get_client()))
+ in->state_set(CInode::STATE_NEEDSRECOVER);
+
+ if (in->state_test(CInode::STATE_EXPORTINGCAPS))
+ return;
+
+ // push along any lock that was waiting on this cap
+ if (!in->filelock.is_stable())
+ eval_gather(&in->filelock);
+ if (!in->linklock.is_stable())
+ eval_gather(&in->linklock);
+ if (!in->authlock.is_stable())
+ eval_gather(&in->authlock);
+ if (!in->xattrlock.is_stable())
+ eval_gather(&in->xattrlock);
+
+ if (in->is_auth())
+ try_eval(in, CEPH_CAP_LOCKS);
+ else
+ request_inode_file_caps(in);
+}
+
+// Revoke the caps of a stale session that are currently being revoked.
+//
+// The session's cap generation is bumped first, which invalidates all of its
+// caps. Returns false as soon as a cap that is revoking write caps is found
+// (iteration stops there); returns true otherwise. Inodes whose caps were
+// revoked are re-evaluated after the scan.
+bool Locker::revoke_stale_caps(Session *session)
+{
+ dout(10) << "revoke_stale_caps for " << session->info.inst.name << dendl;
+
+ // invalidate all caps
+ session->inc_cap_gen();
+
+ bool ret = true;
+ std::vector<CInode*> to_eval;
+
+ for (auto p = session->caps.begin(); !p.end(); ) {
+ Capability *cap = *p;
+ ++p;
+ if (!cap->is_notable()) {
+ // the remaining caps are not being revoked, have no writeable range,
+ // and neither want exclusive caps nor file read/write. They do not
+ // need recovery and do not affect eval_gather()/try_eval().
+ break;
+ }
+
+ int revoking = cap->revoking();
+ if (!revoking)
+ continue;
+
+ if (revoking & CEPH_CAP_ANY_WR) {
+ ret = false;
+ break;
+ }
+
+ int issued = cap->issued();
+ CInode *in = cap->get_inode();
+ dout(10) << " revoking " << ccap_string(issued) << " on " << *in << dendl;
+ cap->revoke();
+
+ if (in->is_auth() &&
+ in->inode.client_ranges.count(cap->get_client()))
+ in->state_set(CInode::STATE_NEEDSRECOVER);
+
+ // eval lock/inode may finish contexts, which may modify other cap's position
+ // in the session->caps.
+ to_eval.push_back(in);
+ }
+
+ // second pass: re-evaluate locks now that iteration over session->caps is done
+ for (auto in : to_eval) {
+ if (in->state_test(CInode::STATE_EXPORTINGCAPS))
+ continue;
+
+ if (!in->filelock.is_stable())
+ eval_gather(&in->filelock);
+ if (!in->linklock.is_stable())
+ eval_gather(&in->linklock);
+ if (!in->authlock.is_stable())
+ eval_gather(&in->authlock);
+ if (!in->xattrlock.is_stable())
+ eval_gather(&in->xattrlock);
+
+ if (in->is_auth())
+ try_eval(in, CEPH_CAP_LOCKS);
+ else
+ request_inode_file_caps(in);
+ }
+
+ return ret;
+}
+
+// Re-issue caps for a session that has come back from being stale.
+//
+// Walks the session's caps; when the client has the LAZY_CAP_WANTED feature
+// the walk stops at the first cap that is not notable. Inodes whose caps are
+// being exported are only flagged STATE_EVALSTALECAPS. For the rest, caps
+// are issued directly unless we are auth and eval() returns true.
+void Locker::resume_stale_caps(Session *session)
+{
+ dout(10) << "resume_stale_caps for " << session->info.inst.name << dendl;
+
+ bool lazy = session->info.has_feature(CEPHFS_FEATURE_LAZY_CAP_WANTED);
+ for (xlist<Capability*>::iterator p = session->caps.begin(); !p.end(); ) {
+ Capability *cap = *p;
+ // advance before doing any work on this cap
+ ++p;
+ if (lazy && !cap->is_notable())
+ break; // see revoke_stale_caps()
+
+ CInode *in = cap->get_inode();
+ ceph_assert(in->is_head());
+ dout(10) << " clearing stale flag on " << *in << dendl;
+
+ if (in->state_test(CInode::STATE_EXPORTINGCAPS)) {
+ // if export succeeds, the cap will be removed. if export fails,
+ // we need to re-issue the cap if it's not stale.
+ in->state_set(CInode::STATE_EVALSTALECAPS);
+ continue;
+ }
+
+ if (!in->is_auth() || !eval(in, CEPH_CAP_LOCKS))
+ issue_caps(in, cap);
+ }
+}
+
+// Remove every client lease held by session from the dentry it belongs to.
+void Locker::remove_stale_leases(Session *session)
+{
+  dout(10) << "remove_stale_leases for " << session->info.inst.name << dendl;
+  for (xlist<ClientLease*>::iterator p = session->leases.begin(); !p.end(); ) {
+    ClientLease *lease = *p;
+    ++p;  // step past this lease before it is handed to remove_client_lease()
+    CDentry *dn = static_cast<CDentry*>(lease->parent);
+    dout(15) << " removing lease on " << *dn << dendl;
+    dn->remove_client_lease(lease, this);
+  }
+}
+
+
+// Deferred retry of Locker::request_inode_file_caps(). The call is skipped
+// if we have become auth for the inode by the time the context fires. The
+// inode is pinned (PIN_PTRWAITER) from construction until finish() has run.
+class C_MDL_RequestInodeFileCaps : public LockerContext {
+  CInode *in;
+
+public:
+  C_MDL_RequestInodeFileCaps(Locker *l, CInode *i)
+    : LockerContext(l),
+      in(i)
+  {
+    in->get(CInode::PIN_PTRWAITER);
+  }
+
+  void finish(int r) override
+  {
+    if (!in->is_auth())
+      locker->request_inode_file_caps(in);
+    in->put(CInode::PIN_PTRWAITER);
+  }
+};
+
+// On a replica, tell the auth MDS which caps our clients want on in.
+//
+// Nothing happens if the wanted set is unchanged since the last time it was
+// recorded in replica_caps_wanted. The call is retried later if the inode's
+// authority is ambiguous or, in a degraded cluster, the auth is in
+// STATE_REJOIN. The new wanted set is recorded even when the message itself
+// is not sent because the auth is not in a usable state.
+void Locker::request_inode_file_caps(CInode *in)
+{
+ ceph_assert(!in->is_auth());
+
+ // CEPH_CAP_PIN is never part of what gets reported
+ int wanted = in->get_caps_wanted() & in->get_caps_allowed_ever() & ~CEPH_CAP_PIN;
+ if (wanted != in->replica_caps_wanted) {
+ // wait for single auth
+ if (in->is_ambiguous_auth()) {
+ in->add_waiter(MDSCacheObject::WAIT_SINGLEAUTH,
+ new C_MDL_RequestInodeFileCaps(this, in));
+ return;
+ }
+
+ mds_rank_t auth = in->authority().first;
+ if (mds->is_cluster_degraded() &&
+ mds->mdsmap->get_state(auth) == MDSMap::STATE_REJOIN) {
+ mds->wait_for_active_peer(auth, new C_MDL_RequestInodeFileCaps(this, in));
+ return;
+ }
+
+ dout(7) << "request_inode_file_caps " << ccap_string(wanted)
+ << " was " << ccap_string(in->replica_caps_wanted)
+ << " on " << *in << " to mds." << auth << dendl;
+
+ in->replica_caps_wanted = wanted;
+
+ if (!mds->is_cluster_degraded() ||
+ mds->mdsmap->is_clientreplay_or_active_or_stopping(auth))
+ mds->send_message_mds(MInodeFileCaps::create(in->ino(), in->replica_caps_wanted), auth);
+ }
+}
+
+// Handle an MInodeFileCaps message from a replica MDS: record the caps it
+// wants on the inode and re-evaluate the inode's locks. Before this MDS has
+// reached CLIENTREPLAY the message is either queued for retry (if we are
+// heading there) or treated as fatal.
+void Locker::handle_inode_file_caps(const MInodeFileCaps::const_ref &m)
+{
+  // nobody should be talking to us during recovery.
+  if (mds->get_state() < MDSMap::STATE_CLIENTREPLAY) {
+    if (mds->get_want_state() < MDSMap::STATE_CLIENTREPLAY)
+      ceph_abort_msg("got unexpected message during recovery");
+    mds->wait_for_replay(new C_MDS_RetryMessage(mds, m));
+    return;
+  }
+
+  // ok
+  CInode *in = mdcache->get_inode(m->get_ino());
+  const mds_rank_t from = mds_rank_t(m->get_source().num());
+
+  ceph_assert(in);
+  ceph_assert(in->is_auth());
+
+  const int replica_wants = m->get_caps();
+  dout(7) << "handle_inode_file_caps replica mds." << from << " wants caps " << ccap_string(replica_wants) << " on " << *in << dendl;
+
+  in->set_mds_caps_wanted(from, replica_wants);
+
+  try_eval(in, CEPH_CAP_LOCKS);
+}
+
+
+// Deferred retry of Locker::check_inode_max_size() with the values captured
+// at construction. The call is skipped if we are no longer auth for the
+// inode when the context fires. The inode is pinned (PIN_PTRWAITER) from
+// construction until finish() has run.
+class C_MDL_CheckMaxSize : public LockerContext {
+  CInode *in;
+  uint64_t new_max_size;
+  uint64_t newsize;
+  utime_t mtime;
+
+public:
+  C_MDL_CheckMaxSize(Locker *l, CInode *i, uint64_t _new_max_size,
+                     uint64_t _newsize, utime_t _mtime)
+    : LockerContext(l),
+      in(i),
+      new_max_size(_new_max_size),
+      newsize(_newsize),
+      mtime(_mtime)
+  {
+    in->get(CInode::PIN_PTRWAITER);
+  }
+
+  void finish(int r) override
+  {
+    if (in->is_auth())
+      locker->check_inode_max_size(in, false, new_max_size, newsize, mtime);
+    in->put(CInode::PIN_PTRWAITER);
+  }
+};
+
+// Compute a new max_size for a file whose current size is `size`.
+// The candidate is 2*(size+1); if mds_client_writeable_range_max_inc_objs is
+// non-zero the growth beyond `size` is capped at that many objects. The
+// result is rounded up to the layout's size increment.
+uint64_t Locker::calc_new_max_size(CInode::mempool_inode *pi, uint64_t size)
+{
+  uint64_t candidate = 2 * (size + 1);
+
+  const uint64_t inc_objs = g_conf()->mds_client_writeable_range_max_inc_objs;
+  if (inc_objs > 0) {
+    const uint64_t inc_bytes = inc_objs * pi->layout.object_size;
+    const uint64_t capped = size + inc_bytes;
+    if (capped < candidate)
+      candidate = capped;
+  }
+
+  return round_up_to(candidate, pi->get_layout_size_increment());
+}
+
+// Build the client writeable-range map for in, given file size `size`.
+//
+// Every client whose issued or wanted caps include any file-write cap gets a
+// range [0, max) in *new_ranges, where max never shrinks below the client's
+// existing range. *max_increased is set when a range grows or is newly
+// created (it is never cleared here). Clients without such caps get no entry.
+// When update is true the caps' clientwriteable marks are adjusted to match.
+void Locker::calc_new_client_ranges(CInode *in, uint64_t size, bool update,
+                                    CInode::mempool_inode::client_range_map *new_ranges,
+                                    bool *max_increased)
+{
+  auto latest = in->get_projected_inode();
+
+  // Layout-less directories like ~mds0/, have zero size
+  uint64_t ms = 0;
+  if (latest->has_layout())
+    ms = calc_new_max_size(latest, size);
+
+  // increase ranges as appropriate.
+  // shrink to 0 if no WR|BUFFER caps issued.
+  for (auto &entry : in->client_caps) {
+    const client_t client = entry.first;
+    Capability &cap = entry.second;
+
+    const bool writer = (cap.issued() | cap.wanted()) & CEPH_CAP_ANY_FILE_WR;
+    if (!writer) {
+      if (update)
+        cap.clear_clientwriteable();
+      continue;
+    }
+
+    client_writeable_range_t& nr = (*new_ranges)[client];
+    nr.range.first = 0;
+
+    auto old_it = latest->client_ranges.find(client);
+    if (old_it == latest->client_ranges.end()) {
+      // no previous range for this client
+      *max_increased = true;
+      nr.range.last = ms;
+      nr.follows = in->first - 1;
+    } else {
+      const client_writeable_range_t& oldr = old_it->second;
+      if (ms > oldr.range.last)
+        *max_increased = true;
+      nr.range.last = std::max(ms, oldr.range.last);
+      nr.follows = oldr.follows;
+    }
+
+    if (update)
+      cap.mark_clientwriteable();
+  }
+}
+
+// Re-compute the client writeable ranges (max_size) of auth file inode in,
+// optionally also updating its size/mtime, and journal the change.
+//
+// new_size == 0 means "no size update requested". Size and mtime are only
+// ever moved forward (max with the projected values).
+// Returns true if an update was journaled. Returns false if nothing needed
+// to change, or if the update could not be done now because the inode is
+// frozen or the filelock cannot be wrlocked; in the latter two cases a
+// C_MDL_CheckMaxSize waiter is queued to retry.
+// With force_wrlock set, the can_wrlock() check on the filelock is skipped.
+bool Locker::check_inode_max_size(CInode *in, bool force_wrlock,
+ uint64_t new_max_size, uint64_t new_size,
+ utime_t new_mtime)
+{
+ ceph_assert(in->is_auth());
+ ceph_assert(in->is_file());
+
+ CInode::mempool_inode *latest = in->get_projected_inode();
+ CInode::mempool_inode::client_range_map new_ranges;
+ uint64_t size = latest->size;
+ bool update_size = new_size > 0;
+ bool update_max = false;
+ bool max_increased = false;
+
+ if (update_size) {
+ new_size = size = std::max(size, new_size);
+ new_mtime = std::max(new_mtime, latest->mtime);
+ // nothing to do if neither value actually moved
+ if (latest->size == new_size && latest->mtime == new_mtime)
+ update_size = false;
+ }
+
+ // can_update: 1 = ok, -1 = inode frozen, -2 = filelock cannot be wrlocked
+ int can_update = 1;
+ if (in->is_frozen()) {
+ can_update = -1;
+ } else if (!force_wrlock && !in->filelock.can_wrlock(in->get_loner())) {
+ // lock?
+ if (in->filelock.is_stable()) {
+ if (in->get_target_loner() >= 0)
+ file_excl(&in->filelock);
+ else
+ simple_lock(&in->filelock);
+ }
+ if (!in->filelock.can_wrlock(in->get_loner()))
+ can_update = -2;
+ }
+
+ // cap clientwriteable marks are only touched when the update will go ahead
+ calc_new_client_ranges(in, std::max(new_max_size, size), can_update > 0,
+ &new_ranges, &max_increased);
+
+ if (max_increased || latest->client_ranges != new_ranges)
+ update_max = true;
+
+ if (!update_size && !update_max) {
+ dout(20) << "check_inode_max_size no-op on " << *in << dendl;
+ return false;
+ }
+
+ dout(10) << "check_inode_max_size new_ranges " << new_ranges
+ << " update_size " << update_size
+ << " on " << *in << dendl;
+
+ // cannot update right now: queue a retry on the matching wait condition
+ if (can_update < 0) {
+ auto cms = new C_MDL_CheckMaxSize(this, in, new_max_size, new_size, new_mtime);
+ if (can_update == -1) {
+ dout(10) << "check_inode_max_size frozen, waiting on " << *in << dendl;
+ in->add_waiter(CInode::WAIT_UNFREEZE, cms);
+ } else {
+ in->filelock.add_waiter(SimpleLock::WAIT_STABLE, cms);
+ dout(10) << "check_inode_max_size can't wrlock, waiting on " << *in << dendl;
+ }
+ return false;
+ }
+
+ MutationRef mut(new MutationImpl());
+ mut->ls = mds->mdlog->get_current_segment();
+
+ // project a new inode version and apply the changes to it
+ auto &pi = in->project_inode();
+ pi.inode.version = in->pre_dirty();
+
+ if (update_max) {
+ dout(10) << "check_inode_max_size client_ranges " << pi.inode.client_ranges << " -> " << new_ranges << dendl;
+ pi.inode.client_ranges = new_ranges;
+ }
+
+ if (update_size) {
+ dout(10) << "check_inode_max_size size " << pi.inode.size << " -> " << new_size << dendl;
+ pi.inode.size = new_size;
+ pi.inode.rstat.rbytes = new_size;
+ dout(10) << "check_inode_max_size mtime " << pi.inode.mtime << " -> " << new_mtime << dendl;
+ pi.inode.mtime = new_mtime;
+ // ctime and rctime are only ever pushed forward
+ if (new_mtime > pi.inode.ctime) {
+ pi.inode.ctime = new_mtime;
+ if (new_mtime > pi.inode.rstat.rctime)
+ pi.inode.rstat.rctime = new_mtime;
+ }
+ }
+
+ // use EOpen if the file is still open; otherwise, use EUpdate.
+ // this is just an optimization to push open files forward into
+ // newer log segments.
+ LogEvent *le;
+ EMetaBlob *metablob;
+ if (in->is_any_caps_wanted() && in->last == CEPH_NOSNAP) {
+ EOpen *eo = new EOpen(mds->mdlog);
+ eo->add_ino(in->ino());
+ metablob = &eo->metablob;
+ le = eo;
+ } else {
+ EUpdate *eu = new EUpdate(mds->mdlog, "check_inode_max_size");
+ metablob = &eu->metablob;
+ le = eu;
+ }
+ mds->mdlog->start_entry(le);
+ if (update_size) { // FIXME if/when we do max_size nested accounting
+ mdcache->predirty_journal_parents(mut, metablob, in, 0, PREDIRTY_PRIMARY);
+ // no cow, here!
+ CDentry *parent = in->get_projected_parent_dn();
+ metablob->add_primary_dentry(parent, in, true);
+ } else {
+ metablob->add_dir_context(in->get_projected_parent_dn()->get_dir());
+ mdcache->journal_dirty_inode(mut.get(), metablob, in);
+ }
+ // file_update_finish() runs with UPDATE_SHAREMAX and no client ack
+ mds->mdlog->submit_entry(le, new C_Locker_FileUpdate_finish(this, in, mut,
+ UPDATE_SHAREMAX, MClientCaps::ref()));
+ wrlock_force(&in->filelock, mut); // wrlock for duration of journal
+ mut->auth_pin(in);
+
+ // make max_size _increase_ timely
+ if (max_increased)
+ mds->mdlog->flush();
+
+ return true;
+}
+
+
+// Send a GRANT cap message (carrying the current inode state) to clients of
+// in that have FILE_WR or FILE_BUFFER pending. Clients without those caps
+// are not told: file_max does not matter to them, and they will get it
+// if/when they get the cap later. Suppressed caps are skipped.
+// With only_cap set, the walk starts at that client's cap and stops after
+// the first cap that is not skipped as suppressed.
+void Locker::share_inode_max_size(CInode *in, Capability *only_cap)
+{
+  dout(10) << "share_inode_max_size on " << *in << dendl;
+
+  auto it = only_cap ? in->client_caps.find(only_cap->get_client())
+                     : in->client_caps.begin();
+  for (; it != in->client_caps.end(); ++it) {
+    const client_t client = it->first;
+    Capability *cap = &it->second;
+    if (cap->is_suppress())
+      continue;
+
+    const bool has_write_caps =
+      cap->pending() & (CEPH_CAP_FILE_WR|CEPH_CAP_FILE_BUFFER);
+    if (has_write_caps) {
+      dout(10) << "share_inode_max_size with client." << client << dendl;
+      cap->inc_last_seq();
+      auto msg = MClientCaps::create(CEPH_CAP_OP_GRANT,
+                                     in->ino(),
+                                     in->find_snaprealm()->inode->ino(),
+                                     cap->get_cap_id(),
+                                     cap->get_last_seq(),
+                                     cap->pending(),
+                                     cap->wanted(), 0,
+                                     cap->get_mseq(),
+                                     mds->get_osd_epoch_barrier());
+      in->encode_cap_message(msg, cap);
+      mds->send_message_client_counted(msg, client);
+    }
+
+    if (only_cap)
+      break;
+  }
+}
+
+/**
+ * Decide whether the MDS log should be flushed on behalf of a client.
+ *
+ * Returns true if any cap bit in @wanted belongs to a lock class (file,
+ * auth, link, xattr) whose lock on @in reports is_unstable_and_locked(),
+ * i.e. the caps are wanted but the lock is held up by pending mutations.
+ */
+bool Locker::_need_flush_mdlog(CInode *in, int wanted)
+{
+  const int file_bits = CEPH_CAP_FILE_RD|CEPH_CAP_FILE_WR|
+                        CEPH_CAP_FILE_SHARED|CEPH_CAP_FILE_EXCL;
+  if ((wanted & file_bits) && in->filelock.is_unstable_and_locked())
+    return true;
+
+  const int auth_bits = CEPH_CAP_AUTH_SHARED|CEPH_CAP_AUTH_EXCL;
+  if ((wanted & auth_bits) && in->authlock.is_unstable_and_locked())
+    return true;
+
+  const int link_bits = CEPH_CAP_LINK_SHARED|CEPH_CAP_LINK_EXCL;
+  if ((wanted & link_bits) && in->linklock.is_unstable_and_locked())
+    return true;
+
+  const int xattr_bits = CEPH_CAP_XATTR_SHARED|CEPH_CAP_XATTR_EXCL;
+  if ((wanted & xattr_bits) && in->xattrlock.is_unstable_and_locked())
+    return true;
+
+  return false;
+}
+
+/**
+ * Apply a client's new "wanted" cap mask to @cap.
+ *
+ * If @issue_seq matches the cap's last issue seq, the mask is replaced.
+ * On a seq mismatch the mask may only grow (new bits are OR'ed in); a
+ * mismatched request that adds nothing is ignored entirely.
+ *
+ * After a change: non-auth inodes forward the request via
+ * request_inode_file_caps(); on auth inodes with a non-empty wanted mask,
+ * a recovering inode wanted for read/write is prioritized in the recovery
+ * queue, and an EOpen is journaled when the open file table asks for it.
+ */
+void Locker::adjust_cap_wanted(Capability *cap, int wanted, int issue_seq)
+{
+  const bool seq_matches = ceph_seq_cmp(issue_seq, cap->get_last_issue()) == 0;
+  const bool adds_caps = !seq_matches && (wanted & ~cap->wanted());
+
+  if (seq_matches) {
+    dout(10) << " wanted " << ccap_string(cap->wanted())
+             << " -> " << ccap_string(wanted) << dendl;
+    cap->set_wanted(wanted);
+  } else if (adds_caps) {
+    dout(10) << " wanted " << ccap_string(cap->wanted())
+             << " -> " << ccap_string(wanted)
+             << " (added caps even though we had seq mismatch!)" << dendl;
+    cap->set_wanted(wanted | cap->wanted());
+  } else {
+    dout(10) << " NOT changing wanted " << ccap_string(cap->wanted())
+             << " -> " << ccap_string(wanted)
+             << " (issue_seq " << issue_seq << " != last_issue "
+             << cap->get_last_issue() << ")" << dendl;
+    return;
+  }
+
+  CInode *cur = cap->get_inode();
+  if (!cur->is_auth()) {
+    // replica: pass the request along instead of acting on it locally
+    request_inode_file_caps(cur);
+    return;
+  }
+
+  if (!cap->wanted())
+    return;
+
+  if (cur->state_test(CInode::STATE_RECOVERING) &&
+      (cap->wanted() & (CEPH_CAP_FILE_RD | CEPH_CAP_FILE_WR))) {
+    mds->mdcache->recovery_queue.prioritize(cur);
+  }
+
+  if (mdcache->open_file_table.should_log_open(cur)) {
+    ceph_assert(cur->last == CEPH_NOSNAP);
+    EOpen *open_ev = new EOpen(mds->mdlog);
+    mds->mdlog->start_entry(open_ev);
+    open_ev->add_clean_inode(cur);
+    mds->mdlog->submit_entry(open_ev);
+  }
+}
+
+/**
+ * Nudge the head inode's locks on behalf of a snapped inode @in that still
+ * has client snap caps outstanding.
+ *
+ * Picks a lock on the head inode that is stable and not in LOCK_SYNC
+ * (preferring the file lock, then the first match in cinode_lock_info) and
+ * kicks it with _rdlock_kick().  If no such lock exists, @in is put back on
+ * need_snapflush_inodes.
+ */
+void Locker::snapflush_nudge(CInode *in)
+{
+  ceph_assert(in->last != CEPH_NOSNAP);
+  if (in->client_snap_caps.empty())
+    return;
+
+  // head inode gets unpinned when snapflush starts. It might get trimmed
+  // before snapflush finishes.
+  CInode *head = mdcache->get_inode(in->ino());
+  if (!head)
+    return;
+
+  ceph_assert(head->is_auth());
+  if (head->client_need_snapflush.empty())
+    return;
+
+  // a lock is worth kicking only if it is stable and not already in SYNC
+  auto kickable = [](SimpleLock *l) {
+    return l->get_state() != LOCK_SYNC && l->is_stable();
+  };
+
+  SimpleLock *target = head->get_lock(CEPH_LOCK_IFILE);
+  if (!kickable(target)) {
+    target = nullptr;
+    for (int idx = 0; idx < num_cinode_locks && !target; ++idx) {
+      SimpleLock *candidate = head->get_lock(cinode_lock_info[idx].lock);
+      if (kickable(candidate))
+        target = candidate;
+    }
+  }
+
+  if (target) {
+    _rdlock_kick(target, true);
+    return;
+  }
+
+  // also, requeue, in case of unstable lock
+  need_snapflush_inodes.push_back(&in->item_caps);
+}
+
+/**
+ * Queue a snapped inode on need_snapflush_inodes (once) and stamp it with
+ * the current time in last_dirstat_prop.  No-op if it is already on a list.
+ */
+void Locker::mark_need_snapflush_inode(CInode *in)
+{
+  ceph_assert(in->last != CEPH_NOSNAP);
+  if (in->item_caps.is_on_list())
+    return;
+
+  need_snapflush_inodes.push_back(&in->item_caps);
+  utime_t queued_at = ceph_clock_now();
+  in->last_dirstat_prop = queued_at;
+  dout(10) << "mark_need_snapflush_inode " << *in << " - added at " << queued_at << dendl;
+}
+
+/**
+ * True iff revoking_caps_by_client has a non-empty entry for @client.
+ */
+bool Locker::is_revoking_any_caps_from(client_t client)
+{
+  auto entry = revoking_caps_by_client.find(client);
+  return entry != revoking_caps_by_client.end() && !entry->second.empty();
+}
+
+/**
+ * Synthesize empty ("null") snapflushes for @client on @head_in.
+ *
+ * Walks head_in->client_need_snapflush for snapids below @last; for each
+ * one that lists @client, records a snap update with no dirty caps on the
+ * matching snapped inode and drops the client from the need-snapflush set.
+ */
+void Locker::_do_null_snapflush(CInode *head_in, client_t client, snapid_t last)
+{
+  dout(10) << "_do_null_snapflush client." << client << " on " << *head_in << dendl;
+  auto p = head_in->client_need_snapflush.begin();
+  while (p != head_in->client_need_snapflush.end() && p->first < last) {
+    const snapid_t snapid = p->first;
+    auto &waiting_clients = p->second;
+    // step past this entry before the body runs; remove_need_snapflush()
+    // below is handed this snapid, so don't rely on the entry afterwards
+    ++p;
+
+    if (!waiting_clients.count(client))
+      continue;
+
+    dout(10) << " doing async NULL snapflush on " << snapid << " from client." << client << dendl;
+    CInode *sin = mdcache->pick_inode_snap(head_in, snapid - 1);
+    ceph_assert(sin);
+    ceph_assert(sin->first <= snapid);
+    _do_snap_update(sin, snapid, 0, sin->first - 1, client, MClientCaps::ref(), MClientCaps::ref());
+    head_in->remove_need_snapflush(sin, snapid, client);
+  }
+}
+
+
+/**
+ * Should a client cap message on @in be deferred because of freezing?
+ *
+ * This policy needs to be AT LEAST as permissive as allowing a client
+ * request to go forward.  Otherwise a client request can release something,
+ * the release gets deferred, but the request gets processed and deadlocks
+ * because the caps can't get revoked.
+ *
+ * Currently, a request waits if anything locked is freezing (can't
+ * auth_pin), which avoids any deadlock with cap release.  Thus @in _MUST_
+ * be in the lock/auth_pin set.
+ *
+ * auth_pins == 0 implies no unstable lock and not auth pinned by a client
+ * request; otherwise we continue even if it's freezing.
+ */
+bool Locker::should_defer_client_cap_frozen(CInode *in)
+{
+  const bool freezing_unpinned = in->is_freezing() && in->get_num_auth_pins() == 0;
+  return freezing_unpinned || in->is_frozen();
+}
+
+/**
+ * Process an MClientCaps message from a client.
+ *
+ * Stages, in order:
+ *  - if the MDS is not in clientreplay/active/stopping, record reconnected
+ *    dirty caps where applicable and retry the message after replay;
+ *  - if the flush tid was already completed, re-send the ack right away
+ *    (a FLUSHSNAP ends there; other ops continue as a plain UPDATE);
+ *  - trim the session's completed-flush list, warning when the client does
+ *    not advance its oldest_flush_tid;
+ *  - look up the head inode and the client's cap, honour the OSD epoch
+ *    barrier, and defer/drop on freezing or stale mseq;
+ *  - FLUSHSNAP: apply the snap update to the matching (snapped) inode;
+ *  - otherwise: confirm receipt, infer skipped snapflushes, adjust wanted,
+ *    apply the cap update and re-evaluate/issue caps, flushing the log if
+ *    that is needed to make progress for the client.
+ */
+void Locker::handle_client_caps(const MClientCaps::const_ref &m)
+{
+  client_t client = m->get_source().num();
+  snapid_t follows = m->get_snap_follows();
+  auto op = m->get_op();
+  auto dirty = m->get_dirty();
+  dout(7) << "handle_client_caps "
+          << " on " << m->get_ino()
+          << " tid " << m->get_client_tid() << " follows " << follows
+          << " op " << ceph_cap_op_name(op)
+          << " flags 0x" << std::hex << m->flags << std::dec << dendl;
+
+  Session *session = mds->get_session(m);
+  const bool serving = mds->is_clientreplay() || mds->is_active() || mds->is_stopping();
+  if (!serving) {
+    if (!session) {
+      dout(5) << " no session, dropping " << *m << dendl;
+      return;
+    }
+    if (session->is_closed() ||
+        session->is_closing() ||
+        session->is_killing()) {
+      dout(7) << " session closed|closing|killing, dropping " << *m << dendl;
+      return;
+    }
+    const bool reconnecting =
+      mds->is_reconnect() || mds->get_want_state() == MDSMap::STATE_RECONNECT;
+    if (reconnecting &&
+        dirty && m->get_client_tid() > 0 &&
+        !session->have_completed_flush(m->get_client_tid())) {
+      mdcache->set_reconnected_dirty_caps(client, m->get_ino(), dirty,
+                                          op == CEPH_CAP_OP_FLUSHSNAP);
+    }
+    mds->wait_for_replay(new C_MDS_RetryMessage(mds, m));
+    return;
+  }
+
+  // duplicate of a flush we have already completed?
+  if (m->get_client_tid() > 0 && session &&
+      session->have_completed_flush(m->get_client_tid())) {
+    dout(7) << "handle_client_caps already flushed tid " << m->get_client_tid()
+            << " for client." << client << dendl;
+    const bool dup_flushsnap = (op == CEPH_CAP_OP_FLUSHSNAP);
+    MClientCaps::ref ack;
+    if (dup_flushsnap) {
+      ack = MClientCaps::create(CEPH_CAP_OP_FLUSHSNAP_ACK, m->get_ino(), 0, 0, 0, 0, 0, dirty, 0, mds->get_osd_epoch_barrier());
+    } else {
+      ack = MClientCaps::create(CEPH_CAP_OP_FLUSH_ACK, m->get_ino(), 0, m->get_cap_id(), m->get_seq(), m->get_caps(), 0, dirty, 0, mds->get_osd_epoch_barrier());
+    }
+    ack->set_snap_follows(follows);
+    ack->set_client_tid(m->get_client_tid());
+    mds->send_message_client_counted(ack, m->get_connection());
+    if (dup_flushsnap)
+      return;
+
+    // fall-thru because the message may release some caps
+    dirty = 0;
+    op = CEPH_CAP_OP_UPDATE;
+  }
+
+  // "oldest flush tid" > 0 means client uses unique TID for each flush
+  if (m->get_oldest_flush_tid() > 0 && session) {
+    if (session->trim_completed_flushes(m->get_oldest_flush_tid())) {
+      mds->mdlog->get_current_segment()->touched_sessions.insert(session->info.inst.name);
+
+      if (session->get_num_trim_flushes_warnings() > 0 &&
+          session->get_num_completed_flushes() * 2 < g_conf()->mds_max_completed_flushes)
+        session->reset_num_trim_flushes_warnings();
+    } else if (session->get_num_completed_flushes() >=
+               (g_conf()->mds_max_completed_flushes << session->get_num_trim_flushes_warnings())) {
+      session->inc_num_trim_flushes_warnings();
+      stringstream ss;
+      ss << "client." << session->get_client() << " does not advance its oldest_flush_tid ("
+         << m->get_oldest_flush_tid() << "), "
+         << session->get_num_completed_flushes()
+         << " completed flushes recorded in session";
+      mds->clog->warn() << ss.str();
+      dout(20) << __func__ << " " << ss.str() << dendl;
+    }
+  }
+
+  CInode *head_in = mdcache->get_inode(m->get_ino());
+  if (!head_in) {
+    if (mds->is_clientreplay()) {
+      dout(7) << "handle_client_caps on unknown ino " << m->get_ino()
+              << ", will try again after replayed client requests" << dendl;
+      mdcache->wait_replay_cap_reconnect(m->get_ino(), new C_MDS_RetryMessage(mds, m));
+      return;
+    }
+
+    /*
+     * "handle_client_caps on unknown ino xxx" is normal after migrating a
+     * subtree.  The sequence of events that causes this is:
+     * - client sends caps message to mds.a
+     * - mds finishes subtree migration, sends cap export to client
+     * - mds trims its cache
+     * - mds receives cap messages from client
+     */
+    dout(7) << "handle_client_caps on unknown ino " << m->get_ino() << ", dropping" << dendl;
+    return;
+  }
+
+  if (m->osd_epoch_barrier && !mds->objecter->have_map(m->osd_epoch_barrier)) {
+    // Pause RADOS operations until we see the required epoch
+    mds->objecter->set_epoch_barrier(m->osd_epoch_barrier);
+  }
+
+  if (mds->get_osd_epoch_barrier() < m->osd_epoch_barrier) {
+    // Record the barrier so that we will retransmit it to clients
+    mds->set_osd_epoch_barrier(m->osd_epoch_barrier);
+  }
+
+  dout(10) << " head inode " << *head_in << dendl;
+
+  Capability *cap = head_in->get_client_cap(client);
+  if (!cap) {
+    dout(7) << "handle_client_caps no cap for client." << client << " on " << *head_in << dendl;
+    return;
+  }
+
+  // freezing|frozen?
+  if (should_defer_client_cap_frozen(head_in)) {
+    dout(7) << "handle_client_caps freezing|frozen on " << *head_in << dendl;
+    head_in->add_waiter(CInode::WAIT_UNFREEZE, new C_MDS_RetryMessage(mds, m));
+    return;
+  }
+  if (ceph_seq_cmp(m->get_mseq(), cap->get_mseq()) < 0) {
+    dout(7) << "handle_client_caps mseq " << m->get_mseq() << " < " << cap->get_mseq()
+            << ", dropping" << dendl;
+    return;
+  }
+
+  // From here on the head inode may get auth pinned; every exit path below
+  // goes through release_pin() so the pin is dropped exactly once.
+  bool need_unpin = false;
+  auto release_pin = [&]() {
+    if (need_unpin)
+      head_in->auth_unpin(this);
+  };
+
+  // flushsnap?
+  if (op == CEPH_CAP_OP_FLUSHSNAP) {
+    if (!head_in->is_auth()) {
+      dout(7) << " not auth, ignoring flushsnap on " << *head_in << dendl;
+      release_pin();
+      return;
+    }
+
+    SnapRealm *realm = head_in->find_snaprealm();
+    snapid_t snap = realm->get_snap_following(follows);
+    dout(10) << " flushsnap follows " << follows << " -> snap " << snap << dendl;
+
+    auto oldest = head_in->client_need_snapflush.begin();
+    if (oldest != head_in->client_need_snapflush.end() && oldest->first < snap) {
+      head_in->auth_pin(this); // prevent subtree frozen
+      need_unpin = true;
+      _do_null_snapflush(head_in, client, snap);
+    }
+
+    CInode *in = head_in;
+    if (snap != CEPH_NOSNAP) {
+      in = mdcache->pick_inode_snap(head_in, snap - 1);
+      if (in != head_in)
+        dout(10) << " snapped inode " << *in << dendl;
+    }
+
+    // we can prepare the ack now, since this FLUSHEDSNAP is independent of any
+    // other cap ops. (except possibly duplicate FLUSHSNAP requests, but worst
+    // case we get a dup response, so whatever.)
+    MClientCaps::ref ack;
+    if (dirty) {
+      ack = MClientCaps::create(CEPH_CAP_OP_FLUSHSNAP_ACK, in->ino(), 0, 0, 0, 0, 0, dirty, 0, mds->get_osd_epoch_barrier());
+      ack->set_snap_follows(follows);
+      ack->set_client_tid(m->get_client_tid());
+      ack->set_oldest_flush_tid(m->get_oldest_flush_tid());
+    }
+
+    const bool expected =
+      in == head_in ||
+      (head_in->client_need_snapflush.count(snap) &&
+       head_in->client_need_snapflush[snap].count(client));
+    if (expected) {
+      dout(7) << " flushsnap snap " << snap
+              << " client." << client << " on " << *in << dendl;
+
+      // this cap now follows a later snap (i.e. the one initiating this flush, or later)
+      if (in == head_in)
+        cap->client_follows = snap < CEPH_NOSNAP ? snap : realm->get_newest_seq();
+
+      _do_snap_update(in, snap, dirty, follows, client, m, ack);
+
+      if (in != head_in)
+        head_in->remove_need_snapflush(in, snap, client);
+    } else {
+      dout(7) << " not expecting flushsnap " << snap << " from client." << client << " on " << *in << dendl;
+      if (ack)
+        mds->send_message_client_counted(ack, m->get_connection());
+    }
+    release_pin();
+    return;
+  }
+
+  if (cap->get_cap_id() != m->get_cap_id()) {
+    dout(7) << " ignoring client capid " << m->get_cap_id() << " != my " << cap->get_cap_id() << dendl;
+    release_pin();
+    return;
+  }
+
+  // intermediate snap inodes: walk from the inode covering 'follows' up to
+  // the head, applying dirty state to each snapped inode we are auth for
+  CInode *in = head_in;
+  if (follows > 0) {
+    for (in = mdcache->pick_inode_snap(head_in, follows);
+         in != head_in;
+         in = mdcache->pick_inode_snap(head_in, in->last)) {
+      ceph_assert(in->last != CEPH_NOSNAP);
+      if (in->is_auth() && dirty) {
+        dout(10) << " updating intermediate snapped inode " << *in << dendl;
+        _do_cap_update(in, NULL, dirty, follows, m, MClientCaps::ref());
+      }
+    }
+  }
+
+  // head inode, and cap
+  MClientCaps::ref ack;
+
+  int retained = m->get_caps();
+  if (retained & ~cap->issued()) {
+    dout(10) << " confirming not issued caps " << ccap_string(retained & ~cap->issued()) << dendl;
+    retained &= cap->issued();
+  }
+
+  cap->confirm_receipt(m->get_seq(), retained);
+  dout(10) << " follows " << follows
+           << " retains " << ccap_string(m->get_caps())
+           << " dirty " << ccap_string(dirty)
+           << " on " << *in << dendl;
+
+  // missing/skipped snapflush?
+  // The client MAY send a snapflush if it is issued WR/EXCL caps, but
+  // presently only does so when it has actual dirty metadata. But, we
+  // set up the need_snapflush stuff based on the issued caps.
+  // We can infer that the client WONT send a FLUSHSNAP once they have
+  // released all WR/EXCL caps (the FLUSHSNAP always comes before the cap
+  // update/release).
+  const bool capsnap_pending = m->flags & MClientCaps::FLAG_PENDING_CAPSNAP;
+  if (!head_in->client_need_snapflush.empty()) {
+    if (!(cap->issued() & CEPH_CAP_ANY_FILE_WR) && !capsnap_pending) {
+      head_in->auth_pin(this); // prevent subtree frozen
+      need_unpin = true;
+      _do_null_snapflush(head_in, client);
+    } else {
+      dout(10) << " revocation in progress, not making any conclusions about null snapflushes" << dendl;
+    }
+  }
+  if (cap->need_snapflush() && !capsnap_pending)
+    cap->clear_needsnapflush();
+
+  if (dirty && in->is_auth()) {
+    dout(7) << " flush client." << client << " dirty " << ccap_string(dirty)
+            << " seq " << m->get_seq() << " on " << *in << dendl;
+    ack = MClientCaps::create(CEPH_CAP_OP_FLUSH_ACK, in->ino(), 0, cap->get_cap_id(), m->get_seq(),
+                              m->get_caps(), 0, dirty, 0, mds->get_osd_epoch_barrier());
+    ack->set_client_tid(m->get_client_tid());
+    ack->set_oldest_flush_tid(m->get_oldest_flush_tid());
+  }
+
+  // filter wanted based on what we could ever give out (given auth/replica status)
+  bool need_flush = m->flags & MClientCaps::FLAG_SYNC;
+  int wanted_now = m->get_wanted();
+  if (wanted_now != cap->wanted()) {
+    if (!need_flush && in->is_auth() && (wanted_now & ~cap->pending())) {
+      // expanding caps. make sure we aren't waiting for a log flush
+      need_flush = _need_flush_mdlog(head_in, wanted_now & ~cap->pending());
+    }
+
+    adjust_cap_wanted(cap, wanted_now, m->get_issue_seq());
+  }
+
+  if (in->is_auth() &&
+      _do_cap_update(in, cap, dirty, follows, m, ack, &need_flush)) {
+    // updated
+    eval(in, CEPH_CAP_LOCKS);
+
+    if (!need_flush && (cap->wanted() & ~cap->pending()))
+      need_flush = _need_flush_mdlog(in, cap->wanted() & ~cap->pending());
+  } else {
+    // no update, ack now.
+    if (ack)
+      mds->send_message_client_counted(ack, m->get_connection());
+
+    bool did_issue = eval(in, CEPH_CAP_LOCKS);
+    if (!did_issue && (cap->wanted() & ~cap->pending()))
+      issue_caps(in, cap);
+
+    if (cap->get_last_seq() == 0 &&
+        (cap->pending() & (CEPH_CAP_FILE_WR|CEPH_CAP_FILE_BUFFER))) {
+      share_inode_max_size(in, cap);
+    }
+  }
+
+  if (need_flush)
+    mds->mdlog->flush();
+
+  release_pin();
+}
+
+
+/**
+ * Waiter context that re-runs Locker::process_request_cap_release() for a
+ * saved (client, release item) pair, with no MDRequest and an empty dname.
+ */
+class C_Locker_RetryRequestCapRelease : public LockerContext {
+  client_t client;
+  ceph_mds_request_release item;
+public:
+  C_Locker_RetryRequestCapRelease(Locker *l, client_t c, const ceph_mds_request_release& it)
+    : LockerContext(l),
+      client(c),
+      item(it)
+  {}
+
+  void finish(int r) override {
+    MDRequestRef no_mdr;   // the retry is not tied to a request
+    string no_dname;       // empty: no dentry lease handling on retry
+    locker->process_request_cap_release(no_mdr, client, item, no_dname);
+  }
+};
+
+/**
+ * Apply a cap release that was embedded in a client request.
+ *
+ * If @dname is non-empty, the client's lease on that dentry (if any) is
+ * removed first.  Then the release is checked against the client's cap on
+ * the inode (stale mseq and mismatched cap_id are dropped; a freezing/frozen
+ * inode defers the release via a waiter), receipt is confirmed, skipped
+ * snapflushes are inferred, "wanted" is adjusted and the inode's cap locks
+ * are re-evaluated.
+ *
+ * @param mdr     originating request; may be null for a deferred retry.
+ *                When set, the cap is suppressed during eval and the cap's
+ *                last seq is recorded in mdr->cap_releases.
+ * @param client  releasing client
+ * @param item    release record from the request
+ * @param dname   dentry name whose lease is released, or empty
+ */
+void Locker::process_request_cap_release(MDRequestRef& mdr, client_t client, const ceph_mds_request_release& item,
+                                         std::string_view dname)
+{
+  inodeno_t ino = (uint64_t)item.ino;
+  uint64_t cap_id = item.cap_id;
+  int caps = item.caps;
+  int wanted = item.wanted;
+  int seq = item.seq;
+  int issue_seq = item.issue_seq;
+  int mseq = item.mseq;
+
+  CInode *in = mdcache->get_inode(ino);
+  if (!in)
+    return;
+
+  // dentry lease release, if a name came along with the item
+  if (dname.length()) {
+    CDir *dir = in->get_dirfrag(in->pick_dirfrag(dname));
+    CDentry *dn = dir ? dir->lookup(dname) : nullptr;
+    if (dn) {
+      if (ClientLease *lease = dn->get_client_lease(client)) {
+        dout(10) << __func__ << " removing lease on " << *dn << dendl;
+        dn->remove_client_lease(lease, this);
+      } else {
+        dout(7) << __func__ << " client." << client
+                << " doesn't have lease on " << *dn << dendl;
+      }
+    } else if (dir) {
+      dout(7) << __func__ << " client." << client << " released lease on dn "
+              << dir->dirfrag() << "/" << dname << " which dne" << dendl;
+    }
+  }
+
+  Capability *cap = in->get_client_cap(client);
+  if (!cap)
+    return;
+
+  dout(10) << __func__ << " client." << client << " " << ccap_string(caps) << " on " << *in
+           << (mdr ? "" : " (DEFERRED, no mdr)")
+           << dendl;
+
+  if (ceph_seq_cmp(mseq, cap->get_mseq()) < 0) {
+    dout(7) << " mseq " << mseq << " < " << cap->get_mseq() << ", dropping" << dendl;
+    return;
+  }
+
+  if (cap->get_cap_id() != cap_id) {
+    dout(7) << " cap_id " << cap_id << " != " << cap->get_cap_id() << ", dropping" << dendl;
+    return;
+  }
+
+  if (should_defer_client_cap_frozen(in)) {
+    dout(7) << " frozen, deferring" << dendl;
+    in->add_waiter(CInode::WAIT_UNFREEZE, new C_Locker_RetryRequestCapRelease(this, client, item));
+    return;
+  }
+
+  // never confirm more than what was actually issued
+  if (caps & ~cap->issued()) {
+    dout(10) << " confirming not issued caps " << ccap_string(caps & ~cap->issued()) << dendl;
+    caps &= cap->issued();
+  }
+  cap->confirm_receipt(seq, caps);
+
+  if (!in->client_need_snapflush.empty() &&
+      (cap->issued() & CEPH_CAP_ANY_FILE_WR) == 0) {
+    _do_null_snapflush(in, client);
+  }
+
+  adjust_cap_wanted(cap, wanted, issue_seq);
+
+  if (mdr) {
+    cap->inc_suppress();
+    eval(in, CEPH_CAP_LOCKS);
+    cap->dec_suppress();
+
+    // take note; we may need to reissue on this cap later
+    mdr->cap_releases[in->vino()] = cap->get_last_seq();
+  } else {
+    eval(in, CEPH_CAP_LOCKS);
+  }
+}
+
+/**
+ * Waiter context that retries Locker::kick_issue_caps() later.  The inode
+ * is held with PIN_PTRWAITER from construction until finish() has run.
+ */
+class C_Locker_RetryKickIssueCaps : public LockerContext {
+  CInode *in;
+  client_t client;
+  ceph_seq_t seq;
+public:
+  C_Locker_RetryKickIssueCaps(Locker *l, CInode *i, client_t c, ceph_seq_t s)
+    : LockerContext(l),
+      in(i),
+      client(c),
+      seq(s)
+  {
+    in->get(CInode::PIN_PTRWAITER);
+  }
+
+  void finish(int r) override {
+    locker->kick_issue_caps(in, client, seq);
+    in->put(CInode::PIN_PTRWAITER);
+  }
+};
+
+/**
+ * Reissue caps to @client on @in if the client's cap is still at last
+ * seq @seq.  If the inode is frozen, retry once it unfreezes.
+ */
+void Locker::kick_issue_caps(CInode *in, client_t client, ceph_seq_t seq)
+{
+  Capability *cap = in->get_client_cap(client);
+  if (!cap)
+    return;
+  if (cap->get_last_seq() != seq)
+    return;
+
+  if (in->is_frozen()) {
+    dout(10) << "kick_issue_caps waiting for unfreeze on " << *in << dendl;
+    auto *retry = new C_Locker_RetryKickIssueCaps(this, in, client, seq);
+    in->add_waiter(CInode::WAIT_UNFREEZE, retry);
+    return;
+  }
+
+  dout(10) << "kick_issue_caps released at current seq " << seq
+           << ", reissuing" << dendl;
+  issue_caps(in, cap);
+}
+
+/**
+ * For every (inode, seq) recorded in mdr->cap_releases, call
+ * kick_issue_caps() for the request's client if the inode is still cached.
+ */
+void Locker::kick_cap_releases(MDRequestRef& mdr)
+{
+  client_t client = mdr->get_client();
+  for (const auto &[vino, released_seq] : mdr->cap_releases) {
+    if (CInode *in = mdcache->get_inode(vino))
+      kick_issue_caps(in, client, released_seq);
+  }
+}
+
+/**
+ * Journal a snapflush for @snap on inode @in.
+ *
+ * The update is written either into the matching old_inode (for a
+ * multiversion inode) or into a freshly projected inode.  The client's
+ * client_range is dropped or re-pointed at @snap, and an EUpdate
+ * ("snap flush") is submitted with a C_Locker_FileUpdate_finish completion.
+ *
+ * m and ack might be NULL, so don't dereference them unless dirty != 0
+ */
+void Locker::_do_snap_update(CInode *in, snapid_t snap, int dirty, snapid_t follows, client_t client, const MClientCaps::const_ref &m, const MClientCaps::ref &ack)
+{
+  dout(10) << "_do_snap_update dirty " << ccap_string(dirty)
+           << " follows " << follows << " snap " << snap
+           << " on " << *in << dendl;
+
+  if (snap == CEPH_NOSNAP) {
+    // hmm, i guess snap was already deleted? just ack!
+    dout(10) << " wow, the snap following " << follows
+             << " was already deleted. nothing to record, just ack." << dendl;
+    if (ack)
+      mds->send_message_client_counted(ack, m->get_connection());
+    return;
+  }
+
+  EUpdate *ev = new EUpdate(mds->mdlog, "snap flush");
+  mds->mdlog->start_entry(ev);
+  MutationRef mut = new MutationImpl();
+  mut->ls = mds->mdlog->get_current_segment();
+
+  // normal metadata updates that we can apply to the head as well.
+
+  // update xattrs?
+  const bool xattrs = (dirty & CEPH_CAP_XATTR_EXCL) &&
+    m->xattrbl.length() &&
+    m->head.xattr_version > in->get_projected_inode()->xattr_version;
+
+  CInode::mempool_old_inode *oi =
+    in->is_multiversion() ? in->pick_old_inode(snap) : nullptr;
+
+  // where the flushed fields (and, if any, the xattrs) get written
+  CInode::mempool_inode *target = nullptr;
+  CInode::mempool_xattr_map *xattr_dst = nullptr;
+  if (oi) {
+    dout(10) << " writing into old inode" << dendl;
+    auto &pi = in->project_inode();
+    pi.inode.version = in->pre_dirty();
+    if (snap > oi->first)
+      in->split_old_inode(snap);
+    target = &oi->inode;
+    if (xattrs)
+      xattr_dst = &oi->xattrs;
+  } else {
+    auto &pi = in->project_inode(xattrs);
+    pi.inode.version = in->pre_dirty();
+    target = &pi.inode;
+    if (xattrs)
+      xattr_dst = pi.xattrs.get();
+  }
+
+  _update_cap_fields(in, dirty, m, target);
+
+  // xattr
+  if (xattrs) {
+    dout(7) << " xattrs v" << target->xattr_version << " -> " << m->head.xattr_version
+            << " len " << m->xattrbl.length() << dendl;
+    target->xattr_version = m->head.xattr_version;
+    auto bp = m->xattrbl.cbegin();
+    decode(*xattr_dst, bp);
+  }
+
+  // the client's writeable range either goes away or moves on to this snap
+  auto range = target->client_ranges.find(client);
+  if (range != target->client_ranges.end()) {
+    if (in->last == snap) {
+      dout(10) << " removing client_range entirely" << dendl;
+      target->client_ranges.erase(range);
+    } else {
+      dout(10) << " client_range now follows " << snap << dendl;
+      range->second.follows = snap;
+    }
+  }
+
+  mut->auth_pin(in);
+  mdcache->predirty_journal_parents(mut, &ev->metablob, in, 0, PREDIRTY_PRIMARY, 0, follows);
+  mdcache->journal_dirty_inode(mut.get(), &ev->metablob, in, follows);
+
+  // "oldest flush tid" > 0 means client uses unique TID for each flush
+  if (ack && ack->get_oldest_flush_tid() > 0) {
+    ev->metablob.add_client_flush(metareqid_t(m->get_source(), ack->get_client_tid()),
+                                  ack->get_oldest_flush_tid());
+  }
+
+  mds->mdlog->submit_entry(ev, new C_Locker_FileUpdate_finish(this, in, mut, UPDATE_SNAPFLUSH,
+                                                              ack, client));
+}
+
+/**
+ * Copy the fields a client flushed in a caps message into an inode struct.
+ *
+ * Does nothing when @dirty is 0.  Otherwise:
+ *  - ctime (and rstat.rctime) only move forward; change_attr only moves
+ *    forward and only if the connection has CEPH_FEATURE_FS_CHANGE_ATTR;
+ *  - FILE_EXCL/FILE_WR: mtime, size (regular files, growth only), inline
+ *    data (regular files, FILE_WR, newer version only), and with FILE_EXCL
+ *    also atime and time_warp_seq;
+ *  - AUTH_EXCL: uid, gid, mode, and btime if the connection has
+ *    CEPH_FEATURE_FS_BTIME.
+ *
+ * @param in     inode the message refers to (used for type checks/logging)
+ * @param dirty  mask of caps the client is flushing
+ * @param m      client message; must be valid if there are dirty caps
+ * @param pi     inode struct that receives the new values
+ */
+void Locker::_update_cap_fields(CInode *in, int dirty, const MClientCaps::const_ref &m, CInode::mempool_inode *pi)
+{
+  if (dirty == 0)
+    return;
+
+  /* m must be valid if there are dirty caps */
+  ceph_assert(m);
+  uint64_t features = m->get_connection()->get_features();
+
+  if (m->get_ctime() > pi->ctime) {
+    dout(7) << " ctime " << pi->ctime << " -> " << m->get_ctime()
+            << " for " << *in << dendl;
+    pi->ctime = m->get_ctime();
+    if (m->get_ctime() > pi->rstat.rctime)
+      pi->rstat.rctime = m->get_ctime();
+  }
+
+  if ((features & CEPH_FEATURE_FS_CHANGE_ATTR) &&
+      m->get_change_attr() > pi->change_attr) {
+    dout(7) << " change_attr " << pi->change_attr << " -> " << m->get_change_attr()
+            << " for " << *in << dendl;
+    pi->change_attr = m->get_change_attr();
+  }
+
+  // file
+  if (dirty & (CEPH_CAP_FILE_EXCL|CEPH_CAP_FILE_WR)) {
+    utime_t atime = m->get_atime();
+    utime_t mtime = m->get_mtime();
+    uint64_t size = m->get_size();
+    version_t inline_version = m->inline_version;
+
+    // a writer may only move mtime forward; an exclusive holder may set it
+    // to anything different
+    if (((dirty & CEPH_CAP_FILE_WR) && mtime > pi->mtime) ||
+        ((dirty & CEPH_CAP_FILE_EXCL) && mtime != pi->mtime)) {
+      dout(7) << " mtime " << pi->mtime << " -> " << mtime
+              << " for " << *in << dendl;
+      pi->mtime = mtime;
+      if (mtime > pi->rstat.rctime)
+        pi->rstat.rctime = mtime;
+    }
+    if (in->inode.is_file() && // ONLY if regular file
+        size > pi->size) {
+      dout(7) << " size " << pi->size << " -> " << size
+              << " for " << *in << dendl;
+      pi->size = size;
+      pi->rstat.rbytes = size;
+    }
+    if (in->inode.is_file() &&
+        (dirty & CEPH_CAP_FILE_WR) &&
+        inline_version > pi->inline_data.version) {
+      pi->inline_data.version = inline_version;
+      if (inline_version != CEPH_INLINE_NONE && m->inline_data.length() > 0)
+        pi->inline_data.get_data() = m->inline_data;
+      else
+        pi->inline_data.free_data();
+    }
+    if ((dirty & CEPH_CAP_FILE_EXCL) && atime != pi->atime) {
+      dout(7) << " atime " << pi->atime << " -> " << atime
+              << " for " << *in << dendl;
+      pi->atime = atime;
+    }
+    if ((dirty & CEPH_CAP_FILE_EXCL) &&
+        ceph_seq_cmp(pi->time_warp_seq, m->get_time_warp_seq()) < 0) {
+      dout(7) << " time_warp_seq " << pi->time_warp_seq << " -> " << m->get_time_warp_seq()
+              << " for " << *in << dendl;
+      pi->time_warp_seq = m->get_time_warp_seq();
+    }
+  }
+  // auth
+  if (dirty & CEPH_CAP_AUTH_EXCL) {
+    if (m->head.uid != pi->uid) {
+      dout(7) << " uid " << pi->uid
+              << " -> " << m->head.uid
+              << " for " << *in << dendl;
+      pi->uid = m->head.uid;
+    }
+    if (m->head.gid != pi->gid) {
+      dout(7) << " gid " << pi->gid
+              << " -> " << m->head.gid
+              << " for " << *in << dendl;
+      pi->gid = m->head.gid;
+    }
+    if (m->head.mode != pi->mode) {
+      // mode bits are conventionally read in octal
+      dout(7) << " mode " << oct << pi->mode
+              << " -> " << m->head.mode << dec
+              << " for " << *in << dendl;
+      pi->mode = m->head.mode;
+    }
+    if ((features & CEPH_FEATURE_FS_BTIME) && m->get_btime() != pi->btime) {
+      // btime is a timestamp: print it in the default (decimal) base like
+      // the other times above, not in octal
+      dout(7) << " btime " << pi->btime
+              << " -> " << m->get_btime()
+              << " for " << *in << dendl;
+      pi->btime = m->get_btime();
+    }
+  }
+}
+
+/*
+ * Update an inode from a client cap message (flush|flushsnap|wanted).
+ *
+ * Works out whether the client's max_size (its client_range) has to grow or
+ * be dropped, nudging the file lock if it does not currently permit the
+ * change; absorbs any file locks carried in the message; and, if there is
+ * anything dirty or max_size changes, journals an EUpdate ("cap update").
+ *
+ * cap may be NULL (e.g. for intermediate snapped inodes).  If need_flush is
+ * non-NULL it is set to true when the caller should flush the log (max_size
+ * increase, or a dirty cap whose lock is unstable and locked).
+ *
+ * Returns true if an update was journaled; otherwise false (no update
+ * needed, or the session failed the write access check).
+ */
+bool Locker::_do_cap_update(CInode *in, Capability *cap,
+                            int dirty, snapid_t follows,
+                            const MClientCaps::const_ref &m, const MClientCaps::ref &ack,
+                            bool *need_flush)
+{
+  dout(10) << "_do_cap_update dirty " << ccap_string(dirty)
+           << " issued " << ccap_string(cap ? cap->issued() : 0)
+           << " wanted " << ccap_string(cap ? cap->wanted() : 0)
+           << " on " << *in << dendl;
+  ceph_assert(in->is_auth());
+  client_t client = m->get_source().num();
+  CInode::mempool_inode *latest = in->get_projected_inode();
+
+  // increase or zero max_size?
+  uint64_t size = m->get_size();
+  bool change_max = false;
+  uint64_t old_max = 0;
+  {
+    auto range = latest->client_ranges.find(client);
+    if (range != latest->client_ranges.end())
+      old_max = range->second.range.last;
+  }
+  uint64_t new_max = old_max;
+
+  // true while the file lock refuses both a normal and a forced wrlock
+  auto wrlock_blocked = [&]() {
+    return !in->filelock.can_wrlock(client) &&
+           !in->filelock.can_force_wrlock(client);
+  };
+
+  if (in->is_file()) {
+    bool forced_change_max = false;
+    dout(20) << "inode is file" << dendl;
+    if (cap && ((cap->issued() | cap->wanted()) & CEPH_CAP_ANY_FILE_WR)) {
+      dout(20) << "client has write caps; m->get_max_size="
+               << m->get_max_size() << "; old_max=" << old_max << dendl;
+      if (m->get_max_size() > new_max) {
+        dout(10) << "client requests file_max " << m->get_max_size()
+                 << " > max " << old_max << dendl;
+        change_max = true;
+        forced_change_max = true;
+        new_max = calc_new_max_size(latest, m->get_max_size());
+      } else {
+        new_max = calc_new_max_size(latest, size);
+
+        if (new_max > old_max)
+          change_max = true;
+        else
+          new_max = old_max;
+      }
+    } else if (old_max) {
+      // no write caps any more: drop the range
+      change_max = true;
+      new_max = 0;
+    }
+
+    if (in->last == CEPH_NOSNAP &&
+        change_max &&
+        wrlock_blocked()) {
+      dout(10) << " i want to change file_max, but lock won't allow it (yet)" << dendl;
+      if (in->filelock.is_stable()) {
+        bool need_issue = false;
+        if (cap)
+          cap->inc_suppress();
+        if (in->get_mds_caps_wanted().empty() &&
+            (in->get_loner() >= 0 || (in->get_wanted_loner() >= 0 && in->try_set_loner()))) {
+          if (in->filelock.get_state() != LOCK_EXCL)
+            file_excl(&in->filelock, &need_issue);
+        } else {
+          simple_lock(&in->filelock, &need_issue);
+        }
+        if (need_issue)
+          issue_caps(in);
+        if (cap)
+          cap->dec_suppress();
+      }
+      if (wrlock_blocked()) {
+        // still not possible: come back once the lock is stable
+        C_MDL_CheckMaxSize *cms = new C_MDL_CheckMaxSize(this, in,
+                                                         forced_change_max ? new_max : 0,
+                                                         0, utime_t());
+
+        in->filelock.add_waiter(SimpleLock::WAIT_STABLE, cms);
+        change_max = false;
+      }
+    }
+  }
+
+  // file locks carried in the message: fcntl locks first, then flock locks
+  if (m->flockbl.length()) {
+    int32_t num_locks;
+    auto bli = m->flockbl.cbegin();
+
+    decode(num_locks, bli);
+    for (int n = 0; n < num_locks; ++n) {
+      ceph_filelock decoded_lock;
+      decode(decoded_lock, bli);
+      in->get_fcntl_lock_state()->held_locks.
+        insert(pair<uint64_t, ceph_filelock>(decoded_lock.start, decoded_lock));
+      ++in->get_fcntl_lock_state()->client_held_lock_counts[(client_t)(decoded_lock.client)];
+    }
+
+    decode(num_locks, bli);
+    for (int n = 0; n < num_locks; ++n) {
+      ceph_filelock decoded_lock;
+      decode(decoded_lock, bli);
+      in->get_flock_lock_state()->held_locks.
+        insert(pair<uint64_t, ceph_filelock>(decoded_lock.start, decoded_lock));
+      ++in->get_flock_lock_state()->client_held_lock_counts[(client_t)(decoded_lock.client)];
+    }
+  }
+
+  if (!dirty && !change_max)
+    return false;
+
+  Session *session = mds->get_session(m);
+  if (session->check_access(in, MAY_WRITE,
+                            m->caller_uid, m->caller_gid, NULL, 0, 0) < 0) {
+    dout(10) << "check_access failed, dropping cap update on " << *in << dendl;
+    return false;
+  }
+
+  // do the update.
+  EUpdate *ev = new EUpdate(mds->mdlog, "cap update");
+  mds->mdlog->start_entry(ev);
+
+  const bool xattr = (dirty & CEPH_CAP_XATTR_EXCL) &&
+    m->xattrbl.length() &&
+    m->head.xattr_version > in->get_projected_inode()->xattr_version;
+
+  auto &pi = in->project_inode(xattr);
+  pi.inode.version = in->pre_dirty();
+
+  MutationRef mut(new MutationImpl());
+  mut->ls = mds->mdlog->get_current_segment();
+
+  _update_cap_fields(in, dirty, m, &pi.inode);
+
+  if (change_max) {
+    dout(7) << " max_size " << old_max << " -> " << new_max
+            << " for " << *in << dendl;
+    if (new_max) {
+      auto &cr = pi.inode.client_ranges[client];
+      cr.range.first = 0;
+      cr.range.last = new_max;
+      cr.follows = in->first - 1;
+      if (cap)
+        cap->mark_clientwriteable();
+    } else {
+      pi.inode.client_ranges.erase(client);
+      if (cap)
+        cap->clear_clientwriteable();
+    }
+  }
+
+  if (change_max || (dirty & (CEPH_CAP_FILE_EXCL|CEPH_CAP_FILE_WR)))
+    wrlock_force(&in->filelock, mut); // wrlock for duration of journal
+
+  // auth
+  if (dirty & CEPH_CAP_AUTH_EXCL)
+    wrlock_force(&in->authlock, mut);
+
+  // xattrs update?
+  if (xattr) {
+    dout(7) << " xattrs v" << pi.inode.xattr_version << " -> " << m->head.xattr_version << dendl;
+    pi.inode.xattr_version = m->head.xattr_version;
+    auto bp = m->xattrbl.cbegin();
+    decode_noshare(*pi.xattrs, bp);
+    wrlock_force(&in->xattrlock, mut);
+  }
+
+  mut->auth_pin(in);
+  mdcache->predirty_journal_parents(mut, &ev->metablob, in, 0, PREDIRTY_PRIMARY, 0, follows);
+  mdcache->journal_dirty_inode(mut.get(), &ev->metablob, in, follows);
+
+  // "oldest flush tid" > 0 means client uses unique TID for each flush
+  if (ack && ack->get_oldest_flush_tid() > 0) {
+    ev->metablob.add_client_flush(metareqid_t(m->get_source(), ack->get_client_tid()),
+                                  ack->get_oldest_flush_tid());
+  }
+
+  unsigned update_flags = 0;
+  if (change_max)
+    update_flags |= UPDATE_SHAREMAX;
+  if (cap)
+    update_flags |= UPDATE_NEEDSISSUE;
+  mds->mdlog->submit_entry(ev, new C_Locker_FileUpdate_finish(this, in, mut, update_flags,
+                                                              ack, client));
+
+  if (need_flush && !*need_flush) {
+    const bool max_increase = change_max && new_max;
+    if (max_increase || _need_flush_mdlog(in, dirty))
+      *need_flush = true;
+  }
+
+  return true;
+}
+
+/**
+ * Process an MClientCapRelease message: a batch of cap releases.
+ *
+ * Retried after replay if the MDS is not in clientreplay/active/stopping.
+ * Otherwise the message's OSD epoch barrier is honoured, each listed cap is
+ * released via _do_cap_release(), and the session (if any) is told how many
+ * caps were released.
+ */
+void Locker::handle_client_cap_release(const MClientCapRelease::const_ref &m)
+{
+  client_t client = m->get_source().num();
+  dout(10) << "handle_client_cap_release " << *m << dendl;
+
+  const bool serving = mds->is_clientreplay() || mds->is_active() || mds->is_stopping();
+  if (!serving) {
+    mds->wait_for_replay(new C_MDS_RetryMessage(mds, m));
+    return;
+  }
+
+  // Pause RADOS operations until we see the required epoch
+  if (m->osd_epoch_barrier && !mds->objecter->have_map(m->osd_epoch_barrier))
+    mds->objecter->set_epoch_barrier(m->osd_epoch_barrier);
+
+  // Record the barrier so that we will retransmit it to clients
+  if (mds->get_osd_epoch_barrier() < m->osd_epoch_barrier)
+    mds->set_osd_epoch_barrier(m->osd_epoch_barrier);
+
+  Session *session = mds->get_session(m);
+
+  for (const auto &rel : m->caps) {
+    const inodeno_t ino((uint64_t)rel.ino);
+    _do_cap_release(client, ino, rel.cap_id, rel.migrate_seq, rel.seq);
+  }
+
+  if (session)
+    session->notify_cap_release(m->caps.size());
+}
+
+ /**
+  * Context that re-runs Locker::_do_cap_release() with the arguments
+  * captured at construction time.
+  */
+ class C_Locker_RetryCapRelease : public LockerContext {
+   client_t who;
+   inodeno_t target_ino;
+   uint64_t capid;
+   ceph_seq_t mseq_;
+   ceph_seq_t iseq_;
+ public:
+   C_Locker_RetryCapRelease(Locker *l, client_t c, inodeno_t i, uint64_t id,
+                            ceph_seq_t mseq, ceph_seq_t seq)
+     : LockerContext(l),
+       who(c),
+       target_ino(i),
+       capid(id),
+       mseq_(mseq),
+       iseq_(seq)
+   {}
+
+   // the completion code is ignored; the release is simply attempted again
+   void finish(int r) override {
+     locker->_do_cap_release(who, target_ino, capid, mseq_, iseq_);
+   }
+ };
+
+ /**
+  * Process one released cap for a client.
+  *
+  * The release is ignored if the inode or the cap is not present, if the
+  * cap id differs, or if the migrate seq is older than the cap's. It is
+  * deferred while the inode is freezing/frozen. If the issue seq does not
+  * match the cap's last issue, only old revoke history is cleaned;
+  * otherwise the cap is removed.
+  */
+ void Locker::_do_cap_release(client_t client, inodeno_t ino, uint64_t cap_id,
+                              ceph_seq_t mseq, ceph_seq_t seq)
+ {
+   CInode *inode = mdcache->get_inode(ino);
+   if (!inode) {
+     dout(7) << "_do_cap_release missing ino " << ino << dendl;
+     return;
+   }
+
+   Capability *held = inode->get_client_cap(client);
+   if (!held) {
+     dout(7) << "_do_cap_release no cap for client" << client << " on "<< *inode << dendl;
+     return;
+   }
+
+   dout(7) << "_do_cap_release for client." << client << " on "<< *inode << dendl;
+
+   // release refers to a different cap instance
+   if (held->get_cap_id() != cap_id) {
+     dout(7) << " capid " << cap_id << " != " << held->get_cap_id() << ", ignore" << dendl;
+     return;
+   }
+
+   // release predates the cap's current migrate seq
+   if (ceph_seq_cmp(mseq, held->get_mseq()) < 0) {
+     dout(7) << " mseq " << mseq << " < " << held->get_mseq() << ", ignore" << dendl;
+     return;
+   }
+
+   if (should_defer_client_cap_frozen(inode)) {
+     dout(7) << " freezing|frozen, deferring" << dendl;
+     inode->add_waiter(CInode::WAIT_UNFREEZE,
+                       new C_Locker_RetryCapRelease(this, client, ino, cap_id, mseq, seq));
+     return;
+   }
+
+   if (seq == held->get_last_issue()) {
+     remove_client_cap(inode, held);
+     return;
+   }
+
+   dout(7) << " issue_seq " << seq << " != " << held->get_last_issue() << dendl;
+   // clean out any old revoke history
+   held->clean_revoke_from(seq);
+   eval_cap_gather(inode);
+ }
+
+ /**
+  * Remove a client's capability from an inode and re-evaluate its locks.
+  *
+  * @param in   inode the cap belongs to
+  * @param cap  capability to remove
+  * @param kill if true and the client still has a byte range recorded,
+  *             the inode is flagged STATE_NEEDSRECOVER instead of having
+  *             its max size re-checked
+  */
+ void Locker::remove_client_cap(CInode *in, Capability *cap, bool kill)
+ {
+   const client_t who = cap->get_client();
+
+   // clean out any pending snapflush state
+   if (!in->client_need_snapflush.empty())
+     _do_null_snapflush(in, who);
+
+   // sample this before the cap is removed from the inode
+   // (presumably the Capability is no longer valid afterwards -- confirm)
+   const bool was_notable = cap->is_notable();
+   in->remove_client_cap(who);
+   if (!was_notable)
+     return;
+
+   if (!in->is_auth()) {
+     request_inode_file_caps(in);
+   } else if (in->get_projected_inode()->client_ranges.count(who) &&
+              (in->inode.nlink != 0 || in->is_any_caps())) {
+     // make sure we clear out the client byte range,
+     // unless it's unlink + stray
+     if (kill)
+       in->state_set(CInode::STATE_NEEDSRECOVER);
+     else
+       check_inode_max_size(in);
+   }
+
+   try_eval(in, CEPH_CAP_LOCKS);
+ }
+
+
+/**
+ * Return true if any currently revoking caps exceed the
+ * session_timeout threshold.
+ */
+bool Locker::any_late_revoking_caps(xlist<Capability*> const &revoking,
+ double timeout) const
+{
+ xlist<Capability*>::const_iterator p = revoking.begin();
+ if (p.end()) {
+ // No revoking caps at the moment
+ return false;
+ } else {
+ utime_t now = ceph_clock_now();
+ utime_t age = now - (*p)->get_last_revoke_stamp();
+ if (age <= timeout) {
+ return false;
+ } else {
+ return true;
+ }
+ }
+}
+
+ /**
+  * Append to `result` every client that has a late revoking cap, as judged
+  * by any_late_revoking_caps() on that client's list. Clients already
+  * present in `result` are not added again.
+  */
+ void Locker::get_late_revoking_clients(std::list<client_t> *result,
+                                        double timeout) const
+ {
+   // Fast path: no misbehaving clients, execute in O(1)
+   if (!any_late_revoking_caps(revoking_caps, timeout))
+     return;
+
+   // Slow path: execute in O(N_clients)
+   for (auto &entry : revoking_caps_by_client) {
+     if (!any_late_revoking_caps(entry.second, timeout))
+       continue;
+
+     // Search the list for duplicate and only insert if unique
+     const bool already_listed =
+       std::find(result->begin(), result->end(), entry.first) != result->end();
+     if (!already_listed)
+       result->push_back(entry.first);
+   }
+ }
+
+ // Hard-code instead of surfacing a config setting because this is
+ // really a hack that should go away at some point when we have better
+ // inspection tools for getting at detailed cap state (#7316)
+#define MAX_WARN_CAPS 100
+
+ /**
+  * Periodic cap housekeeping.
+  *
+  * 1. Nudges snap inodes still waiting for a snapflush whose
+  *    last_dirstat_prop is older than a third of mds_freeze_tree_timeout.
+  * 2. Walks the revoking caps list and emits cluster-log warnings, with
+  *    exponential backoff per cap, for caps whose revoke has been pending
+  *    longer than the session timeout. At most MAX_WARN_CAPS late caps are
+  *    examined per call.
+  */
+ void Locker::caps_tick()
+ {
+   utime_t now = ceph_clock_now();
+
+   if (!need_snapflush_inodes.empty()) {
+     // snap inodes that need flush are auth pinned, they affect
+     // subtree/dirfrag freeze.
+     utime_t cutoff = now;
+     cutoff -= g_conf()->mds_freeze_tree_timeout / 3;
+
+     // remember the current tail so that one pass visits each inode at
+     // most once, even if nudging puts inodes back on the list
+     CInode *last = need_snapflush_inodes.back();
+     while (!need_snapflush_inodes.empty()) {
+       CInode *in = need_snapflush_inodes.front();
+       if (in->last_dirstat_prop >= cutoff)
+         break;
+       in->item_caps.remove_myself();
+       snapflush_nudge(in);
+       if (in == last)
+         break;
+     }
+   }
+
+   dout(20) << __func__ << " " << revoking_caps.size() << " revoking caps" << dendl;
+
+   now = ceph_clock_now();
+   // loop-invariant: look the session timeout up once
+   const auto session_timeout = mds->mdsmap->get_session_timeout();
+   int n = 0;
+   for (xlist<Capability*>::iterator p = revoking_caps.begin(); !p.end(); ++p) {
+     Capability *cap = *p;
+
+     utime_t age = now - cap->get_last_revoke_stamp();
+     dout(20) << __func__ << " age = " << age << " client." << cap->get_client() << "." << cap->get_inode()->ino() << dendl;
+     if (age <= session_timeout) {
+       // the walk stops at the first cap that is still within the timeout
+       dout(20) << __func__ << " age below timeout " << session_timeout << dendl;
+       break;
+     } else {
+       ++n;
+       if (n > MAX_WARN_CAPS) {
+         // note the trailing space after "late": the two literals are
+         // concatenated into a single sentence
+         dout(1) << __func__ << " more than " << MAX_WARN_CAPS << " caps are late "
+                 << "revoking, ignoring subsequent caps" << dendl;
+         break;
+       }
+     }
+     // exponential backoff of warning intervals
+     if (age > session_timeout * (1 << cap->get_num_revoke_warnings())) {
+       cap->inc_num_revoke_warnings();
+       stringstream ss;
+       ss << "client." << cap->get_client() << " isn't responding to mclientcaps(revoke), ino "
+          << cap->get_inode()->ino() << " pending " << ccap_string(cap->pending())
+          << " issued " << ccap_string(cap->issued()) << ", sent " << age << " seconds ago";
+       mds->clog->warn() << ss.str();
+       dout(20) << __func__ << " " << ss.str() << dendl;
+     } else {
+       dout(20) << __func__ << " silencing log message (backoff) for " << "client." << cap->get_client() << "." << cap->get_inode()->ino() << dendl;
+     }
+   }
+ }
+
+
+ /**
+  * Handle a dentry lease message from a client.
+  *
+  * RELEASE / REVOKE_ACK drop the client's lease when the sequence numbers
+  * match; RENEW extends the lease and replies if the dentry lock can still
+  * be leased to that client. Messages for an inode, dentry or lease we do
+  * not have are dropped. Any other action aborts.
+  */
+ void Locker::handle_client_lease(const MClientLease::const_ref &m)
+ {
+   dout(10) << "handle_client_lease " << *m << dendl;
+
+   ceph_assert(m->get_source().is_client());
+   client_t client = m->get_source().num();
+
+   CInode *diri = mdcache->get_inode(m->get_ino(), m->get_last());
+   if (!diri) {
+     dout(7) << "handle_client_lease don't have ino " << m->get_ino() << "." << m->get_last() << dendl;
+     return;
+   }
+
+   // locate the dentry through the dirfrag that the name maps to
+   CDir *dir = diri->get_dirfrag(diri->pick_dirfrag(m->dname));
+   CDentry *dn = dir ? dir->lookup(m->dname) : nullptr;
+   if (!dn) {
+     dout(7) << "handle_client_lease don't have dn " << m->get_ino() << " " << m->dname << dendl;
+     return;
+   }
+   dout(10) << " on " << *dn << dendl;
+
+   // replica and lock
+   ClientLease *lease = dn->get_client_lease(client);
+   if (!lease) {
+     dout(7) << "handle_client_lease didn't have lease for client." << client << " of " << *dn << dendl;
+     return;
+   }
+
+   switch (m->get_action()) {
+   case CEPH_MDS_LEASE_REVOKE_ACK:
+   case CEPH_MDS_LEASE_RELEASE:
+     if (lease->seq == m->get_seq()) {
+       dout(7) << "handle_client_lease client." << client
+               << " on " << *dn << dendl;
+       dn->remove_client_lease(lease, this);
+     } else {
+       dout(7) << "handle_client_lease release - seq " << lease->seq << " != provided " << m->get_seq() << dendl;
+     }
+     break;
+
+   case CEPH_MDS_LEASE_RENEW:
+     {
+       dout(7) << "handle_client_lease client." << client << " renew on " << *dn
+               << (!dn->lock.can_lease(client)?", revoking lease":"") << dendl;
+       if (dn->lock.can_lease(client)) {
+         auto reply = MClientLease::create(*m);
+         int pool = 1;   // fixme.. do something smart!
+         reply->h.duration_ms = static_cast<int>(1000 * mdcache->client_lease_durations[pool]);
+         reply->h.seq = ++lease->seq;
+         reply->clear_payload();
+
+         // push the lease expiry out by one lease duration
+         utime_t expiry = ceph_clock_now();
+         expiry += mdcache->client_lease_durations[pool];
+         mdcache->touch_client_lease(lease, pool, expiry);
+
+         mds->send_message_client_counted(reply, m->get_connection());
+       }
+     }
+     break;
+
+   default:
+     ceph_abort(); // implement me
+     break;
+   }
+ }
+
+
+ /**
+  * Encode a dentry lease for `client` into `bl`.
+  *
+  * A real lease is issued only when the dentry is not in a stray dir, the
+  * directory's filelock cannot be leased to the client, the client has
+  * neither FILE_SHARED nor FILE_EXCL pending on the directory inode, and
+  * the dentry lock itself can be leased. Otherwise a default-constructed
+  * (null) LeaseStat is encoded.
+  */
+ void Locker::issue_client_lease(CDentry *dn, client_t client,
+                                 bufferlist &bl, utime_t now, Session *session)
+ {
+   CInode *diri = dn->get_dir()->get_inode();
+
+   const bool issue =
+     !diri->is_stray() &&  // do not issue dn leases in stray dir!
+     !diri->filelock.can_lease(client) &&
+     (diri->get_client_cap_pending(client) & (CEPH_CAP_FILE_SHARED | CEPH_CAP_FILE_EXCL)) == 0 &&
+     dn->lock.can_lease(client);
+
+   if (!issue) {
+     // null lease
+     LeaseStat lstat;
+     encode_lease(bl, session->info, lstat);
+     dout(20) << "issue_client_lease no/null lease on " << *dn << dendl;
+     return;
+   }
+
+   int pool = 1;   // fixme.. do something smart!
+
+   // issue a dentry lease
+   ClientLease *lease = dn->add_client_lease(client, session);
+   session->touch_lease(lease);
+
+   now += mdcache->client_lease_durations[pool];
+   mdcache->touch_client_lease(lease, pool, now);
+
+   LeaseStat lstat;
+   lstat.mask = 1 | CEPH_LOCK_DN;  // old and new bit values
+   lstat.duration_ms = static_cast<uint32_t>(1000 * mdcache->client_lease_durations[pool]);
+   lstat.seq = ++lease->seq;
+   encode_lease(bl, session->info, lstat);
+   dout(20) << "issue_client_lease seq " << lstat.seq << " dur " << lstat.duration_ms << "ms "
+            << " on " << *dn << dendl;
+ }
+
+
+ /**
+  * Send a CEPH_MDS_LEASE_REVOKE message to every client holding a lease
+  * on the dentry that owns `lock`.
+  *
+  * @param lock must be a dentry lock (CEPH_LOCK_DN); this is asserted
+  *             before the parent is downcast to CDentry.
+  */
+ void Locker::revoke_client_leases(SimpleLock *lock)
+ {
+   // check the lock type before treating the parent as a dentry
+   ceph_assert(lock->get_type() == CEPH_LOCK_DN);
+
+   CDentry *dn = static_cast<CDentry*>(lock->get_parent());
+   const int mask = 1 | CEPH_LOCK_DN; // old and new bits
+
+   // i should also revoke the dir ICONTENT lease, if they have it!
+   CInode *diri = dn->get_dir()->get_inode();
+
+   for (const auto &p : dn->client_lease_map) {
+     ClientLease *l = p.second;
+     auto lease = MClientLease::create(CEPH_MDS_LEASE_REVOKE, l->seq, mask, diri->ino(), diri->first, CEPH_NOSNAP, dn->get_name());
+     mds->send_message_client_counted(lease, l->client);
+   }
+ }
+
+ /**
+  * Encode a LeaseStat (mask, duration_ms, seq) into `bl`.
+  *
+  * Sessions advertising CEPHFS_FEATURE_REPLY_ENCODING get the fields wrapped
+  * in an ENCODE_START/ENCODE_FINISH envelope (version 1, compat 1); other
+  * sessions get the three bare fields.
+  */
+ void Locker::encode_lease(bufferlist& bl, const session_info_t& info,
+                           const LeaseStat& ls)
+ {
+   if (!info.has_feature(CEPHFS_FEATURE_REPLY_ENCODING)) {
+     // legacy layout: no envelope
+     encode(ls.mask, bl);
+     encode(ls.duration_ms, bl);
+     encode(ls.seq, bl);
+     return;
+   }
+
+   ENCODE_START(1, 1, bl);
+   encode(ls.mask, bl);
+   encode(ls.duration_ms, bl);
+   encode(ls.seq, bl);
+   ENCODE_FINISH(bl);
+ }
+
+// locks ----------------------------------------------------------------
+
+ /**
+  * Resolve a (lock type, object info) pair to the lock object in our cache.
+  *
+  * @return the lock, or a null pointer if the dentry/inode is not in cache.
+  *         An unrecognised lock type aborts.
+  */
+ SimpleLock *Locker::get_lock(int lock_type, const MDSCacheObjectInfo &info)
+ {
+   if (lock_type == CEPH_LOCK_DN) {
+     // be careful; info.dirfrag may have incorrect frag; recalculate based on dname.
+     CInode *diri = mdcache->get_inode(info.dirfrag.ino);
+     CDentry *dn = nullptr;
+     if (diri) {
+       CDir *dir = diri->get_dirfrag(diri->pick_dirfrag(info.dname));
+       if (dir)
+         dn = dir->lookup(info.dname, info.snapid);
+     }
+     if (!dn) {
+       dout(7) << "get_lock don't have dn " << info.dirfrag.ino << " " << info.dname << dendl;
+       return nullptr;
+     }
+     return &dn->lock;
+   }
+
+   // everything else must be one of the known inode lock types
+   switch (lock_type) {
+   case CEPH_LOCK_IAUTH:
+   case CEPH_LOCK_ILINK:
+   case CEPH_LOCK_IDFT:
+   case CEPH_LOCK_IFILE:
+   case CEPH_LOCK_INEST:
+   case CEPH_LOCK_IXATTR:
+   case CEPH_LOCK_ISNAP:
+   case CEPH_LOCK_IFLOCK:
+   case CEPH_LOCK_IPOLICY:
+     break;
+
+   default:
+     dout(7) << "get_lock don't know lock_type " << lock_type << dendl;
+     ceph_abort();
+     return nullptr;
+   }
+
+   CInode *in = mdcache->get_inode(info.ino, info.snapid);
+   if (!in) {
+     dout(7) << "get_lock don't have ino " << info.ino << dendl;
+     return nullptr;
+   }
+
+   switch (lock_type) {
+   case CEPH_LOCK_IAUTH:   return &in->authlock;
+   case CEPH_LOCK_ILINK:   return &in->linklock;
+   case CEPH_LOCK_IDFT:    return &in->dirfragtreelock;
+   case CEPH_LOCK_IFILE:   return &in->filelock;
+   case CEPH_LOCK_INEST:   return &in->nestlock;
+   case CEPH_LOCK_IXATTR:  return &in->xattrlock;
+   case CEPH_LOCK_ISNAP:   return &in->snaplock;
+   case CEPH_LOCK_IFLOCK:  return &in->flocklock;
+   case CEPH_LOCK_IPOLICY: return &in->policylock;
+   }
+
+   return nullptr;
+ }
+
+ /**
+  * Entry point for inter-MDS lock messages: look up the lock the message
+  * refers to and dispatch to the simple-lock or file-lock handler based on
+  * the lock's type. Messages for objects no longer in cache are dropped.
+  */
+ void Locker::handle_lock(const MLock::const_ref &m)
+ {
+   // nobody should be talking to us during recovery.
+   ceph_assert(mds->is_rejoin() || mds->is_clientreplay() || mds->is_active() || mds->is_stopping());
+
+   SimpleLock *lock = get_lock(m->get_lock_type(), m->get_object_info());
+   if (!lock) {
+     dout(10) << "don't have object " << m->get_object_info() << ", must have trimmed, dropping" << dendl;
+     return;
+   }
+
+   switch (lock->get_type()) {
+   // IDFT and INEST are routed through the file-lock handler as well
+   case CEPH_LOCK_IDFT:
+   case CEPH_LOCK_INEST:
+   case CEPH_LOCK_IFILE:
+     handle_file_lock(static_cast<ScatterLock*>(lock), m);
+     break;
+
+   case CEPH_LOCK_DN:
+   case CEPH_LOCK_IAUTH:
+   case CEPH_LOCK_ILINK:
+   case CEPH_LOCK_ISNAP:
+   case CEPH_LOCK_IXATTR:
+   case CEPH_LOCK_IFLOCK:
+   case CEPH_LOCK_IPOLICY:
+     handle_simple_lock(lock, m);
+     break;
+
+   default:
+     dout(7) << "handle_lock got otype " << m->get_lock_type() << dendl;
+     ceph_abort();
+     break;
+   }
+ }
+
+
+
+
+
+// ==========================================================================
+// simple lock
+
+ /**
+  * Handle a replica's request for a read lock.
+  *
+  * If we are auth for the object, the lock is not already SYNC and the
+  * object is not frozen, the lock is synced right away when stable, or the
+  * message is queued for retry once it becomes stable. In every other case
+  * the request is dropped and the replica is expected to retry.
+  *
+  * This function may take a reference to m if it needs one, but does
+  * not put references.
+  */
+ void Locker::handle_reqrdlock(SimpleLock *lock, const MLock::const_ref &m)
+ {
+   MDSCacheObject *parent = lock->get_parent();
+
+   if (!parent->is_auth() ||
+       lock->get_state() == LOCK_SYNC ||
+       parent->is_frozen()) {
+     dout(7) << "handle_reqrdlock dropping rdlock request on " << *lock
+             << " on " << *parent << dendl;
+     // replica should retry
+     return;
+   }
+
+   dout(7) << "handle_reqrdlock got rdlock request on " << *lock
+           << " on " << *parent << dendl;
+   ceph_assert(parent->is_auth()); // replica auth pinned if they're doing this!
+
+   if (!lock->is_stable()) {
+     dout(7) << "handle_reqrdlock delaying request until lock is stable" << dendl;
+     lock->add_waiter(SimpleLock::WAIT_STABLE | MDSCacheObject::WAIT_UNFREEZE,
+                      new C_MDS_RetryMessage(mds, m));
+     return;
+   }
+
+   simple_sync(lock);
+ }
+
+ /**
+  * Apply an inter-MDS lock message to a simple lock.
+  *
+  * Replica-side actions (SYNC, LOCK) move our copy of the lock; auth-side
+  * actions (LOCKACK, REQRDLOCK) record a replica's ack or rdlock request.
+  * Actions not listed in the switch are ignored. While this MDS is in
+  * rejoin, messages for objects that are still rejoining are dropped.
+  */
+ void Locker::handle_simple_lock(SimpleLock *lock, const MLock::const_ref &m)
+ {
+   int sender = m->get_asker();
+
+   dout(10) << "handle_simple_lock " << *m
+            << " on " << *lock << " " << *lock->get_parent() << dendl;
+
+   if (mds->is_rejoin() && lock->get_parent()->is_rejoining()) {
+     dout(7) << "handle_simple_lock still rejoining " << *lock->get_parent()
+             << ", dropping " << *m << dendl;
+     return;
+   }
+
+   switch (m->get_action()) {
+     // -- replica --
+   case LOCK_AC_SYNC:
+     ceph_assert(lock->get_state() == LOCK_LOCK);
+     lock->decode_locked_state(m->get_data());
+     lock->set_state(LOCK_SYNC);
+     lock->finish_waiters(SimpleLock::WAIT_RD|SimpleLock::WAIT_STABLE);
+     break;
+
+   case LOCK_AC_LOCK:
+     ceph_assert(lock->get_state() == LOCK_SYNC);
+     lock->set_state(LOCK_SYNC_LOCK);
+     if (lock->is_leased())
+       revoke_client_leases(lock);
+     eval_gather(lock, true);
+     if (lock->is_unstable_and_locked())
+       mds->mdlog->flush();
+     break;
+
+     // -- auth --
+   case LOCK_AC_LOCKACK:
+     ceph_assert(lock->get_state() == LOCK_SYNC_LOCK ||
+                 lock->get_state() == LOCK_SYNC_EXCL);
+     ceph_assert(lock->is_gathering(sender));
+     lock->remove_gather(sender);
+
+     if (!lock->is_gathering()) {
+       dout(7) << "handle_simple_lock " << *lock << " on " << *lock->get_parent() << " from " << sender
+               << ", last one" << dendl;
+       eval_gather(lock);
+     } else {
+       dout(7) << "handle_simple_lock " << *lock << " on " << *lock->get_parent() << " from " << sender
+               << ", still gathering " << lock->get_gather_set() << dendl;
+     }
+     break;
+
+   case LOCK_AC_REQRDLOCK:
+     handle_reqrdlock(lock, m);
+     break;
+
+   }
+ }
+
+/* unused, currently.
+
+class C_Locker_SimpleEval : public Context {
+ Locker *locker;
+ SimpleLock *lock;
+public:
+ C_Locker_SimpleEval(Locker *l, SimpleLock *lk) : locker(l), lock(lk) {}
+ void finish(int r) {
+ locker->try_simple_eval(lock);
+ }
+};
+
+void Locker::try_simple_eval(SimpleLock *lock)
+{
+ // unstable and ambiguous auth?
+ if (!lock->is_stable() &&
+ lock->get_parent()->is_ambiguous_auth()) {
+ dout(7) << "simple_eval not stable and ambiguous auth, waiting on " << *lock->get_parent() << dendl;
+ //if (!lock->get_parent()->is_waiter(MDSCacheObject::WAIT_SINGLEAUTH))
+ lock->get_parent()->add_waiter(MDSCacheObject::WAIT_SINGLEAUTH, new C_Locker_SimpleEval(this, lock));
+ return;
+ }
+
+ if (!lock->get_parent()->is_auth()) {
+ dout(7) << "try_simple_eval not auth for " << *lock->get_parent() << dendl;
+ return;
+ }
+
+ if (!lock->get_parent()->can_auth_pin()) {
+ dout(7) << "try_simple_eval can't auth_pin, waiting on " << *lock->get_parent() << dendl;
+ //if (!lock->get_parent()->is_waiter(MDSCacheObject::WAIT_SINGLEAUTH))
+ lock->get_parent()->add_waiter(MDSCacheObject::WAIT_UNFREEZE, new C_Locker_SimpleEval(this, lock));
+ return;
+ }
+
+ if (lock->is_stable())
+ simple_eval(lock);
+}
+*/
+
+
+ /**
+  * Evaluate a stable simple lock on an auth object and move it towards
+  * EXCL or SYNC where appropriate.
+  *
+  * @param lock       stable lock whose parent we are auth for (asserted)
+  * @param need_issue forwarded to simple_sync()/simple_excl()
+  */
+ void Locker::simple_eval(SimpleLock *lock, bool *need_issue)
+ {
+   MDSCacheObject *parent = lock->get_parent();
+   dout(10) << "simple_eval " << *lock << " on " << *parent << dendl;
+
+   ceph_assert(parent->is_auth());
+   ceph_assert(lock->is_stable());
+
+   if (parent->is_freezing_or_frozen()) {
+     // dentry/snap lock in unreadable state can block path traverse
+     const bool traverse_lock = (lock->get_type() == CEPH_LOCK_DN ||
+                                 lock->get_type() == CEPH_LOCK_ISNAP);
+     if (!traverse_lock ||
+         lock->get_state() == LOCK_SYNC ||
+         parent->is_frozen())
+       return;
+   }
+
+   if (mdcache->is_readonly()) {
+     if (lock->get_state() != LOCK_SYNC) {
+       dout(10) << "simple_eval read-only FS, syncing " << *lock << " on " << *parent << dendl;
+       simple_sync(lock, need_issue);
+     }
+     return;
+   }
+
+   // caps wanted only apply to locks that have a cap shift
+   CInode *inode = nullptr;
+   int wanted = 0;
+   if (lock->get_cap_shift()) {
+     inode = static_cast<CInode*>(parent);
+     inode->get_caps_wanted(&wanted, nullptr, lock->get_cap_shift());
+   }
+
+   if (lock->get_state() != LOCK_EXCL &&
+       inode && inode->get_target_loner() >= 0 &&
+       (wanted & CEPH_CAP_GEXCL)) {
+     // -> excl?
+     dout(7) << "simple_eval stable, going to excl " << *lock
+             << " on " << *parent << dendl;
+     simple_excl(lock, need_issue);
+   } else if (lock->get_state() != LOCK_SYNC &&
+              !lock->is_wrlocked() &&
+              ((!(wanted & CEPH_CAP_GEXCL) && !lock->is_waiter_for(SimpleLock::WAIT_WR)) ||
+               (lock->get_state() == LOCK_EXCL && inode && inode->get_target_loner() < 0))) {
+     // stable -> sync?
+     dout(7) << "simple_eval stable, syncing " << *lock
+             << " on " << *parent << dendl;
+     simple_sync(lock, need_issue);
+   }
+ }
+
+
+// mid
+
+ /**
+  * Drive a stable lock on an auth object towards LOCK_SYNC.
+  *
+  * @param lock       stable lock whose parent we are auth for (asserted)
+  * @param need_issue if non-null, set to true instead of calling
+  *                   issue_caps() directly
+  * @return true if the lock reached LOCK_SYNC in this call; false if it
+  *         was left in a transitional state waiting on wrlocks, replicas,
+  *         client caps, file recovery or a scatter writebehind.
+  */
+ bool Locker::simple_sync(SimpleLock *lock, bool *need_issue)
+ {
+   dout(7) << "simple_sync on " << *lock << " on " << *lock->get_parent() << dendl;
+   ceph_assert(lock->get_parent()->is_auth());
+   ceph_assert(lock->is_stable());
+
+   CInode *inode = nullptr;
+   if (lock->get_cap_shift())
+     inode = static_cast<CInode *>(lock->get_parent());
+
+   // either flag the caller to issue caps, or do it ourselves
+   auto flag_or_issue = [&]() {
+     if (need_issue)
+       *need_issue = true;
+     else
+       issue_caps(inode);
+   };
+
+   const int prev_state = lock->get_state();
+
+   if (prev_state != LOCK_TSYN) {
+     // enter the matching transitional state
+     switch (prev_state) {
+     case LOCK_MIX:  lock->set_state(LOCK_MIX_SYNC); break;
+     case LOCK_LOCK: lock->set_state(LOCK_LOCK_SYNC); break;
+     case LOCK_XSYN: lock->set_state(LOCK_XSYN_SYNC); break;
+     case LOCK_EXCL: lock->set_state(LOCK_EXCL_SYNC); break;
+     default: ceph_abort();
+     }
+
+     // number of things we still have to wait for
+     int pending = 0;
+     if (lock->is_wrlocked())
+       pending++;
+
+     if (lock->get_parent()->is_replicated() && prev_state == LOCK_MIX) {
+       send_lock_message(lock, LOCK_AC_SYNC);
+       lock->init_gather();
+       pending++;
+     }
+
+     if (inode && inode->is_head() && inode->issued_caps_need_gather(lock)) {
+       flag_or_issue();
+       pending++;
+     }
+
+     bool need_recover = false;
+     if (lock->get_type() == CEPH_LOCK_IFILE) {
+       ceph_assert(inode);
+       if (inode->state_test(CInode::STATE_NEEDSRECOVER)) {
+         mds->mdcache->queue_file_recover(inode);
+         need_recover = true;
+         pending++;
+       }
+     }
+
+     if (pending) {
+       lock->get_parent()->auth_pin(lock);
+       if (need_recover)
+         mds->mdcache->do_file_recover();
+       return false;
+     }
+
+     // nothing to gather, but dirty scatter data must be written back first
+     if (lock->is_dirty()) {
+       lock->get_parent()->auth_pin(lock);
+       scatter_writebehind(static_cast<ScatterLock*>(lock));
+       mds->mdlog->flush();
+       return false;
+     }
+   }
+
+   if (lock->get_parent()->is_replicated()) {    // FIXME
+     bufferlist data;
+     lock->encode_locked_state(data);
+     send_lock_message(lock, LOCK_AC_SYNC, data);
+   }
+   lock->set_state(LOCK_SYNC);
+   lock->finish_waiters(SimpleLock::WAIT_RD|SimpleLock::WAIT_STABLE);
+   if (inode && inode->is_head())
+     flag_or_issue();
+   return true;
+ }
+
+ /**
+  * Drive a stable lock on an auth object towards LOCK_EXCL.
+  *
+  * Valid starting states are LOCK, SYNC and XSYN; anything else aborts.
+  * If rdlocks, wrlocks, replicas or issued client caps must be gathered
+  * first, the lock is left in its transitional state with the parent
+  * auth-pinned; otherwise it becomes LOCK_EXCL immediately.
+  *
+  * @param need_issue if non-null, set to true instead of calling
+  *                   issue_caps() directly
+  */
+ void Locker::simple_excl(SimpleLock *lock, bool *need_issue)
+ {
+   dout(7) << "simple_excl on " << *lock << " on " << *lock->get_parent() << dendl;
+   ceph_assert(lock->get_parent()->is_auth());
+   ceph_assert(lock->is_stable());
+
+   CInode *inode = nullptr;
+   if (lock->get_cap_shift())
+     inode = static_cast<CInode *>(lock->get_parent());
+
+   // either flag the caller to issue caps, or do it ourselves
+   auto flag_or_issue = [&]() {
+     if (need_issue)
+       *need_issue = true;
+     else
+       issue_caps(inode);
+   };
+
+   switch (lock->get_state()) {
+   case LOCK_LOCK: lock->set_state(LOCK_LOCK_EXCL); break;
+   case LOCK_SYNC: lock->set_state(LOCK_SYNC_EXCL); break;
+   case LOCK_XSYN: lock->set_state(LOCK_XSYN_EXCL); break;
+   default: ceph_abort();
+   }
+
+   // number of things we still have to wait for
+   int pending = 0;
+   if (lock->is_rdlocked())
+     pending++;
+   if (lock->is_wrlocked())
+     pending++;
+
+   // replicas are only messaged when we did not come from LOCK or XSYN
+   if (lock->get_parent()->is_replicated() &&
+       !(lock->get_state() == LOCK_LOCK_EXCL ||
+         lock->get_state() == LOCK_XSYN_EXCL)) {
+     send_lock_message(lock, LOCK_AC_LOCK);
+     lock->init_gather();
+     pending++;
+   }
+
+   if (inode && inode->is_head() && inode->issued_caps_need_gather(lock)) {
+     flag_or_issue();
+     pending++;
+   }
+
+   if (pending) {
+     lock->get_parent()->auth_pin(lock);
+     return;
+   }
+
+   lock->set_state(LOCK_EXCL);
+   lock->finish_waiters(SimpleLock::WAIT_WR|SimpleLock::WAIT_STABLE);
+   if (inode)
+     flag_or_issue();
+ }
+
+ /**
+  * Drive a stable lock on an auth object towards LOCK_LOCK.
+  *
+  * Valid starting states are SYNC, XSYN, EXCL, MIX and TSYN; anything else
+  * aborts. Client leases are revoked, and rdlocks, issued caps, pending
+  * file recovery and replicas are gathered as needed. When nothing has to
+  * be gathered and the lock is dirty, a scatter writebehind is started
+  * instead of completing the transition.
+  *
+  * @param need_issue if non-null, set to true instead of calling
+  *                   issue_caps() directly
+  */
+ void Locker::simple_lock(SimpleLock *lock, bool *need_issue)
+ {
+   dout(7) << "simple_lock on " << *lock << " on " << *lock->get_parent() << dendl;
+   ceph_assert(lock->get_parent()->is_auth());
+   ceph_assert(lock->is_stable());
+   ceph_assert(lock->get_state() != LOCK_LOCK);
+
+   CInode *inode = nullptr;
+   if (lock->get_cap_shift())
+     inode = static_cast<CInode *>(lock->get_parent());
+
+   // either flag the caller to issue caps, or do it ourselves
+   auto flag_or_issue = [&]() {
+     if (need_issue)
+       *need_issue = true;
+     else
+       issue_caps(inode);
+   };
+
+   const int prev_state = lock->get_state();
+
+   switch (prev_state) {
+   case LOCK_SYNC: lock->set_state(LOCK_SYNC_LOCK); break;
+   case LOCK_XSYN: lock->set_state(LOCK_XSYN_LOCK); break;
+   case LOCK_EXCL: lock->set_state(LOCK_EXCL_LOCK); break;
+   case LOCK_MIX:
+     lock->set_state(LOCK_MIX_LOCK);
+     (static_cast<ScatterLock *>(lock))->clear_unscatter_wanted();
+     break;
+   case LOCK_TSYN: lock->set_state(LOCK_TSYN_LOCK); break;
+   default: ceph_abort();
+   }
+
+   // number of things we still have to wait for
+   int pending = 0;
+   if (lock->is_leased()) {
+     pending++;
+     revoke_client_leases(lock);
+   }
+   if (lock->is_rdlocked())
+     pending++;
+   if (inode && inode->is_head() && inode->issued_caps_need_gather(lock)) {
+     flag_or_issue();
+     pending++;
+   }
+
+   bool need_recover = false;
+   if (lock->get_type() == CEPH_LOCK_IFILE) {
+     ceph_assert(inode);
+     if (inode->state_test(CInode::STATE_NEEDSRECOVER)) {
+       mds->mdcache->queue_file_recover(inode);
+       need_recover = true;
+       pending++;
+     }
+   }
+
+   if (lock->get_parent()->is_replicated() &&
+       lock->get_state() == LOCK_MIX_LOCK &&
+       pending) {
+     dout(10) << " doing local stage of mix->lock gather before gathering from replicas" << dendl;
+   } else {
+     // move to second stage of gather now, so we don't send the lock action later.
+     if (lock->get_state() == LOCK_MIX_LOCK)
+       lock->set_state(LOCK_MIX_LOCK2);
+
+     if (lock->get_parent()->is_replicated() &&
+         lock->get_sm()->states[prev_state].replica_state != LOCK_LOCK) {  // replica may already be LOCK
+       pending++;
+       send_lock_message(lock, LOCK_AC_LOCK);
+       lock->init_gather();
+     }
+   }
+
+   if (pending) {
+     lock->get_parent()->auth_pin(lock);
+     if (need_recover)
+       mds->mdcache->do_file_recover();
+     return;
+   }
+
+   // nothing to gather, but dirty scatter data must be written back first
+   if (lock->is_dirty()) {
+     lock->get_parent()->auth_pin(lock);
+     scatter_writebehind(static_cast<ScatterLock*>(lock));
+     mds->mdlog->flush();
+     return;
+   }
+
+   lock->set_state(LOCK_LOCK);
+   lock->finish_waiters(ScatterLock::WAIT_XLOCK|ScatterLock::WAIT_WR|ScatterLock::WAIT_STABLE);
+ }
+
+
+ /**
+  * Begin moving a lock on an auth object towards an xlock.
+  *
+  * The lock must currently be in LOCK or XLOCKDONE (anything else aborts)
+  * and is moved to LOCK_LOCK_XLOCK. If no rdlocks, wrlocks or issued caps
+  * need to be gathered it advances straight to LOCK_PREXLOCK. Unlike the
+  * other simple_* transitions, stability is not asserted; the parent is
+  * auth-pinned only when the lock was stable on entry.
+  */
+ void Locker::simple_xlock(SimpleLock *lock)
+ {
+   dout(7) << "simple_xlock on " << *lock << " on " << *lock->get_parent() << dendl;
+   ceph_assert(lock->get_parent()->is_auth());
+   //assert(lock->is_stable());
+   ceph_assert(lock->get_state() != LOCK_XLOCK);
+
+   CInode *inode = nullptr;
+   if (lock->get_cap_shift())
+     inode = static_cast<CInode *>(lock->get_parent());
+
+   if (lock->is_stable())
+     lock->get_parent()->auth_pin(lock);
+
+   switch (lock->get_state()) {
+   case LOCK_LOCK:
+   case LOCK_XLOCKDONE:
+     lock->set_state(LOCK_LOCK_XLOCK);
+     break;
+   default:
+     ceph_abort();
+   }
+
+   // number of things we still have to wait for
+   int pending = 0;
+   if (lock->is_rdlocked())
+     pending++;
+   if (lock->is_wrlocked())
+     pending++;
+
+   if (inode && inode->is_head() && inode->issued_caps_need_gather(lock)) {
+     issue_caps(inode);
+     pending++;
+   }
+
+   if (pending)
+     return;
+
+   lock->set_state(LOCK_PREXLOCK);
+   //assert("shouldn't be called if we are already xlockable" == 0);
+ }
+
+
+
+
+
+// ==========================================================================
+// scatter lock
+
+/*
+
+Some notes on scatterlocks.
+
+ - The scatter/gather is driven by the inode lock. The scatter always
+ brings in the latest metadata from the fragments.
+
+ - When in a scattered/MIX state, fragments are only allowed to
+ update/be written to if the accounted stat matches the inode's
+ current version.
+
+ - That means, on gather, we _only_ assimilate diffs for frag metadata
+ that match the current version, because those are the only ones
+ written during this scatter/gather cycle. (Others didn't permit
+ it.) We increment the version and journal this to disk.
+
+ - When possible, we also simultaneously update our local frag
+ accounted stats to match.
+
+ - On scatter, the new inode info is broadcast to frags, both local
+ and remote. If possible (auth and !frozen), the dirfrag auth
+ should update the accounted state (if it isn't already up to date).
+ Note that this may occur on both the local inode auth node and
+ inode replicas, so there are two potential paths. If it is NOT
+ possible, they need to mark_stale to prevent any possible writes.
+
+ - A scatter can be to MIX (potentially writeable) or to SYNC (read
+ only). Both are opportunities to update the frag accounted stats,
+ even though only the MIX case is affected by a stale dirfrag.
+
+ - Because many scatter/gather cycles can potentially go by without a
+ frag being able to update its accounted stats (due to being frozen
+ by exports/refragments in progress), the frag may have (even very)
+ old stat versions. That's fine. When we do want to update it,
+ we can update accounted_* and the version first.
+
+*/
+
+ /**
+  * Log-completion context for scatter_writebehind(): once the journal
+  * entry has been submitted and completes, it calls
+  * Locker::scatter_writebehind_finish() with the lock and mutation
+  * captured here.
+  */
+ class C_Locker_ScatterWB : public LockerLogContext {
+   ScatterLock *scatterlock;
+   MutationRef mutation;
+ public:
+   C_Locker_ScatterWB(Locker *l, ScatterLock *sl, MutationRef& m)
+     : LockerLogContext(l),
+       scatterlock(sl),
+       mutation(m)
+   {}
+
+   // the completion code is not inspected
+   void finish(int r) override {
+     locker->scatter_writebehind_finish(scatterlock, mutation);
+   }
+ };
+
+ /**
+  * Journal the scattered state gathered under `lock` into its inode.
+  *
+  * A wrlock is force-taken on the lock for the lifetime of the mutation,
+  * the inode is projected and dirtied, and an EUpdate is submitted whose
+  * completion runs scatter_writebehind_finish().
+  */
+ void Locker::scatter_writebehind(ScatterLock *lock)
+ {
+   CInode *inode = static_cast<CInode*>(lock->get_parent());
+   dout(10) << "scatter_writebehind " << inode->inode.mtime << " on " << *lock << " on " << *inode << dendl;
+
+   // journal
+   MutationRef mutation(new MutationImpl());
+   mutation->ls = mds->mdlog->get_current_segment();
+
+   // forcefully take a wrlock
+   lock->get_wrlock(true);
+   mutation->locks.emplace(lock, MutationImpl::LockOp::WRLOCK);
+
+   inode->pre_cow_old_inode();  // avoid cow mayhem
+
+   auto &projected = inode->project_inode();
+   projected.inode.version = inode->pre_dirty();
+
+   inode->finish_scatter_gather_update(lock->get_type());
+   lock->start_flush();
+
+   EUpdate *event = new EUpdate(mds->mdlog, "scatter_writebehind");
+   mds->mdlog->start_entry(event);
+
+   mdcache->predirty_journal_parents(mutation, &event->metablob, inode, 0, PREDIRTY_PRIMARY);
+   mdcache->journal_dirty_inode(mutation.get(), &event->metablob, inode);
+
+   inode->finish_scatter_gather_update_accounted(lock->get_type(), mutation, &event->metablob);
+
+   // the finisher applies the mutation and drops the forced wrlock
+   mds->mdlog->submit_entry(event, new C_Locker_ScatterWB(this, lock, mutation));
+ }
+
+ /**
+  * Completion of scatter_writebehind(): pop the projected inode, finish the
+  * flush on the lock, notify replicas when the lock is in a mix->*
+  * transitional state, then apply and clean up the mutation.
+  */
+ void Locker::scatter_writebehind_finish(ScatterLock *lock, MutationRef& mut)
+ {
+   CInode *inode = static_cast<CInode*>(lock->get_parent());
+   dout(10) << "scatter_writebehind_finish on " << *lock << " on " << *inode << dendl;
+   inode->pop_and_dirty_projected_inode(mut->ls);
+
+   lock->finish_flush();
+
+   // if replicas may have flushed in a mix->lock state, send another
+   // message so they can finish_flush().
+   if (inode->is_replicated()) {
+     const int state = lock->get_state();
+     if (state == LOCK_MIX_LOCK ||
+         state == LOCK_MIX_LOCK2 ||
+         state == LOCK_MIX_EXCL ||
+         state == LOCK_MIX_TSYN)
+       send_lock_message(lock, LOCK_AC_LOCKFLUSHED);
+   }
+
+   mut->apply();
+   drop_locks(mut.get());
+   mut->cleanup();
+
+   if (lock->is_stable())
+     lock->finish_waiters(ScatterLock::WAIT_STABLE);
+
+   //scatter_eval_gather(lock);
+ }
+
+ /**
+  * Evaluate a stable scatter lock on an auth object and pick its next
+  * state (MIX, LOCK or SYNC). Nothing is done while the parent is
+  * freezing or frozen.
+  *
+  * @param need_issue forwarded to the state-transition helpers
+  */
+ void Locker::scatter_eval(ScatterLock *lock, bool *need_issue)
+ {
+   MDSCacheObject *parent = lock->get_parent();
+   dout(10) << "scatter_eval " << *lock << " on " << *parent << dendl;
+
+   ceph_assert(parent->is_auth());
+   ceph_assert(lock->is_stable());
+
+   if (parent->is_freezing_or_frozen()) {
+     dout(20) << " freezing|frozen" << dendl;
+     return;
+   }
+
+   if (mdcache->is_readonly()) {
+     if (lock->get_state() != LOCK_SYNC) {
+       dout(10) << "scatter_eval read-only FS, syncing " << *lock << " on " << *parent << dendl;
+       simple_sync(lock, need_issue);
+     }
+     return;
+   }
+
+   if (!lock->is_rdlocked() &&
+       lock->get_state() != LOCK_MIX &&
+       lock->get_scatter_wanted()) {
+     dout(10) << "scatter_eval scatter_wanted, bump to mix " << *lock
+              << " on " << *parent << dendl;
+     scatter_mix(lock, need_issue);
+     return;
+   }
+
+   if (lock->get_type() == CEPH_LOCK_INEST) {
+     // in general, we want to keep INEST writable at all times.
+     if (!lock->is_rdlocked()) {
+       const bool replicated = parent->is_replicated();
+       if (replicated && lock->get_state() != LOCK_MIX)
+         scatter_mix(lock, need_issue);
+       else if (!replicated && lock->get_state() != LOCK_LOCK)
+         simple_lock(lock, need_issue);
+     }
+     return;
+   }
+
+   CInode *inode = static_cast<CInode*>(parent);
+   // i _should_ be sync.
+   if ((!inode->has_subtree_or_exporting_dirfrag() || inode->is_base()) &&
+       !lock->is_wrlocked() &&
+       lock->get_state() != LOCK_SYNC) {
+     dout(10) << "scatter_eval no wrlocks|xlocks, not subtree root inode, syncing" << dendl;
+     simple_sync(lock, need_issue);
+   }
+ }
+
+
+/*
+ * mark a scatterlock to indicate that the dir fnode has some dirty data
+ */
+void Locker::mark_updated_scatterlock(ScatterLock *lock)
+{
+ lock->mark_dirty();
+ if (lock->get_updated_item()->is_on_list()) {
+ dout(10) << "mark_updated_scatterlock " << *lock
+ << " - already on list since " << lock->get_update_stamp() << dendl;
+ } else {
+ updated_scatterlocks.push_back(lock->get_updated_item());
+ utime_t now = ceph_clock_now();
+ lock->set_update_stamp(now);
+ dout(10) << "mark_updated_scatterlock " << *lock
+ << " - added at " << now << dendl;
+ }
+}
+
+/*
+ * this is called by scatter_tick and LogSegment::try_to_trim() when
+ * trying to flush dirty scattered data (i.e. updated fnode) back to
+ * the inode.
+ *
+ * we need to lock|scatter in order to push fnode changes into the
+ * inode.dirstat.
+ */
+void Locker::scatter_nudge(ScatterLock *lock, MDSContext *c, bool forcelockchange)
+{
+ CInode *p = static_cast<CInode *>(lock->get_parent());
+
+ if (p->is_frozen() || p->is_freezing()) {
+ dout(10) << "scatter_nudge waiting for unfreeze on " << *p << dendl;
+ if (c)
+ p->add_waiter(MDSCacheObject::WAIT_UNFREEZE, c);
+ else if (lock->is_dirty())
+ // just requeue. not ideal.. starvation prone..
+ updated_scatterlocks.push_back(lock->get_updated_item());
+ return;
+ }
+
+ if (p->is_ambiguous_auth()) {
+ dout(10) << "scatter_nudge waiting for single auth on " << *p << dendl;
+ if (c)
+ p->add_waiter(MDSCacheObject::WAIT_SINGLEAUTH, c);
+ else if (lock->is_dirty())
+ // just requeue. not ideal.. starvation prone..
+ updated_scatterlocks.push_back(lock->get_updated_item());
+ return;
+ }
+
+ if (p->is_auth()) {
+ int count = 0;
+ while (true) {
+ if (lock->is_stable()) {
+ // can we do it now?
+ // (only if we're not replicated.. if we are, we really do need
+ // to nudge the lock state!)
+ /*
+ actually, even if we're not replicated, we can't stay in MIX, because another mds
+ could discover and replicate us at any time. if that happens while we're flushing,
+ they end up in MIX but their inode has the old scatterstat version.
+
+ if (!forcelockchange && !lock->get_parent()->is_replicated() && lock->can_wrlock(-1)) {
+ dout(10) << "scatter_nudge auth, propagating " << *lock << " on " << *p << dendl;
+ scatter_writebehind(lock);
+ if (c)
+ lock->add_waiter(SimpleLock::WAIT_STABLE, c);
+ return;
+ }
+ */
+
+ if (mdcache->is_readonly()) {
+ if (lock->get_state() != LOCK_SYNC) {
+ dout(10) << "scatter_nudge auth, read-only FS, syncing " << *lock << " on " << *p << dendl;
+ simple_sync(static_cast<ScatterLock*>(lock));
+ }
+ break;
+ }
+
+ // adjust lock state
+ dout(10) << "scatter_nudge auth, scatter/unscattering " << *lock << " on " << *p << dendl;
+ switch (lock->get_type()) {
+ case CEPH_LOCK_IFILE:
+ if (p->is_replicated() && lock->get_state() != LOCK_MIX)
+ scatter_mix(static_cast<ScatterLock*>(lock));
+ else if (lock->get_state() != LOCK_LOCK)
+ simple_lock(static_cast<ScatterLock*>(lock));
+ else
+ simple_sync(static_cast<ScatterLock*>(lock));
+ break;
+
+ case CEPH_LOCK_IDFT:
+ case CEPH_LOCK_INEST:
+ if (p->is_replicated() && lock->get_state() != LOCK_MIX)
+ scatter_mix(lock);
+ else if (lock->get_state() != LOCK_LOCK)
+ simple_lock(lock);
+ else
+ simple_sync(lock);
+ break;
+ default:
+ ceph_abort();
+ }
+ ++count;
+ if (lock->is_stable() && count == 2) {
+ dout(10) << "scatter_nudge oh, stable after two cycles." << dendl;
+ // this should only realy happen when called via
+ // handle_file_lock due to AC_NUDGE, because the rest of the
+ // time we are replicated or have dirty data and won't get
+ // called. bailing here avoids an infinite loop.
+ ceph_assert(!c);
+ break;
+ }
+ } else {
+ dout(10) << "scatter_nudge auth, waiting for stable " << *lock << " on " << *p << dendl;
+ if (c)
+ lock->add_waiter(SimpleLock::WAIT_STABLE, c);
+ return;
+ }
+ }
+ } else {
+ dout(10) << "scatter_nudge replica, requesting scatter/unscatter of "
+ << *lock << " on " << *p << dendl;
+ // request unscatter?
+ mds_rank_t auth = lock->get_parent()->authority().first;
+ if (!mds->is_cluster_degraded() || mds->mdsmap->is_clientreplay_or_active_or_stopping(auth)) {
+ mds->send_message_mds(MLock::create(lock, LOCK_AC_NUDGE, mds->get_nodeid()), auth);
+ }
+
+ // wait...
+ if (c)
+ lock->add_waiter(SimpleLock::WAIT_STABLE, c);
+
+ // also, requeue, in case we had wrong auth or something
+ if (lock->is_dirty())
+ updated_scatterlocks.push_back(lock->get_updated_item());
+ }
+}
+ /*
+ * walk updated_scatterlocks from the front: drop entries that are no
+ * longer dirty, stop at the first dirty entry younger than
+ * mds_scatter_nudge_interval, and scatter_nudge() the others. at most
+ * as many entries as were queued on entry are examined; the mdlog is
+ * flushed afterwards in every case.
+ */
+ void Locker::scatter_tick()
+ {
+ dout(10) << "scatter_tick" << dendl;
+
+ // updated
+ utime_t now = ceph_clock_now();
+ int n = updated_scatterlocks.size(); // iteration budget: list length at entry
+ while (!updated_scatterlocks.empty()) {
+ ScatterLock *lock = updated_scatterlocks.front();
+
+ if (n-- == 0) break; // scatter_nudge() may requeue; avoid looping
+
+ if (!lock->is_dirty()) {
+ updated_scatterlocks.pop_front();
+ dout(10) << " removing from updated_scatterlocks "
+ << *lock << " " << *lock->get_parent() << dendl;
+ continue;
+ }
+ if (now - lock->get_update_stamp() < g_conf()->mds_scatter_nudge_interval)
+ break; // front entry is too recent; it stays queued for a later tick
+ updated_scatterlocks.pop_front();
+ scatter_nudge(lock, 0); // no context: scatter_nudge requeues the lock itself if it has to wait
+ }
+ mds->mdlog->flush();
+ }
+
+ /*
+ * start moving a stable scatterlock (parent must be auth) from LOCK or
+ * MIX towards TSYN.
+ *
+ * NOTE(review): the unconditional ceph_abort_msg() below presumably never
+ * returns, which would make the rest of this function unreachable -- confirm.
+ *
+ * need_issue: when non-null it is set to true instead of calling
+ * issue_caps() directly.
+ */
+ void Locker::scatter_tempsync(ScatterLock *lock, bool *need_issue)
+ {
+ dout(10) << "scatter_tempsync " << *lock
+ << " on " << *lock->get_parent() << dendl;
+ ceph_assert(lock->get_parent()->is_auth());
+ ceph_assert(lock->is_stable());
+
+ ceph_abort_msg("not fully implemented, at least not for filelock");
+
+ CInode *in = static_cast<CInode *>(lock->get_parent());
+
+ switch (lock->get_state()) {
+ case LOCK_SYNC: ceph_abort(); // this shouldn't happen
+ case LOCK_LOCK: lock->set_state(LOCK_LOCK_TSYN); break;
+ case LOCK_MIX: lock->set_state(LOCK_MIX_TSYN); break;
+ default: ceph_abort();
+ }
+
+ int gather = 0; // counts conditions that prevent completing the transition right now
+ if (lock->is_wrlocked())
+ gather++;
+
+ if (lock->get_cap_shift() &&
+ in->is_head() &&
+ in->issued_caps_need_gather(lock)) {
+ if (need_issue)
+ *need_issue = true;
+ else
+ issue_caps(in);
+ gather++;
+ }
+
+ if (lock->get_state() == LOCK_MIX_TSYN &&
+ in->is_replicated()) {
+ lock->init_gather();
+ send_lock_message(lock, LOCK_AC_LOCK);
+ gather++;
+ }
+
+ if (gather) {
+ in->auth_pin(lock);
+ } else {
+ // do tempsync
+ lock->set_state(LOCK_TSYN);
+ lock->finish_waiters(ScatterLock::WAIT_RD|ScatterLock::WAIT_STABLE);
+ if (lock->get_cap_shift()) {
+ if (need_issue)
+ *need_issue = true;
+ else
+ issue_caps(in);
+ }
+ }
+ }
+
+
+
+// ==========================================================================
+// local lock
+ /*
+ * take a wrlock on a local lock without waiting. the parent must be auth
+ * and the lock must currently allow a wrlock (both asserted). the lock is
+ * recorded in mut->locks as a WRLOCK op, and the insertion is asserted to
+ * have added a new entry.
+ */
+ void Locker::local_wrlock_grab(LocalLock *lock, MutationRef& mut)
+ {
+ dout(7) << "local_wrlock_grab on " << *lock
+ << " on " << *lock->get_parent() << dendl;
+
+ ceph_assert(lock->get_parent()->is_auth());
+ ceph_assert(lock->can_wrlock());
+ lock->get_wrlock(mut->get_client());
+
+ auto ret = mut->locks.emplace(lock, MutationImpl::LockOp::WRLOCK);
+ ceph_assert(ret.second);
+ }
+ /*
+ * try to wrlock a local lock for a request (parent must be auth).
+ *
+ * returns true if the wrlock was taken and recorded in mut->locks.
+ * returns false if the lock cannot be wrlocked now; in that case a
+ * C_MDS_RetryRequest for mut is queued on WAIT_WR|WAIT_STABLE.
+ */
+ bool Locker::local_wrlock_start(LocalLock *lock, MDRequestRef& mut)
+ {
+ dout(7) << "local_wrlock_start on " << *lock
+ << " on " << *lock->get_parent() << dendl;
+
+ ceph_assert(lock->get_parent()->is_auth());
+ if (lock->can_wrlock()) {
+ lock->get_wrlock(mut->get_client());
+ auto it = mut->locks.emplace_hint(mut->locks.end(), lock, MutationImpl::LockOp::WRLOCK);
+ ceph_assert(it->is_wrlock()); // the entry for this lock must be a wrlock op
+ return true;
+ } else {
+ lock->add_waiter(SimpleLock::WAIT_WR|SimpleLock::WAIT_STABLE, new C_MDS_RetryRequest(mdcache, mut));
+ return false;
+ }
+ }
+ /*
+ * release the local wrlock referenced by it (asserted to be a wrlock op)
+ * and erase it from mut->locks. once the lock has no wrlocks left, the
+ * STABLE, WR and RD waiters are woken.
+ */
+ void Locker::local_wrlock_finish(const MutationImpl::lock_iterator& it, MutationImpl *mut)
+ {
+ ceph_assert(it->is_wrlock());
+ LocalLock *lock = static_cast<LocalLock*>(it->lock); // read the lock before erase(it) below
+ dout(7) << "local_wrlock_finish on " << *lock
+ << " on " << *lock->get_parent() << dendl;
+ lock->put_wrlock();
+ mut->locks.erase(it);
+ if (lock->get_num_wrlocks() == 0) {
+ lock->finish_waiters(SimpleLock::WAIT_STABLE |
+ SimpleLock::WAIT_WR |
+ SimpleLock::WAIT_RD);
+ }
+ }
+ /*
+ * try to xlock a local lock for a request (parent must be auth).
+ *
+ * returns false, after queueing a C_MDS_RetryRequest for mut on
+ * WAIT_WR|WAIT_STABLE, when can_xlock_local() is false. otherwise the
+ * xlock is taken, recorded in mut->locks as an XLOCK op, and true is
+ * returned.
+ */
+ bool Locker::local_xlock_start(LocalLock *lock, MDRequestRef& mut)
+ {
+ dout(7) << "local_xlock_start on " << *lock
+ << " on " << *lock->get_parent() << dendl;
+
+ ceph_assert(lock->get_parent()->is_auth());
+ if (!lock->can_xlock_local()) {
+ lock->add_waiter(SimpleLock::WAIT_WR|SimpleLock::WAIT_STABLE, new C_MDS_RetryRequest(mdcache, mut));
+ return false;
+ }
+
+ lock->get_xlock(mut, mut->get_client());
+ mut->locks.emplace_hint(mut->locks.end(), lock, MutationImpl::LockOp::XLOCK);
+ return true;
+ }
+ /*
+ * release the local xlock referenced by it (asserted to be an xlock op),
+ * erase it from mut->locks, and wake the STABLE, WR and RD waiters
+ * unconditionally.
+ */
+ void Locker::local_xlock_finish(const MutationImpl::lock_iterator& it, MutationImpl *mut)
+ {
+ ceph_assert(it->is_xlock());
+ LocalLock *lock = static_cast<LocalLock*>(it->lock); // read the lock before erase(it) below
+ dout(7) << "local_xlock_finish on " << *lock
+ << " on " << *lock->get_parent() << dendl;
+ lock->put_xlock();
+ mut->locks.erase(it);
+
+ lock->finish_waiters(SimpleLock::WAIT_STABLE |
+ SimpleLock::WAIT_WR |
+ SimpleLock::WAIT_RD);
+ }
+
+
+
+// ==========================================================================
+// file lock
+
+ /*
+ * evaluate a stable file lock whose parent is auth and, if appropriate,
+ * start a transition to a different state.
+ *
+ * nothing is done while the parent is freezing or frozen. on a read-only
+ * FS the only action is to sync a lock that is not already SYNC. otherwise
+ * exactly one of these is considered, in order:
+ * excl -> mix or sync (when the loner should lose the lock)
+ * * -> excl
+ * * -> mix
+ * * -> sync
+ *
+ * need_issue: passed through to the transition helpers.
+ */
+ void Locker::file_eval(ScatterLock *lock, bool *need_issue)
+ {
+ CInode *in = static_cast<CInode*>(lock->get_parent());
+ int loner_wanted, other_wanted;
+ int wanted = in->get_caps_wanted(&loner_wanted, &other_wanted, CEPH_CAP_SFILE);
+ dout(7) << "file_eval wanted=" << gcap_string(wanted)
+ << " loner_wanted=" << gcap_string(loner_wanted)
+ << " other_wanted=" << gcap_string(other_wanted)
+ << " filelock=" << *lock << " on " << *lock->get_parent()
+ << dendl;
+
+ ceph_assert(lock->get_parent()->is_auth());
+ ceph_assert(lock->is_stable());
+
+ if (lock->get_parent()->is_freezing_or_frozen())
+ return;
+
+ if (mdcache->is_readonly()) {
+ if (lock->get_state() != LOCK_SYNC) {
+ dout(10) << "file_eval read-only FS, syncing " << *lock << " on " << *lock->get_parent() << dendl;
+ simple_sync(lock, need_issue);
+ }
+ return;
+ }
+
+ // excl -> *?
+ if (lock->get_state() == LOCK_EXCL) {
+ dout(20) << " is excl" << dendl;
+ int loner_issued, other_issued, xlocker_issued;
+ in->get_caps_issued(&loner_issued, &other_issued, &xlocker_issued, CEPH_CAP_SFILE);
+ dout(7) << "file_eval loner_issued=" << gcap_string(loner_issued)
+ << " other_issued=" << gcap_string(other_issued)
+ << " xlocker_issued=" << gcap_string(xlocker_issued)
+ << dendl;
+ if (!((loner_wanted|loner_issued) & (CEPH_CAP_GEXCL|CEPH_CAP_GWR|CEPH_CAP_GBUFFER)) ||
+ (other_wanted & (CEPH_CAP_GEXCL|CEPH_CAP_GWR|CEPH_CAP_GRD)) ||
+ (in->inode.is_dir() && in->multiple_nonstale_caps())) { // FIXME.. :/
+ dout(20) << " should lose it" << dendl;
+ // we should lose it.
+ // loner other want
+ // R R SYNC
+ // R R|W MIX
+ // R W MIX
+ // R|W R MIX
+ // R|W R|W MIX
+ // R|W W MIX
+ // W R MIX
+ // W R|W MIX
+ // W W MIX
+ // -> any writer means MIX; RD doesn't matter.
+ if (((other_wanted|loner_wanted) & CEPH_CAP_GWR) ||
+ lock->is_waiter_for(SimpleLock::WAIT_WR))
+ scatter_mix(lock, need_issue);
+ else if (!lock->is_wrlocked()) // let excl wrlocks drain first
+ simple_sync(lock, need_issue);
+ else
+ dout(10) << " waiting for wrlock to drain" << dendl;
+ }
+ }
+
+ // * -> excl?
+ else if (lock->get_state() != LOCK_EXCL && // always true here: this is the else of the EXCL test above
+ !lock->is_rdlocked() &&
+ //!lock->is_waiter_for(SimpleLock::WAIT_WR) &&
+ ((wanted & (CEPH_CAP_GWR|CEPH_CAP_GBUFFER)) ||
+ (in->inode.is_dir() && !in->has_subtree_or_exporting_dirfrag())) &&
+ in->get_target_loner() >= 0) {
+ dout(7) << "file_eval stable, bump to loner " << *lock
+ << " on " << *lock->get_parent() << dendl;
+ file_excl(lock, need_issue);
+ }
+
+ // * -> mixed?
+ else if (lock->get_state() != LOCK_MIX &&
+ !lock->is_rdlocked() &&
+ //!lock->is_waiter_for(SimpleLock::WAIT_WR) &&
+ (lock->get_scatter_wanted() ||
+ (in->get_target_loner() < 0 && (wanted & CEPH_CAP_GWR)))) {
+ dout(7) << "file_eval stable, bump to mixed " << *lock
+ << " on " << *lock->get_parent() << dendl;
+ scatter_mix(lock, need_issue);
+ }
+
+ // * -> sync?
+ else if (lock->get_state() != LOCK_SYNC &&
+ !lock->is_wrlocked() && // drain wrlocks first!
+ !lock->is_waiter_for(SimpleLock::WAIT_WR) &&
+ !(wanted & CEPH_CAP_GWR) &&
+ !((lock->get_state() == LOCK_MIX) &&
+ in->is_dir() && in->has_subtree_or_exporting_dirfrag()) // if we are a delegation point, stay where we are
+ //((wanted & CEPH_CAP_RD) ||
+ //in->is_replicated() ||
+ //lock->is_leased() ||
+ //(!loner && lock->get_state() == LOCK_EXCL)) &&
+ ) {
+ dout(7) << "file_eval stable, bump to sync " << *lock
+ << " on " << *lock->get_parent() << dendl;
+ simple_sync(lock, need_issue);
+ }
+ }
+
+
+ /*
+ * move a stable scatterlock (parent inode must be auth) to MIX.
+ *
+ * from LOCK the change is immediate: start_scatter(), send LOCK_AC_MIX with
+ * the encoded lock state to replicas if replicated, then set MIX.
+ * from SYNC, EXCL, XSYN or TSYN the lock first enters the matching *_MIX
+ * state; if anything has to be gathered the inode is auth-pinned and the
+ * function returns with the lock in that intermediate state, otherwise the
+ * change to MIX is completed here. any other starting state aborts.
+ *
+ * need_issue: when non-null it is set to true instead of calling
+ * issue_caps() directly.
+ */
+ void Locker::scatter_mix(ScatterLock *lock, bool *need_issue)
+ {
+ dout(7) << "scatter_mix " << *lock << " on " << *lock->get_parent() << dendl;
+
+ CInode *in = static_cast<CInode*>(lock->get_parent());
+ ceph_assert(in->is_auth());
+ ceph_assert(lock->is_stable());
+
+ if (lock->get_state() == LOCK_LOCK) {
+ in->start_scatter(lock);
+ if (in->is_replicated()) {
+ // data
+ bufferlist softdata;
+ lock->encode_locked_state(softdata);
+
+ // bcast to replicas
+ send_lock_message(lock, LOCK_AC_MIX, softdata);
+ }
+
+ // change lock
+ lock->set_state(LOCK_MIX);
+ lock->clear_scatter_wanted();
+ if (lock->get_cap_shift()) {
+ if (need_issue)
+ *need_issue = true;
+ else
+ issue_caps(in);
+ }
+ } else {
+ // gather?
+ switch (lock->get_state()) {
+ case LOCK_SYNC: lock->set_state(LOCK_SYNC_MIX); break;
+ case LOCK_EXCL: lock->set_state(LOCK_EXCL_MIX); break;
+ case LOCK_XSYN: lock->set_state(LOCK_XSYN_MIX); break;
+ case LOCK_TSYN: lock->set_state(LOCK_TSYN_MIX); break;
+ default: ceph_abort();
+ }
+
+ int gather = 0; // counts conditions that prevent completing the transition right now
+ if (lock->is_rdlocked())
+ gather++;
+ if (in->is_replicated()) {
+ if (lock->get_state() == LOCK_SYNC_MIX) { // for the rest states, replicas are already LOCK
+ send_lock_message(lock, LOCK_AC_MIX);
+ lock->init_gather();
+ gather++;
+ }
+ }
+ if (lock->is_leased()) {
+ revoke_client_leases(lock);
+ gather++;
+ }
+ if (lock->get_cap_shift() &&
+ in->is_head() &&
+ in->issued_caps_need_gather(lock)) {
+ if (need_issue)
+ *need_issue = true;
+ else
+ issue_caps(in);
+ gather++;
+ }
+ bool need_recover = false;
+ if (in->state_test(CInode::STATE_NEEDSRECOVER)) {
+ mds->mdcache->queue_file_recover(in);
+ need_recover = true;
+ gather++;
+ }
+
+ if (gather) {
+ lock->get_parent()->auth_pin(lock);
+ if (need_recover)
+ mds->mdcache->do_file_recover(); // started only after the auth_pin above is taken
+ } else {
+ in->start_scatter(lock);
+ lock->set_state(LOCK_MIX);
+ lock->clear_scatter_wanted();
+ if (in->is_replicated()) {
+ bufferlist softdata;
+ lock->encode_locked_state(softdata);
+ send_lock_message(lock, LOCK_AC_MIX, softdata);
+ }
+ if (lock->get_cap_shift()) {
+ if (need_issue)
+ *need_issue = true;
+ else
+ issue_caps(in);
+ }
+ }
+ }
+ }
+
+ /*
+ * move a stable file lock (parent inode must be auth) to EXCL.
+ *
+ * unless the lock is in XSYN, the inode must have a loner and no mds caps
+ * wanted (asserted). the lock enters the *_EXCL state matching its current
+ * state (SYNC, MIX, LOCK or XSYN; anything else aborts). if rdlocks,
+ * wrlocks, replicas, client leases, issued caps or file recovery have to
+ * be gathered, the inode is auth-pinned and the lock is left in that
+ * intermediate state; otherwise it is set to EXCL here and caps are issued.
+ *
+ * need_issue: when non-null it is set to true instead of calling
+ * issue_caps() directly.
+ */
+ void Locker::file_excl(ScatterLock *lock, bool *need_issue)
+ {
+ CInode *in = static_cast<CInode*>(lock->get_parent());
+ dout(7) << "file_excl " << *lock << " on " << *lock->get_parent() << dendl;
+
+ ceph_assert(in->is_auth());
+ ceph_assert(lock->is_stable());
+
+ ceph_assert((in->get_loner() >= 0 && in->get_mds_caps_wanted().empty()) ||
+ (lock->get_state() == LOCK_XSYN)); // must do xsyn -> excl -> <anything else>
+
+ switch (lock->get_state()) {
+ case LOCK_SYNC: lock->set_state(LOCK_SYNC_EXCL); break;
+ case LOCK_MIX: lock->set_state(LOCK_MIX_EXCL); break;
+ case LOCK_LOCK: lock->set_state(LOCK_LOCK_EXCL); break;
+ case LOCK_XSYN: lock->set_state(LOCK_XSYN_EXCL); break;
+ default: ceph_abort();
+ }
+ int gather = 0; // counts conditions that prevent completing the transition right now
+
+ if (lock->is_rdlocked())
+ gather++;
+ if (lock->is_wrlocked())
+ gather++;
+
+ if (in->is_replicated() &&
+ lock->get_state() != LOCK_LOCK_EXCL &&
+ lock->get_state() != LOCK_XSYN_EXCL) { // if we were lock, replicas are already lock.
+ send_lock_message(lock, LOCK_AC_LOCK);
+ lock->init_gather();
+ gather++;
+ }
+ if (lock->is_leased()) {
+ revoke_client_leases(lock);
+ gather++;
+ }
+ if (in->is_head() &&
+ in->issued_caps_need_gather(lock)) {
+ if (need_issue)
+ *need_issue = true;
+ else
+ issue_caps(in);
+ gather++;
+ }
+ bool need_recover = false;
+ if (in->state_test(CInode::STATE_NEEDSRECOVER)) {
+ mds->mdcache->queue_file_recover(in);
+ need_recover = true;
+ gather++;
+ }
+
+ if (gather) {
+ lock->get_parent()->auth_pin(lock);
+ if (need_recover)
+ mds->mdcache->do_file_recover(); // started only after the auth_pin above is taken
+ } else {
+ lock->set_state(LOCK_EXCL);
+ if (need_issue)
+ *need_issue = true;
+ else
+ issue_caps(in);
+ }
+ }
+ /*
+ * move a file lock from EXCL to XSYN (any other starting state aborts).
+ * the parent inode must be auth, have a loner, and have no mds caps
+ * wanted (all asserted).
+ *
+ * the lock first enters EXCL_XSYN. if wrlocks or issued caps have to be
+ * gathered the inode is auth-pinned and the lock is left there; otherwise
+ * it is set to XSYN, RD|STABLE waiters are woken and caps are issued.
+ *
+ * need_issue: when non-null it is set to true instead of calling
+ * issue_caps() directly.
+ */
+ void Locker::file_xsyn(SimpleLock *lock, bool *need_issue)
+ {
+ dout(7) << "file_xsyn on " << *lock << " on " << *lock->get_parent() << dendl;
+ CInode *in = static_cast<CInode *>(lock->get_parent());
+ ceph_assert(in->is_auth());
+ ceph_assert(in->get_loner() >= 0 && in->get_mds_caps_wanted().empty());
+
+ switch (lock->get_state()) {
+ case LOCK_EXCL: lock->set_state(LOCK_EXCL_XSYN); break;
+ default: ceph_abort();
+ }
+
+ int gather = 0; // counts conditions that prevent completing the transition right now
+ if (lock->is_wrlocked())
+ gather++;
+
+ if (in->is_head() &&
+ in->issued_caps_need_gather(lock)) {
+ if (need_issue)
+ *need_issue = true;
+ else
+ issue_caps(in);
+ gather++;
+ }
+
+ if (gather) {
+ lock->get_parent()->auth_pin(lock);
+ } else {
+ lock->set_state(LOCK_XSYN);
+ lock->finish_waiters(SimpleLock::WAIT_RD|SimpleLock::WAIT_STABLE);
+ if (need_issue)
+ *need_issue = true;
+ else
+ issue_caps(in);
+ }
+ }
+ /*
+ * move a file lock from PRE_SCAN to SCAN on an auth inode (both asserted).
+ *
+ * if issued caps on a head inode still need to be gathered, caps are
+ * (re)issued and the inode is flagged STATE_NEEDSRECOVER; otherwise the
+ * inode is queued for file recovery straight away.
+ */
+ void Locker::file_recover(ScatterLock *lock)
+ {
+ CInode *in = static_cast<CInode *>(lock->get_parent());
+ dout(7) << "file_recover " << *lock << " on " << *in << dendl;
+
+ ceph_assert(in->is_auth());
+ //assert(lock->is_stable());
+ ceph_assert(lock->get_state() == LOCK_PRE_SCAN); // only called from MDCache::start_files_to_recover()
+
+ int gather = 0; // nonzero when caps must be gathered before recovery can be queued
+
+ /*
+ if (in->is_replicated()
+ lock->get_sm()->states[oldstate].replica_state != LOCK_LOCK) {
+ send_lock_message(lock, LOCK_AC_LOCK);
+ lock->init_gather();
+ gather++;
+ }
+ */
+ if (in->is_head() &&
+ in->issued_caps_need_gather(lock)) {
+ issue_caps(in);
+ gather++;
+ }
+
+ lock->set_state(LOCK_SCAN);
+ if (gather)
+ in->state_set(CInode::STATE_NEEDSRECOVER);
+ else
+ mds->mdcache->queue_file_recover(in);
+ }
+
+
+ /* messenger
+ *
+ * handle an MLock message for a file/scatter lock, dispatching on
+ * m->get_action(). the cases are grouped as in the switch below:
+ * replica side: AC_SYNC, AC_LOCK, AC_LOCKFLUSHED, AC_MIX
+ * auth side: AC_LOCKACK, AC_SYNCACK, AC_MIXACK
+ * requests: AC_REQSCATTER, AC_REQUNSCATTER, AC_REQRDLOCK, AC_NUDGE
+ * any other action aborts. while this mds is in rejoin, messages for an
+ * inode that is still rejoining are dropped.
+ */
+ void Locker::handle_file_lock(ScatterLock *lock, const MLock::const_ref &m)
+ {
+ CInode *in = static_cast<CInode*>(lock->get_parent());
+ int from = m->get_asker();
+
+ if (mds->is_rejoin()) {
+ if (in->is_rejoining()) {
+ dout(7) << "handle_file_lock still rejoining " << *in
+ << ", dropping " << *m << dendl;
+ return;
+ }
+ }
+
+ dout(7) << "handle_file_lock a=" << lock->get_lock_action_name(m->get_action())
+ << " on " << *lock
+ << " from mds." << from << " "
+ << *in << dendl;
+
+ bool caps = lock->get_cap_shift(); // true when the cap shift is nonzero; gates issue_caps() below
+
+ switch (m->get_action()) {
+ // -- replica --
+ case LOCK_AC_SYNC:
+ ceph_assert(lock->get_state() == LOCK_LOCK ||
+ lock->get_state() == LOCK_MIX ||
+ lock->get_state() == LOCK_MIX_SYNC2);
+
+ if (lock->get_state() == LOCK_MIX) {
+ lock->set_state(LOCK_MIX_SYNC);
+ eval_gather(lock, true);
+ if (lock->is_unstable_and_locked())
+ mds->mdlog->flush();
+ break;
+ }
+
+ (static_cast<ScatterLock *>(lock))->finish_flush();
+ (static_cast<ScatterLock *>(lock))->clear_flushed();
+
+ // ok
+ lock->decode_locked_state(m->get_data());
+ lock->set_state(LOCK_SYNC);
+
+ lock->get_rdlock(); // held across issue_caps()/finish_waiters(), released just below
+ if (caps)
+ issue_caps(in);
+ lock->finish_waiters(SimpleLock::WAIT_RD|SimpleLock::WAIT_STABLE);
+ lock->put_rdlock();
+ break;
+
+ case LOCK_AC_LOCK:
+ switch (lock->get_state()) {
+ case LOCK_SYNC: lock->set_state(LOCK_SYNC_LOCK); break;
+ case LOCK_MIX: lock->set_state(LOCK_MIX_LOCK); break;
+ default: ceph_abort();
+ }
+
+ eval_gather(lock, true);
+ if (lock->is_unstable_and_locked())
+ mds->mdlog->flush();
+
+ break;
+
+ case LOCK_AC_LOCKFLUSHED:
+ (static_cast<ScatterLock *>(lock))->finish_flush();
+ (static_cast<ScatterLock *>(lock))->clear_flushed();
+ // wake up scatter_nudge waiters
+ if (lock->is_stable())
+ lock->finish_waiters(SimpleLock::WAIT_STABLE);
+ break;
+
+ case LOCK_AC_MIX:
+ ceph_assert(lock->get_state() == LOCK_SYNC ||
+ lock->get_state() == LOCK_LOCK ||
+ lock->get_state() == LOCK_SYNC_MIX2);
+
+ if (lock->get_state() == LOCK_SYNC) {
+ // MIXED
+ lock->set_state(LOCK_SYNC_MIX);
+ eval_gather(lock, true);
+ if (lock->is_unstable_and_locked())
+ mds->mdlog->flush();
+ break;
+ }
+
+ // ok
+ lock->set_state(LOCK_MIX);
+ lock->decode_locked_state(m->get_data());
+
+ if (caps)
+ issue_caps(in);
+
+ lock->finish_waiters(SimpleLock::WAIT_WR|SimpleLock::WAIT_STABLE);
+ break;
+
+
+ // -- auth --
+ case LOCK_AC_LOCKACK:
+ ceph_assert(lock->get_state() == LOCK_SYNC_LOCK ||
+ lock->get_state() == LOCK_MIX_LOCK ||
+ lock->get_state() == LOCK_MIX_LOCK2 ||
+ lock->get_state() == LOCK_MIX_EXCL ||
+ lock->get_state() == LOCK_SYNC_EXCL ||
+ lock->get_state() == LOCK_SYNC_MIX ||
+ lock->get_state() == LOCK_MIX_TSYN);
+ ceph_assert(lock->is_gathering(from));
+ lock->remove_gather(from);
+
+ if (lock->get_state() == LOCK_MIX_LOCK ||
+ lock->get_state() == LOCK_MIX_LOCK2 ||
+ lock->get_state() == LOCK_MIX_EXCL ||
+ lock->get_state() == LOCK_MIX_TSYN) {
+ lock->decode_locked_state(m->get_data());
+ // replica is waiting for AC_LOCKFLUSHED, eval_gather() should not
+ // delay calling scatter_writebehind().
+ lock->clear_flushed();
+ }
+
+ if (lock->is_gathering()) {
+ dout(7) << "handle_file_lock " << *in << " from " << from
+ << ", still gathering " << lock->get_gather_set() << dendl;
+ } else {
+ dout(7) << "handle_file_lock " << *in << " from " << from
+ << ", last one" << dendl;
+ eval_gather(lock);
+ }
+ break;
+
+ case LOCK_AC_SYNCACK:
+ ceph_assert(lock->get_state() == LOCK_MIX_SYNC);
+ ceph_assert(lock->is_gathering(from));
+ lock->remove_gather(from);
+
+ lock->decode_locked_state(m->get_data());
+
+ if (lock->is_gathering()) {
+ dout(7) << "handle_file_lock " << *in << " from " << from
+ << ", still gathering " << lock->get_gather_set() << dendl;
+ } else {
+ dout(7) << "handle_file_lock " << *in << " from " << from
+ << ", last one" << dendl;
+ eval_gather(lock);
+ }
+ break;
+
+ case LOCK_AC_MIXACK:
+ ceph_assert(lock->get_state() == LOCK_SYNC_MIX);
+ ceph_assert(lock->is_gathering(from));
+ lock->remove_gather(from);
+
+ if (lock->is_gathering()) {
+ dout(7) << "handle_file_lock " << *in << " from " << from
+ << ", still gathering " << lock->get_gather_set() << dendl;
+ } else {
+ dout(7) << "handle_file_lock " << *in << " from " << from
+ << ", last one" << dendl;
+ eval_gather(lock);
+ }
+ break;
+
+
+ // requests....
+ case LOCK_AC_REQSCATTER:
+ if (lock->is_stable()) {
+ /* NOTE: we can do this _even_ if !can_auth_pin (i.e. freezing)
+ * because the replica should be holding an auth_pin if they're
+ * doing this (and thus, we are freezing, not frozen, and indefinite
+ * starvation isn't an issue).
+ */
+ dout(7) << "handle_file_lock got scatter request on " << *lock
+ << " on " << *lock->get_parent() << dendl;
+ if (lock->get_state() != LOCK_MIX) // i.e., the reqscatter didn't race with an actual mix/scatter
+ scatter_mix(lock);
+ } else {
+ dout(7) << "handle_file_lock got scatter request, !stable, marking scatter_wanted on " << *lock
+ << " on " << *lock->get_parent() << dendl;
+ lock->set_scatter_wanted();
+ }
+ break;
+
+ case LOCK_AC_REQUNSCATTER:
+ if (lock->is_stable()) {
+ /* NOTE: we can do this _even_ if !can_auth_pin (i.e. freezing)
+ * because the replica should be holding an auth_pin if they're
+ * doing this (and thus, we are freezing, not frozen, and indefinite
+ * starvation isn't an issue).
+ */
+ dout(7) << "handle_file_lock got unscatter request on " << *lock
+ << " on " << *lock->get_parent() << dendl;
+ if (lock->get_state() == LOCK_MIX) // i.e., still scattered: the requnscatter didn't race with an actual unscatter
+ simple_lock(lock); // FIXME tempsync?
+ } else {
+ dout(7) << "handle_file_lock ignoring unscatter request on " << *lock
+ << " on " << *lock->get_parent() << dendl;
+ lock->set_unscatter_wanted(); // the log text says "ignoring", but the request is recorded here
+ }
+ break;
+
+ case LOCK_AC_REQRDLOCK:
+ handle_reqrdlock(lock, m);
+ break;
+
+ case LOCK_AC_NUDGE:
+ if (!lock->get_parent()->is_auth()) {
+ dout(7) << "handle_file_lock IGNORING nudge on non-auth " << *lock
+ << " on " << *lock->get_parent() << dendl;
+ } else if (!lock->get_parent()->is_replicated()) {
+ dout(7) << "handle_file_lock IGNORING nudge on non-replicated " << *lock
+ << " on " << *lock->get_parent() << dendl;
+ } else {
+ dout(7) << "handle_file_lock trying nudge on " << *lock
+ << " on " << *lock->get_parent() << dendl;
+ scatter_nudge(lock, 0, true); // no context, forcelockchange=true
+ mds->mdlog->flush();
+ }
+ break;
+
+ default:
+ ceph_abort();
+ }
+ }
diff --git a/src/mds/Locker.h b/src/mds/Locker.h
new file mode 100644
index 00000000..c4dd65ee
--- /dev/null
+++ b/src/mds/Locker.h
@@ -0,0 +1,291 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#ifndef CEPH_MDS_LOCKER_H
+#define CEPH_MDS_LOCKER_H
+
+#include "include/types.h"
+
+#include "messages/MClientCaps.h"
+#include "messages/MClientCapRelease.h"
+#include "messages/MClientLease.h"
+#include "messages/MLock.h"
+
+#include <map>
+#include <list>
+#include <set>
+#include <string_view>
+
+class MDSRank;
+class Session;
+class CDentry;
+struct SnapRealm;
+
+class Capability;
+
+class SimpleLock;
+class ScatterLock;
+class LocalLock;
+
+#include "CInode.h"
+#include "SimpleLock.h"
+#include "MDSContext.h"
+#include "Mutation.h"
+#include "messages/MClientReply.h"
+ /*
+ * Locker: declares the MDS entry points for taking, releasing and
+ * evaluating locks (simple, scatter, local and file locks), for client
+ * capabilities and file i/o related cap issuing, and for client leases.
+ * it holds non-owning-looking raw pointers to the MDSRank and MDCache it
+ * was constructed with (NOTE(review): ownership is not visible here).
+ */
+ class Locker {
+ private:
+ MDSRank *mds;
+ MDCache *mdcache;
+
+ public:
+ Locker(MDSRank *m, MDCache *c);
+
+ SimpleLock *get_lock(int lock_type, const MDSCacheObjectInfo &info);
+
+ void dispatch(const Message::const_ref &m);
+ void handle_lock(const MLock::const_ref &m);
+
+ void tick();
+
+ void nudge_log(SimpleLock *lock);
+
+ protected:
+ void send_lock_message(SimpleLock *lock, int msg);
+ void send_lock_message(SimpleLock *lock, int msg, const bufferlist &data);
+
+ // -- locks --
+ void _drop_locks(MutationImpl *mut, std::set<CInode*> *pneed_issue, bool drop_rdlocks);
+ public:
+ void include_snap_rdlocks(CInode *in, MutationImpl::LockOpVec& lov);
+ void include_snap_rdlocks_wlayout(CInode *in, MutationImpl::LockOpVec& lov,
+ file_layout_t **layout);
+
+ bool acquire_locks(MDRequestRef& mdr,
+ MutationImpl::LockOpVec& lov,
+ CInode *auth_pin_freeze=NULL,
+ bool auth_pin_nonblock=false);
+
+ void notify_freeze_waiter(MDSCacheObject *o);
+ void cancel_locking(MutationImpl *mut, std::set<CInode*> *pneed_issue);
+ void drop_locks(MutationImpl *mut, std::set<CInode*> *pneed_issue=0);
+ void set_xlocks_done(MutationImpl *mut, bool skip_dentry=false);
+ void drop_non_rdlocks(MutationImpl *mut, std::set<CInode*> *pneed_issue=0);
+ void drop_rdlocks_for_early_reply(MutationImpl *mut);
+ void drop_locks_for_fragment_unfreeze(MutationImpl *mut);
+
+ void eval_gather(SimpleLock *lock, bool first=false, bool *need_issue=0, MDSContext::vec *pfinishers=0);
+ void eval(SimpleLock *lock, bool *need_issue);
+ void eval_any(SimpleLock *lock, bool *need_issue, MDSContext::vec *pfinishers=0, bool first=false) { // unstable lock: eval_gather(); stable lock on an auth parent: eval(); otherwise nothing
+ if (!lock->is_stable())
+ eval_gather(lock, first, need_issue, pfinishers);
+ else if (lock->get_parent()->is_auth())
+ eval(lock, need_issue);
+ }
+
+ void eval_scatter_gathers(CInode *in);
+
+ void eval_cap_gather(CInode *in, std::set<CInode*> *issue_set=0);
+
+ bool eval(CInode *in, int mask, bool caps_imported=false);
+ void try_eval(MDSCacheObject *p, int mask);
+ void try_eval(SimpleLock *lock, bool *pneed_issue);
+
+ bool _rdlock_kick(SimpleLock *lock, bool as_anon);
+ bool rdlock_try(SimpleLock *lock, client_t client, MDSContext *c);
+ bool rdlock_start(SimpleLock *lock, MDRequestRef& mut, bool as_anon=false);
+ void rdlock_finish(const MutationImpl::lock_iterator& it, MutationImpl *mut, bool *pneed_issue);
+ bool can_rdlock_set(MutationImpl::LockOpVec& lov);
+ void rdlock_take_set(MutationImpl::LockOpVec& lov, MutationRef& mut);
+
+ void wrlock_force(SimpleLock *lock, MutationRef& mut);
+ bool wrlock_start(SimpleLock *lock, MDRequestRef& mut, bool nowait=false);
+ void wrlock_finish(const MutationImpl::lock_iterator& it, MutationImpl *mut, bool *pneed_issue);
+
+ void remote_wrlock_start(SimpleLock *lock, mds_rank_t target, MDRequestRef& mut);
+ void remote_wrlock_finish(const MutationImpl::lock_iterator& it, MutationImpl *mut);
+
+ bool xlock_start(SimpleLock *lock, MDRequestRef& mut);
+ void _finish_xlock(SimpleLock *lock, client_t xlocker, bool *pneed_issue);
+ void xlock_finish(const MutationImpl::lock_iterator& it, MutationImpl *mut, bool *pneed_issue);
+
+ void xlock_export(const MutationImpl::lock_iterator& it, MutationImpl *mut);
+ void xlock_import(SimpleLock *lock);
+
+
+ // simple
+ public:
+ void try_simple_eval(SimpleLock *lock);
+ bool simple_rdlock_try(SimpleLock *lock, MDSContext *con);
+ protected:
+ void simple_eval(SimpleLock *lock, bool *need_issue);
+ void handle_simple_lock(SimpleLock *lock, const MLock::const_ref &m);
+
+ public:
+ bool simple_sync(SimpleLock *lock, bool *need_issue=0);
+ protected:
+ void simple_lock(SimpleLock *lock, bool *need_issue=0);
+ void simple_excl(SimpleLock *lock, bool *need_issue=0);
+ void simple_xlock(SimpleLock *lock);
+
+
+ // scatter
+ public:
+ void scatter_eval(ScatterLock *lock, bool *need_issue); // public for MDCache::adjust_subtree_auth()
+
+ void scatter_tick();
+ void scatter_nudge(ScatterLock *lock, MDSContext *c, bool forcelockchange=false);
+
+ protected:
+ void handle_scatter_lock(ScatterLock *lock, const MLock::const_ref &m);
+ bool scatter_scatter_fastpath(ScatterLock *lock);
+ void scatter_scatter(ScatterLock *lock, bool nowait=false);
+ void scatter_tempsync(ScatterLock *lock, bool *need_issue=0);
+
+ void scatter_writebehind(ScatterLock *lock);
+
+ void scatter_writebehind_finish(ScatterLock *lock, MutationRef& mut);
+
+ xlist<ScatterLock*> updated_scatterlocks; // queue filled by mark_updated_scatterlock(), drained by scatter_tick()
+ public:
+ void mark_updated_scatterlock(ScatterLock *lock);
+
+
+ void handle_reqrdlock(SimpleLock *lock, const MLock::const_ref &m);
+
+
+
+ // caps
+
+ // when to defer processing client cap release or writeback due to being
+ // frozen. the condition must be consistent across handle_client_caps and
+ // process_request_cap_release to preserve ordering.
+ bool should_defer_client_cap_frozen(CInode *in);
+
+ void process_request_cap_release(MDRequestRef& mdr, client_t client, const ceph_mds_request_release& r,
+ std::string_view dname);
+
+ void kick_cap_releases(MDRequestRef& mdr);
+ void kick_issue_caps(CInode *in, client_t client, ceph_seq_t seq);
+
+ void remove_client_cap(CInode *in, Capability *cap, bool kill=false);
+
+ void get_late_revoking_clients(std::list<client_t> *result, double timeout) const;
+
+ private:
+ bool any_late_revoking_caps(xlist<Capability*> const &revoking, double timeout) const;
+
+ protected:
+ bool _need_flush_mdlog(CInode *in, int wanted_caps);
+ void adjust_cap_wanted(Capability *cap, int wanted, int issue_seq);
+ void handle_client_caps(const MClientCaps::const_ref &m);
+ void _update_cap_fields(CInode *in, int dirty, const MClientCaps::const_ref &m, CInode::mempool_inode *pi);
+ void _do_snap_update(CInode *in, snapid_t snap, int dirty, snapid_t follows, client_t client, const MClientCaps::const_ref &m, const MClientCaps::ref &ack);
+ void _do_null_snapflush(CInode *head_in, client_t client, snapid_t last=CEPH_NOSNAP);
+ bool _do_cap_update(CInode *in, Capability *cap, int dirty, snapid_t follows, const MClientCaps::const_ref &m,
+ const MClientCaps::ref &ack, bool *need_flush=NULL);
+ void handle_client_cap_release(const MClientCapRelease::const_ref &m);
+ void _do_cap_release(client_t client, inodeno_t ino, uint64_t cap_id, ceph_seq_t mseq, ceph_seq_t seq);
+ void caps_tick();
+
+ // Maintain a global list to quickly find if any caps are late revoking
+ xlist<Capability*> revoking_caps;
+ // Maintain a per-client list to find clients responsible for late ones quickly
+ std::map<client_t, xlist<Capability*> > revoking_caps_by_client;
+
+ elist<CInode*> need_snapflush_inodes;
+ public:
+ void snapflush_nudge(CInode *in);
+ void mark_need_snapflush_inode(CInode *in);
+ bool is_revoking_any_caps_from(client_t client);
+
+ // local
+ public:
+ void local_wrlock_grab(LocalLock *lock, MutationRef& mut);
+ protected:
+ bool local_wrlock_start(LocalLock *lock, MDRequestRef& mut);
+ void local_wrlock_finish(const MutationImpl::lock_iterator& it, MutationImpl *mut);
+ bool local_xlock_start(LocalLock *lock, MDRequestRef& mut);
+ void local_xlock_finish(const MutationImpl::lock_iterator& it, MutationImpl *mut);
+
+
+ // file
+ public:
+ void file_eval(ScatterLock *lock, bool *need_issue);
+ protected:
+ void handle_file_lock(ScatterLock *lock, const MLock::const_ref &m);
+ void scatter_mix(ScatterLock *lock, bool *need_issue=0);
+ void file_excl(ScatterLock *lock, bool *need_issue=0);
+ void file_xsyn(SimpleLock *lock, bool *need_issue=0);
+
+ public:
+ void file_recover(ScatterLock *lock);
+
+ private:
+ xlist<ScatterLock*> updated_filelocks; // NOTE(review): no use is visible in this part of the patch -- confirm it is still needed
+ public:
+ void mark_updated_Filelock(ScatterLock *lock); // NOTE(review): no definition is visible in this part of the patch -- confirm
+
+ // -- file i/o --
+ public:
+ version_t issue_file_data_version(CInode *in);
+ Capability* issue_new_caps(CInode *in, int mode, Session *session, SnapRealm *conrealm, bool is_replay);
+ int issue_caps(CInode *in, Capability *only_cap=0);
+ void issue_caps_set(std::set<CInode*>& inset);
+ void issue_truncate(CInode *in);
+ void revoke_stale_cap(CInode *in, client_t client);
+ bool revoke_stale_caps(Session *session);
+ void resume_stale_caps(Session *session);
+ void remove_stale_leases(Session *session);
+
+ public:
+ void request_inode_file_caps(CInode *in);
+ protected:
+ void handle_inode_file_caps(const MInodeFileCaps::const_ref &m);
+
+ void file_update_finish(CInode *in, MutationRef& mut, unsigned flags,
+ client_t client, const MClientCaps::ref &ack);
+ private:
+ uint64_t calc_new_max_size(CInode::mempool_inode *pi, uint64_t size);
+ public:
+ void calc_new_client_ranges(CInode *in, uint64_t size, bool update,
+ CInode::mempool_inode::client_range_map* new_ranges,
+ bool *max_increased);
+ bool check_inode_max_size(CInode *in, bool force_wrlock=false,
+ uint64_t newmax=0, uint64_t newsize=0,
+ utime_t mtime=utime_t());
+ void share_inode_max_size(CInode *in, Capability *only_cap=0);
+
+ private:
+ friend class C_MDL_CheckMaxSize;
+ friend class C_MDL_RequestInodeFileCaps;
+ friend class C_Locker_FileUpdate_finish;
+ friend class C_Locker_RetryCapRelease;
+ friend class C_Locker_Eval;
+ friend class C_Locker_ScatterWB;
+ friend class LockerContext;
+ friend class LockerLogContext;
+
+
+ // -- client leases --
+ public:
+ void handle_client_lease(const MClientLease::const_ref &m);
+
+ void issue_client_lease(CDentry *dn, client_t client, bufferlist &bl, utime_t now, Session *session);
+ void revoke_client_leases(SimpleLock *lock);
+ static void encode_lease(bufferlist& bl, const session_info_t& info, const LeaseStat& ls);
+ };
+
+
+#endif
diff --git a/src/mds/LogEvent.cc b/src/mds/LogEvent.cc
new file mode 100644
index 00000000..3e321531
--- /dev/null
+++ b/src/mds/LogEvent.cc
@@ -0,0 +1,209 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#include "common/config.h"
+#include "LogEvent.h"
+
+#include "MDSRank.h"
+
+// events i know of
+#include "events/ESubtreeMap.h"
+#include "events/EExport.h"
+#include "events/EImportStart.h"
+#include "events/EImportFinish.h"
+#include "events/EFragment.h"
+
+#include "events/EResetJournal.h"
+#include "events/ESession.h"
+#include "events/ESessions.h"
+
+#include "events/EUpdate.h"
+#include "events/ESlaveUpdate.h"
+#include "events/EOpen.h"
+#include "events/ECommitted.h"
+
+#include "events/ETableClient.h"
+#include "events/ETableServer.h"
+
+#include "events/ENoOp.h"
+
+#define dout_context g_ceph_context
+
+
+std::unique_ptr<LogEvent> LogEvent::decode_event(bufferlist::const_iterator p)
+{
+  using ceph::decode;
+
+  // The leading word is either a concrete event type (classic encoding) or
+  // the EVENT_NEW_ENCODING marker announcing a versioned envelope.
+  EventType type;
+  decode(type, p);
+
+  if (type != EVENT_NEW_ENCODING) {
+    // classic encoding: the payload follows the type directly
+    return decode_event(p, type);
+  }
+
+  std::unique_ptr<LogEvent> decoded;
+  try {
+    DECODE_START(1, p);
+    decode(type, p);  // the actual event type is stored inside the envelope
+    decoded = decode_event(p, type);
+    DECODE_FINISH(p);
+  } catch (const buffer::error &e) {
+    generic_dout(0) << "failed to decode LogEvent (type maybe " << type << ")" << dendl;
+    return nullptr;
+  }
+  return decoded;
+}
+
+
+std::string_view LogEvent::get_type_str() const
+{
+  // Translate the numeric event type into its symbolic name.  Values that
+  // have no name are logged and reported as "UNKNOWN".
+  const char *name = nullptr;
+
+  switch (_type) {
+  case EVENT_SUBTREEMAP:
+    name = "SUBTREEMAP";
+    break;
+  case EVENT_EXPORT:
+    name = "EXPORT";
+    break;
+  case EVENT_IMPORTSTART:
+    name = "IMPORTSTART";
+    break;
+  case EVENT_IMPORTFINISH:
+    name = "IMPORTFINISH";
+    break;
+  case EVENT_FRAGMENT:
+    name = "FRAGMENT";
+    break;
+  case EVENT_RESETJOURNAL:
+    name = "RESETJOURNAL";
+    break;
+  case EVENT_SESSION:
+    name = "SESSION";
+    break;
+  case EVENT_SESSIONS_OLD:
+    name = "SESSIONS_OLD";
+    break;
+  case EVENT_SESSIONS:
+    name = "SESSIONS";
+    break;
+  case EVENT_UPDATE:
+    name = "UPDATE";
+    break;
+  case EVENT_SLAVEUPDATE:
+    name = "SLAVEUPDATE";
+    break;
+  case EVENT_OPEN:
+    name = "OPEN";
+    break;
+  case EVENT_COMMITTED:
+    name = "COMMITTED";
+    break;
+  case EVENT_TABLECLIENT:
+    name = "TABLECLIENT";
+    break;
+  case EVENT_TABLESERVER:
+    name = "TABLESERVER";
+    break;
+  case EVENT_SUBTREEMAP_TEST:
+    name = "SUBTREEMAP_TEST";
+    break;
+  case EVENT_NOOP:
+    name = "NOOP";
+    break;
+  default:
+    break;
+  }
+
+  if (name != nullptr) {
+    return name;
+  }
+
+  generic_dout(0) << "get_type_str: unknown type " << _type << dendl;
+  return "UNKNOWN";
+}
+
+const std::map<std::string, LogEvent::EventType> LogEvent::types = { // event name -> numeric type; looked up by str_to_type(), names match get_type_str()
+ {"SUBTREEMAP", EVENT_SUBTREEMAP},
+ {"SUBTREEMAP_TEST", EVENT_SUBTREEMAP_TEST},
+ {"EXPORT", EVENT_EXPORT},
+ {"IMPORTSTART", EVENT_IMPORTSTART},
+ {"IMPORTFINISH", EVENT_IMPORTFINISH},
+ {"FRAGMENT", EVENT_FRAGMENT},
+ {"RESETJOURNAL", EVENT_RESETJOURNAL},
+ {"SESSION", EVENT_SESSION},
+ {"SESSIONS_OLD", EVENT_SESSIONS_OLD},
+ {"SESSIONS", EVENT_SESSIONS},
+ {"UPDATE", EVENT_UPDATE},
+ {"SLAVEUPDATE", EVENT_SLAVEUPDATE},
+ {"OPEN", EVENT_OPEN},
+ {"COMMITTED", EVENT_COMMITTED},
+ {"TABLECLIENT", EVENT_TABLECLIENT},
+ {"TABLESERVER", EVENT_TABLESERVER},
+ {"NOOP", EVENT_NOOP}
+};
+
+/*
+ * Resolve type string to type enum
+ *
+ * Throws std::out_of_range (from std::map::at) if the name is not in LogEvent::types; no -1 sentinel is returned (EventType is unsigned)
+ */
+LogEvent::EventType LogEvent::str_to_type(std::string_view str)
+{
+ return LogEvent::types.at(std::string(str));
+}
+
+
+std::unique_ptr<LogEvent> LogEvent::decode_event(bufferlist::const_iterator& p, LogEvent::EventType type)
+{ // Build the concrete event object for `type` and decode its payload from p; returns nullptr on unknown type or decode error.
+ const auto length = p.get_remaining();
+ generic_dout(15) << "decode_log_event type " << type << ", size " << length << dendl;
+
+ // create event: pick the concrete LogEvent subclass for this type
+ std::unique_ptr<LogEvent> le;
+ switch (type) {
+ case EVENT_SUBTREEMAP:
+ le = std::make_unique<ESubtreeMap>();
+ break;
+ case EVENT_SUBTREEMAP_TEST:
+ le = std::make_unique<ESubtreeMap>();
+ le->set_type(type); // same class as SUBTREEMAP; record the TEST type on the instance
+ break;
+ case EVENT_EXPORT:
+ le = std::make_unique<EExport>();
+ break;
+ case EVENT_IMPORTSTART:
+ le = std::make_unique<EImportStart>();
+ break;
+ case EVENT_IMPORTFINISH:
+ le = std::make_unique<EImportFinish>();
+ break;
+ case EVENT_FRAGMENT:
+ le = std::make_unique<EFragment>();
+ break;
+ case EVENT_RESETJOURNAL:
+ le = std::make_unique<EResetJournal>();
+ break;
+ case EVENT_SESSION:
+ le = std::make_unique<ESession>();
+ break;
+ case EVENT_SESSIONS_OLD:
+ {
+ auto e = std::make_unique<ESessions>();
+ e->mark_old_encoding(); // flag the event before decode; presumably selects the legacy layout -- see ESessions
+ le = std::move(e);
+ }
+ break;
+ case EVENT_SESSIONS:
+ le = std::make_unique<ESessions>();
+ break;
+ case EVENT_UPDATE:
+ le = std::make_unique<EUpdate>();
+ break;
+ case EVENT_SLAVEUPDATE:
+ le = std::make_unique<ESlaveUpdate>();
+ break;
+ case EVENT_OPEN:
+ le = std::make_unique<EOpen>();
+ break;
+ case EVENT_COMMITTED:
+ le = std::make_unique<ECommitted>();
+ break;
+ case EVENT_TABLECLIENT:
+ le = std::make_unique<ETableClient>();
+ break;
+ case EVENT_TABLESERVER:
+ le = std::make_unique<ETableServer>();
+ break;
+ case EVENT_NOOP:
+ le = std::make_unique<ENoOp>();
+ break;
+ default:
+ generic_dout(0) << "uh oh, unknown log event type " << type << " length " << length << dendl;
+ return nullptr;
+ }
+
+ // decode: a buffer::error from the subclass decoder is logged and turned into nullptr
+ try {
+ le->decode(p);
+ }
+ catch (const buffer::error &e) {
+ generic_dout(0) << "failed to decode LogEvent type " << type << dendl;
+ return nullptr;
+ }
+
+ ceph_assert(p.end()); // the payload must have consumed the iterator completely
+ return le;
+}
+
diff --git a/src/mds/LogEvent.h b/src/mds/LogEvent.h
new file mode 100644
index 00000000..7c7273f8
--- /dev/null
+++ b/src/mds/LogEvent.h
@@ -0,0 +1,132 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#ifndef CEPH_LOGEVENT_H
+#define CEPH_LOGEVENT_H
+
+#define EVENT_NEW_ENCODING 0 // indicates that the encoding is versioned
+#define EVENT_UNUSED 1 // was previously EVENT_STRING
+
+#define EVENT_SUBTREEMAP 2
+#define EVENT_EXPORT 3
+#define EVENT_IMPORTSTART 4
+#define EVENT_IMPORTFINISH 5
+#define EVENT_FRAGMENT 6
+
+#define EVENT_RESETJOURNAL 9
+
+#define EVENT_SESSION 10
+#define EVENT_SESSIONS_OLD 11
+#define EVENT_SESSIONS 12
+
+#define EVENT_UPDATE 20
+#define EVENT_SLAVEUPDATE 21
+#define EVENT_OPEN 22
+#define EVENT_COMMITTED 23
+
+#define EVENT_TABLECLIENT 42
+#define EVENT_TABLESERVER 43
+
+#define EVENT_SUBTREEMAP_TEST 50
+#define EVENT_NOOP 51
+
+
+#include "include/buffer_fwd.h"
+#include "include/Context.h"
+#include "include/utime.h"
+
+class MDSRank;
+class LogSegment;
+class EMetaBlob;
+
+// generic log event: abstract base class; subclasses supply encode(), decode() and dump()
+class LogEvent {
+public:
+ friend class MDLog;
+ typedef __u32 EventType; // holds one of the EVENT_* codes defined above
+
+ LogEvent() = delete;
+ explicit LogEvent(int t) : _type(t) {}
+ LogEvent(const LogEvent&) = delete;
+ LogEvent& operator=(const LogEvent&) = delete;
+ virtual ~LogEvent() {}
+
+ std::string_view get_type_str() const;
+ static EventType str_to_type(std::string_view str);
+ EventType get_type() const { return _type; }
+ void set_type(EventType t) { _type = t; }
+
+ uint64_t get_start_off() const { return _start_off; }
+ void set_start_off(uint64_t o) { _start_off = o; }
+
+ utime_t get_stamp() const { return stamp; }
+ void set_stamp(utime_t t) { stamp = t; }
+
+ // encoding
+ virtual void encode(bufferlist& bl, uint64_t features) const = 0;
+ virtual void decode(bufferlist::const_iterator &) = 0;
+ static std::unique_ptr<LogEvent> decode_event(bufferlist::const_iterator); // entry point: reads the type header, then dispatches to the private overload
+ virtual void dump(Formatter *f) const = 0;
+
+ void encode_with_header(bufferlist& bl, uint64_t features) { // writes EVENT_NEW_ENCODING, then a versioned section holding _type and the subclass payload
+ using ceph::encode;
+ encode(EVENT_NEW_ENCODING, bl);
+ ENCODE_START(1, 1, bl)
+ encode(_type, bl);
+ this->encode(bl, features);
+ ENCODE_FINISH(bl);
+ }
+
+ virtual void print(ostream& out) const {
+ out << "event(" << _type << ")";
+ }
+
+ /*** live journal ***/
+ /* update_segment() - adjust any state we need to in the LogSegment
+ */
+ virtual void update_segment() { }
+
+ /*** recovery ***/
+ /* replay() - replay given event. this is idempotent.
+ */
+ virtual void replay(MDSRank *m) { ceph_abort(); } // base implementation aborts if it is ever called
+
+ /**
+ * If the subclass embeds a MetaBlob, return it here so that
+ * tools can examine metablobs while traversing lists of LogEvent.
+ */
+ virtual EMetaBlob *get_metablob() { return NULL; }
+
+protected:
+ utime_t stamp;
+
+ LogSegment* get_segment() { return _segment; }
+ LogSegment const* get_segment() const { return _segment; }
+
+private:
+ static const std::map<std::string, LogEvent::EventType> types; // name -> type table used by str_to_type()
+
+ static std::unique_ptr<LogEvent> decode_event(bufferlist::const_iterator&, EventType); // builds and decodes the concrete subclass for a known type
+
+ EventType _type = 0;
+ uint64_t _start_off = 0;
+ LogSegment *_segment = nullptr; // not assigned anywhere in this class; MDLog is a friend (presumably sets it -- confirm)
+};
+
+inline ostream& operator<<(ostream& out, const LogEvent &le) { // stream an event through its virtual print()
+ le.print(out);
+ return out;
+}
+
+#endif
diff --git a/src/mds/LogSegment.h b/src/mds/LogSegment.h
new file mode 100644
index 00000000..c1c8e7ea
--- /dev/null
+++ b/src/mds/LogSegment.h
@@ -0,0 +1,98 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#ifndef CEPH_LOGSEGMENT_H
+#define CEPH_LOGSEGMENT_H
+
+#include "include/elist.h"
+#include "include/interval_set.h"
+#include "include/Context.h"
+#include "MDSContext.h"
+#include "mdstypes.h"
+#include "CInode.h"
+#include "CDentry.h"
+#include "CDir.h"
+
+#include "include/unordered_set.h"
+
+using ceph::unordered_set;
+
+class CDir;
+class CInode;
+class CDentry;
+class MDSRank;
+struct MDSlaveUpdate;
+
+class LogSegment { // per-segment bookkeeping: intrusive lists of dirty/open cache objects, uncommitted ids, table versions and expiry waiters
+ public:
+ using seq_t = uint64_t;
+
+ LogSegment(uint64_t _seq, loff_t off=-1) : // each elist is bound to the matching intrusive list item inside CDir/CInode/CDentry
+ seq(_seq), offset(off), end(off), // offset/end are uint64_t, so the default off=-1 is stored as the maximum uint64_t value
+ dirty_dirfrags(member_offset(CDir, item_dirty)),
+ new_dirfrags(member_offset(CDir, item_new)),
+ dirty_inodes(member_offset(CInode, item_dirty)),
+ dirty_dentries(member_offset(CDentry, item_dirty)),
+ open_files(member_offset(CInode, item_open_file)),
+ dirty_parent_inodes(member_offset(CInode, item_dirty_parent)),
+ dirty_dirfrag_dir(member_offset(CInode, item_dirty_dirfrag_dir)),
+ dirty_dirfrag_nest(member_offset(CInode, item_dirty_dirfrag_nest)),
+ dirty_dirfrag_dirfragtree(member_offset(CInode, item_dirty_dirfrag_dirfragtree))
+ {}
+
+ void try_to_expire(MDSRank *mds, MDSGatherBuilder &gather_bld, int op_prio);
+
+ void wait_for_expiry(MDSContext *c) // queue a non-null context on expiry_waiters
+ {
+ ceph_assert(c != NULL);
+ expiry_waiters.push_back(c);
+ }
+
+ const seq_t seq;
+ uint64_t offset, end;
+ int num_events = 0;
+
+ // dirty items
+ elist<CDir*> dirty_dirfrags, new_dirfrags;
+ elist<CInode*> dirty_inodes;
+ elist<CDentry*> dirty_dentries;
+
+ elist<CInode*> open_files;
+ elist<CInode*> dirty_parent_inodes;
+ elist<CInode*> dirty_dirfrag_dir;
+ elist<CInode*> dirty_dirfrag_nest;
+ elist<CInode*> dirty_dirfrag_dirfragtree;
+
+ set<CInode*> truncating_inodes;
+
+ map<int, ceph::unordered_set<version_t> > pending_commit_tids; // mdstable
+ set<metareqid_t> uncommitted_masters;
+ set<metareqid_t> uncommitted_slaves;
+ set<dirfrag_t> uncommitted_fragments;
+
+ // client request ids
+ map<int, ceph_tid_t> last_client_tids;
+
+ // potentially dirty sessions
+ std::set<entity_name_t> touched_sessions;
+
+ // table version
+ version_t inotablev = 0;
+ version_t sessionmapv = 0;
+ map<int,version_t> tablev;
+
+ MDSContext::vec expiry_waiters; // contexts registered through wait_for_expiry()
+};
+
+#endif
diff --git a/src/mds/MDBalancer.cc b/src/mds/MDBalancer.cc
new file mode 100644
index 00000000..b6f6bbc1
--- /dev/null
+++ b/src/mds/MDBalancer.cc
@@ -0,0 +1,1456 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#include "include/compat.h"
+#include "mdstypes.h"
+
+#include "mon/MonClient.h"
+#include "MDBalancer.h"
+#include "MDSRank.h"
+#include "MDSMap.h"
+#include "CInode.h"
+#include "CDir.h"
+#include "MDCache.h"
+#include "Migrator.h"
+#include "Mantle.h"
+
+#include "include/Context.h"
+#include "msg/Messenger.h"
+
+#include <fstream>
+#include <iostream>
+#include <vector>
+#include <map>
+using std::map;
+using std::vector;
+using std::chrono::duration_cast;
+
+#include "common/config.h"
+#include "common/errno.h"
+
+#define dout_context g_ceph_context
+#undef dout_prefix
+#define dout_prefix *_dout << "mds." << mds->get_nodeid() << ".bal "
+#undef dout
+#define dout(lvl) \
+ do {\
+ auto subsys = ceph_subsys_mds;\
+ if ((dout_context)->_conf->subsys.should_gather(ceph_subsys_mds_balancer, lvl)) {\
+ subsys = ceph_subsys_mds_balancer;\
+ }\
+ dout_impl(dout_context, ceph::dout::need_dynamic(subsys), lvl) dout_prefix
+#undef dendl
+#define dendl dendl_impl; } while (0)
+
+
+#define MIN_LOAD 50 // ??
+#define MIN_REEXPORT 5 // will automatically reexport
+#define MIN_OFFLOAD 10 // point at which i stop trying, close enough
+
+
+int MDBalancer::proc_message(const Message::const_ref &m)
+{
+  // Heartbeats are the only message type handled here; anything else is
+  // reported through derr and aborts the daemon.
+  if (m->get_type() == MSG_MDS_HEARTBEAT) {
+    handle_heartbeat(MHeartbeat::msgref_cast(m));
+    return 0;
+  }
+
+  derr << " balancer unknown message " << m->get_type() << dendl_impl;
+  ceph_abort_msg("balancer unknown message");
+
+  return 0;
+}
+
+MDBalancer::MDBalancer(MDSRank *m, Messenger *msgr, MonClient *monc) :
+ mds(m), messenger(msgr), mon_client(monc)
+{ // cache the two fragmentation options; handle_conf_change() re-reads them when they change
+ bal_fragment_dirs = g_conf().get_val<bool>("mds_bal_fragment_dirs");
+ bal_fragment_interval = g_conf().get_val<int64_t>("mds_bal_fragment_interval");
+}
+
+void MDBalancer::handle_conf_change(const std::set<std::string>& changed, const MDSMap& mds_map)
+{
+  // Refresh only those cached options that are listed in `changed`.
+  const auto touched = [&changed](const char *key) {
+    return changed.count(key) != 0;
+  };
+
+  if (touched("mds_bal_fragment_dirs")) {
+    bal_fragment_dirs = g_conf().get_val<bool>("mds_bal_fragment_dirs");
+  }
+  if (touched("mds_bal_fragment_interval")) {
+    bal_fragment_interval = g_conf().get_val<int64_t>("mds_bal_fragment_interval");
+  }
+}
+
+void MDBalancer::handle_export_pins(void)
+{ // Process the queued export pins, then export auth subtrees whose pin names a different in-range rank.
+ auto &q = mds->mdcache->export_pin_queue;
+ auto it = q.begin();
+ dout(20) << "export_pin_queue size=" << q.size() << dendl;
+ while (it != q.end()) {
+ auto cur = it++; // advance before a possible q.erase(cur) below
+ CInode *in = *cur;
+ ceph_assert(in->is_dir());
+ mds_rank_t export_pin = in->get_export_pin(false);
+ if (export_pin >= mds->mdsmap->get_max_mds()) { // pinned to a rank at or beyond max_mds: move the inode to the delayed queue
+ dout(20) << " delay export pin on " << *in << dendl;
+ in->state_clear(CInode::STATE_QUEUEDEXPORTPIN);
+ q.erase(cur);
+
+ in->state_set(CInode::STATE_DELAYEDEXPORTPIN);
+ mds->mdcache->export_pin_delayed_queue.insert(in);
+ continue;
+ }
+
+ bool remove = true; // cleared whenever some dirfrag still needs another pass
+ list<CDir*> dfls;
+ in->get_dirfrags(dfls);
+ for (auto dir : dfls) {
+ if (!dir->is_auth())
+ continue;
+
+ if (export_pin == MDS_RANK_NONE) { // no pin: drop the aux-subtree marking if present
+ if (dir->state_test(CDir::STATE_AUXSUBTREE)) {
+ if (dir->is_frozen() || dir->is_freezing()) {
+ // try again later
+ remove = false;
+ continue;
+ }
+ dout(10) << " clear auxsubtree on " << *dir << dendl;
+ dir->state_clear(CDir::STATE_AUXSUBTREE);
+ mds->mdcache->try_subtree_merge(dir);
+ }
+ } else if (export_pin == mds->get_nodeid()) { // pinned to this rank: make sure the dirfrag is an aux subtree here
+ if (dir->state_test(CDir::STATE_CREATING) ||
+ dir->is_frozen() || dir->is_freezing()) {
+ // try again later
+ remove = false;
+ continue;
+ }
+ if (!dir->is_subtree_root()) {
+ dir->state_set(CDir::STATE_AUXSUBTREE);
+ mds->mdcache->adjust_subtree_auth(dir, mds->get_nodeid());
+ dout(10) << " create aux subtree on " << *dir << dendl;
+ } else if (!dir->state_test(CDir::STATE_AUXSUBTREE)) {
+ dout(10) << " set auxsubtree bit on " << *dir << dendl;
+ dir->state_set(CDir::STATE_AUXSUBTREE);
+ }
+ } else { // pinned to another rank: request the export and keep the inode queued
+ mds->mdcache->migrator->export_dir(dir, export_pin);
+ remove = false;
+ }
+ }
+
+ if (remove) {
+ in->state_clear(CInode::STATE_QUEUEDEXPORTPIN);
+ q.erase(cur);
+ }
+ }
+
+ std::vector<CDir *> authsubs = mds->mdcache->get_auth_subtrees();
+ bool print_auth_subtrees = true;
+
+ if (authsubs.size() > AUTH_TREES_THRESHOLD &&
+ !g_conf()->subsys.should_gather<ceph_subsys_mds, 25>()) { // many subtrees and level-25 mds logging is off: skip the per-tree log lines
+ dout(15) << "number of auth trees = " << authsubs.size() << "; not "
+ "printing auth trees" << dendl;
+ print_auth_subtrees = false;
+ }
+
+ for (auto &cd : authsubs) {
+ mds_rank_t export_pin = cd->inode->get_export_pin();
+
+ if (print_auth_subtrees) {
+ dout(25) << "auth tree " << *cd << " export_pin=" << export_pin <<
+ dendl;
+ }
+
+ if (export_pin >= 0 && export_pin < mds->mdsmap->get_max_mds()
+ && export_pin != mds->get_nodeid()) { // valid pin that names some other rank
+ mds->mdcache->migrator->export_dir(cd, export_pin);
+ }
+ }
+}
+
+void MDBalancer::tick()
+{ // Per-tick balancer work: export pins, sample logging and, on rank 0 only, starting a heartbeat round.
+ static int num_bal_times = g_conf()->mds_bal_max; // function-local static: read from config on the first call, decremented per heartbeat sent
+ auto bal_interval = g_conf().get_val<int64_t>("mds_bal_interval");
+ auto bal_max_until = g_conf().get_val<int64_t>("mds_bal_max_until");
+ time now = clock::now();
+
+ if (g_conf()->mds_bal_export_pin) {
+ handle_export_pins();
+ }
+
+ // sample? (this branch only logs and refreshes last_sample)
+ if (chrono::duration<double>(now-last_sample).count() >
+ g_conf()->mds_bal_sample_interval) {
+ dout(15) << "tick last_sample now " << now << dendl;
+ last_sample = now;
+ }
+
+ // We can use duration_cast below, although the result is an int,
+ // because the values from g_conf are also integers.
+ // balance?
+ if (mds->get_nodeid() == 0
+ && mds->is_active()
+ && bal_interval > 0
+ && duration_cast<chrono::seconds>(now - last_heartbeat).count() >= bal_interval
+ && (num_bal_times || (bal_max_until >= 0 && mds->get_uptime().count() > bal_max_until))) {
+ last_heartbeat = now;
+ send_heartbeat();
+ num_bal_times--;
+ }
+
+ mds->mdcache->show_subtrees(10, true);
+}
+
+
+
+
+class C_Bal_SendHeartbeat : public MDSInternalContext { // context that calls send_heartbeat() on completion; send_heartbeat() queues it via wait_for_open()
+public:
+ explicit C_Bal_SendHeartbeat(MDSRank *mds_) : MDSInternalContext(mds_) { }
+ void finish(int f) override { // the completion code f is ignored
+ mds->balancer->send_heartbeat();
+ }
+};
+
+
+double mds_load_t::mds_load() const
+{
+  // Collapse the load sample into a single number; the formula is chosen
+  // by the mds_bal_mode option.
+  const auto mode = g_conf()->mds_bal_mode;
+
+  if (mode == 0) {
+    // metadata load of auth and all, plus request rate and queue length
+    return
+      .8 * auth.meta_load() +
+      .2 * all.meta_load() +
+      req_rate +
+      10.0 * queue_len;
+  }
+
+  if (mode == 1) {
+    return req_rate + 10.0*queue_len;
+  }
+
+  if (mode == 2) {
+    return cpu_load_avg;
+  }
+
+  // no other mode is handled
+  ceph_abort();
+  return 0;
+}
+
+mds_load_t MDBalancer::get_load()
+{ // Assemble this rank's load sample: popularity of the root dirfrags, request rate, CPU rate and dispatch queue length.
+ auto now = clock::now();
+
+ mds_load_t load{DecayRate()}; /* zero DecayRate! */
+
+ if (mds->mdcache->get_root()) {
+ list<CDir*> ls;
+ mds->mdcache->get_root()->get_dirfrags(ls);
+ for (auto &d : ls) {
+ load.auth.add(d->pop_auth_subtree_nested);
+ load.all.add(d->pop_nested);
+ }
+ } else {
+ dout(20) << "get_load no root, no load" << dendl;
+ }
+
+ uint64_t num_requests = mds->get_num_requests();
+
+ uint64_t cpu_time = 1; // value used when the stat file cannot be opened or has too few fields
+ {
+ string stat_path = PROCPREFIX "/proc/self/stat";
+ ifstream stat_file(stat_path);
+ if (stat_file.is_open()) {
+ vector<string> stat_vec(std::istream_iterator<string>{stat_file},
+ std::istream_iterator<string>()); // whitespace-separated tokens of the whole file
+ if (stat_vec.size() >= 15) {
+ // utime + stime (tokens 13 and 14, zero-based)
+ cpu_time = strtoll(stat_vec[13].c_str(), nullptr, 10) +
+ strtoll(stat_vec[14].c_str(), nullptr, 10);
+ } else {
+ derr << "input file '" << stat_path << "' not resolvable" << dendl_impl;
+ }
+ } else {
+ derr << "input file '" << stat_path << "' not found" << dendl_impl;
+ }
+ }
+
+ load.queue_len = messenger->get_dispatch_queue_len();
+
+ bool update_last = true;
+ if (last_get_load != clock::zero() &&
+ now > last_get_load) {
+ double el = std::chrono::duration<double>(now-last_get_load).count();
+ if (el >= 1.0) { // at least one second since the baseline: compute fresh rates
+ if (num_requests > last_num_requests)
+ load.req_rate = (num_requests - last_num_requests) / el;
+ if (cpu_time > last_cpu_time)
+ load.cpu_load_avg = (cpu_time - last_cpu_time) / el;
+ } else { // less than a second: reuse the rates stored for this rank in mds_load, if any
+ auto p = mds_load.find(mds->get_nodeid());
+ if (p != mds_load.end()) {
+ load.req_rate = p->second.req_rate;
+ load.cpu_load_avg = p->second.cpu_load_avg;
+ }
+ if (num_requests >= last_num_requests && cpu_time >= last_cpu_time)
+ update_last = false; // counters did not go backwards: keep the old baseline
+ }
+ }
+
+ if (update_last) {
+ last_num_requests = num_requests;
+ last_cpu_time = cpu_time;
+ last_get_load = now;
+ }
+
+ dout(15) << "get_load " << load << dendl;
+ return load;
+}
+
+/*
+ * Read synchronously from RADOS using a timeout. We cannot do daemon-local
+ * fallbacks (i.e. kick off async read when we are processing the map and
+ * check status when we get here) with the way the mds is structured.
+ *
+ * Reads the object named by the MDSMap balancer string from the metadata
+ * pool. On success the contents are stored in bal_code and the object name
+ * in bal_version.
+ *
+ * Returns 0 on success, -ETIMEDOUT if the read has not completed when the
+ * wait ends (the read is then cancelled), or the non-zero value the read
+ * completion stored in r.
+ */
+int MDBalancer::localize_balancer()
+{
+  /* reset everything */
+  bool ack = false;
+  int r = 0;
+  bufferlist lua_src;
+  Mutex lock("lock");
+  Cond cond;
+
+  /* we assume that balancer is in the metadata pool */
+  object_t oid = object_t(mds->mdsmap->get_balancer());
+  object_locator_t oloc(mds->mdsmap->get_metadata_pool());
+  ceph_tid_t tid = mds->objecter->read(oid, oloc, 0, 0, CEPH_NOSNAP, &lua_src, 0,
+                                       new C_SafeCond(&lock, &cond, &ack, &r));
+  dout(15) << "launched non-blocking read tid=" << tid
+           << " oid=" << oid << " oloc=" << oloc << dendl;
+
+  /* timeout: if we waste half our time waiting for RADOS, then abort! */
+  auto bal_interval = g_conf().get_val<int64_t>("mds_bal_interval");
+
+  /*
+   * Wait on the completion flag, not on the bare condition variable.  A
+   * signal sent before we start waiting is not remembered, so waiting
+   * unconditionally would stall for the whole interval and then report a
+   * timeout for a read that had already finished.  A wait may also return
+   * without the completion having run, so re-check the flag after each
+   * wakeup.  Each such wakeup restarts the interval.
+   * NOTE(review): assumes C_SafeCond sets `ack` (and `r`) while holding
+   * `lock` before signalling `cond` -- confirm.
+   */
+  lock.Lock();
+  int ret_t = 0;
+  while (!ack && ret_t == 0) {
+    ret_t = cond.WaitInterval(lock, utime_t(bal_interval / 2, 0));
+  }
+  const bool completed = ack;
+  lock.Unlock();
+
+  if (!completed) {
+    /*
+     * The completion has not run: give up on the read.
+     * NOTE(review): the completion holds pointers to the locals above; this
+     * relies on op_cancel() preventing it from running after we return --
+     * confirm.
+     */
+    mds->objecter->op_cancel(tid, -ECANCELED);
+    return -ETIMEDOUT;
+  }
+
+  /* success: store the balancer in memory and set the version. */
+  if (!r) {
+    bal_code.assign(lua_src.to_str());
+    bal_version.assign(oid.name);
+    dout(10) << "localized balancer, bal_code=" << bal_code << dendl;
+  }
+  return r;
+}
+
+void MDBalancer::send_heartbeat()
+{ // Compute this rank's load and import map, record them locally, and send them in an MHeartbeat to every other up rank.
+ if (mds->is_cluster_degraded()) {
+ dout(10) << "send_heartbeat degraded" << dendl;
+ return;
+ }
+
+ if (!mds->mdcache->is_open()) { // cache not open yet: retry from a C_Bal_SendHeartbeat once it is
+ dout(5) << "not open" << dendl;
+ mds->mdcache->wait_for_open(new C_Bal_SendHeartbeat(mds));
+ return;
+ }
+
+ if (mds->get_nodeid() == 0) { // rank 0 starts a new epoch and forgets the previously collected loads
+ beat_epoch++;
+ mds_load.clear();
+ }
+
+ // my load
+ mds_load_t load = get_load();
+ mds->logger->set(l_mds_load_cent, 100 * load.mds_load());
+ mds->logger->set(l_mds_dispatch_queue_len, load.queue_len);
+
+ auto em = mds_load.emplace(std::piecewise_construct, std::forward_as_tuple(mds->get_nodeid()), std::forward_as_tuple(load));
+ if (!em.second) { // an entry for this rank already existed: overwrite it
+ em.first->second = load;
+ }
+
+ // import_map -- how much do i import from whom
+ map<mds_rank_t, float> import_map;
+ for (auto& im : mds->mdcache->get_auth_subtrees()) {
+ mds_rank_t from = im->inode->authority().first;
+ if (from == mds->get_nodeid()) continue; // the subtree's inode is under this rank's authority: not an import
+ if (im->get_inode()->is_stray()) continue;
+ import_map[from] += im->pop_auth_subtree.meta_load();
+ }
+ mds_import_map[ mds->get_nodeid() ] = import_map;
+
+
+ dout(5) << "mds." << mds->get_nodeid() << " epoch " << beat_epoch << " load " << load << dendl;
+ for (map<mds_rank_t, float>::iterator it = import_map.begin();
+ it != import_map.end();
+ ++it) {
+ dout(5) << " import_map from " << it->first << " -> " << it->second << dendl;
+ }
+
+
+ set<mds_rank_t> up;
+ mds->get_mds_map()->get_up_mds_set(up);
+ for (const auto& r : up) {
+ if (r == mds->get_nodeid())
+ continue;
+ auto hb = MHeartbeat::create(load, beat_epoch);
+ hb->get_import_map() = import_map;
+ mds->send_message_mds(hb, r);
+ }
+}
+
+void MDBalancer::handle_heartbeat(const MHeartbeat::const_ref &m)
+{ // Record the sender's load and import map; once mds_load has one entry per `in` rank, start a rebalance.
+ mds_rank_t who = mds_rank_t(m->get_source().num());
+ dout(25) << "=== got heartbeat " << m->get_beat() << " from " << m->get_source().num() << " " << m->get_load() << dendl;
+
+ if (!mds->is_active())
+ return;
+
+ if (!mds->mdcache->is_open()) { // cache not open yet: re-deliver this message once it is
+ dout(10) << "opening root on handle_heartbeat" << dendl;
+ mds->mdcache->wait_for_open(new C_MDS_RetryMessage(mds, m));
+ return;
+ }
+
+ if (mds->is_cluster_degraded()) {
+ dout(10) << " degraded, ignoring" << dendl;
+ return;
+ }
+
+ if (mds->get_nodeid() != 0 && m->get_beat() > beat_epoch) { // a non-zero rank sees a newer epoch than it currently holds
+ dout(10) << "receive next epoch " << m->get_beat() << " from mds." << who << " before mds0" << dendl;
+
+ beat_epoch = m->get_beat();
+ // clear the mds load info whose epoch is less than beat_epoch
+ mds_load.clear();
+ }
+
+ if (who == 0) { // heartbeat from rank 0: adopt its epoch and answer with our own heartbeat
+ dout(20) << " from mds0, new epoch " << m->get_beat() << dendl;
+ if (beat_epoch != m->get_beat()) {
+ beat_epoch = m->get_beat();
+ mds_load.clear();
+ }
+
+ send_heartbeat();
+
+ mds->mdcache->show_subtrees();
+ } else if (mds->get_nodeid() == 0) { // rank 0 drops heartbeats whose epoch differs from the current one
+ if (beat_epoch != m->get_beat()) {
+ dout(10) << " old heartbeat epoch, ignoring" << dendl;
+ return;
+ }
+ }
+
+ {
+ auto em = mds_load.emplace(std::piecewise_construct, std::forward_as_tuple(who), std::forward_as_tuple(m->get_load()));
+ if (!em.second) { // an entry for the sender already existed: overwrite it
+ em.first->second = m->get_load();
+ }
+ }
+ mds_import_map[who] = m->get_import_map();
+
+ {
+ unsigned cluster_size = mds->get_mds_map()->get_num_in_mds();
+ if (mds_load.size() == cluster_size) { // number of collected loads equals the number of `in` ranks
+ // let's go!
+ //export_empties(); // no!
+
+ /* avoid spamming ceph -w if user does not turn mantle on */
+ if (mds->mdsmap->get_balancer() != "") {
+ int r = mantle_prep_rebalance();
+ if (!r) return; // mantle succeeded: skip the built-in balancer below
+ mds->clog->warn() << "using old balancer; mantle failed for "
+ << "balancer=" << mds->mdsmap->get_balancer()
+ << " : " << cpp_strerror(r);
+ }
+ prep_rebalance(m->get_beat());
+ }
+ }
+}
+
+double MDBalancer::try_match(balance_state_t& state, mds_rank_t ex, double& maxex,
+                             mds_rank_t im, double& maxim)
+{
+  // Nothing to move if either side has no remaining capacity.
+  if (maxex <= 0 || maxim <= 0) {
+    return 0.0;
+  }
+
+  // Transfer as much as both the exporter and the importer allow.
+  const double amount = std::min(maxex, maxim);
+  if (amount <= 0) {
+    return 0.0;
+  }
+
+  dout(5) << " - mds." << ex << " exports " << amount << " to mds." << im << dendl;
+
+  // Book the transfer on both ends; only exports made by this rank are
+  // also recorded as targets.
+  state.exported[ex] += amount;
+  state.imported[im] += amount;
+  if (ex == mds->get_nodeid()) {
+    state.targets[im] += amount;
+  }
+
+  // Shrink the remaining capacity seen by the caller.
+  maxex -= amount;
+  maxim -= amount;
+
+  return amount;
+}
+
+void MDBalancer::queue_split(const CDir *dir, bool fast)
+{ // Schedule a split of dir's dirfrag: right after the current dispatch when fast, otherwise after bal_fragment_interval.
+ dout(10) << __func__ << " enqueuing " << *dir
+ << " (fast=" << fast << ")" << dendl;
+
+ const dirfrag_t frag = dir->dirfrag();
+
+ auto callback = [this, frag](int r) { // captures the dirfrag id, not the CDir pointer; the CDir is looked up again when it runs
+ if (split_pending.erase(frag) == 0) {
+ // Someone beat me to it. This can happen in the fast splitting
+ // path, because we spawn two contexts, one with mds->timer and
+ // one with mds->queue_waiter. The loser can safely just drop
+ // out.
+ return;
+ }
+
+ CDir *split_dir = mds->mdcache->get_dirfrag(frag);
+ if (!split_dir) {
+ dout(10) << "drop split on " << frag << " because not in cache" << dendl;
+ return;
+ }
+ if (!split_dir->is_auth()) {
+ dout(10) << "drop split on " << frag << " because non-auth" << dendl;
+ return;
+ }
+
+ // Pass on to MDCache: note that the split might still not
+ // happen if the checks in MDCache::can_fragment fail.
+ dout(10) << __func__ << " splitting " << *split_dir << dendl;
+ mds->mdcache->split_dir(split_dir, g_conf()->mds_bal_split_bits);
+ };
+
+ bool is_new = false; // true only if this call added frag to split_pending
+ if (split_pending.count(frag) == 0) {
+ split_pending.insert(frag);
+ is_new = true;
+ }
+
+ if (fast) {
+ // Do the split ASAP: enqueue it in the MDSRank waiters which are
+ // run at the end of dispatching the current request
+ mds->queue_waiter(new MDSInternalContextWrapper(mds,
+ new FunctionContext(callback)));
+ } else if (is_new) {
+ // Set a timer to really do the split: we don't do it immediately
+ // so that bursts of ops on a directory have a chance to go through
+ // before we freeze it.
+ mds->timer.add_event_after(bal_fragment_interval,
+ new FunctionContext(callback));
+ }
+}
+
+void MDBalancer::queue_merge(CDir *dir)
+{ // Schedule a merge attempt for dir's dirfrag after bal_fragment_interval, unless one is already pending.
+ const auto frag = dir->dirfrag();
+ auto callback = [this, frag](int r) {
+ ceph_assert(frag.frag != frag_t());
+
+ // frag must be in this set because only one context is in flight
+ // for a given frag at a time (because merge_pending is checked before
+ // starting one), and this context is the only one that erases it.
+ merge_pending.erase(frag);
+
+ CDir *dir = mds->mdcache->get_dirfrag(frag); // shadows the outer parameter: the CDir is looked up again by id
+ if (!dir) {
+ dout(10) << "drop merge on " << frag << " because not in cache" << dendl;
+ return;
+ }
+ ceph_assert(dir->dirfrag() == frag);
+
+ if(!dir->is_auth()) {
+ dout(10) << "drop merge on " << *dir << " because lost auth" << dendl;
+ return;
+ }
+
+ dout(10) << "merging " << *dir << dendl;
+
+ CInode *diri = dir->get_inode();
+
+ frag_t fg = dir->get_frag();
+ while (fg != frag_t()) { // climb one level per pass while all dirfrags under the sibling are cached, auth and should_merge()
+ frag_t sibfg = fg.get_sibling();
+ list<CDir*> sibs;
+ bool complete = diri->get_dirfrags_under(sibfg, sibs);
+ if (!complete) {
+ dout(10) << " not all sibs under " << sibfg << " in cache (have " << sibs << ")" << dendl;
+ break;
+ }
+ bool all = true;
+ for (list<CDir*>::iterator p = sibs.begin(); p != sibs.end(); ++p) {
+ CDir *sib = *p;
+ if (!sib->is_auth() || !sib->should_merge()) {
+ all = false;
+ break;
+ }
+ }
+ if (!all) {
+ dout(10) << " not all sibs under " << sibfg << " " << sibs << " should_merge" << dendl;
+ break;
+ }
+ dout(10) << " all sibs under " << sibfg << " " << sibs << " should merge" << dendl;
+ fg = fg.parent();
+ }
+
+ if (fg != dir->get_frag()) // merge only if the loop climbed at least one level
+ mds->mdcache->merge_dir(diri, fg);
+ };
+
+ if (merge_pending.count(frag) == 0) {
+ dout(20) << __func__ << " enqueued dir " << *dir << dendl;
+ merge_pending.insert(frag);
+ mds->timer.add_event_after(bal_fragment_interval,
+ new FunctionContext(callback));
+ } else {
+ dout(20) << __func__ << " dir already in queue " << *dir << dendl;
+ }
+}
+
+void MDBalancer::prep_rebalance(int beat)
+{
+ balance_state_t state;
+
+ if (g_conf()->mds_thrash_exports) {
+ //we're going to randomly export to all the mds in the cluster
+ set<mds_rank_t> up_mds;
+ mds->get_mds_map()->get_up_mds_set(up_mds);
+ for (const auto &rank : up_mds) {
+ state.targets[rank] = 0.0;
+ }
+ } else {
+ int cluster_size = mds->get_mds_map()->get_num_in_mds();
+ mds_rank_t whoami = mds->get_nodeid();
+ rebalance_time = clock::now();
+
+ dout(5) << " prep_rebalance: cluster loads are" << dendl;
+
+ mds->mdcache->migrator->clear_export_queue();
+
+ // rescale! turn my mds_load back into meta_load units
+ double load_fac = 1.0;
+ map<mds_rank_t, mds_load_t>::iterator m = mds_load.find(whoami);
+ if ((m != mds_load.end()) && (m->second.mds_load() > 0)) {
+ double metald = m->second.auth.meta_load();
+ double mdsld = m->second.mds_load();
+ load_fac = metald / mdsld;
+ dout(7) << " load_fac is " << load_fac
+ << " <- " << m->second.auth << " " << metald
+ << " / " << mdsld
+ << dendl;
+ }
+
+ mds_meta_load.clear();
+
+ double total_load = 0.0;
+ multimap<double,mds_rank_t> load_map;
+ for (mds_rank_t i=mds_rank_t(0); i < mds_rank_t(cluster_size); i++) {
+ mds_load_t& load = mds_load.at(i);
+
+ double l = load.mds_load() * load_fac;
+ mds_meta_load[i] = l;
+
+ if (whoami == 0)
+ dout(5) << " mds." << i
+ << " " << load
+ << " = " << load.mds_load()
+ << " ~ " << l << dendl;
+
+ if (whoami == i) my_load = l;
+ total_load += l;
+
+ load_map.insert(pair<double,mds_rank_t>( l, i ));
+ }
+
+ // target load
+ target_load = total_load / (double)cluster_size;
+ dout(5) << "prep_rebalance: my load " << my_load
+ << " target " << target_load
+ << " total " << total_load
+ << dendl;
+
+ // under or over?
+ for (auto p : load_map) {
+ if (p.first < target_load * (1.0 + g_conf()->mds_bal_min_rebalance)) {
+ dout(5) << " mds." << p.second << " is underloaded or barely overloaded." << dendl;
+ mds_last_epoch_under_map[p.second] = beat_epoch;
+ }
+ }
+
+ int last_epoch_under = mds_last_epoch_under_map[whoami];
+ if (last_epoch_under == beat_epoch) {
+ dout(5) << " i am underloaded or barely overloaded, doing nothing." << dendl;
+ return;
+ }
+ // am i over long enough?
+ if (last_epoch_under && beat_epoch - last_epoch_under < 2) {
+ dout(5) << " i am overloaded, but only for " << (beat_epoch - last_epoch_under) << " epochs" << dendl;
+ return;
+ }
+
+ dout(5) << " i am sufficiently overloaded" << dendl;
+
+
+ // first separate exporters and importers
+ multimap<double,mds_rank_t> importers;
+ multimap<double,mds_rank_t> exporters;
+ set<mds_rank_t> importer_set;
+ set<mds_rank_t> exporter_set;
+
+ for (multimap<double,mds_rank_t>::iterator it = load_map.begin();
+ it != load_map.end();
+ ++it) {
+ if (it->first < target_load) {
+ dout(15) << " mds." << it->second << " is importer" << dendl;
+ importers.insert(pair<double,mds_rank_t>(it->first,it->second));
+ importer_set.insert(it->second);
+ } else {
+ int mds_last_epoch_under = mds_last_epoch_under_map[it->second];
+ if (!(mds_last_epoch_under && beat_epoch - mds_last_epoch_under < 2)) {
+ dout(15) << " mds." << it->second << " is exporter" << dendl;
+ exporters.insert(pair<double,mds_rank_t>(it->first,it->second));
+ exporter_set.insert(it->second);
+ }
+ }
+ }
+
+
+ // determine load transfer mapping
+
+ if (true) {
+ // analyze import_map; do any matches i can
+
+ dout(15) << " matching exporters to import sources" << dendl;
+
+ // big -> small exporters
+ for (multimap<double,mds_rank_t>::reverse_iterator ex = exporters.rbegin();
+ ex != exporters.rend();
+ ++ex) {
+ double maxex = get_maxex(state, ex->second);
+ if (maxex <= .001) continue;
+
+ // check importers. for now, just in arbitrary order (no intelligent matching).
+ for (map<mds_rank_t, float>::iterator im = mds_import_map[ex->second].begin();
+ im != mds_import_map[ex->second].end();
+ ++im) {
+ double maxim = get_maxim(state, im->first);
+ if (maxim <= .001) continue;
+ try_match(state, ex->second, maxex, im->first, maxim);
+ if (maxex <= .001) break;
+ }
+ }
+ }
+
+ // old way
+ if (beat % 2 == 1) {
+ dout(15) << " matching big exporters to big importers" << dendl;
+ // big exporters to big importers
+ multimap<double,mds_rank_t>::reverse_iterator ex = exporters.rbegin();
+ multimap<double,mds_rank_t>::iterator im = importers.begin();
+ while (ex != exporters.rend() &&
+ im != importers.end()) {
+ double maxex = get_maxex(state, ex->second);
+ double maxim = get_maxim(state, im->second);
+ if (maxex < .001 || maxim < .001) break;
+ try_match(state, ex->second, maxex, im->second, maxim);
+ if (maxex <= .001) ++ex;
+ if (maxim <= .001) ++im;
+ }
+ } else { // new way
+ dout(15) << " matching small exporters to big importers" << dendl;
+ // small exporters to big importers
+ multimap<double,mds_rank_t>::iterator ex = exporters.begin();
+ multimap<double,mds_rank_t>::iterator im = importers.begin();
+ while (ex != exporters.end() &&
+ im != importers.end()) {
+ double maxex = get_maxex(state, ex->second);
+ double maxim = get_maxim(state, im->second);
+ if (maxex < .001 || maxim < .001) break;
+ try_match(state, ex->second, maxex, im->second, maxim);
+ if (maxex <= .001) ++ex;
+ if (maxim <= .001) ++im;
+ }
+ }
+ }
+ try_rebalance(state);
+}
+
+int MDBalancer::mantle_prep_rebalance()
+{
+  balance_state_t state;
+
+  /* reload the balancer when the MDSMap names a different version */
+  if (bal_version != mds->mdsmap->get_balancer()) {
+    bal_version.assign("");
+    const int err = localize_balancer();
+    if (err)
+      return err;
+
+    /* report the version change from rank 0 only, to avoid cluster log spam */
+    if (mds->get_nodeid() == 0)
+      mds->clog->info() << "mantle balancer version changed: " << bal_version;
+  }
+
+  /* reset per-round state */
+  const int cluster_size = mds->get_mds_map()->get_num_in_mds();
+  rebalance_time = clock::now();
+  mds->mdcache->migrator->clear_export_queue();
+
+  /* build one metrics table per rank from that rank's load struct;
+   * mds_load.at() throws if a rank has no recorded load */
+  vector < map<string, double> > metrics (cluster_size);
+  for (mds_rank_t rank = mds_rank_t(0); rank < mds_rank_t(cluster_size); rank++) {
+    mds_load_t& load = mds_load.at(rank);
+
+    map<string, double>& row = metrics[rank];
+    row = {{"auth.meta_load", load.auth.meta_load()},
+           {"all.meta_load", load.all.meta_load()},
+           {"req_rate", load.req_rate},
+           {"queue_len", load.queue_len},
+           {"cpu_load_avg", load.cpu_load_avg}};
+  }
+
+  /* run the balancer; it fills in state.targets */
+  Mantle mantle;
+  const int ret = mantle.balance(bal_code, mds->get_nodeid(), metrics, state.targets);
+  dout(5) << " mantle decided that new targets=" << state.targets << dendl;
+
+  /* mantle doesn't know about cluster size, so validate the target count
+   * here; a size mismatch takes precedence over mantle's own error code */
+  if (static_cast<int>(state.targets.size()) != cluster_size)
+    return -EINVAL;
+  if (ret)
+    return ret;
+
+  try_rebalance(state);
+  return 0;
+}
+
+
+
+void MDBalancer::try_rebalance(balance_state_t& state) // export dirfrags toward the per-rank amounts listed in state.targets
+{
+ if (g_conf()->mds_thrash_exports) {
+ dout(5) << "mds_thrash is on; not performing standard rebalance operation!"
+ << dendl;
+ return;
+ }
+
+ // make a sorted list of my imports
+ multimap<double, CDir*> import_pop_map; // popularity -> subtree root dirfrag, ascending by popularity
+ multimap<mds_rank_t, pair<CDir*, double> > import_from_map; // authority().first of the dir's inode -> (dirfrag, popularity)
+
+ for (auto& dir : mds->mdcache->get_fullauth_subtrees()) {
+ CInode *diri = dir->get_inode();
+ if (diri->is_mdsdir())
+ continue;
+ if (diri->get_export_pin(false) != MDS_RANK_NONE)
+ continue;
+ if (dir->is_freezing() || dir->is_frozen())
+ continue; // export pbly already in progress
+
+ mds_rank_t from = diri->authority().first;
+ double pop = dir->pop_auth_subtree.meta_load();
+ if (g_conf()->mds_bal_idle_threshold > 0 &&
+ pop < g_conf()->mds_bal_idle_threshold &&
+ diri != mds->mdcache->get_root() &&
+ from != mds->get_nodeid()) {
+ dout(5) << " exporting idle (" << pop << ") import " << *dir
+ << " back to mds." << from << dendl;
+ mds->mdcache->migrator->export_dir_nicely(dir, from);
+ continue; // idle imports are handed back and not recorded in either map
+ }
+
+ dout(15) << " map: i imported " << *dir << " from " << from << dendl;
+ import_pop_map.insert(make_pair(pop, dir));
+ import_from_map.insert(make_pair(from, make_pair(dir, pop)));
+ }
+
+ // do my exports!
+ map<mds_rank_t, double> export_pop_map; // target rank -> load already assigned to it in this call
+
+ for (auto &it : state.targets) { // pass 1: give targets back the subtrees recorded under their own rank
+ mds_rank_t target = it.first;
+ double amount = it.second;
+
+ if (amount < MIN_OFFLOAD)
+ continue;
+ if (amount * 10 * state.targets.size() < target_load)
+ continue;
+
+ dout(5) << "want to send " << amount << " to mds." << target
+ //<< " .. " << (*it).second << " * " << load_fac
+ << " -> " << amount
+ << dendl;//" .. fudge is " << fudge << dendl;
+
+ double& have = export_pop_map[target]; // operator[] creates the entry; passes 2 and 3 skip targets without one
+
+ mds->mdcache->show_subtrees();
+
+ // search imports from target
+ if (import_from_map.count(target)) {
+ dout(5) << " aha, looking through imports from target mds." << target << dendl;
+ for (auto p = import_from_map.equal_range(target);
+ p.first != p.second; ) {
+ CDir *dir = p.first->second.first;
+ double pop = p.first->second.second;
+ dout(5) << "considering " << *dir << " from " << (*p.first).first << dendl;
+ auto plast = p.first++; // advance before the possible erase(plast) below so p.first stays valid
+
+ if (dir->inode->is_base())
+ continue;
+ ceph_assert(dir->inode->authority().first == target); // cuz that's how i put it in the map, dummy
+
+ if (pop <= amount-have) {
+ dout(5) << "reexporting " << *dir << " pop " << pop
+ << " back to mds." << target << dendl;
+ mds->mdcache->migrator->export_dir_nicely(dir, target);
+ have += pop;
+ import_from_map.erase(plast);
+ for (auto q = import_pop_map.equal_range(pop); // drop the matching (pop, dir) entry from the popularity map too
+ q.first != q.second; ) {
+ if (q.first->second == dir) {
+ import_pop_map.erase(q.first);
+ break;
+ }
+ q.first++;
+ }
+ } else {
+ dout(5) << "can't reexport " << *dir << ", too big " << pop << dendl;
+ }
+ if (amount-have < MIN_OFFLOAD)
+ break;
+ }
+ }
+ }
+
+ // any other imports
+ for (auto &it : state.targets) { // pass 2: hand remaining recorded subtrees to targets that still need load
+ mds_rank_t target = it.first;
+ double amount = it.second;
+
+ if (!export_pop_map.count(target))
+ continue;
+ double& have = export_pop_map[target];
+ if (amount-have < MIN_OFFLOAD)
+ continue;
+
+ for (auto p = import_pop_map.begin(); // least popular first
+ p != import_pop_map.end(); ) {
+ CDir *dir = p->second;
+ if (dir->inode->is_base()) {
+ ++p;
+ continue;
+ }
+
+ double pop = p->first;
+ if (pop <= amount-have && pop > MIN_REEXPORT) {
+ dout(0) << "reexporting " << *dir << " pop " << pop
+ << " to mds." << target << dendl;
+ have += pop;
+ mds->mdcache->migrator->export_dir_nicely(dir, target);
+ import_pop_map.erase(p++); // post-increment keeps the loop iterator valid across the erase
+ } else {
+ ++p;
+ }
+ if (amount-have < MIN_OFFLOAD)
+ break;
+ }
+ }
+
+ set<CDir*> already_exporting; // shared across targets so find_exports never picks the same dirfrag twice
+
+ for (auto &it : state.targets) { // pass 3: carve pieces out of the remaining subtrees via find_exports()
+ mds_rank_t target = it.first;
+ double amount = it.second;
+
+ if (!export_pop_map.count(target))
+ continue;
+ double& have = export_pop_map[target];
+ if (amount-have < MIN_OFFLOAD)
+ continue;
+
+ // okay, search for fragments of my workload
+ list<CDir*> exports;
+
+ for (auto p = import_pop_map.rbegin(); // most popular subtree first
+ p != import_pop_map.rend();
+ ++p) {
+ CDir *dir = p->second;
+ find_exports(dir, amount, exports, have, already_exporting);
+ if (amount-have < MIN_OFFLOAD)
+ break;
+ }
+ //fudge = amount - have;
+
+ for (auto dir : exports) {
+ dout(5) << " - exporting " << dir->pop_auth_subtree
+ << " " << dir->pop_auth_subtree.meta_load()
+ << " to mds." << target << " " << *dir << dendl;
+ mds->mdcache->migrator->export_dir_nicely(dir, target);
+ }
+ }
+
+ dout(5) << "rebalance done" << dendl;
+ mds->mdcache->show_subtrees();
+}
+
+void MDBalancer::find_exports(CDir *dir, // auth dirfrag whose sub-dirfrags are searched (asserted below)
+ double amount, // total load wanted for the current target
+ list<CDir*>& exports, // out: dirfrags chosen for export are appended here
+ double& have, // in/out: load already collected toward amount
+ set<CDir*>& already_exporting) // in/out: dirfrags already chosen; they are skipped
+{
+ auto now = clock::now();
+ auto duration = std::chrono::duration<double>(now-rebalance_time).count(); // seconds elapsed since rebalance_time
+ if (duration > 0.1) {
+ derr << " balancer runs too long" << dendl_impl;
+ have = amount; // report the amount as satisfied so callers stop searching
+ return;
+ }
+
+ ceph_assert(dir->is_auth());
+
+ double need = amount - have;
+ if (need < amount * g_conf()->mds_bal_min_start)
+ return; // good enough!
+
+ double needmax = need * g_conf()->mds_bal_need_max; // upper bound of the "lucky find" window
+ double needmin = need * g_conf()->mds_bal_need_min; // lower bound of that window; also the stop threshold for have
+ double midchunk = need * g_conf()->mds_bal_midchunk; // smaller items at or above this are taken in the first sweep
+ double minchunk = need * g_conf()->mds_bal_minchunk; // dirfrags below this are counted as idle and ignored
+
+ list<CDir*> bigger_rep, bigger_unrep; // dirfrags with pop > need, split by is_rep()
+ multimap<double, CDir*> smaller; // the remaining candidates (minchunk <= pop <= need), keyed by popularity
+
+ double dir_pop = dir->pop_auth_subtree.meta_load();
+ dout(7) << " find_exports in " << dir_pop << " " << *dir << " need " << need << " (" << needmin << " - " << needmax << ")" << dendl;
+
+ double subdir_sum = 0; // only used for the debug line after the loop
+ for (elist<CInode*>::iterator it = dir->pop_lru_subdirs.begin_use_current();
+ !it.end(); ) {
+ CInode *in = *it;
+ ++it; // advance first: the current inode may be unlinked from this list at the end of the iteration
+
+ ceph_assert(in->is_dir());
+ ceph_assert(in->get_parent_dir() == dir);
+
+ list<CDir*> dfls;
+ in->get_nested_dirfrags(dfls);
+
+ size_t num_idle_frags = 0;
+ for (list<CDir*>::iterator p = dfls.begin();
+ p != dfls.end();
+ ++p) {
+ CDir *subdir = *p;
+ if (already_exporting.count(subdir))
+ continue;
+
+ // we know all ancestor dirfrags up to subtree root are not freezing or frozen.
+ // It's more efficient to use CDir::is_{freezing,frozen}_tree_root()
+ if (subdir->is_frozen_dir() || subdir->is_frozen_tree_root() ||
+ subdir->is_freezing_dir() || subdir->is_freezing_tree_root())
+ continue; // can't export this right now!
+
+ // how popular?
+ double pop = subdir->pop_auth_subtree.meta_load();
+ subdir_sum += pop;
+ dout(15) << " subdir pop " << pop << " " << *subdir << dendl;
+
+ if (pop < minchunk) {
+ num_idle_frags++;
+ continue;
+ }
+
+ // lucky find?
+ if (pop > needmin && pop < needmax) {
+ exports.push_back(subdir);
+ already_exporting.insert(subdir);
+ have += pop;
+ return;
+ }
+
+ if (pop > need) {
+ if (subdir->is_rep())
+ bigger_rep.push_back(subdir);
+ else
+ bigger_unrep.push_back(subdir);
+ } else
+ smaller.insert(pair<double,CDir*>(pop, subdir));
+ }
+ if (dfls.size() == num_idle_frags) // every nested dirfrag was idle: unlink the inode from the popularity list
+ in->item_pop_lru.remove_myself();
+ }
+ dout(15) << " sum " << subdir_sum << " / " << dir_pop << dendl;
+
+ // grab some sufficiently big small items
+ multimap<double,CDir*>::reverse_iterator it; // declared outside the loop: the "use smaller bits" loop below resumes from it
+ for (it = smaller.rbegin();
+ it != smaller.rend();
+ ++it) {
+
+ if ((*it).first < midchunk)
+ break; // try later
+
+ dout(7) << " taking smaller " << *(*it).second << dendl;
+
+ exports.push_back((*it).second);
+ already_exporting.insert((*it).second);
+ have += (*it).first;
+ if (have > needmin)
+ return;
+ }
+
+ // apparently not enough; drill deeper into the hierarchy (if non-replicated)
+ for (list<CDir*>::iterator it = bigger_unrep.begin(); // this loop-local it shadows the reverse iterator above
+ it != bigger_unrep.end();
+ ++it) {
+ dout(15) << " descending into " << **it << dendl;
+ find_exports(*it, amount, exports, have, already_exporting);
+ if (have > needmin)
+ return;
+ }
+
+ // ok fine, use smaller bits
+ for (; // continues with the outer reverse iterator, at the first entry below midchunk
+ it != smaller.rend();
+ ++it) {
+ dout(7) << " taking (much) smaller " << it->first << " " << *(*it).second << dendl;
+
+ exports.push_back((*it).second);
+ already_exporting.insert((*it).second);
+ have += (*it).first;
+ if (have > needmin)
+ return;
+ }
+
+ // ok fine, drill into replicated dirs
+ for (list<CDir*>::iterator it = bigger_rep.begin();
+ it != bigger_rep.end();
+ ++it) {
+ dout(7) << " descending into replicated " << **it << dendl;
+ find_exports(*it, amount, exports, have, already_exporting);
+ if (have > needmin)
+ return;
+ }
+}
+
+void MDBalancer::hit_inode(CInode *in, int type, int who)
+{
+  // bump the inode's own counter for this access type
+  in->pop.get(type).hit();
+
+  // propagate to the containing dirfrag, if the inode has a parent dentry
+  auto *parent_dn = in->get_parent_dn();
+  if (!parent_dn)
+    return;
+  hit_dir(parent_dn->get_dir(), type, who);
+}
+
+void MDBalancer::maybe_fragment(CDir *dir, bool hot)
+{
+  // Only consider auth dirfrags while fragmentation is enabled, and never
+  // root/mdsdir (for now at least) or stray directories.
+  if (!bal_fragment_dirs || bal_fragment_interval <= 0 ||
+      !dir->is_auth() ||
+      dir->inode->is_base() ||
+      dir->inode->is_stray())
+    return;
+
+  // split?
+  if (g_conf()->mds_bal_split_size > 0 && (dir->should_split() || hot)) {
+    const bool already_queued = split_pending.count(dir->dirfrag()) != 0;
+    if (!already_queued) {
+      queue_split(dir, false);
+    } else if (dir->should_split_fast()) {
+      // a split is already pending, but the dirfrag now qualifies for a fast one
+      queue_split(dir, true);
+    } else {
+      dout(10) << __func__ << ": fragment already enqueued to split: "
+               << *dir << dendl;
+    }
+  }
+
+  // merge? (never for the unfragmented frag_t(), and only once per dirfrag)
+  if (dir->get_frag() != frag_t() && dir->should_merge() &&
+      merge_pending.count(dir->dirfrag()) == 0) {
+    queue_merge(dir);
+  }
+}
+
+void MDBalancer::hit_dir(CDir *dir, int type, int who, double amount) // record amount hits of the given type on dir and propagate them to its ancestors
+{
+ // hit me
+ double v = dir->pop_me.get(type).hit(amount); // v is whatever hit() returns; it is compared against the split thresholds below
+
+ const bool hot = (v > g_conf()->mds_bal_split_rd && type == META_POP_IRD) ||
+ (v > g_conf()->mds_bal_split_wr && type == META_POP_IWR);
+
+ dout(20) << "hit_dir " << type << " pop is " << v << ", frag " << dir->get_frag()
+ << " size " << dir->get_frag_size() << " " << dir->pop_me << dendl;
+
+ maybe_fragment(dir, hot);
+
+ // replicate?
+ if (type == META_POP_IRD && who >= 0) {
+ dir->pop_spread.hit(who);
+ }
+
+ double rd_adj = 0.0; // read-popularity correction; non-zero only when replication is switched on below
+ if (type == META_POP_IRD &&
+ dir->last_popularity_sample < last_sample) { // re-evaluate replication only when last_sample is newer than this dir's last evaluation
+ double dir_pop = dir->pop_auth_subtree.get(type).get(); // hmm??
+ dir->last_popularity_sample = last_sample;
+ double pop_sp = dir->pop_spread.get();
+ dir_pop += pop_sp * 10;
+
+ //if (dir->ino() == inodeno_t(0x10000000002))
+ if (pop_sp > 0) {
+ dout(20) << "hit_dir " << type << " pop " << dir_pop << " spread " << pop_sp
+ << " " << dir->pop_spread.last[0]
+ << " " << dir->pop_spread.last[1]
+ << " " << dir->pop_spread.last[2]
+ << " " << dir->pop_spread.last[3]
+ << " in " << *dir << dendl;
+ }
+
+ if (dir->is_auth() && !dir->is_ambiguous_auth()) {
+ if (!dir->is_rep() &&
+ dir_pop >= g_conf()->mds_bal_replicate_threshold) {
+ // replicate
+ double rdp = dir->pop_me.get(META_POP_IRD).get();
+ rd_adj = rdp / mds->get_mds_map()->get_num_in_mds() - rdp; // rdp/N - rdp: removes all but a 1/N share of the read popularity
+ rd_adj /= 2.0; // temper somewhat
+
+ dout(5) << "replicating dir " << *dir << " pop " << dir_pop << " .. rdp " << rdp << " adj " << rd_adj << dendl;
+
+ dir->dir_rep = CDir::REP_ALL;
+ mds->mdcache->send_dir_updates(dir, true);
+
+ // fixme this should adjust the whole pop hierarchy
+ dir->pop_me.get(META_POP_IRD).adjust(rd_adj);
+ dir->pop_auth_subtree.get(META_POP_IRD).adjust(rd_adj);
+ }
+
+ if (dir->ino() != 1 && // ino 1 is never unreplicated (NOTE(review): presumably the root inode - confirm)
+ dir->is_rep() &&
+ dir_pop < g_conf()->mds_bal_unreplicate_threshold) {
+ // unreplicate
+ dout(5) << "unreplicating dir " << *dir << " pop " << dir_pop << dendl;
+
+ dir->dir_rep = CDir::REP_NONE;
+ mds->mdcache->send_dir_updates(dir);
+ }
+ }
+ }
+
+ // adjust ancestors
+ bool hit_subtree = dir->is_auth(); // current auth subtree (if any)
+ bool hit_subtree_nested = dir->is_auth(); // all nested auth subtrees
+
+ while (true) { // walks from dir up through parent dirfrags until there is none
+ CDir *pdir = dir->inode->get_parent_dir();
+ dir->pop_nested.get(type).hit(amount);
+ if (rd_adj != 0.0)
+ dir->pop_nested.get(META_POP_IRD).adjust(rd_adj);
+
+ if (hit_subtree) {
+ dir->pop_auth_subtree.get(type).hit(amount);
+
+ if (rd_adj != 0.0)
+ dir->pop_auth_subtree.get(META_POP_IRD).adjust(rd_adj);
+
+ if (dir->is_subtree_root())
+ hit_subtree = false; // end of auth domain, stop hitting auth counters.
+ else if (pdir)
+ pdir->pop_lru_subdirs.push_front(&dir->get_inode()->item_pop_lru); // move this dir's inode to the front of the parent's pop_lru_subdirs list
+ }
+
+ if (hit_subtree_nested) {
+ dir->pop_auth_subtree_nested.get(type).hit(amount);
+ if (rd_adj != 0.0)
+ dir->pop_auth_subtree_nested.get(META_POP_IRD).adjust(rd_adj);
+ }
+ if (!pdir) break;
+ dir = pdir;
+ }
+}
+
+
+/*
+ * Subtract an exported chunk from the popularity counters.
+ * This excludes *dir itself (encode_export_dir should have taken care of
+ * that); only the nested counters of the parents are adjusted.
+ *
+ * NOTE: call me _after_ forcing *dir into a subtree root,
+ *       but _before_ doing the encode_export_dirs.
+ */
+void MDBalancer::subtract_export(CDir *dir)
+{
+  // snapshot of the exported subtree's load, taken before walking upwards
+  dirfrag_load_vec_t exported = dir->pop_auth_subtree;
+
+  for (CDir *anc = dir->inode->get_parent_dir();
+       anc;
+       anc = anc->inode->get_parent_dir()) {
+    anc->pop_nested.sub(exported);
+    anc->pop_auth_subtree_nested.sub(exported);
+  }
+}
+
+
+void MDBalancer::add_import(CDir *dir)
+{
+  // load of the imported subtree; added to every ancestor's nested counters,
+  // but not to *dir itself
+  dirfrag_load_vec_t imported = dir->pop_auth_subtree;
+
+  for (CDir *anc = dir->inode->get_parent_dir();
+       anc;
+       anc = anc->inode->get_parent_dir()) {
+    anc->pop_nested.add(imported);
+    anc->pop_auth_subtree_nested.add(imported);
+  }
+}
+
+void MDBalancer::adjust_pop_for_rename(CDir *pdir, CDir *dir, bool inc)
+{
+  // nested-auth counters are touched only when dir is auth; the plain auth
+  // subtree counters additionally require that dir is not a subtree root
+  const bool touch_nested_auth = dir->is_auth();
+  bool touch_auth_subtree = touch_nested_auth && !dir->is_subtree_root();
+
+  // child is the dirfrag directly below pdir on the path up from dir
+  CDir *child = dir;
+  do {
+    if (!inc) {
+      pdir->pop_nested.sub(dir->pop_nested);
+      if (touch_auth_subtree)
+        pdir->pop_auth_subtree.sub(dir->pop_auth_subtree);
+
+      if (touch_nested_auth)
+        pdir->pop_auth_subtree_nested.sub(dir->pop_auth_subtree_nested);
+    } else {
+      pdir->pop_nested.add(dir->pop_nested);
+      if (touch_auth_subtree) {
+        pdir->pop_auth_subtree.add(dir->pop_auth_subtree);
+        pdir->pop_lru_subdirs.push_front(&child->get_inode()->item_pop_lru);
+      }
+
+      if (touch_nested_auth)
+        pdir->pop_auth_subtree_nested.add(dir->pop_auth_subtree_nested);
+    }
+
+    // once a subtree root has been adjusted, ancestors above it no longer
+    // get their auth subtree counters changed
+    if (pdir->is_subtree_root())
+      touch_auth_subtree = false;
+
+    child = pdir;
+    pdir = pdir->inode->get_parent_dir();
+  } while (pdir);
+}
+
+void MDBalancer::handle_mds_failure(mds_rank_t who)
+{
+  // only a failure of rank 0 resets the per-rank "last epoch under" records
+  if (who != 0)
+    return;
+  mds_last_epoch_under_map.clear();
+}
+
+int MDBalancer::dump_loads(Formatter *f)
+{
+  // Dump balancer state into f as one "loads" object holding:
+  //   "dirfrags"       - array with one entry per visited dirfrag
+  //   "mds_load"       - per-rank mds_load_t fields
+  //   "mds_meta_load"  - per-rank values from mds_meta_load
+  //   "mds_import_map" - per-rank arrays of the recorded import entries
+  // Always returns 0.
+
+  // work list of dirfrags still to dump, seeded from the root inode
+  list<CDir*> dfs;
+  if (mds->mdcache->get_root()) {
+    mds->mdcache->get_root()->get_dirfrags(dfs);
+  } else {
+    dout(5) << "dump_load no root" << dendl;
+  }
+
+  f->open_object_section("loads");
+
+  f->open_array_section("dirfrags");
+  while (!dfs.empty()) {
+    CDir *dir = dfs.front();
+    dfs.pop_front();
+
+    f->open_object_section("dir");
+    dir->dump_load(f);
+    f->close_section();
+
+    for (auto it = dir->begin(); it != dir->end(); ++it) {
+      CInode *in = it->second->get_linkage()->get_inode();
+      if (!in || !in->is_dir())
+        continue;
+
+      list<CDir*> ls;
+      in->get_dirfrags(ls);
+      for (auto subdir : ls) {
+        // do not descend into dirfrags with (almost) no nested load
+        if (subdir->pop_nested.meta_load() < .001)
+          continue;
+        dfs.push_back(subdir);
+      }
+    }
+  }
+  f->close_section(); // dirfrags array
+
+  f->open_object_section("mds_load");
+  {
+
+    auto dump_mds_load = [f](mds_load_t& load) {
+      f->dump_float("request_rate", load.req_rate);
+      f->dump_float("cache_hit_rate", load.cache_hit_rate);
+      f->dump_float("queue_length", load.queue_len);
+      f->dump_float("cpu_load", load.cpu_load_avg);
+      f->dump_float("mds_load", load.mds_load());
+
+      f->open_object_section("auth_dirfrags");
+      load.auth.dump(f);
+      f->close_section();
+      f->open_object_section("all_dirfrags");
+      load.all.dump(f);
+      f->close_section();
+    };
+
+    // iterate by reference: copying each mds_load_t per rank is unnecessary
+    for (auto& p : mds_load) {
+      stringstream name;
+      name << "mds." << p.first;
+      f->open_object_section(name.str().c_str());
+      dump_mds_load(p.second);
+      f->close_section();
+    }
+  }
+  f->close_section(); // mds_load
+
+  f->open_object_section("mds_meta_load");
+  for (const auto& p : mds_meta_load) {
+    stringstream name;
+    name << "mds." << p.first;
+    f->dump_float(name.str().c_str(), p.second);
+  }
+  f->close_section(); // mds_meta_load
+
+  f->open_object_section("mds_import_map");
+  // by reference as well: each value here is a whole nested map
+  for (const auto& p : mds_import_map) {
+    stringstream name1;
+    name1 << "mds." << p.first;
+    f->open_array_section(name1.str().c_str());
+    for (const auto& q : p.second) {
+      f->open_object_section("from");
+      stringstream name2;
+      name2 << "mds." << q.first;
+      f->dump_float(name2.str().c_str(), q.second);
+      f->close_section();
+    }
+    f->close_section(); // mds.? array
+  }
+  f->close_section(); // mds_import_map
+
+  f->close_section(); // loads
+  return 0;
+}
diff --git a/src/mds/MDBalancer.h b/src/mds/MDBalancer.h
new file mode 100644
index 00000000..4050eac9
--- /dev/null
+++ b/src/mds/MDBalancer.h
@@ -0,0 +1,161 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+
+
+#ifndef CEPH_MDBALANCER_H
+#define CEPH_MDBALANCER_H
+
+#include <list>
+#include <map>
+
+#include "include/types.h"
+#include "common/Clock.h"
+#include "common/Cond.h"
+
+#include "msg/Message.h"
+#include "messages/MHeartbeat.h"
+
+#include "MDSMap.h"
+
+class MDSRank;
+class MHeartbeat;
+class CInode;
+class CDir;
+class Messenger;
+class MonClient;
+
+class MDBalancer {
+public:
+ using clock = ceph::coarse_mono_clock;
+ using time = ceph::coarse_mono_time;
+ friend class C_Bal_SendHeartbeat;
+
+ MDBalancer(MDSRank *m, Messenger *msgr, MonClient *monc);
+
+ void handle_conf_change(const std::set<std::string>& changed, const MDSMap& mds_map);
+
+ int proc_message(const Message::const_ref &m);
+
+ /**
+ * Regularly called upkeep function.
+ *
+ * Sends MHeartbeat messages to the mons.
+ */
+ void tick();
+
+ void subtract_export(CDir *ex); // subtract ex's auth subtree load from the nested counters of its ancestors
+ void add_import(CDir *im); // add im's auth subtree load to the nested counters of its ancestors
+ void adjust_pop_for_rename(CDir *pdir, CDir *dir, bool inc); // add (inc) or subtract dir's load on pdir and pdir's ancestors
+
+ void hit_inode(CInode *in, int type, int who=-1); // hit the inode's counter, then hit_dir() on its parent dirfrag if any
+ void hit_dir(CDir *dir, int type, int who=-1, double amount=1.0); // who < 0 skips the pop_spread update
+
+ void queue_split(const CDir *dir, bool fast);
+ void queue_merge(CDir *dir);
+
+ /**
+ * Based on size and configuration, decide whether to issue a queue_split
+ * or queue_merge for this CDir.
+ *
+ * \param hot whether the directory's temperature is enough to split it
+ */
+ void maybe_fragment(CDir *dir, bool hot);
+
+ void handle_mds_failure(mds_rank_t who); // clears mds_last_epoch_under_map when who is rank 0
+
+ int dump_loads(Formatter *f); // dumps dirfrag loads and the per-rank load maps; returns 0
+
+private:
+ bool bal_fragment_dirs;
+ int64_t bal_fragment_interval;
+ static const unsigned int AUTH_TREES_THRESHOLD = 5;
+
+ typedef struct {
+ std::map<mds_rank_t, double> targets; // rank -> amount of load to send to that rank
+ std::map<mds_rank_t, double> imported; // rank -> load already matched as import (read by get_maxim)
+ std::map<mds_rank_t, double> exported; // rank -> load already matched as export (read by get_maxex)
+ } balance_state_t;
+
+ //set up the rebalancing targets for export and do one if the
+ //MDSMap is up to date
+ void prep_rebalance(int beat);
+ int mantle_prep_rebalance(); // returns 0 on success, otherwise an error code
+
+ void handle_export_pins(void);
+
+ mds_load_t get_load();
+ int localize_balancer();
+ void send_heartbeat();
+ void handle_heartbeat(const MHeartbeat::const_ref &m);
+ void find_exports(CDir *dir,
+ double amount,
+ std::list<CDir*>& exports,
+ double& have,
+ set<CDir*>& already_exporting);
+
+ double try_match(balance_state_t &state,
+ mds_rank_t ex, double& maxex,
+ mds_rank_t im, double& maxim);
+
+ double get_maxim(balance_state_t &state, mds_rank_t im) { // how much more rank im can import before reaching target_load
+ return target_load - mds_meta_load[im] - state.imported[im]; // operator[] default-inserts 0 for ranks not present
+ }
+ double get_maxex(balance_state_t &state, mds_rank_t ex) { // how much rank ex can still export before dropping to target_load
+ return mds_meta_load[ex] - target_load - state.exported[ex]; // operator[] default-inserts 0 for ranks not present
+ }
+
+ /**
+ * Try to rebalance.
+ *
+ * Export dirfrags toward the per-rank amounts in state.targets.
+ * Returns without exporting anything when mds_thrash_exports is
+ * set; otherwise subtrees are handed to the export code directly.
+ */
+ void try_rebalance(balance_state_t& state);
+
+ MDSRank *mds;
+ Messenger *messenger;
+ MonClient *mon_client;
+ int beat_epoch = 0;
+
+ string bal_code;
+ string bal_version;
+
+ time last_heartbeat = clock::zero();
+ time last_sample = clock::zero();
+ time rebalance_time = clock::zero(); //ensure a consistent view of load for rebalance
+
+ time last_get_load = clock::zero();
+ uint64_t last_num_requests = 0;
+ uint64_t last_cpu_time = 0;
+
+ // Dirfrags which are marked to be passed on to MDCache::[split|merge]_dir
+ // just as soon as a delayed context comes back and triggers it.
+ // These sets just prevent us from spawning extra timer contexts for
+ // dirfrags that already have one in flight.
+ set<dirfrag_t> split_pending, merge_pending;
+
+ // per-epoch scatter/gathered info
+ std::map<mds_rank_t, mds_load_t> mds_load;
+ std::map<mds_rank_t, double> mds_meta_load;
+ std::map<mds_rank_t, map<mds_rank_t, float> > mds_import_map;
+ std::map<mds_rank_t, int> mds_last_epoch_under_map; // rank -> last beat_epoch in which it was not clearly overloaded
+
+ // per-epoch state
+ double my_load = 0;
+ double target_load = 0;
+};
+
+#endif
diff --git a/src/mds/MDCache.cc b/src/mds/MDCache.cc
new file mode 100644
index 00000000..eb0a706f
--- /dev/null
+++ b/src/mds/MDCache.cc
@@ -0,0 +1,13084 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#include <errno.h>
+#include <fstream>
+#include <iostream>
+#include <sstream>
+#include <string>
+#include <string_view>
+#include <map>
+
+#include "MDCache.h"
+#include "MDSRank.h"
+#include "Server.h"
+#include "Locker.h"
+#include "MDLog.h"
+#include "MDBalancer.h"
+#include "Migrator.h"
+#include "ScrubStack.h"
+
+#include "SnapClient.h"
+
+#include "MDSMap.h"
+
+#include "CInode.h"
+#include "CDir.h"
+
+#include "Mutation.h"
+
+#include "include/ceph_fs.h"
+#include "include/filepath.h"
+#include "include/util.h"
+
+#include "messages/MClientCaps.h"
+
+#include "msg/Message.h"
+#include "msg/Messenger.h"
+
+#include "common/MemoryModel.h"
+#include "common/errno.h"
+#include "common/perf_counters.h"
+#include "common/safe_io.h"
+
+#include "osdc/Journaler.h"
+#include "osdc/Filer.h"
+
+#include "events/ESubtreeMap.h"
+#include "events/EUpdate.h"
+#include "events/ESlaveUpdate.h"
+#include "events/EImportFinish.h"
+#include "events/EFragment.h"
+#include "events/ECommitted.h"
+#include "events/ESessions.h"
+
+#include "InoTable.h"
+
+#include "common/Timer.h"
+
+#include "perfglue/heap_profiler.h"
+
+
+#include "common/config.h"
+#include "include/ceph_assert.h"
+
+#define dout_context g_ceph_context
+#define dout_subsys ceph_subsys_mds
+#undef dout_prefix
+#define dout_prefix _prefix(_dout, mds)
+// Log-line prefix for this file: "mds.<rank>.cache ".
+static ostream& _prefix(std::ostream *_dout, MDSRank *mds) {
+  std::ostream& out = *_dout;
+  out << "mds." << mds->get_nodeid() << ".cache ";
+  return out;
+}
+
+set<int> SimpleLock::empty_gather_set;
+
+
+/**
+ * Base class for every non-I/O context that needs a reference
+ * to an MDCache instance.
+ */
+class MDCacheContext : public virtual MDSContext {
+public:
+  explicit MDCacheContext(MDCache *mdc_) : mdcache(mdc_) {}
+
+protected:
+  // resolve the owning rank through the cache; the cache must be set
+  MDSRank *get_mds() override
+  {
+    ceph_assert(mdcache != nullptr);
+    return mdcache->mds;
+  }
+
+  MDCache *mdcache;
+};
+
+
+/**
+ * Only for contexts called back from an I/O completion.
+ *
+ * Note: the members duplicate those of MDCacheContext, because
+ * it's the lesser of two evils compared with introducing
+ * yet another piece of (multiple) inheritance.
+ */
+class MDCacheIOContext : public virtual MDSIOContextBase {
+public:
+  explicit MDCacheIOContext(MDCache *mdc_, bool track=true) :
+    MDSIOContextBase(track), mdcache(mdc_) {}
+
+protected:
+  // resolve the owning rank through the cache; the cache must be set
+  MDSRank *get_mds() override
+  {
+    ceph_assert(mdcache != nullptr);
+    return mdcache->mds;
+  }
+
+  MDCache *mdcache;
+};
+
+// Log-context counterpart of MDCacheContext, built on MDSLogContextBase.
+class MDCacheLogContext : public virtual MDSLogContextBase {
+public:
+  explicit MDCacheLogContext(MDCache *mdc_) : mdcache(mdc_) {}
+
+protected:
+  // resolve the owning rank through the cache; the cache must be set
+  MDSRank *get_mds() override
+  {
+    ceph_assert(mdcache != nullptr);
+    return mdcache->mds;
+  }
+
+  MDCache *mdcache;
+};
+
+// Construct the cache for rank m: read the cache limits from the
+// configuration, set up the LRUs and decay rate, and start the upkeep
+// thread that periodically trims the cache and releases free heap memory.
+MDCache::MDCache(MDSRank *m, PurgeQueue &purge_queue_) :
+  mds(m),
+  filer(m->objecter, m->finisher),
+  recovery_queue(m),
+  stray_manager(m, purge_queue_),
+  trim_counter(g_conf().get_val<double>("mds_cache_trim_decay_rate")),
+  open_file_table(m)
+{
+  migrator.reset(new Migrator(mds, this));
+
+  // explicit commit size limit if configured, else 90% of the OSD max
+  // write size (both config values are shifted left by 20 bits)
+  max_dir_commit_size = g_conf()->mds_dir_max_commit_size ?
+                        (g_conf()->mds_dir_max_commit_size << 20) :
+                        (0.9 *(g_conf()->osd_max_write_size << 20));
+
+  cache_inode_limit = g_conf().get_val<int64_t>("mds_cache_size");
+  cache_memory_limit = g_conf().get_val<Option::size_t>("mds_cache_memory_limit");
+  cache_reservation = g_conf().get_val<double>("mds_cache_reservation");
+  cache_health_threshold = g_conf().get_val<double>("mds_health_cache_threshold");
+  forward_all_requests_to_auth = g_conf().get_val<bool>("mds_forward_all_requests_to_auth");
+
+  lru.lru_set_midpoint(g_conf().get_val<double>("mds_cache_mid"));
+
+  bottom_lru.lru_set_midpoint(0);
+
+  decayrate.set_halflife(g_conf()->mds_decay_halflife);
+
+  // Upkeep thread: loops until upkeep_trim_shutdown is set, sleeping on
+  // upkeep_cvar between rounds.
+  upkeeper = std::thread([this]() {
+    std::unique_lock lock(upkeep_mutex);
+    while (!upkeep_trim_shutdown.load()) {
+      auto now = clock::now();
+      auto since = now-upkeep_last_trim;
+      auto trim_interval = clock::duration(g_conf().get_val<std::chrono::seconds>("mds_cache_trim_interval"));
+      if (since >= trim_interval*.90) {
+        // drop upkeep_mutex before taking mds_lock so the two are always
+        // acquired in the same order
+        lock.unlock(); /* mds_lock -> upkeep_mutex */
+        std::scoped_lock mds_lock(mds->mds_lock);
+        lock.lock();
+        // shutdown may have been requested while no lock was held
+        if (upkeep_trim_shutdown.load())
+          return;
+        if (mds->is_cache_trimmable()) {
+          dout(20) << "upkeep thread trimming cache; last trim " << since << " ago" << dendl;
+          trim_client_leases();
+          trim();
+          check_memory_usage();
+          auto flags = Server::RecallFlags::ENFORCE_MAX|Server::RecallFlags::ENFORCE_LIVENESS;
+          mds->server->recall_client_state(nullptr, flags);
+          // refresh now as well: it is reused for the release check below
+          upkeep_last_trim = now = clock::now();
+        } else {
+          dout(10) << "cache not ready for trimming" << dendl;
+        }
+      } else {
+        // not due yet: remaining time until the next trim
+        trim_interval -= since;
+      }
+      since = now-upkeep_last_release;
+      auto release_interval = clock::duration(g_conf().get_val<std::chrono::seconds>("mds_cache_release_free_interval"));
+      if (since >= release_interval*.90) {
+        /* XXX not necessary once MDCache uses PriorityCache */
+        dout(10) << "releasing free memory" << dendl;
+        ceph_heap_release_free_memory();
+        upkeep_last_release = clock::now();
+      } else {
+        release_interval -= since;
+      }
+      // wait for the smaller of the two intervals, or until notified
+      auto interval = std::min(release_interval, trim_interval);
+      dout(20) << "upkeep thread waiting interval " << interval << dendl;
+      upkeep_cvar.wait_for(lock, interval);
+    }
+  });
+}
+
+MDCache::~MDCache()
+{
+  // unregister our perf counters, if any were created
+  if (logger) {
+    auto *collection = g_ceph_context->get_perfcounters_collection();
+    collection->remove(logger.get());
+  }
+
+  // wait for the upkeep thread to finish
+  if (upkeeper.joinable()) {
+    upkeeper.join();
+  }
+}
+
+void MDCache::handle_conf_change(const std::set<std::string>& changed, const MDSMap& mdsmap)
+{
+  // true when the named option is among the changed keys
+  auto was_changed = [&changed](const char *key) {
+    return changed.count(key) != 0;
+  };
+
+  // re-read each cached setting whose option changed
+  if (was_changed("mds_cache_size")) {
+    cache_inode_limit = g_conf().get_val<int64_t>("mds_cache_size");
+  }
+  if (was_changed("mds_cache_memory_limit")) {
+    cache_memory_limit = g_conf().get_val<Option::size_t>("mds_cache_memory_limit");
+  }
+  if (was_changed("mds_cache_reservation")) {
+    cache_reservation = g_conf().get_val<double>("mds_cache_reservation");
+  }
+  if (was_changed("mds_health_cache_threshold")) {
+    cache_health_threshold = g_conf().get_val<double>("mds_health_cache_threshold");
+  }
+  if (was_changed("mds_cache_mid")) {
+    lru.lru_set_midpoint(g_conf().get_val<double>("mds_cache_mid"));
+  }
+  if (was_changed("mds_cache_trim_decay_rate")) {
+    // the counter is replaced by a fresh one built with the new rate
+    trim_counter = DecayCounter(g_conf().get_val<double>("mds_cache_trim_decay_rate"));
+  }
+  if (was_changed("mds_forward_all_requests_to_auth")) {
+    forward_all_requests_to_auth = g_conf().get_val<bool>("mds_forward_all_requests_to_auth");
+  }
+
+  // always forward the change set to the migrator and the balancer
+  migrator->handle_conf_change(changed, mdsmap);
+  mds->balancer->handle_conf_change(changed, mdsmap);
+}
+
+// Publish the current cache statistics to the MDS perf counters.
+void MDCache::log_stat()
+{
+  auto &perf = *mds->logger;
+
+  // A zero inode limit is reported as INT_MAX.
+  perf.set(l_mds_inode_max, cache_inode_limit ? cache_inode_limit : INT_MAX);
+
+  // LRU occupancy.
+  perf.set(l_mds_inodes, lru.lru_get_size());
+  perf.set(l_mds_inodes_pinned, lru.lru_get_num_pinned());
+  perf.set(l_mds_inodes_top, lru.lru_get_top());
+  perf.set(l_mds_inodes_bottom, lru.lru_get_bot());
+  perf.set(l_mds_inodes_pin_tail, lru.lru_get_pintail());
+
+  // Capability counts.
+  perf.set(l_mds_inodes_with_caps, num_inodes_with_caps);
+  perf.set(l_mds_caps, Capability::count());
+
+  // Recursive stats of the root inode, when it is in cache.
+  if (root) {
+    const auto &rstat = root->inode.rstat;
+    perf.set(l_mds_root_rfiles, rstat.rfiles);
+    perf.set(l_mds_root_rbytes, rstat.rbytes);
+    perf.set(l_mds_root_rsnaps, rstat.rsnaps);
+  }
+}
+
+
+//
+
+// Tell the upkeep thread to stop and warn if the cache is not empty.
+// Always returns true.
+bool MDCache::shutdown()
+{
+  // Set the shutdown flag and wake the upkeep thread while holding its mutex.
+  {
+    std::scoped_lock guard(upkeep_mutex);
+    upkeep_trim_shutdown = true;
+    upkeep_cvar.notify_one();
+  }
+
+  const bool cache_nonempty = lru.lru_get_size() > 0;
+  if (cache_nonempty) {
+    dout(7) << "WARNING: mdcache shutdown with non-empty cache" << dendl;
+    show_subtrees();
+  }
+
+  return true;
+}
+
+
+// ====================================================================
+// some inode functions
+
+// Register `in` with the cache: index it in the head or snapshot inode map,
+// record it in the special-inode slots when it is a system inode, and note
+// whether the cache has grown past its limit.
+void MDCache::add_inode(CInode *in)
+{
+  // Snapshotted inodes are keyed by vino, head inodes by ino.
+  if (in->last != CEPH_NOSNAP) {
+    auto &p = snap_inode_map[in->vino()];
+    ceph_assert(!p); // should be no dup inos!
+    p = in;
+  } else {
+    auto &p = inode_map[in->ino()];
+    ceph_assert(!p); // should be no dup inos!
+    p = in;
+  }
+
+  // System inodes: remember root, our mdsdir and our own stray dirs.
+  if (in->ino() < MDS_INO_SYSTEM_BASE) {
+    if (in->ino() == MDS_INO_ROOT) {
+      root = in;
+    } else if (in->ino() == MDS_INO_MDSDIR(mds->get_nodeid())) {
+      myin = in;
+    } else if (in->is_stray() &&
+               MDS_INO_STRAY_OWNER(in->ino()) == mds->get_nodeid()) {
+      strays[MDS_INO_STRAY_INDEX(in->ino())] = in;
+    }
+
+    if (in->is_base()) {
+      base_inodes.insert(in);
+    }
+  }
+
+  if (cache_toofull()) {
+    exceeded_size_limit = true;
+  }
+}
+
+// Drop `o` from the cache and delete it. The inode is unlinked from its
+// parent dentry, cleaned, removed from every queue and index that may still
+// reference it, and must have no remaining references when it is freed.
+void MDCache::remove_inode(CInode *o)
+{
+  dout(14) << "remove_inode " << *o << dendl;
+
+  // Unlink from the parent dentry; the (clean) dentry itself stays behind.
+  if (CDentry *dn = o->get_parent_dn()) {
+    // FIXME: multiple parents?
+    ceph_assert(!dn->is_dirty());
+    dn->dir->unlink_inode(dn); // leave dentry ... FIXME?
+  }
+
+  // Clear dirty state.
+  if (o->is_dirty()) {
+    o->mark_clean();
+  }
+  if (o->is_dirty_parent()) {
+    o->clear_dirty_parent();
+  }
+  o->clear_scatter_dirty();
+
+  // Leave the open-file list and the export-pin queues.
+  o->item_open_file.remove_myself();
+  if (o->state_test(CInode::STATE_QUEUEDEXPORTPIN)) {
+    export_pin_queue.erase(o);
+  }
+  if (o->state_test(CInode::STATE_DELAYEDEXPORTPIN)) {
+    export_pin_delayed_queue.erase(o);
+  }
+
+  // Remove from the inode map; snapshotted inodes also leave the caps list.
+  if (o->last != CEPH_NOSNAP) {
+    o->item_caps.remove_myself();
+    snap_inode_map.erase(o->vino());
+  } else {
+    inode_map.erase(o->ino());
+  }
+
+  // Forget any special-inode pointers that refer to it.
+  if (o->ino() < MDS_INO_SYSTEM_BASE) {
+    if (o == root) {
+      root = nullptr;
+    }
+    if (o == myin) {
+      myin = nullptr;
+    }
+    if (o->is_stray() &&
+        MDS_INO_STRAY_OWNER(o->ino()) == mds->get_nodeid()) {
+      strays[MDS_INO_STRAY_INDEX(o->ino())] = nullptr;
+    }
+    if (o->is_base()) {
+      base_inodes.erase(o);
+    }
+  }
+
+  // delete it
+  ceph_assert(o->get_num_ref() == 0);
+  delete o;
+}
+
+// Default layout for regular files: the stock layout, placed in the first
+// data pool of `mdsmap`.
+file_layout_t MDCache::gen_default_file_layout(const MDSMap &mdsmap)
+{
+  file_layout_t layout = file_layout_t::get_default();
+  layout.pool_id = mdsmap.get_first_data_pool();
+  return layout;
+}
+
+// Default layout for the journal: the stock layout in the metadata pool.
+// A positive mds_log_segment_size overrides both object size and stripe unit.
+file_layout_t MDCache::gen_default_log_layout(const MDSMap &mdsmap)
+{
+  file_layout_t layout = file_layout_t::get_default();
+  layout.pool_id = mdsmap.get_metadata_pool();
+
+  const auto segment_size = g_conf()->mds_log_segment_size;
+  if (segment_size > 0) {
+    layout.object_size = segment_size;
+    layout.stripe_unit = segment_size;
+  }
+  return layout;
+}
+
+// Compute and cache the default file and journal layouts from the current
+// MDS map.
+void MDCache::init_layouts()
+{
+  const MDSMap &current_map = *(mds->mdsmap);
+  default_file_layout = gen_default_file_layout(current_map);
+  default_log_layout = gen_default_log_layout(current_map);
+}
+
+// Initialise `in` as a fresh system inode numbered `ino`, without adding it
+// to the cache. `mode` is OR-ed with 0500. Base inodes additionally get an
+// authority and an empty snaprealm of their own.
+void MDCache::create_unlinked_system_inode(CInode *in, inodeno_t ino,
+                                           int mode) const
+{
+  auto &pi = in->inode;
+
+  pi.ino = ino;
+  pi.version = 1;
+  pi.xattr_version = 1;
+  pi.mode = 0500 | mode;
+  pi.size = 0;
+
+  // btime, mtime and ctime all share one timestamp.
+  const auto now = ceph_clock_now();
+  pi.btime = now;
+  pi.mtime = now;
+  pi.ctime = now;
+
+  pi.nlink = 1;
+  pi.truncate_size = -1ull;
+  pi.change_attr = 0;
+  pi.export_pin = MDS_RANK_NONE;
+
+  // FIPS zeroization audit 20191117: this memset is not security related.
+  memset(&pi.dir_layout, 0, sizeof(pi.dir_layout));
+  if (!pi.is_dir()) {
+    pi.layout = default_file_layout;
+    ++pi.rstat.rfiles;
+  } else {
+    pi.dir_layout.dl_dir_hash = g_conf()->mds_default_dir_hash;
+    pi.rstat.rsubdirs = 1; /* itself */
+    pi.rstat.rctime = pi.ctime;
+  }
+  pi.accounted_rstat = pi.rstat;
+
+  if (in->is_base()) {
+    // Root belongs to this rank; an mdsdir belongs to the rank encoded in
+    // its inode number.
+    if (in->is_root()) {
+      in->inode_auth = mds_authority_t(mds->get_nodeid(), CDIR_AUTH_UNKNOWN);
+    } else {
+      in->inode_auth = mds_authority_t(mds_rank_t(in->ino() - MDS_INO_MDSDIR_OFFSET), CDIR_AUTH_UNKNOWN);
+    }
+
+    in->open_snaprealm(); // empty snaprealm
+    ceph_assert(!in->snaprealm->parent); // created its own
+    in->snaprealm->srnode.seq = 1;
+  }
+}
+
+// Allocate a system inode numbered `ino`, initialise it and add it to the
+// cache. Returns the new inode.
+CInode *MDCache::create_system_inode(inodeno_t ino, int mode)
+{
+  dout(0) << "creating system inode with ino:" << ino << dendl;
+
+  CInode *fresh = new CInode(this);
+  create_unlinked_system_inode(fresh, ino, mode);
+  add_inode(fresh);
+
+  return fresh;
+}
+
+// Create the root directory inode (mode 0755). Ownership comes from the
+// mds_root_ino_uid/gid options and the layout is the default file layout in
+// the first data pool.
+CInode *MDCache::create_root_inode()
+{
+  CInode *root_in = create_system_inode(MDS_INO_ROOT, S_IFDIR|0755);
+
+  auto &pi = root_in->inode;
+  pi.uid = g_conf()->mds_root_ino_uid;
+  pi.gid = g_conf()->mds_root_ino_gid;
+  pi.layout = default_file_layout;
+  pi.layout.pool_id = mds->mdsmap->get_first_data_pool();
+
+  return root_in;
+}
+
// Create a brand-new, empty root directory and write it out.
// Each store operation started here registers a sub-context on `gather`.
+void MDCache::create_empty_hierarchy(MDSGather *gather)
+{
+ // create root dir
// NOTE: this local `root` shadows the cache-wide `root` pointer that
// add_inode() sets when the root inode is added.
+ CInode *root = create_root_inode();
+
+ // force empty root dir
// Open the single (frag_t()) dirfrag, make this rank its subtree authority
// and set its replication mode to REP_ALL.
+ CDir *rootdir = root->get_or_open_dirfrag(this, frag_t());
+ adjust_subtree_auth(rootdir, mds->get_nodeid());
+ rootdir->dir_rep = CDir::REP_ALL; //NONE;
+
// Sanity: the new dirfrag's stats must agree with each other and with the inode.
+ ceph_assert(rootdir->fnode.accounted_fragstat == rootdir->fnode.fragstat);
+ ceph_assert(rootdir->fnode.fragstat == root->inode.dirstat);
+ ceph_assert(rootdir->fnode.accounted_rstat == rootdir->fnode.rstat);
+ /* Do not update the fragment's rstat information: the rstat upkeep logic
+ * assumes version 0 is stale/invalid.
+ */
+
// Mark the dirfrag complete and dirty in the current log segment, then commit it.
+ rootdir->mark_complete();
+ rootdir->mark_dirty(rootdir->pre_dirty(), mds->mdlog->get_current_segment());
+ rootdir->commit(0, gather->new_sub());
+
// Reset the inode's dirty state, re-dirty it (and its parent/backtrace
// state) in the current segment, then flush it.
+ root->mark_clean();
+ root->mark_dirty(root->pre_dirty(), mds->mdlog->get_current_segment());
+ root->mark_dirty_parent(mds->mdlog->get_current_segment(), true);
+ root->flush(gather->new_sub());
+}
+
// Create this rank's private mdsdir together with its NUM_STRAY stray
// directories and write them all out. Each store operation started here
// registers a sub-context on `gather`.
+void MDCache::create_mydir_hierarchy(MDSGather *gather)
+{
+ // create mds dir
// add_inode() (via create_system_inode) also records this inode in `myin`,
// which is used further down.
+ CInode *my = create_system_inode(MDS_INO_MDSDIR(mds->get_nodeid()), S_IFDIR);
+
+ CDir *mydir = my->get_or_open_dirfrag(this, frag_t());
+ adjust_subtree_auth(mydir, mds->get_nodeid());
+
+ LogSegment *ls = mds->mdlog->get_current_segment();
+
+ // stray dir
+ for (int i = 0; i < NUM_STRAY; ++i) {
+ CInode *stray = create_system_inode(MDS_INO_STRAY(mds->get_nodeid(), i), S_IFDIR);
+ CDir *straydir = stray->get_or_open_dirfrag(this, frag_t());
// Link the stray dir into mydir as "stray<i>".
+ stringstream name;
+ name << "stray" << i;
+ CDentry *sdn = mydir->add_primary_dentry(name.str(), stray);
+ sdn->_mark_dirty(mds->mdlog->get_current_segment());
+
+ stray->inode.dirstat = straydir->fnode.fragstat;
+
// Account the new subdirectory in mydir's recursive and fragment stats.
+ mydir->fnode.rstat.add(stray->inode.rstat);
+ mydir->fnode.fragstat.nsubdirs++;
+ // save them
+ straydir->mark_complete();
+ straydir->mark_dirty(straydir->pre_dirty(), ls);
+ straydir->commit(0, gather->new_sub());
+ stray->mark_dirty_parent(ls, true);
+ stray->store_backtrace(gather->new_sub());
+ }
+
// All stats accumulated above are considered accounted for.
+ mydir->fnode.accounted_fragstat = mydir->fnode.fragstat;
+ mydir->fnode.accounted_rstat = mydir->fnode.rstat;
+
// Propagate the dirfrag stats to the mdsdir inode, counting the inode itself
// as one more subdirectory.
+ myin->inode.dirstat = mydir->fnode.fragstat;
+ myin->inode.rstat = mydir->fnode.rstat;
+ ++myin->inode.rstat.rsubdirs;
+ myin->inode.accounted_rstat = myin->inode.rstat;
+
+ mydir->mark_complete();
+ mydir->mark_dirty(mydir->pre_dirty(), ls);
+ mydir->commit(0, gather->new_sub());
+
+ myin->store(gather->new_sub());
+}
+
+// Log-completion context for _create_system_file(): once the journal entry
+// is handled, forwards the saved state to _create_system_file_finish().
+struct C_MDC_CreateSystemFile : public MDCacheLogContext {
+  MutationRef mut;   // mutation holding the forced locks
+  CDentry *dn;       // dentry being linked
+  version_t dpv;     // projected dentry version
+  MDSContext *fin;   // caller's completion
+
+  C_MDC_CreateSystemFile(MDCache *c, MutationRef& mu, CDentry *d, version_t v, MDSContext *f)
+    : MDCacheLogContext(c),
+      mut(mu),
+      dn(d),
+      dpv(v),
+      fin(f)
+  {}
+
+  void finish(int r) override
+  {
+    mdcache->_create_system_file_finish(mut, dn, dpv, fin);
+  }
+};
+
// Link the already-created system inode `in` into `dir` under `name` and
// journal the creation. `fin` is completed from _create_system_file_finish()
// through the C_MDC_CreateSystemFile log context submitted at the end.
+void MDCache::_create_system_file(CDir *dir, std::string_view name, CInode *in, MDSContext *fin)
+{
+ dout(10) << "_create_system_file " << name << " in " << *dir << dendl;
+ CDentry *dn = dir->add_null_dentry(name);
+
// Project the new linkage and reserve the dentry version handed to the
// finish callback.
+ dn->push_projected_linkage(in);
+ version_t dpv = dn->pre_dirty();
+
+ CDir *mdir = 0;
+ if (in->inode.is_dir()) {
+ in->inode.rstat.rsubdirs = 1;
+
// A new directory also gets its (empty, complete) dirfrag opened now.
+ mdir = in->get_or_open_dirfrag(this, frag_t());
+ mdir->mark_complete();
+ mdir->pre_dirty();
+ } else
+ in->inode.rstat.rfiles = 1;
// NOTE: pre_dirty() is called on the dentry a second time here; the result
// becomes the inode's version.
+ in->inode.version = dn->pre_dirty();
+
// The new dentry and inode first exist after the newest snapshot of the
// containing realm.
+ SnapRealm *realm = dir->get_inode()->find_snaprealm();
+ dn->first = in->first = realm->get_newest_seq() + 1;
+
+ MutationRef mut(new MutationImpl());
+
+ // force some locks. hacky.
+ mds->locker->wrlock_force(&dir->inode->filelock, mut);
+ mds->locker->wrlock_force(&dir->inode->nestlock, mut);
+
+ mut->ls = mds->mdlog->get_current_segment();
+ EUpdate *le = new EUpdate(mds->mdlog, "create system file");
+ mds->mdlog->start_entry(le);
+
// Ordinary system files are journaled as a primary dentry; an mdsdir inode
// is journaled as a remote dentry plus a root entry instead.
+ if (!in->is_mdsdir()) {
+ predirty_journal_parents(mut, &le->metablob, in, dir, PREDIRTY_PRIMARY|PREDIRTY_DIR, 1);
+ le->metablob.add_primary_dentry(dn, in, true);
+ } else {
+ predirty_journal_parents(mut, &le->metablob, in, dir, PREDIRTY_DIR, 1);
+ journal_dirty_inode(mut.get(), &le->metablob, in);
// NOTE(review): this pushes a second projected linkage on top of the one
// pushed above, while the finish path pops only one -- confirm intended.
+ dn->push_projected_linkage(in->ino(), in->d_type());
+ le->metablob.add_remote_dentry(dn, true, in->ino(), in->d_type());
+ le->metablob.add_root(true, in);
+ }
+ if (mdir)
+ le->metablob.add_new_dir(mdir); // dirty AND complete AND new
+
+ mds->mdlog->submit_entry(le, new C_MDC_CreateSystemFile(this, mut, dn, dpv, fin));
+ mds->mdlog->flush();
+}
+
// Second half of _create_system_file(), run from C_MDC_CreateSystemFile:
// apply the projected linkage, mark dentry/inode/dirfrag dirty in the
// mutation's log segment, release the forced locks and complete `fin`.
+void MDCache::_create_system_file_finish(MutationRef& mut, CDentry *dn, version_t dpv, MDSContext *fin)
+{
+ dout(10) << "_create_system_file_finish " << *dn << dendl;
+
+ dn->pop_projected_linkage();
+ dn->mark_dirty(dpv, mut->ls);
+
+ CInode *in = dn->get_linkage()->get_inode();
// Step the version back by one and pass the original value to mark_dirty().
// (Presumably mark_dirty() expects the current version to be one below its
// argument -- confirm against CInode::mark_dirty.)
+ in->inode.version--;
+ in->mark_dirty(in->inode.version + 1, mut->ls);
+
+ if (in->inode.is_dir()) {
// The dirfrag was opened by _create_system_file(); it must still be there.
+ CDir *dir = in->get_dirfrag(frag_t());
+ ceph_assert(dir);
+ dir->mark_dirty(1, mut->ls);
+ dir->mark_new(mut->ls);
+ }
+
+ mut->apply();
+ mds->locker->drop_locks(mut.get());
+ mut->cleanup();
+
+ fin->complete(0);
+
+ //if (dir && MDS_INO_IS_MDSDIR(in->ino()))
+ //migrator->export_dir(dir, (int)in->ino() - MDS_INO_MDSDIR_OFFSET);
+}
+
+
+
+// Completion that re-enters MDCache::open_root() after an asynchronous step
+// finishes. A failure marks the rank damaged.
+struct C_MDS_RetryOpenRoot : public MDSInternalContext {
+  MDCache *cache;
+
+  explicit C_MDS_RetryOpenRoot(MDCache *c)
+    : MDSInternalContext(c->mds), cache(c)
+  {}
+
+  void finish(int r) override
+  {
+    if (r >= 0) {
+      cache->open_root();
+      return;
+    }
+
+    // If we can't open root, something disastrous has happened: mark
+    // this rank damaged for operator intervention. Note that
+    // it is not okay to call suicide() here because we are in
+    // a Finisher callback.
+    cache->mds->damaged();
+    ceph_abort(); // damaged should never return
+  }
+};
+
+// Bring the root inode into cache and complete `c` when done. The rank that
+// the MDS map names as root fetches it from storage; every other rank
+// discovers it from that rank.
+void MDCache::open_root_inode(MDSContext *c)
+{
+  const mds_rank_t root_rank = mds->mdsmap->get_root();
+
+  if (mds->get_nodeid() != root_rank) {
+    discover_base_ino(MDS_INO_ROOT, c, root_rank);
+    return;
+  }
+
+  // The in-memory inode is only a placeholder until the fetch completes.
+  CInode *placeholder = create_system_inode(MDS_INO_ROOT, S_IFDIR|0755); // initially inaccurate!
+  placeholder->fetch(c);
+}
+
+// Create a placeholder for this rank's mdsdir inode and fetch its real
+// contents; `c` is completed by the fetch.
+void MDCache::open_mydir_inode(MDSContext *c)
+{
+  const auto whoami = mds->get_nodeid();
+  CInode *placeholder = create_system_inode(MDS_INO_MDSDIR(whoami), S_IFDIR|0755); // initially inaccurate!
+  placeholder->fetch(c);
+}
+
+// Open this rank's mdsdir inode, then its dirfrag. `c` is completed with the
+// error if the inode cannot be opened, otherwise by the dirfrag fetch.
+void MDCache::open_mydir_frag(MDSContext *c)
+{
+  // Runs once the mdsdir inode has been opened (or has failed to open).
+  auto on_inode_open = [this, c](int r) {
+    if (r < 0) {
+      c->complete(r);
+      return;
+    }
+
+    CDir *mydir = myin->get_or_open_dirfrag(this, frag_t());
+    ceph_assert(mydir);
+    adjust_subtree_auth(mydir, mds->get_nodeid());
+    mydir->fetch(c);
+  };
+
+  open_mydir_inode(
+    new MDSInternalContextWrapper(mds, new FunctionContext(on_inode_open)));
+}
+
// Open the root inode/dirfrag and this rank's mdsdir, then populate the
// mdsdir. Re-entrant: whenever a step has to wait, a C_MDS_RetryOpenRoot is
// queued and the function returns; that context calls open_root() again.
+void MDCache::open_root()
+{
+ dout(10) << "open_root" << dendl;
+
// Step 1: the root inode itself.
+ if (!root) {
+ open_root_inode(new C_MDS_RetryOpenRoot(this));
+ return;
+ }
// Step 2: the root dirfrag. The rank named as root by the MDS map owns it and
// fetches it; other ranks open it remotely.
+ if (mds->get_nodeid() == mds->mdsmap->get_root()) {
+ ceph_assert(root->is_auth());
+ CDir *rootdir = root->get_or_open_dirfrag(this, frag_t());
+ ceph_assert(rootdir);
+ if (!rootdir->is_subtree_root())
+ adjust_subtree_auth(rootdir, mds->get_nodeid());
+ if (!rootdir->is_complete()) {
+ rootdir->fetch(new C_MDS_RetryOpenRoot(this));
+ return;
+ }
+ } else {
+ ceph_assert(!root->is_auth());
+ CDir *rootdir = root->get_dirfrag(frag_t());
+ if (!rootdir) {
+ open_remote_dirfrag(root, frag_t(), new C_MDS_RetryOpenRoot(this));
+ return;
+ }
+ }
+
// Step 3: this rank's mdsdir inode.
+ if (!myin) {
+ CInode *in = create_system_inode(MDS_INO_MDSDIR(mds->get_nodeid()), S_IFDIR|0755); // initially inaccurate!
+ in->fetch(new C_MDS_RetryOpenRoot(this));
+ return;
+ }
// Step 4: the mdsdir dirfrag, for which this rank is the subtree authority.
+ CDir *mydir = myin->get_or_open_dirfrag(this, frag_t());
+ ceph_assert(mydir);
+ adjust_subtree_auth(mydir, mds->get_nodeid());
+
+ populate_mydir();
+}
+
// Make sure the mdsdir dirfrag and all NUM_STRAY stray directories (with all
// of their dirfrags) are loaded, creating missing stray dirs on the way.
// Every asynchronous step queues a C_MDS_RetryOpenRoot and returns, so this
// function is entered repeatedly (via open_root()) until nothing is left to
// wait for; only then is the cache marked open.
+void MDCache::populate_mydir()
+{
+ ceph_assert(myin);
+ CDir *mydir = myin->get_or_open_dirfrag(this, frag_t());
+ ceph_assert(mydir);
+
+ dout(10) << "populate_mydir " << *mydir << dendl;
+
+ if (!mydir->is_complete()) {
+ mydir->fetch(new C_MDS_RetryOpenRoot(this));
+ return;
+ }
+
+ if (mydir->get_version() == 0 && mydir->state_test(CDir::STATE_BADFRAG)) {
+ // A missing dirfrag, we will recreate it. Before that, we must dirty
+ // it before dirtying any of the strays we create within it.
+ mds->clog->warn() << "fragment " << mydir->dirfrag() << " was unreadable, "
+ "recreating it now";
+ LogSegment *ls = mds->mdlog->get_current_segment();
+ mydir->state_clear(CDir::STATE_BADFRAG);
+ mydir->mark_complete();
+ mydir->mark_dirty(mydir->pre_dirty(), ls);
+ }
+
+ // open or create stray
// num_strays accumulates the size of every stray dirfrag seen below.
+ uint64_t num_strays = 0;
+ for (int i = 0; i < NUM_STRAY; ++i) {
+ stringstream name;
+ name << "stray" << i;
+ CDentry *straydn = mydir->lookup(name.str());
+
+ // allow for older fs's with stray instead of stray0
+ if (straydn == NULL && i == 0)
+ straydn = mydir->lookup("stray");
+
// Missing dentry or unlinked inode: create the stray dir and come back later.
+ if (!straydn || !straydn->get_linkage()->get_inode()) {
+ _create_system_file(mydir, name.str().c_str(), create_system_inode(MDS_INO_STRAY(mds->get_nodeid(), i), S_IFDIR),
+ new C_MDS_RetryOpenRoot(this));
+ return;
+ }
+ ceph_assert(straydn);
// strays[i] is filled in by add_inode() when the stray inode enters the cache.
+ ceph_assert(strays[i]);
+ // we make multiple passes through this method; make sure we only pin each stray once.
+ if (!strays[i]->state_test(CInode::STATE_STRAYPINNED)) {
+ strays[i]->get(CInode::PIN_STRAY);
+ strays[i]->state_set(CInode::STATE_STRAYPINNED);
+ strays[i]->get_stickydirs();
+ }
+ dout(20) << " stray num " << i << " is " << *strays[i] << dendl;
+
+ // open all frags
+ frag_vec_t leaves;
+ strays[i]->dirfragtree.get_leaves(leaves);
+ for (const auto& leaf : leaves) {
+ CDir *dir = strays[i]->get_dirfrag(leaf);
+ if (!dir) {
+ dir = strays[i]->get_or_open_dirfrag(this, leaf);
+ }
+
+ // DamageTable applies special handling to strays: it will
+ // have damaged() us out if one is damaged.
+ ceph_assert(!dir->state_test(CDir::STATE_BADFRAG));
+
// Version 0 is treated as "not fetched yet": fetch it and retry.
+ if (dir->get_version() == 0) {
+ dir->fetch(new C_MDS_RetryOpenRoot(this));
+ return;
+ }
+
+ if (dir->get_frag_size() > 0)
+ num_strays += dir->get_frag_size();
+ }
+ }
+
+ // okay!
+ dout(10) << "populate_mydir done" << dendl;
// This point must be reached exactly once: mark the cache open and wake
// everything that was waiting for it.
+ ceph_assert(!open);
+ open = true;
+ mds->queue_waiters(waiting_for_open);
+
+ stray_manager.set_num_strays(num_strays);
+ stray_manager.activate();
+
+ scan_stray_dir();
+}
+
+// Discover another rank's mdsdir inode. The owning rank is taken from the
+// low bits of `ino` (masked with MAX_MDS-1); `fin` is passed on to the
+// discover.
+void MDCache::open_foreign_mdsdir(inodeno_t ino, MDSContext *fin)
+{
+  const mds_rank_t owner = mds_rank_t(ino & (MAX_MDS-1));
+  discover_base_ino(ino, fin, owner);
+}
+
+// Return the stray dirfrag that would hold the stray dentry for `in`.
+// Asserts that the stray inode and the chosen dirfrag are in cache.
+CDir *MDCache::get_stray_dir(CInode *in)
+{
+  string dname;
+  in->name_stray_dentry(dname);
+
+  CInode *strayi = get_stray();
+  ceph_assert(strayi);
+
+  // The dentry name selects the dirfrag.
+  const frag_t target = strayi->pick_dirfrag(dname);
+  CDir *straydir = strayi->get_dirfrag(target);
+  ceph_assert(straydir);
+
+  return straydir;
+}
+
+// Return the stray dentry for `in`, creating a new null dentry when none
+// exists yet. An existing dentry must have a null projected linkage. The
+// dentry is flagged STATE_STRAY either way.
+CDentry *MDCache::get_or_create_stray_dentry(CInode *in)
+{
+  CDir *dir = get_stray_dir(in);
+
+  string dname;
+  in->name_stray_dentry(dname);
+
+  CDentry *straydn = dir->lookup(dname);
+  if (straydn) {
+    ceph_assert(straydn->get_projected_linkage()->is_null());
+  } else {
+    straydn = dir->add_null_dentry(dname);
+    straydn->mark_new();
+  }
+
+  straydn->state_set(CDentry::STATE_STRAY);
+  return straydn;
+}
+
+
+
+// Resolve `info` to a cached object: an inode when info.ino is set,
+// otherwise the named dentry inside info.dirfrag, or the dirfrag itself when
+// no dentry name is given. Returns null when the dirfrag is not in cache.
+MDSCacheObject *MDCache::get_object(const MDSCacheObjectInfo &info)
+{
+  // inode?
+  if (info.ino) {
+    return get_inode(info.ino, info.snapid);
+  }
+
+  // dir or dentry.
+  CDir *frag = get_dirfrag(info.dirfrag);
+  if (!frag) {
+    return nullptr;
+  }
+
+  if (!info.dname.length()) {
+    return frag;
+  }
+  return frag->lookup(info.dname, info.snapid);
+}
+
+
+
+
+// ====================================================================
+// subtree management
+
+/*
+ * adjust the dir_auth of a subtree.
+ * merge with parent and/or child subtrees, if it is appropriate.
+ * merge can ONLY happen if both parent and child have unambiguous auth.
+ */
// NOTE(review): no merge is performed in this function body itself; merging
// is done by try_subtree_merge*().
// If `dir` is not yet a subtree root it becomes one: it takes over the bounds
// of the enclosing subtree that lie beneath it and is itself recorded as a
// bound of that enclosing subtree. `adjust_pop` controls whether the
// ancestors' pop_auth_subtree counters are reduced accordingly.
+void MDCache::adjust_subtree_auth(CDir *dir, mds_authority_t auth, bool adjust_pop)
+{
+ dout(7) << "adjust_subtree_auth " << dir->get_dir_auth() << " -> " << auth
+ << " on " << *dir << dendl;
+
+ show_subtrees();
+
// NOTE: this local `root` (the enclosing subtree's root dirfrag) shadows the
// cache-wide `root` inode pointer.
+ CDir *root;
+ if (dir->inode->is_base()) {
+ root = dir; // bootstrap hack.
+ if (subtrees.count(root) == 0) {
+ subtrees[root];
+ root->get(CDir::PIN_SUBTREE);
+ }
+ } else {
+ root = get_subtree_root(dir); // subtree root
+ }
+ ceph_assert(root);
+ ceph_assert(subtrees.count(root));
+ dout(7) << " current root is " << *root << dendl;
+
+ if (root == dir) {
+ // i am already a subtree.
+ dir->set_dir_auth(auth);
+ } else {
+ // i am a new subtree.
+ dout(10) << " new subtree at " << *dir << dendl;
+ ceph_assert(subtrees.count(dir) == 0);
+ subtrees[dir]; // create empty subtree bounds list for me.
+ dir->get(CDir::PIN_SUBTREE);
+
+ // set dir_auth
+ dir->set_dir_auth(auth);
+
+ // move items nested beneath me, under me.
// `next` is saved before a possible erase so the iteration survives it.
+ set<CDir*>::iterator p = subtrees[root].begin();
+ while (p != subtrees[root].end()) {
+ set<CDir*>::iterator next = p;
+ ++next;
+ if (get_subtree_root((*p)->get_parent_dir()) == dir) {
+ // move under me
+ dout(10) << " claiming child bound " << **p << dendl;
+ subtrees[dir].insert(*p);
+ subtrees[root].erase(p);
+ }
+ p = next;
+ }
+
+ // i am a bound of the parent subtree.
+ subtrees[root].insert(dir);
+
+ // i am now the subtree root.
+ root = dir;
+
+ // adjust recursive pop counters
// Subtract this dirfrag's auth-subtree popularity from every ancestor up to
// and including the nearest enclosing subtree root.
+ if (adjust_pop && dir->is_auth()) {
+ CDir *p = dir->get_parent_dir();
+ while (p) {
+ p->pop_auth_subtree.sub(dir->pop_auth_subtree);
+ if (p->is_subtree_root()) break;
+ p = p->inode->get_parent_dir();
+ }
+ }
+ }
+
+ show_subtrees();
+}
+
+
+// Try to merge the subtree rooted at `dir` into its parent, then try the
+// same for each of the bounds it had on entry. Outside of replay/resolve,
+// the inodes collected during merging are re-evaluated afterwards.
+void MDCache::try_subtree_merge(CDir *dir)
+{
+  dout(7) << "try_subtree_merge " << *dir << dendl;
+
+  // Work on a copy of the bounds: merging rewrites the live subtree map.
+  const auto saved_bounds = subtrees.at(dir);
+
+  set<CInode*> to_eval;
+
+  // First at the subtree root itself ...
+  try_subtree_merge_at(dir, &to_eval);
+
+  // ... then at each former bound.
+  for (CDir *bound : saved_bounds) {
+    try_subtree_merge_at(bound, &to_eval);
+  }
+
+  if (!mds->is_any_replay() && !mds->is_resolve()) {
+    for (CInode *diri : to_eval) {
+      eval_subtree_root(diri);
+    }
+  }
+}
+
+// Log-completion context that forwards to
+// MDCache::subtree_merge_writebehind_finish() with the saved inode and
+// mutation.
+class C_MDC_SubtreeMergeWB : public MDCacheLogContext {
+  CInode *in;
+  MutationRef mut;
+
+public:
+  C_MDC_SubtreeMergeWB(MDCache *mdc, CInode *i, MutationRef& m)
+    : MDCacheLogContext(mdc),
+      in(i),
+      mut(m)
+  {}
+
+  void finish(int r) override
+  {
+    mdcache->subtree_merge_writebehind_finish(in, mut);
+  }
+};
+
// Merge the subtree rooted at `dir` into its parent subtree when both have
// the same dir_auth. `dir` must currently be a subtree root (asserted).
//   to_eval    - optional; receives dir's inode when a merge happened and
//                that inode is auth, so the caller can re-evaluate it.
//   adjust_pop - when true, dir's auth-subtree popularity is added back to
//                its ancestors.
+void MDCache::try_subtree_merge_at(CDir *dir, set<CInode*> *to_eval, bool adjust_pop)
+{
+ dout(10) << "try_subtree_merge_at " << *dir << dendl;
+
// No merge when the second auth slot is set, or when the dirfrag is an
// export bound or an auxiliary subtree.
+ if (dir->dir_auth.second != CDIR_AUTH_UNKNOWN ||
+ dir->state_test(CDir::STATE_EXPORTBOUND) ||
+ dir->state_test(CDir::STATE_AUXSUBTREE))
+ return;
+
+ auto it = subtrees.find(dir);
+ ceph_assert(it != subtrees.end());
+
+ // merge with parent?
// For a base inode's dirfrag there is no parent; `parent` stays == dir and
// the merge below is skipped.
+ CDir *parent = dir;
+ if (!dir->inode->is_base())
+ parent = get_subtree_root(dir->get_parent_dir());
+
+ if (parent != dir && // we have a parent,
+ parent->dir_auth == dir->dir_auth) { // auth matches,
+ // merge with parent.
+ dout(10) << " subtree merge at " << *dir << dendl;
+ dir->set_dir_auth(CDIR_AUTH_DEFAULT);
+
+ // move our bounds under the parent
+ subtrees[parent].insert(it->second.begin(), it->second.end());
+
+ // we are no longer a subtree or bound
// `it` is invalid after the erase and is not used again.
+ dir->put(CDir::PIN_SUBTREE);
+ subtrees.erase(it);
+ subtrees[parent].erase(dir);
+
+ // adjust popularity?
// Walk up to the enclosing subtree root, adding dir's popularity to each
// ancestor and pushing the child inode onto the ancestor's pop_lru_subdirs.
+ if (adjust_pop && dir->is_auth()) {
+ CDir *cur = dir;
+ CDir *p = dir->get_parent_dir();
+ while (p) {
+ p->pop_auth_subtree.add(dir->pop_auth_subtree);
+ p->pop_lru_subdirs.push_front(&cur->get_inode()->item_pop_lru);
+ if (p->is_subtree_root()) break;
+ cur = p;
+ p = p->inode->get_parent_dir();
+ }
+ }
+
+ if (to_eval && dir->get_inode()->is_auth())
+ to_eval->insert(dir->get_inode());
+
+ show_subtrees(15);
+ }
+}
+
+// Completion for C_MDC_SubtreeMergeWB: apply and dirty the projected inode
+// in the mutation's log segment, apply the mutation, release its locks and
+// drop the auth pin held on `in`.
+void MDCache::subtree_merge_writebehind_finish(CInode *in, MutationRef& mut)
+{
+  // Stream the inode itself rather than its address, as the other debug
+  // lines in this file do for cache objects.
+  dout(10) << "subtree_merge_writebehind_finish on " << *in << dendl;
+  in->pop_and_dirty_projected_inode(mut->ls);
+
+  mut->apply();
+  mds->locker->drop_locks(mut.get());
+  mut->cleanup();
+
+  in->auth_unpin(this);
+}
+
+// Ask the locker to re-evaluate the file and nest locks of a directory inode
+// (the filelock should be scattered on subtree bounds). `diri` must be auth.
+void MDCache::eval_subtree_root(CInode *diri)
+{
+  ceph_assert(diri->is_auth());
+
+  const auto lock_mask = CEPH_LOCK_IFILE | CEPH_LOCK_INEST;
+  mds->locker->try_eval(diri, lock_mask);
+}
+
+
// Make `dir` a subtree root with authority `auth` whose bounds are exactly
// `bounds`. New bounds are carved out (keeping the previous authority),
// subtrees lying between `dir` and a wanted bound are swallowed, and any
// existing bound not listed in `bounds` is merged away. The result is
// checked with verify_subtree_bounds().
+void MDCache::adjust_bounded_subtree_auth(CDir *dir, const set<CDir*>& bounds, mds_authority_t auth)
+{
+ dout(7) << "adjust_bounded_subtree_auth " << dir->get_dir_auth() << " -> " << auth
+ << " on " << *dir
+ << " bounds " << bounds
+ << dendl;
+
+ show_subtrees();
+
// NOTE(review): the bootstrap case here tests for MDS_INO_ROOT only, whereas
// adjust_subtree_auth() tests inode->is_base() -- confirm the difference is
// intended.
+ CDir *root;
+ if (dir->ino() == MDS_INO_ROOT) {
+ root = dir; // bootstrap hack.
+ if (subtrees.count(root) == 0) {
+ subtrees[root];
+ root->get(CDir::PIN_SUBTREE);
+ }
+ } else {
+ root = get_subtree_root(dir); // subtree root
+ }
+ ceph_assert(root);
+ ceph_assert(subtrees.count(root));
+ dout(7) << " current root is " << *root << dendl;
+
// Remember the authority before the change; new bounds are handed back to it.
+ mds_authority_t oldauth = dir->authority();
+
+ if (root == dir) {
+ // i am already a subtree.
+ dir->set_dir_auth(auth);
+ } else {
+ // i am a new subtree.
+ dout(10) << " new subtree at " << *dir << dendl;
+ ceph_assert(subtrees.count(dir) == 0);
+ subtrees[dir]; // create empty subtree bounds list for me.
+ dir->get(CDir::PIN_SUBTREE);
+
+ // set dir_auth
+ dir->set_dir_auth(auth);
+
+ // move items nested beneath me, under me.
// `next` is saved before a possible erase so the iteration survives it.
+ set<CDir*>::iterator p = subtrees[root].begin();
+ while (p != subtrees[root].end()) {
+ set<CDir*>::iterator next = p;
+ ++next;
+ if (get_subtree_root((*p)->get_parent_dir()) == dir) {
+ // move under me
+ dout(10) << " claiming child bound " << **p << dendl;
+ subtrees[dir].insert(*p);
+ subtrees[root].erase(p);
+ }
+ p = next;
+ }
+
+ // i am a bound of the parent subtree.
+ subtrees[root].insert(dir);
+
+ // i am now the subtree root.
+ root = dir;
+ }
+
// Inodes of merged subtrees, re-evaluated at the very end.
+ set<CInode*> to_eval;
+
+ // verify/adjust bounds.
+ // - these may be new, or
+ // - beneath existing ambiguous bounds (which will be collapsed),
+ // - but NOT beneath unambiguous bounds.
+ for (const auto& bound : bounds) {
+ // new bound?
+ if (subtrees[dir].count(bound) == 0) {
+ if (get_subtree_root(bound) == dir) {
+ dout(10) << " new bound " << *bound << ", adjusting auth back to old " << oldauth << dendl;
+ adjust_subtree_auth(bound, oldauth); // otherwise, adjust at bound.
+ }
+ else {
+ dout(10) << " want bound " << *bound << dendl;
+ CDir *t = get_subtree_root(bound->get_parent_dir());
+ if (subtrees[t].count(bound) == 0) {
+ ceph_assert(t != dir);
+ dout(10) << " new bound " << *bound << dendl;
+ adjust_subtree_auth(bound, t->authority());
+ }
+ // make sure it's nested beneath ambiguous subtree(s)
// Repeatedly find the direct bound of `dir` that encloses `bound`, take it
// over and merge it, until `bound`'s enclosing subtree is `dir` itself.
+ while (1) {
+ while (subtrees[dir].count(t) == 0)
+ t = get_subtree_root(t->get_parent_dir());
+ dout(10) << " swallowing intervening subtree at " << *t << dendl;
+ adjust_subtree_auth(t, auth);
+ try_subtree_merge_at(t, &to_eval);
+ t = get_subtree_root(bound->get_parent_dir());
+ if (t == dir) break;
+ }
+ }
+ }
+ else {
+ dout(10) << " already have bound " << *bound << dendl;
+ }
+ }
+ // merge stray bounds?
// Iterate over a copy because merging modifies subtrees[dir]; repeat until a
// pass leaves the bound set unchanged.
+ while (!subtrees[dir].empty()) {
+ set<CDir*> copy = subtrees[dir];
+ for (set<CDir*>::iterator p = copy.begin(); p != copy.end(); ++p) {
+ if (bounds.count(*p) == 0) {
+ CDir *stray = *p;
+ dout(10) << " swallowing extra subtree at " << *stray << dendl;
+ adjust_subtree_auth(stray, auth);
+ try_subtree_merge_at(stray, &to_eval);
+ }
+ }
+ // swallowing subtree may add new subtree bounds
+ if (copy == subtrees[dir])
+ break;
+ }
+
+ // bound should now match.
+ verify_subtree_bounds(dir, bounds);
+
+ show_subtrees();
+
+ if (!(mds->is_any_replay() || mds->is_resolve())) {
+ for(auto in : to_eval)
+ eval_subtree_root(in);
+ }
+}
+
+
+/*
+ * return a set of CDir*'s that correspond to the given bound set. Only adjust
+ * fragmentation as necessary to get an equivalent bounding set. That is, only
+ * split if one of our frags spans the provided bounding set. Never merge.
+ */
// The result is added to `bounds`. Inodes that are not in cache are skipped,
// as are leaf frags whose CDir is not open.
+void MDCache::get_force_dirfrag_bound_set(const vector<dirfrag_t>& dfs, set<CDir*>& bounds)
+{
+ dout(10) << "get_force_dirfrag_bound_set " << dfs << dendl;
+
+ // sort by ino
+ map<inodeno_t, fragset_t> byino;
+ for (auto& frag : dfs) {
+ byino[frag.ino].insert(frag.frag);
+ }
+ dout(10) << " by ino: " << byino << dendl;
+
+ for (map<inodeno_t,fragset_t>::iterator p = byino.begin(); p != byino.end(); ++p) {
+ CInode *diri = get_inode(p->first);
+ if (!diri)
+ continue;
+ dout(10) << " checking fragset " << p->second.get() << " on " << *diri << dendl;
+
// Scratch frag tree in which every requested frag is forced to be a leaf.
+ fragtree_t tmpdft;
+ for (set<frag_t>::iterator q = p->second.begin(); q != p->second.end(); ++q)
+ tmpdft.force_to_leaf(g_ceph_context, *q);
+
+ for (const auto& fg : p->second) {
+ frag_vec_t leaves;
+ diri->dirfragtree.get_leaves_under(fg, leaves);
+ if (leaves.empty()) {
// Nothing in our tree lies under fg: look at the frag of ours that contains
// fg's value and at how the requested set divides it.
+ bool all = true;
+ frag_t approx_fg = diri->dirfragtree[fg.value()];
+ frag_vec_t approx_leaves;
+ tmpdft.get_leaves_under(approx_fg, approx_leaves);
+ for (const auto& leaf : approx_leaves) {
+ if (p->second.get().count(leaf) == 0) {
+ // not bound, so the resolve message is from auth MDS of the dirfrag
+ force_dir_fragment(diri, leaf);
+ all = false;
+ }
+ }
// Every piece is a requested bound: our frag as a whole is the bound.
// Otherwise the tree was just refragmented, so query it again.
+ if (all)
+ leaves.push_back(approx_fg);
+ else
+ diri->dirfragtree.get_leaves_under(fg, leaves);
+ }
+ dout(10) << " frag " << fg << " contains " << leaves << dendl;
+ for (const auto& leaf : leaves) {
+ CDir *dir = diri->get_dirfrag(leaf);
+ if (dir)
+ bounds.insert(dir);
+ }
+ }
+ }
+}
+
+// Overload taking the bounds as dirfrag ids: they are first mapped to cached
+// CDir objects with get_force_dirfrag_bound_set(), then the set-based
+// overload does the work.
+void MDCache::adjust_bounded_subtree_auth(CDir *dir, const vector<dirfrag_t>& bound_dfs, const mds_authority_t &auth)
+{
+  dout(7) << "adjust_bounded_subtree_auth " << dir->get_dir_auth() << " -> " << auth
+          << " on " << *dir << " bound_dfs " << bound_dfs << dendl;
+
+  set<CDir*> resolved_bounds;
+  get_force_dirfrag_bound_set(bound_dfs, resolved_bounds);
+
+  adjust_bounded_subtree_auth(dir, resolved_bounds, auth);
+}
+
+// Translate a list of dirfrag ids into the open CDir objects that cover
+// them, using each inode's current fragmentation. Matches are added to
+// `result`; inodes not in cache and leaf frags without an open CDir are
+// skipped.
+void MDCache::map_dirfrag_set(const list<dirfrag_t>& dfs, set<CDir*>& result)
+{
+  dout(10) << "map_dirfrag_set " << dfs << dendl;
+
+  // group by inode
+  map<inodeno_t, fragset_t> ino_fragset;
+  for (auto it = dfs.begin(); it != dfs.end(); ++it) {
+    ino_fragset[it->ino].insert(it->frag);
+  }
+
+  // get frags
+  for (auto &entry : ino_fragset) {
+    CInode *in = get_inode(entry.first);
+    if (!in) {
+      continue;
+    }
+    fragset_t &wanted = entry.second;
+
+    // Expand every requested frag into the leaves currently beneath it.
+    frag_vec_t fgs;
+    for (const auto &want : wanted) {
+      in->dirfragtree.get_leaves_under(want, fgs);
+    }
+
+    dout(15) << "map_dirfrag_set " << wanted << " -> " << fgs
+             << " on " << *in << dendl;
+
+    for (const auto &leaf : fgs) {
+      if (CDir *dir = in->get_dirfrag(leaf)) {
+        result.insert(dir);
+      }
+    }
+  }
+}
+
+
+
+// Walk up from `dir` through parent dirfrags and return the first one that
+// is a subtree root, i.e. the dirfrag that delegates (or is about to
+// delegate) auth. Returns null if the top is reached without finding one.
+CDir *MDCache::get_subtree_root(CDir *dir)
+{
+  CDir *cur = dir;
+  do {
+    if (cur->is_subtree_root()) {
+      return cur;
+    }
+    cur = cur->get_inode()->get_parent_dir();
+  } while (cur);
+
+  return nullptr; // none
+}
+
+// Same walk as get_subtree_root(), but following each inode's *projected*
+// parent dirfrag. Returns null if no subtree root is found.
+CDir *MDCache::get_projected_subtree_root(CDir *dir)
+{
+  CDir *cur = dir;
+  do {
+    if (cur->is_subtree_root()) {
+      return cur;
+    }
+    cur = cur->get_inode()->get_projected_parent_dir();
+  } while (cur);
+
+  return nullptr; // none
+}
+
+// Remove `dir` from the subtree map. It must be a subtree root with no
+// bounds of its own. Its subtree pin is dropped and, when it has a parent
+// dirfrag, it is also removed from the enclosing subtree's bounds.
+void MDCache::remove_subtree(CDir *dir)
+{
+  dout(10) << "remove_subtree " << *dir << dendl;
+
+  ceph_assert(subtrees.count(dir));
+  ceph_assert(subtrees[dir].empty());
+
+  subtrees.erase(dir);
+  dir->put(CDir::PIN_SUBTREE);
+
+  CDir *parent_dir = dir->get_parent_dir();
+  if (!parent_dir) {
+    return;
+  }
+
+  CDir *p = get_subtree_root(parent_dir);
+  ceph_assert(subtrees[p].count(dir));
+  subtrees[p].erase(dir);
+}
+
+// Copy the bounds of the subtree rooted at `dir` into `bounds`, replacing
+// its previous contents. `dir` must be a subtree root.
+void MDCache::get_subtree_bounds(CDir *dir, set<CDir*>& bounds)
+{
+  ceph_assert(subtrees.count(dir));
+
+  const auto entry = subtrees.find(dir);
+  bounds = entry->second;
+}
+
+/*
+ * Compute the bounds `dir` would have if it were a subtree root.
+ *
+ * If `dir` already is a subtree root its bounds are copied into `bounds`.
+ * Otherwise every bound of the enclosing subtree that lies beneath `dir` is
+ * added to `bounds`.
+ */
+void MDCache::get_wouldbe_subtree_bounds(CDir *dir, set<CDir*>& bounds)
+{
+  if (subtrees.count(dir)) {
+    // just copy them, dir is a subtree.
+    get_subtree_bounds(dir, bounds);
+    return;
+  }
+
+  // find them
+  // NOTE: this local `root` (the enclosing subtree's root dirfrag) shadows
+  // the cache-wide `root` inode pointer.
+  CDir *root = get_subtree_root(dir);
+  for (CDir *bound : subtrees[root]) {
+    // Climb from the bound towards the subtree root, looking for `dir`.
+    CDir *t = bound;
+    while (t != root) {
+      t = t->get_parent_dir();
+      ceph_assert(t);
+      if (t == dir) {
+        bounds.insert(bound);
+        // This bound is settled; stop climbing. (The original `continue`
+        // here was a no-op and kept walking all the way up to the root.)
+        break;
+      }
+    }
+  }
+}
+
+// Debugging aid: assert that the recorded bounds of the subtree rooted at
+// `dir` equal `bounds`. On mismatch the differing entries are logged at
+// level 0 before the assertion fires.
+void MDCache::verify_subtree_bounds(CDir *dir, const set<CDir*>& bounds)
+{
+  // for debugging only.
+  ceph_assert(subtrees.count(dir));
+
+  const bool matches = (bounds == subtrees[dir]);
+  if (!matches) {
+    dout(0) << "verify_subtree_bounds failed" << dendl;
+
+    // Starts as a copy of `bounds`; whatever survives the loop below is
+    // not recorded for this subtree.
+    set<CDir*> b = bounds;
+    for (auto it = subtrees[dir].begin(); it != subtrees[dir].end(); ++it) {
+      CDir *cd = *it;
+      if (bounds.count(cd) == 0) {
+        dout(0) << " missing bound " << *cd << dendl;
+      } else {
+        b.erase(cd);
+      }
+    }
+    for (CDir *cd : b) {
+      dout(0) << " extra bound " << *cd << dendl;
+    }
+  }
+
+  ceph_assert(bounds == subtrees[dir]);
+}
+
+// Debugging aid: assert that every dirfrag in `bounds` that is in cache is
+// recorded as a bound of the subtree rooted at `dir`. Dirfrags not in cache
+// are ignored.
+void MDCache::verify_subtree_bounds(CDir *dir, const list<dirfrag_t>& bounds)
+{
+  // for debugging only.
+  ceph_assert(subtrees.count(dir));
+
+  // make sure that any bounds i do have are properly noted as such.
+  int failed = 0;
+  for (auto it = bounds.begin(); it != bounds.end(); ++it) {
+    CDir *bd = get_dirfrag(*it);
+    if (bd && subtrees[dir].count(bd) == 0) {
+      dout(0) << "verify_subtree_bounds failed: extra bound " << *bd << dendl;
+      ++failed;
+    }
+  }
+
+  ceph_assert(failed == 0);
+}
+
+// Record a pending rename of directory inode `diri` from `olddir` to
+// `newdir`. adjust_subtree_after_rename() consumes the entry when called
+// with pop=true.
+void MDCache::project_subtree_rename(CInode *diri, CDir *olddir, CDir *newdir)
+{
+  dout(10) << "project_subtree_rename " << *diri << " from " << *olddir
+           << " to " << *newdir << dendl;
+
+  auto &pending = projected_subtree_renames[diri];
+  pending.emplace_back(olddir, newdir);
+}
+
// Fix up the subtree map after directory inode `diri` moved from `olddir` to
// its current parent dirfrag.
//   pop - when true, the matching entry recorded by project_subtree_rename()
//         is checked and removed first.
// Per the inline notes below, re-evaluating `diri` is left to the caller.
+void MDCache::adjust_subtree_after_rename(CInode *diri, CDir *olddir, bool pop)
+{
+ dout(10) << "adjust_subtree_after_rename " << *diri << " from " << *olddir << dendl;
+
+ CDir *newdir = diri->get_parent_dir();
+
+ if (pop) {
// The oldest projected rename for this inode must be exactly this one.
+ map<CInode*,list<pair<CDir*,CDir*> > >::iterator p = projected_subtree_renames.find(diri);
+ ceph_assert(p != projected_subtree_renames.end());
+ ceph_assert(!p->second.empty());
+ ceph_assert(p->second.front().first == olddir);
+ ceph_assert(p->second.front().second == newdir);
+ p->second.pop_front();
+ if (p->second.empty())
+ projected_subtree_renames.erase(p);
+ }
+
+ vector<CDir*> dfls;
+
+ // adjust total auth pin of freezing subtree
+ if (olddir != newdir) {
+ diri->get_nested_dirfrags(dfls);
+ for (auto dir : dfls)
+ olddir->adjust_freeze_after_rename(dir);
+ dfls.clear();
+ }
+
+ // adjust subtree
+ // make sure subtree dirfrags are at the front of the list
+ diri->get_subtree_dirfrags(dfls);
+ diri->get_nested_dirfrags(dfls);
+ for (auto dir : dfls) {
+ dout(10) << "dirfrag " << *dir << dendl;
// Looked up afresh on every iteration: the subtree map may be modified by
// the previous iteration.
+ CDir *oldparent = get_subtree_root(olddir);
+ dout(10) << " old parent " << *oldparent << dendl;
+ CDir *newparent = get_subtree_root(newdir);
+ dout(10) << " new parent " << *newparent << dendl;
+
// Popularity is adjusted for olddir (flag false) before the subtree fix-up
// and for newdir (flag true) after it.
+ if (olddir != newdir)
+ mds->balancer->adjust_pop_for_rename(olddir, dir, false);
+
+ if (oldparent == newparent) {
+ dout(10) << "parent unchanged for " << *dir << " at " << *oldparent << dendl;
+ } else if (dir->is_subtree_root()) {
+ // children are fine. change parent.
+ dout(10) << "moving " << *dir << " from " << *oldparent << " to " << *newparent << dendl;
+ ceph_assert(subtrees[oldparent].count(dir));
+ subtrees[oldparent].erase(dir);
+ ceph_assert(subtrees.count(newparent));
+ subtrees[newparent].insert(dir);
+ // caller is responsible for 'eval diri'
+ try_subtree_merge_at(dir, NULL, false);
+ } else {
+ // mid-subtree.
+
+ // see if any old bounds move to the new parent.
// Collected first and moved afterwards so subtrees[oldparent] is not
// modified while it is being iterated.
+ list<CDir*> tomove;
+ for (set<CDir*>::iterator p = subtrees[oldparent].begin();
+ p != subtrees[oldparent].end();
+ ++p) {
+ CDir *bound = *p;
+ CDir *broot = get_subtree_root(bound->get_parent_dir());
+ if (broot != oldparent) {
+ ceph_assert(broot == newparent);
+ tomove.push_back(bound);
+ }
+ }
+ for (list<CDir*>::iterator p = tomove.begin(); p != tomove.end(); ++p) {
+ CDir *bound = *p;
+ dout(10) << "moving bound " << *bound << " from " << *oldparent << " to " << *newparent << dendl;
+ subtrees[oldparent].erase(bound);
+ subtrees[newparent].insert(bound);
+ }
+
+ // did auth change?
// If so, `dir` becomes a subtree of its own that keeps the old parent's
// authority, and a merge is attempted straight away.
+ if (oldparent->authority() != newparent->authority()) {
+ adjust_subtree_auth(dir, oldparent->authority(), false);
+ // caller is responsible for 'eval diri'
+ try_subtree_merge_at(dir, NULL, false);
+ }
+ }
+
+ if (olddir != newdir)
+ mds->balancer->adjust_pop_for_rename(newdir, dir, true);
+ }
+
+ show_subtrees();
+}
+
+// ===================================
+// journal and snap/cow helpers
+
+
+/*
+ * Return the first cached snapped version of 'in' whose key in
+ * snap_inode_map sorts after (ino, follows).  If there is no such entry for
+ * this ino, return 'in' itself.  'in' must be a head inode
+ * (last == CEPH_NOSNAP).
+ */
+CInode *MDCache::pick_inode_snap(CInode *in, snapid_t follows)
+{
+  dout(10) << "pick_inode_snap follows " << follows << " on " << *in << dendl;
+  ceph_assert(in->last == CEPH_NOSNAP);
+
+  const vinodeno_t key(in->ino(), follows);
+  auto hit = snap_inode_map.upper_bound(key);
+  if (hit == snap_inode_map.end() || hit->second->ino() != in->ino())
+    return in;
+
+  CInode *snapped = hit->second;
+  dout(10) << "pick_inode_snap found " << *snapped << dendl;
+  return snapped;
+}
+
+
+/*
+ * note: i'm currently cheating wrt dirty and inode.version on cow
+ * items.  instead of doing a full dir predirty, i just take the
+ * original item's version, and set the dirty flag (via
+ * mutation::add_cow_{inode,dentry}() and mutation::apply().  that
+ * means a special case in the dir commit clean sweep assertions.
+ * bah.
+ *
+ * Clone 'in' into a new CInode covering [in->first, last], built from the
+ * previous projected inode and xattrs, then advance in->first to last+1.
+ * The clone is added to the cache and returned.
+ *
+ * If 'in' is itself a snapped inode (in->last != CEPH_NOSNAP), the pending
+ * snapflush state is split between 'in' and the clone via the head inode.
+ * Otherwise clients with writeable caps that have not yet followed 'last'
+ * are recorded on the clone so that their snapflushes are waited for.
+ */
+CInode *MDCache::cow_inode(CInode *in, snapid_t last)
+{
+  ceph_assert(last >= in->first);
+
+  // move every inode lock of a snapped inode into LOCK_SNAP_SYNC (pinning
+  // the inode for each lock that changes state) and take a wrlock on it
+  auto begin_snap_gather = [](CInode *snapin) {
+    for (int i = 0; i < num_cinode_locks; i++) {
+      SimpleLock *lock = snapin->get_lock(cinode_lock_info[i].lock);
+      ceph_assert(lock);
+      if (lock->get_state() != LOCK_SNAP_SYNC) {
+        ceph_assert(lock->is_stable());
+        lock->set_state(LOCK_SNAP_SYNC);  // gathering
+        snapin->auth_pin(lock);
+      }
+      lock->get_wrlock(true);
+    }
+  };
+
+  CInode *oldin = new CInode(this, true, in->first, last);
+  oldin->inode = *in->get_previous_projected_inode();
+  oldin->xattrs = *in->get_previous_projected_xattrs();
+  oldin->symlink = in->symlink;
+  oldin->inode.trim_client_ranges(last);
+
+  if (in->first < in->oldest_snap)
+    in->oldest_snap = in->first;
+
+  in->first = last+1;
+
+  dout(10) << "cow_inode " << *in << " to " << *oldin << dendl;
+  add_inode(oldin);
+
+  if (in->last != CEPH_NOSNAP) {
+    // 'in' is a snapped inode: split its snapflush state with the clone
+    CInode *head_in = get_inode(in->ino());
+    ceph_assert(head_in);
+    auto split = head_in->split_need_snapflush(oldin, in);
+
+    if (split.first) {
+      // the clone takes a copy of the snap caps
+      oldin->client_snap_caps = in->client_snap_caps;
+      if (!oldin->client_snap_caps.empty())
+        begin_snap_gather(oldin);
+    }
+
+    if (!split.second) {
+      // 'in' gives up its snap caps and drops off the open-file/caps lists
+      auto dropped_caps = std::move(in->client_snap_caps);
+      in->client_snap_caps.clear();
+      in->item_open_file.remove_myself();
+      in->item_caps.remove_myself();
+
+      if (!dropped_caps.empty()) {
+        MDSContext::vec finished;
+        for (int i = 0; i < num_cinode_locks; i++) {
+          SimpleLock *lock = in->get_lock(cinode_lock_info[i].lock);
+          ceph_assert(lock);
+          ceph_assert(lock->get_state() == LOCK_SNAP_SYNC); // gathering
+          lock->put_wrlock();
+          if (!lock->get_num_wrlocks()) {
+            lock->set_state(LOCK_SYNC);
+            lock->take_waiting(SimpleLock::WAIT_STABLE|SimpleLock::WAIT_RD, finished);
+            in->auth_unpin(lock);
+          }
+        }
+        mds->queue_waiters(finished);
+      }
+    }
+    return oldin;
+  }
+
+  if (in->client_caps.empty())
+    return oldin;
+
+  const set<snapid_t>& snaps = in->find_snaprealm()->get_snaps();
+  // clone caps?
+  for (auto &p : in->client_caps) {
+    client_t client = p.first;
+    Capability *cap = &p.second;
+    int issued = cap->need_snapflush() ? CEPH_CAP_ANY_WR : cap->issued();
+    if (!(issued & CEPH_CAP_ANY_WR) || !(cap->client_follows < last)) {
+      dout(10) << " ignoring client." << client << " cap follows " << cap->client_follows << dendl;
+      continue;
+    }
+
+    dout(10) << " client." << client << " cap " << ccap_string(issued) << dendl;
+    oldin->client_snap_caps.insert(client);
+    cap->client_follows = last;
+
+    // we need snapflushes for any intervening snaps
+    dout(10) << "  snaps " << snaps << dendl;
+    for (auto q = snaps.lower_bound(oldin->first);
+         q != snaps.end() && *q <= last;
+         ++q) {
+      in->add_need_snapflush(oldin, *q, client);
+    }
+  }
+
+  if (!oldin->client_snap_caps.empty())
+    begin_snap_gather(oldin);
+
+  return oldin;
+}
+
+/*
+ * Copy-on-write a dentry (and, for a primary link, its inode) so that the
+ * state up to snapid 'follows' is preserved, and journal the preserved copy.
+ *
+ * mut        - mutation that records the cow'd dentry/inode
+ * metablob   - journal blob the old dentry is added to
+ * dn         - dentry to cow; a null pointer is logged and ignored
+ * follows    - last snapid the old copy covers; CEPH_NOSNAP means "use the
+ *              newest seq of the global snaprealm"
+ * pcow_inode - if non-null, receives the inode cloned by cow_inode()
+ *              (only set on the non-multiversion primary path)
+ * dnl        - linkage to use; defaults to dn's projected linkage
+ *
+ * Two paths: a multiversion inode (or a frozen ambiguous-auth inode, see
+ * cow_head) keeps its old version inside the inode via cow_old_inode();
+ * everything else gets a separate old dentry added to the dir.
+ */
+void MDCache::journal_cow_dentry(MutationImpl *mut, EMetaBlob *metablob,
+ CDentry *dn, snapid_t follows,
+ CInode **pcow_inode, CDentry::linkage_t *dnl)
+{
+ if (!dn) {
+ dout(10) << "journal_cow_dentry got null CDentry, returning" << dendl;
+ return;
+ }
+ dout(10) << "journal_cow_dentry follows " << follows << " on " << *dn << dendl;
+ ceph_assert(dn->is_auth());
+
+ // nothing to cow on a null dentry, fix caller
+ if (!dnl)
+ dnl = dn->get_projected_linkage();
+ ceph_assert(!dnl->is_null());
+
+ // 'in' is only set for a primary link; remote links leave it NULL
+ CInode *in = dnl->is_primary() ? dnl->get_inode() : NULL;
+ bool cow_head = false;
+ if (in && in->state_test(CInode::STATE_AMBIGUOUSAUTH)) {
+ ceph_assert(in->is_frozen_inode());
+ cow_head = true;
+ }
+ if (in && (in->is_multiversion() || cow_head)) {
+ // multiversion inode.
+ SnapRealm *realm = NULL;
+
+ if (in->get_projected_parent_dn() != dn) {
+ // dn is not the inode's projected parent dentry: cow the dentry as a
+ // remote dentry, using the dir's snaprealm
+ ceph_assert(follows == CEPH_NOSNAP);
+ realm = dn->dir->inode->find_snaprealm();
+ snapid_t dir_follows = get_global_snaprealm()->get_newest_seq();
+ ceph_assert(dir_follows >= realm->get_newest_seq());
+
+ if (dir_follows+1 > dn->first) {
+ snapid_t oldfirst = dn->first;
+ dn->first = dir_follows+1;
+ if (realm->has_snaps_in_range(oldfirst, dir_follows)) {
+ CDentry *olddn = dn->dir->add_remote_dentry(dn->get_name(), in->ino(), in->d_type(),
+ oldfirst, dir_follows);
+ olddn->pre_dirty();
+ dout(10) << " olddn " << *olddn << dendl;
+ metablob->add_remote_dentry(olddn, true);
+ mut->add_cow_dentry(olddn);
+ // FIXME: adjust link count here? hmm.
+
+ if (dir_follows+1 > in->first)
+ in->cow_old_inode(dir_follows, cow_head);
+ }
+ }
+
+ follows = dir_follows;
+ if (in->snaprealm) {
+ realm = in->snaprealm;
+ ceph_assert(follows >= realm->get_newest_seq());
+ }
+ } else {
+ realm = in->find_snaprealm();
+ if (follows == CEPH_NOSNAP) {
+ follows = get_global_snaprealm()->get_newest_seq();
+ ceph_assert(follows >= realm->get_newest_seq());
+ }
+ }
+
+ // already cloned?
+ if (follows < in->first) {
+ dout(10) << "journal_cow_dentry follows " << follows << " < first on " << *in << dendl;
+ return;
+ }
+
+ // no snapshot in [in->first, follows]: nothing to preserve, just advance first
+ if (!realm->has_snaps_in_range(in->first, follows)) {
+ dout(10) << "journal_cow_dentry no snapshot follows " << follows << " on " << *in << dendl;
+ in->first = follows + 1;
+ return;
+ }
+
+ in->cow_old_inode(follows, cow_head);
+
+ } else {
+ // not multiversion (or a remote link): preserve the old state in a
+ // separate dentry covering [oldfirst, follows]
+ SnapRealm *realm = dn->dir->inode->find_snaprealm();
+ if (follows == CEPH_NOSNAP) {
+ follows = get_global_snaprealm()->get_newest_seq();
+ ceph_assert(follows >= realm->get_newest_seq());
+ }
+
+ // already cloned?
+ if (follows < dn->first) {
+ dout(10) << "journal_cow_dentry follows " << follows << " < first on " << *dn << dendl;
+ return;
+ }
+
+ // update dn.first before adding old dentry to cdir's map
+ snapid_t oldfirst = dn->first;
+ dn->first = follows+1;
+
+ if (!realm->has_snaps_in_range(oldfirst, follows)) {
+ dout(10) << "journal_cow_dentry no snapshot follows " << follows << " on " << *dn << dendl;
+ if (in)
+ in->first = follows+1;
+ return;
+ }
+
+ dout(10) << " dn " << *dn << dendl;
+ if (in) {
+ // primary link: clone the inode and link the clone under the old dentry
+ CInode *oldin = cow_inode(in, follows);
+ mut->add_cow_inode(oldin);
+ if (pcow_inode)
+ *pcow_inode = oldin;
+ CDentry *olddn = dn->dir->add_primary_dentry(dn->get_name(), oldin, oldfirst, follows);
+ oldin->inode.version = olddn->pre_dirty();
+ dout(10) << " olddn " << *olddn << dendl;
+ // the clone still waits for client snapflushes if it carries snap caps
+ bool need_snapflush = !oldin->client_snap_caps.empty();
+ if (need_snapflush) {
+ mut->ls->open_files.push_back(&oldin->item_open_file);
+ mds->locker->mark_need_snapflush_inode(oldin);
+ }
+ metablob->add_primary_dentry(olddn, 0, true, false, false, need_snapflush);
+ mut->add_cow_dentry(olddn);
+ } else {
+ ceph_assert(dnl->is_remote());
+ CDentry *olddn = dn->dir->add_remote_dentry(dn->get_name(), dnl->get_remote_ino(), dnl->get_remote_d_type(),
+ oldfirst, follows);
+ olddn->pre_dirty();
+ dout(10) << " olddn " << *olddn << dendl;
+ metablob->add_remote_dentry(olddn, true);
+ mut->add_cow_dentry(olddn);
+ }
+ }
+}
+
+
+/*
+ * Convenience wrapper: cow the projected parent dentry of 'in' by
+ * forwarding to journal_cow_dentry() (the linkage argument is left at its
+ * default).
+ */
+void MDCache::journal_cow_inode(MutationRef& mut, EMetaBlob *metablob,
+                                CInode *in, snapid_t follows,
+                                CInode **pcow_inode)
+{
+  dout(10) << "journal_cow_inode follows " << follows << " on " << *in << dendl;
+  CDentry *parent_dn = in->get_projected_parent_dn();
+  MutationImpl *raw_mut = mut.get();
+  journal_cow_dentry(raw_mut, metablob, parent_dn, follows, pcow_inode);
+}
+
+/*
+ * Add the dirty inode 'in' to 'metablob'.
+ *
+ * A base inode is added as a root.  Any other inode has its projected
+ * parent dentry cow'd first (unless that dentry's projected linkage is
+ * null) and is then added as a primary dentry.  If the projected inode
+ * reports an updated backtrace, the dentry is added with the extra
+ * backtrace flag and a flag telling whether the layout pool id changed.
+ *
+ * For a snapped inode with follows == CEPH_NOSNAP, in->first - 1 is used
+ * as 'follows'.
+ */
+void MDCache::journal_dirty_inode(MutationImpl *mut, EMetaBlob *metablob, CInode *in, snapid_t follows)
+{
+  if (in->is_base()) {
+    metablob->add_root(true, in);
+    return;
+  }
+
+  if (follows == CEPH_NOSNAP && in->last != CEPH_NOSNAP)
+    follows = in->first - 1;
+
+  CDentry *parent_dn = in->get_projected_parent_dn();
+  if (!parent_dn->get_projected_linkage()->is_null())  // no need to cow a null dentry
+    journal_cow_dentry(mut, metablob, parent_dn, follows);
+
+  if (!in->get_projected_inode()->is_backtrace_updated()) {
+    metablob->add_primary_dentry(parent_dn, in, true);
+    return;
+  }
+
+  bool dirty_pool = in->get_projected_inode()->layout.pool_id !=
+                    in->get_previous_projected_inode()->layout.pool_id;
+  metablob->add_primary_dentry(parent_dn, in, true, true, dirty_pool);
+}
+
+
+
+// nested ---------------------------------------------------------------
+
+/*
+ * Push the recursive stats (rstat) of inode 'cur' into the dirfrag 'parent'.
+ *
+ * cur        - inode whose projected rstat is propagated
+ * parent     - dirfrag receiving the delta
+ * first      - lower snapid bound of the update; raised to cur->first if
+ *              that is larger
+ * linkunlink - 0 for a plain update, >0 / <0 when the inode is being
+ *              linked / unlinked (interpreted by
+ *              _project_rstat_inode_to_frag)
+ * prealm     - snaprealm used to look up snaps; defaults to the realm of
+ *              parent's inode
+ *
+ * The head range [max(first, floor), cur->last] is projected first, where
+ * floor is the 'first' of cur's projected parent dentry.  With
+ * mds_snap_rstat enabled, each old inode listed in cur->dirty_old_rstats is
+ * projected as well.  dirty_old_rstats is always cleared at the end.
+ */
+void MDCache::project_rstat_inode_to_frag(CInode *cur, CDir *parent, snapid_t first,
+                                          int linkunlink, SnapRealm *prealm)
+{
+  CDentry *parentdn = cur->get_projected_parent_dn();
+  CInode::mempool_inode *curi = cur->get_projected_inode();
+
+  if (cur->first > first)
+    first = cur->first;
+
+  dout(10) << "projected_rstat_inode_to_frag first " << first << " linkunlink " << linkunlink
+           << " " << *cur << dendl;
+  dout(20) << "    frag head is [" << parent->first << ",head] " << dendl;
+  dout(20) << " inode update is [" << first << "," << cur->last << "]" << dendl;
+
+  /*
+   * FIXME.  this incompletely propagates rstats to _old_ parents
+   * (i.e. shortly after a directory rename).  but we need full
+   * blown hard link backpointers to make this work properly...
+   */
+  snapid_t floor = parentdn->first;
+  dout(20) << " floor of " << floor << " from parent dn " << *parentdn << dendl;
+
+  if (!prealm)
+    prealm = parent->inode->find_snaprealm();
+  // bind by reference instead of copying the whole snap set on every call;
+  // nothing below modifies the snaprealm
+  const set<snapid_t>& snaps = prealm->get_snaps();
+
+  if (cur->last != CEPH_NOSNAP) {
+    // snapped inode: nothing to do unless a snap falls inside its range
+    ceph_assert(cur->dirty_old_rstats.empty());
+    set<snapid_t>::const_iterator q = snaps.lower_bound(std::max(first, floor));
+    if (q == snaps.end() || *q > cur->last)
+      return;
+  }
+
+  if (cur->last >= floor) {
+    bool update = true;
+    if (cur->state_test(CInode::STATE_AMBIGUOUSAUTH) && cur->is_auth()) {
+      // rename src inode is not projected in the slave rename prep case. so we should
+      // avoid updating the inode.
+      ceph_assert(linkunlink < 0);
+      ceph_assert(cur->is_frozen_inode());
+      update = false;
+    }
+    _project_rstat_inode_to_frag(*curi, std::max(first, floor), cur->last, parent,
+                                 linkunlink, update);
+  }
+
+  if (g_conf()->mds_snap_rstat) {
+    for (const auto &p : cur->dirty_old_rstats) {
+      auto &old = cur->old_inodes[p];
+      snapid_t ofirst = std::max(old.first, floor);
+      // skip old inodes whose range [ofirst, p] contains no snap
+      auto it = snaps.lower_bound(ofirst);
+      if (it == snaps.end() || *it > p)
+        continue;
+      if (p >= floor)
+        _project_rstat_inode_to_frag(old.inode, ofirst, p, parent, 0, false);
+    }
+  }
+  cur->dirty_old_rstats.clear();
+}
+
+
+/*
+ * Apply an inode's rstat delta to the dirfrag 'parent' over the snapid
+ * interval [ofirst, last].
+ *
+ * The delta depends on linkunlink:
+ *    0 : rstat - accounted_rstat  (plain update)
+ *   <0 : -accounted_rstat         (unlink)
+ *   >0 : +rstat                   (link)
+ * If update_inode is set, inode.accounted_rstat is brought up to
+ * inode.rstat.
+ *
+ * The interval is walked from 'last' downwards, one segment per loop
+ * iteration.  A segment ending at CEPH_NOSNAP goes to the projected fnode's
+ * rstat; any other segment goes to an entry of parent->dirty_old_rstat,
+ * which is split as needed.  With mds_snap_rstat disabled, non-head
+ * segments are dropped.
+ */
+void MDCache::_project_rstat_inode_to_frag(CInode::mempool_inode& inode, snapid_t ofirst, snapid_t last,
+ CDir *parent, int linkunlink, bool update_inode)
+{
+ dout(10) << "_project_rstat_inode_to_frag [" << ofirst << "," << last << "]" << dendl;
+ dout(20) << " inode rstat " << inode.rstat << dendl;
+ dout(20) << " inode accounted_rstat " << inode.accounted_rstat << dendl;
+ nest_info_t delta;
+ if (linkunlink == 0) {
+ delta.add(inode.rstat);
+ delta.sub(inode.accounted_rstat);
+ } else if (linkunlink < 0) {
+ delta.sub(inode.accounted_rstat);
+ } else {
+ delta.add(inode.rstat);
+ }
+ dout(20) << " delta " << delta << dendl;
+
+ if (update_inode)
+ inode.accounted_rstat = inode.rstat;
+
+ while (last >= ofirst) {
+ /*
+ * pick fnode version to update. at each iteration, we want to
+ * pick a segment ending in 'last' to update. split as necessary
+ * to make that work. then, adjust first up so that we only
+ * update one segment at a time. then loop to cover the whole
+ * [ofirst,last] interval.
+ */
+ nest_info_t *prstat;
+ snapid_t first;
+ fnode_t *pf = parent->get_projected_fnode();
+ if (last == CEPH_NOSNAP) {
+ // head segment: update the projected fnode itself
+ if (g_conf()->mds_snap_rstat)
+ first = std::max(ofirst, parent->first);
+ else
+ first = parent->first;
+ prstat = &pf->rstat;
+ dout(20) << " projecting to head [" << first << "," << last << "] " << *prstat << dendl;
+
+ if (first > parent->first &&
+ !(pf->rstat == pf->accounted_rstat)) {
+ dout(10) << " target snapped and not fully accounted, cow to dirty_old_rstat ["
+ << parent->first << "," << (first-1) << "] "
+ << " " << *prstat << "/" << pf->accounted_rstat
+ << dendl;
+ parent->dirty_old_rstat[first-1].first = parent->first;
+ parent->dirty_old_rstat[first-1].rstat = pf->rstat;
+ parent->dirty_old_rstat[first-1].accounted_rstat = pf->accounted_rstat;
+ }
+ parent->first = first;
+ } else if (!g_conf()->mds_snap_rstat) {
+ // drop snapshots' rstats
+ break;
+ } else if (last >= parent->first) {
+ // segment overlaps the head range: split [parent->first, last] off the
+ // head into a new dirty_old_rstat entry seeded from the projected fnode
+ first = parent->first;
+ parent->dirty_old_rstat[last].first = first;
+ parent->dirty_old_rstat[last].rstat = pf->rstat;
+ parent->dirty_old_rstat[last].accounted_rstat = pf->accounted_rstat;
+ prstat = &parent->dirty_old_rstat[last].rstat;
+ dout(10) << " projecting to newly split dirty_old_fnode [" << first << "," << last << "] "
+ << " " << *prstat << "/" << pf->accounted_rstat << dendl;
+ } else {
+ // be careful, dirty_old_rstat is a _sparse_ map.
+ // sorry, this is ugly.
+ first = ofirst;
+
+ // find any intersection with last
+ auto it = parent->dirty_old_rstat.lower_bound(last);
+ if (it == parent->dirty_old_rstat.end()) {
+ dout(20) << " no dirty_old_rstat with last >= last " << last << dendl;
+ if (!parent->dirty_old_rstat.empty() && parent->dirty_old_rstat.rbegin()->first >= first) {
+ dout(20) << " last dirty_old_rstat ends at " << parent->dirty_old_rstat.rbegin()->first << dendl;
+ first = parent->dirty_old_rstat.rbegin()->first+1;
+ }
+ } else {
+ // *it last is >= last
+ if (it->second.first <= last) {
+ // *it intersects [first,last]
+ if (it->second.first < first) {
+ dout(10) << " splitting off left bit [" << it->second.first << "," << first-1 << "]" << dendl;
+ parent->dirty_old_rstat[first-1] = it->second;
+ it->second.first = first;
+ }
+ if (it->second.first > first)
+ first = it->second.first;
+ if (last < it->first) {
+ dout(10) << " splitting off right bit [" << last+1 << "," << it->first << "]" << dendl;
+ parent->dirty_old_rstat[last] = it->second;
+ it->second.first = last+1;
+ }
+ } else {
+ // *it is to the _right_ of [first,last]
+ it = parent->dirty_old_rstat.lower_bound(first);
+ // new *it last is >= first
+ if (it->second.first <= last && // new *it isn't also to the right, and
+ it->first >= first) { // it intersects our first bit,
+ dout(10) << " staying to the right of [" << it->second.first << "," << it->first << "]..." << dendl;
+ first = it->first+1;
+ }
+ dout(10) << " projecting to new dirty_old_rstat [" << first << "," << last << "]" << dendl;
+ }
+ }
+ dout(20) << " projecting to dirty_old_rstat [" << first << "," << last << "]" << dendl;
+ parent->dirty_old_rstat[last].first = first;
+ prstat = &parent->dirty_old_rstat[last].rstat;
+ }
+
+ // apply
+ dout(20) << " project to [" << first << "," << last << "] " << *prstat << dendl;
+ ceph_assert(last >= first);
+ prstat->add(delta);
+ // NOTE(review): repeats the assignment already made before the loop
+ if (update_inode)
+ inode.accounted_rstat = inode.rstat;
+ dout(20) << " result [" << first << "," << last << "] " << *prstat << " " << *parent << dendl;
+
+ // continue with whatever part of [ofirst,last] lies below this segment
+ last = first-1;
+ }
+}
+
+/*
+ * Apply a dirfrag's rstat delta (rstat - accounted_rstat) to inode 'pin'
+ * over the snapid interval [ofirst, last].
+ *
+ * The interval is walked from 'last' downwards, one segment per loop
+ * iteration.  A segment ending at pin->last goes to the projected inode
+ * (cow'ing an old inode first when the segment starts after pin->first).
+ * Any other segment goes to an entry of pin->old_inodes, which is created
+ * or split as needed and recorded in pin->dirty_old_rstats.
+ *
+ * 'rstat' and 'accounted_rstat' are only read here.  'cow_head' is passed
+ * through to CInode::cow_old_inode().
+ */
+void MDCache::project_rstat_frag_to_inode(nest_info_t& rstat, nest_info_t& accounted_rstat,
+ snapid_t ofirst, snapid_t last,
+ CInode *pin, bool cow_head)
+{
+ dout(10) << "project_rstat_frag_to_inode [" << ofirst << "," << last << "]" << dendl;
+ dout(20) << " frag rstat " << rstat << dendl;
+ dout(20) << " frag accounted_rstat " << accounted_rstat << dendl;
+ nest_info_t delta = rstat;
+ delta.sub(accounted_rstat);
+ dout(20) << " delta " << delta << dendl;
+
+ while (last >= ofirst) {
+ CInode::mempool_inode *pi;
+ snapid_t first;
+ if (last == pin->last) {
+ // segment ends at the inode's own 'last': update the projected inode
+ pi = pin->get_projected_inode();
+ first = std::max(ofirst, pin->first);
+ if (first > pin->first) {
+ auto &old = pin->cow_old_inode(first-1, cow_head);
+ dout(20) << " cloned old_inode rstat is " << old.inode.rstat << dendl;
+ }
+ } else {
+ if (last >= pin->first) {
+ // segment overlaps the live range: split [pin->first, last] off as an old inode
+ first = pin->first;
+ pin->cow_old_inode(last, cow_head);
+ } else {
+ // our life is easier here because old_inodes is not sparse
+ // (although it may not begin at snapid 1)
+ auto it = pin->old_inodes.lower_bound(last);
+ if (it == pin->old_inodes.end()) {
+ dout(10) << " no old_inode <= " << last << ", done." << dendl;
+ break;
+ }
+ first = it->second.first;
+ if (first > last) {
+ dout(10) << " oldest old_inode is [" << first << "," << it->first << "], done." << dendl;
+ //assert(p == pin->old_inodes.begin());
+ break;
+ }
+ if (it->first > last) {
+ dout(10) << " splitting right old_inode [" << first << "," << it->first << "] to ["
+ << (last+1) << "," << it->first << "]" << dendl;
+ pin->old_inodes[last] = it->second;
+ it->second.first = last+1;
+ pin->dirty_old_rstats.insert(it->first);
+ }
+ }
+ if (first < ofirst) {
+ dout(10) << " splitting left old_inode [" << first << "," << last << "] to ["
+ << first << "," << ofirst-1 << "]" << dendl;
+ pin->old_inodes[ofirst-1] = pin->old_inodes[last];
+ pin->dirty_old_rstats.insert(ofirst-1);
+ pin->old_inodes[last].first = first = ofirst;
+ }
+ pi = &pin->old_inodes[last].inode;
+ pin->dirty_old_rstats.insert(last);
+ }
+ dout(20) << " projecting to [" << first << "," << last << "] " << pi->rstat << dendl;
+ pi->rstat.add(delta);
+ dout(20) << " result [" << first << "," << last << "] " << pi->rstat << dendl;
+
+ // continue with whatever part of [ofirst,last] lies below this segment
+ last = first-1;
+ }
+}
+
+/*
+ * Send the projected quota and rstat of 'in' to clients holding caps on it,
+ * then send an MGatherCaps for the inode to every replica MDS.
+ *
+ * Nothing happens unless this MDS is active or stopping, 'in' is auth and
+ * not frozen, and either quota is enabled on the inode or 'quota_change' is
+ * set.  Caps flagged noquota are skipped.
+ *
+ * When exclude_ct is non-negative, every *other* client is notified
+ * unconditionally; the named client (and every client when exclude_ct is
+ * negative) is notified only if its last reported rsize/rbytes differ from
+ * the current values and the usage is close enough to a configured limit.
+ */
+void MDCache::broadcast_quota_to_client(CInode *in, client_t exclude_ct, bool quota_change)
+{
+  if (!mds->is_active() && !mds->is_stopping())
+    return;
+
+  if (!in->is_auth() || in->is_frozen())
+    return;
+
+  auto pi = in->get_projected_inode();
+
+  if (!pi->quota.is_enable() && !quota_change)
+    return;
+
+  // create snaprealm for quota inode (quota was set before mimic)
+  if (!in->get_projected_srnode())
+    mds->server->create_quota_realm(in);
+
+  // decide whether the client holding 'cap' should get a fresh quota message
+  auto needs_update = [&](Capability *cap, client_t who) -> bool {
+    if (exclude_ct >= 0 && exclude_ct != who)
+      return true;
+
+    // nothing changed since the last message to this client
+    if (cap->last_rbytes == pi->rstat.rbytes &&
+        cap->last_rsize == pi->rstat.rsize())
+      return false;
+
+    if (pi->quota.max_files > 0) {
+      if (pi->rstat.rsize() >= pi->quota.max_files)
+        return true;
+
+      if ((abs(cap->last_rsize - pi->quota.max_files) >> 4) <
+          abs(cap->last_rsize - pi->rstat.rsize()))
+        return true;
+    }
+
+    if (pi->quota.max_bytes > 0) {
+      if (pi->rstat.rbytes > pi->quota.max_bytes - (pi->quota.max_bytes >> 3))
+        return true;
+
+      if ((abs(cap->last_rbytes - pi->quota.max_bytes) >> 4) <
+          abs(cap->last_rbytes - pi->rstat.rbytes))
+        return true;
+    }
+
+    return false;
+  };
+
+  for (auto &p : in->client_caps) {
+    Capability *cap = &p.second;
+    if (cap->is_noquota())
+      continue;
+
+    if (!needs_update(cap, p.first))
+      continue;
+
+    // remember what this client was told
+    cap->last_rsize = pi->rstat.rsize();
+    cap->last_rbytes = pi->rstat.rbytes;
+
+    auto msg = MClientQuota::create();
+    msg->ino = in->ino();
+    msg->rstat = pi->rstat;
+    msg->quota = pi->quota;
+    mds->send_message_client_counted(msg, cap->get_session());
+  }
+
+  for (const auto &it : in->get_replicas()) {
+    auto msg = MGatherCaps::create();
+    msg->ino = in->ino();
+    mds->send_message_mds(msg, it.first);
+  }
+}
+
+/*
+ * NOTE: we _have_ to delay the scatter if we are called during a
+ * rejoin, because we can't twiddle locks between when the
+ * rejoin_(weak|strong) is received and when we send the rejoin_ack.
+ * normally, this isn't a problem: a recovering mds doesn't twiddle locks
+ * (no requests), and a survivor acks immediately. _except_ that
+ * during rejoin_(weak|strong) processing, we may complete a lock
+ * gather, and do a scatter_writebehind.. and we _can't_ twiddle the
+ * scatterlock state in that case or the lock states will get out of
+ * sync between the auth and replica.
+ *
+ * the simple solution is to never do the scatter here. instead, put
+ * the scatterlock on a list if it isn't already wrlockable. this is
+ * probably the best plan anyway, since we avoid too many
+ * scatters/locks under normal usage.
+ */
+/*
+ * some notes on dirlock/nestlock scatterlock semantics:
+ *
+ * the fragstat (dirlock) will never be updated without
+ * dirlock+nestlock wrlock held by the caller.
+ *
+ * the rstat (nestlock) _may_ get updated without a wrlock when nested
+ * data is pushed up the tree. this could be changed with some
+ * restructuring here, but in its current form we ensure that the
+ * fragstat+rstat _always_ reflect an accurate summation over the dir
+ * frag, which is nice. and, we only need to track frags that need to
+ * be nudged (and not inodes with pending rstat changes that need to
+ * be pushed into the frag). a consequence of this is that the
+ * accounted_rstat on scatterlock sync may not match our current
+ * rstat. this is normal and expected.
+ */
+/*
+ * Project stat changes from 'in' upward through its ancestor dirfrags and
+ * inodes, and add the touched objects to the journal blob.
+ *
+ * mut        - mutation collecting auth pins, projections and locks
+ * blob       - journal blob that receives the dir context and dirty inodes
+ * in         - inode the change starts from; nothing is done for a base inode
+ * parent     - dirfrag containing 'in'; if null (only valid together with
+ *              PREDIRTY_PRIMARY) the dir of in's projected parent dentry
+ * flags      - PREDIRTY_PRIMARY / PREDIRTY_DIR / PREDIRTY_SHALLOW bits
+ * linkunlink - change applied to the first parent's nfiles/nsubdirs count
+ * cfollows   - snapid the change follows; CEPH_NOSNAP means "newest seq of
+ *              the parent's snaprealm"
+ *
+ * With no flags and linkunlink == 0 only the dir context is added.
+ * Otherwise each loop iteration handles one (dirfrag, inode) level:
+ * inode -> dirfrag stats first, then dirfrag -> inode.  The walk stops at a
+ * base inode, or as soon as the parent inode cannot be updated now; in the
+ * latter case its scatterlocks are marked updated for later propagation.
+ */
+void MDCache::predirty_journal_parents(MutationRef mut, EMetaBlob *blob,
+                                       CInode *in, CDir *parent,
+                                       int flags, int linkunlink,
+                                       snapid_t cfollows)
+{
+  bool primary_dn = flags & PREDIRTY_PRIMARY;
+  bool do_parent_mtime = flags & PREDIRTY_DIR;
+  bool shallow = flags & PREDIRTY_SHALLOW;
+
+  ceph_assert(mds->mdlog->entry_is_open());
+
+  // make sure stamp is set
+  if (mut->get_mds_stamp() == utime_t())
+    mut->set_mds_stamp(ceph_clock_now());
+
+  if (in->is_base())
+    return;
+
+  dout(10) << "predirty_journal_parents"
+           << (do_parent_mtime ? " do_parent_mtime":"")
+           << " linkunlink=" <<  linkunlink
+           << (primary_dn ? " primary_dn":" remote_dn")
+           << (shallow ? " SHALLOW":"")
+           << " follows " << cfollows
+           << " " << *in << dendl;
+
+  if (!parent) {
+    ceph_assert(primary_dn);
+    parent = in->get_projected_parent_dn()->get_dir();
+  }
+
+  if (flags == 0 && linkunlink == 0) {
+    dout(10) << " no flags/linkunlink, just adding dir context to blob(s)" << dendl;
+    blob->add_dir_context(parent);
+    return;
+  }
+
+  // build list of inodes to wrlock, dirty, and update
+  list<CInode*> lsi;
+  CInode *cur = in;
+  CDentry *parentdn = NULL;
+  bool first = true;
+  while (parent) {
+    //assert(cur->is_auth() || !primary_dn);  // this breaks the rename auth twiddle hack
+    ceph_assert(parent->is_auth());
+
+    // opportunistically adjust parent dirfrag
+    CInode *pin = parent->get_inode();
+
+    // inode -> dirfrag
+    mut->auth_pin(parent);
+    mut->add_projected_fnode(parent);
+
+    fnode_t *pf = parent->project_fnode();
+    pf->version = parent->pre_dirty();
+
+    if (do_parent_mtime || linkunlink) {
+      ceph_assert(mut->is_wrlocked(&pin->filelock));
+      ceph_assert(mut->is_wrlocked(&pin->nestlock));
+      ceph_assert(cfollows == CEPH_NOSNAP);
+
+      // update stale fragstat/rstat?
+      parent->resync_accounted_fragstat();
+      parent->resync_accounted_rstat();
+
+      if (do_parent_mtime) {
+        pf->fragstat.mtime = mut->get_op_stamp();
+        pf->fragstat.change_attr++;
+        // log the dirfrag itself (not its address), like the other messages here
+        dout(10) << "predirty_journal_parents bumping change_attr to " << pf->fragstat.change_attr << " on " << *parent << dendl;
+        if (pf->fragstat.mtime > pf->rstat.rctime) {
+          dout(10) << "predirty_journal_parents updating mtime on " << *parent << dendl;
+          pf->rstat.rctime = pf->fragstat.mtime;
+        } else {
+          dout(10) << "predirty_journal_parents updating mtime UNDERWATER on " << *parent << dendl;
+        }
+      }
+      if (linkunlink) {
+        dout(10) << "predirty_journal_parents updating size on " << *parent << dendl;
+        if (in->is_dir()) {
+          pf->fragstat.nsubdirs += linkunlink;
+          //pf->rstat.rsubdirs += linkunlink;
+        } else {
+          pf->fragstat.nfiles += linkunlink;
+          //pf->rstat.rfiles += linkunlink;
+        }
+      }
+    }
+
+    // rstat
+    if (!primary_dn) {
+      // don't update parent this pass
+    } else if (!linkunlink && !(pin->nestlock.can_wrlock(-1) &&
+                                pin->versionlock.can_wrlock())) {
+      dout(20) << " unwritable parent nestlock " << pin->nestlock
+               << ", marking dirty rstat on " << *cur << dendl;
+      cur->mark_dirty_rstat();
+    } else {
+      // if we don't hold a wrlock reference on this nestlock, take one,
+      // because we are about to write into the dirfrag fnode and that needs
+      // to commit before the lock can cycle.
+      if (linkunlink) {
+        ceph_assert(pin->nestlock.get_num_wrlocks() || mut->is_slave());
+      }
+
+      if (!mut->is_wrlocked(&pin->nestlock)) {
+        dout(10) << " taking wrlock on " << pin->nestlock << " on " << *pin << dendl;
+        mds->locker->wrlock_force(&pin->nestlock, mut);
+      }
+
+      // now we can project the inode rstat diff the dirfrag
+      SnapRealm *prealm = pin->find_snaprealm();
+
+      snapid_t follows = cfollows;
+      if (follows == CEPH_NOSNAP)
+        follows = prealm->get_newest_seq();
+
+      // NB: this snapid shadows the outer 'first' flag inside this block
+      snapid_t first = follows+1;
+
+      // first, if the frag is stale, bring it back in sync.
+      parent->resync_accounted_rstat();
+
+      // now push inode rstats into frag
+      project_rstat_inode_to_frag(cur, parent, first, linkunlink, prealm);
+      cur->clear_dirty_rstat();
+    }
+
+    bool stop = false;
+    if (!pin->is_auth() || (!mut->is_auth_pinned(pin) && !pin->can_auth_pin())) {
+      dout(10) << "predirty_journal_parents !auth or ambig or can't authpin on " << *pin << dendl;
+      stop = true;
+    }
+
+    // delay propagating until later?
+    if (!stop && !first &&
+        g_conf()->mds_dirstat_min_interval > 0) {
+      double since_last_prop = mut->get_mds_stamp() - pin->last_dirstat_prop;
+      if (since_last_prop < g_conf()->mds_dirstat_min_interval) {
+        dout(10) << "predirty_journal_parents last prop " << since_last_prop
+                 << " < " << g_conf()->mds_dirstat_min_interval
+                 << ", stopping" << dendl;
+        stop = true;
+      } else {
+        dout(10) << "predirty_journal_parents last prop " << since_last_prop << " ago, continuing" << dendl;
+      }
+    }
+
+    // can cast only because i'm passing nowait=true in the sole user
+    MDRequestRef mdmut = static_cast<MDRequestImpl*>(mut.get());
+    if (!stop &&
+        !mut->is_wrlocked(&pin->nestlock) &&
+        (!pin->versionlock.can_wrlock() ||                   // make sure we can take versionlock, too
+         //true
+         !mds->locker->wrlock_start(&pin->nestlock, mdmut, true)
+         )) {  // ** do not initiate.. see above comment **
+      dout(10) << "predirty_journal_parents can't wrlock one of " << pin->versionlock << " or " << pin->nestlock
+               << " on " << *pin << dendl;
+      stop = true;
+    }
+    if (stop) {
+      // can't go further now: flag the scatterlocks so the rest is pushed later
+      dout(10) << "predirty_journal_parents stop.  marking nestlock on " << *pin << dendl;
+      mds->locker->mark_updated_scatterlock(&pin->nestlock);
+      mut->ls->dirty_dirfrag_nest.push_back(&pin->item_dirty_dirfrag_nest);
+      mut->add_updated_lock(&pin->nestlock);
+      if (do_parent_mtime || linkunlink) {
+        mds->locker->mark_updated_scatterlock(&pin->filelock);
+        mut->ls->dirty_dirfrag_dir.push_back(&pin->item_dirty_dirfrag_dir);
+        mut->add_updated_lock(&pin->filelock);
+      }
+      break;
+    }
+    if (!mut->is_wrlocked(&pin->versionlock))
+      mds->locker->local_wrlock_grab(&pin->versionlock, mut);
+
+    ceph_assert(mut->is_wrlocked(&pin->nestlock) || mut->is_slave());
+
+    pin->last_dirstat_prop = mut->get_mds_stamp();
+
+    // dirfrag -> diri
+    mut->auth_pin(pin);
+    mut->add_projected_inode(pin);
+    lsi.push_front(pin);
+
+    pin->pre_cow_old_inode();  // avoid cow mayhem!
+
+    auto &pi = pin->project_inode();
+    pi.inode.version = pin->pre_dirty();
+
+    // dirstat
+    if (do_parent_mtime || linkunlink) {
+      dout(20) << "predirty_journal_parents add_delta " << pf->fragstat << dendl;
+      dout(20) << "predirty_journal_parents         - " << pf->accounted_fragstat << dendl;
+      bool touched_mtime = false, touched_chattr = false;
+      pi.inode.dirstat.add_delta(pf->fragstat, pf->accounted_fragstat, &touched_mtime, &touched_chattr);
+      pf->accounted_fragstat = pf->fragstat;
+      if (touched_mtime)
+        pi.inode.mtime = pi.inode.ctime = pi.inode.dirstat.mtime;
+      if (touched_chattr)
+        pi.inode.change_attr = pi.inode.dirstat.change_attr;
+      dout(20) << "predirty_journal_parents     gives " << pi.inode.dirstat << " on " << *pin << dendl;
+
+      if (parent->get_frag() == frag_t()) { // i.e., we are the only frag
+        if (pi.inode.dirstat.size() < 0)
+          ceph_assert(!"negative dirstat size" == g_conf()->mds_verify_scatter);
+        if (pi.inode.dirstat.size() != pf->fragstat.size()) {
+          mds->clog->error() << "unmatched fragstat size on single dirfrag "
+                             << parent->dirfrag() << ", inode has " << pi.inode.dirstat
+                             << ", dirfrag has " << pf->fragstat;
+
+          // trust the dirfrag for now
+          pi.inode.dirstat = pf->fragstat;
+
+          ceph_assert(!"unmatched fragstat size" == g_conf()->mds_verify_scatter);
+        }
+      }
+    }
+
+    /*
+     * the rule here is to follow the _oldest_ parent with dirty rstat
+     * data.  if we don't propagate all data, we add ourselves to the
+     * nudge list.  that way all rstat data will (eventually) get
+     * pushed up the tree.
+     *
+     * actually, no.  for now, silently drop rstats for old parents.  we need
+     * hard link backpointers to do the above properly.
+     */
+
+    // stop?
+    if (pin->is_base())
+      break;
+    parentdn = pin->get_projected_parent_dn();
+    ceph_assert(parentdn);
+
+    // rstat
+    dout(10) << "predirty_journal_parents frag->inode on " << *parent << dendl;
+
+    // first, if the frag is stale, bring it back in sync.
+    parent->resync_accounted_rstat();
+
+    if (g_conf()->mds_snap_rstat) {
+      for (auto &p : parent->dirty_old_rstat) {
+        project_rstat_frag_to_inode(p.second.rstat, p.second.accounted_rstat, p.second.first,
+                                    p.first, pin, true);
+      }
+    }
+    parent->dirty_old_rstat.clear();
+    project_rstat_frag_to_inode(pf->rstat, pf->accounted_rstat, parent->first, CEPH_NOSNAP, pin, true);//false);
+
+    pf->accounted_rstat = pf->rstat;
+
+    if (parent->get_frag() == frag_t()) { // i.e., we are the only frag
+      if (pi.inode.rstat.rbytes != pf->rstat.rbytes) {
+        mds->clog->error() << "unmatched rstat rbytes on single dirfrag "
+                           << parent->dirfrag() << ", inode has " << pi.inode.rstat
+                           << ", dirfrag has " << pf->rstat;
+
+        // trust the dirfrag for now
+        pi.inode.rstat = pf->rstat;
+
+        ceph_assert(!"unmatched rstat rbytes" == g_conf()->mds_verify_scatter);
+      }
+    }
+
+    parent->check_rstats();
+    broadcast_quota_to_client(pin);
+    // next parent!
+    cur = pin;
+    parent = parentdn->get_dir();
+    linkunlink = 0;
+    do_parent_mtime = false;
+    primary_dn = true;
+    first = false;
+  }
+
+  // now, stick it in the blob
+  ceph_assert(parent);
+  ceph_assert(parent->is_auth());
+  blob->add_dir_context(parent);
+  blob->add_dir(parent, true);
+  for (list<CInode*>::iterator p = lsi.begin();
+       p != lsi.end();
+       ++p) {
+    CInode *cur = *p;
+    journal_dirty_inode(mut.get(), blob, cur);
+  }
+
+}
+
+
+
+
+
+// ===================================
+// slave requests
+
+
+/*
+ * some handlers for master requests with slaves. we need to make
+ * sure slaves journal commits before we forget we mastered them and
+ * remove them from the uncommitted_masters map (used during recovery
+ * to commit|abort slaves).
+ */
+struct C_MDC_CommittedMaster : public MDCacheLogContext {
+ metareqid_t reqid;
+ C_MDC_CommittedMaster(MDCache *s, metareqid_t r) : MDCacheLogContext(s), reqid(r) {}
+ void finish(int r) override {
+ mdcache->_logged_master_commit(reqid);
+ }
+};
+
+ // Flag master request `reqid` as committing and submit an ECommitted
+ // journal entry for it.  C_MDC_CommittedMaster is passed as the entry's
+ // completion context and calls _logged_master_commit(reqid).
+ void MDCache::log_master_commit(metareqid_t reqid)
+ {
+   dout(10) << "log_master_commit " << reqid << dendl;
+
+   // operator[] on purpose: same lookup-or-create semantics as before.
+   auto &master = uncommitted_masters[reqid];
+   master.committing = true;
+
+   mds->mdlog->start_submit_entry(
+     new ECommitted(reqid),
+     new C_MDC_CommittedMaster(this, reqid));
+ }
+
+ // Completion of the ECommitted entry for `reqid`: unlink the request from
+ // its log segment, queue whoever was waiting on it, and forget the
+ // uncommitted master record.  The record must exist.
+ void MDCache::_logged_master_commit(metareqid_t reqid)
+ {
+   dout(10) << "_logged_master_commit " << reqid << dendl;
+   ceph_assert(uncommitted_masters.count(reqid));
+
+   // std::map references stay valid until the element itself is erased below.
+   auto &master = uncommitted_masters[reqid];
+   master.ls->uncommitted_masters.erase(reqid);
+   mds->queue_waiters(master.waiters);
+
+   uncommitted_masters.erase(reqid);
+ }
+
+// while active...
+
+ // Slave rank `from` has committed its part of master request `r`.  Once no
+ // slaves remain outstanding -- and the record is not flagged as recovering --
+ // the master commit itself is journaled.
+ void MDCache::committed_master_slave(metareqid_t r, mds_rank_t from)
+ {
+   dout(10) << "committed_master_slave mds." << from << " on " << r << dendl;
+   ceph_assert(uncommitted_masters.count(r));
+
+   auto &master = uncommitted_masters[r];
+   master.slaves.erase(from);
+
+   if (master.recovering)
+     return;
+   if (!master.slaves.empty())
+     return;
+
+   log_master_commit(r);
+ }
+
+ // The master update for `reqid` is now marked safe.  If the request was
+ // parked in pending_masters, remove it; when that empties the set, process
+ // the resolves that were delayed.
+ void MDCache::logged_master_update(metareqid_t reqid)
+ {
+   dout(10) << "logged_master_update " << reqid << dendl;
+   ceph_assert(uncommitted_masters.count(reqid));
+   uncommitted_masters[reqid].safe = true;
+
+   // erase-by-key reports whether the id was present (assumes
+   // pending_masters holds unique keys, as its other uses here suggest).
+   const bool was_pending = pending_masters.erase(reqid) > 0;
+   if (was_pending && pending_masters.empty())
+     process_delayed_resolve();
+ }
+
+/*
+ * Master may crash after receiving all slaves' commit acks, but before journalling
+ * the final commit. Slaves may crash after journalling the slave commit, but before
+ * sending commit ack to the master. Commit masters with no uncommitted slave when
+ * resolve finishes.
+ */
+void MDCache::finish_committed_masters()
+{
+ for (map<metareqid_t, umaster>::iterator p = uncommitted_masters.begin();
+ p != uncommitted_masters.end();
+ ++p) {
+ p->second.recovering = false;
+ if (!p->second.committing && p->second.slaves.empty()) {
+ dout(10) << "finish_committed_masters " << p->first << dendl;
+ log_master_commit(p->first);
+ }
+ }
+}
+
+/*
+ * at end of resolve... we must journal a commit|abort for all slave
+ * updates, before moving on.
+ *
+ * this is so that the master can safely journal ECommitted on ops it
+ * masters when it reaches up:active (all other recovering nodes must
+ * complete resolve before that happens).
+ */
+struct C_MDC_SlaveCommit : public MDCacheLogContext {
+ mds_rank_t from;
+ metareqid_t reqid;
+ C_MDC_SlaveCommit(MDCache *c, int f, metareqid_t r) : MDCacheLogContext(c), from(f), reqid(r) {}
+ void finish(int r) override {
+ mdcache->_logged_slave_commit(from, reqid);
+ }
+};
+
+ // The slave commit for `reqid` has been journaled: tell rank `from` by
+ // sending it an MMDSSlaveRequest with op OP_COMMITTED.
+ void MDCache::_logged_slave_commit(mds_rank_t from, metareqid_t reqid)
+ {
+   dout(10) << "_logged_slave_commit from mds." << from << " " << reqid << dendl;
+
+   // build the notification, then send it
+   auto committed_msg =
+     MMDSSlaveRequest::create(reqid, 0, MMDSSlaveRequest::OP_COMMITTED);
+   mds->send_message_mds(committed_msg, from);
+ }
+
+
+
+
+
+
+// ====================================================================
+// import map, recovery
+
+ // Move bound `df` from subtree `oldparent` to subtree `newparent` inside the
+ // journaled subtree map `subtrees` (dirfrag -> list of bounds).
+ //
+ // Only the first occurrence of `df` is removed from oldparent's bounds.
+ // Either side is skipped silently when that parent is not a key of the map;
+ // no new keys are created.
+ void MDCache::_move_subtree_map_bound(dirfrag_t df, dirfrag_t oldparent, dirfrag_t newparent,
+                                       map<dirfrag_t,vector<dirfrag_t> >& subtrees)
+ {
+   auto from = subtrees.find(oldparent);
+   if (from != subtrees.end()) {
+     vector<dirfrag_t>& old_bounds = from->second;
+     dout(10) << " removing " << df << " from " << oldparent << " bounds " << old_bounds << dendl;
+     auto pos = old_bounds.begin();
+     while (pos != old_bounds.end() && !(*pos == df))
+       ++pos;
+     if (pos != old_bounds.end())
+       old_bounds.erase(pos);
+   }
+
+   auto to = subtrees.find(newparent);
+   if (to != subtrees.end()) {
+     vector<dirfrag_t>& new_bounds = to->second;
+     dout(10) << " adding " << df << " to " << newparent << " bounds " << new_bounds << dendl;
+     new_bounds.push_back(df);
+   }
+ }
+
+ ESubtreeMap *MDCache::create_subtree_map()
+ { /*
+ * Build and return a newly allocated ESubtreeMap journal event.
+ *
+ * Steps, in order:
+ *  1. record every subtree whose first dir_auth rank is this MDS, plus its
+ *     bounds; subtrees that are ambiguous imports are additionally listed
+ *     in le->ambiguous_subtrees.
+ *  2. adjust the recorded map for projected (not yet applied) renames.
+ *  3. simplify: a non-ambiguous subtree that appears as a bound of another
+ *     non-ambiguous subtree is swallowed by it.
+ *  4. add every dirfrag still referenced to the event's metablob.
+ *
+ * The entry is started with mdlog->_start_entry() here. The object is
+ * allocated with new; presumably the caller submits it to the journal,
+ * which then owns it -- verify against callers.
+ */
+ dout(10) << "create_subtree_map " << num_subtrees() << " subtrees, "
+ << num_subtrees_fullauth() << " fullauth"
+ << dendl;
+
+ show_subtrees();
+
+ ESubtreeMap *le = new ESubtreeMap();
+ mds->mdlog->_start_entry(le);
+
+ map<dirfrag_t, CDir*> dirs_to_add; // dirfrags to journal in the metablob at the end
+
+ if (myin) {
+ CDir* mydir = myin->get_dirfrag(frag_t()); // NOTE(review): result is dereferenced below without a null check
+ dirs_to_add[mydir->dirfrag()] = mydir;
+ }
+
+ // include all auth subtrees, and their bounds.
+ // and a spanning tree to tie it to the root.
+ for (map<CDir*, set<CDir*> >::iterator p = subtrees.begin();
+ p != subtrees.end();
+ ++p) {
+ CDir *dir = p->first;
+
+ // journal subtree as "ours" if we are
+ // me, -2
+ // me, me
+ // me, !me (may be importing and ambiguous!)
+
+ // so not
+ // !me, *
+ if (dir->get_dir_auth().first != mds->get_nodeid()) // only the first dir_auth rank is tested
+ continue;
+
+ if (migrator->is_ambiguous_import(dir->dirfrag()) ||
+ my_ambiguous_imports.count(dir->dirfrag())) {
+ dout(15) << " ambig subtree " << *dir << dendl;
+ le->ambiguous_subtrees.insert(dir->dirfrag());
+ } else {
+ dout(15) << " subtree " << *dir << dendl;
+ }
+
+ dirs_to_add[dir->dirfrag()] = dir;
+ le->subtrees[dir->dirfrag()].clear(); // creates the key (with no bounds yet) even if the subtree has none
+
+
+ // bounds
+ for (set<CDir*>::iterator q = p->second.begin();
+ q != p->second.end();
+ ++q) {
+ CDir *bound = *q;
+ dout(15) << " subtree bound " << *bound << dendl;
+ dirs_to_add[bound->dirfrag()] = bound;
+ le->subtrees[dir->dirfrag()].push_back(bound->dirfrag());
+ }
+ }
+
+ // apply projected renames
+ for (map<CInode*,list<pair<CDir*,CDir*> > >::iterator p = projected_subtree_renames.begin();
+ p != projected_subtree_renames.end();
+ ++p) {
+ for (list<pair<CDir*,CDir*> >::iterator q = p->second.begin(); q != p->second.end(); ++q) {
+ CInode *diri = p->first; // renamed directory inode
+ CDir *olddir = q->first; // (olddir, newdir) pair recorded for this projected rename
+ CDir *newdir = q->second;
+ dout(10) << " adjusting for projected rename of " << *diri << " to " << *newdir << dendl;
+
+ list<CDir*> dfls;
+ diri->get_dirfrags(dfls);
+ for (list<CDir*>::iterator p = dfls.begin(); p != dfls.end(); ++p) { // note: this p shadows the outer map iterator
+ CDir *dir = *p;
+ dout(10) << "dirfrag " << dir->dirfrag() << " " << *dir << dendl;
+ CDir *oldparent = get_projected_subtree_root(olddir);
+ dout(10) << " old parent " << oldparent->dirfrag() << " " << *oldparent << dendl;
+ CDir *newparent = get_projected_subtree_root(newdir);
+ dout(10) << " new parent " << newparent->dirfrag() << " " << *newparent << dendl;
+
+ if (oldparent == newparent) {
+ dout(10) << "parent unchanged for " << dir->dirfrag() << " at "
+ << oldparent->dirfrag() << dendl;
+ continue;
+ }
+
+ if (dir->is_subtree_root()) {
+ if (le->subtrees.count(newparent->dirfrag()) &&
+ oldparent->get_dir_auth() != newparent->get_dir_auth())
+ dirs_to_add[dir->dirfrag()] = dir;
+ // children are fine. change parent.
+ _move_subtree_map_bound(dir->dirfrag(), oldparent->dirfrag(), newparent->dirfrag(),
+ le->subtrees);
+ } else {
+ // mid-subtree.
+
+ if (oldparent->get_dir_auth() != newparent->get_dir_auth()) {
+ dout(10) << " creating subtree for " << dir->dirfrag() << dendl;
+ // if oldparent is auth, subtree is mine; include it.
+ if (le->subtrees.count(oldparent->dirfrag())) {
+ dirs_to_add[dir->dirfrag()] = dir;
+ le->subtrees[dir->dirfrag()].clear();
+ }
+ // if newparent is auth, subtree is a new bound
+ if (le->subtrees.count(newparent->dirfrag())) {
+ dirs_to_add[dir->dirfrag()] = dir;
+ le->subtrees[newparent->dirfrag()].push_back(dir->dirfrag()); // newparent is auth; new bound
+ }
+ newparent = dir; // bounds moved below are re-parented under dir itself
+ }
+
+ // see if any old bounds move to the new parent.
+ for (set<CDir*>::iterator p = subtrees[oldparent].begin(); // in-memory bounds of oldparent; p shadows again
+ p != subtrees[oldparent].end();
+ ++p) {
+ CDir *bound = *p;
+ if (dir->contains(bound->get_parent_dir()))
+ _move_subtree_map_bound(bound->dirfrag(), oldparent->dirfrag(), newparent->dirfrag(),
+ le->subtrees);
+ }
+ }
+ }
+ }
+ }
+
+ // simplify the journaled map. our in memory map may have more
+ // subtrees than needed due to migrations that are just getting
+ // started or just completing. but on replay, the "live" map will
+ // be simple and we can do a straight comparison.
+ for (map<dirfrag_t, vector<dirfrag_t> >::iterator p = le->subtrees.begin(); p != le->subtrees.end(); ++p) {
+ if (le->ambiguous_subtrees.count(p->first))
+ continue;
+ unsigned i = 0;
+ while (i < p->second.size()) { // size() is re-read each pass, so bounds appended below are visited too
+ dirfrag_t b = p->second[i];
+ if (le->subtrees.count(b) &&
+ le->ambiguous_subtrees.count(b) == 0) {
+ vector<dirfrag_t>& bb = le->subtrees[b];
+ dout(10) << "simplify: " << p->first << " swallowing " << b << " with bounds " << bb << dendl;
+ for (vector<dirfrag_t>::iterator r = bb.begin(); r != bb.end(); ++r)
+ p->second.push_back(*r);
+ dirs_to_add.erase(b);
+ le->subtrees.erase(b); // bb is dangling from here on; it is not used again
+ p->second.erase(p->second.begin() + i); // i is not advanced: the element shifted into slot i is checked next
+ } else {
+ ++i;
+ }
+ }
+ }
+
+ for (auto &p : dirs_to_add) {
+ CDir *dir = p.second;
+ le->metablob.add_dir_context(dir, EMetaBlob::TO_ROOT);
+ le->metablob.add_dir(dir, false);
+ }
+
+ dout(15) << " subtrees " << le->subtrees << dendl;
+ dout(15) << " ambiguous_subtrees " << le->ambiguous_subtrees << dendl;
+
+ //le->metablob.print(cout);
+ le->expire_pos = mds->mdlog->journaler->get_expire_pos();
+ return le;
+ }
+
+ // Dump the resolve-phase gather sets into a "resolve_status" object
+ // section of formatter `f`:
+ //   "resolve_gather"     <- resolve_gather
+ //   "resolve_ack_gather" <- resolve_ack_gather
+ void MDCache::dump_resolve_status(Formatter *f) const
+ {
+   f->open_object_section("resolve_status");
+   f->dump_stream("resolve_gather") << resolve_gather;
+   // Previously this key streamed resolve_gather a second time, so the
+   // dump never showed the real resolve_ack_gather contents.
+   f->dump_stream("resolve_ack_gather") << resolve_ack_gather;
+   f->close_section();
+ }
+
+ // Enter the resolve phase.  Stores `resolve_done_` (there must be none
+ // stored yet), seeds resolve_gather from recovery_set, and snapshots the
+ // snapclient's journaled tids into resolve_snapclient_commits.
+ void MDCache::resolve_start(MDSContext *resolve_done_)
+ {
+   dout(10) << "resolve_start" << dendl;
+   ceph_assert(!resolve_done);
+   resolve_done.reset(resolve_done_);
+
+   const bool root_is_ours = (mds->mdsmap->get_root() == mds->get_nodeid());
+   if (!root_is_ours) {
+     // if we don't have the root dir, adjust it to UNKNOWN.  during
+     // resolve we want mds0 to explicitly claim the portion of it that
+     // it owns, so that anything beyond its bounds gets left as
+     // unknown.
+     if (CDir *rootdir = root->get_dirfrag(frag_t()))
+       adjust_subtree_auth(rootdir, CDIR_AUTH_UNKNOWN);
+   }
+
+   resolve_gather = recovery_set;
+   resolve_snapclient_commits = mds->snapclient->get_journaled_tids();
+ }
+
+ // Drive the resolve exchange.  Slave resolves are always sent first.
+ // Subtree resolves follow only when a resolve_done context is stored and
+ // neither resolve acks nor rollbacks are outstanding.  Without a stored
+ // resolve_done (a survivor) the snapclient cache is synced instead, and
+ // maybe_finish_slave_resolve() runs from the sync callback.
+ void MDCache::send_resolves()
+ {
+   send_slave_resolves();
+
+   const bool survivor = !resolve_done;
+   if (survivor) {
+     // I'm survivor: refresh snap cache
+     auto *on_synced = new FunctionContext([this](int r) {
+         maybe_finish_slave_resolve();
+       });
+     mds->snapclient->sync(new MDSInternalContextWrapper(mds, on_synced));
+     dout(10) << "send_resolves waiting for snapclient cache to sync" << dendl;
+     return;
+   }
+
+   const bool acks_outstanding = !resolve_ack_gather.empty();
+   if (acks_outstanding) {
+     dout(10) << "send_resolves still waiting for resolve ack from ("
+              << resolve_ack_gather << ")" << dendl;
+     return;
+   }
+
+   const bool rollbacks_outstanding = !resolve_need_rollback.empty();
+   if (rollbacks_outstanding) {
+     dout(10) << "send_resolves still waiting for rollback to commit on ("
+              << resolve_need_rollback << ")" << dendl;
+     return;
+   }
+
+   send_subtree_resolves();
+ }
+
+ // Send one MMDSResolve per master rank listing the slave requests whose
+ // outcome that master must decide, and add each recipient to
+ // resolve_ack_gather.
+ //
+ //  - while this MDS is in resolve: every entry of uncommitted_slaves is
+ //    reported (committing = false).
+ //  - otherwise: active slave requests that prepared (or are committing) are
+ //    reported when their master is in STATE_RESOLVE or the update is
+ //    flagged ambiguous.  A non-committing inode exporter re-sends its cap
+ //    exports as an encoded (ino, cap map) payload.
+ void MDCache::send_slave_resolves()
+ {
+   dout(10) << "send_slave_resolves" << dendl;
+
+   map<mds_rank_t, MMDSResolve::ref> resolves;
+
+   if (mds->is_resolve()) {
+     for (auto &us : uncommitted_slaves) {
+       mds_rank_t master = us.second.master;
+       auto &msg = resolves[master];
+       if (!msg)
+         msg = MMDSResolve::create();
+       msg->add_slave_request(us.first, false);
+     }
+   } else {
+     set<mds_rank_t> resolve_set;
+     mds->mdsmap->get_mds_set(resolve_set, MDSMap::STATE_RESOLVE);
+
+     for (auto &req : active_requests) {
+       MDRequestRef& mdr = req.second;
+
+       if (!mdr->is_slave())
+         continue;
+       if (!mdr->slave_did_prepare() && !mdr->committing)
+         continue;
+
+       mds_rank_t master = mdr->slave_to_mds;
+       // same short-circuit order as (in resolve_set || ambiguous)
+       if (!resolve_set.count(master) && !is_ambiguous_slave_update(req.first, master))
+         continue;
+
+       dout(10) << " including uncommitted " << *mdr << dendl;
+       if (!resolves.count(master))
+         resolves[master] = MMDSResolve::create();
+
+       const bool resend_cap_exports =
+         !mdr->committing && mdr->has_more() && mdr->more()->is_inode_exporter;
+       if (!resend_cap_exports) {
+         resolves[master]->add_slave_request(req.first, mdr->committing);
+         continue;
+       }
+
+       // re-send cap exports
+       CInode *in = mdr->more()->rename_inode;
+       map<client_t, Capability::Export> cap_map;
+       in->export_client_caps(cap_map);
+       bufferlist bl;
+       encode(in->ino(), bl);
+       encode(cap_map, bl);
+       resolves[master]->add_slave_request(req.first, bl);
+     }
+   }
+
+   for (auto &out : resolves) {
+     dout(10) << "sending slave resolve to mds." << out.first << dendl;
+     mds->send_message_mds(out.second, out.first);
+     resolve_ack_gather.insert(out.first);
+   }
+ }
+
+ void MDCache::send_subtree_resolves()
+ { /*
+ * Send an MMDSResolve describing this rank's subtree claims to peers.
+ *
+ * Deferred (resolves_pending = true, nothing sent) while the migrator
+ * still has exports or imports in progress.
+ *
+ * Recipients: every rank in recovery_set other than ourselves, provided
+ * either we are in resolve or the mdsmap says that peer is in resolve.
+ *
+ * Each message carries: the subtrees we claim (after simplification) with
+ * their bounds, our ambiguous imports, and the TABLE_SNAP commits.
+ */
+ dout(10) << "send_subtree_resolves" << dendl;
+
+ if (migrator->is_exporting() || migrator->is_importing()) {
+ dout(7) << "send_subtree_resolves waiting, imports/exports still in progress" << dendl;
+ migrator->show_importing();
+ migrator->show_exporting();
+ resolves_pending = true;
+ return; // not now
+ }
+
+ map<mds_rank_t, MMDSResolve::ref> resolves; // one outgoing message per recipient rank
+ for (set<mds_rank_t>::iterator p = recovery_set.begin();
+ p != recovery_set.end();
+ ++p) {
+ if (*p == mds->get_nodeid())
+ continue;
+ if (mds->is_resolve() || mds->mdsmap->is_resolve(*p))
+ resolves[*p] = MMDSResolve::create();
+ }
+
+ map<dirfrag_t, vector<dirfrag_t> > my_subtrees; // claimed subtree -> bounds
+ map<dirfrag_t, vector<dirfrag_t> > my_ambig_imports; // ambiguous import -> bounds
+
+ // known
+ for (map<CDir*,set<CDir*> >::iterator p = subtrees.begin();
+ p != subtrees.end();
+ ++p) {
+ CDir *dir = p->first;
+
+ // only our subtrees
+ if (dir->authority().first != mds->get_nodeid())
+ continue;
+
+ if (mds->is_resolve() && my_ambiguous_imports.count(dir->dirfrag()))
+ continue; // we'll add it below
+
+ if (migrator->is_ambiguous_import(dir->dirfrag())) {
+ // ambiguous (mid-import)
+ set<CDir*> bounds;
+ get_subtree_bounds(dir, bounds);
+ vector<dirfrag_t> dfls;
+ for (set<CDir*>::iterator q = bounds.begin(); q != bounds.end(); ++q)
+ dfls.push_back((*q)->dirfrag());
+
+ my_ambig_imports[dir->dirfrag()] = dfls;
+ dout(10) << " ambig " << dir->dirfrag() << " " << dfls << dendl;
+ } else {
+ // not ambiguous.
+ for (auto &q : resolves) {
+ resolves[q.first]->add_subtree(dir->dirfrag()); // same element as q.second, looked up again by key
+ }
+ // bounds too
+ vector<dirfrag_t> dfls;
+ for (set<CDir*>::iterator q = subtrees[dir].begin();
+ q != subtrees[dir].end();
+ ++q) {
+ CDir *bound = *q;
+ dfls.push_back(bound->dirfrag());
+ }
+
+ my_subtrees[dir->dirfrag()] = dfls;
+ dout(10) << " claim " << dir->dirfrag() << " " << dfls << dendl;
+ }
+ }
+
+ // ambiguous
+ for (map<dirfrag_t, vector<dirfrag_t> >::iterator p = my_ambiguous_imports.begin();
+ p != my_ambiguous_imports.end();
+ ++p) {
+ my_ambig_imports[p->first] = p->second;
+ dout(10) << " ambig " << p->first << " " << p->second << dendl;
+ }
+
+ // simplify the claimed subtree.
+ for (auto p = my_subtrees.begin(); p != my_subtrees.end(); ++p) {
+ unsigned i = 0;
+ while (i < p->second.size()) { // size() is re-read, so bounds appended below are examined too
+ dirfrag_t b = p->second[i];
+ if (my_subtrees.count(b)) { // bound b is itself a claimed subtree: merge it into p
+ vector<dirfrag_t>& bb = my_subtrees[b];
+ dout(10) << " simplify: " << p->first << " swallowing " << b << " with bounds " << bb << dendl;
+ for (vector<dirfrag_t>::iterator r = bb.begin(); r != bb.end(); ++r)
+ p->second.push_back(*r);
+ my_subtrees.erase(b); // bb is dangling from here on; it is not used again
+ p->second.erase(p->second.begin() + i); // i stays put: the next element has shifted into slot i
+ } else {
+ ++i;
+ }
+ }
+ }
+
+ // send
+ for (auto &p : resolves) {
+ const MMDSResolve::ref &m = p.second;
+ if (mds->is_resolve()) {
+ m->add_table_commits(TABLE_SNAP, resolve_snapclient_commits); // tids captured earlier in resolve_snapclient_commits
+ } else {
+ m->add_table_commits(TABLE_SNAP, mds->snapclient->get_journaled_tids());
+ }
+ m->subtrees = my_subtrees; // overwrites the message's subtrees member with the simplified map
+ m->ambiguous_imports = my_ambig_imports;
+ dout(10) << "sending subtee resolve to mds." << p.first << dendl; // NOTE(review): "subtee" is a typo in the log text; left as-is
+ mds->send_message_mds(m, p.first);
+ }
+ resolves_pending = false;
+ }
+
+ // Once no resolve acks and no rollbacks are outstanding: send the subtree
+ // resolves (if the snap cache is synced or a resolve_done context is
+ // stored) and then process any delayed resolves.
+ void MDCache::maybe_finish_slave_resolve() {
+   if (!resolve_ack_gather.empty() || !resolve_need_rollback.empty())
+     return;
+
+   // snap cache got synced or I'm in resolve state
+   const bool ready = mds->snapclient->is_synced() || resolve_done;
+   if (ready)
+     send_subtree_resolves();
+
+   process_delayed_resolve();
+ }
+
+ void MDCache::handle_mds_failure(mds_rank_t who)
+ { /*
+ * React to the failure of rank `who`.
+ *
+ * In order:
+ *  - expect a fresh resolve and rejoin from `who`; drop its delayed
+ *    resolve, its ambiguous slave updates and its rejoin send/ack state.
+ *  - notify the migrator and the balancer.
+ *  - walk active_requests and repair requests that are slave to `who`,
+ *    or mastered here with `who` as a slave/witness.
+ *  - mark uncommitted masters that had `who` as a slave as recovering.
+ *  - finish the slave requests collected during the walk.
+ *  - kick find_ino/open_ino peers, then finish or cancel entries in
+ *    `fragments`.
+ */
+ dout(7) << "handle_mds_failure mds." << who << dendl;
+
+ dout(1) << "handle_mds_failure mds." << who << " : recovery peers are " << recovery_set << dendl;
+
+ resolve_gather.insert(who);
+ discard_delayed_resolve(who);
+ ambiguous_slave_updates.erase(who);
+
+ rejoin_gather.insert(who);
+ rejoin_sent.erase(who); // i need to send another
+ rejoin_ack_sent.erase(who); // i need to send another
+ rejoin_ack_gather.erase(who); // i'll need/get another.
+
+ dout(10) << " resolve_gather " << resolve_gather << dendl;
+ dout(10) << " resolve_ack_gather " << resolve_ack_gather << dendl;
+ dout(10) << " rejoin_sent " << rejoin_sent << dendl;
+ dout(10) << " rejoin_gather " << rejoin_gather << dendl;
+ dout(10) << " rejoin_ack_gather " << rejoin_ack_gather << dendl;
+
+
+ // tell the migrator too.
+ migrator->handle_mds_failure_or_stop(who);
+
+ // tell the balancer too.
+ mds->balancer->handle_mds_failure(who);
+
+ // clean up any requests slave to/from this node
+ list<MDRequestRef> finish; // collected here, finished after the walk over active_requests
+ for (ceph::unordered_map<metareqid_t, MDRequestRef>::iterator p = active_requests.begin();
+ p != active_requests.end();
+ ++p) {
+ MDRequestRef& mdr = p->second;
+ // slave to the failed node?
+ if (mdr->slave_to_mds == who) {
+ if (mdr->slave_did_prepare()) {
+ dout(10) << " slave request " << *mdr << " uncommitted, will resolve shortly" << dendl;
+ if (is_ambiguous_slave_update(p->first, mdr->slave_to_mds))
+ remove_ambiguous_slave_update(p->first, mdr->slave_to_mds);
+
+ if (!mdr->more()->waiting_on_slave.empty()) {
+ ceph_assert(mdr->more()->srcdn_auth_mds == mds->get_nodeid());
+ // will rollback, no need to wait
+ mdr->reset_slave_request();
+ mdr->more()->waiting_on_slave.clear();
+ }
+ } else if (!mdr->committing) {
+ dout(10) << " slave request " << *mdr << " has no prepare, finishing up" << dendl;
+ if (mdr->slave_request || mdr->slave_rolling_back())
+ mdr->aborted = true; // still has a slave_request or is rolling back: only flag it aborted
+ else
+ finish.push_back(mdr);
+ }
+ }
+
+ if (mdr->is_slave() && mdr->slave_did_prepare()) {
+ if (mdr->more()->waiting_on_slave.count(who)) {
+ ceph_assert(mdr->more()->srcdn_auth_mds == mds->get_nodeid());
+ dout(10) << " slave request " << *mdr << " no longer need rename notity ack from mds."
+ << who << dendl;
+ mdr->more()->waiting_on_slave.erase(who);
+ if (mdr->more()->waiting_on_slave.empty() && mdr->slave_request)
+ mds->queue_waiter(new C_MDS_RetryRequest(this, mdr)); // nobody left to wait for: retry the request
+ }
+
+ if (mdr->more()->srcdn_auth_mds == who &&
+ mds->mdsmap->is_clientreplay_or_active_or_stopping(mdr->slave_to_mds)) {
+ // rename srcdn's auth mds failed, resolve even I'm a survivor.
+ dout(10) << " slave request " << *mdr << " uncommitted, will resolve shortly" << dendl;
+ add_ambiguous_slave_update(p->first, mdr->slave_to_mds);
+ }
+ } else if (mdr->slave_request) {
+ const MMDSSlaveRequest::const_ref &slave_req = mdr->slave_request;
+ // FIXME: Slave rename request can arrive after we notice mds failure.
+ // This can cause mds to crash (does not affect integrity of FS).
+ if (slave_req->get_op() == MMDSSlaveRequest::OP_RENAMEPREP &&
+ slave_req->srcdn_auth == who)
+ slave_req->mark_interrupted();
+ }
+
+ // failed node is slave?
+ if (mdr->is_master() && !mdr->committing) {
+ if (mdr->more()->srcdn_auth_mds == who) {
+ dout(10) << " master request " << *mdr << " waiting for rename srcdn's auth mds."
+ << who << " to recover" << dendl;
+ ceph_assert(mdr->more()->witnessed.count(who) == 0);
+ if (mdr->more()->is_ambiguous_auth)
+ mdr->clear_ambiguous_auth();
+ // rename srcdn's auth mds failed, all witnesses will rollback
+ mdr->more()->witnessed.clear();
+ pending_masters.erase(p->first);
+ }
+
+ if (mdr->more()->witnessed.count(who)) {
+ mds_rank_t srcdn_auth = mdr->more()->srcdn_auth_mds;
+ if (srcdn_auth >= 0 && mdr->more()->waiting_on_slave.count(srcdn_auth)) {
+ dout(10) << " master request " << *mdr << " waiting for rename srcdn's auth mds."
+ << mdr->more()->srcdn_auth_mds << " to reply" << dendl;
+ // waiting for the slave (rename srcdn's auth mds), delay sending resolve ack
+ // until either the request is committing or the slave also fails.
+ ceph_assert(mdr->more()->waiting_on_slave.size() == 1);
+ pending_masters.insert(p->first);
+ } else {
+ dout(10) << " master request " << *mdr << " no longer witnessed by slave mds."
+ << who << " to recover" << dendl;
+ if (srcdn_auth >= 0)
+ ceph_assert(mdr->more()->witnessed.count(srcdn_auth) == 0);
+
+ // discard this peer's prepare (if any)
+ mdr->more()->witnessed.erase(who);
+ }
+ }
+
+ if (mdr->more()->waiting_on_slave.count(who)) {
+ dout(10) << " master request " << *mdr << " waiting for slave mds." << who
+ << " to recover" << dendl;
+ // retry request when peer recovers
+ mdr->more()->waiting_on_slave.erase(who);
+ if (mdr->more()->waiting_on_slave.empty())
+ mds->wait_for_active_peer(who, new C_MDS_RetryRequest(this, mdr));
+ }
+
+ if (mdr->locking && mdr->locking_target_mds == who)
+ mdr->finish_locking(mdr->locking);
+ }
+ }
+
+ for (map<metareqid_t, umaster>::iterator p = uncommitted_masters.begin();
+ p != uncommitted_masters.end();
+ ++p) {
+ // The failed MDS may have already committed the slave update
+ if (p->second.slaves.count(who)) {
+ p->second.recovering = true; // committed_master_slave() will not log the commit while this flag is set
+ p->second.slaves.erase(who);
+ }
+ }
+
+ while (!finish.empty()) {
+ dout(10) << "cleaning up slave request " << *finish.front() << dendl;
+ request_finish(finish.front());
+ finish.pop_front();
+ }
+
+ kick_find_ino_peers(who);
+ kick_open_ino_peers(who);
+
+ for (map<dirfrag_t,fragment_info_t>::iterator p = fragments.begin();
+ p != fragments.end(); ) { // no increment here: each path below advances p itself
+ dirfrag_t df = p->first;
+ fragment_info_t& info = p->second;
+
+ if (info.is_fragmenting()) {
+ if (info.notify_ack_waiting.erase(who) &&
+ info.notify_ack_waiting.empty()) { // `who` was the last rank we were waiting on
+ fragment_drop_locks(info);
+ fragment_maybe_finish(p++); // p is advanced before the call; presumably the callee may erase the entry -- TODO confirm
+ } else {
+ ++p;
+ }
+ continue;
+ }
+
+ ++p; // step past the entry before it is erased by key below
+ dout(10) << "cancelling fragment " << df << " bit " << info.bits << dendl;
+ list<CDir*> dirs;
+ info.dirs.swap(dirs); // take the dir list out of info before the entry is erased
+ fragments.erase(df);
+ fragment_unmark_unfreeze_dirs(dirs);
+ }
+
+ // MDCache::shutdown_export_strays() always exports strays to mds.0
+ if (who == mds_rank_t(0))
+ shutdown_exporting_strays.clear();
+
+ show_subtrees();
+ }
+
+/*
+ * handle_mds_recovery - called on another node's transition
+ * from resolve -> active.
+ */
+void MDCache::handle_mds_recovery(mds_rank_t who)
+{
+ dout(7) << "handle_mds_recovery mds." << who << dendl;
+
+ // exclude all discover waiters. kick_discovers() will do the job
+ static const uint64_t i_mask = CInode::WAIT_ANY_MASK & ~CInode::WAIT_DIR;
+ static const uint64_t d_mask = CDir::WAIT_ANY_MASK & ~CDir::WAIT_DENTRY;
+
+ MDSContext::vec waiters;
+
+ // wake up any waiters in their subtrees
+ for (map<CDir*,set<CDir*> >::iterator p = subtrees.begin();
+ p != subtrees.end();
+ ++p) {
+ CDir *dir = p->first;
+
+ if (dir->authority().first != who ||
+ dir->authority().second == mds->get_nodeid())
+ continue;
+ ceph_assert(!dir->is_auth());
+
+ // wake any waiters
+ list<CDir*> q;
+ q.push_back(dir);
+
+ while (!q.empty()) {
+ CDir *d = q.front();
+ q.pop_front();
+ d->take_waiting(d_mask, waiters);
+
+ // inode waiters too
+ for (auto &p : d->items) {
+ CDentry *dn = p.second;
+ CDentry::linkage_t *dnl = dn->get_linkage();
+ if (dnl->is_primary()) {
+ dnl->get_inode()->take_waiting(i_mask, waiters);
+
+ // recurse?
+ list<CDir*> ls;
+ dnl->get_inode()->get_dirfrags(ls);
+ for (list<CDir*>::iterator p = ls.begin();
+ p != ls.end();
+ ++p) {
+ CDir *subdir = *p;
+ if (!subdir->is_subtree_root())
+ q.push_back(subdir);
+ }
+ }
+ }
+ }
+ }
+
+ kick_open_ino_peers(who);
+ kick_find_ino_peers(who);
+
+ // queue them up.
+ mds->queue_waiters(waiters);
+}
+
+ // Replace recovery_set with a copy of `s` (logged at debug level 7).
+ void MDCache::set_recovery_set(set<mds_rank_t>& s)
+ {
+   dout(7) << "set_recovery_set " << s << dendl;
+
+   recovery_set = s;
+ }
+
+
+/*
+ * during resolve state, we share resolves to determine who
+ * is authoritative for which trees. we expect to get an resolve
+ * from _everyone_ in the recovery_set (the mds cluster at the time of
+ * the first failure).
+ *
+ * This functions puts the passed message before returning
+ */
+void MDCache::handle_resolve(const MMDSResolve::const_ref &m)
+{
+ dout(7) << "handle_resolve from " << m->get_source() << dendl;
+ mds_rank_t from = mds_rank_t(m->get_source().num());
+
+ if (mds->get_state() < MDSMap::STATE_RESOLVE) {
+ if (mds->get_want_state() == CEPH_MDS_STATE_RESOLVE) {
+ mds->wait_for_resolve(new C_MDS_RetryMessage(mds, m));
+ return;
+ }
+ // wait until we reach the resolve stage!
+ return;
+ }
+
+ discard_delayed_resolve(from);
+
+ // ambiguous slave requests?
+ if (!m->slave_requests.empty()) {
+ if (mds->is_clientreplay() || mds->is_active() || mds->is_stopping()) {
+ for (auto p = m->slave_requests.begin(); p != m->slave_requests.end(); ++p) {
+ if (uncommitted_masters.count(p->first) && !uncommitted_masters[p->first].safe) {
+ ceph_assert(!p->second.committing);
+ pending_masters.insert(p->first);
+ }
+ }
+
+ if (!pending_masters.empty()) {
+ dout(10) << " still have pending updates, delay processing slave resolve" << dendl;
+ delayed_resolve[from] = m;
+ return;
+ }
+ }
+
+ auto ack = MMDSResolveAck::create();
+ for (const auto &p : m->slave_requests) {
+ if (uncommitted_masters.count(p.first)) { //mds->sessionmap.have_completed_request(p.first)) {
+ // COMMIT
+ if (p.second.committing) {
+ // already committing, waiting for the OP_COMMITTED slave reply
+ dout(10) << " already committing slave request " << p << " noop "<< dendl;
+ } else {
+ dout(10) << " ambiguous slave request " << p << " will COMMIT" << dendl;
+ ack->add_commit(p.first);
+ }
+ uncommitted_masters[p.first].slaves.insert(from); // wait for slave OP_COMMITTED before we log ECommitted
+
+ if (p.second.inode_caps.length() > 0) {
+ // slave wants to export caps (rename)
+ ceph_assert(mds->is_resolve());
+
+ inodeno_t ino;
+ map<client_t,Capability::Export> cap_exports;
+ auto q = p.second.inode_caps.cbegin();
+ decode(ino, q);
+ decode(cap_exports, q);
+
+ ceph_assert(get_inode(ino));
+
+ for (map<client_t,Capability::Export>::iterator q = cap_exports.begin();
+ q != cap_exports.end();
+ ++q) {
+ Capability::Import& im = rejoin_imported_caps[from][ino][q->first];
+ im.cap_id = ++last_cap_id; // assign a new cap ID
+ im.issue_seq = 1;
+ im.mseq = q->second.mseq;
+
+ Session *session = mds->sessionmap.get_session(entity_name_t::CLIENT(q->first.v));
+ if (session)
+ rejoin_client_map.emplace(q->first, session->info.inst);
+ }
+
+ // will process these caps in rejoin stage
+ rejoin_slave_exports[ino].first = from;
+ rejoin_slave_exports[ino].second.swap(cap_exports);
+
+ // send information of imported caps back to slave
+ encode(rejoin_imported_caps[from][ino], ack->commit[p.first]);
+ }
+ } else {
+ // ABORT
+ dout(10) << " ambiguous slave request " << p << " will ABORT" << dendl;
+ ceph_assert(!p.second.committing);
+ ack->add_abort(p.first);
+ }
+ }
+ mds->send_message(ack, m->get_connection());
+ return;
+ }
+
+ if (!resolve_ack_gather.empty() || !resolve_need_rollback.empty()) {
+ dout(10) << "delay processing subtree resolve" << dendl;
+ delayed_resolve[from] = m;
+ return;
+ }
+
+ bool survivor = false;
+ // am i a surviving ambiguous importer?
+ if (mds->is_clientreplay() || mds->is_active() || mds->is_stopping()) {
+ survivor = true;
+ // check for any import success/failure (from this node)
+ map<dirfrag_t, vector<dirfrag_t> >::iterator p = my_ambiguous_imports.begin();
+ while (p != my_ambiguous_imports.end()) {
+ map<dirfrag_t, vector<dirfrag_t> >::iterator next = p;
+ ++next;
+ CDir *dir = get_dirfrag(p->first);
+ ceph_assert(dir);
+ dout(10) << "checking ambiguous import " << *dir << dendl;
+ if (migrator->is_importing(dir->dirfrag()) &&
+ migrator->get_import_peer(dir->dirfrag()) == from) {
+ ceph_assert(migrator->get_import_state(dir->dirfrag()) == Migrator::IMPORT_ACKING);
+
+ // check if sender claims the subtree
+ bool claimed_by_sender = false;
+ for (const auto &q : m->subtrees) {
+ // an ambiguous import won't race with a refragmentation; it's appropriate to force here.
+ CDir *base = get_force_dirfrag(q.first, false);
+ if (!base || !base->contains(dir))
+ continue; // base not dir or an ancestor of dir, clearly doesn't claim dir.
+
+ bool inside = true;
+ set<CDir*> bounds;
+ get_force_dirfrag_bound_set(q.second, bounds);
+ for (set<CDir*>::iterator p = bounds.begin(); p != bounds.end(); ++p) {
+ CDir *bound = *p;
+ if (bound->contains(dir)) {
+ inside = false; // nope, bound is dir or parent of dir, not inside.
+ break;
+ }
+ }
+ if (inside)
+ claimed_by_sender = true;
+ }
+
+ my_ambiguous_imports.erase(p); // no longer ambiguous.
+ if (claimed_by_sender) {
+ dout(7) << "ambiguous import failed on " << *dir << dendl;
+ migrator->import_reverse(dir);
+ } else {
+ dout(7) << "ambiguous import succeeded on " << *dir << dendl;
+ migrator->import_finish(dir, true);
+ }
+ }
+ p = next;
+ }
+ }
+
+ // update my dir_auth values
+ // need to do this on recoverying nodes _and_ bystanders (to resolve ambiguous
+ // migrations between other nodes)
+ for (const auto& p : m->subtrees) {
+ dout(10) << "peer claims " << p.first << " bounds " << p.second << dendl;
+ CDir *dir = get_force_dirfrag(p.first, !survivor);
+ if (!dir)
+ continue;
+ adjust_bounded_subtree_auth(dir, p.second, from);
+ try_subtree_merge(dir);
+ }
+
+ show_subtrees();
+
+ // note ambiguous imports too
+ for (const auto& p : m->ambiguous_imports) {
+ dout(10) << "noting ambiguous import on " << p.first << " bounds " << p.second << dendl;
+ other_ambiguous_imports[from][p.first] = p.second;
+ }
+
+ // learn other mds' pendina snaptable commits. later when resolve finishes, we will reload
+ // snaptable cache from snapserver. By this way, snaptable cache get synced among all mds
+ for (const auto& p : m->table_clients) {
+ dout(10) << " noting " << get_mdstable_name(p.type)
+ << " pending_commits " << p.pending_commits << dendl;
+ MDSTableClient *client = mds->get_table_client(p.type);
+ for (const auto& q : p.pending_commits)
+ client->notify_commit(q);
+ }
+
+ // did i get them all?
+ resolve_gather.erase(from);
+
+ maybe_resolve_finish();
+}
+
+ // Re-run handle_resolve() on every resolve message that was delayed.
+ // delayed_resolve is emptied into a local map first, so messages that get
+ // delayed again during processing land in a fresh delayed_resolve.
+ void MDCache::process_delayed_resolve()
+ {
+   dout(10) << "process_delayed_resolve" << dendl;
+
+   map<mds_rank_t, MMDSResolve::const_ref> todo;
+   delayed_resolve.swap(todo);
+
+   for (auto it = todo.begin(); it != todo.end(); ++it)
+     handle_resolve(it->second);
+ }
+
+ // Forget the delayed resolve message (if any) held for rank `who`.
+ void MDCache::discard_delayed_resolve(mds_rank_t who)
+ {
+   // erase-by-key is a no-op when nothing is stored for `who`
+   delayed_resolve.erase(who);
+ }
+
+ // Finish the resolve phase once a resolve has arrived from every rank in
+ // resolve_gather.  Must only be called when no resolve acks or rollbacks
+ // are outstanding.
+ //
+ // With a stored resolve_done context (we are in resolve) that context is
+ // completed with 0; otherwise (a survivor) pending rejoins are sent.
+ void MDCache::maybe_resolve_finish()
+ {
+   ceph_assert(resolve_ack_gather.empty());
+   ceph_assert(resolve_need_rollback.empty());
+
+   const bool still_waiting = !resolve_gather.empty();
+   if (still_waiting) {
+     dout(10) << "maybe_resolve_finish still waiting for resolves ("
+              << resolve_gather << ")" << dendl;
+     return;
+   }
+
+   dout(10) << "maybe_resolve_finish got all resolves+resolve_acks, done." << dendl;
+   disambiguate_my_imports();
+   finish_committed_masters();
+
+   if (!resolve_done) {
+     // I am survivor.
+     maybe_send_pending_rejoins();
+     return;
+   }
+
+   ceph_assert(mds->is_resolve());
+   trim_unlinked_inodes();
+   recalc_auth_bits(false);
+   // release() hands the context to complete(); resolve_done is empty afterwards
+   resolve_done.release()->complete(0);
+ }
+
+// Process a resolve ack sent by rank `from`. The ack carries two lists of
+// request ids for which this rank acted as slave: ack->commit (the master
+// committed) and ack->abort (the master did not). Acks that are not awaited,
+// or whose sender is below STATE_RESOLVE in the mdsmap, are ignored.
+void MDCache::handle_resolve_ack(const MMDSResolveAck::const_ref &ack)
+{
+ dout(10) << "handle_resolve_ack " << *ack << " from " << ack->get_source() << dendl;
+ mds_rank_t from = mds_rank_t(ack->get_source().num());
+
+ if (!resolve_ack_gather.count(from) ||
+ mds->mdsmap->get_state(from) < MDSMap::STATE_RESOLVE) {
+ return;
+ }
+
+ // ambiguous slave updates for `from` are only expected when both the sender
+ // and this rank are clientreplay/active/stopping
+ if (ambiguous_slave_updates.count(from)) {
+ ceph_assert(mds->mdsmap->is_clientreplay_or_active_or_stopping(from));
+ ceph_assert(mds->is_clientreplay() || mds->is_active() || mds->is_stopping());
+ }
+
+ for (const auto &p : ack->commit) {
+ dout(10) << " commit on slave " << p.first << dendl;
+
+ // ambiguous case: just clear the ambiguous record and move to the next entry
+ if (ambiguous_slave_updates.count(from)) {
+ remove_ambiguous_slave_update(p.first, from);
+ continue;
+ }
+
+ if (mds->is_resolve()) {
+ // replay
+ MDSlaveUpdate *su = get_uncommitted_slave(p.first, from);
+ ceph_assert(su);
+
+ // log commit
+ mds->mdlog->start_submit_entry(new ESlaveUpdate(mds->mdlog, "unknown", p.first, from,
+ ESlaveUpdate::OP_COMMIT, su->origop),
+ new C_MDC_SlaveCommit(this, from, p.first));
+ mds->mdlog->flush();
+
+ finish_uncommitted_slave(p.first);
+ } else {
+ // not resolving: the request is still live in this cache; finish it
+ MDRequestRef mdr = request_get(p.first);
+ // information about master imported caps
+ if (p.second.length() > 0)
+ mdr->more()->inode_import.share(p.second);
+
+ ceph_assert(mdr->slave_request == 0); // shouldn't be doing anything!
+ request_finish(mdr);
+ }
+ }
+
+ for (const auto &metareq : ack->abort) {
+ dout(10) << " abort on slave " << metareq << dendl;
+
+ if (mds->is_resolve()) {
+ MDSlaveUpdate *su = get_uncommitted_slave(metareq, from);
+ ceph_assert(su);
+
+ // perform rollback (and journal a rollback entry)
+ // note: this will hold up the resolve a bit, until the rollback entries journal.
+ MDRequestRef null_ref;
+ // pick the rollback routine matching the original operation; anything else aborts
+ switch (su->origop) {
+ case ESlaveUpdate::LINK:
+ mds->server->do_link_rollback(su->rollback, from, null_ref);
+ break;
+ case ESlaveUpdate::RENAME:
+ mds->server->do_rename_rollback(su->rollback, from, null_ref);
+ break;
+ case ESlaveUpdate::RMDIR:
+ mds->server->do_rmdir_rollback(su->rollback, from, null_ref);
+ break;
+ default:
+ ceph_abort();
+ }
+ } else {
+ MDRequestRef mdr = request_get(metareq);
+ mdr->aborted = true;
+ // with a slave request still attached the request is not finished here;
+ // a rollback is only recorded if the slave prepare was done
+ if (mdr->slave_request) {
+ if (mdr->slave_did_prepare()) // journaling slave prepare ?
+ add_rollback(metareq, from);
+ } else {
+ request_finish(mdr);
+ }
+ }
+ }
+
+ // only when nothing ambiguous remains for `from` is its ack considered received
+ if (!ambiguous_slave_updates.count(from)) {
+ resolve_ack_gather.erase(from);
+ maybe_finish_slave_resolve();
+ }
+}
+
+// Record a slave update that has been prepared but not yet committed.
+// reqid must not already be tracked (asserted). The id is also noted on
+// the log segment `ls`. When `su` is non-null, every inode in its olddirs
+// and unlinked sets gets one more reference in the matching counter map.
+void MDCache::add_uncommitted_slave(metareqid_t reqid, LogSegment *ls, mds_rank_t master, MDSlaveUpdate *su)
+{
+  auto inserted = uncommitted_slaves.emplace(std::piecewise_construct,
+                                             std::forward_as_tuple(reqid),
+                                             std::forward_as_tuple());
+  ceph_assert(inserted.second);
+  ls->uncommitted_slaves.insert(reqid);
+
+  uslave &entry = inserted.first->second;
+  entry.master = master;
+  entry.ls = ls;
+  entry.su = su;
+
+  if (su != nullptr) {
+    for (CInode *diri : su->olddirs)
+      uncommitted_slave_rename_olddir[diri]++;
+    for (CInode *in : su->unlinked)
+      uncommitted_slave_unlink[in]++;
+  }
+}
+
+// Stop tracking the uncommitted slave update for `reqid` and release what it
+// held: waiters are queued, the id is removed from its log segment, the
+// per-inode olddir/unlinked counters are decremented, and the MDSlaveUpdate
+// (if any) is deleted.
+//
+// @param reqid         request id of the slave update
+// @param assert_exist  if true, a missing entry trips an assert; if false, a
+//                      missing entry is silently ignored (callers may omit it,
+//                      so a default is declared elsewhere)
+void MDCache::finish_uncommitted_slave(metareqid_t reqid, bool assert_exist)
+{
+ auto it = uncommitted_slaves.find(reqid);
+ if (it == uncommitted_slaves.end()) {
+ ceph_assert(!assert_exist);
+ return;
+ }
+ uslave &u = it->second;
+ // grab the pointer now: `u` refers into the map entry erased a few lines below
+ MDSlaveUpdate* su = u.su;
+
+ if (!u.waiters.empty()) {
+ mds->queue_waiters(u.waiters);
+ }
+ u.ls->uncommitted_slaves.erase(reqid);
+ uncommitted_slaves.erase(it);
+
+ if (su == nullptr) {
+ return;
+ }
+ // discard the non-auth subtree we renamed out of
+ for(set<CInode*>::iterator p = su->olddirs.begin(); p != su->olddirs.end(); ++p) {
+ CInode *diri = *p;
+ // note: this `it` shadows the (already erased) outer iterator
+ map<CInode*, int>::iterator it = uncommitted_slave_rename_olddir.find(diri);
+ ceph_assert(it != uncommitted_slave_rename_olddir.end());
+ it->second--;
+ if (it->second == 0) {
+ // last uncommitted update referencing this dir: try to trim its non-auth subtrees
+ uncommitted_slave_rename_olddir.erase(it);
+ list<CDir*> ls;
+ diri->get_dirfrags(ls);
+ for (list<CDir*>::iterator q = ls.begin(); q != ls.end(); ++q) {
+ CDir *root = get_subtree_root(*q);
+ if (root->get_dir_auth() == CDIR_AUTH_UNDEF) {
+ try_trim_non_auth_subtree(root);
+ // stop once a dirfrag's subtree root turned out to be some other dirfrag
+ if (*q != root)
+ break;
+ }
+ }
+ } else
+ ceph_assert(it->second > 0);
+ }
+ // removed the inodes that were unlinked by slave update
+ for(set<CInode*>::iterator p = su->unlinked.begin(); p != su->unlinked.end(); ++p) {
+ CInode *in = *p;
+ map<CInode*, int>::iterator it = uncommitted_slave_unlink.find(in);
+ ceph_assert(it != uncommitted_slave_unlink.end());
+ it->second--;
+ if (it->second == 0) {
+ uncommitted_slave_unlink.erase(it);
+ // only drop the inode if it has no projected parent dentry
+ if (!in->get_projected_parent_dn())
+ mds->mdcache->remove_inode_recursive(in);
+ } else
+ ceph_assert(it->second > 0);
+ }
+ delete su;
+}
+
+// Look up the MDSlaveUpdate tracked for `reqid`.
+// Returns nullptr when reqid is not tracked or is tracked for a different
+// master; the stored pointer itself may also be null.
+MDSlaveUpdate* MDCache::get_uncommitted_slave(metareqid_t reqid, mds_rank_t master)
+{
+  auto found = uncommitted_slaves.find(reqid);
+  if (found == uncommitted_slaves.end())
+    return nullptr;
+
+  uslave &entry = found->second;
+  if (entry.master != master)
+    return nullptr;
+  return entry.su;
+}
+
+// A rollback for `reqid` has completed. reqid must be present in
+// resolve_need_rollback (asserted). The uncommitted slave record is
+// finished first, then the pending-rollback entry is dropped and the
+// slave-resolve completion check is re-run.
+void MDCache::finish_rollback(metareqid_t reqid, MDRequestRef& mdr) {
+  auto pending = resolve_need_rollback.find(reqid);
+  ceph_assert(pending != resolve_need_rollback.end());
+
+  const bool resolving = mds->is_resolve();
+  if (resolving)
+    finish_uncommitted_slave(reqid, false);
+  else if (mdr)
+    finish_uncommitted_slave(mdr->reqid, mdr->more()->slave_update_journaled);
+
+  resolve_need_rollback.erase(pending);
+  maybe_finish_slave_resolve();
+}
+
+// Settle the ambiguous imports that other ranks reported to us.
+// For each claimed base dirfrag still cached here with ambiguous or
+// undefined authority, authority over the bounded subtree is handed to
+// the claiming rank. The collected claims are cleared afterwards.
+void MDCache::disambiguate_other_imports()
+{
+  dout(10) << "disambiguate_other_imports" << dendl;
+
+  const bool survivor = mds->is_clientreplay() || mds->is_active() || mds->is_stopping();
+  const bool recovering = !survivor;
+
+  // other nodes' ambiguous imports
+  for (auto &claims : other_ambiguous_imports) {
+    mds_rank_t who = claims.first;
+    dout(10) << "ambiguous imports for mds." << who << dendl;
+
+    for (auto &claim : claims.second) {
+      auto &bounds = claim.second;
+      dout(10) << " ambiguous import " << claim.first << " bounds " << bounds << dendl;
+
+      // an ambiguous import will not race with a refragmentation; it's appropriate to force here.
+      CDir *dir = get_force_dirfrag(claim.first, recovering);
+      if (!dir)
+        continue;
+
+      // ambiguous auth: works for me_ambig or if i am a surviving bystander
+      // undefined auth: we are resolving
+      const bool claimable = dir->is_ambiguous_auth() ||
+                             dir->authority() == CDIR_AUTH_UNDEF;
+      if (!claimable) {
+        dout(10) << " mds." << who << " did not import " << *dir << dendl;
+        continue;
+      }
+
+      dout(10) << " mds." << who << " did import " << *dir << dendl;
+      adjust_bounded_subtree_auth(dir, bounds, who);
+      try_subtree_merge(dir);
+    }
+  }
+  other_ambiguous_imports.clear();
+}
+
+// Settle this rank's own ambiguous imports. Only does work while resolving;
+// otherwise my_ambiguous_imports must already be empty (asserted). Other
+// ranks' claims are applied first, then each of our imports is either
+// cancelled (someone else holds authority) or finished (authority is still
+// the me/me ambiguous pair), journaling an EImportFinish either way.
+void MDCache::disambiguate_my_imports()
+{
+ dout(10) << "disambiguate_my_imports" << dendl;
+
+ if (!mds->is_resolve()) {
+ ceph_assert(my_ambiguous_imports.empty());
+ return;
+ }
+
+ disambiguate_other_imports();
+
+ // my ambiguous imports
+ mds_authority_t me_ambig(mds->get_nodeid(), mds->get_nodeid());
+ // the loop drains the map from the front: both cancel_ambiguous_import() and
+ // finish_ambiguous_import() erase the entry they are given
+ while (!my_ambiguous_imports.empty()) {
+ map<dirfrag_t, vector<dirfrag_t> >::iterator q = my_ambiguous_imports.begin();
+
+ CDir *dir = get_dirfrag(q->first);
+ ceph_assert(dir);
+
+ if (dir->authority() != me_ambig) {
+ dout(10) << "ambiguous import auth known, must not be me " << *dir << dendl;
+ cancel_ambiguous_import(dir);
+
+ mds->mdlog->start_submit_entry(new EImportFinish(dir, false));
+
+ // subtree may have been swallowed by another node claiming dir
+ // as their own.
+ CDir *root = get_subtree_root(dir);
+ if (root != dir)
+ dout(10) << " subtree root is " << *root << dendl;
+ ceph_assert(root->dir_auth.first != mds->get_nodeid()); // no us!
+ try_trim_non_auth_subtree(root);
+ } else {
+ dout(10) << "ambiguous import auth unclaimed, must be me " << *dir << dendl;
+ finish_ambiguous_import(q->first);
+ mds->mdlog->start_submit_entry(new EImportFinish(dir, true));
+ }
+ }
+ ceph_assert(my_ambiguous_imports.empty());
+ // push out the EImportFinish entries submitted above
+ mds->mdlog->flush();
+
+ // verify all my subtrees are unambiguous!
+ for (map<CDir*,set<CDir*> >::iterator p = subtrees.begin();
+ p != subtrees.end();
+ ++p) {
+ CDir *dir = p->first;
+ // log the offending dir before the assert below fires
+ if (dir->is_ambiguous_dir_auth()) {
+ dout(0) << "disambiguate_imports uh oh, dir_auth is still ambiguous for " << *dir << dendl;
+ }
+ ceph_assert(!dir->is_ambiguous_dir_auth());
+ }
+
+ show_subtrees();
+}
+
+
+// Remember an ambiguous import of `base` with the given bounds.
+// `base` must not already be recorded (asserted).
+void MDCache::add_ambiguous_import(dirfrag_t base, const vector<dirfrag_t>& bounds)
+{
+  ceph_assert(my_ambiguous_imports.find(base) == my_ambiguous_imports.end());
+  my_ambiguous_imports[base] = bounds;
+}
+
+
+// Convenience overload: record an ambiguous import given cache objects.
+// The bounds are converted to dirfrag ids, any earlier record for the
+// same base is replaced, and the dirfrag_t overload does the insert.
+void MDCache::add_ambiguous_import(CDir *base, const set<CDir*>& bounds)
+{
+  // make a list
+  vector<dirfrag_t> bound_frags;
+  for (CDir *bound : bounds)
+    bound_frags.push_back(bound->dirfrag());
+
+  // note: this can get called twice if the exporter fails during recovery
+  // (erasing a key that is absent is a no-op)
+  my_ambiguous_imports.erase(base->dirfrag());
+
+  add_ambiguous_import(base->dirfrag(), bound_frags);
+}
+
+// Forget the ambiguous import recorded for `dir`; a record must exist
+// (asserted).
+void MDCache::cancel_ambiguous_import(CDir *dir)
+{
+  const dirfrag_t df = dir->dirfrag();
+  auto record = my_ambiguous_imports.find(df);
+  ceph_assert(record != my_ambiguous_imports.end());
+
+  dout(10) << "cancel_ambiguous_import " << df
+           << " bounds " << record->second
+           << " " << *dir
+           << dendl;
+  my_ambiguous_imports.erase(record);
+}
+
+// Resolve the ambiguous import of `df` in our favour: the record is
+// removed and authority over the bounded subtree is set to this rank.
+// Both the record and the cached dirfrag must exist (asserted).
+void MDCache::finish_ambiguous_import(dirfrag_t df)
+{
+  auto record = my_ambiguous_imports.find(df);
+  ceph_assert(record != my_ambiguous_imports.end());
+
+  // steal the bounds before dropping the record
+  vector<dirfrag_t> bounds;
+  bounds.swap(record->second);
+  my_ambiguous_imports.erase(record);
+
+  dout(10) << "finish_ambiguous_import " << df
+           << " bounds " << bounds
+           << dendl;
+
+  CDir *dir = get_dirfrag(df);
+  ceph_assert(dir);
+
+  // adjust dir_auth, import maps
+  adjust_bounded_subtree_auth(dir, bounds, mds->get_nodeid());
+  try_subtree_merge(dir);
+}
+
+/**
+ * Remove an inode and everything cached beneath it.
+ *
+ * For every dirfrag of `in`: each dentry is removed (primary-linked inodes
+ * are unlinked and removed recursively first), the dirfrag is dropped from
+ * the subtree map if it is a subtree root, and the dirfrag is closed.
+ * Finally `in` itself is removed.
+ *
+ * @param in  inode to remove
+ */
+void MDCache::remove_inode_recursive(CInode *in)
+{
+  dout(10) << "remove_inode_recursive " << *in << dendl;
+  list<CDir*> ls;
+  in->get_dirfrags(ls);
+  for (CDir *subdir : ls) {
+    // log the dirfrag itself, not its address, like the other log lines here
+    dout(10) << " removing dirfrag " << *subdir << dendl;
+    auto it = subdir->items.begin();
+    while (it != subdir->items.end()) {
+      CDentry *dn = it->second;
+      // step past dn before remove_dentry() below takes it out of the dirfrag
+      ++it;
+      CDentry::linkage_t *dnl = dn->get_linkage();
+      if (dnl->is_primary()) {
+        CInode *tin = dnl->get_inode();
+        subdir->unlink_inode(dn, false);
+        remove_inode_recursive(tin);
+      }
+      subdir->remove_dentry(dn);
+    }
+
+    if (subdir->is_subtree_root())
+      remove_subtree(subdir);
+    in->close_dirfrag(subdir->dirfrag().frag);
+  }
+  remove_inode(in);
+}
+
+/**
+ * Expire everything cached beneath a non-auth inode.
+ *
+ * Walks each dirfrag of `in`, recursing into primary-linked inodes, and
+ * hands every expireable dentry to trim_dentry().
+ *
+ * @param in         inode to expire under; must not be auth (asserted)
+ * @param expiremap  passed through to trim_dentry()
+ * @return true if the walk stopped early (a non-mdsdir inode still has a
+ *         subtree-root dirfrag, a non-stray inode still has nlink, or a
+ *         dentry is not expireable); false if the whole walk completed
+ */
+bool MDCache::expire_recursive(CInode *in, expiremap &expiremap)
+{
+  ceph_assert(!in->is_auth());
+
+  dout(10) << __func__ << ":" << *in << dendl;
+
+  // Recurse into any dirfrags beneath this inode
+  list<CDir*> ls;
+  in->get_dirfrags(ls);
+  for (auto subdir : ls) {
+    if (!in->is_mdsdir() && subdir->is_subtree_root()) {
+      dout(10) << __func__ << ": stray still has subtree " << *in << dendl;
+      return true;
+    }
+
+    // Use an explicit iterator and advance it before the dentry can be
+    // trimmed: trim_dentry() is expected to drop dn from subdir->items,
+    // which would invalidate an iterator still pointing at it (same idiom
+    // as the dentry loop in remove_inode_recursive).
+    for (auto it = subdir->items.begin(); it != subdir->items.end(); ) {
+      CDentry *dn = it->second;
+      ++it;
+      CDentry::linkage_t *dnl = dn->get_linkage();
+      if (dnl->is_primary()) {
+        CInode *tin = dnl->get_inode();
+
+        /* Remote strays with linkage (i.e. hardlinks) should not be
+         * expired, because they may be the target of
+         * a rename() as the owning MDS shuts down */
+        if (!tin->is_stray() && tin->inode.nlink) {
+          dout(10) << __func__ << ": stray still has linkage " << *tin << dendl;
+          return true;
+        }
+
+        const bool abort = expire_recursive(tin, expiremap);
+        if (abort) {
+          return true;
+        }
+      }
+      if (dn->lru_is_expireable()) {
+        trim_dentry(dn, expiremap);
+      } else {
+        dout(10) << __func__ << ": stray dn is not expireable " << *dn << dendl;
+        return true;
+      }
+    }
+  }
+
+  return false;
+}
+
+// Remove every cached inode that has no parent dentry and is not a base
+// inode. Candidates are collected first and removed in a second pass;
+// the heartbeat is reset once per 1000 steps across both passes.
+void MDCache::trim_unlinked_inodes()
+{
+  dout(7) << "trim_unlinked_inodes" << dendl;
+
+  int steps = 0;
+  auto tick = [&]() {
+    if (++steps % 1000 == 0)
+      mds->heartbeat_reset();
+  };
+
+  vector<CInode*> victims;
+  for (auto &entry : inode_map) {
+    CInode *in = entry.second;
+    const bool unlinked = !in->get_parent_dn() && !in->is_base();
+    if (unlinked) {
+      dout(7) << " will trim from " << *in << dendl;
+      victims.push_back(in);
+    }
+    tick();
+  }
+
+  for (CInode *in : victims) {
+    remove_inode_recursive(in);
+    tick();
+  }
+}
+
+/** recalc_auth_bits()
+ * once subtree auth is disambiguated, we need to adjust all the
+ * auth and dirty bits in our cache before moving on.
+ *
+ * Sets or clears STATE_AUTH on the root inode, on mdsdir inodes, and on
+ * every dirfrag, dentry and primary-linked inode reachable from each
+ * subtree root, according to whether this rank is the authority.
+ *
+ * @param replay  when false, non-auth objects are additionally flagged
+ *                STATE_REJOINING and have their dirty state cleared, and
+ *                empty non-auth, non-subtree-root dirfrags are closed;
+ *                when true only the AUTH bits are touched
+ */
+void MDCache::recalc_auth_bits(bool replay)
+{
+ dout(7) << "recalc_auth_bits " << (replay ? "(replay)" : "") << dendl;
+
+ // root inode: authority comes from the mdsmap
+ if (root) {
+ root->inode_auth.first = mds->mdsmap->get_root();
+ bool auth = mds->get_nodeid() == root->inode_auth.first;
+ if (auth) {
+ root->state_set(CInode::STATE_AUTH);
+ } else {
+ root->state_clear(CInode::STATE_AUTH);
+ if (!replay)
+ root->state_set(CInode::STATE_REJOINING);
+ }
+ }
+
+ // inodes whose dirfrag is a subtree root owned by this rank; their
+ // scatterlocks are left alone further down
+ set<CInode*> subtree_inodes;
+ for (map<CDir*,set<CDir*> >::iterator p = subtrees.begin();
+ p != subtrees.end();
+ ++p) {
+ if (p->first->dir_auth.first == mds->get_nodeid())
+ subtree_inodes.insert(p->first->inode);
+ }
+
+ for (map<CDir*,set<CDir*> >::iterator p = subtrees.begin();
+ p != subtrees.end();
+ ++p) {
+ // mdsdir inodes: auth iff it is this rank's own mdsdir
+ if (p->first->inode->is_mdsdir()) {
+ CInode *in = p->first->inode;
+ bool auth = in->ino() == MDS_INO_MDSDIR(mds->get_nodeid());
+ if (auth) {
+ in->state_set(CInode::STATE_AUTH);
+ } else {
+ in->state_clear(CInode::STATE_AUTH);
+ if (!replay)
+ in->state_set(CInode::STATE_REJOINING);
+ }
+ }
+
+ list<CDir*> dfq; // dirfrag queue
+ dfq.push_back(p->first);
+
+ // one auth decision per subtree, applied to everything queued beneath its root
+ bool auth = p->first->authority().first == mds->get_nodeid();
+ dout(10) << " subtree auth=" << auth << " for " << *p->first << dendl;
+
+ while (!dfq.empty()) {
+ CDir *dir = dfq.front();
+ dfq.pop_front();
+
+ // dir
+ if (auth) {
+ dir->state_set(CDir::STATE_AUTH);
+ } else {
+ dir->state_clear(CDir::STATE_AUTH);
+ if (!replay) {
+ // close empty non-auth dirfrag
+ if (!dir->is_subtree_root() && dir->get_num_any() == 0) {
+ dir->inode->close_dirfrag(dir->get_frag());
+ continue;
+ }
+ dir->state_set(CDir::STATE_REJOINING);
+ dir->state_clear(CDir::STATE_COMPLETE);
+ if (dir->is_dirty())
+ dir->mark_clean();
+ }
+ }
+
+ // dentries in this dir
+ // (this `p` shadows the subtree iterator of the enclosing loop)
+ for (auto &p : dir->items) {
+ // dn
+ CDentry *dn = p.second;
+ CDentry::linkage_t *dnl = dn->get_linkage();
+ if (auth) {
+ dn->state_set(CDentry::STATE_AUTH);
+ } else {
+ dn->state_clear(CDentry::STATE_AUTH);
+ if (!replay) {
+ dn->state_set(CDentry::STATE_REJOINING);
+ if (dn->is_dirty())
+ dn->mark_clean();
+ }
+ }
+
+ if (dnl->is_primary()) {
+ // inode
+ CInode *in = dnl->get_inode();
+ if (auth) {
+ in->state_set(CInode::STATE_AUTH);
+ } else {
+ in->state_clear(CInode::STATE_AUTH);
+ if (!replay) {
+ in->state_set(CInode::STATE_REJOINING);
+ if (in->is_dirty())
+ in->mark_clean();
+ if (in->is_dirty_parent())
+ in->clear_dirty_parent();
+ // avoid touching scatterlocks for our subtree roots!
+ if (subtree_inodes.count(in) == 0)
+ in->clear_scatter_dirty();
+ }
+ }
+ // recurse?
+ // queue the directory's nested dirfrags so they are processed with the same auth value
+ if (in->is_dir())
+ in->get_nested_dirfrags(dfq);
+ }
+ }
+ }
+ }
+
+ show_subtrees();
+ show_cache();
+}
+
+
+
+// ===========================================================================
+// REJOIN
+
+/*
+ * notes on scatterlock recovery:
+ *
+ * - recovering inode replica sends scatterlock data for any subtree
+ * roots (the only ones that are possibly dirty).
+ *
+ * - surviving auth incorporates any provided scatterlock data. any
+ * pending gathers are then finished, as with the other lock types.
+ *
+ * that takes care of surviving auth + (recovering replica)*.
+ *
+ * - surviving replica sends strong_inode, which includes current
+ * scatterlock state, AND any dirty scatterlock data. this
+ * provides the recovering auth with everything it might need.
+ *
+ * - recovering auth must pick initial scatterlock state based on
+ * (weak|strong) rejoins.
+ * - always assimilate scatterlock data (it can't hurt)
+ * - any surviving replica in SCATTER state -> SCATTER. otherwise, SYNC.
+ * - include base inode in ack for all inodes that saw scatterlock content
+ *
+ * also, for scatter gather,
+ *
+ * - auth increments {frag,r}stat.version on completion of any gather.
+ *
+ * - auth incorporates changes in a gather _only_ if the version
+ * matches.
+ *
+ * - replica discards changes any time the scatterlock syncs, and
+ * after recovery.
+ */
+
+// Dump rejoin progress into `f` as one "rejoin_status" object section
+// holding the rejoin_gather and rejoin_ack_gather sets (streamed) and the
+// cap_imports_num_opening counter.
+void MDCache::dump_rejoin_status(Formatter *f) const
+{
+ f->open_object_section("rejoin_status");
+ f->dump_stream("rejoin_gather") << rejoin_gather;
+ f->dump_stream("rejoin_ack_gather") << rejoin_ack_gather;
+ f->dump_unsigned("num_opening_inodes", cap_imports_num_opening);
+ f->close_section();
+}
+
+// Enter the rejoin stage. Takes ownership of the completion context
+// (none may be pending already, asserted), seeds rejoin_gather from the
+// recovery set plus this rank, and starts processing imported caps.
+void MDCache::rejoin_start(MDSContext *rejoin_done_)
+{
+  dout(10) << "rejoin_start" << dendl;
+
+  ceph_assert(!rejoin_done);
+  rejoin_done.reset(rejoin_done_);
+
+  const mds_rank_t whoami = mds->get_nodeid();
+  rejoin_gather = recovery_set;
+  // our own rank stays in the gather set: opening cap inodes has to
+  // finish before cache rejoins are sent
+  rejoin_gather.insert(whoami);
+
+  process_imported_caps();
+}
+
+/*
+ * rejoin phase!
+ *
+ * this initiates rejoin. it should be called before we get any
+ * rejoin or rejoin_ack messages (or else mdsmap distribution is broken).
+ *
+ * we start out by sending rejoins to everyone in the recovery set.
+ *
+ * if we are rejoin, send for all regions in our cache.
+ * if we are active|stopping, send only to nodes that are rejoining.
+ *
+ * the send is postponed (rejoins_pending is set and we return) while our
+ * own rank is still in rejoin_gather or while resolve_gather is non-empty.
+ */
+void MDCache::rejoin_send_rejoins()
+{
+ dout(10) << "rejoin_send_rejoins with recovery_set " << recovery_set << dendl;
+
+ if (rejoin_gather.count(mds->get_nodeid())) {
+ dout(7) << "rejoin_send_rejoins still processing imported caps, delaying" << dendl;
+ rejoins_pending = true;
+ return;
+ }
+ if (!resolve_gather.empty()) {
+ dout(7) << "rejoin_send_rejoins still waiting for resolves ("
+ << resolve_gather << ")" << dendl;
+ rejoins_pending = true;
+ return;
+ }
+
+ ceph_assert(!migrator->is_importing());
+ ceph_assert(!migrator->is_exporting());
+
+ if (!mds->is_rejoin()) {
+ disambiguate_other_imports();
+ }
+
+ // one outgoing message per target rank, filled in by the sections below
+ map<mds_rank_t, MMDSCacheRejoin::ref> rejoins;
+
+
+ // if i am rejoining, send a rejoin to everyone.
+ // otherwise, just send to others who are rejoining.
+ for (set<mds_rank_t>::iterator p = recovery_set.begin();
+ p != recovery_set.end();
+ ++p) {
+ if (*p == mds->get_nodeid()) continue; // nothing to myself!
+ if (rejoin_sent.count(*p)) continue; // already sent a rejoin to this node!
+ if (mds->is_rejoin())
+ rejoins[*p] = MMDSCacheRejoin::create(MMDSCacheRejoin::OP_WEAK);
+ else if (mds->mdsmap->is_rejoin(*p))
+ rejoins[*p] = MMDSCacheRejoin::create(MMDSCacheRejoin::OP_STRONG);
+ }
+
+ // rejoining only: attach cap exports, plus the client instance/metadata
+ // of every client that still has a session, to the target's message
+ if (mds->is_rejoin()) {
+ // client -> (session or null, set of targets that need this client's info)
+ map<client_t, pair<Session*, set<mds_rank_t> > > client_exports;
+ for (auto& p : cap_exports) {
+ mds_rank_t target = p.second.first;
+ if (rejoins.count(target) == 0)
+ continue;
+ for (auto q = p.second.second.begin(); q != p.second.second.end(); ) {
+ Session *session = nullptr;
+ auto it = client_exports.find(q->first);
+ if (it != client_exports.end()) {
+ session = it->second.first;
+ if (session)
+ it->second.second.insert(target);
+ } else {
+ // first time this client is seen: look up its session and cache the result (even if null)
+ session = mds->sessionmap.get_session(entity_name_t::CLIENT(q->first.v));
+ auto& r = client_exports[q->first];
+ r.first = session;
+ if (session)
+ r.second.insert(target);
+ }
+ if (session) {
+ ++q;
+ } else {
+ // remove reconnect with no session
+ p.second.second.erase(q++);
+ }
+ }
+ rejoins[target]->cap_exports[p.first] = p.second.second;
+ }
+ for (auto& p : client_exports) {
+ Session *session = p.second.first;
+ // the target set is only populated when session is non-null, so the dereference below is safe
+ for (auto& q : p.second.second) {
+ auto rejoin = rejoins[q];
+ rejoin->client_map[p.first] = session->info.inst;
+ rejoin->client_metadata_map[p.first] = session->info.client_metadata;
+ }
+ }
+ }
+
+
+ // check all subtrees
+ for (map<CDir*, set<CDir*> >::iterator p = subtrees.begin();
+ p != subtrees.end();
+ ++p) {
+ CDir *dir = p->first;
+ ceph_assert(dir->is_subtree_root());
+ if (dir->is_ambiguous_dir_auth()) {
+ // exporter is recovering, importer is survivor.
+ ceph_assert(rejoins.count(dir->authority().first));
+ ceph_assert(!rejoins.count(dir->authority().second));
+ continue;
+ }
+
+ // my subtree?
+ if (dir->is_auth())
+ continue; // skip my own regions!
+
+ mds_rank_t auth = dir->get_dir_auth().first;
+ ceph_assert(auth >= 0);
+ if (rejoins.count(auth) == 0)
+ continue; // don't care about this node's subtrees
+
+ rejoin_walk(dir, rejoins[auth]);
+ }
+
+ // rejoin root inodes, too
+ // (root goes only into the message for rank 0; each target also gets its own mdsdir inode if cached)
+ for (auto &p : rejoins) {
+ if (mds->is_rejoin()) {
+ // weak
+ if (p.first == 0 && root) {
+ p.second->add_weak_inode(root->vino());
+ if (root->is_dirty_scattered()) {
+ dout(10) << " sending scatterlock state on root " << *root << dendl;
+ p.second->add_scatterlock_state(root);
+ }
+ }
+ // NOTE(review): the inner `if (in)` repeats the test already made by the enclosing condition
+ if (CInode *in = get_inode(MDS_INO_MDSDIR(p.first))) {
+ if (in)
+ p.second->add_weak_inode(in->vino());
+ }
+ } else {
+ // strong
+ if (p.first == 0 && root) {
+ p.second->add_strong_inode(root->vino(),
+ root->get_replica_nonce(),
+ root->get_caps_wanted(),
+ root->filelock.get_state(),
+ root->nestlock.get_state(),
+ root->dirfragtreelock.get_state());
+ root->state_set(CInode::STATE_REJOINING);
+ if (root->is_dirty_scattered()) {
+ dout(10) << " sending scatterlock state on root " << *root << dendl;
+ p.second->add_scatterlock_state(root);
+ }
+ }
+
+ if (CInode *in = get_inode(MDS_INO_MDSDIR(p.first))) {
+ p.second->add_strong_inode(in->vino(),
+ in->get_replica_nonce(),
+ in->get_caps_wanted(),
+ in->filelock.get_state(),
+ in->nestlock.get_state(),
+ in->dirfragtreelock.get_state());
+ in->state_set(CInode::STATE_REJOINING);
+ }
+ }
+ }
+
+ if (!mds->is_rejoin()) {
+ // i am survivor. send strong rejoin.
+ // note request remote_auth_pins, xlocks
+ for (ceph::unordered_map<metareqid_t, MDRequestRef>::iterator p = active_requests.begin();
+ p != active_requests.end();
+ ++p) {
+ MDRequestRef& mdr = p->second;
+ // slave requests are skipped
+ if (mdr->is_slave())
+ continue;
+ // auth pins
+ for (const auto& q : mdr->remote_auth_pins) {
+ if (!q.first->is_auth()) {
+ ceph_assert(q.second == q.first->authority().first);
+ if (rejoins.count(q.second) == 0) continue;
+ const MMDSCacheRejoin::ref &rejoin = rejoins[q.second];
+
+ dout(15) << " " << *mdr << " authpin on " << *q.first << dendl;
+ MDSCacheObjectInfo i;
+ q.first->set_object_info(i);
+ // a non-zero ino is reported as an inode authpin, otherwise as a dentry authpin
+ if (i.ino)
+ rejoin->add_inode_authpin(vinodeno_t(i.ino, i.snapid), mdr->reqid, mdr->attempt);
+ else
+ rejoin->add_dentry_authpin(i.dirfrag, i.dname, i.snapid, mdr->reqid, mdr->attempt);
+
+ if (mdr->has_more() && mdr->more()->is_remote_frozen_authpin &&
+ mdr->more()->rename_inode == q.first)
+ rejoin->add_inode_frozen_authpin(vinodeno_t(i.ino, i.snapid),
+ mdr->reqid, mdr->attempt);
+ }
+ }
+ // xlocks
+ for (const auto& q : mdr->locks) {
+ auto lock = q.lock;
+ auto obj = lock->get_parent();
+ if (q.is_xlock() && !obj->is_auth()) {
+ mds_rank_t who = obj->authority().first;
+ if (rejoins.count(who) == 0) continue;
+ const MMDSCacheRejoin::ref &rejoin = rejoins[who];
+
+ dout(15) << " " << *mdr << " xlock on " << *lock << " " << *obj << dendl;
+ MDSCacheObjectInfo i;
+ obj->set_object_info(i);
+ if (i.ino)
+ rejoin->add_inode_xlock(vinodeno_t(i.ino, i.snapid), lock->get_type(),
+ mdr->reqid, mdr->attempt);
+ else
+ rejoin->add_dentry_xlock(i.dirfrag, i.dname, i.snapid,
+ mdr->reqid, mdr->attempt);
+ } else if (q.is_remote_wrlock()) {
+ // remote wrlocks are reported to the rank recorded as the wrlock target
+ mds_rank_t who = q.wrlock_target;
+ if (rejoins.count(who) == 0) continue;
+ const MMDSCacheRejoin::ref &rejoin = rejoins[who];
+
+ dout(15) << " " << *mdr << " wrlock on " << *lock << " " << *obj << dendl;
+ MDSCacheObjectInfo i;
+ obj->set_object_info(i);
+ ceph_assert(i.ino);
+ rejoin->add_inode_wrlock(vinodeno_t(i.ino, i.snapid), lock->get_type(),
+ mdr->reqid, mdr->attempt);
+ }
+ }
+ }
+ }
+
+ // send the messages
+ for (auto &p : rejoins) {
+ ceph_assert(rejoin_sent.count(p.first) == 0);
+ ceph_assert(rejoin_ack_gather.count(p.first) == 0);
+ rejoin_sent.insert(p.first);
+ rejoin_ack_gather.insert(p.first);
+ mds->send_message_mds(p.second, p.first);
+ }
+ rejoin_ack_gather.insert(mds->get_nodeid()); // we need to complete rejoin_gather_finish, too
+ rejoins_pending = false;
+
+ // nothing?
+ if (mds->is_rejoin() && rejoin_gather.empty()) {
+ dout(10) << "nothing to rejoin" << dendl;
+ rejoin_gather_finish();
+ }
+}
+
+
+/**
+ * rejoin_walk - build rejoin declarations for a subtree
+ *
+ * @param dir subtree root
+ * @param rejoin rejoin message
+ *
+ * from a rejoining node:
+ * weak dirfrag
+ * weak dentries (w/ connectivity)
+ *
+ * from a surviving node:
+ * strong dirfrag
+ * strong dentries (no connectivity!)
+ * strong inodes
+ *
+ * In the surviving-node case the walk also mutates the cache: the dirfrag,
+ * its dentries and primary inodes are flagged STATE_REJOINING, and
+ * expireable snapshotted dentries are removed instead of being declared.
+ * Nested dirfrags are collected while walking and recursed into afterwards.
+ */
+void MDCache::rejoin_walk(CDir *dir, const MMDSCacheRejoin::ref &rejoin)
+{
+ dout(10) << "rejoin_walk " << *dir << dendl;
+
+ list<CDir*> nested; // finish this dir, then do nested items
+
+ if (mds->is_rejoin()) {
+ // WEAK
+ rejoin->add_weak_dirfrag(dir->dirfrag());
+ for (auto &p : dir->items) {
+ CDentry *dn = p.second;
+ // every dentry here must be a head (non-snapshot) dentry with a primary link to a directory
+ ceph_assert(dn->last == CEPH_NOSNAP);
+ CDentry::linkage_t *dnl = dn->get_linkage();
+ dout(15) << " add_weak_primary_dentry " << *dn << dendl;
+ ceph_assert(dnl->is_primary());
+ CInode *in = dnl->get_inode();
+ ceph_assert(dnl->get_inode()->is_dir());
+ rejoin->add_weak_primary_dentry(dir->ino(), dn->get_name(), dn->first, dn->last, in->ino());
+ in->get_nested_dirfrags(nested);
+ if (in->is_dirty_scattered()) {
+ dout(10) << " sending scatterlock state on " << *in << dendl;
+ rejoin->add_scatterlock_state(in);
+ }
+ }
+ } else {
+ // STRONG
+ dout(15) << " add_strong_dirfrag " << *dir << dendl;
+ rejoin->add_strong_dirfrag(dir->dirfrag(), dir->get_replica_nonce(), dir->get_dir_rep());
+ dir->state_set(CDir::STATE_REJOINING);
+
+ for (auto it = dir->items.begin(); it != dir->items.end(); ) {
+ CDentry *dn = it->second;
+ // advance first: dn may be removed from dir->items below
+ ++it;
+ dn->state_set(CDentry::STATE_REJOINING);
+ CDentry::linkage_t *dnl = dn->get_linkage();
+ CInode *in = dnl->is_primary() ? dnl->get_inode() : NULL;
+
+ // trim snap dentries. because they may have been pruned by
+ // their auth mds (snap deleted)
+ if (dn->last != CEPH_NOSNAP) {
+ if (in && !in->remote_parents.empty()) {
+ // unlink any stale remote snap dentry.
+ for (auto it2 = in->remote_parents.begin(); it2 != in->remote_parents.end(); ) {
+ CDentry *remote_dn = *it2;
+ // advance first; presumably unlink_remote() drops remote_dn from remote_parents -- TODO confirm
+ ++it2;
+ ceph_assert(remote_dn->last != CEPH_NOSNAP);
+ remote_dn->unlink_remote(remote_dn->get_linkage());
+ }
+ }
+ if (dn->lru_is_expireable()) {
+ if (!dnl->is_null())
+ dir->unlink_inode(dn, false);
+ if (in)
+ remove_inode(in);
+ dir->remove_dentry(dn);
+ continue;
+ } else {
+ // Inventing null/remote dentry shouldn't cause problem
+ ceph_assert(!dnl->is_primary());
+ }
+ }
+
+ dout(15) << " add_strong_dentry " << *dn << dendl;
+ // the primary ino or the remote ino/d_type is filled in depending on the linkage; the other fields are 0
+ rejoin->add_strong_dentry(dir->dirfrag(), dn->get_name(), dn->first, dn->last,
+ dnl->is_primary() ? dnl->get_inode()->ino():inodeno_t(0),
+ dnl->is_remote() ? dnl->get_remote_ino():inodeno_t(0),
+ dnl->is_remote() ? dnl->get_remote_d_type():0,
+ dn->get_replica_nonce(),
+ dn->lock.get_state());
+ dn->state_set(CDentry::STATE_REJOINING);
+ if (dnl->is_primary()) {
+ CInode *in = dnl->get_inode();
+ dout(15) << " add_strong_inode " << *in << dendl;
+ rejoin->add_strong_inode(in->vino(),
+ in->get_replica_nonce(),
+ in->get_caps_wanted(),
+ in->filelock.get_state(),
+ in->nestlock.get_state(),
+ in->dirfragtreelock.get_state());
+ in->state_set(CInode::STATE_REJOINING);
+ in->get_nested_dirfrags(nested);
+ if (in->is_dirty_scattered()) {
+ dout(10) << " sending scatterlock state on " << *in << dendl;
+ rejoin->add_scatterlock_state(in);
+ }
+ }
+ }
+ }
+
+ // recurse into nested dirs
+ for (list<CDir*>::iterator p = nested.begin();
+ p != nested.end();
+ ++p)
+ rejoin_walk(*p, rejoin);
+}
+
+
+/*
+ * Entry point for an incoming cache rejoin message: log it, then hand
+ * it to the handler matching its op (weak, strong or ack). Any other
+ * op aborts.
+ *
+ * per the notes carried with this dispatcher:
+ * - we reply with the lockstate
+ * - if i am active|stopping, the source is removed from the replica
+ *   list for everything not referenced in the message.
+ */
+void MDCache::handle_cache_rejoin(const MMDSCacheRejoin::const_ref &m)
+{
+  dout(7) << "handle_cache_rejoin " << *m << " from " << m->get_source()
+          << " (" << m->get_payload().length() << " bytes)"
+          << dendl;
+
+  const auto op = m->op;
+  if (op == MMDSCacheRejoin::OP_WEAK) {
+    handle_cache_rejoin_weak(m);
+  } else if (op == MMDSCacheRejoin::OP_STRONG) {
+    handle_cache_rejoin_strong(m);
+  } else if (op == MMDSCacheRejoin::OP_ACK) {
+    handle_cache_rejoin_ack(m);
+  } else {
+    ceph_abort();
+  }
+}
+
+
+/*
+ * handle_cache_rejoin_weak
+ *
+ * the sender
+ * - is recovering from their journal.
+ * - may have incorrect (out of date) inode contents
+ * - will include weak dirfrag if sender is dirfrag auth and parent inode auth is recipient
+ *
+ * if the sender didn't trim_non_auth(), they
+ * - may have incorrect (out of date) dentry/inode linkage
+ * - may have deleted/purged inodes
+ * and i may have to go to disk to get accurate inode contents. yuck.
+ */
+void MDCache::handle_cache_rejoin_weak(const MMDSCacheRejoin::const_ref &weak)
+{
+ mds_rank_t from = mds_rank_t(weak->get_source().num());
+
+ // possible response(s)
+ MMDSCacheRejoin::ref ack; // if survivor
+ set<vinodeno_t> acked_inodes; // if survivor
+ set<SimpleLock *> gather_locks; // if survivor
+ bool survivor = false; // am i a survivor?
+
+ if (mds->is_clientreplay() || mds->is_active() || mds->is_stopping()) {
+ survivor = true;
+ dout(10) << "i am a surivivor, and will ack immediately" << dendl;
+ ack = MMDSCacheRejoin::create(MMDSCacheRejoin::OP_ACK);
+
+ map<inodeno_t,map<client_t,Capability::Import> > imported_caps;
+
+ // check cap exports
+ for (auto p = weak->cap_exports.begin(); p != weak->cap_exports.end(); ++p) {
+ CInode *in = get_inode(p->first);
+ ceph_assert(!in || in->is_auth());
+ for (auto q = p->second.begin(); q != p->second.end(); ++q) {
+ dout(10) << " claiming cap import " << p->first << " client." << q->first << " on " << *in << dendl;
+ Capability *cap = rejoin_import_cap(in, q->first, q->second, from);
+ Capability::Import& im = imported_caps[p->first][q->first];
+ if (cap) {
+ im.cap_id = cap->get_cap_id();
+ im.issue_seq = cap->get_last_seq();
+ im.mseq = cap->get_mseq();
+ } else {
+ // all are zero
+ }
+ }
+ mds->locker->eval(in, CEPH_CAP_LOCKS, true);
+ }
+
+ encode(imported_caps, ack->imported_caps);
+ } else {
+ ceph_assert(mds->is_rejoin());
+
+ // we may have already received a strong rejoin from the sender.
+ rejoin_scour_survivor_replicas(from, NULL, acked_inodes, gather_locks);
+ ceph_assert(gather_locks.empty());
+
+ // check cap exports.
+ rejoin_client_map.insert(weak->client_map.begin(), weak->client_map.end());
+ rejoin_client_metadata_map.insert(weak->client_metadata_map.begin(),
+ weak->client_metadata_map.end());
+
+ for (auto p = weak->cap_exports.begin(); p != weak->cap_exports.end(); ++p) {
+ CInode *in = get_inode(p->first);
+ ceph_assert(!in || in->is_auth());
+ // note
+ for (auto q = p->second.begin(); q != p->second.end(); ++q) {
+ dout(10) << " claiming cap import " << p->first << " client." << q->first << dendl;
+ cap_imports[p->first][q->first][from] = q->second;
+ }
+ }
+ }
+
+ // assimilate any potentially dirty scatterlock state
+ for (const auto &p : weak->inode_scatterlocks) {
+ CInode *in = get_inode(p.first);
+ ceph_assert(in);
+ in->decode_lock_state(CEPH_LOCK_IFILE, p.second.file);
+ in->decode_lock_state(CEPH_LOCK_INEST, p.second.nest);
+ in->decode_lock_state(CEPH_LOCK_IDFT, p.second.dft);
+ if (!survivor)
+ rejoin_potential_updated_scatterlocks.insert(in);
+ }
+
+ // recovering peer may send incorrect dirfrags here. we need to
+ // infer which dirfrag they meant. the ack will include a
+ // strong_dirfrag that will set them straight on the fragmentation.
+
+ // walk weak map
+ set<CDir*> dirs_to_share;
+ for (const auto &p : weak->weak_dirfrags) {
+ CInode *diri = get_inode(p.ino);
+ if (!diri)
+ dout(0) << " missing dir ino " << p.ino << dendl;
+ ceph_assert(diri);
+
+ frag_vec_t leaves;
+ if (diri->dirfragtree.is_leaf(p.frag)) {
+ leaves.push_back(p.frag);
+ } else {
+ diri->dirfragtree.get_leaves_under(p.frag, leaves);
+ if (leaves.empty())
+ leaves.push_back(diri->dirfragtree[p.frag.value()]);
+ }
+ for (const auto& leaf : leaves) {
+ CDir *dir = diri->get_dirfrag(leaf);
+ if (!dir) {
+ dout(0) << " missing dir for " << p.frag << " (which maps to " << leaf << ") on " << *diri << dendl;
+ continue;
+ }
+ ceph_assert(dir);
+ if (dirs_to_share.count(dir)) {
+ dout(10) << " already have " << p.frag << " -> " << leaf << " " << *dir << dendl;
+ } else {
+ dirs_to_share.insert(dir);
+ unsigned nonce = dir->add_replica(from);
+ dout(10) << " have " << p.frag << " -> " << leaf << " " << *dir << dendl;
+ if (ack) {
+ ack->add_strong_dirfrag(dir->dirfrag(), nonce, dir->dir_rep);
+ ack->add_dirfrag_base(dir);
+ }
+ }
+ }
+ }
+
+ for (const auto &p : weak->weak) {
+ CInode *diri = get_inode(p.first);
+ if (!diri)
+ dout(0) << " missing dir ino " << p.first << dendl;
+ ceph_assert(diri);
+
+ // weak dentries
+ CDir *dir = 0;
+ for (const auto &q : p.second) {
+ // locate proper dirfrag.
+ // optimize for common case (one dirfrag) to avoid dirs_to_share set check
+ frag_t fg = diri->pick_dirfrag(q.first.name);
+ if (!dir || dir->get_frag() != fg) {
+ dir = diri->get_dirfrag(fg);
+ if (!dir)
+ dout(0) << " missing dir frag " << fg << " on " << *diri << dendl;
+ ceph_assert(dir);
+ ceph_assert(dirs_to_share.count(dir));
+ }
+
+ // and dentry
+ CDentry *dn = dir->lookup(q.first.name, q.first.snapid);
+ ceph_assert(dn);
+ CDentry::linkage_t *dnl = dn->get_linkage();
+ ceph_assert(dnl->is_primary());
+
+ if (survivor && dn->is_replica(from))
+ dentry_remove_replica(dn, from, gather_locks);
+ unsigned dnonce = dn->add_replica(from);
+ dout(10) << " have " << *dn << dendl;
+ if (ack)
+ ack->add_strong_dentry(dir->dirfrag(), dn->get_name(), dn->first, dn->last,
+ dnl->get_inode()->ino(), inodeno_t(0), 0,
+ dnonce, dn->lock.get_replica_state());
+
+ // inode
+ CInode *in = dnl->get_inode();
+ ceph_assert(in);
+
+ if (survivor && in->is_replica(from))
+ inode_remove_replica(in, from, true, gather_locks);
+ unsigned inonce = in->add_replica(from);
+ dout(10) << " have " << *in << dendl;
+
+ // scatter the dirlock, just in case?
+ if (!survivor && in->is_dir() && in->has_subtree_root_dirfrag())
+ in->filelock.set_state(LOCK_MIX);
+
+ if (ack) {
+ acked_inodes.insert(in->vino());
+ ack->add_inode_base(in, mds->mdsmap->get_up_features());
+ bufferlist bl;
+ in->_encode_locks_state_for_rejoin(bl, from);
+ ack->add_inode_locks(in, inonce, bl);
+ }
+ }
+ }
+
+ // weak base inodes? (root, stray, etc.)
+ for (set<vinodeno_t>::iterator p = weak->weak_inodes.begin();
+ p != weak->weak_inodes.end();
+ ++p) {
+ CInode *in = get_inode(*p);
+ ceph_assert(in); // hmm fixme wrt stray?
+ if (survivor && in->is_replica(from))
+ inode_remove_replica(in, from, true, gather_locks);
+ unsigned inonce = in->add_replica(from);
+ dout(10) << " have base " << *in << dendl;
+
+ if (ack) {
+ acked_inodes.insert(in->vino());
+ ack->add_inode_base(in, mds->mdsmap->get_up_features());
+ bufferlist bl;
+ in->_encode_locks_state_for_rejoin(bl, from);
+ ack->add_inode_locks(in, inonce, bl);
+ }
+ }
+
+ ceph_assert(rejoin_gather.count(from));
+ rejoin_gather.erase(from);
+ if (survivor) {
+ // survivor. do everything now.
+ for (const auto &p : weak->inode_scatterlocks) {
+ CInode *in = get_inode(p.first);
+ ceph_assert(in);
+ dout(10) << " including base inode (due to potential scatterlock update) " << *in << dendl;
+ acked_inodes.insert(in->vino());
+ ack->add_inode_base(in, mds->mdsmap->get_up_features());
+ }
+
+ rejoin_scour_survivor_replicas(from, ack, acked_inodes, gather_locks);
+ mds->send_message(ack, weak->get_connection());
+
+ for (set<SimpleLock*>::iterator p = gather_locks.begin(); p != gather_locks.end(); ++p) {
+ if (!(*p)->is_stable())
+ mds->locker->eval_gather(*p);
+ }
+ } else {
+ // done?
+ if (rejoin_gather.empty() && rejoin_ack_gather.count(mds->get_nodeid())) {
+ rejoin_gather_finish();
+ } else {
+ dout(7) << "still need rejoin from (" << rejoin_gather << ")" << dendl;
+ }
+ }
+}
+
+/*
+ * rejoin_scour_survivor_replicas - remove source from replica list on unmentioned objects
+ *
+ * all validated replicas are acked with a strong nonce, etc.  anything the
+ * ack does not mention is treated as no longer replicated by 'from', and
+ * 'from' is dropped from that object's replica map.
+ *
+ *  from          rank of the surviving peer being scoured
+ *  ack           rejoin ack being built for that peer (may be null, in which
+ *                case every replica held by 'from' is removed)
+ *  acked_inodes  inodes already included in the ack
+ *  gather_locks  passed through to the inode/dentry replica removal helpers
+ */
+void MDCache::rejoin_scour_survivor_replicas(mds_rank_t from, const MMDSCacheRejoin::const_ref &ack,
+                                             set<vinodeno_t>& acked_inodes,
+                                             set<SimpleLock *>& gather_locks)
+{
+  dout(10) << "rejoin_scour_survivor_replicas from mds." << from << dendl;
+
+  auto scour = [this, from, ack, &acked_inodes, &gather_locks] (CInode *in) {
+    // the inode itself
+    if (in->is_auth() && in->is_replica(from)) {
+      const bool unmentioned = !ack || acked_inodes.count(in->vino()) == 0;
+      if (unmentioned) {
+        inode_remove_replica(in, from, false, gather_locks);
+        dout(10) << " rem " << *in << dendl;
+      }
+    }
+
+    // only directories have dirfrags/dentries to look at
+    if (!in->is_dir())
+      return;
+
+    list<CDir*> frags;
+    in->get_dirfrags(frags);
+    for (CDir *dir : frags) {
+      if (!dir->is_auth())
+        continue;
+
+      // the dirfrag
+      if (dir->is_replica(from)) {
+        const bool unmentioned = !ack || ack->strong_dirfrags.count(dir->dirfrag()) == 0;
+        if (unmentioned) {
+          dir->remove_replica(from);
+          dout(10) << " rem " << *dir << dendl;
+        }
+      }
+
+      // its dentries
+      for (auto &entry : dir->items) {
+        CDentry *dn = entry.second;
+        if (!dn->is_replica(from))
+          continue;
+
+        // keep the replica only if the ack carries a strong dentry for it
+        bool mentioned = false;
+        if (ack) {
+          const auto found = ack->strong_dentries.find(dir->dirfrag());
+          mentioned = found != ack->strong_dentries.end() &&
+                      found->second.count(string_snap_t(dn->get_name(), dn->last)) > 0;
+        }
+        if (mentioned)
+          continue;
+
+        dentry_remove_replica(dn, from, gather_locks);
+        dout(10) << " rem " << *dn << dendl;
+      }
+    }
+  };
+
+  // head inodes first, then snapped inodes
+  for (auto &entry : inode_map)
+    scour(entry.second);
+  for (auto &entry : snap_inode_map)
+    scour(entry.second);
+}
+
+
+// Create a placeholder inode for (ino, last) that a rejoin message refers to
+// but that is not in our cache.  The inode is flagged STATE_REJOINUNDEF, added
+// to the cache and recorded in rejoin_undef_inodes; the new inode is returned.
+CInode *MDCache::rejoin_invent_inode(inodeno_t ino, snapid_t last)
+{
+  CInode *placeholder = new CInode(this, true, 1, last);
+  placeholder->inode.ino = ino;
+  placeholder->state_set(CInode::STATE_REJOINUNDEF);
+
+  add_inode(placeholder);
+  rejoin_undef_inodes.insert(placeholder);
+
+  dout(10) << " invented " << *placeholder << dendl;
+  return placeholder;
+}
+
+// Create (or open) a placeholder dirfrag for df, inventing the containing
+// inode as well when it is not cached.  The dirfrag is flagged
+// STATE_REJOINUNDEF and recorded in rejoin_undef_dirfrags before being
+// returned.
+CDir *MDCache::rejoin_invent_dirfrag(dirfrag_t df)
+{
+  CInode *diri = get_inode(df.ino);
+  if (diri == nullptr)
+    diri = rejoin_invent_inode(df.ino, CEPH_NOSNAP);
+
+  if (!diri->is_dir()) {
+    // only an invented (undefined) inode may still lack the directory type;
+    // give it just enough directory state to hold dirfrags.
+    ceph_assert(diri->state_test(CInode::STATE_REJOINUNDEF));
+    diri->inode.mode = S_IFDIR;
+    diri->inode.dir_layout.dl_dir_hash = g_conf()->mds_default_dir_hash;
+  }
+
+  CDir *frag = diri->get_or_open_dirfrag(this, df.frag);
+  frag->state_set(CDir::STATE_REJOINUNDEF);
+  rejoin_undef_dirfrags.insert(frag);
+
+  dout(10) << " invented " << *frag << dendl;
+  return frag;
+}
+
+/**
+ * handle_cache_rejoin_strong() -- process a strong rejoin from a peer
+ *
+ * Only valid while this mds is in (or about to enter) rejoin: if we are not
+ * yet in rejoin but want to be, the message is retried later; otherwise we
+ * abort.
+ *
+ * For every dirfrag, dentry and inode the sender lists we register the sender
+ * as a replica (inventing REJOINUNDEF placeholders for objects we do not have
+ * cached), and re-create the auth pins, xlocks and wrlocks it reports on
+ * behalf of slave requests.  Finally the sender is removed from rejoin_gather
+ * and, if that was the last one we were waiting for, rejoin_gather_finish()
+ * is called.
+ *
+ * @param strong  the strong rejoin message; its source rank is the sender
+ */
+void MDCache::handle_cache_rejoin_strong(const MMDSCacheRejoin::const_ref &strong)
+{
+  mds_rank_t from = mds_rank_t(strong->get_source().num());
+
+  // only a recovering node will get a strong rejoin.
+  if (!mds->is_rejoin()) {
+    if (mds->get_want_state() == MDSMap::STATE_REJOIN) {
+      mds->wait_for_rejoin(new C_MDS_RetryMessage(mds, strong));
+      return;
+    }
+    ceph_abort_msg("got unexpected rejoin message during recovery");
+  }
+
+  // assimilate any potentially dirty scatterlock state
+  for (const auto &p : strong->inode_scatterlocks) {
+    CInode *in = get_inode(p.first);
+    ceph_assert(in);
+    in->decode_lock_state(CEPH_LOCK_IFILE, p.second.file);
+    in->decode_lock_state(CEPH_LOCK_INEST, p.second.nest);
+    in->decode_lock_state(CEPH_LOCK_IDFT, p.second.dft);
+    rejoin_potential_updated_scatterlocks.insert(in);
+  }
+
+  // start from a clean slate for this sender; entries are added below when
+  // the sender's view of a primary dentry disagrees with ours.
+  rejoin_unlinked_inodes[from].clear();
+
+  // surviving peer may send incorrect dirfrag here (maybe they didn't
+  // get the fragment notify, or maybe we rolled back?).  we need to
+  // infer the right frag and get them with the program.  somehow.
+  // we don't normally send ACK.. so we'll need to bundle this with
+  // MISSING or something.
+
+  // strong dirfrags/dentries.
+  //  also process auth_pins, xlocks.
+  for (const auto &p : strong->strong_dirfrags) {
+    auto& dirfrag = p.first;
+    CInode *diri = get_inode(dirfrag.ino);
+    if (!diri)
+      diri = rejoin_invent_inode(dirfrag.ino, CEPH_NOSNAP);
+    CDir *dir = diri->get_dirfrag(dirfrag.frag);
+    bool refragged = false;
+    if (dir) {
+      dout(10) << " have " << *dir << dendl;
+    } else {
+      // an undefined inode has no known fragmentation: use the root frag.
+      // otherwise only invent the exact frag if our fragtree agrees it is a leaf.
+      if (diri->state_test(CInode::STATE_REJOINUNDEF))
+        dir = rejoin_invent_dirfrag(dirfrag_t(diri->ino(), frag_t()));
+      else if (diri->dirfragtree.is_leaf(dirfrag.frag))
+        dir = rejoin_invent_dirfrag(dirfrag);
+    }
+    if (dir) {
+      dir->add_replica(from, p.second.nonce);
+      dir->dir_rep = p.second.dir_rep;
+    } else {
+      // sender's frag does not match our fragtree: register the sender on
+      // every leaf the frag maps to, and look dentries up per-name below.
+      dout(10) << " frag " << dirfrag << " doesn't match dirfragtree " << *diri << dendl;
+      frag_vec_t leaves;
+      diri->dirfragtree.get_leaves_under(dirfrag.frag, leaves);
+      if (leaves.empty())
+        leaves.push_back(diri->dirfragtree[dirfrag.frag.value()]);
+      dout(10) << " maps to frag(s) " << leaves << dendl;
+      for (const auto& leaf : leaves) {
+        CDir *dir = diri->get_dirfrag(leaf);
+        if (!dir)
+          dir = rejoin_invent_dirfrag(dirfrag_t(diri->ino(), leaf));
+        else
+          dout(10) << " have(approx) " << *dir << dendl;
+        dir->add_replica(from, p.second.nonce);
+        dir->dir_rep = p.second.dir_rep;
+      }
+      refragged = true;
+    }
+
+    const auto it = strong->strong_dentries.find(dirfrag);
+    if (it != strong->strong_dentries.end()) {
+      const map<string_snap_t,MMDSCacheRejoin::dn_strong>& dmap = it->second;
+      for (const auto &q : dmap) {
+        const string_snap_t& ss = q.first;
+        const MMDSCacheRejoin::dn_strong& d = q.second;
+        CDentry *dn;
+        if (!refragged)
+          dn = dir->lookup(ss.name, ss.snapid);
+        else {
+          // pick the dirfrag that holds this name under our fragmentation
+          frag_t fg = diri->pick_dirfrag(ss.name);
+          dir = diri->get_dirfrag(fg);
+          ceph_assert(dir);
+          dn = dir->lookup(ss.name, ss.snapid);
+        }
+        if (!dn) {
+          // we don't have the dentry: invent one matching the sender's linkage
+          if (d.is_remote()) {
+            dn = dir->add_remote_dentry(ss.name, d.remote_ino, d.remote_d_type, d.first, ss.snapid);
+          } else if (d.is_null()) {
+            dn = dir->add_null_dentry(ss.name, d.first, ss.snapid);
+          } else {
+            CInode *in = get_inode(d.ino, ss.snapid);
+            if (!in) in = rejoin_invent_inode(d.ino, ss.snapid);
+            dn = dir->add_primary_dentry(ss.name, in, d.first, ss.snapid);
+          }
+          dout(10) << " invented " << *dn << dendl;
+        }
+        CDentry::linkage_t *dnl = dn->get_linkage();
+
+        // dn auth_pin?
+        const auto pinned_it = strong->authpinned_dentries.find(dirfrag);
+        if (pinned_it != strong->authpinned_dentries.end()) {
+          const auto slave_reqid_it = pinned_it->second.find(ss);
+          if (slave_reqid_it != pinned_it->second.end()) {
+            for (const auto &r : slave_reqid_it->second) {
+              dout(10) << " dn authpin by " << r << " on " << *dn << dendl;
+
+              // get/create slave mdrequest
+              MDRequestRef mdr;
+              if (have_request(r.reqid))
+                mdr = request_get(r.reqid);
+              else
+                mdr = request_start_slave(r.reqid, r.attempt, strong);
+              mdr->auth_pin(dn);
+            }
+          }
+        }
+
+        // dn xlock?
+        const auto xlocked_it = strong->xlocked_dentries.find(dirfrag);
+        if (xlocked_it != strong->xlocked_dentries.end()) {
+          const auto ss_req_it = xlocked_it->second.find(ss);
+          if (ss_req_it != xlocked_it->second.end()) {
+            const MMDSCacheRejoin::slave_reqid& r = ss_req_it->second;
+            dout(10) << " dn xlock by " << r << " on " << *dn << dendl;
+            MDRequestRef mdr = request_get(r.reqid);  // should have this from auth_pin above.
+            ceph_assert(mdr->is_auth_pinned(dn));
+            // take the versionlock xlock once per request, then the dentry lock
+            if (!mdr->is_xlocked(&dn->versionlock)) {
+              ceph_assert(dn->versionlock.can_xlock_local());
+              dn->versionlock.get_xlock(mdr, mdr->get_client());
+              mdr->locks.emplace(&dn->versionlock, MutationImpl::LockOp::XLOCK);
+            }
+            if (dn->lock.is_stable())
+              dn->auth_pin(&dn->lock);
+            dn->lock.set_state(LOCK_XLOCK);
+            dn->lock.get_xlock(mdr, mdr->get_client());
+            mdr->locks.emplace(&dn->lock, MutationImpl::LockOp::XLOCK);
+          }
+        }
+
+        dn->add_replica(from, d.nonce);
+        dout(10) << " have " << *dn << dendl;
+
+        // compare our linkage with the sender's; remember inodes the sender
+        // still believes are linked here so they can be dealt with later.
+        if (dnl->is_primary()) {
+          if (d.is_primary()) {
+            if (vinodeno_t(d.ino, ss.snapid) != dnl->get_inode()->vino()) {
+              // the survivor missed MDentryUnlink+MDentryLink messages ?
+              ceph_assert(strong->strong_inodes.count(dnl->get_inode()->vino()) == 0);
+              CInode *in = get_inode(d.ino, ss.snapid);
+              ceph_assert(in);
+              ceph_assert(in->get_parent_dn());
+              rejoin_unlinked_inodes[from].insert(in);
+              dout(7) << " sender has primary dentry but wrong inode" << dendl;
+            }
+          } else {
+            // the survivor missed MDentryLink message ?
+            ceph_assert(strong->strong_inodes.count(dnl->get_inode()->vino()) == 0);
+            dout(7) << " sender doesn't have primay dentry" << dendl;
+          }
+        } else {
+          if (d.is_primary()) {
+            // the survivor missed MDentryUnlink message ?
+            CInode *in = get_inode(d.ino, ss.snapid);
+            ceph_assert(in);
+            ceph_assert(in->get_parent_dn());
+            rejoin_unlinked_inodes[from].insert(in);
+            dout(7) << " sender has primary dentry but we don't" << dendl;
+          }
+        }
+      }
+    }
+  }
+
+  // strong inodes: replica registration, wanted caps, lock state, pins, xlocks
+  for (const auto &p : strong->strong_inodes) {
+    CInode *in = get_inode(p.first);
+    ceph_assert(in);
+    in->add_replica(from, p.second.nonce);
+    dout(10) << " have " << *in << dendl;
+
+    const MMDSCacheRejoin::inode_strong& is = p.second;
+
+    // caps_wanted
+    if (is.caps_wanted) {
+      in->set_mds_caps_wanted(from, is.caps_wanted);
+      dout(15) << " inode caps_wanted " << ccap_string(is.caps_wanted)
+               << " on " << *in << dendl;
+    }
+
+    // scatterlocks?
+    //  infer state from replica state:
+    //   * go to MIX if they might have wrlocks
+    //   * go to LOCK if they are LOCK (just bc identify_files_to_recover might start twiddling filelock)
+    in->filelock.infer_state_from_strong_rejoin(is.filelock, !in->is_dir());  // maybe also go to LOCK
+    in->nestlock.infer_state_from_strong_rejoin(is.nestlock, false);
+    in->dirfragtreelock.infer_state_from_strong_rejoin(is.dftlock, false);
+
+    // auth pin?
+    const auto authpinned_inodes_it = strong->authpinned_inodes.find(in->vino());
+    if (authpinned_inodes_it != strong->authpinned_inodes.end()) {
+      for (const auto& r : authpinned_inodes_it->second) {
+        dout(10) << " inode authpin by " << r << " on " << *in << dendl;
+
+        // get/create slave mdrequest
+        MDRequestRef mdr;
+        if (have_request(r.reqid))
+          mdr = request_get(r.reqid);
+        else
+          mdr = request_start_slave(r.reqid, r.attempt, strong);
+        if (strong->frozen_authpin_inodes.count(in->vino())) {
+          ceph_assert(!in->get_num_auth_pins());
+          mdr->freeze_auth_pin(in);
+        } else {
+          ceph_assert(!in->is_frozen_auth_pin());
+        }
+        mdr->auth_pin(in);
+      }
+    }
+    // xlock(s)?
+    const auto xlocked_inodes_it = strong->xlocked_inodes.find(in->vino());
+    if (xlocked_inodes_it != strong->xlocked_inodes.end()) {
+      for (const auto &q : xlocked_inodes_it->second) {
+        SimpleLock *lock = in->get_lock(q.first);
+        dout(10) << " inode xlock by " << q.second << " on " << *lock << " on " << *in << dendl;
+        MDRequestRef mdr = request_get(q.second.reqid);  // should have this from auth_pin above.
+        ceph_assert(mdr->is_auth_pinned(in));
+        if (!mdr->is_xlocked(&in->versionlock)) {
+          ceph_assert(in->versionlock.can_xlock_local());
+          in->versionlock.get_xlock(mdr, mdr->get_client());
+          mdr->locks.emplace(&in->versionlock, MutationImpl::LockOp::XLOCK);
+        }
+        if (lock->is_stable())
+          in->auth_pin(lock);
+        lock->set_state(LOCK_XLOCK);
+        if (lock == &in->filelock)
+          in->loner_cap = -1;
+        lock->get_xlock(mdr, mdr->get_client());
+        mdr->locks.emplace(lock, MutationImpl::LockOp::XLOCK);
+      }
+    }
+  }
+  // wrlock(s)?
+  for (const auto &p : strong->wrlocked_inodes) {
+    CInode *in = get_inode(p.first);
+    // fail with a clear assertion (like the loops above) rather than a null
+    // dereference if the sender names an inode we do not have.
+    ceph_assert(in);
+    for (const auto &q : p.second) {
+      SimpleLock *lock = in->get_lock(q.first);
+      for (const auto &r : q.second) {
+        dout(10) << " inode wrlock by " << r << " on " << *lock << " on " << *in << dendl;
+        MDRequestRef mdr = request_get(r.reqid);  // should have this from auth_pin above.
+        if (in->is_auth())
+          ceph_assert(mdr->is_auth_pinned(in));
+        lock->set_state(LOCK_MIX);
+        if (lock == &in->filelock)
+          in->loner_cap = -1;
+        lock->get_wrlock(true);
+        mdr->locks.emplace(lock, MutationImpl::LockOp::WRLOCK);
+      }
+    }
+  }
+
+  // done?
+  ceph_assert(rejoin_gather.count(from));
+  rejoin_gather.erase(from);
+  if (rejoin_gather.empty() && rejoin_ack_gather.count(mds->get_nodeid())) {
+    rejoin_gather_finish();
+  } else {
+    dout(7) << "still need rejoin from (" << rejoin_gather << ")" << dendl;
+  }
+}
+
+/**
+ * handle_cache_rejoin_ack() -- process a rejoin ack from an auth peer
+ *
+ * Adopts the state the sender lists for the objects we replicate from it:
+ * dirfrags and dentries (fixing up fragmentation and linkage, and creating
+ * barebones placeholders where needed), full dirfrag/inode bases, inode lock
+ * state and nonces.  Caps the sender reports as imported are confirmed to the
+ * clients with cap export messages, and snaprealms whose seq changed are
+ * invalidated/updated.  The sender is then removed from rejoin_ack_gather.
+ *
+ * We are treated as a survivor when we are not in the rejoin state; a
+ * survivor queues rejoin_waiters at the end, a recovering node evaluates
+ * pending locks and opens snaprealms once both gather sets are empty.
+ *
+ * @param ack  the rejoin ack message; its source rank is the sender
+ */
+void MDCache::handle_cache_rejoin_ack(const MMDSCacheRejoin::const_ref &ack)
+{
+  dout(7) << "handle_cache_rejoin_ack from " << ack->get_source() << dendl;
+  mds_rank_t from = mds_rank_t(ack->get_source().num());
+
+  ceph_assert(mds->get_state() >= MDSMap::STATE_REJOIN);
+  bool survivor = !mds->is_rejoin();
+
+  // for sending cache expire message
+  set<CInode*> isolated_inodes;
+  set<CInode*> refragged_inodes;
+  list<pair<CInode*,int> > updated_realms;
+
+  // dirs
+  for (const auto &p : ack->strong_dirfrags) {
+    // we may have had incorrect dir fragmentation; refragment based
+    // on what they auth tells us.
+    CDir *dir = get_dirfrag(p.first);
+    if (!dir) {
+      dir = get_force_dirfrag(p.first, false);
+      if (dir)
+        refragged_inodes.insert(dir->get_inode());
+    }
+    if (!dir) {
+      CInode *diri = get_inode(p.first.ino);
+      if (!diri) {
+        // barebones inode; the full inode loop below will clean up.
+        diri = new CInode(this, false);
+        diri->inode.ino = p.first.ino;
+        diri->inode.mode = S_IFDIR;
+        diri->inode.dir_layout.dl_dir_hash = g_conf()->mds_default_dir_hash;
+        add_inode(diri);
+        if (MDS_INO_MDSDIR(from) == p.first.ino) {
+          diri->inode_auth = mds_authority_t(from, CDIR_AUTH_UNKNOWN);
+          dout(10) << " add inode " << *diri << dendl;
+        } else {
+          // not yet linked anywhere; must be linked by a dentry below
+          // (see the isolated_inodes assertion further down).
+          diri->inode_auth = CDIR_AUTH_DEFAULT;
+          isolated_inodes.insert(diri);
+          dout(10) << " unconnected dirfrag " << p.first << dendl;
+        }
+      }
+      // barebones dirfrag; the full dirfrag loop below will clean up.
+      dir = diri->add_dirfrag(new CDir(diri, p.first.frag, this, false));
+      if (MDS_INO_MDSDIR(from) == p.first.ino ||
+          (dir->authority() != CDIR_AUTH_UNDEF &&
+           dir->authority().first != from))
+        adjust_subtree_auth(dir, from);
+      dout(10) << " add dirfrag " << *dir << dendl;
+    }
+
+    dir->set_replica_nonce(p.second.nonce);
+    dir->state_clear(CDir::STATE_REJOINING);
+    dout(10) << " got " << *dir << dendl;
+
+    // dentries
+    auto it = ack->strong_dentries.find(p.first);
+    if (it != ack->strong_dentries.end()) {
+      for (const auto &q : it->second) {
+        CDentry *dn = dir->lookup(q.first.name, q.first.snapid);
+        if(!dn)
+          dn = dir->add_null_dentry(q.first.name, q.second.first, q.first.snapid);
+
+        CDentry::linkage_t *dnl = dn->get_linkage();
+
+        ceph_assert(dn->last == q.first.snapid);
+        if (dn->first != q.second.first) {
+          dout(10) << " adjust dn.first " << dn->first << " -> " << q.second.first << " on " << *dn << dendl;
+          dn->first = q.second.first;
+        }
+
+        // may have bad linkage if we missed dentry link/unlink messages
+        if (dnl->is_primary()) {
+          CInode *in = dnl->get_inode();
+          if (!q.second.is_primary() ||
+              vinodeno_t(q.second.ino, q.first.snapid) != in->vino()) {
+            dout(10) << " had bad linkage for " << *dn << ", unlinking " << *in << dendl;
+            dir->unlink_inode(dn);
+          }
+        } else if (dnl->is_remote()) {
+          if (!q.second.is_remote() ||
+              q.second.remote_ino != dnl->get_remote_ino() ||
+              q.second.remote_d_type != dnl->get_remote_d_type()) {
+            dout(10) << " had bad linkage for " << *dn << dendl;
+            dir->unlink_inode(dn);
+          }
+        } else {
+          if (!q.second.is_null())
+            dout(10) << " had bad linkage for " << *dn << dendl;
+        }
+
+        // hmm, did we have the proper linkage here?
+        if (dnl->is_null() && !q.second.is_null()) {
+          if (q.second.is_remote()) {
+            dn->dir->link_remote_inode(dn, q.second.remote_ino, q.second.remote_d_type);
+          } else {
+            CInode *in = get_inode(q.second.ino, q.first.snapid);
+            if (!in) {
+              // barebones inode; assume it's dir, the full inode loop below will clean up.
+              in = new CInode(this, false, q.second.first, q.first.snapid);
+              in->inode.ino = q.second.ino;
+              in->inode.mode = S_IFDIR;
+              in->inode.dir_layout.dl_dir_hash = g_conf()->mds_default_dir_hash;
+              add_inode(in);
+              dout(10) << " add inode " << *in << dendl;
+            } else if (in->get_parent_dn()) {
+              dout(10) << " had bad linkage for " << *(in->get_parent_dn())
+                       << ", unlinking " << *in << dendl;
+              in->get_parent_dir()->unlink_inode(in->get_parent_dn());
+            }
+            dn->dir->link_primary_inode(dn, in);
+            // now reachable through this dentry
+            isolated_inodes.erase(in);
+          }
+        }
+
+        dn->set_replica_nonce(q.second.nonce);
+        dn->lock.set_state_rejoin(q.second.lock, rejoin_waiters, survivor);
+        dn->state_clear(CDentry::STATE_REJOINING);
+        dout(10) << " got " << *dn << dendl;
+      }
+    }
+  }
+
+  // close non-auth dirfrags of refragmented inodes that the ack did not mention
+  for (set<CInode*>::iterator p = refragged_inodes.begin();
+       p != refragged_inodes.end();
+       ++p) {
+    list<CDir*> ls;
+    (*p)->get_nested_dirfrags(ls);
+    for (list<CDir*>::iterator q = ls.begin(); q != ls.end(); ++q) {
+      if ((*q)->is_auth() || ack->strong_dirfrags.count((*q)->dirfrag()))
+        continue;
+      ceph_assert((*q)->get_num_any() == 0);
+      (*p)->close_dirfrag((*q)->get_frag());
+    }
+  }
+
+  // full dirfrags
+  for (const auto &p : ack->dirfrag_bases) {
+    CDir *dir = get_dirfrag(p.first);
+    ceph_assert(dir);
+    auto q = p.second.cbegin();
+    dir->_decode_base(q);
+    dout(10) << " got dir replica " << *dir << dendl;
+  }
+
+  // full inodes
+  auto p = ack->inode_base.cbegin();
+  while (!p.end()) {
+    inodeno_t ino;
+    snapid_t last;
+    bufferlist basebl;
+    decode(ino, p);
+    decode(last, p);
+    decode(basebl, p);
+    CInode *in = get_inode(ino, last);
+    ceph_assert(in);
+    auto q = basebl.cbegin();
+    // remember the snaprealm seq so a change made by the decode is noticed
+    snapid_t sseq = 0;
+    if (in->snaprealm)
+      sseq = in->snaprealm->srnode.seq;
+    in->_decode_base(q);
+    if (in->snaprealm && in->snaprealm->srnode.seq != sseq) {
+      int snap_op = sseq > 0 ? CEPH_SNAP_OP_UPDATE : CEPH_SNAP_OP_SPLIT;
+      updated_realms.push_back(pair<CInode*,int>(in, snap_op));
+    }
+    dout(10) << " got inode base " << *in << dendl;
+  }
+
+  // inodes
+  p = ack->inode_locks.cbegin();
+  //dout(10) << "inode_locks len " << ack->inode_locks.length() << " is " << ack->inode_locks << dendl;
+  while (!p.end()) {
+    inodeno_t ino;
+    snapid_t last;
+    __u32 nonce;
+    bufferlist lockbl;
+    decode(ino, p);
+    decode(last, p);
+    decode(nonce, p);
+    decode(lockbl, p);
+
+    CInode *in = get_inode(ino, last);
+    ceph_assert(in);
+    in->set_replica_nonce(nonce);
+    auto q = lockbl.cbegin();
+    in->_decode_locks_rejoin(q, rejoin_waiters, rejoin_eval_locks, survivor);
+    in->state_clear(CInode::STATE_REJOINING);
+    dout(10) << " got inode locks " << *in << dendl;
+  }
+
+  // FIXME: This can happen if entire subtree, together with the inode subtree root
+  // belongs to, were trimmed between sending cache rejoin and receiving rejoin ack.
+  ceph_assert(isolated_inodes.empty());
+
+  // caps the sender imported from us: tell each client its cap was exported
+  map<inodeno_t,map<client_t,Capability::Import> > peer_imported;
+  auto bp = ack->imported_caps.cbegin();
+  decode(peer_imported, bp);
+
+  for (map<inodeno_t,map<client_t,Capability::Import> >::iterator p = peer_imported.begin();
+       p != peer_imported.end();
+       ++p) {
+    auto& ex = cap_exports.at(p->first);
+    ceph_assert(ex.first == from);
+    for (map<client_t,Capability::Import>::iterator q = p->second.begin();
+         q != p->second.end();
+         ++q) {
+      auto r = ex.second.find(q->first);
+      ceph_assert(r != ex.second.end());
+
+      dout(10) << " exporting caps for client." << q->first << " ino " << p->first << dendl;
+      Session *session = mds->sessionmap.get_session(entity_name_t::CLIENT(q->first.v));
+      if (!session) {
+        // q->first is the client; p->first is the inode number
+        dout(10) << " no session for client." << q->first << dendl;
+        ex.second.erase(r);
+        continue;
+      }
+
+      // mark client caps stale.
+      auto m = MClientCaps::create(CEPH_CAP_OP_EXPORT, p->first, 0,
+                                   r->second.capinfo.cap_id, 0,
+                                   mds->get_osd_epoch_barrier());
+      m->set_cap_peer(q->second.cap_id, q->second.issue_seq, q->second.mseq,
+                      (q->second.cap_id > 0 ? from : -1), 0);
+      mds->send_message_client_counted(m, session);
+
+      ex.second.erase(r);
+    }
+    ceph_assert(ex.second.empty());
+  }
+
+  for (auto p : updated_realms) {
+    CInode *in = p.first;
+    bool notify_clients;
+    if (mds->is_rejoin()) {
+      if (!rejoin_pending_snaprealms.count(in)) {
+        in->get(CInode::PIN_OPENINGSNAPPARENTS);
+        rejoin_pending_snaprealms.insert(in);
+      }
+      notify_clients = false;
+    } else {
+      // notify clients if I'm survivor
+      notify_clients = true;
+    }
+    do_realm_invalidate_and_update_notify(in, p.second, notify_clients);
+  }
+
+  // done?
+  ceph_assert(rejoin_ack_gather.count(from));
+  rejoin_ack_gather.erase(from);
+  if (!survivor) {
+    if (rejoin_gather.empty()) {
+      // eval unstable scatter locks after all wrlocks are rejoined.
+      while (!rejoin_eval_locks.empty()) {
+        SimpleLock *lock = rejoin_eval_locks.front();
+        rejoin_eval_locks.pop_front();
+        if (!lock->is_stable())
+          mds->locker->eval_gather(lock);
+      }
+    }
+
+    if (rejoin_gather.empty() &&     // make sure we've gotten our FULL inodes, too.
+        rejoin_ack_gather.empty()) {
+      // finally, kickstart past snap parent opens
+      open_snaprealms();
+    } else {
+      dout(7) << "still need rejoin from (" << rejoin_gather << ")"
+              << ", rejoin_ack from (" << rejoin_ack_gather << ")" << dendl;
+    }
+  } else {
+    // survivor.
+    mds->queue_waiters(rejoin_waiters);
+  }
+}
+
+/**
+ * rejoin_trim_undef_inodes() -- remove REJOINUNDEF flagged inodes
+ *
+ * Drains rejoin_undef_inodes.  For each inode: clears its replica map, removes
+ * every dentry of every dirfrag and closes the dirfrags, then removes the
+ * inode's parent dentry if it has one, or the inode itself otherwise.
+ *
+ * FIXME: wait, can this actually happen?  a survivor should generate cache trim
+ * messages that clean these guys up...
+ */
+void MDCache::rejoin_trim_undef_inodes()
+{
+  dout(10) << "rejoin_trim_undef_inodes" << dendl;
+
+  while (!rejoin_undef_inodes.empty()) {
+    auto first = rejoin_undef_inodes.begin();
+    CInode *in = *first;
+    rejoin_undef_inodes.erase(first);
+
+    in->clear_replica_map();
+
+    // close out dirfrags
+    if (in->is_dir()) {
+      list<CDir*> dfls;
+      in->get_dirfrags(dfls);
+      for (CDir *dir : dfls) {
+        dir->clear_replica_map();
+
+        // step the iterator past dn *before* removing it: remove_dentry() is
+        // expected to erase dn from dir->items, which would invalidate an
+        // iterator still pointing at that entry.
+        for (auto it = dir->items.begin(); it != dir->items.end(); ) {
+          CDentry *dn = it->second;
+          ++it;
+          dn->clear_replica_map();
+
+          dout(10) << " trimming " << *dn << dendl;
+          dir->remove_dentry(dn);
+        }
+
+        dout(10) << " trimming " << *dir << dendl;
+        in->close_dirfrag(dir->dirfrag().frag);
+      }
+    }
+
+    CDentry *dn = in->get_parent_dn();
+    if (dn) {
+      dn->clear_replica_map();
+      dout(10) << " trimming " << *dn << dendl;
+      dn->dir->remove_dentry(dn);
+    } else {
+      dout(10) << " trimming " << *in << dendl;
+      remove_inode(in);
+    }
+  }
+
+  ceph_assert(rejoin_undef_inodes.empty());
+}
+
+// Run once every expected rejoin has been gathered while we are in rejoin.
+// Bails out early when open_undef_inodes_dirfrags() or process_imported_caps()
+// returns true (presumably because they started work that will re-enter here
+// when it completes -- confirm against those helpers).  Otherwise chooses lock
+// states, identifies files to recover, sends the rejoin acks, and removes our
+// own rank from rejoin_ack_gather, opening snaprealms if that leaves it empty.
+void MDCache::rejoin_gather_finish()
+{
+  dout(10) << "rejoin_gather_finish" << dendl;
+  ceph_assert(mds->is_rejoin());
+  ceph_assert(rejoin_ack_gather.count(mds->get_nodeid()));
+
+  // short-circuit keeps the original order: caps are only processed when
+  // no undef inode/dirfrag opens are pending.
+  if (open_undef_inodes_dirfrags() || process_imported_caps())
+    return;
+
+  choose_lock_states_and_reconnect_caps();
+  identify_files_to_recover();
+  rejoin_send_acks();
+
+  // signal completion of fetches, rejoin_gather_finish, etc.
+  rejoin_ack_gather.erase(mds->get_nodeid());
+
+  // still waiting on acks from peers?
+  if (!rejoin_ack_gather.empty())
+    return;
+
+  // finally, open snaprealms
+  open_snaprealms();
+}
+
+// Completion context that forwards the result of an inode open to
+// MDCache::rejoin_open_ino_finish() for the inode number it was created with.
+class C_MDC_RejoinOpenInoFinish: public MDCacheContext {
+public:
+  C_MDC_RejoinOpenInoFinish(MDCache *cache, inodeno_t target)
+    : MDCacheContext(cache), opened_ino(target) {}
+
+  void finish(int result) override {
+    mdcache->rejoin_open_ino_finish(opened_ino, result);
+  }
+
+private:
+  inodeno_t opened_ino;  // inode number handed back on completion
+};
+
+// Completion of an open_ino() issued by process_imported_caps().
+//
+//  ino  inode number that was being opened
+//  ret  negative on failure; otherwise a rank (compared against our own
+//       nodeid below)
+//
+// On failure the inode is recorded in cap_imports_missing.  If ret is our own
+// rank the inode must now be in cache.  Otherwise the pending cap imports for
+// the inode are re-exported to rank ret and dropped from cap_imports.  When
+// the last outstanding open completes, rejoin processing is resumed.
+void MDCache::rejoin_open_ino_finish(inodeno_t ino, int ret)
+{
+  // log under our real name (the old literal named a function that no longer exists)
+  dout(10) << __func__ << " ino " << ino << " ret " << ret << dendl;
+
+  if (ret < 0) {
+    cap_imports_missing.insert(ino);
+  } else if (ret == mds->get_nodeid()) {
+    ceph_assert(get_inode(ino));
+  } else {
+    auto p = cap_imports.find(ino);
+    ceph_assert(p != cap_imports.end());
+    for (auto &q : p->second) {
+      // each client must have exactly one entry, keyed by MDS_RANK_NONE
+      ceph_assert(q.second.count(MDS_RANK_NONE));
+      ceph_assert(q.second.size() == 1);
+      rejoin_export_caps(p->first, q.first, q.second[MDS_RANK_NONE], ret);
+    }
+    cap_imports.erase(p);
+  }
+
+  ceph_assert(cap_imports_num_opening > 0);
+  cap_imports_num_opening--;
+
+  if (cap_imports_num_opening == 0) {
+    if (rejoin_gather.empty())
+      rejoin_gather_finish();
+    else if (rejoin_gather.count(mds->get_nodeid()))
+      process_imported_caps();
+  }
+}
+
+// Log completion context used by process_imported_caps(): once the journal
+// entry is safe (r must be 0) it hands the collected session_map to
+// MDCache::rejoin_open_sessions_finish().
+class C_MDC_RejoinSessionsOpened : public MDCacheLogContext {
+public:
+  C_MDC_RejoinSessionsOpened(MDCache *cache) : MDCacheLogContext(cache) {}
+
+  // filled in by the creator before the context fires
+  map<client_t,pair<Session*,uint64_t> > session_map;
+
+  void finish(int result) override {
+    ceph_assert(result == 0);
+    mdcache->rejoin_open_sessions_finish(session_map);
+  }
+};
+
+// Called once the forced-open sessions have been journaled: finishes opening
+// them, takes over session_map as rejoin_session_map (the caller's map
+// receives the previous contents), and resumes rejoin_gather_finish() if no
+// rejoins are still outstanding.
+void MDCache::rejoin_open_sessions_finish(map<client_t,pair<Session*,uint64_t> >& session_map)
+{
+  dout(10) << "rejoin_open_sessions_finish" << dendl;
+
+  mds->server->finish_force_open_sessions(session_map);
+  session_map.swap(rejoin_session_map);
+
+  if (!rejoin_gather.empty())
+    return;
+  rejoin_gather_finish();
+}
+
+// Result of prefetching ino.  Only acts when there are pending cap imports
+// for the inode: a negative ret marks it missing; a ret naming another rank
+// re-exports the pending caps to that rank and drops them from cap_imports;
+// a ret equal to our own rank leaves the imports in place.
+void MDCache::rejoin_prefetch_ino_finish(inodeno_t ino, int ret)
+{
+  auto pending = cap_imports.find(ino);
+  if (pending == cap_imports.end())
+    return;
+
+  dout(10) << __func__ << " ino " << ino << " ret " << ret << dendl;
+
+  if (ret < 0) {
+    cap_imports_missing.insert(ino);
+    return;
+  }
+  if (ret == mds->get_nodeid())
+    return;
+
+  for (auto &per_client : pending->second) {
+    // each client must have exactly one entry, keyed by MDS_RANK_NONE
+    ceph_assert(per_client.second.count(MDS_RANK_NONE));
+    ceph_assert(per_client.second.size() == 1);
+    rejoin_export_caps(pending->first, per_client.first,
+                       per_client.second[MDS_RANK_NONE], ret);
+  }
+  cap_imports.erase(pending);
+}
+
+// Process cap imports gathered during rejoin.  Re-entered several times:
+// from the prefetch completion below, from rejoin_open_ino_finish(), and
+// from rejoin_gather_finish().
+//
+// Returns true when it has started asynchronous work and the caller must
+// wait (open file table prefetch, outstanding open_ino() calls, or a
+// journaled ESessions entry); returns false when there is nothing left to
+// wait for.
+bool MDCache::process_imported_caps()
+{
+ dout(10) << "process_imported_caps" << dendl;
+
+ // step 1: make sure the open file table has been prefetched; if a prefetch
+ // was started, come back here when it completes.
+ if (!open_file_table.is_prefetched() &&
+ open_file_table.prefetch_inodes()) {
+ open_file_table.wait_for_prefetch(
+ new MDSInternalContextWrapper(mds,
+ new FunctionContext([this](int r) {
+ ceph_assert(rejoin_gather.count(mds->get_nodeid()));
+ process_imported_caps();
+ })
+ )
+ );
+ return true;
+ }
+
+ // step 2: for every inode with pending cap imports that is neither cached
+ // nor already known to be missing, issue an open_ino().
+ for (auto p = cap_imports.begin(); p != cap_imports.end(); ++p) {
+ CInode *in = get_inode(p->first);
+ if (in) {
+ ceph_assert(in->is_auth());
+ cap_imports_missing.erase(p->first);
+ continue;
+ }
+ if (cap_imports_missing.count(p->first) > 0)
+ continue;
+
+ cap_imports_num_opening++;
+ dout(10) << " opening missing ino " << p->first << dendl;
+ open_ino(p->first, (int64_t)-1, new C_MDC_RejoinOpenInoFinish(this, p->first), false);
+ // reset the heartbeat every 1000 opens issued
+ if (!(cap_imports_num_opening % 1000))
+ mds->heartbeat_reset();
+ }
+
+ // rejoin_open_ino_finish() resumes processing once the count drops to zero
+ if (cap_imports_num_opening > 0)
+ return true;
+
+ // called by rejoin_gather_finish() ?
+ if (rejoin_gather.count(mds->get_nodeid()) == 0) {
+ // step 3a: force-open client sessions first (journaled); the completion
+ // context calls rejoin_open_sessions_finish(), which re-enters via
+ // rejoin_gather_finish().
+ if (!rejoin_client_map.empty() &&
+ rejoin_session_map.empty()) {
+ C_MDC_RejoinSessionsOpened *finish = new C_MDC_RejoinSessionsOpened(this);
+ version_t pv = mds->server->prepare_force_open_sessions(rejoin_client_map,
+ rejoin_client_metadata_map,
+ finish->session_map);
+ ESessions *le = new ESessions(pv, std::move(rejoin_client_map),
+ std::move(rejoin_client_metadata_map));
+ mds->mdlog->start_submit_entry(le, finish);
+ mds->mdlog->flush();
+ // the maps were moved from above; clear() leaves them in a known empty state
+ rejoin_client_map.clear();
+ rejoin_client_metadata_map.clear();
+ return true;
+ }
+
+ // process caps that were exported by slave rename
+ for (map<inodeno_t,pair<mds_rank_t,map<client_t,Capability::Export> > >::iterator p = rejoin_slave_exports.begin();
+ p != rejoin_slave_exports.end();
+ ++p) {
+ CInode *in = get_inode(p->first);
+ ceph_assert(in);
+ for (map<client_t,Capability::Export>::iterator q = p->second.second.begin();
+ q != p->second.second.end();
+ ++q) {
+ // skip clients for which no session was opened
+ auto r = rejoin_session_map.find(q->first);
+ if (r == rejoin_session_map.end())
+ continue;
+
+ Session *session = r->second.first;
+ Capability *cap = in->get_client_cap(q->first);
+ if (!cap) {
+ cap = in->add_client_cap(q->first, session);
+ // add empty item to reconnected_caps
+ (void)reconnected_caps[p->first][q->first];
+ }
+ cap->merge(q->second, true);
+
+ // the merged cap must agree with what was recorded as imported
+ Capability::Import& im = rejoin_imported_caps[p->second.first][p->first][q->first];
+ ceph_assert(cap->get_last_seq() == im.issue_seq);
+ ceph_assert(cap->get_mseq() == im.mseq);
+ cap->set_cap_id(im.cap_id);
+ // send cap import because we assigned a new cap ID
+ do_cap_import(session, in, cap, q->second.cap_id, q->second.seq, q->second.mseq - 1,
+ p->second.first, CEPH_CAP_FLAG_AUTH);
+ }
+ }
+ rejoin_slave_exports.clear();
+ rejoin_imported_caps.clear();
+
+ // process cap imports
+ // ino -> client -> frommds -> capex
+ for (auto p = cap_imports.begin(); p != cap_imports.end(); ) {
+ CInode *in = get_inode(p->first);
+ if (!in) {
+ dout(10) << " still missing ino " << p->first
+ << ", will try again after replayed client requests" << dendl;
+ ++p;
+ continue;
+ }
+ ceph_assert(in->is_auth());
+ for (auto q = p->second.begin(); q != p->second.end(); ++q) {
+ // session for this client, or nullptr if none was opened
+ Session *session;
+ {
+ auto r = rejoin_session_map.find(q->first);
+ session = (r != rejoin_session_map.end() ? r->second.first : nullptr);
+ }
+
+ for (auto r = q->second.begin(); r != q->second.end(); ++r) {
+ if (!session) {
+ // no session: for caps from a real rank, still record an empty import entry
+ if (r->first >= 0)
+ (void)rejoin_imported_caps[r->first][p->first][q->first]; // all are zero
+ continue;
+ }
+
+ Capability *cap = in->reconnect_cap(q->first, r->second, session);
+ add_reconnected_cap(q->first, in->ino(), r->second);
+ // r->first >= 0: the cap came from another rank, so do a cap import
+ // and remember what we assigned.
+ if (r->first >= 0) {
+ if (cap->get_last_seq() == 0) // don't increase mseq if cap already exists
+ cap->inc_mseq();
+ do_cap_import(session, in, cap, r->second.capinfo.cap_id, 0, 0, r->first, 0);
+
+ Capability::Import& im = rejoin_imported_caps[r->first][p->first][q->first];
+ im.cap_id = cap->get_cap_id();
+ im.issue_seq = cap->get_last_seq();
+ im.mseq = cap->get_mseq();
+ }
+ }
+ }
+ cap_imports.erase(p++); // remove and move on
+ }
+ } else {
+ // step 3b: our own rank is still in rejoin_gather; trim non-auth cache,
+ // remove ourselves from the gather set and send any pending rejoins.
+ trim_non_auth();
+
+ ceph_assert(rejoin_gather.count(mds->get_nodeid()));
+ rejoin_gather.erase(mds->get_nodeid());
+ ceph_assert(!rejoin_ack_gather.count(mds->get_nodeid()));
+ maybe_send_pending_rejoins();
+ }
+ return false;
+}
+
+// Re-register need_snapflush state on behalf of `client` for the snapped
+// inodes of head_in that cover realm snapshots newer than snap_follows.
+void MDCache::rebuild_need_snapflush(CInode *head_in, SnapRealm *realm,
+                                     client_t client, snapid_t snap_follows)
+{
+  dout(10) << "rebuild_need_snapflush " << snap_follows << " on " << *head_in << dendl;
+
+  // nothing to rebuild unless the realm has a snapshot in the range
+  // [snap_follows + 1, head_in->first - 1]
+  if (!realm->has_snaps_in_range(snap_follows + 1, head_in->first - 1))
+    return;
+
+  const set<snapid_t>& realm_snaps = realm->get_snaps();
+  snapid_t cursor = snap_follows;
+
+  // the walk ends as soon as pick_inode_snap() returns head_in itself
+  for (CInode *snapped = pick_inode_snap(head_in, cursor);
+       snapped != head_in;
+       snapped = pick_inode_snap(head_in, cursor)) {
+    bool flush_needed = false;
+    auto s = realm_snaps.lower_bound(std::max<snapid_t>(snapped->first, (cursor + 1)));
+    while (s != realm_snaps.end() && *s <= snapped->last) {
+      head_in->add_need_snapflush(snapped, *s, client);
+      flush_needed = true;
+      ++s;
+    }
+    // continue the search after this snapped inode's range
+    cursor = snapped->last;
+    if (!flush_needed)
+      continue;
+
+    dout(10) << " need snapflush from client." << client << " on " << *snapped << dendl;
+
+    // first client recorded on this snapped inode: move every inode lock to
+    // LOCK_SNAP_SYNC, auth-pinning and wrlocking each one
+    if (snapped->client_snap_caps.empty()) {
+      for (int idx = 0; idx < num_cinode_locks; idx++) {
+        SimpleLock *lock = snapped->get_lock(cinode_lock_info[idx].lock);
+        ceph_assert(lock);
+        snapped->auth_pin(lock);
+        lock->set_state(LOCK_SNAP_SYNC);
+        lock->get_wrlock(true);
+      }
+    }
+    snapped->client_snap_caps.insert(client);
+    mds->locker->mark_need_snapflush_inode(snapped);
+  }
+}
+
+/*
+ * choose lock states based on reconnected caps
+ *
+ * For every head inode (last == CEPH_NOSNAP) in inode_map: re-mark dirty
+ * rstat where needed, OR together the dirty caps recorded for the inode in
+ * reconnected_caps, hand the result to CInode::choose_lock_states(), and
+ * queue inodes that own a snaprealm on rejoin_pending_snaprealms (pinned
+ * with PIN_OPENINGSNAPPARENTS).
+ */
+void MDCache::choose_lock_states_and_reconnect_caps()
+{
+  dout(10) << "choose_lock_states_and_reconnect_caps" << dendl;
+
+  int count = 0;
+  // bind by reference: the loop only reads the entry, no need to copy it
+  for (auto &p : inode_map) {
+    CInode *in = p.second;
+    if (in->last != CEPH_NOSNAP)
+      continue;
+
+    if (in->is_auth() && !in->is_base() && in->inode.is_dirty_rstat())
+      in->mark_dirty_rstat();
+
+    // union of the dirty caps reported by all reconnected clients
+    int dirty_caps = 0;
+    auto q = reconnected_caps.find(in->ino());
+    if (q != reconnected_caps.end()) {
+      for (const auto &it : q->second)
+        dirty_caps |= it.second.dirty_caps;
+    }
+    in->choose_lock_states(dirty_caps);
+    dout(15) << " chose lock states on " << *in << dendl;
+
+    // take the pin only once per inode
+    if (in->snaprealm && !rejoin_pending_snaprealms.count(in)) {
+      in->get(CInode::PIN_OPENINGSNAPPARENTS);
+      rejoin_pending_snaprealms.insert(in);
+    }
+
+    // reset the heartbeat every 1000 processed head inodes
+    if (!(++count % 1000))
+      mds->heartbeat_reset();
+  }
+}
+
+// Append inode `ino` to the CEPH_SNAP_OP_SPLIT message queued for `client`
+// in `splits`. If the client has no message yet, one is created and seeded
+// from `realm` (split target, snap trace, open child realms).
+void MDCache::prepare_realm_split(SnapRealm *realm, client_t client, inodeno_t ino,
+                                  map<client_t,MClientSnap::ref>& splits)
+{
+  MClientSnap::ref msg;
+  auto found = splits.find(client);
+  if (found == splits.end()) {
+    msg = MClientSnap::create(CEPH_SNAP_OP_SPLIT);
+    splits.emplace(std::piecewise_construct, std::forward_as_tuple(client), std::forward_as_tuple(msg));
+    msg->head.split = realm->inode->ino();
+    msg->bl = realm->get_snap_trace();
+
+    for (const auto& child : realm->open_children)
+      msg->split_realms.push_back(child->inode->ino());
+  } else {
+    // reuse the queued message, forcing its op to SPLIT
+    msg = found->second;
+    msg->head.op = CEPH_SNAP_OP_SPLIT;
+  }
+  msg->split_inos.push_back(ino);
+}
+
+// For every client holding caps in `realm` that has no entry in `splits`
+// yet, queue a CEPH_SNAP_OP_SPLIT message that targets parent_realm and lists
+// realm's inodes-with-caps and open child realms.
+void MDCache::prepare_realm_merge(SnapRealm *realm, SnapRealm *parent_realm,
+                                  map<client_t,MClientSnap::ref>& splits)
+{
+  ceph_assert(parent_realm);
+
+  // inodes in this realm that currently have caps
+  vector<inodeno_t> split_inos;
+  auto capped = realm->inodes_with_caps.begin(member_offset(CInode, item_caps));
+  for (; !capped.end(); ++capped)
+    split_inos.push_back((*capped)->ino());
+
+  // child realms that are currently open
+  vector<inodeno_t> split_realms;
+  for (const auto& child : realm->open_children)
+    split_realms.push_back(child->inode->ino());
+
+  for (const auto& entry : realm->client_caps) {
+    ceph_assert(!entry.second->empty());
+    auto inserted = splits.emplace(std::piecewise_construct, std::forward_as_tuple(entry.first), std::forward_as_tuple());
+    if (!inserted.second)
+      continue;  // an entry for this client already exists; leave it alone
+
+    auto update = MClientSnap::create(CEPH_SNAP_OP_SPLIT);
+    update->head.split = parent_realm->inode->ino();
+    update->split_inos = split_inos;
+    update->split_realms = split_realms;
+    update->bl = parent_realm->get_snap_trace();
+    inserted.first->second = std::move(update);
+  }
+}
+
+// Send each queued snap message to its client if the client has a session,
+// then empty `splits`.
+void MDCache::send_snaps(map<client_t,MClientSnap::ref>& splits)
+{
+  dout(10) << "send_snaps" << dendl;
+
+  for (auto &entry : splits) {
+    Session *session = mds->sessionmap.get_session(entity_name_t::CLIENT(entry.first.v));
+    if (!session) {
+      dout(10) << " no session for client." << entry.first << dendl;
+      continue;
+    }
+    dout(10) << " client." << entry.first
+             << " split " << entry.second->head.split
+             << " inos " << entry.second->split_inos
+             << dendl;
+    mds->send_message_client_counted(entry.second, session);
+  }
+  splits.clear();
+}
+
+
+/*
+ * Walk the open_files list of every log segment and unlist all head
+ * inodes, plus those snapped inodes whose client_snap_caps set is empty.
+ */
+void MDCache::clean_open_file_lists()
+{
+  dout(10) << "clean_open_file_lists" << dendl;
+
+  for (auto &seg : mds->mdlog->segments) {
+    LogSegment *ls = seg.second;
+
+    auto it = ls->open_files.begin(member_offset(CInode, item_open_file));
+    while (!it.end()) {
+      CInode *in = *it;
+      ++it;  // step past `in` before it may be unlinked from this list
+      if (in->last == CEPH_NOSNAP) {
+        dout(10) << " unlisting unwanted/capless inode " << *in << dendl;
+        in->item_open_file.remove_myself();
+      } else if (in->client_snap_caps.empty()) {
+        dout(10) << " unlisting flushed snap inode " << *in << dendl;
+        in->item_open_file.remove_myself();
+      }
+    }
+  }
+}
+
+// Dump, as an "openfiles" array, every inode on a log segment's open_files
+// list that is either a head inode with caps wanted or a snapped inode that
+// still has client_snap_caps.
+void MDCache::dump_openfiles(Formatter *f)
+{
+  f->open_array_section("openfiles");
+  for (auto &seg : mds->mdlog->segments) {
+    LogSegment *ls = seg.second;
+
+    for (auto it = ls->open_files.begin(member_offset(CInode, item_open_file));
+         !it.end();
+         ++it) {
+      CInode *in = *it;
+      const bool is_head = (in->last == CEPH_NOSNAP);
+      const bool listed = is_head ? in->is_any_caps_wanted()
+                                  : !in->client_snap_caps.empty();
+      if (!listed)
+        continue;
+      f->open_object_section("file");
+      in->dump(f, CInode::DUMP_PATH | CInode::DUMP_INODE_STORE_BASE | CInode::DUMP_CAPS);
+      f->close_section();
+    }
+  }
+  f->close_section();
+}
+
+// Reconnect the cap described by `icr` for `client` on `in`. When the cap
+// came from another rank (frommds >= 0) a cap import is also sent to the
+// client. Returns the cap, or a null pointer if the client has no session.
+Capability* MDCache::rejoin_import_cap(CInode *in, client_t client, const cap_reconnect_t& icr, mds_rank_t frommds)
+{
+  dout(10) << "rejoin_import_cap for client." << client << " from mds." << frommds
+           << " on " << *in << dendl;
+  Session *session = mds->sessionmap.get_session(entity_name_t::CLIENT(client.v));
+  if (session == nullptr) {
+    dout(10) << " no session for client." << client << dendl;
+    return nullptr;
+  }
+
+  Capability *cap = in->reconnect_cap(client, icr, session);
+  if (frommds < 0)
+    return cap;
+
+  // don't increase mseq if cap already exists
+  if (cap->get_last_seq() == 0)
+    cap->inc_mseq();
+  do_cap_import(session, in, cap, icr.capinfo.cap_id, 0, 0, frommds, 0);
+  return cap;
+}
+
+// Give up on every cap import still pending in cap_imports: send each client
+// that has a session a CEPH_CAP_OP_EXPORT message for the inode, wake all
+// cap_reconnect_waiters, clear both maps, and warn in the cluster log about
+// the inodes involved.
+void MDCache::export_remaining_imported_caps()
+{
+  dout(10) << "export_remaining_imported_caps" << dendl;
+
+  stringstream warn_str;
+
+  int count = 0;
+  for (auto &by_ino : cap_imports) {
+    warn_str << " ino " << by_ino.first << "\n";
+    for (auto &by_client : by_ino.second) {
+      Session *session = mds->sessionmap.get_session(entity_name_t::CLIENT(by_client.first.v));
+      if (!session)
+        continue;
+      // mark client caps stale.
+      auto stale = MClientCaps::create(CEPH_CAP_OP_EXPORT, by_ino.first, 0, 0, 0, mds->get_osd_epoch_barrier());
+      stale->set_cap_peer(0, 0, 0, -1, 0);
+      mds->send_message_client_counted(stale, by_client.first);
+    }
+
+    // reset the heartbeat every 1000 inodes
+    if (!(++count % 1000))
+      mds->heartbeat_reset();
+  }
+
+  for (auto &waiting : cap_reconnect_waiters)
+    mds->queue_waiters(waiting.second);
+
+  cap_imports.clear();
+  cap_reconnect_waiters.clear();
+
+  // only warn if at least one inode was listed above
+  if (warn_str.peek() != EOF) {
+    mds->clog->warn() << "failed to reconnect caps for missing inodes:";
+    mds->clog->warn(warn_str);
+  }
+}
+
+// If a replayed cap reconnect is recorded for (in, session's client), apply
+// it: reconnect the cap, drop the record, re-evaluate or choose lock states,
+// and wake anything waiting on the inode's cap reconnect. Returns the
+// reconnected cap, or nullptr when no record exists.
+Capability* MDCache::try_reconnect_cap(CInode *in, Session *session)
+{
+  client_t client = session->info.get_client();
+  const cap_reconnect_t *rc = get_replay_cap_reconnect(in->ino(), client);
+  if (!rc)
+    return nullptr;
+
+  Capability *cap = in->reconnect_cap(client, *rc, session);
+  dout(10) << "try_reconnect_cap client." << client
+           << " reconnect wanted " << ccap_string(rc->capinfo.wanted)
+           << " issue " << ccap_string(rc->capinfo.issued)
+           << " on " << *in << dendl;
+  remove_replay_cap_reconnect(in->ino(), client);
+
+  if (in->is_replicated()) {
+    mds->locker->try_eval(in, CEPH_CAP_LOCKS);
+  } else {
+    // use the dirty caps recorded for this client, if any
+    int dirty_caps = 0;
+    auto by_ino = reconnected_caps.find(in->ino());
+    if (by_ino != reconnected_caps.end()) {
+      auto by_client = by_ino->second.find(client);
+      if (by_client != by_ino->second.end())
+        dirty_caps = by_client->second.dirty_caps;
+    }
+    in->choose_lock_states(dirty_caps);
+    dout(15) << " chose lock states on " << *in << dendl;
+  }
+
+  auto waiting = cap_reconnect_waiters.find(in->ino());
+  if (waiting != cap_reconnect_waiters.end()) {
+    mds->queue_waiters(waiting->second);
+    cap_reconnect_waiters.erase(waiting);
+  }
+  return cap;
+}
+
+
+
+// -------
+// cap imports and delayed snap parent opens
+
+// Send a CEPH_CAP_OP_IMPORT message for `cap` on `in` to the session's
+// client, naming (p_cap_id, p_seq, p_mseq, peer, p_flags) as the cap peer.
+// Aborts if the inode's snaprealm does not have its past parents open.
+void MDCache::do_cap_import(Session *session, CInode *in, Capability *cap,
+                            uint64_t p_cap_id, ceph_seq_t p_seq, ceph_seq_t p_mseq,
+                            int peer, int p_flags)
+{
+  SnapRealm *realm = in->find_snaprealm();
+  if (!realm->have_past_parents_open())
+    ceph_abort();
+
+  dout(10) << "do_cap_import " << session->info.inst.name << " mseq " << cap->get_mseq() << " on " << *in << dendl;
+  if (cap->get_last_seq() == 0) // reconnected cap
+    cap->inc_last_seq();
+  cap->set_last_issue();
+  cap->set_last_issue_stamp(ceph_clock_now());
+  cap->clear_new();
+  auto reap = MClientCaps::create(CEPH_CAP_OP_IMPORT,
+                                  in->ino(), realm->inode->ino(),
+                                  cap->get_cap_id(), cap->get_last_seq(),
+                                  cap->pending(), cap->wanted(), 0,
+                                  cap->get_mseq(), mds->get_osd_epoch_barrier());
+  in->encode_cap_message(reap, cap);
+  reap->snapbl = realm->get_snap_trace();
+  reap->set_cap_peer(p_cap_id, p_seq, p_mseq, peer, p_flags);
+  mds->send_message_client_counted(reap, session);
+}
+
+// Called from open_snaprealms() after all pending snaprealms are handled.
+// Performs no imports; it only checks that delayed_imported_caps is empty.
+void MDCache::do_delayed_cap_imports()
+{
+ dout(10) << "do_delayed_cap_imports" << dendl;
+
+ // nothing is expected to have been queued for delayed import
+ ceph_assert(delayed_imported_caps.empty());
+}
+
+// Completion context that re-runs MDCache::open_snaprealms(); the result
+// code passed to finish() is ignored.
+struct C_MDC_OpenSnapRealms : public MDCacheContext {
+  explicit C_MDC_OpenSnapRealms(MDCache *cache)
+    : MDCacheContext(cache)
+  {}
+
+  void finish(int r) override
+  {
+    mdcache->open_snaprealms();
+  }
+};
+
+// Finish snaprealm handling for every inode queued on
+// rejoin_pending_snaprealms.
+//
+// For each inode whose realm has (or can immediately get) its past parents
+// open: complete the clients' snaprealm reconnects, rebuild snapflush state
+// for reconnected caps, queue realm split messages for caps that the client
+// placed in a different realm, then drop the inode from the pending set and
+// send the queued messages. If any parent open is still outstanding the
+// function re-arms itself via C_MDC_OpenSnapRealms and returns; otherwise it
+// completes rejoin_done and clears reconnected_caps.
+void MDCache::open_snaprealms()
+{
+  dout(10) << "open_snaprealms" << dendl;
+
+  MDSGatherBuilder gather(g_ceph_context);
+
+  auto it = rejoin_pending_snaprealms.begin();
+  while (it != rejoin_pending_snaprealms.end()) {
+    CInode *in = *it;
+    SnapRealm *realm = in->snaprealm;
+    ceph_assert(realm);
+    if (realm->have_past_parents_open() ||
+        realm->open_parents(gather.new_sub())) {
+      dout(10) << " past parents now open on " << *in << dendl;
+
+      map<client_t,MClientSnap::ref> splits;
+      // finish off client snaprealm reconnects?
+      map<inodeno_t,map<client_t,snapid_t> >::iterator q = reconnected_snaprealms.find(in->ino());
+      if (q != reconnected_snaprealms.end()) {
+        for (const auto& r : q->second)
+          finish_snaprealm_reconnect(r.first, realm, r.second, splits);
+        reconnected_snaprealms.erase(q);
+      }
+
+      for (elist<CInode*>::iterator p = realm->inodes_with_caps.begin(member_offset(CInode, item_caps));
+           !p.end(); ++p) {
+        CInode *child = *p;
+        // every inode with caps in this realm must have a reconnect record
+        auto rc = reconnected_caps.find(child->ino());
+        ceph_assert(rc != reconnected_caps.end());
+        for (auto r = rc->second.begin(); r != rc->second.end(); ++r) {
+          Capability *cap = child->get_client_cap(r->first);
+          if (!cap)
+            continue;
+          if (r->second.snap_follows > 0) {
+            if (r->second.snap_follows < child->first - 1) {
+              rebuild_need_snapflush(child, realm, r->first, r->second.snap_follows);
+            } else if (r->second.snapflush) {
+              // When processing a cap flush message that is re-sent, it's possible
+              // that the sender has already released all WR caps. So we should
+              // force MDCache::cow_inode() to setup CInode::client_need_snapflush.
+              cap->mark_needsnapflush();
+            }
+          }
+          // make sure client's cap is in the correct snaprealm.
+          if (r->second.realm_ino != in->ino()) {
+            prepare_realm_split(realm, r->first, child->ino(), splits);
+          }
+        }
+      }
+
+      // erase via post-increment so `it` stays valid for the next iteration
+      rejoin_pending_snaprealms.erase(it++);
+      in->put(CInode::PIN_OPENINGSNAPPARENTS);
+
+      send_snaps(splits);
+    } else {
+      dout(10) << " opening past parents on " << *in << dendl;
+      ++it;
+    }
+  }
+
+  if (gather.has_subs()) {
+    if (gather.num_subs_remaining() == 0) {
+      // cleanup gather
+      gather.set_finisher(new C_MDSInternalNoop);
+      gather.activate();
+    } else {
+      // for multimds, must succeed the first time
+      ceph_assert(recovery_set.empty());
+
+      dout(10) << "open_snaprealms - waiting for "
+               << gather.num_subs_remaining() << dendl;
+      gather.set_finisher(new C_MDC_OpenSnapRealms(this));
+      gather.activate();
+      return;
+    }
+  }
+
+  notify_global_snaprealm_update(CEPH_SNAP_OP_UPDATE);
+
+  // report client snaprealm reconnects that never matched a pending realm
+  if (!reconnected_snaprealms.empty()) {
+    dout(5) << "open_snaprealms has unconnected snaprealm:" << dendl;
+    for (auto& p : reconnected_snaprealms) {
+      stringstream warn_str;
+      warn_str << " " << p.first << " {";
+      bool first = true;
+      for (auto& q : p.second) {
+        if (!first)
+          warn_str << ", ";
+        warn_str << "client." << q.first << "/" << q.second;
+        // clear the flag so later entries are preceded by the separator
+        first = false;
+      }
+      warn_str << "}";
+      dout(5) << warn_str.str() << dendl;
+    }
+  }
+  ceph_assert(rejoin_waiters.empty());
+  ceph_assert(rejoin_pending_snaprealms.empty());
+  dout(10) << "open_snaprealms - all open" << dendl;
+  do_delayed_cap_imports();
+
+  ceph_assert(rejoin_done);
+  rejoin_done.release()->complete(0);
+  reconnected_caps.clear();
+}
+
+// Fetch the dirfrags needed to resolve rejoin_undef_inodes and
+// rejoin_undef_dirfrags. Returns false when there is nothing to fetch,
+// true once the fetches have been started; the gather's finisher calls
+// rejoin_gather_finish() if rejoin_gather is empty at that point.
+bool MDCache::open_undef_inodes_dirfrags()
+{
+  dout(10) << "open_undef_inodes_dirfrags "
+           << rejoin_undef_inodes.size() << " inodes "
+           << rejoin_undef_dirfrags.size() << " dirfrags" << dendl;
+
+  // undefined dirfrags, plus the parent dirfrag of every undefined inode
+  set<CDir*> fetch_queue = rejoin_undef_dirfrags;
+  for (CInode *in : rejoin_undef_inodes) {
+    ceph_assert(!in->is_base());
+    fetch_queue.insert(in->get_parent_dir());
+  }
+
+  if (fetch_queue.empty())
+    return false;
+
+  auto *on_fetched =
+    new MDSInternalContextWrapper(mds,
+      new FunctionContext([this](int r) {
+          if (rejoin_gather.empty())
+            rejoin_gather_finish();
+        })
+    );
+  MDSGatherBuilder gather(g_ceph_context, on_fetched);
+
+  for (CDir *dir : fetch_queue) {
+    CInode *diri = dir->get_inode();
+    // skip dirfrags whose own inode is still undefined
+    if (diri->state_test(CInode::STATE_REJOINUNDEF))
+      continue;
+    if (dir->state_test(CDir::STATE_REJOINUNDEF))
+      ceph_assert(diri->dirfragtree.is_leaf(dir->get_frag()));
+    dir->fetch(gather.new_sub());
+  }
+  ceph_assert(gather.has_subs());
+  gather.activate();
+  return true;
+}
+
+// Remove `in` from rejoin_undef_inodes. For a directory that has dirfrags
+// and whose fragtree is not a single leaf, replace the root dirfrag in
+// rejoin_undef_dirfrags with the dirfrags present after force_dirfrags().
+void MDCache::opened_undef_inode(CInode *in) {
+  dout(10) << "opened_undef_inode " << *in << dendl;
+  rejoin_undef_inodes.erase(in);
+  if (!in->is_dir())
+    return;
+
+  // FIXME: re-hash dentries if necessary
+  ceph_assert(in->inode.dir_layout.dl_dir_hash == g_conf()->mds_default_dir_hash);
+  if (!in->has_dirfrags() || in->dirfragtree.is_leaf(frag_t()))
+    return;
+
+  CDir *root_frag = in->get_dirfrag(frag_t());
+  ceph_assert(root_frag);
+  rejoin_undef_dirfrags.erase(root_frag);
+  in->force_dirfrags();
+
+  list<CDir*> frags;
+  in->get_dirfrags(frags);
+  rejoin_undef_dirfrags.insert(frags.begin(), frags.end());
+}
+
+// If the client's snap seq for `realm` is older than the realm's newest seq,
+// queue a CEPH_SNAP_OP_UPDATE message (snap trace plus open child realms)
+// for that client in `updates`; otherwise only log that it is up to date.
+void MDCache::finish_snaprealm_reconnect(client_t client, SnapRealm *realm, snapid_t seq,
+                                         map<client_t,MClientSnap::ref>& updates)
+{
+  if (!(seq < realm->get_newest_seq())) {
+    dout(10) << "finish_snaprealm_reconnect client." << client << " up to date"
+             << " on " << *realm << dendl;
+    return;
+  }
+
+  dout(10) << "finish_snaprealm_reconnect client." << client << " has old seq " << seq << " < "
+           << realm->get_newest_seq() << " on " << *realm << dendl;
+  auto update = MClientSnap::create(CEPH_SNAP_OP_UPDATE);
+  update->bl = realm->get_snap_trace();
+  for (const auto& child : realm->open_children)
+    update->split_realms.push_back(child->inode->ino());
+  updates.emplace(std::piecewise_construct, std::forward_as_tuple(client), std::forward_as_tuple(update));
+}
+
+
+
+// Build and send one OP_ACK MMDSCacheRejoin to every rank in recovery_set
+// that is not already in rejoin_ack_sent.
+//
+// Steps: (1) for inodes in rejoin_unlinked_inodes, register the peer as a
+// replica of the parent dentry/dir/inode chain; (2) walk every auth subtree
+// and add strong dirfrag, dentry and inode (base + lock state) records for
+// each replica that is getting an ack; (3) do the same for root and myin;
+// (4) add inode bases for rejoin_potential_updated_scatterlocks; (5) attach
+// the per-rank rejoin_imported_caps and send. rejoin_unlinked_inodes and
+// rejoin_imported_caps are cleared along the way.
+void MDCache::rejoin_send_acks()
+{
+ dout(7) << "rejoin_send_acks" << dendl;
+
+ // replicate stray
+ for (map<mds_rank_t, set<CInode*> >::iterator p = rejoin_unlinked_inodes.begin();
+ p != rejoin_unlinked_inodes.end();
+ ++p) {
+ for (set<CInode*>::iterator q = p->second.begin();
+ q != p->second.end();
+ ++q) {
+ CInode *in = *q;
+ dout(7) << " unlinked inode " << *in << dendl;
+ // inode expired
+ if (!in->is_replica(p->first))
+ continue;
+ // climb dentry -> dir -> inode, adding the peer as a replica at each
+ // level, until a level already replicated by the peer (or a base
+ // inode) is reached
+ while (1) {
+ CDentry *dn = in->get_parent_dn();
+ if (dn->is_replica(p->first))
+ break;
+ dn->add_replica(p->first);
+ CDir *dir = dn->get_dir();
+ if (dir->is_replica(p->first))
+ break;
+ dir->add_replica(p->first);
+ in = dir->get_inode();
+ if (in->is_replica(p->first))
+ break;
+ in->add_replica(p->first);
+ if (in->is_base())
+ break;
+ }
+ }
+ }
+ rejoin_unlinked_inodes.clear();
+
+ // send acks to everyone in the recovery set
+ map<mds_rank_t,MMDSCacheRejoin::ref> acks;
+ for (set<mds_rank_t>::iterator p = recovery_set.begin();
+ p != recovery_set.end();
+ ++p) {
+ // ranks that were already acked get no second message
+ if (rejoin_ack_sent.count(*p))
+ continue;
+ acks[*p] = MMDSCacheRejoin::create(MMDSCacheRejoin::OP_ACK);
+ }
+
+ rejoin_ack_sent = recovery_set;
+
+ // walk subtrees
+ for (map<CDir*,set<CDir*> >::iterator p = subtrees.begin();
+ p != subtrees.end();
+ ++p) {
+ CDir *dir = p->first;
+ if (!dir->is_auth())
+ continue;
+ dout(10) << "subtree " << *dir << dendl;
+
+ // auth items in this subtree
+ list<CDir*> dq;
+ dq.push_back(dir);
+
+ // breadth-first over the dirfrags of this subtree; note that `dir` and
+ // `p` below shadow the outer variables of the same name
+ while (!dq.empty()) {
+ CDir *dir = dq.front();
+ dq.pop_front();
+
+ // dir
+ for (auto &r : dir->get_replicas()) {
+ auto it = acks.find(r.first);
+ if (it == acks.end())
+ continue;
+ // ++r.second bumps the value stored for this replica in place and
+ // sends the new value (presumably the replica nonce - TODO confirm)
+ it->second->add_strong_dirfrag(dir->dirfrag(), ++r.second, dir->dir_rep);
+ it->second->add_dirfrag_base(dir);
+ }
+
+ for (auto &p : dir->items) {
+ CDentry *dn = p.second;
+ CDentry::linkage_t *dnl = dn->get_linkage();
+
+ // inode
+ CInode *in = NULL;
+ if (dnl->is_primary())
+ in = dnl->get_inode();
+
+ // dentry
+ for (auto &r : dn->get_replicas()) {
+ auto it = acks.find(r.first);
+ if (it == acks.end())
+ continue;
+ it->second->add_strong_dentry(dir->dirfrag(), dn->get_name(), dn->first, dn->last,
+ dnl->is_primary() ? dnl->get_inode()->ino():inodeno_t(0),
+ dnl->is_remote() ? dnl->get_remote_ino():inodeno_t(0),
+ dnl->is_remote() ? dnl->get_remote_d_type():0,
+ ++r.second,
+ dn->lock.get_replica_state());
+ // peer missed MDentrylink message ?
+ if (in && !in->is_replica(r.first))
+ in->add_replica(r.first);
+ }
+
+ // only primary-linked inodes are processed further
+ if (!in)
+ continue;
+
+ for (auto &r : in->get_replicas()) {
+ auto it = acks.find(r.first);
+ if (it == acks.end())
+ continue;
+ it->second->add_inode_base(in, mds->mdsmap->get_up_features());
+ bufferlist bl;
+ in->_encode_locks_state_for_rejoin(bl, r.first);
+ it->second->add_inode_locks(in, ++r.second, bl);
+ }
+
+ // subdirs in this subtree?
+ in->get_nested_dirfrags(dq);
+ }
+ }
+ }
+
+ // base inodes too
+ if (root && root->is_auth())
+ for (auto &r : root->get_replicas()) {
+ auto it = acks.find(r.first);
+ if (it == acks.end())
+ continue;
+ it->second->add_inode_base(root, mds->mdsmap->get_up_features());
+ bufferlist bl;
+ root->_encode_locks_state_for_rejoin(bl, r.first);
+ it->second->add_inode_locks(root, ++r.second, bl);
+ }
+ if (myin)
+ for (auto &r : myin->get_replicas()) {
+ auto it = acks.find(r.first);
+ if (it == acks.end())
+ continue;
+ it->second->add_inode_base(myin, mds->mdsmap->get_up_features());
+ bufferlist bl;
+ myin->_encode_locks_state_for_rejoin(bl, r.first);
+ it->second->add_inode_locks(myin, ++r.second, bl);
+ }
+
+ // include inode base for any inodes whose scatterlocks may have updated
+ for (set<CInode*>::iterator p = rejoin_potential_updated_scatterlocks.begin();
+ p != rejoin_potential_updated_scatterlocks.end();
+ ++p) {
+ CInode *in = *p;
+ for (const auto &r : in->get_replicas()) {
+ auto it = acks.find(r.first);
+ if (it == acks.end())
+ continue;
+ it->second->add_inode_base(in, mds->mdsmap->get_up_features());
+ }
+ }
+
+ // send acks
+ for (auto p = acks.begin(); p != acks.end(); ++p) {
+ // operator[] default-creates an empty entry for ranks with no imported caps
+ encode(rejoin_imported_caps[p->first], p->second->imported_caps);
+ mds->send_message_mds(p->second, p->first);
+ }
+
+ rejoin_imported_caps.clear();
+}
+
+// Deferred cap re-issue for one inode. The inode is pinned with
+// PIN_PTRWAITER from construction until finish() has run.
+class C_MDC_ReIssueCaps : public MDCacheContext {
+  CInode *in;
+
+public:
+  C_MDC_ReIssueCaps(MDCache *cache, CInode *inode)
+    : MDCacheContext(cache), in(inode)
+  {
+    in->get(CInode::PIN_PTRWAITER);
+  }
+
+  // Evaluate the inode's cap locks; issue caps when eval() reports false.
+  void finish(int r) override
+  {
+    Locker *locker = mdcache->mds->locker;
+    if (!locker->eval(in, CEPH_CAP_LOCKS))
+      locker->issue_caps(in);
+    in->put(CInode::PIN_PTRWAITER);
+  }
+};
+
+// Re-evaluate cap locks (and issue caps where eval() reports false) on every
+// head inode that has caps. Frozen inodes are deferred to a WAIT_UNFREEZE
+// waiter. The MDS heartbeat is reset roughly every 1000 units of work.
+void MDCache::reissue_all_caps()
+{
+  dout(10) << "reissue_all_caps" << dendl;
+
+  int count = 0;
+  for (auto &entry : inode_map) {
+    CInode *in = entry.second;
+    int work = 1;  // one unit per inode, plus whatever issue_caps() returns
+    if (in->is_head() && in->is_any_caps()) {
+      // called by MDSRank::active_start(). There shouldn't be any frozen subtree.
+      if (in->is_frozen_inode()) {
+        in->add_waiter(CInode::WAIT_UNFREEZE, new C_MDC_ReIssueCaps(this, in));
+        continue;
+      }
+      const bool evaluated = mds->locker->eval(in, CEPH_CAP_LOCKS);
+      if (!evaluated)
+        work += mds->locker->issue_caps(in);
+    }
+
+    // did this inode push the running total across a multiple of 1000?
+    const bool crossed = (count % 1000) + work >= 1000;
+    count += work;
+    if (crossed)
+      mds->heartbeat_reset();
+  }
+}
+
+
+// ===============================================================================
+
+// Completion context that forwards to MDCache::_queued_file_recover_cow()
+// with the inode and mutation captured at construction.
+struct C_MDC_QueuedCow : public MDCacheContext {
+  CInode *in;
+  MutationRef mut;
+
+  C_MDC_QueuedCow(MDCache *cache, CInode *inode, MutationRef& mutation)
+    : MDCacheContext(cache), in(inode), mut(mutation)
+  {}
+
+  void finish(int r) override
+  {
+    mdcache->_queued_file_recover_cow(in, mut);
+  }
+};
+
+
+// Queue an auth inode on recovery_queue for file recovery.
+// The copy-on-write handling below is disabled (commented out); the only
+// live behavior is the auth assertion and the enqueue.
+void MDCache::queue_file_recover(CInode *in)
+{
+ dout(10) << "queue_file_recover " << *in << dendl;
+ ceph_assert(in->is_auth());
+
+ // cow?
+ /*
+ SnapRealm *realm = in->find_snaprealm();
+ set<snapid_t> s = realm->get_snaps();
+ while (!s.empty() && *s.begin() < in->first)
+ s.erase(s.begin());
+ while (!s.empty() && *s.rbegin() > in->last)
+ s.erase(*s.rbegin());
+ dout(10) << " snaps in [" << in->first << "," << in->last << "] are " << s << dendl;
+ if (s.size() > 1) {
+ CInode::mempool_inode pi = in->project_inode();
+ pi->version = in->pre_dirty();
+
+ auto mut(std::make_shared<MutationImpl>());
+ mut->ls = mds->mdlog->get_current_segment();
+ EUpdate *le = new EUpdate(mds->mdlog, "queue_file_recover cow");
+ mds->mdlog->start_entry(le);
+ predirty_journal_parents(mut, &le->metablob, in, 0, PREDIRTY_PRIMARY);
+
+ s.erase(*s.begin());
+ while (!s.empty()) {
+ snapid_t snapid = *s.begin();
+ CInode *cow_inode = 0;
+ journal_cow_inode(mut, &le->metablob, in, snapid-1, &cow_inode);
+ ceph_assert(cow_inode);
+ recovery_queue.enqueue(cow_inode);
+ s.erase(*s.begin());
+ }
+
+ in->parent->first = in->first;
+ le->metablob.add_primary_dentry(in->parent, in, true);
+ mds->mdlog->submit_entry(le, new C_MDC_QueuedCow(this, in, mut));
+ mds->mdlog->flush();
+ }
+ */
+
+ recovery_queue.enqueue(in);
+}
+
+// Completion step invoked by C_MDC_QueuedCow::finish(): commit the inode's
+// projected state into the mutation's log segment, then apply the mutation,
+// drop its locks and clean it up, in that order.
+void MDCache::_queued_file_recover_cow(CInode *in, MutationRef& mut)
+{
+ in->pop_and_dirty_projected_inode(mut->ls);
+ mut->apply();
+ mds->locker->drop_locks(mut.get());
+ mut->cleanup();
+}
+
+
+/*
+ * called after recovery to recover file sizes for previously opened (for write)
+ * files. that is, those where max_size > size.
+ *
+ * Sorts every auth, head, regular-file inode into rejoin_recover_q (some
+ * client has a writeable range but no cap) or rejoin_check_q (all others).
+ */
+void MDCache::identify_files_to_recover()
+{
+  dout(10) << "identify_files_to_recover" << dendl;
+  int count = 0;
+  for (auto &entry : inode_map) {
+    CInode *in = entry.second;
+    // Only auth head inodes of normal files need file size recovery
+    if (!in->is_auth() || in->last != CEPH_NOSNAP || !in->is_file())
+      continue;
+
+    bool recover = false;
+    for (auto &range : in->inode.client_ranges) {
+      Capability *cap = in->get_client_cap(range.first);
+      if (!cap) {
+        dout(10) << " client." << range.first << " has range " << range.second << " but no cap on " << *in << dendl;
+        recover = true;
+        break;
+      }
+      cap->mark_clientwriteable();
+    }
+
+    if (!recover) {
+      rejoin_check_q.push_back(in);
+    } else {
+      // an unstable filelock is only expected in LOCK_XLOCKSNAP here
+      if (in->filelock.is_stable()) {
+        in->auth_pin(&in->filelock);
+      } else {
+        ceph_assert(in->filelock.get_state() == LOCK_XLOCKSNAP);
+      }
+      in->filelock.set_state(LOCK_PRE_SCAN);
+      rejoin_recover_q.push_back(in);
+    }
+
+    // reset the heartbeat every 1000 candidate inodes
+    if (!(++count % 1000))
+      mds->heartbeat_reset();
+  }
+}
+
+// Drain the two queues filled by identify_files_to_recover(): check max_size
+// on every inode in rejoin_check_q, start file recovery on every inode in
+// rejoin_recover_q, and kick the recovery queue if anything was started.
+void MDCache::start_files_to_recover()
+{
+  for (CInode *in : rejoin_check_q) {
+    if (in->filelock.get_state() == LOCK_XLOCKSNAP)
+      mds->locker->issue_caps(in);
+    mds->locker->check_inode_max_size(in);
+  }
+  rejoin_check_q.clear();
+
+  for (CInode *in : rejoin_recover_q)
+    mds->locker->file_recover(&in->filelock);
+
+  if (rejoin_recover_q.empty())
+    return;
+  rejoin_recover_q.clear();
+  do_file_recover();
+}
+
+// Thin wrapper: advance recovery_queue.
+void MDCache::do_file_recover()
+{
+ recovery_queue.advance();
+}
+
+// ===============================================================================
+
+
+// ----------------------------
+// truncate
+
+// Completion context that calls MDCache::_truncate_inode() for the captured
+// inode and log segment.
+class C_MDC_RetryTruncate : public MDCacheContext {
+  CInode *in;
+  LogSegment *ls;
+
+public:
+  C_MDC_RetryTruncate(MDCache *cache, CInode *inode, LogSegment *segment)
+    : MDCacheContext(cache), in(inode), ls(segment)
+  {}
+
+  void finish(int r) override
+  {
+    mdcache->_truncate_inode(in, ls);
+  }
+};
+
+// Begin truncating `in`: record it in the segment's truncating_inodes, pin
+// it, and either start the truncate right away or, when snapflushes are
+// still needed and FILE_BUFFER caps are issued, defer it behind the
+// filelock's xlock-snap-sync transition.
+void MDCache::truncate_inode(CInode *in, LogSegment *ls)
+{
+  auto pi = in->get_projected_inode();
+  dout(10) << "truncate_inode "
+           << pi->truncate_from << " -> " << pi->truncate_size
+           << " on " << *in
+           << dendl;
+
+  ls->truncating_inodes.insert(in);
+  in->get(CInode::PIN_TRUNCATING);
+  in->auth_pin(this);
+
+  const bool must_wait =
+    !in->client_need_snapflush.empty() &&
+    (in->get_caps_issued() & CEPH_CAP_FILE_BUFFER);
+  if (!must_wait) {
+    _truncate_inode(in, ls);
+    return;
+  }
+
+  ceph_assert(in->filelock.is_xlocked());
+  in->filelock.set_xlock_snap_sync(new C_MDC_RetryTruncate(this, in, ls));
+  mds->locker->issue_caps(in);
+}
+
+// I/O completion for the truncate issued by MDCache::_truncate_inode().
+// Accepts a result of 0 or -ENOENT and then calls truncate_inode_finish().
+struct C_IO_MDC_TruncateFinish : public MDCacheIOContext {
+  CInode *in;
+  LogSegment *ls;
+
+  C_IO_MDC_TruncateFinish(MDCache *cache, CInode *inode, LogSegment *segment)
+    : MDCacheIOContext(cache, false), in(inode), ls(segment)
+  {}
+
+  void finish(int r) override
+  {
+    ceph_assert(r == 0 || r == -ENOENT);
+    mdcache->truncate_inode_finish(in, ls);
+  }
+
+  void print(ostream& out) const override
+  {
+    out << "file_truncate(" << in->ino() << ")";
+  }
+};
+
+// Issue the filer truncate for `in`, covering the byte range
+// [truncate_size, truncate_from), using the inode's snaprealm snap context
+// (or an empty one if it has no realm). C_IO_MDC_TruncateFinish runs on the
+// MDS finisher when the truncate completes.
+void MDCache::_truncate_inode(CInode *in, LogSegment *ls)
+{
+  auto pi = &in->inode;
+  dout(10) << "_truncate_inode "
+           << pi->truncate_from << " -> " << pi->truncate_size
+           << " on " << *in << dendl;
+
+  ceph_assert(pi->is_truncating());
+  ceph_assert(pi->truncate_size < (1ULL << 63));
+  ceph_assert(pi->truncate_from < (1ULL << 63));
+  ceph_assert(pi->truncate_size < pi->truncate_from);
+
+  SnapRealm *realm = in->find_snaprealm();
+  SnapContext nullsnap;
+  const SnapContext *snapc = &nullsnap;
+  if (!realm) {
+    dout(10) << " NO realm, using null context" << dendl;
+    ceph_assert(in->last == CEPH_NOSNAP);
+  } else {
+    dout(10) << " realm " << *realm << dendl;
+    snapc = &realm->get_snap_context();
+  }
+  dout(10) << "_truncate_inode snapc " << snapc << " on " << *in << dendl;
+
+  auto *on_done = new C_OnFinisher(new C_IO_MDC_TruncateFinish(this, in, ls),
+                                   mds->finisher);
+  filer.truncate(in->inode.ino, &in->inode.layout, *snapc,
+                 pi->truncate_size, pi->truncate_from-pi->truncate_size,
+                 pi->truncate_seq, ceph::real_time::min(), 0,
+                 on_done);
+}
+
+// Log completion context that calls MDCache::truncate_inode_logged() with
+// the captured inode and mutation.
+struct C_MDC_TruncateLogged : public MDCacheLogContext {
+  CInode *in;
+  MutationRef mut;
+
+  C_MDC_TruncateLogged(MDCache *cache, CInode *inode, MutationRef& mutation)
+    : MDCacheLogContext(cache), in(inode), mut(mutation)
+  {}
+
+  void finish(int r) override
+  {
+    mdcache->truncate_inode_logged(in, mut);
+  }
+};
+
+// Called from C_IO_MDC_TruncateFinish once the data truncate has completed.
+// Removes `in` from the segment's truncating_inodes, projects an inode
+// update that clears truncate_from and decrements truncate_pending, and
+// journals it as an EUpdate "truncate finish"; truncate_inode_logged() runs
+// when the entry is logged.
+void MDCache::truncate_inode_finish(CInode *in, LogSegment *ls)
+{
+ dout(10) << "truncate_inode_finish " << *in << dendl;
+
+ // the inode must have been registered on this segment
+ set<CInode*>::iterator p = ls->truncating_inodes.find(in);
+ ceph_assert(p != ls->truncating_inodes.end());
+ ls->truncating_inodes.erase(p);
+
+ // update
+ auto &pi = in->project_inode();
+ pi.inode.version = in->pre_dirty();
+ pi.inode.truncate_from = 0;
+ pi.inode.truncate_pending--;
+
+ MutationRef mut(new MutationImpl());
+ mut->ls = mds->mdlog->get_current_segment();
+ mut->add_projected_inode(in);
+
+ EUpdate *le = new EUpdate(mds->mdlog, "truncate finish");
+ mds->mdlog->start_entry(le);
+ CDentry *dn = in->get_projected_parent_dn();
+ le->metablob.add_dir_context(dn->get_dir());
+ le->metablob.add_primary_dentry(dn, in, true);
+ // record the sequence of the segment the truncate was registered on
+ le->metablob.add_truncate_finish(in->ino(), ls->seq);
+
+ journal_dirty_inode(mut.get(), &le->metablob, in);
+ mds->mdlog->submit_entry(le, new C_MDC_TruncateLogged(this, in, mut));
+
+ // flush immediately if there are readers/writers waiting
+ if (in->is_waiter_for(CInode::WAIT_TRUNC) ||
+ (in->get_caps_wanted() & (CEPH_CAP_FILE_RD|CEPH_CAP_FILE_WR)))
+ mds->mdlog->flush();
+}
+
+// Called via C_MDC_TruncateLogged after the "truncate finish" entry is
+// logged: apply and clean up the mutation, release the PIN_TRUNCATING pin
+// and the auth pin, and wake WAIT_TRUNC waiters.
+void MDCache::truncate_inode_logged(CInode *in, MutationRef& mut)
+{
+ dout(10) << "truncate_inode_logged " << *in << dendl;
+ mut->apply();
+ mds->locker->drop_locks(mut.get());
+ mut->cleanup();
+
+ // undo the get()/auth_pin() pair visible in truncate_inode()
+ in->put(CInode::PIN_TRUNCATING);
+ in->auth_unpin(this);
+
+ MDSContext::vec waiters;
+ in->take_waiting(CInode::WAIT_TRUNC, waiters);
+ mds->queue_waiters(waiters);
+}
+
+
+// Register `in` as truncating on log segment `ls` and take PIN_TRUNCATING.
+// Unlike truncate_inode(), no auth pin is taken and no truncate is started
+// here; start_recovered_truncates() does both later.
+void MDCache::add_recovered_truncate(CInode *in, LogSegment *ls)
+{
+ dout(20) << "add_recovered_truncate " << *in << " in log segment "
+ << ls->seq << "/" << ls->offset << dendl;
+ ls->truncating_inodes.insert(in);
+ in->get(CInode::PIN_TRUNCATING);
+}
+
+// Undo add_recovered_truncate(): drop `in` from the segment's
+// truncating_inodes (it must be present) and release PIN_TRUNCATING.
+void MDCache::remove_recovered_truncate(CInode *in, LogSegment *ls)
+{
+  dout(20) << "remove_recovered_truncate " << *in << " in log segment "
+           << ls->seq << "/" << ls->offset << dendl;
+  // if we have the logseg the truncate started in, it must be in our list.
+  auto pos = ls->truncating_inodes.find(in);
+  ceph_assert(pos != ls->truncating_inodes.end());
+  ls->truncating_inodes.erase(pos);
+  in->put(CInode::PIN_TRUNCATING);
+}
+
+// Start (or defer) the truncate of every inode listed in any log segment's
+// truncating_inodes. An inode that still needs snapflushes while FILE_BUFFER
+// caps are issued has its filelock moved to LOCK_XLOCKDONE and the truncate
+// is retried through the xlock-snap-sync callback.
+void MDCache::start_recovered_truncates()
+{
+  dout(10) << "start_recovered_truncates" << dendl;
+  for (auto &seg : mds->mdlog->segments) {
+    LogSegment *ls = seg.second;
+    for (CInode *in : ls->truncating_inodes) {
+      in->auth_pin(this);
+
+      const bool must_wait =
+        !in->client_need_snapflush.empty() &&
+        (in->get_caps_issued() & CEPH_CAP_FILE_BUFFER);
+      if (!must_wait) {
+        _truncate_inode(in, ls);
+        continue;
+      }
+
+      ceph_assert(in->filelock.is_stable());
+      in->filelock.set_state(LOCK_XLOCKDONE);
+      in->auth_pin(&in->filelock);
+      in->filelock.set_xlock_snap_sync(new C_MDC_RetryTruncate(this, in, ls));
+      // start_files_to_recover will revoke caps
+    }
+  }
+}
+
+
+
+
+
+
+// ================================================================================
+// cache trimming
+
+// Expire dentries from bottom_lru and then from lru, passing each to
+// trim_dentry().
+//
+// @param count      number of dentries to try to trim from the main lru;
+//                   trimming also continues while cache_toofull() holds or
+//                   the MDS is in standby-replay
+// @param expiremap  passed through to trim_dentry()
+// @return (throttled, trimmed): whether the loop stopped because
+//         trim_counter plus the dentries trimmed in this call reached
+//         mds_cache_trim_threshold, and how many dentries were trimmed
+std::pair<bool, uint64_t> MDCache::trim_lru(uint64_t count, expiremap& expiremap)
+{
+ bool is_standby_replay = mds->is_standby_replay();
+ // dentries for which trim_dentry() returned true; re-inserted afterwards
+ std::vector<CDentry *> unexpirables;
+ uint64_t trimmed = 0;
+
+ auto trim_threshold = g_conf().get_val<Option::size_t>("mds_cache_trim_threshold");
+
+ dout(7) << "trim_lru trimming " << count
+ << " items from LRU"
+ << " size=" << lru.lru_get_size()
+ << " mid=" << lru.lru_get_top()
+ << " pintail=" << lru.lru_get_pintail()
+ << " pinned=" << lru.lru_get_num_pinned()
+ << dendl;
+
+ // counter value at entry; the throttle check adds this call's progress to it
+ const uint64_t trim_counter_start = trim_counter.get();
+ bool throttled = false;
+ // first pass: drain bottom_lru until it is empty or the throttle trips
+ while (1) {
+ throttled |= trim_counter_start+trimmed >= trim_threshold;
+ if (throttled) break;
+ CDentry *dn = static_cast<CDentry*>(bottom_lru.lru_expire());
+ if (!dn)
+ break;
+ if (trim_dentry(dn, expiremap)) {
+ unexpirables.push_back(dn);
+ } else {
+ trimmed++;
+ }
+ }
+
+ for (auto &dn : unexpirables) {
+ bottom_lru.lru_insert_mid(dn);
+ }
+ unexpirables.clear();
+
+ // trim dentries from the LRU until count is reached
+ // if mds is in standbyreplay and will trim all inodes which aren't in segments
+ while (!throttled && (cache_toofull() || count > 0 || is_standby_replay)) {
+ throttled |= trim_counter_start+trimmed >= trim_threshold;
+ if (throttled) break;
+ CDentry *dn = static_cast<CDentry*>(lru.lru_expire());
+ if (!dn) {
+ break;
+ }
+ if ((is_standby_replay && dn->get_linkage()->inode &&
+ dn->get_linkage()->inode->item_open_file.is_on_list())) {
+ // we move the inodes that need to be trimmed to the end of the lru queue.
+ // refer to MDCache::standby_trim_segment
+ lru.lru_insert_bot(dn);
+ break;
+ } else if (trim_dentry(dn, expiremap)) {
+ unexpirables.push_back(dn);
+ } else {
+ trimmed++;
+ if (count > 0) count--;
+ }
+ }
+ // account for everything trimmed in this call (both passes)
+ trim_counter.hit(trimmed);
+
+ for (auto &dn : unexpirables) {
+ lru.lru_insert_mid(dn);
+ }
+ unexpirables.clear();
+
+ dout(7) << "trim_lru trimmed " << trimmed << " items" << dendl;
+ return std::pair<bool, uint64_t>(throttled, trimmed);
+}
+
+/**
+ * Trim the cache: expire LRU dentries, then empty/unused subtrees, the root
+ * (when stopping), the mdsdirs of other stopping ranks and other ranks' base
+ * inodes (when stopping), and finally send the accumulated expire messages.
+ *
+ * note: only called while MDS is active or stopping... NOT during recovery.
+ * however, we may expire a replica whose authority is recovering.
+ *
+ * @param count is number of dentries to try to expire
+ * @return the pair returned by trim_lru(); its second member is additionally
+ *         incremented for every dirfrag/inode trimmed (or empty import
+ *         exported) by this function
+ */
+std::pair<bool, uint64_t> MDCache::trim(uint64_t count)
+{
+  uint64_t used = cache_size();
+  uint64_t limit = cache_memory_limit;
+  expiremap expiremap;
+
+  dout(7) << "trim bytes_used=" << bytes2str(used)
+          << " limit=" << bytes2str(limit)
+          << " reservation=" << cache_reservation
+          << "% count=" << count << dendl;
+
+  // process delayed eval_stray()
+  stray_manager.advance_delayed();
+
+  auto result = trim_lru(count, expiremap);
+  auto& trimmed = result.second;  // alias: later trims are counted in the result
+
+  // trim non-auth, non-bound subtrees
+  for (auto p = subtrees.begin(); p != subtrees.end();) {
+    CDir *dir = p->first;
+    ++p;  // advance first: trim_dirfrag() removes dir from the subtree map
+    CInode *diri = dir->get_inode();
+    if (dir->is_auth()) {
+      if (!diri->is_auth() && !diri->is_base() &&
+          dir->get_num_head_items() == 0) {
+        if (dir->state_test(CDir::STATE_EXPORTING) ||
+            !(mds->is_active() || mds->is_stopping()) ||
+            dir->is_freezing() || dir->is_frozen())
+          continue;
+
+        migrator->export_empty_import(dir);
+        ++trimmed;
+      }
+    } else {
+      if (!diri->is_auth()) {
+        if (dir->get_num_ref() > 1)  // only subtree pin
+          continue;
+        if (diri->get_num_ref() > diri->get_num_subtree_roots())
+          continue;
+
+        // don't trim subtree root if its auth MDS is recovering.
+        // This simplifies the cache rejoin code.
+        if (dir->is_subtree_root() &&
+            rejoin_ack_gather.count(dir->get_dir_auth().first))
+          continue;
+        trim_dirfrag(dir, nullptr, expiremap);
+        ++trimmed;
+      }
+    }
+  }
+
+  // trim root?
+  if (mds->is_stopping() && root) {
+    list<CDir*> ls;
+    root->get_dirfrags(ls);
+    for (CDir *dir : ls) {
+      if (dir->get_num_ref() == 1) {  // subtree pin
+        trim_dirfrag(dir, nullptr, expiremap);
+        ++trimmed;
+      }
+    }
+    if (root->get_num_ref() == 0) {
+      trim_inode(nullptr, root, nullptr, expiremap);
+      ++trimmed;
+    }
+  }
+
+  // expire the mdsdir of every *other* rank that is currently stopping
+  std::set<mds_rank_t> stopping;
+  mds->mdsmap->get_mds_set(stopping, MDSMap::STATE_STOPPING);
+  stopping.erase(mds->get_nodeid());
+  for (auto rank : stopping) {
+    CInode* mdsdir_in = get_inode(MDS_INO_MDSDIR(rank));
+    if (!mdsdir_in)
+      continue;
+
+    // make sure an expire message exists for this rank
+    auto em = expiremap.emplace(std::piecewise_construct, std::forward_as_tuple(rank), std::forward_as_tuple());
+    if (em.second) {
+      em.first->second = MCacheExpire::create(mds->get_nodeid());
+    }
+
+    // log the stopping rank being processed (previously streamed the
+    // MDSRank pointer 'mds', which printed an address)
+    dout(20) << __func__ << ": try expiring " << *mdsdir_in << " for stopping mds." << rank << dendl;
+
+    const bool aborted = expire_recursive(mdsdir_in, expiremap);
+    if (!aborted) {
+      dout(20) << __func__ << ": successfully expired mdsdir" << dendl;
+      list<CDir*> ls;
+      mdsdir_in->get_dirfrags(ls);
+      for (auto dir : ls) {
+        if (dir->get_num_ref() == 1) {  // subtree pin
+          trim_dirfrag(dir, dir, expiremap);
+          ++trimmed;
+        }
+      }
+      if (mdsdir_in->get_num_ref() == 0) {
+        trim_inode(nullptr, mdsdir_in, nullptr, expiremap);
+        ++trimmed;
+      }
+    } else {
+      dout(20) << __func__ << ": some unexpirable contents in mdsdir" << dendl;
+    }
+  }
+
+  // Other rank's base inodes (when I'm stopping)
+  if (mds->is_stopping()) {
+    for (auto p = base_inodes.begin(); p != base_inodes.end();) {
+      CInode *base_in = *p;
+      ++p;  // advance before base_in is possibly removed by trim_inode()
+      if (MDS_INO_IS_MDSDIR(base_in->ino()) &&
+          MDS_INO_MDSDIR_OWNER(base_in->ino()) != mds->get_nodeid()) {
+        dout(20) << __func__ << ": maybe trimming base: " << *base_in << dendl;
+        if (base_in->get_num_ref() == 0) {
+          trim_inode(nullptr, base_in, nullptr, expiremap);
+          ++trimmed;
+        }
+      }
+    }
+  }
+
+  // send any expire messages
+  send_expire_messages(expiremap);
+
+  return result;
+}
+
+/**
+ * Send the accumulated cache-expire messages and empty the map.
+ *
+ * While the cluster is degraded, a message is dropped (not sent) when its
+ * target rank's state is below STATE_REJOIN, or is exactly STATE_REJOIN and
+ * the rank is not in rejoin_sent.
+ *
+ * @param expiremap rank -> expire message; cleared on return
+ */
+void MDCache::send_expire_messages(expiremap& expiremap)
+{
+  for (const auto &entry : expiremap) {
+    const auto &target = entry.first;
+    if (mds->is_cluster_degraded()) {
+      const auto state = mds->mdsmap->get_state(target);
+      const bool before_rejoin = state < MDSMap::STATE_REJOIN;
+      const bool rejoin_not_sent =
+        state == MDSMap::STATE_REJOIN && rejoin_sent.count(target) == 0;
+      if (before_rejoin || rejoin_not_sent)
+        continue;
+    }
+    dout(7) << "sending cache_expire to " << target << dendl;
+    mds->send_message_mds(entry.second, target);
+  }
+  expiremap.clear();
+}
+
+
+/**
+ * Try to drop one dentry (and its primary inode, if any) from the cache.
+ *
+ * @param dn        dentry to trim
+ * @param expiremap per-rank expire messages; an entry is added for each
+ *                  authority that must be told a replica dentry went away
+ * @return true if the dentry was kept (unreadable replica, or trim_inode()
+ *         declined), false if it was removed from its dirfrag
+ */
+bool MDCache::trim_dentry(CDentry *dn, expiremap& expiremap)
+{
+  dout(12) << "trim_dentry " << *dn << dendl;
+
+  CDentry::linkage_t *dnl = dn->get_linkage();
+
+  CDir *dir = dn->get_dir();
+  ceph_assert(dir);
+
+  CDir *con = get_subtree_root(dir);
+  if (!con) {
+    dout(12) << " no container; under a not-yet-linked dir" << dendl;
+    ceph_assert(dn->is_auth());
+  } else {
+    dout(12) << " in container " << *con << dendl;
+  }
+
+  // If replica dentry is not readable, it's likely we will receive
+  // MDentryLink/MDentryUnlink message soon (It's possible we first
+  // receive a MDentryUnlink message, then MDentryLink message)
+  // MDentryLink message only replicates an inode, so we should
+  // avoid trimming the inode's parent dentry. This is because
+  // unconnected replicas are problematic for subtree migration.
+  const bool replica = !dn->is_auth();
+  if (replica && !dn->lock.can_read(-1) &&
+      !dn->get_dir()->get_inode()->is_stray())
+    return true;
+
+  // adjust the dir state
+  // NOTE: we can safely remove a clean, null dentry without affecting
+  // directory completeness.
+  // (check this _before_ we unlink the inode, below!)
+  const bool clear_complete = !(dnl->is_null() && dn->is_clean());
+
+  // unlink the dentry
+  if (dnl->is_remote()) {
+    // just unlink.
+    dir->unlink_inode(dn, false);
+  } else if (dnl->is_primary()) {
+    // expire the inode, too.
+    CInode *in = dnl->get_inode();
+    ceph_assert(in);
+    if (trim_inode(dn, in, con, expiremap))
+      return true;  // purging stray instead of trimming
+  } else {
+    ceph_assert(dnl->is_null());
+  }
+
+  if (!dn->is_auth()) {
+    // notify dentry authority (first, then second if it differs).
+    const mds_authority_t auth = dn->authority();
+
+    for (int i = 0; i < 2; ++i) {
+      const mds_rank_t a = (i == 0) ? auth.first : auth.second;
+      if (a < 0 || (i == 1 && auth.second == auth.first))
+        break;
+      if (mds->get_nodeid() == auth.second &&
+          con->is_importing())
+        break;  // don't send any expire while importing.
+      if (a == mds->get_nodeid())
+        continue;  // on export, ignore myself.
+
+      dout(12) << " sending expire to mds." << a << " on " << *dn << dendl;
+      ceph_assert(a != mds->get_nodeid());
+      auto slot = expiremap.emplace(std::piecewise_construct, std::forward_as_tuple(a), std::forward_as_tuple());
+      if (slot.second)
+        slot.first->second = MCacheExpire::create(mds->get_nodeid());
+      slot.first->second->add_dentry(con->dirfrag(), dir->dirfrag(), dn->get_name(), dn->last, dn->get_replica_nonce());
+    }
+  }
+
+  // remove dentry
+  if (dn->last == CEPH_NOSNAP && dir->is_auth())
+    dir->add_to_bloom(dn);
+  dir->remove_dentry(dn);
+
+  if (clear_complete)
+    dir->state_clear(CDir::STATE_COMPLETE);
+
+  if (mds->logger)
+    mds->logger->inc(l_mds_inodes_expired);
+  return false;
+}
+
+
+/**
+ * Close a dirfrag and, if it is a replica, queue an expire for its
+ * authority.
+ *
+ * A subtree root is first removed from the subtree map. The dirfrag must
+ * have no references left at that point (asserted).
+ *
+ * @param dir       dirfrag to close
+ * @param con       containing subtree root; replaced by dir itself when a
+ *                  non-auth dir is a subtree root
+ * @param expiremap per-rank expire messages to append to
+ */
+void MDCache::trim_dirfrag(CDir *dir, CDir *con, expiremap& expiremap)
+{
+  dout(15) << "trim_dirfrag " << *dir << dendl;
+
+  if (dir->is_subtree_root()) {
+    ceph_assert(!dir->is_auth() ||
+                (!dir->is_replicated() && dir->inode->is_base()));
+    remove_subtree(dir);  // remove from subtree map
+  }
+  ceph_assert(dir->get_num_ref() == 0);
+
+  CInode *diri = dir->get_inode();
+
+  if (!dir->is_auth()) {
+    const mds_authority_t auth = dir->authority();
+
+    // was this an auth delegation? (if so, the dirfrag is its own container)
+    if (dir->is_subtree_root()) {
+      dout(12) << " subtree root, container is " << *dir << dendl;
+      con = dir;
+    }
+    const dirfrag_t condf = con->dirfrag();
+
+    for (int i = 0; i < 2; ++i) {
+      const mds_rank_t a = (i == 0) ? auth.first : auth.second;
+      if (a < 0 || (i == 1 && auth.second == auth.first))
+        break;
+      if (mds->get_nodeid() == auth.second &&
+          con->is_importing())
+        break;  // don't send any expire while importing.
+      if (a == mds->get_nodeid())
+        continue;  // on export, ignore myself.
+
+      dout(12) << " sending expire to mds." << a << " on " << *dir << dendl;
+      ceph_assert(a != mds->get_nodeid());
+      auto slot = expiremap.emplace(std::piecewise_construct, std::forward_as_tuple(a), std::forward_as_tuple());
+      if (slot.second)
+        slot.first->second = MCacheExpire::create(mds->get_nodeid());  /* new */
+      slot.first->second->add_dir(condf, dir->dirfrag(), dir->replica_nonce);
+    }
+  }
+
+  diri->close_dirfrag(dir->dirfrag().frag);
+}
+
+/**
+ * Try trimming an inode from the cache.
+ *
+ * @param dn        the inode's parent dentry, or null when there is none
+ * @param in        inode to trim; must have no references (asserted)
+ * @param con       containing subtree root, or null (root or stray inode)
+ * @param expiremap per-rank expire messages to append to
+ * @return true if the inode is still in cache, else false if it was trimmed
+ */
+bool MDCache::trim_inode(CDentry *dn, CInode *in, CDir *con, expiremap& expiremap)
+{
+  dout(15) << "trim_inode " << *in << dendl;
+  ceph_assert(in->get_num_ref() == 0);
+
+  if (in->is_dir()) {
+    // If replica inode's dirfragtreelock is not readable, it's likely
+    // some dirfrags of the inode are being fragmented and we will receive
+    // MMDSFragmentNotify soon. MMDSFragmentNotify only replicates the new
+    // dirfrags, so we should avoid trimming these dirfrags' parent inode.
+    // This is because unconnected replicas are problematic for
+    // subtree migration.
+    if (!in->is_auth() && !mds->locker->rdlock_try(&in->dirfragtreelock, -1, nullptr)) {
+      return true;
+    }
+
+    // close every dirfrag of this directory first
+    list<CDir*> frags;
+    in->get_dirfrags(frags);
+    for (CDir *dir : frags) {
+      ceph_assert(!dir->is_subtree_root());
+      trim_dirfrag(dir, con ? con:dir, expiremap);  // if no container (e.g. root dirfrag), use the dirfrag itself
+    }
+  }
+
+  if (!in->is_auth()) {
+    // replica: tell the authority (or both, during migration) we dropped it
+    const mds_authority_t auth = in->authority();
+
+    // with no container this must be a root or stray inode.
+    const dirfrag_t df = con ? con->dirfrag() : dirfrag_t(0,frag_t());
+
+    for (int i = 0; i < 2; ++i) {
+      const mds_rank_t a = (i == 0) ? auth.first : auth.second;
+      if (a < 0 || (i == 1 && auth.second == auth.first))
+        break;
+      if (con && mds->get_nodeid() == auth.second &&
+          con->is_importing())
+        break;  // don't send any expire while importing.
+      if (a == mds->get_nodeid())
+        continue;  // on export, ignore myself.
+
+      dout(12) << " sending expire to mds." << a << " on " << *in << dendl;
+      ceph_assert(a != mds->get_nodeid());
+      auto slot = expiremap.emplace(std::piecewise_construct, std::forward_as_tuple(a), std::forward_as_tuple());
+      if (slot.second)
+        slot.first->second = MCacheExpire::create(mds->get_nodeid());  /* new */
+      slot.first->second->add_inode(df, in->vino(), in->get_replica_nonce());
+    }
+  } else if (dn && !dn->state_test(CDentry::STATE_PURGING)) {
+    // auth: eval stray after closing dirfrags
+    maybe_eval_stray(in);
+    if (dn->state_test(CDentry::STATE_PURGING) || dn->get_num_ref() > 0)
+      return true;
+  }
+
+  /*
+  if (in->is_auth()) {
+    if (in->hack_accessed)
+      mds->logger->inc("outt");
+    else {
+      mds->logger->inc("outut");
+      mds->logger->fset("oututl", ceph_clock_now() - in->hack_load_stamp);
+    }
+  }
+  */
+
+  // unlink from the parent dentry (if any) and drop the inode
+  if (dn)
+    dn->get_dir()->unlink_inode(dn, false);
+  remove_inode(in);
+  return false;
+}
+
+
+/**
+ * trim_non_auth - remove any non-auth items from our cache
+ *
+ * this reduces the amount of non-auth metadata in our cache, reducing the
+ * load incurred by the rejoin phase.
+ *
+ * the only non-auth items that remain are those that are needed to
+ * attach our own subtrees to the root.
+ *
+ * when we are done, all dentries will be in the top bit of the lru.
+ *
+ * why we have to do this:
+ * we may not have accurate linkage for non-auth items, which means we will
+ * not know which subtree an item falls into, and cannot be sure to declare
+ * it to the correct authority.
+ *
+ * Steps: pin all subtree roots, expire every dentry from bottom_lru and lru
+ * (auth dentries are collected and re-inserted afterwards, non-auth ones are
+ * removed together with their primary inode and its dirfrags), unpin the
+ * subtree roots, and, if both LRUs ended up empty, remove the remaining
+ * non-auth inodes from inode_map.
+ */
+void MDCache::trim_non_auth()
+{
+ dout(7) << "trim_non_auth" << dendl;
+
+ // temporarily pin all subtree roots
+ for (map<CDir*, set<CDir*> >::iterator p = subtrees.begin();
+ p != subtrees.end();
+ ++p)
+ p->first->get(CDir::PIN_SUBTREETEMP);
+
+ list<CDentry*> auth_list; // auth dentries expired from the LRUs, re-inserted below
+
+ // trim non-auth items from the lru
+ for (;;) {
+ CDentry *dn = NULL;
+ if (bottom_lru.lru_get_size() > 0)
+ dn = static_cast<CDentry*>(bottom_lru.lru_expire());
+ if (!dn && lru.lru_get_size() > 0) // bottom_lru yielded nothing: fall back to the main lru
+ dn = static_cast<CDentry*>(lru.lru_expire());
+ if (!dn)
+ break;
+
+ CDentry::linkage_t *dnl = dn->get_linkage();
+
+ if (dn->is_auth()) {
+ // add back into lru (at the top)
+ auth_list.push_back(dn);
+
+ if (dnl->is_remote() && dnl->get_inode() && !dnl->get_inode()->is_auth()) // drop the link to a non-auth remote inode
+ dn->unlink_remote(dnl);
+ } else {
+ // non-auth. expire.
+ CDir *dir = dn->get_dir();
+ ceph_assert(dir);
+
+ // unlink the dentry
+ dout(10) << " removing " << *dn << dendl;
+ if (dnl->is_remote()) {
+ dir->unlink_inode(dn, false);
+ }
+ else if (dnl->is_primary()) {
+ CInode *in = dnl->get_inode();
+ dout(10) << " removing " << *in << dendl;
+ list<CDir*> ls;
+ in->get_dirfrags(ls);
+ for (list<CDir*>::iterator p = ls.begin(); p != ls.end(); ++p) {
+ CDir *subdir = *p;
+ ceph_assert(!subdir->is_subtree_root());
+ in->close_dirfrag(subdir->dirfrag().frag);
+ }
+ dir->unlink_inode(dn, false);
+ remove_inode(in);
+ }
+ else {
+ ceph_assert(dnl->is_null());
+ }
+
+ ceph_assert(!dir->has_bloom());
+ dir->remove_dentry(dn);
+ // adjust the dir state
+ dir->state_clear(CDir::STATE_COMPLETE); // dir incomplete!
+ // close empty non-auth dirfrag
+ if (!dir->is_subtree_root() && dir->get_num_any() == 0)
+ dir->inode->close_dirfrag(dir->get_frag());
+ }
+ }
+
+ for (auto dn : auth_list) { // put the surviving auth dentries back on the list they belong to
+ if (dn->state_test(CDentry::STATE_BOTTOMLRU))
+ bottom_lru.lru_insert_mid(dn);
+ else
+ lru.lru_insert_top(dn);
+ }
+
+ // move everything in the pintail to the top bit of the lru.
+ lru.lru_touch_entire_pintail();
+
+ // unpin all subtrees
+ for (map<CDir*, set<CDir*> >::iterator p = subtrees.begin();
+ p != subtrees.end();
+ ++p)
+ p->first->put(CDir::PIN_SUBTREETEMP);
+
+ if (lru.lru_get_size() == 0 &&
+ bottom_lru.lru_get_size() == 0) {
+ // root, stray, etc.?
+ auto p = inode_map.begin();
+ while (p != inode_map.end()) {
+ CInode *in = p->second;
+ ++p; // advance before the inode is possibly removed below
+ if (!in->is_auth()) {
+ list<CDir*> ls;
+ in->get_dirfrags(ls);
+ for (list<CDir*>::iterator p = ls.begin(); // NOTE: shadows the inode_map iterator p
+ p != ls.end();
+ ++p) {
+ dout(10) << " removing " << **p << dendl;
+ ceph_assert((*p)->get_num_ref() == 1); // SUBTREE
+ remove_subtree((*p));
+ in->close_dirfrag((*p)->dirfrag().frag);
+ }
+ dout(10) << " removing " << *in << dendl;
+ ceph_assert(!in->get_parent_dn());
+ ceph_assert(in->get_num_ref() == 0);
+ remove_inode(in);
+ }
+ }
+ }
+
+ show_subtrees();
+}
+
+/**
+ * Recursively trim the subtree rooted at directory to remove all
+ * CInodes/CDentrys/CDirs that aren't links to remote MDSes, or ancestors
+ * of those links. This is used to clear invalid data out of the cache.
+ * Note that it doesn't clear the passed-in directory, since that's not
+ * always safe.
+ *
+ * Everything that is kept has its AUTH state bit cleared (the dirfrag itself
+ * always, kept dentries and inodes individually).
+ *
+ * @param dir dirfrag whose contents are trimmed
+ * @return true if the caller must keep dir: either
+ *         can_trim_non_auth_dirfrag(dir) was false or dir still holds items
+ */
+bool MDCache::trim_non_auth_subtree(CDir *dir)
+{
+ dout(10) << "trim_non_auth_subtree(" << dir << ") " << *dir << dendl;
+
+ bool keep_dir = !can_trim_non_auth_dirfrag(dir);
+
+ auto j = dir->begin();
+ auto i = j;
+ while (j != dir->end()) {
+ i = j++; // i is the current entry; j is already advanced so removing i is safe
+ CDentry *dn = i->second;
+ dout(10) << "trim_non_auth_subtree(" << dir << ") Checking dentry " << dn << dendl;
+ CDentry::linkage_t *dnl = dn->get_linkage();
+ if (dnl->is_primary()) { // check for subdirectories, etc
+ CInode *in = dnl->get_inode();
+ bool keep_inode = false;
+ if (in->is_dir()) {
+ list<CDir*> subdirs;
+ in->get_dirfrags(subdirs);
+ for (list<CDir*>::iterator subdir = subdirs.begin();
+ subdir != subdirs.end();
+ ++subdir) {
+ if ((*subdir)->is_subtree_root()) { // nested subtree roots are never trimmed here
+ keep_inode = true;
+ dout(10) << "trim_non_auth_subtree(" << dir << ") keeping " << **subdir << dendl;
+ } else {
+ if (trim_non_auth_subtree(*subdir)) // recurse; true means the child dirfrag must stay
+ keep_inode = true;
+ else {
+ in->close_dirfrag((*subdir)->get_frag());
+ dir->state_clear(CDir::STATE_COMPLETE); // now incomplete!
+ }
+ }
+ }
+
+ }
+ if (!keep_inode) { // remove it!
+ dout(20) << "trim_non_auth_subtree(" << dir << ") removing inode " << in << " with dentry" << dn << dendl;
+ dir->unlink_inode(dn, false);
+ remove_inode(in);
+ ceph_assert(!dir->has_bloom());
+ dir->remove_dentry(dn);
+ } else {
+ dout(20) << "trim_non_auth_subtree(" << dir << ") keeping inode " << in << " with dentry " << dn <<dendl;
+ dn->state_clear(CDentry::STATE_AUTH);
+ in->state_clear(CInode::STATE_AUTH);
+ }
+ } else if (keep_dir && dnl->is_null()) { // keep null dentry for slave rollback
+ dout(20) << "trim_non_auth_subtree(" << dir << ") keeping dentry " << dn <<dendl;
+ } else { // just remove it
+ dout(20) << "trim_non_auth_subtree(" << dir << ") removing dentry " << dn << dendl;
+ if (dnl->is_remote())
+ dir->unlink_inode(dn, false);
+ dir->remove_dentry(dn);
+ }
+ }
+ dir->state_clear(CDir::STATE_AUTH);
+ /**
+ * We've now checked all our children and deleted those that need it.
+ * Now return to caller, and tell them if *we're* a keeper.
+ */
+ return keep_dir || dir->get_num_any();
+}
+
+/*
+ * during replay, when we determine a subtree is no longer ours, we
+ * try to trim it from our cache. because subtrees must be connected
+ * to the root, the fact that we can trim this tree may mean that our
+ * children or parents can also be trimmed.
+ *
+ * Order of work: first close empty, trimmable, non-auth bound dirfrags of
+ * dir; then trim dir itself via trim_non_auth_subtree(). If dir has to be
+ * kept, try to merge it with its parent subtree; otherwise close it and
+ * walk up through parent subtrees, stopping at a base inode, at a parent
+ * subtree we are auth for, or at a parent subtree that must be kept.
+ */
+void MDCache::try_trim_non_auth_subtree(CDir *dir)
+{
+ dout(10) << "try_trim_nonauth_subtree " << *dir << dendl;
+
+ // can we now trim child subtrees?
+ set<CDir*> bounds;
+ get_subtree_bounds(dir, bounds);
+ for (set<CDir*>::iterator p = bounds.begin(); p != bounds.end(); ++p) {
+ CDir *bd = *p;
+ if (bd->get_dir_auth().first != mds->get_nodeid() && // we are not auth
+ bd->get_num_any() == 0 && // and empty
+ can_trim_non_auth_dirfrag(bd)) {
+ CInode *bi = bd->get_inode();
+ dout(10) << " closing empty non-auth child subtree " << *bd << dendl;
+ remove_subtree(bd);
+ bd->mark_clean();
+ bi->close_dirfrag(bd->get_frag());
+ }
+ }
+
+ if (trim_non_auth_subtree(dir)) {
+ // keep
+ try_subtree_merge(dir);
+ } else {
+ // can we trim this subtree (and possibly our ancestors) too?
+ while (true) {
+ CInode *diri = dir->get_inode();
+ if (diri->is_base()) { // top of a tree: nothing further up to visit
+ if (!diri->is_root() && diri->authority().first != mds->get_nodeid()) {
+ dout(10) << " closing empty non-auth subtree " << *dir << dendl;
+ remove_subtree(dir);
+ dir->mark_clean();
+ diri->close_dirfrag(dir->get_frag());
+
+ dout(10) << " removing " << *diri << dendl;
+ ceph_assert(!diri->get_parent_dn());
+ ceph_assert(diri->get_num_ref() == 0);
+ remove_inode(diri);
+ }
+ break;
+ }
+
+ CDir *psub = get_subtree_root(diri->get_parent_dir());
+ dout(10) << " parent subtree is " << *psub << dendl;
+ if (psub->get_dir_auth().first == mds->get_nodeid())
+ break; // we are auth, keep.
+
+ dout(10) << " closing empty non-auth subtree " << *dir << dendl;
+ remove_subtree(dir);
+ dir->mark_clean();
+ diri->close_dirfrag(dir->get_frag());
+
+ dout(10) << " parent subtree also non-auth: " << *psub << dendl;
+ if (trim_non_auth_subtree(psub)) // parent must be kept: stop climbing
+ break;
+ dir = psub; // parent was trimmable too: repeat one level up
+ }
+ }
+
+ show_subtrees();
+}
+
+/**
+ * Release everything a log segment still tracks: clear its new_dirfrags and
+ * open_files lists, mark its dirty items clean, drop the truncating pins,
+ * and move every item that became unreferenced to the bottom of the LRU
+ * (via touch_dentry_bottom) so that it is trimmed first.
+ *
+ * @param ls log segment to drain
+ */
+void MDCache::standby_trim_segment(LogSegment *ls)
+{
+  // Queue an inode's parent dentry for trimming when neither the inode nor
+  // the dentry is referenced and the inode is not on an open-file list.
+  auto try_trim_inode = [this](CInode *in) {
+    if (in->get_num_ref() != 0)
+      return;
+    if (in->item_open_file.is_on_list())
+      return;
+    if (in->parent == NULL)
+      return;
+    if (in->parent->get_num_ref() != 0)
+      return;
+    touch_dentry_bottom(in->parent);
+  };
+
+  // Same for a dentry, skipping dentries whose inode is an open file.
+  auto try_trim_dentry = [this](CDentry *dn) {
+    if (dn->get_num_ref() > 0)
+      return;
+    auto linked = dn->get_linkage()->inode;
+    if (linked && linked->item_open_file.is_on_list())
+      return;
+    touch_dentry_bottom(dn);
+  };
+
+  // Apply clean_one to the front item until the list is empty; clean_one is
+  // expected to take the item off the list.
+  auto drain = [](auto &items, auto &&clean_one) {
+    while (!items.empty())
+      clean_one(items.front());
+  };
+
+  ls->new_dirfrags.clear_list();
+  ls->open_files.clear_list();
+
+  drain(ls->dirty_dirfrags, [&](CDir *dir) {
+    dir->mark_clean();
+    if (dir->inode)
+      try_trim_inode(dir->inode);
+  });
+  drain(ls->dirty_inodes, [&](CInode *in) {
+    in->mark_clean();
+    try_trim_inode(in);
+  });
+  drain(ls->dirty_dentries, [&](CDentry *dn) {
+    dn->mark_clean();
+    try_trim_dentry(dn);
+  });
+  drain(ls->dirty_parent_inodes, [&](CInode *in) {
+    in->clear_dirty_parent();
+    try_trim_inode(in);
+  });
+  drain(ls->dirty_dirfrag_dir, [&](CInode *in) {
+    in->filelock.remove_dirty();
+    try_trim_inode(in);
+  });
+  drain(ls->dirty_dirfrag_nest, [&](CInode *in) {
+    in->nestlock.remove_dirty();
+    try_trim_inode(in);
+  });
+  drain(ls->dirty_dirfrag_dirfragtree, [&](CInode *in) {
+    in->dirfragtreelock.remove_dirty();
+    try_trim_inode(in);
+  });
+
+  // truncating_inodes is erased explicitly, one entry at a time
+  while (!ls->truncating_inodes.empty()) {
+    auto first = ls->truncating_inodes.begin();
+    CInode *in = *first;
+    ls->truncating_inodes.erase(first);
+    in->put(CInode::PIN_TRUNCATING);
+    try_trim_inode(in);
+  }
+}
+/**
+ * Handle an MCacheExpire message: a peer tells us which replicas (inodes,
+ * dirfrags, dentries) it has dropped from its cache.
+ *
+ * The message is ignored while our state is below STATE_REJOIN. For each
+ * realm with a container (ino > 0), the expires are stashed in
+ * delayed_expire when the container dirfrag is not auth or is in one of the
+ * export states checked below. Otherwise each item is looked up and, if the
+ * nonce in the message equals the replica nonce we hold for the sender, the
+ * sender is removed from the item's replica set; a different nonce means
+ * the expire is stale and it is dropped. Locks collected along the way that
+ * are not stable are passed to Locker::eval_gather() at the end.
+ *
+ * @param m the expire message; m->get_from() identifies the sending rank
+ */
+void MDCache::handle_cache_expire(const MCacheExpire::const_ref &m)
+{
+ mds_rank_t from = mds_rank_t(m->get_from());
+
+ dout(7) << "cache_expire from mds." << from << dendl;
+
+ if (mds->get_state() < MDSMap::STATE_REJOIN) { // not yet in rejoin: ignore the message
+ return;
+ }
+
+ set<SimpleLock *> gather_locks; // locks to re-evaluate at the end
+ // loop over realms
+ for (const auto &p : m->realms) {
+ // check container?
+ if (p.first.ino > 0) {
+ CInode *expired_inode = get_inode(p.first.ino);
+ ceph_assert(expired_inode); // we had better have this.
+ CDir *parent_dir = expired_inode->get_approx_dirfrag(p.first.frag);
+ ceph_assert(parent_dir);
+
+ int export_state = -1; // -1: container is not being exported by us
+ if (parent_dir->is_auth() && parent_dir->is_exporting()) {
+ export_state = migrator->get_export_state(parent_dir);
+ ceph_assert(export_state >= 0);
+ }
+
+ if (!parent_dir->is_auth() ||
+ (export_state != -1 &&
+ ((export_state == Migrator::EXPORT_WARNING &&
+ migrator->export_has_warned(parent_dir,from)) ||
+ export_state == Migrator::EXPORT_EXPORTING ||
+ export_state == Migrator::EXPORT_LOGGINGFINISH ||
+ (export_state == Migrator::EXPORT_NOTIFYING &&
+ !migrator->export_has_notified(parent_dir,from))))) {
+
+ // not auth.
+ dout(7) << "delaying nonauth|warned expires for " << *parent_dir << dendl;
+ ceph_assert(parent_dir->is_frozen_tree_root());
+
+ // make a message container
+
+ auto em = delayed_expire[parent_dir].emplace(std::piecewise_construct, std::forward_as_tuple(from), std::forward_as_tuple());
+ if (em.second)
+ em.first->second = MCacheExpire::create(from); /* new */
+
+ // merge these expires into it
+ em.first->second->add_realm(p.first, p.second);
+ continue;
+ }
+ ceph_assert(export_state <= Migrator::EXPORT_PREPPING ||
+ (export_state == Migrator::EXPORT_WARNING &&
+ !migrator->export_has_warned(parent_dir, from)));
+
+ dout(7) << "expires for " << *parent_dir << dendl;
+ } else {
+ dout(7) << "containerless expires (root, stray inodes)" << dendl;
+ }
+
+ // INODES
+ for (const auto &q : p.second.inodes) {
+ CInode *in = get_inode(q.first);
+ unsigned nonce = q.second;
+
+ if (!in) {
+ dout(0) << " inode expire on " << q.first << " from " << from
+ << ", don't have it" << dendl;
+ ceph_assert(in); // always fails here; the dout above records which inode was missing
+ }
+ ceph_assert(in->is_auth());
+ dout(20) << __func__ << ": expiring inode " << *in << dendl;
+
+ // check nonce
+ if (nonce == in->get_replica_nonce(from)) {
+ // remove from our cached_by
+ dout(7) << " inode expire on " << *in << " from mds." << from
+ << " cached_by was " << in->get_replicas() << dendl;
+ inode_remove_replica(in, from, false, gather_locks);
+ }
+ else {
+ // this is an old nonce, ignore expire.
+ dout(7) << " inode expire on " << *in << " from mds." << from
+ << " with old nonce " << nonce
+ << " (current " << in->get_replica_nonce(from) << "), dropping"
+ << dendl;
+ }
+ }
+
+ // DIRS
+ for (const auto &q : p.second.dirs) {
+ CDir *dir = get_dirfrag(q.first);
+ unsigned nonce = q.second;
+
+ if (!dir) { // exact dirfrag unknown: check the two tolerated cases before asserting
+ CInode *diri = get_inode(q.first.ino);
+ if (diri) {
+ if (mds->is_rejoin() &&
+ rejoin_ack_gather.count(mds->get_nodeid()) && // haven't sent rejoin ack yet
+ !diri->is_replica(from)) {
+ list<CDir*> ls;
+ diri->get_nested_dirfrags(ls);
+ dout(7) << " dir expire on dirfrag " << q.first << " from mds." << from
+ << " while rejoining, inode isn't replicated" << dendl;
+ for (list<CDir*>::iterator q = ls.begin(); q != ls.end(); ++q) { // NOTE: shadows the outer loop variable q
+ dir = *q;
+ if (dir->is_replica(from)) {
+ dout(7) << " dir expire on " << *dir << " from mds." << from << dendl;
+ dir->remove_replica(from);
+ }
+ }
+ continue;
+ }
+ CDir *other = diri->get_approx_dirfrag(q.first.frag);
+ if (other) {
+ dout(7) << " dir expire on dirfrag " << q.first << " from mds." << from
+ << " have " << *other << ", mismatched frags, dropping" << dendl;
+ continue;
+ }
+ }
+ dout(0) << " dir expire on " << q.first << " from " << from
+ << ", don't have it" << dendl;
+ ceph_assert(dir); // always fails here (dir is null)
+ }
+ dout(20) << __func__ << ": expiring dirfrag " << *dir << dendl;
+
+ ceph_assert(dir->is_auth());
+
+ // check nonce
+ if (nonce == dir->get_replica_nonce(from)) {
+ // remove from our cached_by
+ dout(7) << " dir expire on " << *dir << " from mds." << from
+ << " replicas was " << dir->get_replicas() << dendl;
+ dir->remove_replica(from);
+ }
+ else {
+ // this is an old nonce, ignore expire.
+ dout(7) << " dir expire on " << *dir << " from mds." << from
+ << " with old nonce " << nonce << " (current " << dir->get_replica_nonce(from)
+ << "), dropping" << dendl;
+ }
+ }
+
+ // DENTRIES
+ for (const auto &pd : p.second.dentries) {
+ dout(10) << " dn expires in dir " << pd.first << dendl;
+ CInode *diri = get_inode(pd.first.ino);
+ ceph_assert(diri);
+ CDir *dir = diri->get_dirfrag(pd.first.frag);
+
+ if (!dir) {
+ dout(0) << " dn expires on " << pd.first << " from " << from
+ << ", must have refragmented" << dendl;
+ } else {
+ ceph_assert(dir->is_auth());
+ }
+
+ for (const auto &p : pd.second) { // NOTE: shadows the realm loop variable p
+ unsigned nonce = p.second;
+ CDentry *dn;
+
+ if (dir) {
+ dn = dir->lookup(p.first.first, p.first.second);
+ } else {
+ // which dirfrag for this dentry?
+ CDir *dir = diri->get_dirfrag(diri->pick_dirfrag(p.first.first)); // NOTE: local to this branch; shadows the outer (null) dir
+ ceph_assert(dir);
+ ceph_assert(dir->is_auth());
+ dn = dir->lookup(p.first.first, p.first.second);
+ }
+
+ if (!dn) {
+ if (dir) // refers to the outer dir, which is null in the refragmented case
+ dout(0) << " missing dentry for " << p.first.first << " snap " << p.first.second << " in " << *dir << dendl;
+ else
+ dout(0) << " missing dentry for " << p.first.first << " snap " << p.first.second << dendl;
+ }
+ ceph_assert(dn);
+
+ if (nonce == dn->get_replica_nonce(from)) {
+ dout(7) << " dentry_expire on " << *dn << " from mds." << from << dendl;
+ dentry_remove_replica(dn, from, gather_locks);
+ }
+ else {
+ dout(7) << " dentry_expire on " << *dn << " from mds." << from
+ << " with old nonce " << nonce << " (current " << dn->get_replica_nonce(from)
+ << "), dropping" << dendl;
+ }
+ }
+ }
+ }
+
+ for (set<SimpleLock*>::iterator p = gather_locks.begin(); p != gather_locks.end(); ++p) {
+ if (!(*p)->is_stable()) // only unstable locks are re-evaluated
+ mds->locker->eval_gather(*p);
+ }
+}
+
+/**
+ * Replay the expire messages that handle_cache_expire() stashed for a
+ * dirfrag, then forget them.
+ *
+ * @param dir dirfrag whose delayed expires are processed
+ */
+void MDCache::process_delayed_expire(CDir *dir)
+{
+  dout(7) << "process_delayed_expire on " << *dir << dendl;
+  auto &pending = delayed_expire[dir];
+  for (const auto &entry : pending)
+    handle_cache_expire(entry.second);
+  delayed_expire.erase(dir);
+}
+
+/**
+ * Drop the expire messages stashed for a dirfrag without processing them.
+ *
+ * @param dir dirfrag whose delayed expires are discarded
+ */
+void MDCache::discard_delayed_expire(CDir *dir)
+{
+  dout(7) << "discard_delayed_expire on " << *dir << dendl;
+
+  delayed_expire.erase(dir);
+}
+
+/**
+ * Forget that rank 'from' holds a replica of an inode: remove it from the
+ * inode's replica set, zero its wanted caps, and remove it from each of the
+ * inode's locks.
+ *
+ * @param in           inode losing the replica
+ * @param from         rank whose replica is removed
+ * @param rejoin       passed through to the dirfragtree/file/nest locks'
+ *                     remove_replica()
+ * @param gather_locks receives every lock whose remove_replica() returned
+ *                     true, for the caller to re-evaluate
+ */
+void MDCache::inode_remove_replica(CInode *in, mds_rank_t from, bool rejoin,
+                                   set<SimpleLock *>& gather_locks)
+{
+  in->remove_replica(from);
+  in->set_mds_caps_wanted(from, 0);
+
+  // note: this code calls _eval more often than it needs to!
+  // fix lock
+  auto drop_simple = [&](auto &lock) {
+    if (lock.remove_replica(from))
+      gather_locks.insert(&lock);
+  };
+  drop_simple(in->authlock);
+  drop_simple(in->linklock);
+  drop_simple(in->snaplock);
+  drop_simple(in->xattrlock);
+  drop_simple(in->flocklock);
+  drop_simple(in->policylock);
+
+  // If 'rejoin' is true and the scatter lock is in LOCK_MIX_* state.
+  // Don't remove the recovering mds from lock's gathering list because
+  // it may hold rejoined wrlocks.
+  auto drop_scatter = [&](auto &lock) {
+    if (lock.remove_replica(from, rejoin))
+      gather_locks.insert(&lock);
+  };
+  drop_scatter(in->dirfragtreelock);
+  drop_scatter(in->filelock);
+  drop_scatter(in->nestlock);
+}
+
+/**
+ * Forget that rank 'from' holds a replica of a dentry.
+ *
+ * @param dn           dentry losing the replica
+ * @param from         rank whose replica is removed
+ * @param gather_locks receives dn->lock if its remove_replica() returned true
+ */
+void MDCache::dentry_remove_replica(CDentry *dn, mds_rank_t from, set<SimpleLock *>& gather_locks)
+{
+  dn->remove_replica(from);
+
+  // fix lock
+  const bool lock_changed = dn->lock.remove_replica(from);
+  if (lock_changed)
+    gather_locks.insert(&dn->lock);
+
+  // Replicated strays might now be eligible for purge
+  CDentry::linkage_t *projected = dn->get_projected_linkage();
+  if (!projected->is_primary())
+    return;
+  maybe_eval_stray(projected->get_inode());
+}
+
+/**
+ * Expire client dentry leases whose ttl has passed.
+ *
+ * Each list in client_leases is scanned from the front and scanning stops
+ * at the first lease whose ttl is still in the future. The pool number in
+ * the log line is 1-based.
+ */
+void MDCache::trim_client_leases()
+{
+  utime_t now = ceph_clock_now();
+
+  dout(10) << "trim_client_leases" << dendl;
+
+  std::size_t pool = 0;
+  for (const auto &leases : client_leases) {
+    ++pool;
+    if (leases.empty())
+      continue;
+
+    const auto before = leases.size();
+    while (!leases.empty()) {
+      ClientLease *lease = leases.front();
+      if (lease->ttl > now)
+        break;
+      CDentry *dn = static_cast<CDentry*>(lease->parent);
+      dout(10) << " expiring client." << lease->client << " lease of " << *dn << dendl;
+      dn->remove_client_lease(lease, mds->locker);
+    }
+    const auto after = leases.size();
+    dout(10) << "trim_client_leases pool " << pool << " trimmed "
+             << (before-after) << " leases, " << after << " left" << dendl;
+  }
+}
+
+
+/**
+ * Sample process memory usage, log it together with cap statistics, publish
+ * rss/heap to the memory logger, ask clients to release state when the
+ * cache is too full, and release free tcmalloc memory once the cache is
+ * back within bounds.
+ */
+void MDCache::check_memory_usage()
+{
+  // function-local statics: the first sample taken also becomes the baseline
+  static MemoryModel mm(g_ceph_context);
+  static MemoryModel::snap last;
+  mm.sample(&last);
+  static MemoryModel::snap baseline = last;
+
+  // check client caps
+  ceph_assert(CInode::count() == inode_map.size() + snap_inode_map.size() + num_shadow_inodes);
+  const double caps_per_inode = CInode::count()
+    ? static_cast<double>(Capability::count()) / static_cast<double>(CInode::count())
+    : 0.0;
+
+  dout(2) << "Memory usage: "
+          << " total " << last.get_total()
+          << ", rss " << last.get_rss()
+          << ", heap " << last.get_heap()
+          << ", baseline " << baseline.get_heap()
+          << ", " << num_inodes_with_caps << " / " << CInode::count() << " inodes have caps"
+          << ", " << Capability::count() << " caps, " << caps_per_inode << " caps per inode"
+          << dendl;
+
+  mds->update_mlogger();
+  mds->mlogger->set(l_mdm_rss, last.get_rss());
+  mds->mlogger->set(l_mdm_heap, last.get_heap());
+
+  if (cache_toofull())
+    mds->server->recall_client_state(nullptr, Server::RecallFlags::TRIM);
+
+  // If the cache size had exceeded its limit, but we're back in bounds
+  // now, free any unused pool memory so that our memory usage isn't
+  // permanently bloated.
+  //
+  // Only do this once we are back in bounds: otherwise the releases would
+  // slow down whatever process caused us to exceed bounds to begin with
+  if (!exceeded_size_limit || cache_toofull())
+    return;
+
+  if (ceph_using_tcmalloc()) {
+    dout(5) << "check_memory_usage: releasing unused space from tcmalloc"
+            << dendl;
+    ceph_heap_release_free_memory();
+  }
+  exceeded_size_limit = false;
+}
+
+
+
+// =========================================================================================
+// shutdown
+
+/// Timer callback that runs MDCache::shutdown_check().
+class C_MDC_ShutdownCheck : public MDCacheContext {
+public:
+  explicit C_MDC_ShutdownCheck(MDCache *cache) : MDCacheContext(cache) {}
+
+  // the completion code is ignored
+  void finish(int /*r*/) override
+  {
+    mdcache->shutdown_check();
+  }
+};
+
+ /**
+  * Periodic shutdown diagnostic: dump the cache with debug_mds temporarily
+  * forced to "10", re-arm the timer for the next check, then log LRU sizes,
+  * journal length and any still-active objecter operations.
+  */
+ void MDCache::shutdown_check()
+ {
+   dout(0) << "shutdown_check at " << ceph_clock_now() << dendl;
+
+   // Save the current debug_mds setting so it can be restored after the dump.
+   char saved_level[32] = { 0 };
+   char *saved_ptr = saved_level;
+   g_conf().get_val("debug_mds", &saved_ptr, sizeof(saved_level));
+
+   // Dump the cache at debug level 10, then put the old level back.
+   g_conf().set_val("debug_mds", "10");
+   g_conf().apply_changes(nullptr);
+   show_cache();
+   g_conf().set_val("debug_mds", saved_level);
+   g_conf().apply_changes(nullptr);
+
+   // schedule the next check
+   mds->timer.add_event_after(g_conf()->mds_shutdown_check, new C_MDC_ShutdownCheck(this));
+
+   // state summary
+   dout(0) << "lru size now " << lru.lru_get_size() << "/" << bottom_lru.lru_get_size() << dendl;
+   dout(0) << "log len " << mds->mdlog->get_num_events() << dendl;
+
+   if (!mds->objecter->is_active())
+     return;
+   dout(0) << "objecter still active" << dendl;
+   mds->objecter->dump_active();
+ }
+
+
+ /**
+  * Begin cache shutdown: log it and, when mds_shutdown_check is non-zero,
+  * arm the first C_MDC_ShutdownCheck timer event.
+  */
+ void MDCache::shutdown_start()
+ {
+   dout(5) << "shutdown_start" << dendl;
+
+   // a zero interval means no periodic shutdown check is scheduled
+   if (!g_conf()->mds_shutdown_check)
+     return;
+
+   mds->timer.add_event_after(g_conf()->mds_shutdown_check, new C_MDC_ShutdownCheck(this));
+ }
+
+
+ /**
+ * shutdown_pass -- run one incremental step of cache shutdown.
+ *
+ * Each call advances as far as it can and returns false at the first
+ * condition that still has to settle: strays still being exported, auth
+ * subtrees remaining, unclosed sessions, more than one log segment, a
+ * non-trivial subtree map, replicated or auth-pinned myin/mydir, a
+ * non-empty journal, the journal header write, an active objecter,
+ * a non-empty LRU, or extra references on mydir.
+ *
+ * @return true if the rank is already stopped, or once the mydir subtree,
+ * myin and the global snaprealm inode (whichever exist) have been
+ * removed; false if another pass is needed.
+ */
+ bool MDCache::shutdown_pass()
+ {
+ dout(7) << "shutdown_pass" << dendl;
+
+ if (mds->is_stopped()) {
+ dout(7) << " already shut down" << dendl;
+ show_cache();
+ show_subtrees();
+ return true;
+ }
+
+ // empty stray dir
+ bool strays_all_exported = shutdown_export_strays();
+
+ // trim cache
+ trim(UINT64_MAX);
+ dout(5) << "lru size now " << lru.lru_get_size() << "/" << bottom_lru.lru_get_size() << dendl;
+
+ // Export all subtrees to another active (usually rank 0) if not rank 0
+ int num_auth_subtree = 0; // only incremented inside the non-rank-0 branch below
+ if (!subtrees.empty() &&
+ mds->get_nodeid() != 0) {
+ dout(7) << "looking for subtrees to export to mds0" << dendl;
+ list<CDir*> ls; // auth subtree roots that can be exported right now
+ for (map<CDir*, set<CDir*> >::iterator it = subtrees.begin();
+ it != subtrees.end();
+ ++it) {
+ CDir *dir = it->first;
+ if (dir->get_inode()->is_mdsdir())
+ continue; // mdsdir subtrees are neither counted nor exported here
+ if (dir->is_auth()) {
+ num_auth_subtree++;
+ if (dir->is_frozen() ||
+ dir->is_freezing() ||
+ dir->is_ambiguous_dir_auth() ||
+ dir->state_test(CDir::STATE_EXPORTING))
+ continue; // counted, but not exportable in its current state
+ ls.push_back(dir);
+ }
+ }
+
+ migrator->clear_export_queue();
+ for (list<CDir*>::iterator p = ls.begin(); p != ls.end(); ++p) {
+ CDir *dir = *p;
+ mds_rank_t dest = dir->get_inode()->authority().first;
+ if (dest > 0 && !mds->mdsmap->is_active(dest))
+ dest = 0; // the inode's auth rank is not active: fall back to rank 0
+ dout(7) << "sending " << *dir << " back to mds." << dest << dendl;
+ migrator->export_dir_nicely(dir, dest);
+ }
+ }
+
+ if (!strays_all_exported) {
+ dout(7) << "waiting for strays to migrate" << dendl;
+ return false;
+ }
+
+ if (num_auth_subtree > 0) {
+ ceph_assert(mds->get_nodeid() > 0); // rank 0 never counts auth subtrees above
+ dout(7) << "still have " << num_auth_subtree << " auth subtrees" << dendl;
+ show_subtrees();
+ return false;
+ }
+
+ // close out any sessions (and open files!) before we try to trim the log, etc.
+ if (mds->sessionmap.have_unclosed_sessions()) {
+ if (!mds->server->terminating_sessions)
+ mds->server->terminate_sessions();
+ return false; // retry on a later pass once sessions are closed
+ }
+
+ // Fully trim the log so that all objects in cache are clean and may be
+ // trimmed by a future MDCache::trim. Note that MDSRank::tick does not
+ // trim the log such that the cache eventually becomes clean.
+ if (mds->mdlog->get_num_segments() > 0) {
+ auto ls = mds->mdlog->get_current_segment();
+ if (ls->num_events > 1 || !ls->dirty_dirfrags.empty()) {
+ // Current segment contains events other than subtreemap or
+ // there are dirty dirfrags (see CDir::log_mark_dirty())
+ mds->mdlog->start_new_segment();
+ mds->mdlog->flush();
+ }
+ }
+ mds->mdlog->trim_all();
+ if (mds->mdlog->get_num_segments() > 1) {
+ dout(7) << "still >1 segments, waiting for log to trim" << dendl;
+ return false;
+ }
+
+ // drop our reference to our stray dir inode
+ for (int i = 0; i < NUM_STRAY; ++i) {
+ if (strays[i] &&
+ strays[i]->state_test(CInode::STATE_STRAYPINNED)) {
+ strays[i]->state_clear(CInode::STATE_STRAYPINNED);
+ strays[i]->put(CInode::PIN_STRAY);
+ strays[i]->put_stickydirs();
+ }
+ }
+
+ CDir *mydir = myin ? myin->get_dirfrag(frag_t()) : NULL;
+ if (mydir && !mydir->is_subtree_root())
+ mydir = NULL; // only track mydir when it is a subtree root
+
+ // subtrees map not empty yet?
+ if (subtrees.size() > (mydir ? 1 : 0)) { // mydir's own subtree may remain at this point
+ dout(7) << "still have " << num_subtrees() << " subtrees" << dendl;
+ show_subtrees();
+ migrator->show_importing();
+ migrator->show_exporting();
+ if (!migrator->is_importing() && !migrator->is_exporting())
+ show_cache();
+ return false;
+ }
+ ceph_assert(!migrator->is_exporting());
+ ceph_assert(!migrator->is_importing());
+
+ // replicas may dirty scatter locks
+ if (myin && myin->is_replicated()) {
+ dout(7) << "still have replicated objects" << dendl;
+ return false;
+ }
+
+ if ((myin && myin->get_num_auth_pins()) ||
+ (mydir && (mydir->get_auth_pins() || mydir->get_dir_auth_pins()))) {
+ dout(7) << "still have auth pinned objects" << dendl;
+ return false;
+ }
+
+ // (only do this once!)
+ if (!mds->mdlog->is_capped()) {
+ dout(7) << "capping the log" << dendl;
+ mds->mdlog->cap();
+ }
+
+ if (!mds->mdlog->empty())
+ mds->mdlog->trim(0);
+
+ if (!mds->mdlog->empty()) {
+ dout(7) << "waiting for log to flush.. " << mds->mdlog->get_num_events()
+ << " in " << mds->mdlog->get_num_segments() << " segments" << dendl;
+ return false;
+ }
+
+ if (!did_shutdown_log_cap) {
+ // flush journal header
+ dout(7) << "writing header for (now-empty) journal" << dendl;
+ ceph_assert(mds->mdlog->empty());
+ mds->mdlog->write_head(0);
+ // NOTE: filer active checker below will block us until this completes.
+ did_shutdown_log_cap = true; // later passes skip the header write
+ return false;
+ }
+
+ // filer active?
+ if (mds->objecter->is_active()) {
+ dout(7) << "objecter still active" << dendl;
+ mds->objecter->dump_active();
+ return false;
+ }
+
+ // trim what we can from the cache
+ if (lru.lru_get_size() > 0 || bottom_lru.lru_get_size() > 0) {
+ dout(7) << "there's still stuff in the cache: " << lru.lru_get_size() << "/" << bottom_lru.lru_get_size() << dendl;
+ show_cache();
+ //dump();
+ return false;
+ }
+
+ // make mydir subtree go away
+ if (mydir) {
+ if (mydir->get_num_ref() > 1) { // subtree pin
+ dout(7) << "there's still reference to mydir " << *mydir << dendl;
+ show_cache();
+ return false;
+ }
+
+ remove_subtree(mydir);
+ myin->close_dirfrag(mydir->get_frag());
+ }
+ ceph_assert(subtrees.empty());
+
+ if (myin) {
+ remove_inode(myin);
+ ceph_assert(!myin); // remove_inode() is expected to have reset the myin member
+ }
+
+ if (global_snaprealm) {
+ remove_inode(global_snaprealm->inode);
+ global_snaprealm = nullptr;
+ }
+
+ // done!
+ dout(5) << "shutdown done." << dendl;
+ return true;
+}
+ /**
+ * shutdown_export_strays -- hand this rank's stray dentries over to rank 0.
+ *
+ * Walks the dirfrags of the STRAYPINNED stray inodes, resuming from the
+ * (dirfrag, dentry name) cursor stored in shutdown_export_next. The ino of
+ * every non-null dentry visited is recorded in shutdown_exporting_strays;
+ * dentries that are not in STATE_PURGING are passed to
+ * stray_manager.migrate_stray(). The scan stops early when a dirfrag has
+ * to be fetched, when rank 0 is not active and a non-purging dentry is
+ * reached, or when MAX_EXPORTING inos are in flight.
+ *
+ * @return true on rank 0, or when a scan that started from the first stray
+ * dirfrag finishes with shutdown_exporting_strays empty; false
+ * otherwise.
+ */
+ bool MDCache::shutdown_export_strays()
+ {
+ static const unsigned MAX_EXPORTING = 100;
+
+ if (mds->get_nodeid() == 0)
+ return true;
+
+ if (shutdown_exporting_strays.size() * 3 >= MAX_EXPORTING * 2) // throttle: at least 2/3 of the batch is still in flight
+ return false;
+
+ dout(10) << "shutdown_export_strays " << shutdown_export_next.first
+ << " '" << shutdown_export_next.second << "'" << dendl;
+
+ bool mds0_active = mds->mdsmap->is_active(mds_rank_t(0));
+ bool all_exported = false;
+
+again:
+ auto next = shutdown_export_next; // working copy of the cursor; stored back at done:
+
+ for (int i = 0; i < NUM_STRAY; ++i) {
+ CInode *strayi = strays[i];
+ if (!strayi ||
+ !strayi->state_test(CInode::STATE_STRAYPINNED))
+ continue;
+ if (strayi->ino() < next.first.ino)
+ continue; // this stray inode lies before the cursor
+
+ deque<CDir*> dfls;
+ strayi->get_dirfrags(dfls);
+
+ while (!dfls.empty()) {
+ CDir *dir = dfls.front();
+ dfls.pop_front();
+
+ if (dir->dirfrag() < next.first)
+ continue; // dirfrag lies before the cursor
+ if (next.first < dir->dirfrag()) {
+ next.first = dir->dirfrag(); // moved on to a later dirfrag: restart at its first dentry
+ next.second.clear();
+ }
+
+ if (!dir->is_complete()) {
+ MDSContext *fin = nullptr;
+ if (shutdown_exporting_strays.empty()) { // NOTE(review): presumably in-flight exports re-trigger this function on completion, so a callback is only needed when none exist -- confirm
+ fin = new MDSInternalContextWrapper(mds,
+ new FunctionContext([this](int r) {
+ shutdown_export_strays();
+ })
+ );
+ }
+ dir->fetch(fin);
+ goto done;
+ }
+
+ CDir::dentry_key_map::iterator it;
+ if (next.second.empty()) {
+ it = dir->begin();
+ } else {
+ auto hash = ceph_frag_value(strayi->hash_dentry_name(next.second));
+ it = dir->lower_bound(dentry_key_t(0, next.second, hash)); // resume at the remembered dentry name
+ }
+
+ for (; it != dir->end(); ++it) {
+ CDentry *dn = it->second;
+ CDentry::linkage_t *dnl = dn->get_projected_linkage();
+ if (dnl->is_null())
+ continue;
+
+ if (!mds0_active && !dn->state_test(CDentry::STATE_PURGING)) {
+ next.second = it->first.name; // rank 0 is not active: remember this dentry and stop
+ goto done;
+ }
+
+ auto ret = shutdown_exporting_strays.insert(dnl->get_inode()->ino());
+ if (!ret.second) {
+ dout(10) << "already exporting/purging " << *dn << dendl;
+ continue;
+ }
+
+ // Don't try to migrate anything that is actually
+ // being purged right now
+ if (!dn->state_test(CDentry::STATE_PURGING))
+ stray_manager.migrate_stray(dn, mds_rank_t(0)); // send to root!
+
+ if (shutdown_exporting_strays.size() >= MAX_EXPORTING) { // batch is full: advance the cursor past this dentry and stop
+ ++it;
+ if (it != dir->end()) {
+ next.second = it->first.name;
+ } else {
+ if (dfls.empty())
+ next.first.ino.val++; // no dirfrags left in this stray inode: point the cursor at the next ino
+ else
+ next.first = dfls.front()->dirfrag();
+ next.second.clear();
+ }
+ goto done;
+ }
+ }
+ }
+ }
+
+ if (shutdown_exporting_strays.empty()) { // scan completed with nothing in flight
+ dirfrag_t first_df(MDS_INO_STRAY(mds->get_nodeid(), 0), 0);
+ if (first_df < shutdown_export_next.first ||
+ !shutdown_export_next.second.empty()) {
+ shutdown_export_next.first = first_df; // this scan started mid-way: rewind and scan again from the first stray dirfrag
+ shutdown_export_next.second.clear();
+ goto again;
+ }
+ all_exported = true;
+ }
+
+done:
+ shutdown_export_next = next; // persist the cursor for the next call
+ return all_exported;
+}
+
+// ========= messaging ==============
+ /**
+ * dispatch -- route a cache-related message to its handler.
+ *
+ * Switches on m->get_type(), casts the message to the matching concrete
+ * type with msgref_cast() and calls the corresponding handle_*() method.
+ * An unrecognised type is logged via derr and aborts the process.
+ *
+ * @param m the message to dispatch
+ */
+ void MDCache::dispatch(const Message::const_ref &m)
+ {
+ switch (m->get_type()) {
+
+ // RESOLVE
+ case MSG_MDS_RESOLVE:
+ handle_resolve(MMDSResolve::msgref_cast(m));
+ break;
+ case MSG_MDS_RESOLVEACK:
+ handle_resolve_ack(MMDSResolveAck::msgref_cast(m));
+ break;
+
+ // REJOIN
+ case MSG_MDS_CACHEREJOIN:
+ handle_cache_rejoin(MMDSCacheRejoin::msgref_cast(m));
+ break;
+
+ case MSG_MDS_DISCOVER:
+ handle_discover(MDiscover::msgref_cast(m));
+ break;
+ case MSG_MDS_DISCOVERREPLY:
+ handle_discover_reply(MDiscoverReply::msgref_cast(m));
+ break;
+
+ case MSG_MDS_DIRUPDATE:
+ handle_dir_update(MDirUpdate::msgref_cast(m));
+ break;
+
+ case MSG_MDS_CACHEEXPIRE:
+ handle_cache_expire(MCacheExpire::msgref_cast(m));
+ break;
+
+ case MSG_MDS_DENTRYLINK:
+ handle_dentry_link(MDentryLink::msgref_cast(m));
+ break;
+ case MSG_MDS_DENTRYUNLINK:
+ handle_dentry_unlink(MDentryUnlink::msgref_cast(m));
+ break;
+
+ case MSG_MDS_FRAGMENTNOTIFY:
+ handle_fragment_notify(MMDSFragmentNotify::msgref_cast(m));
+ break;
+ case MSG_MDS_FRAGMENTNOTIFYACK:
+ handle_fragment_notify_ack(MMDSFragmentNotifyAck::msgref_cast(m));
+ break;
+
+ case MSG_MDS_FINDINO:
+ handle_find_ino(MMDSFindIno::msgref_cast(m));
+ break;
+ case MSG_MDS_FINDINOREPLY:
+ handle_find_ino_reply(MMDSFindInoReply::msgref_cast(m));
+ break;
+
+ case MSG_MDS_OPENINO:
+ handle_open_ino(MMDSOpenIno::msgref_cast(m));
+ break;
+ case MSG_MDS_OPENINOREPLY:
+ handle_open_ino_reply(MMDSOpenInoReply::msgref_cast(m));
+ break;
+
+ case MSG_MDS_SNAPUPDATE:
+ handle_snap_update(MMDSSnapUpdate::msgref_cast(m));
+ break;
+
+ default: // any other message type is treated as a fatal error
+ derr << "cache unknown message " << m->get_type() << dendl;
+ ceph_abort_msg("cache unknown message");
+ }
+}
+ /**
+ * path_traverse -- walk a filepath through the cache, one component at a time.
+ *
+ * @param mdr request the traversal is done for; must be set when onfail is
+ * MDS_TRAVERSE_FORWARD, and snapdir components return -EINVAL
+ * without it
+ * @param cf factory for the waiter context that is queued whenever the
+ * traversal has to block on something
+ * @param path path to resolve, starting at the inode path.get_ino()
+ * @param pdnvec if non-null, receives the dentries traversed (it is cleared
+ * on some -ENOENT results)
+ * @param pin if non-null, receives the last inode reached (set to 0 when
+ * the walk ends on a null dentry in DISCOVERXLOCK mode)
+ * @param onfail MDS_TRAVERSE_FORWARD, MDS_TRAVERSE_DISCOVER or
+ * MDS_TRAVERSE_DISCOVERXLOCK: what to do on a non-auth dirfrag
+ * @return 0 on success; 1 if an asynchronous operation was started or a
+ * waiter was queued (retry later); 2 if the request was forwarded;
+ * a negative errno (-ESTALE, -ENOTDIR, -EINVAL, -ENOENT, -EIO) on
+ * failure
+ */
+ int MDCache::path_traverse(MDRequestRef& mdr, MDSContextFactory& cf, // who
+ const filepath& path, // what
+ vector<CDentry*> *pdnvec, // result
+ CInode **pin,
+ int onfail)
+{
+ bool discover = (onfail == MDS_TRAVERSE_DISCOVER);
+ bool null_okay = (onfail == MDS_TRAVERSE_DISCOVERXLOCK);
+ bool forward = (onfail == MDS_TRAVERSE_FORWARD);
+
+ ceph_assert(!forward || mdr); // forward requires a request
+
+ snapid_t snapid = CEPH_NOSNAP;
+ if (mdr)
+ mdr->snapid = snapid;
+
+ client_t client = (mdr && mdr->reqid.name.is_client()) ? mdr->reqid.name.num() : -1; // -1 when the request does not come from a client
+
+ if (mds->logger) mds->logger->inc(l_mds_traverse);
+
+ dout(7) << "traverse: opening base ino " << path.get_ino() << " snap " << snapid << dendl;
+ CInode *cur = get_inode(path.get_ino());
+ if (cur == NULL) {
+ if (MDS_INO_IS_MDSDIR(path.get_ino()))
+ open_foreign_mdsdir(path.get_ino(), cf.build());
+ else {
+ //ceph_abort(); // hrm.. broken
+ return -ESTALE;
+ }
+ return 1; // waiting for the foreign mdsdir to be opened
+ }
+ if (cur->state_test(CInode::STATE_PURGING))
+ return -ESTALE;
+
+ // make sure snaprealm are open...
+ if (mdr && cur->snaprealm && !cur->snaprealm->have_past_parents_open() &&
+ !cur->snaprealm->open_parents(cf.build())) {
+ return 1;
+ }
+
+ // start trace
+ if (pdnvec)
+ pdnvec->clear();
+ if (pin)
+ *pin = cur;
+
+ unsigned depth = 0; // index of the path component currently being resolved
+ while (depth < path.depth()) {
+ dout(12) << "traverse: path seg depth " << depth << " '" << path[depth]
+ << "' snapid " << snapid << dendl;
+
+ if (!cur->is_dir()) {
+ dout(7) << "traverse: " << *cur << " not a dir " << dendl;
+ return -ENOTDIR;
+ }
+
+ // walk into snapdir?
+ if (path[depth].length() == 0) { // an empty component switches the walk into the snapdir
+ dout(10) << "traverse: snapdir" << dendl;
+ if (!mdr)
+ return -EINVAL;
+ snapid = CEPH_SNAPDIR;
+ mdr->snapid = snapid;
+ depth++;
+ continue;
+ }
+ // walk thru snapdir?
+ if (snapid == CEPH_SNAPDIR) { // this component is a snapshot name to resolve
+ if (!mdr)
+ return -EINVAL;
+ SnapRealm *realm = cur->find_snaprealm();
+ snapid = realm->resolve_snapname(path[depth], cur->ino());
+ dout(10) << "traverse: snap " << path[depth] << " -> " << snapid << dendl;
+ if (!snapid) {
+ CInode *t = cur;
+ while (t) { // walk up through the projected parents, checking each snaplock
+ // if snaplock isn't readable, it's possible that other mds is creating
+ // snapshot, but snap update message hasn't been received.
+ if (!t->snaplock.can_read(client)) {
+ dout(10) << " non-readable snaplock on " << *t << dendl;
+ t->snaplock.add_waiter(SimpleLock::WAIT_RD, cf.build());
+ return 1;
+ }
+ CDentry *pdn = t->get_projected_parent_dn();
+ t = pdn ? pdn->get_dir()->get_inode() : NULL;
+ }
+ return -ENOENT;
+ }
+ mdr->snapid = snapid;
+ depth++;
+ continue;
+ }
+
+ // open dir
+ frag_t fg = cur->pick_dirfrag(path[depth]);
+ CDir *curdir = cur->get_dirfrag(fg);
+ if (!curdir) {
+ if (cur->is_auth()) {
+ // parent dir frozen_dir?
+ if (cur->is_frozen()) {
+ dout(7) << "traverse: " << *cur << " is frozen, waiting" << dendl;
+ cur->add_waiter(CDir::WAIT_UNFREEZE, cf.build());
+ return 1;
+ }
+ curdir = cur->get_or_open_dirfrag(this, fg);
+ } else {
+ // discover?
+ dout(10) << "traverse: need dirfrag " << fg << ", doing discover from " << *cur << dendl;
+ discover_path(cur, snapid, path.postfixpath(depth), cf.build(),
+ null_okay);
+ if (mds->logger) mds->logger->inc(l_mds_traverse_discover);
+ return 1;
+ }
+ }
+ ceph_assert(curdir);
+
+#ifdef MDS_VERIFY_FRAGSTAT
+ if (curdir->is_complete())
+ curdir->verify_fragstat();
+#endif
+
+ // frozen?
+ /*
+ if (curdir->is_frozen()) {
+ // doh!
+ // FIXME: traverse is allowed?
+ dout(7) << "traverse: " << *curdir << " is frozen, waiting" << dendl;
+ curdir->add_waiter(CDir::WAIT_UNFREEZE, _get_waiter(mdr, req, fin));
+ if (onfinish) delete onfinish;
+ return 1;
+ }
+ */
+
+ // Before doing dirfrag->dn lookup, compare with DamageTable's
+ // record of which dentries were unreadable
+ if (mds->damage_table.is_dentry_damaged(curdir, path[depth], snapid)) {
+ dout(4) << "traverse: stopped lookup at damaged dentry "
+ << *curdir << "/" << path[depth] << " snap=" << snapid << dendl;
+ return -EIO;
+ }
+
+ // dentry
+ CDentry *dn = curdir->lookup(path[depth], snapid);
+ CDentry::linkage_t *dnl = dn ? dn->get_projected_linkage() : 0;
+
+ // null and last_bit and xlocked by me?
+ if (dnl && dnl->is_null() && null_okay) {
+ dout(10) << "traverse: hit null dentry at tail of traverse, succeeding" << dendl;
+ if (pdnvec)
+ pdnvec->push_back(dn);
+ if (pin)
+ *pin = 0;
+ break; // done!
+ }
+
+ if (dnl &&
+ dn->lock.is_xlocked() &&
+ dn->lock.get_xlock_by() != mdr &&
+ !dn->lock.can_read(client) &&
+ (dnl->is_null() || forward)) { // xlocked by another request and unreadable for this client
+ dout(10) << "traverse: xlocked dentry at " << *dn << dendl;
+ dn->lock.add_waiter(SimpleLock::WAIT_RD, cf.build());
+ if (mds->logger) mds->logger->inc(l_mds_traverse_lock);
+ mds->mdlog->flush();
+ return 1;
+ }
+
+ // can we conclude ENOENT?
+ if (dnl && dnl->is_null()) {
+ if (dn->lock.can_read(client) ||
+ (dn->lock.is_xlocked() && dn->lock.get_xlock_by() == mdr)) {
+ dout(10) << "traverse: miss on null+readable dentry " << path[depth] << " " << *dn << dendl;
+ if (pdnvec) {
+ if (depth == path.depth() - 1)
+ pdnvec->push_back(dn); // the null dentry is only reported for the final component
+ else
+ pdnvec->clear(); // do not confuse likes of rdlock_path_pin_ref();
+ }
+ return -ENOENT;
+ } else {
+ dout(10) << "miss on dentry " << *dn << ", can't read due to lock" << dendl;
+ dn->lock.add_waiter(SimpleLock::WAIT_RD, cf.build());
+ return 1;
+ }
+ }
+
+ if (dnl && !dnl->is_null()) {
+ CInode *in = dnl->get_inode();
+
+ // do we have inode?
+ if (!in) {
+ ceph_assert(dnl->is_remote());
+ // do i have it?
+ in = get_inode(dnl->get_remote_ino());
+ if (in) {
+ dout(7) << "linking in remote in " << *in << dendl;
+ dn->link_remote(dnl, in);
+ } else {
+ dout(7) << "remote link to " << dnl->get_remote_ino() << ", which i don't have" << dendl;
+ ceph_assert(mdr); // we shouldn't hit non-primary dentries doing a non-mdr traversal!
+ if (mds->damage_table.is_remote_damaged(dnl->get_remote_ino())) {
+ dout(4) << "traverse: remote dentry points to damaged ino "
+ << *dn << dendl;
+ return -EIO;
+ }
+ open_remote_dentry(dn, true, cf.build(),
+ (null_okay && depth == path.depth() - 1));
+ if (mds->logger) mds->logger->inc(l_mds_traverse_remote_ino);
+ return 1;
+ }
+ }
+
+ cur = in;
+ // make sure snaprealm are open...
+ if (mdr && cur->snaprealm && !cur->snaprealm->have_past_parents_open() &&
+ !cur->snaprealm->open_parents(cf.build())) {
+ return 1;
+ }
+
+ // add to trace, continue.
+ touch_inode(cur);
+ if (pdnvec)
+ pdnvec->push_back(dn);
+ if (pin)
+ *pin = cur;
+ depth++;
+ continue;
+ }
+
+
+ // MISS. dentry doesn't exist.
+ dout(12) << "traverse: miss on dentry " << path[depth] << " in " << *curdir << dendl;
+
+ if (curdir->is_auth()) {
+ // dentry is mine.
+ if (curdir->is_complete() ||
+ (snapid == CEPH_NOSNAP &&
+ curdir->has_bloom() &&
+ !curdir->is_in_bloom(path[depth]))) { // complete dir, or the bloom filter rules the name out
+ // file not found
+ if (pdnvec) {
+ // instantiate a null dn?
+ if (depth < path.depth()-1){
+ dout(20) << " didn't traverse full path; not returning pdnvec" << dendl;
+ dn = NULL;
+ } else if (dn) {
+ ceph_abort(); // should have fallen out in ->is_null() check above
+ } else if (curdir->is_frozen()) {
+ dout(20) << " not adding null to frozen dir " << dendl;
+ } else if (snapid < CEPH_MAXSNAP) {
+ dout(20) << " not adding null for snapid " << snapid << dendl;
+ } else {
+ // create a null dentry
+ dn = curdir->add_null_dentry(path[depth]);
+ dout(20) << " added null " << *dn << dendl;
+ }
+ if (dn)
+ pdnvec->push_back(dn);
+ else
+ pdnvec->clear(); // do not confuse likes of rdlock_path_pin_ref();
+ }
+ return -ENOENT;
+ } else {
+
+ // Check DamageTable for missing fragments before trying to fetch
+ // this
+ if (mds->damage_table.is_dirfrag_damaged(curdir)) {
+ dout(4) << "traverse: damaged dirfrag " << *curdir
+ << ", blocking fetch" << dendl;
+ return -EIO;
+ }
+
+ // directory isn't complete; reload
+ dout(7) << "traverse: incomplete dir contents for " << *cur << ", fetching" << dendl;
+ touch_inode(cur);
+ curdir->fetch(cf.build(), path[depth]);
+ if (mds->logger) mds->logger->inc(l_mds_traverse_dir_fetch);
+ return 1;
+ }
+ } else {
+ // dirfrag/dentry is not mine.
+ mds_authority_t dauth = curdir->authority();
+
+ if (!forward_all_requests_to_auth &&
+ forward &&
+ mdr && mdr->client_request &&
+ (int)depth < mdr->client_request->get_num_fwd()){ // forwarded more times than our depth: discover instead
+ dout(7) << "traverse: snap " << snapid << " and depth " << depth
+ << " < fwd " << mdr->client_request->get_num_fwd()
+ << ", discovering instead of forwarding" << dendl;
+ discover = true;
+ }
+
+ if ((discover || null_okay)) {
+ dout(7) << "traverse: discover from " << path[depth] << " from " << *curdir << dendl;
+ discover_path(curdir, snapid, path.postfixpath(depth), cf.build(),
+ null_okay);
+ if (mds->logger) mds->logger->inc(l_mds_traverse_discover);
+ return 1;
+ }
+ if (forward) {
+ // forward
+ dout(7) << "traverse: not auth for " << path << " in " << *curdir << dendl;
+
+ if (curdir->is_ambiguous_auth()) {
+ // wait
+ dout(7) << "traverse: waiting for single auth in " << *curdir << dendl;
+ curdir->add_waiter(CDir::WAIT_SINGLEAUTH, cf.build());
+ return 1;
+ }
+
+ dout(7) << "traverse: forwarding, not auth for " << *curdir << dendl;
+
+ request_forward(mdr, dauth.first);
+
+ if (mds->logger) mds->logger->inc(l_mds_traverse_forward);
+ return 2; // request was forwarded to the auth rank
+ }
+ }
+
+ ceph_abort(); // i shouldn't get here
+ }
+
+ // success.
+ if (mds->logger) mds->logger->inc(l_mds_traverse_hit);
+ dout(10) << "path_traverse finish on snapid " << snapid << dendl;
+ if (mdr)
+ ceph_assert(mdr->snapid == snapid);
+ return 0;
+}
+
+ /**
+  * cache_traverse -- resolve a path using only what is already in cache.
+  *
+  * Starts at the inode named by fp.get_ino() (or at root when that is zero)
+  * and follows each path component through the cached dirfrags, looking the
+  * dentry up with CEPH_NOSNAP.
+  *
+  * @param fp path to resolve
+  * @return the inode reached, or NULL as soon as any inode, dirfrag or
+  *         dentry along the way is not available
+  */
+ CInode *MDCache::cache_traverse(const filepath& fp)
+ {
+   dout(10) << "cache_traverse " << fp << dendl;
+
+   CInode *in = fp.get_ino() ? get_inode(fp.get_ino()) : root;
+
+   unsigned idx = 0;
+   while (in && idx < fp.depth()) {
+     std::string_view dname = fp[idx];
+     frag_t fg = in->pick_dirfrag(dname);
+     dout(20) << " " << idx << " " << dname << " frag " << fg << " from " << *in << dendl;
+
+     // a missing dirfrag or dentry leaves `in` null and ends the walk
+     CDir *curdir = in->get_dirfrag(fg);
+     CDentry *dn = curdir ? curdir->lookup(dname, CEPH_NOSNAP) : nullptr;
+     in = dn ? dn->get_linkage()->get_inode() : nullptr;
+     ++idx;
+   }
+
+   if (!in)
+     return nullptr;
+
+   dout(10) << " got " << *in << dendl;
+   return in;
+ }
+
+
+/**
+ * open_remote_dirfrag -- open up a remote dirfrag
+ *
+ * Asserts that diri is a directory, that we are not auth for it, and that
+ * the requested fragment is not already open, then starts a discover for it.
+ *
+ * @param diri base inode
+ * @param approxfg approximate fragment.
+ * @param fin completion callback, handed on to discover_dir_frag()
+ */
+ void MDCache::open_remote_dirfrag(CInode *diri, frag_t approxfg, MDSContext *fin)
+ {
+ dout(10) << "open_remote_dir on " << *diri << dendl;
+ ceph_assert(diri->is_dir());
+ ceph_assert(!diri->is_auth());
+ ceph_assert(diri->get_dirfrag(approxfg) == 0); // the fragment must not be open locally yet
+
+ discover_dir_frag(diri, approxfg, fin);
+}
+
+
+ /**
+  * get_dentry_inode - get or open the inode a dentry refers to
+  *
+  * @param dn the dentry (its linkage must not be null)
+  * @param mdr current request; retried via C_MDS_RetryRequest when we have to wait
+  * @param projected use the projected linkage instead of the current one
+  *
+  * Returns the inode directly for a primary linkage. For a remote linkage the
+  * inode is linked in if it is already cached; otherwise an open is started
+  * and null is returned.
+  */
+ CInode *MDCache::get_dentry_inode(CDentry *dn, MDRequestRef& mdr, bool projected)
+ {
+   CDentry::linkage_t *dnl = projected ? dn->get_projected_linkage() : dn->get_linkage();
+   ceph_assert(!dnl->is_null());
+
+   if (dnl->is_primary())
+     return dnl->inode;
+
+   ceph_assert(dnl->is_remote());
+   CInode *in = get_inode(dnl->get_remote_ino());
+   if (!in) {
+     // not cached: open it and have the request retried when that completes
+     dout(10) << "get_dentry_inode on remote dn, opening inode for " << *dn << dendl;
+     open_remote_dentry(dn, projected, new C_MDS_RetryRequest(this, mdr));
+     return nullptr;
+   }
+
+   dout(7) << "get_dentry_inode linking in remote in " << *in << dendl;
+   dn->link_remote(dnl, in);
+   return in;
+ }
+
+ // Completion for open_remote_dentry(): forwards the result to
+ // MDCache::_open_remote_dentry_finish(). The dentry is pinned with
+ // PIN_PTRWAITER from construction until finish() has run.
+ struct C_MDC_OpenRemoteDentry : public MDCacheContext {
+   CDentry *dn;
+   inodeno_t ino;
+   MDSContext *onfinish;
+   bool want_xlocked;
+
+   C_MDC_OpenRemoteDentry(MDCache *cache, CDentry *dentry, inodeno_t remote_ino,
+                          MDSContext *fin, bool xlocked)
+     : MDCacheContext(cache),
+       dn(dentry),
+       ino(remote_ino),
+       onfinish(fin),
+       want_xlocked(xlocked)
+   {
+     dn->get(MDSCacheObject::PIN_PTRWAITER);
+   }
+
+   void finish(int r) override
+   {
+     mdcache->_open_remote_dentry_finish(dn, ino, onfinish, want_xlocked, r);
+     dn->put(MDSCacheObject::PIN_PTRWAITER);
+   }
+ };
+
+ /**
+  * Start opening the inode a remote dentry points at.
+  *
+  * @param dn the remote dentry
+  * @param projected use the projected linkage instead of the current one
+  * @param fin completed through _open_remote_dentry_finish() once open_ino() is done
+  * @param want_xlocked passed through to open_ino() and to the finish handler
+  */
+ void MDCache::open_remote_dentry(CDentry *dn, bool projected, MDSContext *fin, bool want_xlocked)
+ {
+   dout(10) << "open_remote_dentry " << *dn << dendl;
+
+   CDentry::linkage_t *dnl;
+   if (projected)
+     dnl = dn->get_projected_linkage();
+   else
+     dnl = dn->get_linkage();
+
+   inodeno_t ino = dnl->get_remote_ino();
+
+   // directories get the metadata pool as the pool argument; everything else passes -1
+   int64_t pool = -1;
+   if (dnl->get_remote_d_type() == DT_DIR)
+     pool = mds->mdsmap->get_metadata_pool();
+
+   auto *opened = new C_MDC_OpenRemoteDentry(this, dn, ino, fin, want_xlocked);
+   open_ino(ino, pool, opened, true, want_xlocked); // backtrace
+ }
+
+ /**
+  * Completion half of open_remote_dentry().
+  *
+  * On failure, if the dentry's projected linkage still points at `ino`, the
+  * dentry is flagged STATE_BADREMOTEINO and the damage table is notified
+  * (which may declare the rank damaged). If the linkage has changed in the
+  * meantime the error is dropped.
+  *
+  * @param fin completed with r when a still-matching open failed, else with 0
+  */
+ void MDCache::_open_remote_dentry_finish(CDentry *dn, inodeno_t ino, MDSContext *fin,
+                                          bool want_xlocked, int r)
+ {
+   if (r >= 0) {
+     fin->complete(0);
+     return;
+   }
+
+   CDentry::linkage_t *dnl = dn->get_projected_linkage();
+   const bool still_points_at_ino = dnl->is_remote() && dnl->get_remote_ino() == ino;
+   if (!still_points_at_ino) {
+     // the dentry no longer refers to this ino; the failure is not reported
+     fin->complete(0);
+     return;
+   }
+
+   dout(0) << "open_remote_dentry_finish bad remote dentry " << *dn << dendl;
+   dn->state_set(CDentry::STATE_BADREMOTEINO);
+
+   // build "<dir path>/<dentry name>" for the damage report
+   std::string path;
+   if (CDir *dir = dn->get_dir()) {
+     dir->get_inode()->make_path_string(path);
+     path += "/";
+     path += dn->get_name();
+   }
+
+   if (mds->damage_table.notify_remote_damaged(ino, path)) {
+     mds->damaged();
+     ceph_abort(); // unreachable, damaged() respawns us
+   }
+
+   fin->complete(r);
+ }
+
+
+ /**
+  * Append to `trace` the chain of parent dentries leading to `in`, ordered
+  * from the dentry just below the base inode down to in's own parent dentry.
+  * Nothing is appended when `in` is itself a base inode.
+  */
+ void MDCache::make_trace(vector<CDentry*>& trace, CInode *in)
+ {
+   // Climb towards the base inode, remembering each parent dentry (leaf first).
+   vector<CDentry*> leaf_first;
+   for (CInode *cur = in; !cur->is_base(); ) {
+     CInode *parent = cur->get_parent_inode();
+     ceph_assert(parent);
+     leaf_first.push_back(cur->get_parent_dn());
+     cur = parent;
+   }
+
+   // Emit them in the opposite order, topmost dentry first.
+   for (auto it = leaf_first.rbegin(); it != leaf_first.rend(); ++it) {
+     CDentry *dn = *it;
+     dout(15) << "make_trace adding " << *dn << dendl;
+     trace.push_back(dn);
+   }
+ }
+
+
+// -------------------------------------------------------------------------------
+// Open inode by inode number
+
+ // I/O completion for a backtrace fetch started on behalf of open_ino().
+ // The fetched object data lands in `bl`, which finish() hands to
+ // MDCache::_open_ino_backtrace_fetched() together with the result code.
+ class C_IO_MDC_OpenInoBacktraceFetched : public MDCacheIOContext {
+   inodeno_t ino;
+ public:
+   bufferlist bl; // public so the caller can pass it to fetch_backtrace() as the read target
+   C_IO_MDC_OpenInoBacktraceFetched(MDCache *c, inodeno_t i) :
+     MDCacheIOContext(c), ino(i) {}
+   void finish(int r) override {
+     mdcache->_open_ino_backtrace_fetched(ino, bl, r);
+   }
+   // Describe this context as "openino_backtrace_fetch(<ino>)". The opening
+   // parenthesis was previously missing, leaving an unbalanced ")".
+   void print(ostream& out) const override {
+     out << "openino_backtrace_fetch(" << ino << ")";
+   }
+ };
+
+ // Retry context used while walking an inode's ancestors for open_ino.
+ // With a message attached the result goes back to handle_open_ino();
+ // otherwise the local open tracked in opening_inodes is resumed.
+ struct C_MDC_OpenInoTraverseDir : public MDCacheContext {
+   inodeno_t ino;
+   MMDSOpenIno::const_ref msg;
+   bool parent;
+
+   C_MDC_OpenInoTraverseDir(MDCache *cache, inodeno_t i, const MMDSOpenIno::const_ref &m, bool is_parent)
+     : MDCacheContext(cache), ino(i), msg(m), parent(is_parent)
+   {}
+
+   void finish(int r) override
+   {
+     // an error while not working on the parent is reported as -EAGAIN
+     const int ret = (r < 0 && !parent) ? -EAGAIN : r;
+
+     if (!msg) {
+       auto& info = mdcache->opening_inodes.at(ino);
+       mdcache->_open_ino_traverse_dir(ino, info, ret);
+       return;
+     }
+     mdcache->handle_open_ino(msg, ret);
+   }
+ };
+
+ // Completion that reports the outcome of opening an inode's parent to
+ // MDCache::_open_ino_parent_opened().
+ struct C_MDC_OpenInoParentOpened : public MDCacheContext {
+   inodeno_t ino;
+
+   C_MDC_OpenInoParentOpened(MDCache *cache, inodeno_t i)
+     : MDCacheContext(cache), ino(i)
+   {}
+
+   void finish(int r) override
+   {
+     mdcache->_open_ino_parent_opened(ino, r);
+   }
+ };
+
+ /**
+  * Handle completion of a backtrace fetch issued for open_ino().
+  *
+  * If the inode has appeared in cache in the meantime the open is finished
+  * right away. Otherwise the backtrace is decoded; the fetch is retried from
+  * another pool when the backtrace names a different pool, or from the
+  * metadata pool when the object was not found. A usable backtrace replaces
+  * info.ancestors and the directory traversal is started.
+  *
+  * @param ino inode being opened (must have an entry in opening_inodes)
+  * @param bl raw data read by the fetch
+  * @param err result of the fetch (0, -ENOENT, or another negative errno)
+  */
+ void MDCache::_open_ino_backtrace_fetched(inodeno_t ino, bufferlist& bl, int err)
+ {
+   dout(10) << "_open_ino_backtrace_fetched ino " << ino << " errno " << err << dendl;
+
+   open_ino_info_t& info = opening_inodes.at(ino);
+
+   CInode *in = get_inode(ino);
+   if (in) {
+     dout(10) << " found cached " << *in << dendl;
+     open_ino_finish(ino, info, in->authority().first);
+     return;
+   }
+
+   inode_backtrace_t backtrace;
+   if (err == 0) {
+     try {
+       decode(backtrace, bl);
+     } catch (const buffer::error &decode_exc) {
+       // Print the ino the same way every other message in this function
+       // does. The old text used a transposed "x0" prefix in front of it.
+       derr << "corrupt backtrace on ino " << ino
+            << ": " << decode_exc << dendl;
+       open_ino_finish(ino, info, -EIO);
+       return;
+     }
+     if (backtrace.pool != info.pool && backtrace.pool != -1) {
+       // the object we read is stale; the backtrace names the pool to use
+       dout(10) << " old object in pool " << info.pool
+                << ", retrying pool " << backtrace.pool << dendl;
+       info.pool = backtrace.pool;
+       C_IO_MDC_OpenInoBacktraceFetched *fin =
+         new C_IO_MDC_OpenInoBacktraceFetched(this, ino);
+       fetch_backtrace(ino, info.pool, fin->bl,
+                       new C_OnFinisher(fin, mds->finisher));
+       return;
+     }
+   } else if (err == -ENOENT) {
+     int64_t meta_pool = mds->mdsmap->get_metadata_pool();
+     if (info.pool != meta_pool) {
+       dout(10) << " no object in pool " << info.pool
+                << ", retrying pool " << meta_pool << dendl;
+       info.pool = meta_pool;
+       C_IO_MDC_OpenInoBacktraceFetched *fin =
+         new C_IO_MDC_OpenInoBacktraceFetched(this, ino);
+       fetch_backtrace(ino, info.pool, fin->bl,
+                       new C_OnFinisher(fin, mds->finisher));
+       return;
+     }
+     err = 0; // backtrace.ancestors.empty() is checked below
+   }
+
+   if (err == 0) {
+     if (backtrace.ancestors.empty()) {
+       dout(10) << " got empty backtrace " << dendl;
+       err = -ESTALE;
+     } else if (!info.ancestors.empty()) {
+       // We already tried a backtrace once. Getting the same immediate
+       // parent again means no progress can be made.
+       if (info.ancestors[0] == backtrace.ancestors[0]) {
+         dout(10) << " got same parents " << info.ancestors[0] << " 2 times" << dendl;
+         err = -EINVAL;
+       } else {
+         info.last_err = 0;
+       }
+     }
+   }
+   if (err) {
+     dout(0) << " failed to open ino " << ino << " err " << err << "/" << info.last_err << dendl;
+     // an error remembered from an earlier attempt takes precedence
+     if (info.last_err)
+       err = info.last_err;
+     open_ino_finish(ino, info, err);
+     return;
+   }
+
+   dout(10) << " got backtrace " << backtrace << dendl;
+   info.ancestors = backtrace.ancestors;
+
+   _open_ino_traverse_dir(ino, info, 0);
+ }
+
+ /**
+  * Continue an open_ino() after the attempt to open the inode's parent.
+  *
+  * @param ino inode being opened (must have an entry in opening_inodes)
+  * @param ret our own rank, another rank (>= 0), or a negative errno
+  */
+ void MDCache::_open_ino_parent_opened(inodeno_t ino, int ret)
+ {
+   dout(10) << "_open_ino_parent_opened ino " << ino << " ret " << ret << dendl;
+
+   open_ino_info_t& info = opening_inodes.at(ino);
+
+   if (CInode *in = get_inode(ino)) {
+     dout(10) << " found cached " << *in << dendl;
+     open_ino_finish(ino, info, in->authority().first);
+     return;
+   }
+
+   // our own rank came back: traverse the directories locally
+   if (ret == mds->get_nodeid()) {
+     _open_ino_traverse_dir(ino, info, 0);
+     return;
+   }
+
+   // another rank came back: use it as the auth hint and make sure it gets
+   // (re)checked
+   if (ret >= 0) {
+     mds_rank_t checked_rank = mds_rank_t(ret);
+     info.check_peers = true;
+     info.auth_hint = checked_rank;
+     info.checked.erase(checked_rank);
+   }
+   do_open_ino(ino, info, ret);
+ }
+
+ /**
+  * Drive one round of ancestor traversal for an in-progress open_ino().
+  *
+  * @param ino inode being opened
+  * @param info its open_ino state
+  * @param ret result of the previous step; non-zero is passed straight on to
+  *            do_open_ino()
+  */
+ void MDCache::_open_ino_traverse_dir(inodeno_t ino, open_ino_info_t& info, int ret)
+ {
+   dout(10) << __func__ << ": ino " << ino << " ret " << ret << dendl;
+
+   if (CInode *in = get_inode(ino)) {
+     dout(10) << " found cached " << *in << dendl;
+     open_ino_finish(ino, info, in->authority().first);
+     return;
+   }
+
+   if (ret != 0) {
+     do_open_ino(ino, info, ret);
+     return;
+   }
+
+   mds_rank_t hint = info.auth_hint;
+   const int r = open_ino_traverse_dir(ino, NULL, info.ancestors,
+                                       info.discover, info.want_xlocked, &hint);
+   // positive: a retry context has been queued and will call us again
+   if (r > 0)
+     return;
+
+   // only remember the hint when it names some other rank
+   if (hint != mds->get_nodeid())
+     info.auth_hint = hint;
+   do_open_ino(ino, info, r);
+ }
+
+ /**
+  * Fetch a dirfrag on behalf of an open_ino traversal; a
+  * C_MDC_OpenInoTraverseDir context resumes the traversal when the fetch
+  * completes.
+  *
+  * @param m originating message (may be null), passed on to the retry context
+  * @param parent passed on to the retry context
+  */
+ void MDCache::_open_ino_fetch_dir(inodeno_t ino, const MMDSOpenIno::const_ref &m, CDir *dir, bool parent)
+ {
+   // a REJOINUNDEF dirfrag must be a leaf of its inode's fragtree
+   if (dir->state_test(CDir::STATE_REJOINUNDEF)) {
+     ceph_assert(dir->get_inode()->dirfragtree.is_leaf(dir->get_frag()));
+   }
+
+   auto *retry = new C_MDC_OpenInoTraverseDir(this, ino, m, parent);
+   dir->fetch(retry);
+
+   if (mds->logger) {
+     mds->logger->inc(l_mds_openino_dir_fetch);
+   }
+ }
+ /**
+ * open_ino_traverse_dir -- look for ino by following its backtrace ancestors.
+ *
+ * Iterates over ancestors starting at index 0, skipping entries whose
+ * directory inode is not in cache. The first ancestor found in cache decides
+ * the outcome; the loop never continues past it.
+ *
+ * @param ino inode being opened
+ * @param m originating MMDSOpenIno message (may be null), handed to every
+ * retry context created here
+ * @param ancestors backtrace entries, index 0 being the immediate parent
+ * @param discover whether remote dirfrags/dentries/mdsdirs may be discovered
+ * @param want_xlocked passed to discover_path() for ancestors[0] only
+ * @param hint if non-null and ancestors[0] was the entry examined, receives
+ * authority().first of the dirfrag (or of the directory inode
+ * when no dirfrag is available)
+ * @return 1 if a C_MDC_OpenInoTraverseDir retry context was queued;
+ * otherwise 0 or a negative errno (-ENOTDIR or -ENOENT, which are
+ * only reported for ancestors[0])
+ */
+ int MDCache::open_ino_traverse_dir(inodeno_t ino, const MMDSOpenIno::const_ref &m,
+ const vector<inode_backpointer_t>& ancestors,
+ bool discover, bool want_xlocked, mds_rank_t *hint)
+{
+ dout(10) << "open_ino_traverse_dir ino " << ino << " " << ancestors << dendl;
+ int err = 0;
+ for (unsigned i = 0; i < ancestors.size(); i++) {
+ const auto& ancestor = ancestors.at(i);
+ CInode *diri = get_inode(ancestor.dirino);
+
+ if (!diri) {
+ if (discover && MDS_INO_IS_MDSDIR(ancestor.dirino)) {
+ open_foreign_mdsdir(ancestor.dirino, new C_MDC_OpenInoTraverseDir(this, ino, m, i == 0));
+ return 1;
+ }
+ continue; // not cached: try the next ancestor further up
+ }
+
+ if (diri->state_test(CInode::STATE_REJOINUNDEF)) {
+ CDir *dir = diri->get_parent_dir();
+ while (dir->state_test(CDir::STATE_REJOINUNDEF) &&
+ dir->get_inode()->state_test(CInode::STATE_REJOINUNDEF))
+ dir = dir->get_inode()->get_parent_dir(); // climb while both the dirfrag and its inode are still undefined
+ _open_ino_fetch_dir(ino, m, dir, i == 0);
+ return 1;
+ }
+
+ if (!diri->is_dir()) {
+ dout(10) << " " << *diri << " is not dir" << dendl;
+ if (i == 0)
+ err = -ENOTDIR;
+ break;
+ }
+
+ const string& name = ancestor.dname;
+ frag_t fg = diri->pick_dirfrag(name);
+ CDir *dir = diri->get_dirfrag(fg);
+ if (!dir) {
+ if (diri->is_auth()) {
+ if (diri->is_frozen()) {
+ dout(10) << " " << *diri << " is frozen, waiting " << dendl;
+ diri->add_waiter(CDir::WAIT_UNFREEZE, new C_MDC_OpenInoTraverseDir(this, ino, m, i == 0));
+ return 1;
+ }
+ dir = diri->get_or_open_dirfrag(this, fg);
+ } else if (discover) {
+ open_remote_dirfrag(diri, fg, new C_MDC_OpenInoTraverseDir(this, ino, m, i == 0));
+ return 1;
+ }
+ }
+ if (dir) {
+ inodeno_t next_ino = i > 0 ? ancestors.at(i-1).dirino : ino; // the ino expected under `name`; only used in log messages
+ CDentry *dn = dir->lookup(name);
+ CDentry::linkage_t *dnl = dn ? dn->get_linkage() : NULL;
+ if (dir->is_auth()) {
+ if (dnl && dnl->is_primary() &&
+ dnl->get_inode()->state_test(CInode::STATE_REJOINUNDEF)) {
+ dout(10) << " fetching undef " << *dnl->get_inode() << dendl;
+ _open_ino_fetch_dir(ino, m, dir, i == 0);
+ return 1;
+ }
+
+ if (!dnl && !dir->is_complete() &&
+ (!dir->has_bloom() || dir->is_in_bloom(name))) { // dentry may exist on disk: fetch the dirfrag
+ dout(10) << " fetching incomplete " << *dir << dendl;
+ _open_ino_fetch_dir(ino, m, dir, i == 0);
+ return 1;
+ }
+
+ dout(10) << " no ino " << next_ino << " in " << *dir << dendl;
+ if (i == 0)
+ err = -ENOENT;
+ } else if (discover) {
+ if (!dnl) {
+ filepath path(name, 0);
+ discover_path(dir, CEPH_NOSNAP, path, new C_MDC_OpenInoTraverseDir(this, ino, m, i == 0),
+ (i == 0 && want_xlocked));
+ return 1;
+ }
+ if (dnl->is_null() && !dn->lock.can_read(-1)) {
+ dout(10) << " null " << *dn << " is not readable, waiting" << dendl;
+ dn->lock.add_waiter(SimpleLock::WAIT_RD, new C_MDC_OpenInoTraverseDir(this, ino, m, i == 0));
+ return 1;
+ }
+ dout(10) << " no ino " << next_ino << " in " << *dir << dendl;
+ if (i == 0)
+ err = -ENOENT;
+ }
+ }
+ if (hint && i == 0)
+ *hint = dir ? dir->authority().first : diri->authority().first;
+ break; // the first cached ancestor always ends the loop
+ }
+ return err;
+}
+
+void MDCache::open_ino_finish(inodeno_t ino, open_ino_info_t& info, int ret)
+{
+  // Finish an open_ino operation: drop its entry from opening_inodes and
+  // complete every queued waiter with `ret`.
+  dout(10) << "open_ino_finish ino " << ino << " ret " << ret << dendl;
+
+  // Take the waiters out before erasing. Callers in this file pass the map
+  // entry itself as `info`, so it must not be touched after the erase.
+  MDSContext::vec pending;
+  info.waiters.swap(pending);
+  opening_inodes.erase(ino);
+  finish_contexts(g_ceph_context, pending, ret);
+}
+
+void MDCache::do_open_ino(inodeno_t ino, open_ino_info_t& info, int err)
+{
+  // Pick the next step of an open_ino operation from the flags in `info`:
+  // ask peers, fetch the backtrace, or open the immediate parent.
+  // A negative `err` other than -EAGAIN restarts the whole procedure.
+  const bool failed = (err < 0 && err != -EAGAIN);
+  if (failed) {
+    info.check_peers = true;
+    info.fetch_backtrace = true;
+    info.checking = MDS_RANK_NONE;
+    info.checked.clear();
+    if (info.discover) {
+      info.ancestors.clear();
+      info.discover = false;
+    }
+    // lookup misses are not remembered as the last error
+    const bool lookup_miss = (err == -ENOENT || err == -ENOTDIR);
+    if (!lookup_miss)
+      info.last_err = err;
+  }
+
+  if (info.check_peers || info.discover) {
+    if (info.discover) {
+      // got backtrace from peer, but failed to find inode. re-check peers
+      info.checked.clear();
+      info.ancestors.clear();
+      info.discover = false;
+    }
+    info.checking = MDS_RANK_NONE;
+    info.check_peers = false;
+    do_open_ino_peer(ino, info);
+    return;
+  }
+
+  if (info.fetch_backtrace) {
+    info.checked.clear();
+    info.checking = mds->get_nodeid();
+    info.fetch_backtrace = false;
+    info.check_peers = true;
+    auto *fin = new C_IO_MDC_OpenInoBacktraceFetched(this, ino);
+    fetch_backtrace(ino, info.pool, fin->bl,
+                    new C_OnFinisher(fin, mds->finisher));
+    return;
+  }
+
+  // Neither peers nor a backtrace fetch are pending: open the immediate
+  // parent directory inode named by the first backpointer.
+  ceph_assert(!info.ancestors.empty());
+  info.checking = mds->get_nodeid();
+  open_ino(info.ancestors[0].dirino, mds->mdsmap->get_metadata_pool(),
+           new C_MDC_OpenInoParentOpened(this, ino), info.want_replica);
+}
+
+void MDCache::do_open_ino_peer(inodeno_t ino, open_ino_info_t& info)
+{
+  // Send an MMDSOpenIno to the next peer worth asking: the auth hint if it
+  // is usable, otherwise the first active rank not yet in info.checked.
+  set<mds_rank_t> all, active;
+  mds->mdsmap->get_mds_set(all);
+  const auto min_state = (mds->get_state() == MDSMap::STATE_REJOIN)
+    ? MDSMap::STATE_REJOIN
+    : MDSMap::STATE_CLIENTREPLAY;
+  mds->mdsmap->get_mds_set_lower_bound(active, min_state);
+
+  dout(10) << "do_open_ino_peer " << ino << " active " << active
+           << " all " << all << " checked " << info.checked << dendl;
+
+  const mds_rank_t whoami = mds->get_nodeid();
+  mds_rank_t peer = MDS_RANK_NONE;
+  const bool have_hint = (info.auth_hint >= 0 && info.auth_hint != whoami);
+  if (have_hint) {
+    // the hint is consumed only when that rank is active
+    if (active.count(info.auth_hint)) {
+      peer = info.auth_hint;
+      info.auth_hint = MDS_RANK_NONE;
+    }
+  } else {
+    for (const auto rank : active) {
+      if (rank == whoami || info.checked.count(rank) != 0)
+        continue;
+      peer = rank;
+      break;
+    }
+  }
+
+  if (peer < 0) {
+    all.erase(whoami);
+    if (all == info.checked) {
+      dout(10) << " all MDS peers have been checked " << dendl;
+      do_open_ino(ino, info, 0);
+    } else {
+      dout(10) << " waiting for more peers to be active" << dendl;
+    }
+    return;
+  }
+
+  info.checking = peer;
+  // got backtrace from peer or backtrace just fetched
+  vector<inode_backpointer_t> *pa =
+    (info.discover || !info.fetch_backtrace) ? &info.ancestors : nullptr;
+  mds->send_message_mds(MMDSOpenIno::create(info.tid, ino, pa), peer);
+  if (mds->logger)
+    mds->logger->inc(l_mds_openino_peer_discover);
+}
+
+void MDCache::handle_open_ino(const MMDSOpenIno::const_ref &m, int err)
+{
+  // Answer a peer's MMDSOpenIno. The reply carries either our ancestor chain
+  // (we are auth), an authority hint, or an error code.
+  const bool too_early = mds->get_state() < MDSMap::STATE_REJOIN &&
+                         mds->get_want_state() != CEPH_MDS_STATE_REJOIN;
+  if (too_early)
+    return;
+
+  dout(10) << "handle_open_ino " << *m << " err " << err << dendl;
+
+  const auto from = mds_rank_t(m->get_source().num());
+  const inodeno_t ino = m->ino;
+  MMDSOpenInoReply::ref reply;
+
+  if (CInode *in = get_inode(ino)) {
+    dout(10) << " have " << *in << dendl;
+    reply = MMDSOpenInoReply::create(m->get_tid(), ino, mds_rank_t(0));
+    if (!in->is_auth()) {
+      reply->hint = in->authority().first;
+    } else {
+      touch_inode(in);
+      // walk up the primary parent links, recording one backpointer per level
+      for (CDentry *pdn = in->get_parent_dn(); pdn; pdn = in->get_parent_dn()) {
+        CInode *diri = pdn->get_dir()->get_inode();
+        reply->ancestors.push_back(inode_backpointer_t(diri->ino(), pdn->get_name(),
+                                                       in->inode.version));
+        in = diri;
+      }
+    }
+  } else if (err < 0) {
+    reply = MMDSOpenInoReply::create(m->get_tid(), ino, MDS_RANK_NONE, err);
+  } else {
+    mds_rank_t hint = MDS_RANK_NONE;
+    const int ret = open_ino_traverse_dir(ino, m, m->ancestors, false, false, &hint);
+    if (ret > 0)
+      return;  // an asynchronous step was started; no reply is sent now
+    reply = MMDSOpenInoReply::create(m->get_tid(), ino, hint, ret);
+  }
+
+  mds->send_message_mds(reply, from);
+}
+
+void MDCache::handle_open_ino_reply(const MMDSOpenInoReply::const_ref &m)
+{
+  // Process a peer's answer to our MMDSOpenIno. Replies for unknown inodes
+  // or from a rank we are not currently checking are ignored.
+  dout(10) << "handle_open_ino_reply " << *m << dendl;
+
+  const inodeno_t ino = m->ino;
+  const mds_rank_t from = mds_rank_t(m->get_source().num());
+
+  auto it = opening_inodes.find(ino);
+  if (it == opening_inodes.end() || it->second.checking != from)
+    return;
+
+  open_ino_info_t& info = it->second;
+  info.checking = MDS_RANK_NONE;
+  info.checked.insert(from);
+
+  if (CInode *in = get_inode(ino)) {
+    dout(10) << " found cached " << *in << dendl;
+    open_ino_finish(ino, info, in->authority().first);
+    return;
+  }
+
+  if (!m->ancestors.empty()) {
+    dout(10) << " found ino " << ino << " on mds." << from << dendl;
+    if (!info.want_replica) {
+      open_ino_finish(ino, info, from);
+      return;
+    }
+
+    // a replica is wanted: traverse the returned backtrace, discovering
+    info.ancestors = m->ancestors;
+    info.auth_hint = from;
+    info.checking = mds->get_nodeid();
+    info.discover = true;
+    _open_ino_traverse_dir(ino, info, 0);
+    return;
+  }
+
+  if (m->error) {
+    dout(10) << " error " << m->error << " from mds." << from << dendl;
+    do_open_ino(ino, info, m->error);
+    return;
+  }
+
+  // no result, but possibly a hint about who to ask next
+  if (m->hint >= 0 && m->hint != mds->get_nodeid()) {
+    info.auth_hint = m->hint;
+    info.checked.erase(m->hint);
+  }
+  do_open_ino_peer(ino, info);
+}
+
+void MDCache::kick_open_ino_peers(mds_rank_t who)
+{
+  // Re-drive every open_ino operation that was checking `who` or was not
+  // checking anybody.
+  dout(10) << "kick_open_ino_peers mds." << who << dendl;
+
+  for (auto p = opening_inodes.begin(); p != opening_inodes.end(); ++p) {
+    open_ino_info_t& info = p->second;
+    const bool was_checking = (info.checking == who);
+    if (!was_checking && info.checking != MDS_RANK_NONE)
+      continue;  // busy with some other rank
+
+    if (was_checking) {
+      dout(10) << " kicking ino " << p->first << " who was checking mds." << who << dendl;
+      info.checking = MDS_RANK_NONE;
+    } else {
+      dout(10) << " kicking ino " << p->first << " who was waiting" << dendl;
+    }
+    do_open_ino_peer(p->first, info);
+  }
+}
+
+void MDCache::open_ino(inodeno_t ino, int64_t pool, MDSContext* fin,
+                       bool want_replica, bool want_xlocked)
+{
+  // Start locating/opening `ino`, or join an operation already in progress.
+  // `fin` is queued as a waiter in both cases.
+  dout(10) << "open_ino " << ino << " pool " << pool << " want_replica "
+           << want_replica << dendl;
+
+  const auto existing = opening_inodes.find(ino);
+  if (existing == opening_inodes.end()) {
+    open_ino_info_t& info = opening_inodes[ino];
+    info.tid = ++open_ino_last_tid;
+    info.pool = pool >= 0 ? pool : default_file_layout.pool_id;
+    info.want_replica = want_replica;
+    info.want_xlocked = want_xlocked;
+    info.waiters.push_back(fin);
+
+    // during rejoin the open file table may already supply the ancestors
+    const bool have_ancestors = mds->is_rejoin() &&
+      open_file_table.get_ancestors(ino, info.ancestors, info.auth_hint);
+    if (!have_ancestors) {
+      do_open_ino(ino, info, 0);
+      return;
+    }
+    info.fetch_backtrace = false;
+    info.checking = mds->get_nodeid();
+    _open_ino_traverse_dir(ino, info, 0);
+    return;
+  }
+
+  // already opening: upgrade the request flags if needed
+  open_ino_info_t& info = existing->second;
+  if (want_replica) {
+    info.want_replica = true;
+    if (want_xlocked && !info.want_xlocked) {
+      // if the parent dirfrag is cached and not ours, issue a discover with
+      // want_xlocked set for the dentry
+      CInode *diri = info.ancestors.empty()
+        ? nullptr
+        : get_inode(info.ancestors[0].dirino);
+      CDir *dir = diri
+        ? diri->get_dirfrag(diri->pick_dirfrag(info.ancestors[0].dname))
+        : nullptr;
+      if (dir && !dir->is_auth()) {
+        filepath path(info.ancestors[0].dname, 0);
+        discover_path(dir, CEPH_NOSNAP, path, NULL, true);
+      }
+      info.want_xlocked = true;
+    }
+  }
+  info.waiters.push_back(fin);
+}
+
+/* ---------------------------- */
+
+/*
+ * search for a given inode on MDS peers. optionally start with the given node.
+
+
+ TODO
+ - recover from mds node failure, recovery
+ - traverse path
+
+ */
+void MDCache::find_ino_peers(inodeno_t ino, MDSContext *c, mds_rank_t hint)
+{
+  // Register a new find-ino request (keyed by a fresh tid) and start asking
+  // peers, beginning with `hint` if one was given.
+  dout(5) << "find_ino_peers " << ino << " hint " << hint << dendl;
+
+  CInode *in = get_inode(ino);
+  const bool purging = in && in->state_test(CInode::STATE_PURGING);
+  if (purging) {
+    c->complete(-ESTALE);
+    return;
+  }
+  // any other cached inode is a caller error
+  ceph_assert(!in);
+
+  const ceph_tid_t tid = ++find_ino_peer_last_tid;
+  find_ino_peer_info_t& fip = find_ino_peer[tid];
+  fip.tid = tid;
+  fip.ino = ino;
+  fip.hint = hint;
+  fip.fin = c;
+  _do_find_ino_peer(fip);
+}
+
+void MDCache::_do_find_ino_peer(find_ino_peer_info_t& fip)
+{
+  // Send MMDSFindIno to the next candidate rank. When every rank has been
+  // checked, the request is completed with -ESTALE and erased, so `fip` is
+  // dangling after that path.
+  set<mds_rank_t> all, active;
+  mds->mdsmap->get_mds_set(all);
+  mds->mdsmap->get_mds_set_lower_bound(active, MDSMap::STATE_CLIENTREPLAY);
+
+  dout(10) << "_do_find_ino_peer " << fip.tid << " " << fip.ino
+           << " active " << active << " all " << all
+           << " checked " << fip.checked
+           << dendl;
+
+  mds_rank_t m = MDS_RANK_NONE;
+  if (fip.hint >= 0) {
+    // a hint is used once, then forgotten
+    m = fip.hint;
+    fip.hint = MDS_RANK_NONE;
+  } else {
+    const mds_rank_t whoami = mds->get_nodeid();
+    for (const auto rank : active) {
+      if (rank == whoami || fip.checked.count(rank) != 0)
+        continue;
+      m = rank;
+      break;
+    }
+  }
+
+  if (m != MDS_RANK_NONE) {
+    fip.checking = m;
+    mds->send_message_mds(MMDSFindIno::create(fip.tid, fip.ino), m);
+    return;
+  }
+
+  all.erase(mds->get_nodeid());
+  if (all == fip.checked) {
+    dout(10) << "_do_find_ino_peer failed on " << fip.ino << dendl;
+    fip.fin->complete(-ESTALE);
+    find_ino_peer.erase(fip.tid);
+  } else {
+    dout(10) << "_do_find_ino_peer waiting for more peers to be active" << dendl;
+  }
+}
+
+void MDCache::handle_find_ino(const MMDSFindIno::const_ref &m)
+{
+  // Reply to a peer's MMDSFindIno. The reply path is filled in only when
+  // the inode is in our cache.
+  if (mds->get_state() < MDSMap::STATE_REJOIN)
+    return;
+
+  dout(10) << "handle_find_ino " << *m << dendl;
+
+  auto r = MMDSFindInoReply::create(m->tid);
+  if (CInode *in = get_inode(m->ino)) {
+    in->make_path(r->path);
+    dout(10) << " have " << r->path << " " << *in << dendl;
+  }
+  mds->send_message_mds(r, mds_rank_t(m->get_source().num()));
+}
+
+
+void MDCache::handle_find_ino_reply(const MMDSFindInoReply::const_ref &m)
+{
+  // Handle a peer's MMDSFindInoReply for the request identified by m->tid.
+  const auto p = find_ino_peer.find(m->tid);
+  if (p == find_ino_peer.end()) {
+    dout(10) << "handle_find_ino_reply tid " << m->tid << " dne" << dendl;
+    return;
+  }
+
+  dout(10) << "handle_find_ino_reply " << *m << dendl;
+  find_ino_peer_info_t& fip = p->second;
+
+  // success?
+  if (get_inode(fip.ino)) {
+    dout(10) << "handle_find_ino_reply successfully found " << fip.ino << dendl;
+    mds->queue_waiter(fip.fin);
+    find_ino_peer.erase(p);
+    return;
+  }
+
+  const mds_rank_t from = mds_rank_t(m->get_source().num());
+  if (fip.checking == from)
+    fip.checking = MDS_RANK_NONE;
+  fip.checked.insert(from);
+
+  if (m->path.empty()) {
+    // nope, continue.
+    _do_find_ino_peer(fip);
+    return;
+  }
+
+  // we got a path!
+  vector<CDentry*> trace;
+  CF_MDS_RetryMessageFactory cf(mds, m);
+  MDRequestRef null_ref;
+  const int r = path_traverse(null_ref, cf, m->path, &trace, NULL, MDS_TRAVERSE_DISCOVER);
+  if (r > 0)
+    return;  // traversal is in progress
+  dout(0) << "handle_find_ino_reply failed with " << r << " on " << m->path
+          << ", retrying" << dendl;
+  fip.checked.clear();
+  _do_find_ino_peer(fip);
+}
+
+void MDCache::kick_find_ino_peers(mds_rank_t who)
+{
+  // Re-drive every find_ino_peers request that was checking `who` or was
+  // not checking anybody.
+  //
+  // _do_find_ino_peer() may erase the request it is given from
+  // find_ino_peer once all peers have been checked. The iterator is
+  // therefore advanced *before* the call, so erasing the current entry
+  // cannot invalidate it (std::map::erase only invalidates iterators to
+  // the erased element).
+  for (auto p = find_ino_peer.begin(); p != find_ino_peer.end(); ) {
+    find_ino_peer_info_t& fip = (p++)->second;
+    if (fip.checking == who) {
+      dout(10) << "kicking find_ino_peer " << fip.tid << " who was checking mds." << who << dendl;
+      fip.checking = MDS_RANK_NONE;
+      _do_find_ino_peer(fip);
+    } else if (fip.checking == MDS_RANK_NONE) {
+      dout(10) << "kicking find_ino_peer " << fip.tid << " who was waiting" << dendl;
+      _do_find_ino_peer(fip);
+    }
+  }
+}
+
+/* ---------------------------- */
+
+int MDCache::get_num_client_requests()
+{
+  // Count active requests whose reqid names a client and that are not
+  // slave requests.
+  int count = 0;
+  for (const auto& entry : active_requests) {
+    const MDRequestRef& mdr = entry.second;
+    if (!mdr->reqid.name.is_client())
+      continue;
+    if (mdr->is_slave())
+      continue;
+    ++count;
+  }
+  return count;
+}
+
+MDRequestRef MDCache::request_start(const MClientRequest::const_ref& req)
+{
+  // Register a new client request in active_requests. Returns an empty ref
+  // when a request with the same reqid is already active.
+
+  // did we win a forward race against a slave?
+  const auto existing = active_requests.find(req->get_reqid());
+  if (existing != active_requests.end()) {
+    MDRequestRef& mdr = existing->second;
+    ceph_assert(mdr);
+    if (!mdr->is_slave()) {
+      dout(10) << "request_start already processing " << *mdr << ", dropping new msg" << dendl;
+    } else {
+      dout(10) << "request_start already had " << *mdr << ", waiting for finish" << dendl;
+      mdr->more()->waiting_for_finish.push_back(new C_MDS_RetryMessage(mds, req));
+    }
+    return MDRequestRef();
+  }
+
+  // register new client request
+  MDRequestImpl::Params params;
+  params.client_req = req;
+  params.reqid = req->get_reqid();
+  params.attempt = req->get_num_fwd();
+  // timestamps copied from the message
+  params.initiated = req->get_recv_stamp();
+  params.throttled = req->get_throttle_stamp();
+  params.all_read = req->get_recv_complete_stamp();
+  params.dispatched = req->get_dispatch_stamp();
+
+  MDRequestRef mdr =
+    mds->op_tracker.create_request<MDRequestImpl,MDRequestImpl::Params*>(&params);
+  active_requests[params.reqid] = mdr;
+  mdr->set_op_stamp(req->get_stamp());
+  dout(7) << "request_start " << *mdr << dendl;
+  return mdr;
+}
+
+MDRequestRef MDCache::request_start_slave(metareqid_t ri, __u32 attempt, const Message::const_ref &m)
+{
+  // Register a slave request triggered by message `m`. The sender's number
+  // is recorded as the rank we are slave to.
+  const int by = m->get_source().num();
+
+  MDRequestImpl::Params params;
+  params.triggering_slave_req = m;
+  params.slave_to = by;
+  params.reqid = ri;
+  params.attempt = attempt;
+  // timestamps copied from the message
+  params.initiated = m->get_recv_stamp();
+  params.throttled = m->get_throttle_stamp();
+  params.all_read = m->get_recv_complete_stamp();
+  params.dispatched = m->get_dispatch_stamp();
+
+  MDRequestRef mdr =
+    mds->op_tracker.create_request<MDRequestImpl,MDRequestImpl::Params*>(&params);
+  ceph_assert(active_requests.count(mdr->reqid) == 0);
+  active_requests[mdr->reqid] = mdr;
+  dout(7) << "request_start_slave " << *mdr << " by mds." << by << dendl;
+  return mdr;
+}
+
+MDRequestRef MDCache::request_start_internal(int op)
+{
+  // Register an MDS-internal request for operation `op`. Its reqid is this
+  // rank's entity name plus a freshly issued tid.
+  const utime_t now = ceph_clock_now();
+
+  MDRequestImpl::Params params;
+  params.reqid.name = entity_name_t::MDS(mds->get_nodeid());
+  params.reqid.tid = mds->issue_tid();
+  params.internal_op = op;
+  // there is no message, so every stage timestamp is "now"
+  params.initiated = now;
+  params.throttled = now;
+  params.all_read = now;
+  params.dispatched = now;
+
+  MDRequestRef mdr =
+    mds->op_tracker.create_request<MDRequestImpl,MDRequestImpl::Params*>(&params);
+
+  ceph_assert(active_requests.count(mdr->reqid) == 0);
+  active_requests[mdr->reqid] = mdr;
+  dout(7) << "request_start_internal " << *mdr << " op " << op << dendl;
+  return mdr;
+}
+
+MDRequestRef MDCache::request_get(metareqid_t rid)
+{
+  // Look up an active request by id; asserts that it exists.
+  auto p = active_requests.find(rid);
+  ceph_assert(p != active_requests.end());
+  MDRequestRef& found = p->second;
+  dout(7) << "request_get " << rid << " " << *found << dendl;
+  return found;
+}
+
+void MDCache::request_finish(MDRequestRef& mdr)
+{
+  // Finish a request. If a slave commit context is pending it is run first
+  // (and is expected to call back into request_finish); otherwise the
+  // matching internal-op counter is bumped and the request is cleaned up.
+  dout(7) << "request_finish " << *mdr << dendl;
+  mdr->mark_event("finishing request");
+
+  // slave finisher?
+  if (mdr->has_more() && mdr->more()->slave_commit) {
+    auto *more = mdr->more();
+    Context *fin = more->slave_commit;
+    more->slave_commit = 0;
+
+    int ret = 0;
+    if (mdr->aborted) {
+      mdr->aborted = false;
+      more->slave_rolling_back = true;
+      ret = -1;
+    } else {
+      mdr->committing = true;
+    }
+    fin->complete(ret);   // this must re-call request_finish.
+    return;
+  }
+
+  // per-operation counters; operations without a case are not counted
+  switch (mdr->internal_op) {
+  case CEPH_MDS_OP_FRAGMENTDIR:       logger->inc(l_mdss_ireq_fragmentdir);   break;
+  case CEPH_MDS_OP_EXPORTDIR:         logger->inc(l_mdss_ireq_exportdir);     break;
+  case CEPH_MDS_OP_ENQUEUE_SCRUB:     logger->inc(l_mdss_ireq_enqueue_scrub); break;
+  case CEPH_MDS_OP_FLUSH:             logger->inc(l_mdss_ireq_flush);         break;
+  case CEPH_MDS_OP_REPAIR_FRAGSTATS:  logger->inc(l_mdss_ireq_fragstats);     break;
+  case CEPH_MDS_OP_REPAIR_INODESTATS: logger->inc(l_mdss_ireq_inodestats);    break;
+  }
+
+  request_cleanup(mdr);
+}
+
+
+void MDCache::request_forward(MDRequestRef& mdr, mds_rank_t who, int port)
+{
+  // Forward a client request to rank `who`, or cancel/drop the request when
+  // it did not come from a client; the request is cleaned up in every case.
+  // `port` is not used in this function.
+  CachedStackStringStream css;
+  *css << "forwarding request to mds." << who;
+  mdr->mark_event(css->strv());
+
+  const bool from_client =
+    mdr->client_request && mdr->client_request->get_source().is_client();
+  if (from_client) {
+    dout(7) << "request_forward " << *mdr << " to mds." << who << " req "
+            << *mdr->client_request << dendl;
+    mds->forward_message_mds(mdr->release_client_request(), who);
+    if (mds->logger)
+      mds->logger->inc(l_mds_forward);
+  } else if (mdr->internal_op >= 0) {
+    dout(10) << "request_forward on internal op; cancelling" << dendl;
+    mdr->internal_op_finish->complete(-EXDEV);
+  } else {
+    // NOTE(review): client_request is dereferenced here although this branch
+    // is also reachable when it is null -- confirm callers never do that.
+    dout(7) << "request_forward drop " << *mdr << " req " << *mdr->client_request
+            << " was from mds" << dendl;
+  }
+
+  request_cleanup(mdr);
+}
+
+
+void MDCache::dispatch_request(MDRequestRef& mdr)
+{
+  // Route a request to its handler: client and slave requests go to the
+  // Server, everything else is dispatched on the internal operation code.
+  if (mdr->client_request) {
+    mds->server->dispatch_client_request(mdr);
+    return;
+  }
+  if (mdr->slave_request) {
+    mds->server->dispatch_slave_request(mdr);
+    return;
+  }
+
+  switch (mdr->internal_op) {
+  case CEPH_MDS_OP_FRAGMENTDIR:
+    dispatch_fragment_dir(mdr);
+    break;
+  case CEPH_MDS_OP_EXPORTDIR:
+    migrator->dispatch_export_dir(mdr, 0);
+    break;
+  case CEPH_MDS_OP_ENQUEUE_SCRUB:
+    enqueue_scrub_work(mdr);
+    break;
+  case CEPH_MDS_OP_FLUSH:
+    flush_dentry_work(mdr);
+    break;
+  case CEPH_MDS_OP_REPAIR_FRAGSTATS:
+    repair_dirfrag_stats_work(mdr);
+    break;
+  case CEPH_MDS_OP_REPAIR_INODESTATS:
+    repair_inode_stats_work(mdr);
+    break;
+  case CEPH_MDS_OP_UPGRADE_SNAPREALM:
+    upgrade_inode_snaprealm_work(mdr);
+    break;
+  default:
+    // an unknown internal op is a programming error
+    ceph_abort();
+  }
+}
+
+
+void MDCache::request_drop_foreign_locks(MDRequestRef& mdr)
+{
+  // Tell every slave rank of this request to finish (or abort), then forget
+  // the locks that those slaves held on our behalf. Does nothing when the
+  // request has no "more" state.
+  if (!mdr->has_more())
+    return;
+
+  // clean up slaves
+  // (will implicitly drop remote dn pins)
+  for (const auto slave : mdr->more()->slaves) {
+    auto r = MMDSSlaveRequest::create(mdr->reqid, mdr->attempt,
+                                      MMDSSlaveRequest::OP_FINISH);
+
+    if (mdr->killed && !mdr->committing) {
+      r->mark_abort();
+    } else if (mdr->more()->srcdn_auth_mds == slave &&
+               mdr->more()->inode_import.length() > 0) {
+      // information about rename imported caps
+      r->inode_export.claim(mdr->more()->inode_import);
+    }
+
+    mds->send_message_mds(r, slave);
+  }
+
+  /* strip foreign xlocks out of lock lists, since the OP_FINISH drops them
+   * implicitly. Note that we don't call the finishers -- there shouldn't
+   * be any on a remote lock and the request finish wakes up all
+   * the waiters anyway! */
+
+  for (auto it = mdr->locks.begin(); it != mdr->locks.end(); ) {
+    SimpleLock *lock = it->lock;
+    if (it->is_xlock() && !lock->get_parent()->is_auth()) {
+      // Log the parent object itself (as the remote_wrlock branch below
+      // does) rather than its pointer value.
+      dout(10) << "request_drop_foreign_locks forgetting lock " << *lock
+               << " on " << *lock->get_parent() << dendl;
+      lock->put_xlock();
+      mdr->locks.erase(it++);
+    } else if (it->is_remote_wrlock()) {
+      dout(10) << "request_drop_foreign_locks forgetting remote_wrlock " << *lock
+               << " on mds." << it->wrlock_target << " on " << *lock->get_parent() << dendl;
+      if (it->is_wrlock()) {
+        // a local wrlock is also held: keep the entry, drop only the remote part
+        it->clear_remote_wrlock();
+        ++it;
+      } else {
+        mdr->locks.erase(it++);
+      }
+    } else {
+      ++it;
+    }
+  }
+
+  mdr->more()->slaves.clear(); /* we no longer have requests out to them, and
+                                * leaving them in can cause double-notifies as
+                                * this function can get called more than once */
+}
+
+void MDCache::request_drop_non_rdlocks(MDRequestRef& mdr)
+{
+  // Release the locks held via slaves first, then every local lock that is
+  // not an rdlock.
+  request_drop_foreign_locks(mdr);
+  auto *req = mdr.get();
+  mds->locker->drop_non_rdlocks(req);
+}
+
+void MDCache::request_drop_locks(MDRequestRef& mdr)
+{
+  // Release the locks held via slaves first, then all local locks.
+  request_drop_foreign_locks(mdr);
+  auto *req = mdr.get();
+  mds->locker->drop_locks(req);
+}
+
+void MDCache::request_cleanup(MDRequestRef& mdr)
+{
+  // Tear down a request: wake its finish-waiters, drop locks and pins, and
+  // unregister it from the session and from active_requests. The order of
+  // the steps below is kept exactly as it was.
+  dout(15) << "request_cleanup " << *mdr << dendl;
+
+  if (mdr->has_more()) {
+    if (mdr->more()->is_ambiguous_auth) {
+      mdr->clear_ambiguous_auth();
+    }
+    if (!mdr->more()->waiting_for_finish.empty()) {
+      mds->queue_waiters(mdr->more()->waiting_for_finish);
+    }
+  }
+
+  request_drop_locks(mdr);
+
+  mdr->drop_local_auth_pins();               // drop (local) auth pins
+  mdr->put_stickydirs();                     // drop stickydirs
+
+  mds->locker->kick_cap_releases(mdr);
+
+  mdr->drop_pins();                          // drop cache pins
+  mdr->item_session_request.remove_myself(); // remove from session
+  active_requests.erase(mdr->reqid);         // remove from map
+
+  if (mds->logger) {
+    log_stat();
+  }
+
+  mdr->mark_event("cleaned up request");
+}
+
+void MDCache::request_kill(MDRequestRef& mdr)
+{
+  // Kill a request. Requests that already involve slaves are only detached
+  // from their session and allowed to run to completion.
+
+  // rollback slave requests is tricky. just let the request proceed.
+  const bool slaves_involved = mdr->has_more() &&
+    (!mdr->more()->witnessed.empty() || !mdr->more()->waiting_on_slave.empty());
+  if (slaves_involved) {
+    if (mdr->done_locking) {
+      dout(10) << "request_kill " << *mdr << " -- already started slave prep, no-op" << dendl;
+    } else {
+      ceph_assert(mdr->more()->witnessed.empty());
+      mdr->aborted = true;
+      dout(10) << "request_kill " << *mdr << " -- waiting for slave reply, delaying" << dendl;
+    }
+
+    ceph_assert(mdr->used_prealloc_ino == 0);
+    ceph_assert(mdr->prealloc_inos.empty());
+
+    mdr->session = NULL;
+    mdr->item_session_request.remove_myself();
+    return;
+  }
+
+  mdr->killed = true;
+  mdr->mark_event("killing request");
+
+  if (!mdr->committing) {
+    dout(10) << "request_kill " << *mdr << dendl;
+    request_cleanup(mdr);
+    return;
+  }
+
+  // a committing request cannot be cleaned up yet; only unlink it
+  dout(10) << "request_kill " << *mdr << " -- already committing, remove it from sesssion requests" << dendl;
+  mdr->item_session_request.remove_myself();
+}
+
+// -------------------------------------------------------------------------------
+// SNAPREALMS
+
+void MDCache::create_global_snaprealm()
+{
+  // Create the dummy directory inode MDS_INO_GLOBAL_SNAPREALM, add it to
+  // the cache and remember its snaprealm as the global one.
+  CInode *dummy = new CInode(this);
+  create_unlinked_system_inode(dummy, MDS_INO_GLOBAL_SNAPREALM, S_IFDIR|0755);
+  add_inode(dummy);
+  global_snaprealm = dummy->snaprealm;
+}
+
+void MDCache::do_realm_invalidate_and_update_notify(CInode *in, int snapop, bool notify_clients)
+{ /* Invalidate the cached snaps of in->snaprealm and of every realm reachable
+   * through open_children and, if notify_clients is set, send one MClientSnap
+   * (built from in's snap trace) to each client that has caps in those realms.
+   * For UPDATE/DESTROY the open_past_children of the visited realms, and
+   * their descendants, are invalidated too; for DESTROY their inodes are
+   * handed to maybe_eval_stray(). */
+ dout(10) << "do_realm_invalidate_and_update_notify " << *in->snaprealm << " " << *in << dendl;
+
+ vector<inodeno_t> split_inos;
+ vector<inodeno_t> split_realms;
+
+ if (notify_clients) {
+ ceph_assert(in->snaprealm->have_past_parents_open());
+ if (snapop == CEPH_SNAP_OP_SPLIT) { // the split lists stay empty for every other op
+ // notify clients of update|split
+ for (elist<CInode*>::iterator p = in->snaprealm->inodes_with_caps.begin(member_offset(CInode, item_caps));
+ !p.end(); ++p)
+ split_inos.push_back((*p)->ino());
+
+ for (set<SnapRealm*>::iterator p = in->snaprealm->open_children.begin();
+ p != in->snaprealm->open_children.end();
+ ++p)
+ split_realms.push_back((*p)->inode->ino());
+ }
+ }
+
+ set<SnapRealm*> past_children;
+ map<client_t, MClientSnap::ref> updates;
+ list<SnapRealm*> q; // work queue; drained by the first walk and refilled for the second
+ q.push_back(in->snaprealm);
+ while (!q.empty()) {
+ SnapRealm *realm = q.front();
+ q.pop_front();
+
+ dout(10) << " realm " << *realm << " on " << *realm->inode << dendl;
+ realm->invalidate_cached_snaps();
+
+ if (notify_clients) {
+ for (const auto& p : realm->client_caps) {
+ const auto& client = p.first;
+ const auto& caps = p.second;
+ ceph_assert(!caps->empty());
+
+ auto em = updates.emplace(std::piecewise_construct, std::forward_as_tuple(client), std::forward_as_tuple()); // at most one message per client
+ if (em.second) {
+ auto update = MClientSnap::create(CEPH_SNAP_OP_SPLIT); // NOTE(review): op is SPLIT regardless of snapop -- confirm intended
+ update->head.split = in->ino();
+ update->split_inos = split_inos;
+ update->split_realms = split_realms;
+ update->bl = in->snaprealm->get_snap_trace();
+ em.first->second = std::move(update);
+ }
+ }
+ }
+
+ if (snapop == CEPH_SNAP_OP_UPDATE || snapop == CEPH_SNAP_OP_DESTROY) {
+ for (set<SnapRealm*>::iterator p = realm->open_past_children.begin();
+ p != realm->open_past_children.end();
+ ++p)
+ past_children.insert(*p);
+ }
+
+ // notify for active children, too.
+ dout(10) << " " << realm << " open_children are " << realm->open_children << dendl;
+ for (set<SnapRealm*>::iterator p = realm->open_children.begin();
+ p != realm->open_children.end();
+ ++p)
+ q.push_back(*p);
+ }
+
+ if (notify_clients)
+ send_snaps(updates);
+
+ // notify past children and their descendants if we update/delete old snapshots
+ for (set<SnapRealm*>::iterator p = past_children.begin();
+ p != past_children.end();
+ ++p)
+ q.push_back(*p);
+
+ while (!q.empty()) {
+ SnapRealm *realm = q.front();
+ q.pop_front();
+
+ realm->invalidate_cached_snaps();
+
+ for (set<SnapRealm*>::iterator p = realm->open_children.begin();
+ p != realm->open_children.end();
+ ++p) {
+ if (past_children.count(*p) == 0) // realms in past_children were already queued
+ q.push_back(*p);
+ }
+
+ for (set<SnapRealm*>::iterator p = realm->open_past_children.begin();
+ p != realm->open_past_children.end();
+ ++p) {
+ if (past_children.count(*p) == 0) {
+ q.push_back(*p);
+ past_children.insert(*p); // also remembered for the DESTROY pass below
+ }
+ }
+ }
+
+ if (snapop == CEPH_SNAP_OP_DESTROY) {
+ // eval stray inodes if we delete snapshot from their past ancestor snaprealm
+ for (set<SnapRealm*>::iterator p = past_children.begin();
+ p != past_children.end();
+ ++p)
+ maybe_eval_stray((*p)->inode, true);
+ }
+}
+
+void MDCache::send_snap_update(CInode *in, version_t stid, int snap_op)
+{
+  // Send the encoded snap state of `in` (which we must be auth for) to
+  // other ranks. With a snap table tid (stid > 0) every rank in state
+  // RESOLVE or later except ourselves is targeted and the global snaprealm
+  // update is sent as well; otherwise only the replicas of `in` are.
+  dout(10) << __func__ << " " << *in << " stid " << stid << dendl;
+  ceph_assert(in->is_auth());
+
+  const bool has_stid = (stid > 0);
+
+  set<mds_rank_t> targets;
+  if (has_stid) {
+    mds->mdsmap->get_mds_set_lower_bound(targets, MDSMap::STATE_RESOLVE);
+    targets.erase(mds->get_nodeid());
+  } else {
+    in->list_replicas(targets);
+  }
+
+  if (!targets.empty()) {
+    // encode once, attach to every message
+    bufferlist snap_blob;
+    in->encode_snap(snap_blob);
+
+    for (auto rank : targets) {
+      auto m = MMDSSnapUpdate::create(in->ino(), stid, snap_op);
+      m->snap_blob = snap_blob;
+      mds->send_message_mds(m, rank);
+    }
+  }
+
+  if (has_stid)
+    notify_global_snaprealm_update(snap_op);
+}
+
+void MDCache::handle_snap_update(const MMDSSnapUpdate::const_ref &m)
+{
+  // Apply a snap update received from the auth rank of an inode we
+  // replicate.
+  const mds_rank_t from = mds_rank_t(m->get_source().num());
+  dout(10) << __func__ << " " << *m << " from mds." << from << dendl;
+
+  const bool too_early = mds->get_state() < MDSMap::STATE_RESOLVE &&
+                         mds->get_want_state() != CEPH_MDS_STATE_RESOLVE;
+  if (too_early)
+    return;
+
+  // null rejoin_done means open_snaprealms() has already been called
+  const bool notify_clients = mds->get_state() > MDSMap::STATE_REJOIN ||
+                              (mds->is_rejoin() && !rejoin_done);
+
+  if (m->get_tid() > 0) {
+    mds->snapclient->notify_commit(m->get_tid());
+    if (notify_clients)
+      notify_global_snaprealm_update(m->get_snap_op());
+  }
+
+  CInode *in = get_inode(m->get_ino());
+  if (!in)
+    return;
+
+  ceph_assert(!in->is_auth());
+  const bool can_apply = mds->get_state() > MDSMap::STATE_REJOIN ||
+                         (mds->is_rejoin() && !in->is_rejoining());
+  if (!can_apply)
+    return;
+
+  auto p = m->snap_blob.cbegin();
+  in->decode_snap(p);
+
+  // clients are not notified yet: pin the inode in rejoin_pending_snaprealms
+  if (!notify_clients && !rejoin_pending_snaprealms.count(in)) {
+    in->get(CInode::PIN_OPENINGSNAPPARENTS);
+    rejoin_pending_snaprealms.insert(in);
+  }
+  do_realm_invalidate_and_update_notify(in, m->get_snap_op(), notify_clients);
+}
+
+void MDCache::notify_global_snaprealm_update(int snap_op)
+{
+  // Send the global snaprealm's snap trace to every client session that is
+  // open or stale. Any op other than DESTROY is sent as UPDATE.
+  const int op = (snap_op == CEPH_SNAP_OP_DESTROY) ? snap_op : CEPH_SNAP_OP_UPDATE;
+
+  set<Session*> sessions;
+  mds->sessionmap.get_client_session_set(sessions);
+  for (auto &session : sessions) {
+    if (session->is_open() || session->is_stale()) {
+      auto update = MClientSnap::create(op);
+      update->head.split = global_snaprealm->inode->ino();
+      update->bl = global_snaprealm->get_snap_trace();
+      mds->send_message_client_counted(update, session);
+    }
+  }
+}
+
+// -------------------------------------------------------------------------------
+// STRAYS
+
+// Completion that resumes scan_stray_dir() at the stored dirfrag.
+struct C_MDC_RetryScanStray : public MDCacheContext {
+  dirfrag_t next;  // dirfrag passed back to scan_stray_dir()
+
+  C_MDC_RetryScanStray(MDCache *cache, dirfrag_t resume_at)
+    : MDCacheContext(cache), next(resume_at) {}
+
+  // the result code is ignored; the scan is simply restarted
+  void finish(int r) override {
+    mdcache->scan_stray_dir(next);
+  }
+};
+
+void MDCache::scan_stray_dir(dirfrag_t next)
+{
+  // Walk the stray directories starting at dirfrag `next`, mark every
+  // dentry as stray and evaluate primary-linked inodes. If a dirfrag is
+  // incomplete it is fetched and the scan resumes from it later.
+  dout(10) << "scan_stray_dir " << next << dendl;
+
+  list<CDir*> ls;
+  for (int i = 0; i < NUM_STRAY; ++i) {
+    // stray inodes ordered before the resume point are skipped
+    if (!(strays[i]->ino() < next.ino))
+      strays[i]->get_dirfrags(ls);
+  }
+
+  for (CDir *dir : ls) {
+    if (dir->dirfrag() < next)
+      continue;
+
+    if (!dir->is_complete()) {
+      dir->fetch(new C_MDC_RetryScanStray(this, dir->dirfrag()));
+      return;
+    }
+
+    for (auto &item : dir->items) {
+      CDentry *dn = item.second;
+      dn->state_set(CDentry::STATE_STRAY);
+
+      CDentry::linkage_t *dnl = dn->get_projected_linkage();
+      if (!dnl->is_primary())
+        continue;
+
+      CInode *in = dnl->get_inode();
+      if (in->inode.nlink == 0)
+        in->state_set(CInode::STATE_ORPHAN);
+      maybe_eval_stray(in);
+    }
+  }
+}
+
+void MDCache::fetch_backtrace(inodeno_t ino, int64_t pool, bufferlist& bl, Context *fin)
+{
+  // Read the "parent" xattr of the inode's object in `pool` into `bl`;
+  // `fin` is handed to the objecter as the completion.
+  const object_t oid = CInode::get_object_name(ino, frag_t(), "");
+  const object_locator_t oloc(pool);
+  mds->objecter->getxattr(oid, oloc, "parent", CEPH_NOSNAP, &bl, 0, fin);
+
+  if (mds->logger) {
+    mds->logger->inc(l_mds_openino_backtrace_fetch);
+  }
+}
+
+
+
+
+
+// ========================================================================================
+// DISCOVER
+/*
+
+ - for all discovers (except base_inos, e.g. root, stray), waiters are attached
+ to the parent metadata object in the cache (pinning it).
+
+ - all discovers are tracked by tid, so that we can ignore potentially dup replies.
+
+*/
+
+void MDCache::_send_discover(discover_info_t& d)
+{
+  // Build an MDiscover from the tracked discover state, tag it with the
+  // discover's tid and send it to d.mds.
+  auto msg = MDiscover::create(d.ino, d.frag, d.snap, d.want_path,
+                               d.want_base_dir, d.want_xlocked);
+  msg->set_tid(d.tid);
+  mds->send_message_mds(msg, d.mds);
+}
+
+void MDCache::discover_base_ino(inodeno_t want_ino,
+                                MDSContext *onfinish,
+                                mds_rank_t from)
+{
+  // Ask rank `from` for base inode `want_ino`. A discover is sent only when
+  // nobody is waiting on that (rank, ino) pair yet; `onfinish` is queued
+  // either way.
+  dout(7) << "discover_base_ino " << want_ino << " from mds." << from << dendl;
+
+  auto& waiting = waiting_for_base_ino[from];
+  if (waiting.count(want_ino) == 0) {
+    discover_info_t& d = _create_discover(from);
+    d.ino = want_ino;
+    _send_discover(d);
+  }
+  waiting[want_ino].push_back(onfinish);
+}
+
+
+void MDCache::discover_dir_frag(CInode *base,
+                                frag_t approx_fg,
+                                MDSContext *onfinish,
+                                mds_rank_t from)
+{
+  // Discover the dirfrag `approx_fg` of `base`. A negative `from` means
+  // "ask the inode's authority".
+  if (from < 0)
+    from = base->authority().first;
+
+  const dirfrag_t df(base->ino(), approx_fg);
+  dout(7) << "discover_dir_frag " << df
+          << " from mds." << from << dendl;
+
+  // skip the message only if someone already waits on this frag and we
+  // have a waiter of our own to add
+  const bool already_waiting = base->is_waiting_for_dir(approx_fg);
+  if (!already_waiting || !onfinish) {
+    discover_info_t& d = _create_discover(from);
+    d.pin_base(base);
+    d.ino = base->ino();
+    d.frag = approx_fg;
+    d.want_base_dir = true;
+    _send_discover(d);
+  }
+
+  if (onfinish) {
+    base->add_dir_waiter(approx_fg, onfinish);
+  }
+}
+
+// Completion that re-issues MDCache::discover_path(CInode*, ...) with the
+// stored base inode, snapid, path and target rank, and no waiter.
+struct C_MDC_RetryDiscoverPath : public MDCacheContext {
+  CInode *base;
+  snapid_t snapid;
+  filepath path;
+  mds_rank_t from;
+  C_MDC_RetryDiscoverPath(MDCache *c, CInode *b, snapid_t s, filepath &p, mds_rank_t f) :
+    MDCacheContext(c), base(b), snapid(s), path(p), from(f) {}
+  void finish(int r) override {
+    // discover_path() takes (base, snap, want_path, onfinish, want_xlocked,
+    // from). want_xlocked is not stored here, so it is passed explicitly as
+    // false; otherwise `from` would land in the bool slot and the intended
+    // rank would be dropped.
+    mdcache->discover_path(base, snapid, path, 0, false, from);
+  }
+};
+
+void MDCache::discover_path(CInode *base,
+                            snapid_t snap,
+                            filepath want_path,
+                            MDSContext *onfinish,
+                            bool want_xlocked,
+                            mds_rank_t from)
+{
+  // Discover `want_path` below inode `base`. A negative `from` means "ask
+  // the inode's authority".
+  if (from < 0)
+    from = base->authority().first;
+
+  dout(7) << "discover_path " << base->ino() << " " << want_path << " snap " << snap << " from mds." << from
+          << (want_xlocked ? " want_xlocked":"")
+          << dendl;
+
+  if (base->is_ambiguous_auth()) {
+    // authority is unsettled: retry once there is a single auth
+    dout(10) << " waiting for single auth on " << *base << dendl;
+    MDSContext *waiter = onfinish;
+    if (!waiter)
+      waiter = new C_MDC_RetryDiscoverPath(this, base, snap, want_path, from);
+    base->add_waiter(CInode::WAIT_SINGLEAUTH, waiter);
+    return;
+  }
+
+  if (from == mds->get_nodeid()) {
+    // the target is ourselves: just wake whoever waits on dirs of base
+    MDSContext::vec finished;
+    base->take_waiting(CInode::WAIT_DIR, finished);
+    mds->queue_waiters(finished);
+    return;
+  }
+
+  const frag_t fg = base->pick_dirfrag(want_path[0]);
+  const bool must_send = (want_xlocked && want_path.depth() == 1) ||
+                         !base->is_waiting_for_dir(fg) ||
+                         !onfinish;
+  if (must_send) {
+    discover_info_t& d = _create_discover(from);
+    d.pin_base(base);
+    d.ino = base->ino();
+    d.frag = fg;
+    d.snap = snap;
+    d.want_path = want_path;
+    d.want_xlocked = want_xlocked;
+    d.want_base_dir = true;
+    _send_discover(d);
+  }
+
+  // register + wait
+  if (onfinish) {
+    base->add_dir_waiter(fg, onfinish);
+  }
+}
+
+ /**
+  * Retry context for MDCache::discover_path(CDir*, ...): remembers the
+  * dirfrag, snapid and path and re-issues the discover, without a waiter,
+  * when finished.
+  */
+ struct C_MDC_RetryDiscoverPath2 : public MDCacheContext {
+   CDir *base;
+   snapid_t snapid;
+   filepath path;
+
+   C_MDC_RetryDiscoverPath2(MDCache *cache, CDir *dir, snapid_t snap, filepath &want)
+     : MDCacheContext(cache),
+       base(dir),
+       snapid(snap),
+       path(want)
+   {}
+
+   void finish(int /*r*/) override
+   {
+     mdcache->discover_path(base, snapid, path, 0);
+   }
+ };
+
+ /**
+  * Discover @p want_path underneath the dirfrag @p base, asking the
+  * dirfrag's first authority.
+  *
+  * A discover is sent when an xlocked single-component lookup is requested,
+  * when nobody is yet waiting on the first dentry at @p snap, or when no
+  * waiter was supplied.
+  *
+  * @param base         dirfrag the path is relative to
+  * @param snap         snapid to look up
+  * @param want_path    relative path to discover
+  * @param onfinish     optional waiter, registered on the first dentry
+  * @param want_xlocked ask the peer to allow an xlocked tail dentry
+  */
+ void MDCache::discover_path(CDir *base,
+                             snapid_t snap,
+                             filepath want_path,
+                             MDSContext *onfinish,
+                             bool want_xlocked)
+ {
+   mds_rank_t from = base->authority().first;
+
+   dout(7) << "discover_path " << base->dirfrag() << " " << want_path << " snap " << snap << " from mds." << from
+           << (want_xlocked ? " want_xlocked":"")
+           << dendl;
+
+   // Authority not settled yet: wait for a single auth, then retry.
+   if (base->is_ambiguous_auth()) {
+     dout(7) << " waiting for single auth on " << *base << dendl;
+     MDSContext *retry = onfinish;
+     if (!retry) {
+       retry = new C_MDC_RetryDiscoverPath2(this, base, snap, want_path);
+     }
+     base->add_waiter(CDir::WAIT_SINGLEAUTH, retry);
+     return;
+   }
+
+   // This rank is the authority: wake the dirfrag's sub-waiters instead.
+   if (from == mds->get_nodeid()) {
+     MDSContext::vec woken;
+     base->take_sub_waiting(woken);
+     mds->queue_waiters(woken);
+     return;
+   }
+
+   const bool must_send =
+     (want_xlocked && want_path.depth() == 1) ||
+     !base->is_waiting_for_dentry(want_path[0].c_str(), snap) ||
+     !onfinish;
+   if (must_send) {
+     discover_info_t& req = _create_discover(from);
+     req.ino = base->ino();
+     req.pin_base(base->inode);
+     req.frag = base->get_frag();
+     req.snap = snap;
+     req.want_path = want_path;
+     req.want_base_dir = false;
+     req.want_xlocked = want_xlocked;
+     _send_discover(req);
+   }
+
+   // register + wait
+   if (!onfinish) {
+     return;
+   }
+   base->add_dentry_waiter(want_path[0], snap, onfinish);
+ }
+
+ /**
+  * Re-send every outstanding discover that is addressed to rank @p who.
+  */
+ void MDCache::kick_discovers(mds_rank_t who)
+ {
+   for (auto &entry : discovers) {
+     discover_info_t &pending = entry.second;
+     if (pending.mds == who) {
+       _send_discover(pending);
+     }
+   }
+ }
+
+
+ void MDCache::handle_discover(const MDiscover::const_ref &dis)
+ { /* Serve an MDiscover request sent by another MDS rank.
+    *
+    * Builds an MDiscoverReply whose trace carries replicas of the base
+    * inode and/or the (dir, dentry, inode) chain along dis->get_want(),
+    * stops at the first item that cannot be replicated right now, and sends
+    * the reply back on the connection the request arrived on.
+    *
+    * While the reply is still empty and progress is blocked (frozen
+    * inode/dirfrag, xlocked dentry, dirfrag needs a fetch, or this MDS is
+    * not far enough along in recovery) the request is parked behind a
+    * C_MDS_RetryMessage and the function returns without replying. Once the
+    * reply is non-empty the traversal simply stops and the partial result
+    * is sent.
+    */
+ mds_rank_t whoami = mds->get_nodeid();
+ mds_rank_t from = mds_rank_t(dis->get_source().num());
+
+ ceph_assert(from != whoami); // a discover is never expected from ourselves
+
+ if (mds->get_state() <= MDSMap::STATE_REJOIN) {
+ if (mds->get_state() < MDSMap::STATE_REJOIN &&
+ mds->get_want_state() < CEPH_MDS_STATE_REJOIN) {
+ return; // neither in rejoin nor heading there: request is dropped, no reply
+ }
+
+ // proceed if requester is in the REJOIN stage, the request is from parallel_fetch().
+ // delay processing request from survivor because we may not yet choose lock states.
+ if (!mds->mdsmap->is_rejoin(from)) {
+ dout(0) << "discover_reply not yet active(|still rejoining), delaying" << dendl; // NOTE(review): text says "discover_reply" but this is the request handler -- confirm
+ mds->wait_for_replay(new C_MDS_RetryMessage(mds, dis));
+ return;
+ }
+ }
+
+
+ CInode *cur = 0; // inode the traversal currently stands on
+ auto reply = MDiscoverReply::create(*dis);
+
+ snapid_t snapid = dis->get_snapid();
+
+ // get started.
+ if (MDS_INO_IS_BASE(dis->get_base_ino()) &&
+ !dis->wants_base_dir() && dis->get_want().depth() == 0) {
+ // wants root
+ dout(7) << "handle_discover from mds." << from
+ << " wants base + " << dis->get_want().get_path()
+ << " snap " << snapid
+ << dendl;
+
+ cur = get_inode(dis->get_base_ino());
+ ceph_assert(cur);
+
+ // add root
+ reply->starts_with = MDiscoverReply::INODE;
+ replicate_inode(cur, from, reply->trace, mds->mdsmap->get_up_features());
+ dout(10) << "added base " << *cur << dendl;
+ }
+ else {
+ // there's a base inode
+ cur = get_inode(dis->get_base_ino(), snapid);
+ if (!cur && snapid != CEPH_NOSNAP) { // nothing at that snapid: fall back to the head inode, but only if it is multiversion
+ cur = get_inode(dis->get_base_ino());
+ if (cur && !cur->is_multiversion())
+ cur = NULL; // nope!
+ }
+
+ if (!cur) {
+ dout(7) << "handle_discover mds." << from
+ << " don't have base ino " << dis->get_base_ino() << "." << snapid
+ << dendl;
+ if (!dis->wants_base_dir() && dis->get_want().depth() > 0)
+ reply->set_error_dentry(dis->get_dentry(0));
+ reply->set_flag_error_dir();
+ } else if (dis->wants_base_dir()) {
+ dout(7) << "handle_discover mds." << from
+ << " wants basedir+" << dis->get_want().get_path()
+ << " has " << *cur
+ << dendl;
+ } else {
+ dout(7) << "handle_discover mds." << from
+ << " wants " << dis->get_want().get_path()
+ << " has " << *cur
+ << dendl;
+ }
+ }
+
+ ceph_assert(reply);
+
+ // add content
+ // do some fidgeting to include a dir if they asked for the base dir, or just root.
+ for (unsigned i = 0;
+ cur && (i < dis->get_want().depth() || dis->get_want().depth() == 0); // with an empty path the body is left via break/return, not via this condition
+ i++) {
+
+ // -- figure out the dir
+
+ // is *cur even a dir at all?
+ if (!cur->is_dir()) {
+ dout(7) << *cur << " not a dir" << dendl;
+ reply->set_flag_error_dir();
+ break;
+ }
+
+ // pick frag
+ frag_t fg;
+ if (dis->get_want().depth()) {
+ // dentry specifies
+ fg = cur->pick_dirfrag(dis->get_dentry(i));
+ } else {
+ // requester explicitly specified the frag
+ ceph_assert(dis->wants_base_dir() || MDS_INO_IS_BASE(dis->get_base_ino()));
+ fg = dis->get_base_dir_frag();
+ if (!cur->dirfragtree.is_leaf(fg))
+ fg = cur->dirfragtree[fg.value()]; // requested frag is not a leaf here: map it through our fragtree
+ }
+ CDir *curdir = cur->get_dirfrag(fg);
+
+ if ((!curdir && !cur->is_auth()) ||
+ (curdir && !curdir->is_auth())) { // we are not auth for this dirfrag: at most hint at who is, then stop
+
+ /* before:
+ * ONLY set flag if empty!!
+ * otherwise requester will wake up waiter(s) _and_ continue with discover,
+ * resulting in duplicate discovers in flight,
+ * which can wreak havoc when discovering rename srcdn (which may move)
+ */
+
+ if (reply->is_empty()) {
+ // only hint if empty.
+ // someday this could be better, but right now the waiter logic isn't smart enough.
+
+ // hint
+ if (curdir) {
+ dout(7) << " not dirfrag auth, setting dir_auth_hint for " << *curdir << dendl;
+ reply->set_dir_auth_hint(curdir->authority().first);
+ } else {
+ dout(7) << " dirfrag not open, not inode auth, setting dir_auth_hint for "
+ << *cur << dendl;
+ reply->set_dir_auth_hint(cur->authority().first);
+ }
+
+ // note error dentry, if any
+ // NOTE: important, as it allows requester to issue an equivalent discover
+ // to whomever we hint at.
+ if (dis->get_want().depth() > i)
+ reply->set_error_dentry(dis->get_dentry(i));
+ }
+
+ break;
+ }
+
+ if (!curdir) { // open dir?
+ if (cur->is_frozen()) {
+ if (!reply->is_empty()) {
+ dout(7) << *cur << " is frozen, non-empty reply, stopping" << dendl;
+ break;
+ }
+ dout(7) << *cur << " is frozen, empty reply, waiting" << dendl;
+ cur->add_waiter(CInode::WAIT_UNFREEZE, new C_MDS_RetryMessage(mds, dis));
+ return;
+ }
+ curdir = cur->get_or_open_dirfrag(this, fg);
+ } else if (curdir->is_frozen_tree() ||
+ (curdir->is_frozen_dir() && fragment_are_all_frozen(curdir))) {
+ if (!reply->is_empty()) {
+ dout(7) << *curdir << " is frozen, non-empty reply, stopping" << dendl;
+ break;
+ }
+ if (dis->wants_base_dir() && dis->get_base_dir_frag() != curdir->get_frag()) {
+ dout(7) << *curdir << " is frozen, dirfrag mismatch, stopping" << dendl;
+ reply->set_flag_error_dir();
+ break;
+ }
+ dout(7) << *curdir << " is frozen, empty reply, waiting" << dendl;
+ curdir->add_waiter(CDir::WAIT_UNFREEZE, new C_MDS_RetryMessage(mds, dis));
+ return;
+ }
+
+ // add dir
+ if (curdir->get_version() == 0) {
+ // fetch newly opened dir
+ } else if (reply->is_empty() && !dis->wants_base_dir()) {
+ dout(7) << "handle_discover not adding unwanted base dir " << *curdir << dendl;
+ // make sure the base frag is correct, though, in case there was a refragment since the
+ // original request was sent.
+ reply->set_base_dir_frag(curdir->get_frag());
+ } else {
+ ceph_assert(!curdir->is_ambiguous_auth()); // would be frozen.
+ if (!reply->trace.length())
+ reply->starts_with = MDiscoverReply::DIR;
+ replicate_dir(curdir, from, reply->trace);
+ dout(7) << "handle_discover added dir " << *curdir << dendl;
+ }
+
+ // lookup
+ CDentry *dn = 0;
+ if (curdir->get_version() == 0) {
+ // fetch newly opened dir
+ ceph_assert(!curdir->has_bloom());
+ } else if (dis->get_want().depth() > 0) {
+ // lookup dentry
+ dn = curdir->lookup(dis->get_dentry(i), snapid);
+ } else
+ break; // done!
+
+ // incomplete dir?
+ if (!dn) {
+ if (!curdir->is_complete() &&
+ !(snapid == CEPH_NOSNAP &&
+ curdir->has_bloom() &&
+ !curdir->is_in_bloom(dis->get_dentry(i)))) { // no fetch needed when the bloom filter says the head dentry is absent
+ // readdir
+ dout(7) << "incomplete dir contents for " << *curdir << ", fetching" << dendl;
+ if (reply->is_empty()) {
+ // fetch and wait
+ curdir->fetch(new C_MDS_RetryMessage(mds, dis),
+ dis->wants_base_dir() && curdir->get_version() == 0);
+ return;
+ } else {
+ // initiate fetch, but send what we have so far
+ curdir->fetch(0);
+ break;
+ }
+ }
+
+ if (snapid != CEPH_NOSNAP && !reply->is_empty()) {
+ dout(7) << "dentry " << dis->get_dentry(i) << " snap " << snapid
+ << " dne, non-empty reply, stopping" << dendl;
+ break;
+ }
+
+ // send null dentry
+ dout(7) << "dentry " << dis->get_dentry(i) << " dne, returning null in "
+ << *curdir << dendl;
+ if (snapid == CEPH_NOSNAP)
+ dn = curdir->add_null_dentry(dis->get_dentry(i));
+ else
+ dn = curdir->add_null_dentry(dis->get_dentry(i), snapid, snapid);
+ }
+ ceph_assert(dn);
+
+ // don't add replica to purging dentry/inode
+ if (dn->state_test(CDentry::STATE_PURGING)) {
+ if (reply->is_empty())
+ reply->set_flag_error_dn(dis->get_dentry(i));
+ break;
+ }
+
+ CDentry::linkage_t *dnl = dn->get_linkage();
+
+ // xlocked dentry?
+ // ...always block on non-tail items (they are unrelated)
+ // ...allow xlocked tail discovery _only_ if explicitly requested
+ bool tailitem = (dis->get_want().depth() == 0) || (i == dis->get_want().depth() - 1);
+ if (dn->lock.is_xlocked()) {
+ // is this the last (tail) item in the discover traversal?
+ if (tailitem && dis->wants_xlocked()) {
+ dout(7) << "handle_discover allowing discovery of xlocked tail " << *dn << dendl;
+ } else if (reply->is_empty()) {
+ dout(7) << "handle_discover blocking on xlocked " << *dn << dendl;
+ dn->lock.add_waiter(SimpleLock::WAIT_RD, new C_MDS_RetryMessage(mds, dis));
+ return;
+ } else {
+ dout(7) << "handle_discover non-empty reply, xlocked tail " << *dn << dendl;
+ break;
+ }
+ }
+
+ // frozen inode?
+ if (dnl->is_primary() && dnl->get_inode()->is_frozen_inode()) {
+ if (tailitem && dis->wants_xlocked()) {
+ dout(7) << "handle_discover allowing discovery of frozen tail " << *dnl->get_inode() << dendl;
+ } else if (reply->is_empty()) {
+ dout(7) << *dnl->get_inode() << " is frozen, empty reply, waiting" << dendl;
+ dnl->get_inode()->add_waiter(CDir::WAIT_UNFREEZE, new C_MDS_RetryMessage(mds, dis)); // NOTE(review): CDir:: constant used on an inode; presumably the same value as CInode::WAIT_UNFREEZE -- confirm
+ return;
+ } else {
+ dout(7) << *dnl->get_inode() << " is frozen, non-empty reply, stopping" << dendl;
+ break;
+ }
+ }
+
+ // add dentry
+ if (!reply->trace.length())
+ reply->starts_with = MDiscoverReply::DENTRY;
+ replicate_dentry(dn, from, reply->trace);
+ dout(7) << "handle_discover added dentry " << *dn << dendl;
+
+ if (!dnl->is_primary()) break; // stop on null or remote link.
+
+ // add inode
+ CInode *next = dnl->get_inode();
+ ceph_assert(next->is_auth());
+
+ replicate_inode(next, from, reply->trace, mds->mdsmap->get_up_features());
+ dout(7) << "handle_discover added inode " << *next << dendl;
+
+ // descend, keep going.
+ cur = next;
+ continue;
+ }
+
+ // how did we do?
+ ceph_assert(!reply->is_empty());
+ dout(7) << "handle_discover sending result back to asker mds." << from << dendl;
+ mds->send_message(reply, dis->get_connection());
+ }
+
+ void MDCache::handle_discover_reply(const MDiscoverReply::const_ref &m)
+ { /* Apply an MDiscoverReply received from another MDS.
+    *
+    * Steps, in order:
+    *  1. forget the matching entry in `discovers` (by tid), if any;
+    *  2. decode the replicas in m->trace into the cache: an optional base
+    *     inode first, then repeating dir / dentry / inode items;
+    *  3. act on the error flags and the dir_auth hint, either waking
+    *     waiters or re-issuing the discover elsewhere;
+    *  4. finish the waiters collected in `error` with -ENOENT and queue
+    *     the ones collected in `finished`.
+    */
+ /*
+ if (mds->get_state() < MDSMap::STATE_ACTIVE) {
+ dout(0) << "discover_reply NOT ACTIVE YET" << dendl;
+ return;
+ }
+ */
+ dout(7) << "discover_reply " << *m << dendl;
+ if (m->is_flag_error_dir())
+ dout(7) << " flag error, dir" << dendl;
+ if (m->is_flag_error_dn())
+ dout(7) << " flag error, dentry = " << m->get_error_dentry() << dendl;
+
+ MDSContext::vec finished, error; // finished: queued at the end; error: completed with -ENOENT
+ mds_rank_t from = mds_rank_t(m->get_source().num());
+
+ // starting point
+ CInode *cur = get_inode(m->get_base_ino());
+ auto p = m->trace.cbegin();
+
+ int next = m->starts_with; // kind of item expected next in the trace
+
+ // decrement discover counters
+ if (m->get_tid()) {
+ map<ceph_tid_t,discover_info_t>::iterator p = discovers.find(m->get_tid()); // note: shadows the trace iterator `p` inside this block only
+ if (p != discovers.end()) {
+ dout(10) << " found tid " << m->get_tid() << dendl;
+ discovers.erase(p);
+ } else {
+ dout(10) << " tid " << m->get_tid() << " not found, must be dup reply" << dendl;
+ }
+ }
+
+ // discover may start with an inode
+ if (!p.end() && next == MDiscoverReply::INODE) {
+ cur = add_replica_inode(p, NULL, finished);
+ dout(7) << "discover_reply got base inode " << *cur << dendl;
+ ceph_assert(cur->is_base());
+
+ next = MDiscoverReply::DIR;
+
+ // take waiters?
+ if (cur->is_base() &&
+ waiting_for_base_ino[from].count(cur->ino())) {
+ finished.swap(waiting_for_base_ino[from][cur->ino()]);
+ waiting_for_base_ino[from].erase(cur->ino());
+ }
+ }
+ ceph_assert(cur);
+
+ // loop over discover results.
+ // indexes follow each ([[dir] dentry] inode)
+ // can start, end with any type.
+ while (!p.end()) {
+ // dir
+ frag_t fg;
+ CDir *curdir = 0;
+ if (next == MDiscoverReply::DIR) {
+ curdir = add_replica_dir(p, cur, mds_rank_t(m->get_source().num()), finished);
+ if (cur->ino() == m->get_base_ino() && curdir->get_frag() != m->get_base_dir_frag()) {
+ ceph_assert(m->get_wanted_base_dir());
+ cur->take_dir_waiting(m->get_base_dir_frag(), finished); // got a different frag than asked for: wake waiters on the requested one too
+ }
+ } else {
+ // note: this can only happen our first way around this loop.
+ if (p.end() && m->is_flag_error_dn()) { // NOTE(review): the loop condition just established !p.end() and p has not advanced, so this branch looks unreachable -- confirm
+ fg = cur->pick_dirfrag(m->get_error_dentry());
+ curdir = cur->get_dirfrag(fg);
+ } else
+ curdir = cur->get_dirfrag(m->get_base_dir_frag());
+ }
+
+ if (p.end())
+ break;
+
+ // dentry
+ CDentry *dn = add_replica_dentry(p, curdir, finished); // NOTE(review): curdir comes from get_dirfrag() on the non-DIR path and is not null-checked here -- confirm it cannot be null
+
+ if (p.end())
+ break;
+
+ // inode
+ cur = add_replica_inode(p, dn, finished);
+
+ next = MDiscoverReply::DIR;
+ }
+
+ // dir error?
+ // or dir_auth hint?
+ if (m->is_flag_error_dir() && !cur->is_dir()) {
+ // not a dir.
+ cur->take_waiting(CInode::WAIT_DIR, error);
+ } else if (m->is_flag_error_dir() || m->get_dir_auth_hint() != CDIR_AUTH_UNKNOWN) {
+ mds_rank_t who = m->get_dir_auth_hint();
+ if (who == mds->get_nodeid()) who = -1; // a hint pointing at ourselves is treated as "no hint"
+ if (who >= 0)
+ dout(7) << " dir_auth_hint is " << m->get_dir_auth_hint() << dendl;
+
+
+ if (m->get_wanted_base_dir()) {
+ frag_t fg = m->get_base_dir_frag();
+ CDir *dir = cur->get_dirfrag(fg);
+
+ if (cur->is_waiting_for_dir(fg)) {
+ if (cur->is_auth())
+ cur->take_waiting(CInode::WAIT_DIR, finished);
+ else if (dir || !cur->dirfragtree.is_leaf(fg))
+ cur->take_dir_waiting(fg, finished);
+ else
+ discover_dir_frag(cur, fg, 0, who); // re-ask, following the hint (or base's authority when who < 0)
+ } else
+ dout(7) << " doing nothing, nobody is waiting for dir" << dendl;
+ }
+
+ // try again?
+ if (m->get_error_dentry().length()) {
+ frag_t fg = cur->pick_dirfrag(m->get_error_dentry());
+ CDir *dir = cur->get_dirfrag(fg);
+ // wanted a dentry
+ if (dir && dir->is_waiting_for_dentry(m->get_error_dentry(), m->get_wanted_snapid())) {
+ if (dir->is_auth() || dir->lookup(m->get_error_dentry())) {
+ dir->take_dentry_waiting(m->get_error_dentry(), m->get_wanted_snapid(),
+ m->get_wanted_snapid(), finished);
+ } else {
+ filepath relpath(m->get_error_dentry(), 0);
+ discover_path(dir, m->get_wanted_snapid(), relpath, 0, m->get_wanted_xlocked());
+ }
+ } else
+ dout(7) << " doing nothing, have dir but nobody is waiting on dentry "
+ << m->get_error_dentry() << dendl; // NOTE(review): also reached when dir is null, despite the "have dir" wording
+ }
+ } else if (m->is_flag_error_dn()) {
+ frag_t fg = cur->pick_dirfrag(m->get_error_dentry());
+ CDir *dir = cur->get_dirfrag(fg);
+ if (dir) {
+ if (dir->is_auth()) {
+ dir->take_sub_waiting(finished);
+ } else {
+ dir->take_dentry_waiting(m->get_error_dentry(), m->get_wanted_snapid(),
+ m->get_wanted_snapid(), error);
+ }
+ }
+ }
+
+ // waiters
+ finish_contexts(g_ceph_context, error, -ENOENT); // finish errors directly
+ mds->queue_waiters(finished);
+ }
+
+
+
+// ----------------------------
+// REPLICAS
+
+
+ /**
+  * Append a replica of @p dir for rank @p to onto @p bl: first the dirfrag
+  * id, then the dirfrag's own replica encoding.
+  */
+ void MDCache::replicate_dir(CDir *dir, mds_rank_t to, bufferlist& bl)
+ {
+   dirfrag_t frag_id = dir->dirfrag();
+   encode(frag_id, bl);
+
+   dir->encode_replica(to, bl);
+ }
+
+ /**
+  * Append a replica of @p dn for rank @p to onto @p bl: name, `last`
+  * snapid, then the dentry's replica encoding. The final flag handed to
+  * encode_replica() is true while this MDS has not yet reached STATE_ACTIVE.
+  */
+ void MDCache::replicate_dentry(CDentry *dn, mds_rank_t to, bufferlist& bl)
+ {
+   encode(dn->get_name(), bl);
+   encode(dn->last, bl);
+
+   const bool not_yet_active = mds->get_state() < MDSMap::STATE_ACTIVE;
+   dn->encode_replica(to, bl, not_yet_active);
+ }
+
+ /**
+  * Append a replica of @p in for rank @p to onto @p bl: ino, `last` snapid,
+  * then the inode's replica encoding for the given feature bits. The final
+  * flag handed to encode_replica() is true while this MDS has not yet
+  * reached STATE_ACTIVE.
+  */
+ void MDCache::replicate_inode(CInode *in, mds_rank_t to, bufferlist& bl,
+                               uint64_t features)
+ {
+   // The id is written here rather than by encode_replica(), unlike the
+   // rest of the inode's state.
+   encode(in->inode.ino, bl);
+   encode(in->last, bl);
+
+   const bool not_yet_active = mds->get_state() < MDSMap::STATE_ACTIVE;
+   in->encode_replica(to, bl, features, not_yet_active);
+ }
+
+ /**
+  * Decode one dirfrag replica from @p p and install it under @p diri.
+  *
+  * If the dirfrag is already in cache only its replica state is refreshed.
+  * Otherwise a new non-auth CDir is created (forcing the frag to a leaf in
+  * diri's fragtree if needed), subtree auth is adjusted when the frag is a
+  * delegation boundary, and waiters on that frag are moved to @p finished.
+  *
+  * @param p        decode position; advanced past the dirfrag
+  * @param diri     inode the dirfrag belongs to (ino must match)
+  * @param from     rank the replica came from
+  * @param finished receives dir waiters released by a newly added dirfrag
+  * @return the existing or newly created dirfrag
+  */
+ CDir *MDCache::add_replica_dir(bufferlist::const_iterator& p, CInode *diri, mds_rank_t from,
+                                MDSContext::vec& finished)
+ {
+   dirfrag_t df;
+   decode(df, p);
+
+   ceph_assert(diri->ino() == df.ino);
+
+   // Already have a replica: just refresh it (new nonce).
+   if (CDir *existing = diri->get_dirfrag(df.frag)) {
+     existing->decode_replica(p);
+     dout(7) << "add_replica_dir had " << *existing << " nonce " << existing->replica_nonce << dendl;
+     return existing;
+   }
+
+   // Make sure the frag is a leaf in our copy of the fragtree.
+   if (!diri->dirfragtree.is_leaf(df.frag)) {
+     dout(7) << "add_replica_dir forcing frag " << df.frag << " to leaf in the fragtree "
+             << diri->dirfragtree << dendl;
+     diri->dirfragtree.force_to_leaf(g_ceph_context, df.frag);
+   }
+
+   CDir *fresh = diri->add_dirfrag( new CDir(diri, df.frag, this, false) );
+   fresh->decode_replica(p);
+
+   // A dir_auth delegation boundary gets its own subtree entry.
+   const bool is_boundary =
+     from != diri->authority().first ||
+     diri->is_ambiguous_auth() ||
+     diri->is_base();
+   if (is_boundary) {
+     adjust_subtree_auth(fresh, from);
+   }
+
+   dout(7) << "add_replica_dir added " << *fresh << " nonce " << fresh->replica_nonce << dendl;
+
+   // Release anyone who was waiting for this frag.
+   diri->take_dir_waiting(df.frag, finished);
+
+   return fresh;
+ }
+
+ /**
+  * Decode one dentry replica from @p p and install it in @p dir.
+  *
+  * An existing dentry (matched by name and `last`) is refreshed; otherwise
+  * a null dentry is created first. Waiters on the dentry's [first, last]
+  * range are moved to @p finished.
+  *
+  * @return the existing or newly created dentry
+  */
+ CDentry *MDCache::add_replica_dentry(bufferlist::const_iterator& p, CDir *dir, MDSContext::vec& finished)
+ {
+   string name;
+   snapid_t last;
+   decode(name, p);
+   decode(last, p);
+
+   CDentry *dn = dir->lookup(name, last);
+   const bool is_new = !dn;
+
+   if (is_new) {
+     // `first` is a placeholder; decode_replica() below overwrites it.
+     dn = dir->add_null_dentry(name, 1 /* this will get updated below */, last);
+   }
+   dn->decode_replica(p, is_new);
+
+   if (is_new) {
+     dout(7) << "add_replica_dentry added " << *dn << dendl;
+   } else {
+     dout(7) << "add_replica_dentry had " << *dn << dendl;
+   }
+
+   dir->take_dentry_waiting(name, dn->first, dn->last, finished);
+
+   return dn;
+ }
+
+ /**
+  * Decode one inode replica from @p p and install it in the cache.
+  *
+  * An inode already in cache (matched by ino and `last`) is refreshed.
+  * Otherwise a new non-auth CInode is created, registered, given a fixed
+  * inode_auth for the root / mdsdir inodes, and -- when @p dn is given --
+  * linked as the primary inode of that (null) dentry.
+  *
+  * @param p        decode position; advanced past the inode
+  * @param dn       dentry the inode hangs off, or NULL
+  * @param finished not touched by this function
+  * @return the existing or newly created inode
+  */
+ CInode *MDCache::add_replica_inode(bufferlist::const_iterator& p, CDentry *dn, MDSContext::vec& finished)
+ {
+   inodeno_t ino;
+   snapid_t last;
+   decode(ino, p);
+   decode(last, p);
+
+   CInode *in = get_inode(ino, last);
+   if (in) {
+     in->decode_replica(p, false);
+     dout(10) << "add_replica_inode had " << *in << dendl;
+   } else {
+     in = new CInode(this, false, 1, last);
+     in->decode_replica(p, true);
+     add_inode(in);
+
+     if (in->ino() == MDS_INO_ROOT) {
+       in->inode_auth.first = 0;
+     } else if (in->is_mdsdir()) {
+       in->inode_auth.first = in->ino() - MDS_INO_MDSDIR_OFFSET;
+     }
+     dout(10) << "add_replica_inode added " << *in << dendl;
+
+     if (dn) {
+       ceph_assert(dn->get_linkage()->is_null());
+       dn->dir->link_primary_inode(dn, in);
+     }
+   }
+
+   // Only reported, not corrected: the dentry points somewhere else.
+   if (dn &&
+       (!dn->get_linkage()->is_primary() || dn->get_linkage()->get_inode() != in)) {
+     dout(10) << "add_replica_inode different linkage in dentry " << *dn << dendl;
+   }
+
+   return in;
+ }
+
+
+ /**
+  * Encode into @p bl, for rank @p who, the chain leading to a stray
+  * dentry: my inode, the dirfrag and dentry holding the stray directory,
+  * the stray directory inode, its dirfrag, and finally @p straydn.
+  * add_replica_stray() decodes the same six items in the same order.
+  */
+ void MDCache::replicate_stray(CDentry *straydn, mds_rank_t who, bufferlist& bl)
+ {
+   uint64_t features = mds->mdsmap->get_up_features();
+
+   // Walk up from the stray dentry once instead of re-deriving each level.
+   auto *straydir = straydn->get_dir();
+   auto *strayin = straydir->inode;
+   auto *straydirdn = strayin->get_parent_dn();
+   auto *mdsdir = straydirdn->get_dir();
+
+   replicate_inode(get_myin(), who, bl, features);
+   replicate_dir(mdsdir, who, bl);
+   replicate_dentry(straydirdn, who, bl);
+   replicate_inode(strayin, who, bl, features);
+   replicate_dir(straydir, who, bl);
+   replicate_dentry(straydn, who, bl);
+ }
+
+ /**
+  * Decode the six-item stray chain produced by replicate_stray() from
+  * @p bl (inode, dir, dentry, inode, dir, dentry), installing each replica
+  * in turn, and queue any waiters that were released along the way.
+  *
+  * @param bl   encoded chain
+  * @param from rank the chain came from
+  * @return the stray dentry at the end of the chain
+  */
+ CDentry *MDCache::add_replica_stray(const bufferlist &bl, mds_rank_t from)
+ {
+   MDSContext::vec released;
+   auto pos = bl.cbegin();
+
+   // Each step hangs off the object decoded by the previous one.
+   CInode *top_in = add_replica_inode(pos, NULL, released);
+   CDir *top_dir = add_replica_dir(pos, top_in, from, released);
+   CDentry *straydir_dn = add_replica_dentry(pos, top_dir, released);
+   CInode *straydir_in = add_replica_inode(pos, straydir_dn, released);
+   CDir *straydir = add_replica_dir(pos, straydir_in, from, released);
+   CDentry *result = add_replica_dentry(pos, straydir, released);
+
+   if (!released.empty()) {
+     mds->queue_waiters(released);
+   }
+
+   return result;
+ }
+
+
+ /**
+  * Send an MDirUpdate describing @p dir's replication settings (dir_rep
+  * and dir_rep_by) to other MDS ranks.
+  *
+  * @param dir   dirfrag being advertised
+  * @param bcast true: send to every active rank in the MDS map;
+  *              false: send only to ranks that hold a replica of @p dir
+  * @return always 0
+  */
+ int MDCache::send_dir_updates(CDir *dir, bool bcast)
+ {
+   // this is an FYI, re: replication
+
+   set<mds_rank_t> who;
+   if (bcast) {
+     mds->get_mds_map()->get_active_mds_set(who);
+   } else {
+     for (const auto &p : dir->get_replicas()) {
+       who.insert(p.first);
+     }
+   }
+
+   dout(7) << "sending dir_update on " << *dir << " bcast " << bcast << " to " << who << dendl;
+
+   filepath path;
+   dir->inode->make_path(path);
+
+   // The message payload is identical for every recipient, so convert
+   // dir_rep_by once up front instead of once per target rank.
+   std::set<int32_t> rep_by;
+   for (const auto &r : dir->dir_rep_by) {
+     rep_by.insert(r);
+   }
+
+   mds_rank_t whoami = mds->get_nodeid();
+   for (const auto &target : who) {
+     if (target == whoami) continue;  // never message ourselves
+     //if (target == except) continue;
+     dout(7) << "sending dir_update on " << *dir << " to " << target << dendl;
+
+     mds->send_message_mds(MDirUpdate::create(mds->get_nodeid(), dir->dirfrag(), dir->dir_rep, rep_by, path, bcast), target);
+   }
+
+   return 0;
+ }
+
+ /**
+  * Apply an MDirUpdate from another MDS.
+  *
+  * If the dirfrag is cached, its dir_rep / dir_rep_by are overwritten from
+  * the message -- unless this message already went through a discover
+  * attempt. If it is not cached and the message asks for it, one discover
+  * attempt is made along the message's path; the message is retried once
+  * the traversal or the remote dirfrag open completes.
+  */
+ void MDCache::handle_dir_update(const MDirUpdate::const_ref &m)
+ {
+   dirfrag_t df = m->get_dirfrag();
+
+   if (CDir *dir = get_dirfrag(df)) {
+     if (!m->has_tried_discover()) {
+       // Update if it already exists. Otherwise it got updated by discover reply.
+       dout(5) << "dir_update on " << *dir << dendl;
+       dir->dir_rep = m->get_dir_rep();
+       dir->dir_rep_by.clear();
+       for (const auto &rank : m->get_dir_rep_by()) {
+         dir->dir_rep_by.insert(rank);
+       }
+     }
+     return;
+   }
+
+   dout(5) << "dir_update on " << df << ", don't have it" << dendl;
+
+   // discover it?
+   if (!m->should_discover()) {
+     return;
+   }
+
+   // only try once!
+   // this is key to avoid a fragtree update race, among other things.
+   m->inc_tried_discover();
+
+   vector<CDentry*> trace;
+   CInode *in;
+   filepath path = m->get_path();
+   dout(5) << "trying discover on dir_update for " << path << dendl;
+   CF_MDS_RetryMessageFactory cf(mds, m);
+   MDRequestRef null_ref;
+   int r = path_traverse(null_ref, cf, path, &trace, &in, MDS_TRAVERSE_DISCOVER);
+
+   // r > 0: returns at once, as the original did; r < 0: nothing to do.
+   if (r != 0) {
+     return;
+   }
+
+   if (in->ino() == df.ino &&
+       in->get_approx_dirfrag(df.frag) == NULL) {
+     open_remote_dirfrag(in, df.frag, new C_MDS_RetryMessage(mds, m));
+   }
+ }
+
+
+
+
+
+// LINK
+
+ /**
+  * Tell the replicas of @p dn what the dentry is now linked to.
+  *
+  * Skipped recipients: ranks recorded as witnesses of @p mdr, ranks whose
+  * map state is before REJOIN, and ranks in REJOIN that are still in
+  * rejoin_gather. A primary link carries a replica of the inode; a remote
+  * link carries the ino and d_type. A null linkage aborts.
+  */
+ void MDCache::send_dentry_link(CDentry *dn, MDRequestRef& mdr)
+ {
+   dout(7) << "send_dentry_link " << *dn << dendl;
+
+   CDir *subtree = get_subtree_root(dn->get_dir());
+   for (const auto &replica : dn->get_replicas()) {
+     const auto peer = replica.first;
+
+     // don't tell (rename) witnesses; they already know
+     if (mdr.get() && mdr->more()->witnessed.count(peer)) {
+       continue;
+     }
+
+     const auto peer_state = mds->mdsmap->get_state(peer);
+     if (peer_state < MDSMap::STATE_REJOIN) {
+       continue;
+     }
+     if (peer_state == MDSMap::STATE_REJOIN && rejoin_gather.count(peer)) {
+       continue;
+     }
+
+     CDentry::linkage_t *dnl = dn->get_linkage();
+     auto m = MDentryLink::create(subtree->dirfrag(), dn->get_dir()->dirfrag(), dn->get_name(), dnl->is_primary());
+     if (dnl->is_primary()) {
+       dout(10) << " primary " << *dnl->get_inode() << dendl;
+       replicate_inode(dnl->get_inode(), peer, m->bl,
+                       mds->mdsmap->get_up_features());
+     } else if (dnl->is_remote()) {
+       inodeno_t ino = dnl->get_remote_ino();
+       __u8 d_type = dnl->get_remote_d_type();
+       dout(10) << " remote " << ino << " " << d_type << dendl;
+       encode(ino, m->bl);
+       encode(d_type, m->bl);
+     } else {
+       ceph_abort(); // aie, bad caller!
+     }
+     mds->send_message_mds(m, peer);
+   }
+ }
+
+ /**
+  * Apply an MDentryLink from the auth MDS: link the (currently null,
+  * non-auth) replica dentry either to a freshly decoded primary inode or
+  * to a remote ino/d_type. Aborts if the dirfrag or dentry is not cached.
+  */
+ void MDCache::handle_dentry_link(const MDentryLink::const_ref &m)
+ {
+   CDir *dir = get_dirfrag(m->get_dirfrag());
+   CDentry *dn = NULL;
+
+   if (dir) {
+     dn = dir->lookup(m->get_dn());
+     if (dn) {
+       dout(7) << "handle_dentry_link on " << *dn << dendl;
+       CDentry::linkage_t *dnl = dn->get_linkage();
+
+       ceph_assert(!dn->is_auth());
+       ceph_assert(dnl->is_null());
+     } else {
+       dout(7) << "handle_dentry_link don't have dentry " << *dir << " dn " << m->get_dn() << dendl;
+     }
+   } else {
+     dout(7) << "handle_dentry_link don't have dirfrag " << m->get_dirfrag() << dendl;
+   }
+
+   // Nothing to link into: treated as fatal.
+   if (!dn) {
+     ceph_abort();
+   }
+
+   auto p = m->bl.cbegin();
+   MDSContext::vec finished;
+   if (m->get_is_primary()) {
+     // primary link.
+     add_replica_inode(p, dn, finished);
+   } else {
+     // remote link, easy enough.
+     inodeno_t ino;
+     __u8 d_type;
+     decode(ino, p);
+     decode(d_type, p);
+     dir->link_remote_inode(dn, ino, d_type);
+   }
+
+   if (!finished.empty()) {
+     mds->queue_waiters(finished);
+   }
+ }
+
+
+// UNLINK
+
+ /**
+  * Tell replicas that @p dn has been unlinked.
+  *
+  * Recipients are the replicas of @p dn plus, when @p straydn is given,
+  * the replicas of the stray dentry. Skipped: witnesses of @p mdr, ranks
+  * whose map state is before REJOIN, and ranks in REJOIN still listed in
+  * rejoin_gather. With a stray dentry each message also carries the
+  * encoded stray chain and the stray inode's snap blob.
+  */
+ void MDCache::send_dentry_unlink(CDentry *dn, CDentry *straydn, MDRequestRef& mdr)
+ {
+   dout(10) << "send_dentry_unlink " << *dn << dendl;
+
+   // share unlink news with replicas
+   set<mds_rank_t> targets;
+   dn->list_replicas(targets);
+
+   bufferlist snapbl;
+   if (straydn) {
+     straydn->list_replicas(targets);
+     CInode *strayin = straydn->get_linkage()->get_inode();
+     strayin->encode_snap_blob(snapbl);
+   }
+
+   for (const auto &peer : targets) {
+     // don't tell (rmdir) witnesses; they already know
+     if (mdr.get() && mdr->more()->witnessed.count(peer)) {
+       continue;
+     }
+
+     const auto peer_state = mds->mdsmap->get_state(peer);
+     if (peer_state < MDSMap::STATE_REJOIN) {
+       continue;
+     }
+     if (peer_state == MDSMap::STATE_REJOIN && rejoin_gather.count(peer)) {
+       continue;
+     }
+
+     auto unlink = MDentryUnlink::create(dn->get_dir()->dirfrag(), dn->get_name());
+     if (straydn) {
+       replicate_stray(straydn, peer, unlink->straybl);
+       unlink->snapbl = snapbl;
+     }
+     mds->send_message_mds(unlink, peer);
+   }
+ }
+
+ void MDCache::handle_dentry_unlink(const MDentryUnlink::const_ref &m)
+ { /* Apply an MDentryUnlink from another MDS to our replica.
+    *
+    * If the message carries a stray chain it is decoded first. The named
+    * dentry, if cached, is then unlinked: a primary inode is re-linked
+    * under the stray dentry (and its subtree / snaprealm state updated),
+    * a remote link is simply dropped. A stray dentry that was decoded but
+    * ended up unused is trimmed again at the end.
+    */
+ // straydn
+ CDentry *straydn = NULL;
+ if (m->straybl.length())
+ straydn = add_replica_stray(m->straybl, mds_rank_t(m->get_source().num()));
+
+ CDir *dir = get_dirfrag(m->get_dirfrag());
+ if (!dir) {
+ dout(7) << "handle_dentry_unlink don't have dirfrag " << m->get_dirfrag() << dendl;
+ } else {
+ CDentry *dn = dir->lookup(m->get_dn());
+ if (!dn) {
+ dout(7) << "handle_dentry_unlink don't have dentry " << *dir << " dn " << m->get_dn() << dendl;
+ } else {
+ dout(7) << "handle_dentry_unlink on " << *dn << dendl;
+ CDentry::linkage_t *dnl = dn->get_linkage();
+
+ // open inode?
+ if (dnl->is_primary()) {
+ CInode *in = dnl->get_inode();
+ dn->dir->unlink_inode(dn);
+ ceph_assert(straydn); // a primary unlink must have come with a stray dentry
+ straydn->dir->link_primary_inode(straydn, in);
+
+ // in->first is lazily updated on replica; drag it forward so
+ // that we always keep it in sync with the dn
+ ceph_assert(straydn->first >= in->first);
+ in->first = straydn->first;
+
+ // update subtree map?
+ if (in->is_dir())
+ adjust_subtree_after_rename(in, dir, false);
+
+ if (m->snapbl.length()) {
+ bool hadrealm = (in->snaprealm ? true : false);
+ in->decode_snap_blob(m->snapbl);
+ ceph_assert(in->snaprealm);
+ ceph_assert(in->snaprealm->have_past_parents_open());
+ if (!hadrealm) // the realm was created by the blob just decoded
+ do_realm_invalidate_and_update_notify(in, CEPH_SNAP_OP_SPLIT, false);
+ }
+
+ // send caps to auth (if we're not already)
+ if (in->is_any_caps() &&
+ !in->state_test(CInode::STATE_EXPORTINGCAPS))
+ migrator->export_caps(in);
+
+ straydn = NULL; // now in use: must not be trimmed below
+ } else {
+ ceph_assert(!straydn);
+ ceph_assert(dnl->is_remote());
+ dn->dir->unlink_inode(dn);
+ }
+ ceph_assert(dnl->is_null());
+ }
+ }
+
+ // race with trim_dentry()
+ if (straydn) { // stray chain was decoded but the dirfrag/dentry was not in cache
+ ceph_assert(straydn->get_num_ref() == 0);
+ ceph_assert(straydn->get_linkage()->is_null());
+ expiremap ex;
+ trim_dentry(straydn, ex);
+ send_expire_messages(ex);
+ }
+ }
+
+
+
+
+
+
+// ===================================================================
+
+
+
+// ===================================================================
+// FRAGMENT
+
+
+/**
+ * adjust_dir_fragments -- adjust fragmentation for a directory
+ *
+ * @param diri directory inode
+ * @param basefrag base fragment
+ * @param bits bit adjustment. positive for split, negative for merge.
+ */
+void MDCache::adjust_dir_fragments(CInode *diri, frag_t basefrag, int bits,
+ list<CDir*>& resultfrags,
+ MDSContext::vec& waiters,
+ bool replay)
+{
+ dout(10) << "adjust_dir_fragments " << basefrag << " " << bits
+ << " on " << *diri << dendl;
+
+ list<CDir*> srcfrags;
+ diri->get_dirfrags_under(basefrag, srcfrags);
+
+ adjust_dir_fragments(diri, srcfrags, basefrag, bits, resultfrags, waiters, replay);
+}
+
+ CDir *MDCache::force_dir_fragment(CInode *diri, frag_t fg, bool replay)
+ { /* Make the open dirfrags of diri line up with frag fg and return that
+    * dirfrag.
+    *
+    * Returns the dirfrag directly if it is already open. Otherwise walks
+    * from fg's branch-or-leaf in the fragtree up towards the root frag,
+    * splitting the first open ancestor found; failing that, merges
+    * whatever dirfrags are open underneath fg. Returns NULL when there is
+    * nothing to split or merge. Released waiters are queued unless replay
+    * is set.
+    */
+ CDir *dir = diri->get_dirfrag(fg);
+ if (dir)
+ return dir;
+
+ dout(10) << "force_dir_fragment " << fg << " on " << *diri << dendl;
+
+ list<CDir*> src, result; // both lists are shared by every adjust_dir_fragments() call below
+ MDSContext::vec waiters;
+
+ // split a parent?
+ frag_t parent = diri->dirfragtree.get_branch_or_leaf(fg);
+ while (1) {
+ CDir *pdir = diri->get_dirfrag(parent);
+ if (pdir) {
+ int split = fg.bits() - parent.bits(); // extra bits needed to get from parent down to fg
+ dout(10) << " splitting parent by " << split << " " << *pdir << dendl;
+ src.push_back(pdir);
+ adjust_dir_fragments(diri, src, parent, split, result, waiters, replay);
+ dir = diri->get_dirfrag(fg);
+ if (dir) {
+ dout(10) << "force_dir_fragment result " << *dir << dendl;
+ break;
+ }
+ }
+ if (parent == frag_t()) // reached the root frag: no ancestors left
+ break;
+ frag_t last = parent;
+ parent = parent.parent();
+ dout(10) << " " << last << " parent is " << parent << dendl;
+ }
+
+ if (!dir) {
+ // hoover up things under fg?
+ diri->get_dirfrags_under(fg, src);
+ if (src.empty()) {
+ dout(10) << "force_dir_fragment no frags under " << fg << dendl;
+ } else {
+ dout(10) << " will combine frags under " << fg << ": " << src << dendl;
+ adjust_dir_fragments(diri, src, fg, 0, result, waiters, replay); // bits == 0 takes the merge path
+ dir = result.front();
+ dout(10) << "force_dir_fragment result " << *dir << dendl;
+ }
+ }
+ if (!replay)
+ mds->queue_waiters(waiters);
+ return dir;
+ }
+
+ void MDCache::adjust_dir_fragments(CInode *diri,
+ list<CDir*>& srcfrags,
+ frag_t basefrag, int bits,
+ list<CDir*>& resultfrags,
+ MDSContext::vec& waiters,
+ bool replay)
+ { /* Split or merge the given open dirfrags of diri and keep the subtree
+    * map consistent.
+    *
+    * The fragtree is adjusted first (basefrag forced to a leaf, then split
+    * when bits > 0). With no source dirfrags that is all that happens.
+    *
+    * bits > 0 : srcfrags must hold exactly one dirfrag; it is split into
+    *            resultfrags and then closed.
+    * bits <= 0: all srcfrags are merged into one new dirfrag at basefrag,
+    *            which is appended to resultfrags.
+    *
+    * waiters and replay are handed to CDir::split() / CDir::merge().
+    */
+ dout(10) << "adjust_dir_fragments " << basefrag << " bits " << bits
+ << " srcfrags " << srcfrags
+ << " on " << *diri << dendl;
+
+ // adjust fragtree
+ // yuck. we may have discovered the inode while it was being fragmented.
+ if (!diri->dirfragtree.is_leaf(basefrag))
+ diri->dirfragtree.force_to_leaf(g_ceph_context, basefrag);
+
+ if (bits > 0)
+ diri->dirfragtree.split(basefrag, bits);
+ dout(10) << " new fragtree is " << diri->dirfragtree << dendl;
+
+ if (srcfrags.empty())
+ return;
+
+ // split
+ CDir *parent_dir = diri->get_parent_dir();
+ CDir *parent_subtree = 0; // subtree root containing diri's parent dirfrag, if diri has one
+ if (parent_dir)
+ parent_subtree = get_subtree_root(parent_dir);
+
+ if (bits > 0) {
+ // SPLIT
+ ceph_assert(srcfrags.size() == 1);
+ CDir *dir = srcfrags.front();
+
+ dir->split(bits, resultfrags, waiters, replay);
+
+ // did i change the subtree map?
+ if (dir->is_subtree_root()) {
+ // new frags are now separate subtrees
+ for (list<CDir*>::iterator p = resultfrags.begin();
+ p != resultfrags.end();
+ ++p)
+ subtrees[*p].clear(); // new frag is now its own subtree
+
+ // was i a bound?
+ if (parent_subtree) {
+ ceph_assert(subtrees[parent_subtree].count(dir));
+ subtrees[parent_subtree].erase(dir);
+ for (list<CDir*>::iterator p = resultfrags.begin();
+ p != resultfrags.end();
+ ++p) {
+ ceph_assert((*p)->is_subtree_root());
+ subtrees[parent_subtree].insert(*p);
+ }
+ }
+
+ // adjust my bounds.
+ set<CDir*> bounds;
+ bounds.swap(subtrees[dir]);
+ subtrees.erase(dir);
+ for (set<CDir*>::iterator p = bounds.begin();
+ p != bounds.end();
+ ++p) {
+ CDir *frag = get_subtree_root((*p)->get_parent_dir()); // re-home each old bound under whichever new frag now contains it
+ subtrees[frag].insert(*p);
+ }
+
+ show_subtrees(10);
+ }
+
+ diri->close_dirfrag(dir->get_frag());
+
+ } else {
+ // MERGE
+
+ // are my constituent bits subtrees? if so, i will be too.
+ // (it's all or none, actually.)
+ bool any_subtree = false, any_non_subtree = false;
+ for (CDir *dir : srcfrags) {
+ if (dir->is_subtree_root())
+ any_subtree = true;
+ else
+ any_non_subtree = true;
+ }
+ ceph_assert(!any_subtree || !any_non_subtree);
+
+ set<CDir*> new_bounds; // union of the source subtrees' bounds, handed to the merged frag below
+ if (any_subtree) {
+ for (CDir *dir : srcfrags) {
+ // this simplifies the code that find subtrees underneath the dirfrag
+ if (!dir->is_subtree_root()) { // NOTE(review): given the all-or-none assert above this looks unreachable when asserts are enabled -- confirm
+ dir->state_set(CDir::STATE_AUXSUBTREE);
+ adjust_subtree_auth(dir, mds->get_nodeid());
+ }
+ }
+
+ for (CDir *dir : srcfrags) {
+ ceph_assert(dir->is_subtree_root());
+ dout(10) << " taking srcfrag subtree bounds from " << *dir << dendl;
+ map<CDir*, set<CDir*> >::iterator q = subtrees.find(dir);
+ set<CDir*>::iterator r = q->second.begin();
+ while (r != subtrees[dir].end()) {
+ new_bounds.insert(*r);
+ subtrees[dir].erase(r++);
+ }
+ subtrees.erase(q);
+
+ // remove myself as my parent's bound
+ if (parent_subtree)
+ subtrees[parent_subtree].erase(dir);
+ }
+ }
+
+ // merge
+ CDir *f = new CDir(diri, basefrag, this, srcfrags.front()->is_auth()); // merged frag takes its auth flag from the first source frag
+ f->merge(srcfrags, waiters, replay);
+
+ if (any_subtree) {
+ ceph_assert(f->is_subtree_root());
+ subtrees[f].swap(new_bounds);
+ if (parent_subtree)
+ subtrees[parent_subtree].insert(f);
+
+ show_subtrees(10);
+ }
+
+ resultfrags.push_back(f);
+ }
+ }
+
+
+// Completion that hands the result code of a finished wait to
+// MDCache::fragment_frozen() for the captured request.  It is installed as
+// the finisher of the WAIT_FROZEN gather in fragment_mark_and_complete().
+class C_MDC_FragmentFrozen : public MDSInternalContext {
+  MDCache *mdcache;
+  MDRequestRef mdr;
+
+public:
+  C_MDC_FragmentFrozen(MDCache *cache, MDRequestRef& req)
+    : MDSInternalContext(cache->mds),
+      mdcache(cache),
+      mdr(req)
+  {}
+
+  void finish(int result) override
+  {
+    mdcache->fragment_frozen(mdr, result);
+  }
+};
+
+// Decide whether the dirfrags in `dirs` (children of `diri`) may be split or
+// merged right now.  Returns false -- after logging the reason at debug
+// level 7 -- as soon as one blocking condition is found, true otherwise.
+bool MDCache::can_fragment(CInode *diri, list<CDir*>& dirs)
+{
+  // --- conditions that do not depend on the individual dirfrags ---
+  if (is_readonly()) {
+    dout(7) << "can_fragment: read-only FS, no fragmenting for now" << dendl;
+    return false;
+  }
+  if (mds->is_cluster_degraded()) {
+    dout(7) << "can_fragment: cluster degraded, no fragmenting for now" << dendl;
+    return false;
+  }
+
+  CDir *parent = diri->get_parent_dir();
+  if (parent && parent->get_inode()->is_stray()) {
+    dout(7) << "can_fragment: i won't merge|split anything in stray" << dendl;
+    return false;
+  }
+
+  const bool special_dir =
+    diri->is_mdsdir() || diri->is_stray() || diri->ino() == MDS_INO_CEPH;
+  if (special_dir) {
+    dout(7) << "can_fragment: i won't fragment the mdsdir or straydir or .ceph" << dendl;
+    return false;
+  }
+
+  if (diri->scrub_is_in_progress()) {
+    dout(7) << "can_fragment: scrub in progress" << dendl;
+    return false;
+  }
+
+  // --- per-dirfrag conditions ---
+  for (CDir *dir : dirs) {
+    if (dir->state_test(CDir::STATE_FRAGMENTING)) {
+      dout(7) << "can_fragment: already fragmenting " << *dir << dendl;
+      return false;
+    }
+    if (!dir->is_auth()) {
+      dout(7) << "can_fragment: not auth on " << *dir << dendl;
+      return false;
+    }
+    if (dir->is_bad()) {
+      dout(7) << "can_fragment: bad dirfrag " << *dir << dendl;
+      return false;
+    }
+    if (dir->is_frozen() || dir->is_freezing()) {
+      dout(7) << "can_fragment: can't merge, freezing|frozen. wait for other exports to finish first." << dendl;
+      return false;
+    }
+  }
+
+  return true;
+}
+
+// Begin splitting `dir` by `bits` additional bits.  The caller must be auth
+// for `dir`.  The attempt is dropped (with a debug message) when
+// can_fragment() refuses or when the resulting frag would use more than 24
+// bits.  Otherwise an internal FRAGMENTDIR request is started, the operation
+// is registered in `fragments`, and freezing/marking is kicked off.
+void MDCache::split_dir(CDir *dir, int bits)
+{
+  dout(7) << __func__ << " " << *dir << " bits " << bits << dendl;
+  ceph_assert(dir->is_auth());
+
+  list<CDir*> dirs;
+  dirs.push_back(dir);
+
+  CInode *diri = dir->inode;
+  const bool allowed = can_fragment(diri, dirs);
+  if (!allowed) {
+    dout(7) << __func__ << " cannot fragment right now, dropping" << dendl;
+    return;
+  }
+
+  const bool too_deep = dir->frag.bits() + bits > 24;
+  if (too_deep) {
+    dout(7) << __func__ << " frag bits > 24, dropping" << dendl;
+    return;
+  }
+
+  MDRequestRef mdr = request_start_internal(CEPH_MDS_OP_FRAGMENTDIR);
+  mdr->more()->fragment_base = dir->dirfrag();
+
+  // register the pending operation, keyed by the dirfrag being split
+  ceph_assert(fragments.count(dir->dirfrag()) == 0);
+  fragment_info_t& pending = fragments[dir->dirfrag()];
+  pending.mdr = mdr;
+  pending.dirs.push_back(dir);
+  pending.bits = bits;
+  pending.last_cum_auth_pins_change = ceph_clock_now();
+
+  fragment_freeze_dirs(dirs);
+  // initial mark+complete pass
+  fragment_mark_and_complete(mdr);
+}
+
+// Begin merging all dirfrags of `diri` that lie under `frag` into a single
+// dirfrag `frag`.  Nothing happens when not all of those dirfrags are
+// available, when `frag` is already a leaf of the fragtree, or when
+// can_fragment() refuses.  The operation is registered in `fragments` with a
+// negative bit count.
+void MDCache::merge_dir(CInode *diri, frag_t frag)
+{
+  dout(7) << "merge_dir to " << frag << " on " << *diri << dendl;
+
+  list<CDir*> dirs;
+  const bool have_all = diri->get_dirfrags_under(frag, dirs);
+  if (!have_all) {
+    dout(7) << "don't have all frags under " << frag << " for " << *diri << dendl;
+    return;
+  }
+
+  const bool already_leaf = diri->dirfragtree.is_leaf(frag);
+  if (already_leaf) {
+    dout(10) << " " << frag << " already a leaf for " << *diri << dendl;
+    return;
+  }
+
+  if (!can_fragment(diri, dirs))
+    return;
+
+  // how many bits deeper than the merge target the first source frag is
+  int bits = dirs.front()->get_frag().bits() - frag.bits();
+  dout(10) << " we are merging by " << bits << " bits" << dendl;
+
+  dirfrag_t basedirfrag(diri->ino(), frag);
+  MDRequestRef mdr = request_start_internal(CEPH_MDS_OP_FRAGMENTDIR);
+  mdr->more()->fragment_base = basedirfrag;
+
+  ceph_assert(fragments.count(basedirfrag) == 0);
+  fragment_info_t& pending = fragments[basedirfrag];
+  pending.mdr = mdr;
+  pending.dirs = dirs;
+  pending.bits = -bits;   // negative bit count marks a merge
+  pending.last_cum_auth_pins_change = ceph_clock_now();
+
+  fragment_freeze_dirs(dirs);
+  // initial mark+complete pass
+  fragment_mark_and_complete(mdr);
+}
+
+// Auth-pin, flag (STATE_FRAGMENTING) and start freezing every dirfrag in
+// `dirs`.  If the set mixes subtree roots with non-roots, the non-roots are
+// turned into auxiliary subtrees owned by this MDS so that afterwards all
+// of them are subtree roots.
+void MDCache::fragment_freeze_dirs(list<CDir*>& dirs)
+{
+  bool saw_root = false;
+  bool saw_non_root = false;
+
+  for (CDir *dir : dirs) {
+    dir->auth_pin(dir);  // until we mark and complete them
+    dir->state_set(CDir::STATE_FRAGMENTING);
+    dir->freeze_dir();
+    ceph_assert(dir->is_freezing_dir());
+
+    if (dir->is_subtree_root()) {
+      saw_root = true;
+    } else {
+      saw_non_root = true;
+    }
+  }
+
+  // homogeneous set: nothing to fix up
+  if (!(saw_root && saw_non_root))
+    return;
+
+  // either all dirfrags are subtree roots or all are not.
+  for (CDir *dir : dirs) {
+    if (!dir->is_subtree_root()) {
+      dir->state_set(CDir::STATE_AUXSUBTREE);
+      adjust_subtree_auth(dir, mds->get_nodeid());
+      continue;
+    }
+    ceph_assert(dir->state_test(CDir::STATE_AUXSUBTREE));
+  }
+}
+
+// Completion that re-runs MDCache::fragment_mark_and_complete() for the
+// captured request; the result code is ignored.
+class C_MDC_FragmentMarking : public MDCacheContext {
+  MDRequestRef mdr;
+
+public:
+  C_MDC_FragmentMarking(MDCache *cache, MDRequestRef& req)
+    : MDCacheContext(cache),
+      mdr(req)
+  {}
+
+  void finish(int /*result*/) override
+  {
+    mdcache->fragment_mark_and_complete(mdr);
+  }
+};
+
+void MDCache::fragment_mark_and_complete(MDRequestRef& mdr)
+{ /*
+ * Advance the pending fragment operation identified by mdr's fragment_base.
+ *
+ * Pass 1, for each source dirfrag: fetch it if it is incomplete; for the
+ * root frag (frag_t()) wait for a STATE_CREATING dir or commit a new one;
+ * otherwise pin every dentry with PIN_FRAGMENTING, flag it
+ * STATE_FRAGMENTING, set STATE_DNPINNEDFRAG on the dirfrag and drop the
+ * dirfrag's own auth_pin.  If any asynchronous work was started, this
+ * function is re-run through C_MDC_FragmentMarking when it completes.
+ *
+ * Pass 2: register a WAIT_FROZEN waiter on every dirfrag that is not yet
+ * frozen and continue in fragment_frozen() via C_MDC_FragmentFrozen; if all
+ * dirfrags are already frozen, fragment_frozen() is called directly.
+ *
+ * If the operation is no longer registered for this mdr, the request is
+ * simply finished.
+ */
+ dirfrag_t basedirfrag = mdr->more()->fragment_base;
+ map<dirfrag_t,fragment_info_t>::iterator it = fragments.find(basedirfrag);
+ if (it == fragments.end() || it->second.mdr != mdr) {
+ dout(7) << "fragment_mark_and_complete " << basedirfrag << " must have aborted" << dendl;
+ request_finish(mdr);
+ return;
+ }
+
+ fragment_info_t& info = it->second;
+ CInode *diri = info.dirs.front()->get_inode();
+ dout(10) << "fragment_mark_and_complete " << info.dirs << " on " << *diri << dendl;
+
+ MDSGatherBuilder gather(g_ceph_context);
+
+ for (list<CDir*>::iterator p = info.dirs.begin();
+ p != info.dirs.end();
+ ++p) {
+ CDir *dir = *p;
+
+ bool ready = true; // cleared when this dirfrag needs async work before it can be marked
+ if (!dir->is_complete()) {
+ dout(15) << " fetching incomplete " << *dir << dendl;
+ dir->fetch(gather.new_sub(), true); // ignore authpinnability
+ ready = false;
+ } else if (dir->get_frag() == frag_t()) {
+ // The COMPLETE flag gets lost if we fragment a new dirfrag, then rollback
+ // the operation. To avoid CDir::fetch() complaining about missing object,
+ // we commit new dirfrag first.
+ if (dir->state_test(CDir::STATE_CREATING)) {
+ dout(15) << " waiting until new dir gets journaled " << *dir << dendl;
+ dir->add_waiter(CDir::WAIT_CREATED, gather.new_sub());
+ ready = false;
+ } else if (dir->is_new()) {
+ dout(15) << " committing new " << *dir << dendl;
+ ceph_assert(dir->is_dirty());
+ dir->commit(0, gather.new_sub(), true);
+ ready = false;
+ }
+ }
+ if (!ready)
+ continue;
+
+ if (!dir->state_test(CDir::STATE_DNPINNEDFRAG)) {
+ dout(15) << " marking " << *dir << dendl;
+ for (auto &p : dir->items) {
+ CDentry *dn = p.second;
+ dn->get(CDentry::PIN_FRAGMENTING);
+ ceph_assert(!dn->state_test(CDentry::STATE_FRAGMENTING));
+ dn->state_set(CDentry::STATE_FRAGMENTING);
+ }
+ dir->state_set(CDir::STATE_DNPINNEDFRAG);
+ dir->auth_unpin(dir); // releases the self auth_pin taken in fragment_freeze_dirs()
+ } else {
+ dout(15) << " already marked " << *dir << dendl;
+ }
+ }
+ if (gather.has_subs()) { // pass 1 still has outstanding work: come back later
+ gather.set_finisher(new C_MDC_FragmentMarking(this, mdr));
+ gather.activate();
+ return;
+ }
+
+ for (list<CDir*>::iterator p = info.dirs.begin();
+ p != info.dirs.end();
+ ++p) {
+ CDir *dir = *p;
+ if (!dir->is_frozen_dir()) {
+ ceph_assert(dir->is_freezing_dir());
+ dir->add_waiter(CDir::WAIT_FROZEN, gather.new_sub());
+ }
+ }
+ if (gather.has_subs()) {
+ gather.set_finisher(new C_MDC_FragmentFrozen(this, mdr));
+ gather.activate();
+ // flush log so that request auth_pins are retired
+ mds->mdlog->flush();
+ return;
+ }
+
+ fragment_frozen(mdr, 0);
+}
+
+// Undo the fragmenting preparation on every dirfrag in `dirs`: clear
+// STATE_FRAGMENTING, then either unpin/unflag the dentries (if they had been
+// marked) or drop the dirfrag's own auth pin (if they had not), and finally
+// unfreeze the dirfrag.
+void MDCache::fragment_unmark_unfreeze_dirs(list<CDir*>& dirs)
+{
+  dout(10) << "fragment_unmark_unfreeze_dirs " << dirs << dendl;
+
+  for (CDir *dir : dirs) {
+    dout(10) << " frag " << *dir << dendl;
+
+    ceph_assert(dir->state_test(CDir::STATE_FRAGMENTING));
+    dir->state_clear(CDir::STATE_FRAGMENTING);
+
+    if (!dir->state_test(CDir::STATE_DNPINNEDFRAG)) {
+      // never marked: the auth pin from fragment_freeze_dirs() is still held
+      dir->auth_unpin(dir);
+    } else {
+      dir->state_clear(CDir::STATE_DNPINNEDFRAG);
+
+      for (auto &item : dir->items) {
+        CDentry *dn = item.second;
+        ceph_assert(dn->state_test(CDentry::STATE_FRAGMENTING));
+        dn->state_clear(CDentry::STATE_FRAGMENTING);
+        dn->put(CDentry::PIN_FRAGMENTING);
+      }
+    }
+
+    dir->unfreeze_dir();
+  }
+}
+
+// For a frozen dirfrag `dir`, find the pending fragment operation on the same
+// inode whose base frag contains dir's frag and return its all_frozen flag.
+// Aborts if no such operation is registered.
+bool MDCache::fragment_are_all_frozen(CDir *dir)
+{
+  ceph_assert(dir->is_frozen_dir());
+
+  auto it = fragments.lower_bound(dirfrag_t(dir->ino(), 0));
+  while (it != fragments.end() && it->first.ino == dir->ino()) {
+    if (it->first.frag.contains(dir->get_frag()))
+      return it->second.all_frozen;
+    ++it;
+  }
+
+  ceph_abort();
+  return false;
+}
+
+// Bump num_remote_waiters on the pending fragment operation (same inode)
+// whose base frag contains dir's frag.  Aborts if there is none.
+void MDCache::fragment_freeze_inc_num_waiters(CDir *dir)
+{
+  auto it = fragments.lower_bound(dirfrag_t(dir->ino(), 0));
+  while (it != fragments.end() && it->first.ino == dir->ino()) {
+    if (it->first.frag.contains(dir->get_frag())) {
+      it->second.num_remote_waiters++;
+      return;
+    }
+    ++it;
+  }
+
+  ceph_abort();
+}
+
+void MDCache::find_stale_fragment_freeze()
+{ /*
+ * Cancel pending fragment operations whose freeze appears to be stuck.
+ *
+ * Only operations that are not yet all_frozen and whose dirfrags all carry
+ * STATE_DNPINNEDFRAG are examined.  For those, the auth-pin counts of the
+ * not-yet-frozen dirfrags are summed; whenever the sum differs from the one
+ * remembered from the previous scan, the remembered value and change time
+ * are updated.  An operation whose sum last changed before
+ * (now - mds_freeze_tree_timeout) is cancelled if it has remote waiters or,
+ * for a non-root inode, its parent dirfrag is freezing: the entry is removed
+ * from `fragments` and its dirfrags are unmarked and unfrozen.
+ */
+ dout(10) << "find_stale_fragment_freeze" << dendl;
+ // see comment in Migrator::find_stale_export_freeze()
+ utime_t now = ceph_clock_now();
+ utime_t cutoff = now;
+ cutoff -= g_conf()->mds_freeze_tree_timeout;
+
+ for (map<dirfrag_t,fragment_info_t>::iterator p = fragments.begin();
+ p != fragments.end(); ) {
+ dirfrag_t df = p->first;
+ fragment_info_t& info = p->second;
+ ++p; // advance first: the current entry may be erased further down
+ if (info.all_frozen)
+ continue;
+ CDir *dir;
+ int total_auth_pins = 0;
+ for (list<CDir*>::iterator q = info.dirs.begin();
+ q != info.dirs.end();
+ ++q) {
+ dir = *q;
+ if (!dir->state_test(CDir::STATE_DNPINNEDFRAG)) {
+ total_auth_pins = -1; // sentinel: a dirfrag has not been marked yet, skip this operation
+ break;
+ }
+ if (dir->is_frozen_dir())
+ continue;
+ total_auth_pins += dir->get_auth_pins() + dir->get_dir_auth_pins();
+ }
+ if (total_auth_pins < 0)
+ continue;
+ if (info.last_cum_auth_pins != total_auth_pins) {
+ info.last_cum_auth_pins = total_auth_pins;
+ info.last_cum_auth_pins_change = now;
+ continue;
+ }
+ if (info.last_cum_auth_pins_change >= cutoff)
+ continue;
+ dir = info.dirs.front();
+ if (info.num_remote_waiters > 0 ||
+ (!dir->inode->is_root() && dir->get_parent_dir()->is_freezing())) {
+ dout(10) << " cancel fragmenting " << df << " bit " << info.bits << dendl;
+ list<CDir*> dirs;
+ info.dirs.swap(dirs); // take the list out before `info` is destroyed by the erase
+ fragments.erase(df);
+ fragment_unmark_unfreeze_dirs(dirs);
+ }
+ }
+}
+
+// Log completion that calls MDCache::_fragment_logged() for the captured
+// request; the result code is ignored.  Submitted together with the
+// EFragment OP_PREPARE entry in dispatch_fragment_dir().
+class C_MDC_FragmentPrep : public MDCacheLogContext {
+  MDRequestRef mdr;
+
+public:
+  C_MDC_FragmentPrep(MDCache *cache, MDRequestRef& req)
+    : MDCacheLogContext(cache),
+      mdr(req)
+  {}
+
+  void finish(int /*result*/) override
+  {
+    mdcache->_fragment_logged(mdr);
+  }
+};
+
+// Completion that calls MDCache::_fragment_stored() for the captured
+// request; the result code is ignored.  Used as the finisher of the gather
+// that commits the resulting dirfrags in _fragment_logged().
+class C_MDC_FragmentStore : public MDCacheContext {
+  MDRequestRef mdr;
+
+public:
+  C_MDC_FragmentStore(MDCache *cache, MDRequestRef& req)
+    : MDCacheContext(cache),
+      mdr(req)
+  {}
+
+  void finish(int /*result*/) override
+  {
+    mdcache->_fragment_stored(mdr);
+  }
+};
+
+// Log completion that calls MDCache::_fragment_committed() with the stored
+// base dirfrag and request; the result code is ignored.
+class C_MDC_FragmentCommit : public MDCacheLogContext {
+  dirfrag_t basedirfrag;
+  MDRequestRef mdr;
+
+public:
+  C_MDC_FragmentCommit(MDCache *cache, dirfrag_t base, const MDRequestRef& req)
+    : MDCacheLogContext(cache),
+      basedirfrag(base),
+      mdr(req)
+  {}
+
+  void finish(int /*result*/) override
+  {
+    mdcache->_fragment_committed(basedirfrag, mdr);
+  }
+};
+
+// I/O completion for the purge of old dirfrag objects.  Accepts a result of
+// 0 or -ENOENT (anything else trips the assertion) and then calls
+// MDCache::_fragment_old_purged() with the stored base dirfrag, bit count
+// and request.
+class C_IO_MDC_FragmentPurgeOld : public MDCacheIOContext {
+  dirfrag_t basedirfrag;
+  int bits;
+  MDRequestRef mdr;
+
+public:
+  C_IO_MDC_FragmentPurgeOld(MDCache *cache, dirfrag_t base, int nbits,
+                            const MDRequestRef& req)
+    : MDCacheIOContext(cache),
+      basedirfrag(base),
+      bits(nbits),
+      mdr(req)
+  {}
+
+  void finish(int r) override
+  {
+    ceph_assert(r == 0 || r == -ENOENT);
+    mdcache->_fragment_old_purged(basedirfrag, bits, mdr);
+  }
+
+  void print(ostream& out) const override
+  {
+    out << "fragment_purge_old(" << basedirfrag << ")";
+  }
+};
+
+// Continuation invoked once every source dirfrag of a fragment operation is
+// frozen, either directly from fragment_mark_and_complete() or through
+// C_MDC_FragmentFrozen.  `r` must be 0.  If the operation is no longer
+// registered for this mdr the request is just finished; otherwise the
+// operation is flagged all_frozen and handed to dispatch_fragment_dir().
+void MDCache::fragment_frozen(MDRequestRef& mdr, int r)
+{
+  dirfrag_t basedirfrag = mdr->more()->fragment_base;
+  map<dirfrag_t,fragment_info_t>::iterator it = fragments.find(basedirfrag);
+  if (it == fragments.end() || it->second.mdr != mdr) {
+    dout(7) << "fragment_frozen " << basedirfrag << " must have aborted" << dendl;
+    request_finish(mdr);
+    return;
+  }
+
+  ceph_assert(r == 0);
+  fragment_info_t& info = it->second;
+  // Stream the inode itself rather than the raw CInode pointer, so this line
+  // describes the inode the same way the other fragment_* messages do.
+  dout(10) << "fragment_frozen " << basedirfrag.frag << " by " << info.bits
+           << " on " << *info.dirs.front()->get_inode() << dendl;
+
+  info.all_frozen = true;
+  dispatch_fragment_dir(mdr);
+}
+
+void MDCache::dispatch_fragment_dir(MDRequestRef& mdr)
+{ /*
+ * Perform and journal the refragmentation for the pending operation
+ * identified by mdr's fragment_base.
+ *
+ * Steps:
+ *  - take wrlocks on the inode's dirfragtreelock, nestlock and filelock;
+ *    return and wait if they cannot be acquired yet;
+ *  - if the request was aborted: requeue the split/merge with the balancer,
+ *    unmark and unfreeze the source dirfrags, drop the operation and finish
+ *    the request;
+ *  - otherwise start an EFragment OP_PREPARE entry that records each
+ *    source frag with its fnode as rollback data, run
+ *    adjust_dir_fragments(), add the resulting frags to the metablob,
+ *    journal the dirfragtree change (projected inode if auth, otherwise
+ *    dirty scatterlock), register the uncommitted fragment, submit the
+ *    entry with C_MDC_FragmentPrep and flush the log.
+ *
+ * If the operation is no longer registered for this mdr, the request is
+ * simply finished.
+ */
+ dirfrag_t basedirfrag = mdr->more()->fragment_base;
+ map<dirfrag_t,fragment_info_t>::iterator it = fragments.find(basedirfrag);
+ if (it == fragments.end() || it->second.mdr != mdr) {
+ dout(7) << "dispatch_fragment_dir " << basedirfrag << " must have aborted" << dendl;
+ request_finish(mdr);
+ return;
+ }
+
+ fragment_info_t& info = it->second;
+ CInode *diri = info.dirs.front()->get_inode();
+
+ dout(10) << "dispatch_fragment_dir " << basedirfrag << " bits " << info.bits
+ << " on " << *diri << dendl;
+ if (!mdr->aborted) {
+ MutationImpl::LockOpVec lov;
+ lov.add_wrlock(&diri->dirfragtreelock);
+ // prevent a racing gather on any other scatterlocks too
+ lov.add_wrlock(&diri->nestlock);
+ lov.add_wrlock(&diri->filelock);
+ if (!mds->locker->acquire_locks(mdr, lov, NULL, true))
+ if (!mdr->aborted) // locks not acquired and not aborted: nothing more to do in this call
+ return;
+ }
+
+ if (mdr->aborted) {
+ dout(10) << " can't auth_pin " << *diri << ", requeuing dir "
+ << info.dirs.front()->dirfrag() << dendl;
+ if (info.bits > 0)
+ mds->balancer->queue_split(info.dirs.front(), false);
+ else
+ mds->balancer->queue_merge(info.dirs.front());
+ fragment_unmark_unfreeze_dirs(info.dirs);
+ fragments.erase(it);
+ request_finish(mdr);
+ return;
+ }
+
+ mdr->ls = mds->mdlog->get_current_segment();
+ EFragment *le = new EFragment(mds->mdlog, EFragment::OP_PREPARE, basedirfrag, info.bits);
+ mds->mdlog->start_entry(le);
+
+ for (list<CDir*>::iterator p = info.dirs.begin(); p != info.dirs.end(); ++p) {
+ CDir *dir = *p;
+ dirfrag_rollback rollback; // carries the pre-fragmentation fnode of this source frag
+ rollback.fnode = dir->fnode;
+ le->add_orig_frag(dir->get_frag(), &rollback);
+ }
+
+ // refragment
+ MDSContext::vec waiters;
+ adjust_dir_fragments(diri, info.dirs, basedirfrag.frag, info.bits,
+ info.resultfrags, waiters, false);
+ if (g_conf()->mds_debug_frag)
+ diri->verify_dirfrags();
+ mds->queue_waiters(waiters);
+
+ for (const auto& fg : le->orig_frags) // none of the original frags may still be a fragtree leaf
+ ceph_assert(!diri->dirfragtree.is_leaf(fg));
+
+ le->metablob.add_dir_context(*info.resultfrags.begin());
+ for (list<CDir*>::iterator p = info.resultfrags.begin();
+ p != info.resultfrags.end();
+ ++p) {
+ if (diri->is_auth()) {
+ le->metablob.add_fragmented_dir(*p, false, false);
+ } else {
+ (*p)->state_set(CDir::STATE_DIRTYDFT);
+ le->metablob.add_fragmented_dir(*p, false, true);
+ }
+ }
+
+ // dft lock
+ if (diri->is_auth()) {
+ // journal dirfragtree
+ auto &pi = diri->project_inode();
+ pi.inode.version = diri->pre_dirty();
+ journal_dirty_inode(mdr.get(), &le->metablob, diri);
+ } else {
+ mds->locker->mark_updated_scatterlock(&diri->dirfragtreelock);
+ mdr->ls->dirty_dirfrag_dirfragtree.push_back(&diri->item_dirty_dirfrag_dirfragtree);
+ mdr->add_updated_lock(&diri->dirfragtreelock);
+ }
+
+ /*
+ // filelock
+ mds->locker->mark_updated_scatterlock(&diri->filelock);
+ mut->ls->dirty_dirfrag_dir.push_back(&diri->item_dirty_dirfrag_dir);
+ mut->add_updated_lock(&diri->filelock);
+
+ // dirlock
+ mds->locker->mark_updated_scatterlock(&diri->nestlock);
+ mut->ls->dirty_dirfrag_nest.push_back(&diri->item_dirty_dirfrag_nest);
+ mut->add_updated_lock(&diri->nestlock);
+ */
+
+ add_uncommitted_fragment(basedirfrag, info.bits, le->orig_frags, mdr->ls);
+ mds->server->submit_mdlog_entry(le, new C_MDC_FragmentPrep(this, mdr),
+ mdr, __func__);
+ mds->mdlog->flush();
+}
+
+void MDCache::_fragment_logged(MDRequestRef& mdr)
+{ /*
+ * Runs (via C_MDC_FragmentPrep) after the EFragment OP_PREPARE entry was
+ * submitted.  Applies the projected inode (when auth for the dir inode)
+ * and the request's pending updates, then marks every resulting dirfrag
+ * dirty and new, auth-pins it, flags it STATE_FRAGMENTING and commits it.
+ * When all commits finish, C_MDC_FragmentStore calls _fragment_stored().
+ * Uses fragments.at(): the operation must still be registered.
+ */
+ dirfrag_t basedirfrag = mdr->more()->fragment_base;
+ auto& info = fragments.at(basedirfrag);
+ CInode *diri = info.resultfrags.front()->get_inode();
+
+ dout(10) << "fragment_logged " << basedirfrag << " bits " << info.bits
+ << " on " << *diri << dendl;
+ mdr->mark_event("prepare logged");
+
+ if (diri->is_auth())
+ diri->pop_and_dirty_projected_inode(mdr->ls);
+
+ mdr->apply(); // mark scatterlock
+
+ // store resulting frags
+ MDSGatherBuilder gather(g_ceph_context, new C_MDC_FragmentStore(this, mdr));
+
+ for (list<CDir*>::iterator p = info.resultfrags.begin();
+ p != info.resultfrags.end();
+ ++p) {
+ CDir *dir = *p;
+ dout(10) << " storing result frag " << *dir << dendl;
+
+ dir->mark_dirty(dir->pre_dirty(), mdr->ls);
+ dir->mark_new(mdr->ls);
+
+ // freeze and store them too
+ dir->auth_pin(this); // matched by auth_unpin(this) in fragment_maybe_finish()
+ dir->state_set(CDir::STATE_FRAGMENTING);
+ dir->commit(0, gather.new_sub(), true); // ignore authpinnability
+ }
+
+ gather.activate();
+}
+
+void MDCache::_fragment_stored(MDRequestRef& mdr)
+{ /*
+ * Runs (via C_MDC_FragmentStore) once all resulting dirfrags have been
+ * committed.  Sends an MMDSFragmentNotify, carrying replicas of the new
+ * dirfrags, to every replica holder of the first result frag that is far
+ * enough along in recovery; requests an ack from some of them (see the
+ * comment in the loop).  Then journals EFragment OP_COMMIT, unpins/unflags
+ * the dentries of the result frags and unfreezes them.  Locks are dropped
+ * fully when no acks are pending, otherwise only via
+ * drop_locks_for_fragment_unfreeze().
+ */
+ dirfrag_t basedirfrag = mdr->more()->fragment_base;
+ fragment_info_t &info = fragments.at(basedirfrag);
+ CDir *first = info.resultfrags.front();
+ CInode *diri = first->get_inode();
+
+ dout(10) << "fragment_stored " << basedirfrag << " bits " << info.bits
+ << " on " << *diri << dendl;
+ mdr->mark_event("new frags stored");
+
+ // tell peers
+ mds_rank_t diri_auth = (first->is_subtree_root() && !diri->is_auth()) ?
+ diri->authority().first : CDIR_AUTH_UNKNOWN;
+ for (const auto &p : first->get_replicas()) {
+ if (mds->mdsmap->get_state(p.first) < MDSMap::STATE_REJOIN ||
+ (mds->mdsmap->get_state(p.first) == MDSMap::STATE_REJOIN &&
+ rejoin_gather.count(p.first)))
+ continue;
+
+ auto notify = MMDSFragmentNotify::create(basedirfrag, info.bits, mdr->reqid.tid);
+ if (diri_auth != CDIR_AUTH_UNKNOWN && // subtree root
+ diri_auth != p.first) { // not auth mds of diri
+ /*
+ * In the normal case, an mds does not trim a dir inode whose child
+ * dirfrags are likely being fragmented (see trim_inode()). But when
+ * fragmenting subtree roots, the following race can happen:
+ *
+ * - mds.a (auth mds of dirfrag) sends fragment_notify message to
+ * mds.c and drops wrlock on dirfragtreelock.
+ * - mds.b (auth mds of dir inode) changes dirfragtreelock state to
+ * SYNC and sends a lock message to mds.c
+ * - mds.c receives the lock message and changes dirfragtreelock state
+ * to SYNC
+ * - mds.c trims the dirfrag and dir inode from its cache
+ * - mds.c receives the fragment_notify message
+ *
+ * So we need to ensure replicas have received the notify, then unlock
+ * the dirfragtreelock.
+ */
+ notify->mark_ack_wanted();
+ info.notify_ack_waiting.insert(p.first);
+ }
+
+ // freshly replicate new dirs to peers
+ for (list<CDir*>::iterator q = info.resultfrags.begin();
+ q != info.resultfrags.end();
+ ++q)
+ replicate_dir(*q, p.first, notify->basebl);
+
+ mds->send_message_mds(notify, p.first);
+ }
+
+ // journal commit
+ EFragment *le = new EFragment(mds->mdlog, EFragment::OP_COMMIT, basedirfrag, info.bits);
+ mds->mdlog->start_submit_entry(le, new C_MDC_FragmentCommit(this, basedirfrag, mdr));
+
+
+ // unfreeze resulting frags
+ for (list<CDir*>::iterator p = info.resultfrags.begin();
+ p != info.resultfrags.end();
+ ++p) {
+ CDir *dir = *p;
+ dout(10) << " result frag " << *dir << dendl;
+
+ for (auto &p : dir->items) { // note: this `p` shadows the outer iterator
+ CDentry *dn = p.second;
+ ceph_assert(dn->state_test(CDentry::STATE_FRAGMENTING));
+ dn->state_clear(CDentry::STATE_FRAGMENTING);
+ dn->put(CDentry::PIN_FRAGMENTING);
+ }
+
+ // unfreeze
+ dir->unfreeze_dir();
+ }
+
+ if (info.notify_ack_waiting.empty()) {
+ fragment_drop_locks(info);
+ } else {
+ mds->locker->drop_locks_for_fragment_unfreeze(mdr.get());
+ }
+}
+
+void MDCache::_fragment_committed(dirfrag_t basedirfrag, const MDRequestRef& mdr)
+{ /*
+ * Purge the on-disk objects of the old dirfrags recorded for
+ * `basedirfrag` in uncommitted_fragments.  Called from
+ * C_MDC_FragmentCommit, and from rollback_uncommitted_fragments() with an
+ * empty `mdr`.
+ *
+ * One object mutation is issued per old frag: the object for frag_t() is
+ * truncated to 0 and its omap cleared, every other old frag object is
+ * removed.  When all mutations complete, C_IO_MDC_FragmentPurgeOld is run
+ * through mds->finisher.  Uses uncommitted_fragments.at(), so the entry
+ * must exist, and asserts that at least one old frag was recorded.
+ */
+ dout(10) << "fragment_committed " << basedirfrag << dendl;
+ if (mdr)
+ mdr->mark_event("commit logged");
+
+ ufragment &uf = uncommitted_fragments.at(basedirfrag);
+
+ // remove old frags
+ C_GatherBuilder gather(
+ g_ceph_context,
+ new C_OnFinisher(
+ new C_IO_MDC_FragmentPurgeOld(this, basedirfrag, uf.bits, mdr),
+ mds->finisher));
+
+ SnapContext nullsnapc;
+ object_locator_t oloc(mds->mdsmap->get_metadata_pool());
+ for (const auto& fg : uf.old_frags) {
+ object_t oid = CInode::get_object_name(basedirfrag.ino, fg, "");
+ ObjectOperation op;
+ if (fg == frag_t()) {
+ // backtrace object
+ dout(10) << " truncate orphan dirfrag " << oid << dendl;
+ op.truncate(0);
+ op.omap_clear();
+ } else {
+ dout(10) << " removing orphan dirfrag " << oid << dendl;
+ op.remove();
+ }
+ mds->objecter->mutate(oid, oloc, op, nullsnapc,
+ ceph::real_clock::now(),
+ 0, gather.new_sub());
+ }
+
+ ceph_assert(gather.has_subs());
+ gather.activate();
+}
+
+// Called (via C_IO_MDC_FragmentPurgeOld) after the old dirfrag objects were
+// purged.  Journals EFragment OP_FINISH, retires the uncommitted-fragment
+// record and bumps the split/merge perf counter according to the sign of
+// `bits`.  With a non-empty `mdr`, the in-memory operation is flagged as
+// finishing and completed right away unless notify acks are still pending.
+void MDCache::_fragment_old_purged(dirfrag_t basedirfrag, int bits, const MDRequestRef& mdr)
+{
+  dout(10) << "fragment_old_purged " << basedirfrag << dendl;
+  if (mdr)
+    mdr->mark_event("old frags purged");
+
+  EFragment *le = new EFragment(mds->mdlog, EFragment::OP_FINISH, basedirfrag, bits);
+  mds->mdlog->start_submit_entry(le);
+
+  finish_uncommitted_fragment(basedirfrag, EFragment::OP_FINISH);
+
+  if (mds->logger) {
+    if (bits > 0) {
+      mds->logger->inc(l_mds_dir_split);
+    } else {
+      mds->logger->inc(l_mds_dir_merge);
+    }
+  }
+
+  if (mdr) {
+    auto it = fragments.find(basedirfrag);
+    ceph_assert(it != fragments.end());
+    it->second.finishing = true;
+    if (it->second.notify_ack_waiting.empty())
+      fragment_maybe_finish(it);
+    else
+      // completion is deferred to handle_fragment_notify_ack()
+      mdr->mark_event("waiting for notify acks");
+  }
+}
+
+// Release all locks held by the fragment operation's request and finish the
+// request.  info.mdr itself is not reset here (the reset stays disabled).
+void MDCache::fragment_drop_locks(fragment_info_t& info)
+{
+  MDRequestRef& req = info.mdr;
+
+  mds->locker->drop_locks(req.get());
+  request_finish(req);
+  //info.mdr.reset();
+}
+
+// Complete the fragment operation at `it` if it has been flagged as
+// finishing: clear STATE_FRAGMENTING on and auth-unpin every resulting
+// dirfrag, let the balancer reconsider each of them, then drop the entry
+// from `fragments`.  Does nothing when the operation is not finishing yet.
+void MDCache::fragment_maybe_finish(const fragment_info_iterator& it)
+{
+  auto& info = it->second;
+  if (!info.finishing)
+    return;
+
+  // unmark & auth_unpin
+  for (CDir *dir : info.resultfrags) {
+    dir->state_clear(CDir::STATE_FRAGMENTING);
+    dir->auth_unpin(this);
+
+    // The resulting fragments may already exceed the split size (they
+    // could have taken inserts between being unfrozen and now), so give
+    // the balancer a chance to fragment them again right away.
+    mds->balancer->maybe_fragment(dir, false);
+  }
+
+  fragments.erase(it);
+}
+
+
+// Handle an MMDSFragmentNotifyAck from a peer.  Ignored while this MDS is
+// below STATE_ACTIVE, and dropped as obsolete when no matching operation
+// (same base dirfrag and tid) is registered.  When the ack removes the last
+// rank we were waiting for, the operation's locks are dropped and the
+// operation is finished if it is already flagged as finishing.
+void MDCache::handle_fragment_notify_ack(const MMDSFragmentNotifyAck::const_ref &ack)
+{
+  dout(10) << "handle_fragment_notify_ack " << *ack << " from " << ack->get_source() << dendl;
+  const mds_rank_t from = mds_rank_t(ack->get_source().num());
+
+  if (mds->get_state() < MDSMap::STATE_ACTIVE)
+    return;
+
+  auto it = fragments.find(ack->get_base_dirfrag());
+  const bool obsolete =
+    (it == fragments.end()) || (it->second.get_tid() != ack->get_tid());
+  if (obsolete) {
+    dout(10) << "handle_fragment_notify_ack obsolete message, dropping" << dendl;
+    return;
+  }
+
+  auto& pending = it->second.notify_ack_waiting;
+  if (!pending.erase(from))
+    return;   // we were not waiting on this rank
+  if (!pending.empty())
+    return;   // still waiting on other ranks
+
+  fragment_drop_locks(it->second);
+  fragment_maybe_finish(it);
+}
+
+void MDCache::handle_fragment_notify(const MMDSFragmentNotify::const_ref &notify)
+{ /*
+ * Handle an MMDSFragmentNotify from the MDS that fragmented a directory.
+ * Ignored while this MDS is below STATE_REJOIN.  Otherwise the
+ * same refragmentation is applied to the local copy of the dir inode,
+ * waiters on the resulting frags are collected, the replicated dirfrags
+ * carried in notify->basebl are added to the cache, and all collected
+ * waiters are queued.  Aborts if the inode is not in the cache.  An
+ * MMDSFragmentNotifyAck is sent back when the sender asked for one.
+ */
+ dout(10) << "handle_fragment_notify " << *notify << " from " << notify->get_source() << dendl;
+ mds_rank_t from = mds_rank_t(notify->get_source().num());
+
+ if (mds->get_state() < MDSMap::STATE_REJOIN) {
+ return;
+ }
+
+ CInode *diri = get_inode(notify->get_ino());
+ if (diri) {
+ frag_t base = notify->get_basefrag();
+ int bits = notify->get_bits();
+
+/*
+ if ((bits < 0 && diri->dirfragtree.is_leaf(base)) ||
+ (bits > 0 && !diri->dirfragtree.is_leaf(base))) {
+ dout(10) << " dft " << diri->dirfragtree << " state doesn't match " << base << " by " << bits
+ << ", must have found out during resolve/rejoin? ignoring. " << *diri << dendl;
+ return;
+ }
+*/
+
+ // refragment
+ MDSContext::vec waiters;
+ list<CDir*> resultfrags;
+ adjust_dir_fragments(diri, base, bits, resultfrags, waiters, false);
+ if (g_conf()->mds_debug_frag)
+ diri->verify_dirfrags();
+
+ for (list<CDir*>::iterator p = resultfrags.begin(); p != resultfrags.end(); ++p)
+ diri->take_dir_waiting((*p)->get_frag(), waiters);
+
+ // add new replica dirs values
+ auto p = notify->basebl.cbegin();
+ while (!p.end()) // consume every encoded dirfrag in the payload
+ add_replica_dir(p, diri, from, waiters);
+
+ mds->queue_waiters(waiters);
+ } else {
+ ceph_abort();
+ }
+
+ if (notify->is_ack_wanted()) {
+ auto ack = MMDSFragmentNotifyAck::create(notify->get_base_dirfrag(),
+ notify->get_bits(), notify->get_tid());
+ mds->send_message_mds(ack, from);
+ }
+}
+
+// Record a fragment operation on `basedirfrag` as uncommitted: remember the
+// bit count, the old frags and the log segment, and register the dirfrag
+// with that segment.  When `rollback` is non-null its contents are swapped
+// into the record.  Asserts that no record exists yet for `basedirfrag`.
+void MDCache::add_uncommitted_fragment(dirfrag_t basedirfrag, int bits, const frag_vec_t& old_frags,
+                                       LogSegment *ls, bufferlist *rollback)
+{
+  dout(10) << "add_uncommitted_fragment: base dirfrag " << basedirfrag << " bits " << bits << dendl;
+  ceph_assert(!uncommitted_fragments.count(basedirfrag));
+
+  ufragment& record = uncommitted_fragments[basedirfrag];
+  record.old_frags = old_frags;
+  record.bits = bits;
+  record.ls = ls;
+
+  ls->uncommitted_fragments.insert(basedirfrag);
+
+  if (rollback != nullptr) {
+    record.rollback.swap(*rollback);
+  }
+}
+
+// Process the end of a fragment operation for `basedirfrag`.  If a record
+// exists: for any op other than OP_FINISH with old frags recorded, the
+// record is only flagged as committed; otherwise it is detached from its
+// log segment, its waiters are queued and it is erased.  No-op when there
+// is no record.
+void MDCache::finish_uncommitted_fragment(dirfrag_t basedirfrag, int op)
+{
+  dout(10) << "finish_uncommitted_fragments: base dirfrag " << basedirfrag
+           << " op " << EFragment::op_name(op) << dendl;
+
+  auto it = uncommitted_fragments.find(basedirfrag);
+  if (it == uncommitted_fragments.end())
+    return;
+
+  ufragment& record = it->second;
+  const bool only_mark_committed =
+    (op != EFragment::OP_FINISH) && !record.old_frags.empty();
+  if (only_mark_committed) {
+    record.committed = true;
+    return;
+  }
+
+  record.ls->uncommitted_fragments.erase(basedirfrag);
+  mds->queue_waiters(record.waiters);
+  uncommitted_fragments.erase(it);
+}
+
+// Process a rollback for the uncommitted fragment record of `basedirfrag`.
+// If the record has old frags, they are replaced by `old_frags` (moved in)
+// and the record is flagged as committed; otherwise the record is detached
+// from its log segment and erased.  No-op when there is no record.
+void MDCache::rollback_uncommitted_fragment(dirfrag_t basedirfrag, frag_vec_t&& old_frags)
+{
+  dout(10) << "rollback_uncommitted_fragment: base dirfrag " << basedirfrag
+           << " old_frags (" << old_frags << ")" << dendl;
+
+  auto it = uncommitted_fragments.find(basedirfrag);
+  if (it == uncommitted_fragments.end())
+    return;
+
+  ufragment& record = it->second;
+  if (record.old_frags.empty()) {
+    record.ls->uncommitted_fragments.erase(basedirfrag);
+    uncommitted_fragments.erase(it);
+    return;
+  }
+
+  record.old_frags = std::move(old_frags);
+  record.committed = true;
+}
+
+// Arrange for `finisher` to run through a gather that has one sub-context
+// registered as a waiter on every currently uncommitted fragment record.
+void MDCache::wait_for_uncommitted_fragments(MDSContext* finisher)
+{
+  MDSGatherBuilder gather(g_ceph_context, finisher);
+
+  for (auto& entry : uncommitted_fragments) {
+    ufragment& record = entry.second;
+    record.waiters.push_back(gather.new_sub());
+  }
+
+  gather.activate();
+}
+
+void MDCache::rollback_uncommitted_fragments()
+{ /*
+ * Resolve every entry left in uncommitted_fragments.
+ *
+ * Entries already flagged committed only need their old dirfrag objects
+ * purged, so _fragment_committed() is called for them.  Every other entry
+ * is rolled back: an EFragment OP_ROLLBACK entry is journaled, the original
+ * dirfrags are recreated (old-format entries without recorded old frags by
+ * reversing the bit count; otherwise via force_dir_fragment() plus the
+ * fnode decoded from the stored rollback data), the dirfragtree change is
+ * recorded, and finally the frags that existed before the rollback are
+ * handed to _fragment_committed() for purging.  Asserts that the dir inode
+ * of each entry is in the cache.
+ */
+ dout(10) << "rollback_uncommitted_fragments: " << uncommitted_fragments.size() << " pending" << dendl;
+ for (map<dirfrag_t, ufragment>::iterator p = uncommitted_fragments.begin();
+ p != uncommitted_fragments.end();
+ ++p) {
+ ufragment &uf = p->second;
+ CInode *diri = get_inode(p->first.ino);
+ ceph_assert(diri);
+
+ if (uf.committed) {
+ _fragment_committed(p->first, MDRequestRef());
+ continue;
+ }
+
+ dout(10) << " rolling back " << p->first << " refragment by " << uf.bits << " bits" << dendl;
+
+ LogSegment *ls = mds->mdlog->get_current_segment();
+ EFragment *le = new EFragment(mds->mdlog, EFragment::OP_ROLLBACK, p->first, uf.bits);
+ mds->mdlog->start_entry(le);
+ bool diri_auth = (diri->authority() != CDIR_AUTH_UNDEF);
+
+ frag_vec_t old_frags; // leaves currently under the base frag, i.e. the frags to purge after rollback
+ diri->dirfragtree.get_leaves_under(p->first.frag, old_frags);
+
+ list<CDir*> resultfrags;
+ if (uf.old_frags.empty()) {
+ // created by old format EFragment
+ MDSContext::vec waiters;
+ adjust_dir_fragments(diri, p->first.frag, -uf.bits, resultfrags, waiters, true);
+ } else {
+ auto bp = uf.rollback.cbegin();
+ for (const auto& fg : uf.old_frags) {
+ CDir *dir = force_dir_fragment(diri, fg);
+ resultfrags.push_back(dir);
+
+ dirfrag_rollback rollback;
+ decode(rollback, bp); // one dirfrag_rollback is decoded per old frag, in order
+
+ dir->set_version(rollback.fnode.version);
+ dir->fnode = rollback.fnode;
+
+ dir->_mark_dirty(ls);
+
+ if (!(dir->fnode.rstat == dir->fnode.accounted_rstat)) {
+ dout(10) << " dirty nestinfo on " << *dir << dendl;
+ mds->locker->mark_updated_scatterlock(&dir->inode->nestlock);
+ ls->dirty_dirfrag_nest.push_back(&dir->inode->item_dirty_dirfrag_nest);
+ }
+ if (!(dir->fnode.fragstat == dir->fnode.accounted_fragstat)) {
+ dout(10) << " dirty fragstat on " << *dir << dendl;
+ mds->locker->mark_updated_scatterlock(&dir->inode->filelock);
+ ls->dirty_dirfrag_dir.push_back(&dir->inode->item_dirty_dirfrag_dir);
+ }
+
+ le->add_orig_frag(dir->get_frag());
+ le->metablob.add_dir_context(dir);
+ if (diri_auth) {
+ le->metablob.add_fragmented_dir(dir, true, false);
+ } else {
+ dout(10) << " dirty dirfragtree on " << *dir << dendl;
+ dir->state_set(CDir::STATE_DIRTYDFT);
+ le->metablob.add_fragmented_dir(dir, true, true);
+ }
+ }
+ }
+
+ if (diri_auth) {
+ auto &pi = diri->project_inode();
+ pi.inode.version = diri->pre_dirty();
+ diri->pop_and_dirty_projected_inode(ls); // hacky
+ le->metablob.add_primary_dentry(diri->get_projected_parent_dn(), diri, true);
+ } else {
+ mds->locker->mark_updated_scatterlock(&diri->dirfragtreelock);
+ ls->dirty_dirfrag_dirfragtree.push_back(&diri->item_dirty_dirfrag_dirfragtree);
+ }
+
+ if (g_conf()->mds_debug_frag)
+ diri->verify_dirfrags();
+
+ for (const auto& leaf : old_frags) { // the pre-rollback frags must no longer be fragtree leaves
+ ceph_assert(!diri->dirfragtree.is_leaf(leaf));
+ }
+
+ mds->mdlog->submit_entry(le);
+
+ uf.old_frags.swap(old_frags); // the record now lists the frags whose objects must be purged
+ _fragment_committed(p->first, MDRequestRef());
+ }
+}
+
+// Switch the cache to read-only mode (no-op if it already is): log and
+// cluster-log the event, set the read-only flag, force clients read-only,
+// re-evaluate the cap locks of every head inode, and flush the journal.
+void MDCache::force_readonly()
+{
+  if (is_readonly())
+    return;
+
+  dout(1) << "force file system read-only" << dendl;
+  mds->clog->warn() << "force file system read-only";
+
+  set_readonly();
+
+  mds->server->force_clients_readonly();
+
+  // revoke write caps
+  int visited = 0;
+  for (auto &entry : inode_map) {
+    CInode *in = entry.second;
+    if (in->is_head()) {
+      mds->locker->eval(in, CEPH_CAP_LOCKS);
+    }
+    ++visited;
+    // reset the heartbeat once per 1000 inodes visited
+    if (visited % 1000 == 0) {
+      mds->heartbeat_reset();
+    }
+  }
+
+  mds->mdlog->flush();
+}
+
+
+// ==============================================================
+// debug crap
+
+/**
+ * Log the subtree map as an indented tree (debug aid).
+ *
+ * @param dbl debug level the tree is logged at; raised by 15 when
+ * mds_thrash_exports is set
+ * @param force_print log the tree even if the number of subtrees or their
+ * maximum depth exceeds SUBTREES_COUNT_THRESHOLD /
+ * SUBTREES_DEPTH_THRESHOLD
+ *
+ * When the walk is not cut short by one of the early returns, this also
+ * asserts that no subtree is reached twice and that every key of the
+ * subtree map was reached starting from the dirfrags of the base inodes.
+ */
+void MDCache::show_subtrees(int dbl, bool force_print)
+{
+ if (g_conf()->mds_thrash_exports)
+ dbl += 15;
+
+ //dout(10) << "show_subtrees" << dendl;
+
+ if (!g_conf()->subsys.should_gather(ceph_subsys_mds, dbl))
+ return; // i won't print anything.
+
+ if (subtrees.empty()) {
+ dout(ceph::dout::need_dynamic(dbl)) << "show_subtrees - no subtrees"
+ << dendl;
+ return;
+ }
+
+ // too many subtrees: only print when forced or when level 25 is gathered
+ if (!force_print && subtrees.size() > SUBTREES_COUNT_THRESHOLD &&
+ !g_conf()->subsys.should_gather<ceph_subsys_mds, 25>()) {
+ dout(ceph::dout::need_dynamic(dbl)) << "number of subtrees = " << subtrees.size() << "; not "
+ "printing subtrees" << dendl;
+ return;
+ }
+
+ // root frags
+ list<CDir*> basefrags;
+ for (set<CInode*>::iterator p = base_inodes.begin();
+ p != base_inodes.end();
+ ++p)
+ (*p)->get_dirfrags(basefrags);
+ //dout(15) << "show_subtrees, base dirfrags " << basefrags << dendl;
+ dout(15) << "show_subtrees" << dendl;
+
+ // queue stuff
+ list<pair<CDir*,int> > q;
+ string indent;
+ set<CDir*> seen;
+
+ // calc max depth
+ for (list<CDir*>::iterator p = basefrags.begin(); p != basefrags.end(); ++p)
+ q.push_back(pair<CDir*,int>(*p, 0));
+
+ // every subtree root reached by the first pass; checked against the map at the end
+ set<CDir*> subtrees_seen;
+
+ unsigned int depth = 0;
+ while (!q.empty()) {
+ CDir *dir = q.front().first;
+ unsigned int d = q.front().second;
+ q.pop_front();
+
+ // base dirfrags that are not subtree roots are ignored
+ if (subtrees.count(dir) == 0) continue;
+
+ subtrees_seen.insert(dir);
+
+ if (d > depth) depth = d;
+
+ // sanity check
+ //dout(25) << "saw depth " << d << " " << *dir << dendl;
+ if (seen.count(dir)) dout(0) << "aah, already seen " << *dir << dendl;
+ ceph_assert(seen.count(dir) == 0);
+ seen.insert(dir);
+
+ // nested items?
+ if (!subtrees[dir].empty()) {
+ for (set<CDir*>::iterator p = subtrees[dir].begin();
+ p != subtrees[dir].end();
+ ++p) {
+ //dout(25) << " saw sub " << **p << dendl;
+ // push_front makes the walk depth-first
+ q.push_front(pair<CDir*,int>(*p, d+1));
+ }
+ }
+ }
+
+ // too deep: only print when forced or when level 25 is gathered
+ if (!force_print && depth > SUBTREES_DEPTH_THRESHOLD &&
+ !g_conf()->subsys.should_gather<ceph_subsys_mds, 25>()) {
+ dout(ceph::dout::need_dynamic(dbl)) << "max depth among subtrees = " << depth << "; not printing "
+ "subtrees" << dendl;
+ return;
+ }
+
+ // print tree
+ for (list<CDir*>::iterator p = basefrags.begin(); p != basefrags.end(); ++p)
+ q.push_back(pair<CDir*,int>(*p, 0));
+
+ while (!q.empty()) {
+ CDir *dir = q.front().first;
+ int d = q.front().second;
+ q.pop_front();
+
+ if (subtrees.count(dir) == 0) continue;
+
+ // adjust indenter
+ while ((unsigned)d < indent.size())
+ indent.resize(d);
+
+ // pad
+ string pad = "______________________________________";
+ pad.resize(depth*2+1-indent.size());
+ if (!subtrees[dir].empty())
+ pad[0] = '.'; // parent
+
+
+ string auth;
+ if (dir->is_auth())
+ auth = "auth ";
+ else
+ auth = " rep ";
+
+ // s holds the dir_auth pair; only the first rank is shown when the second is unknown
+ char s[10];
+ if (dir->get_dir_auth().second == CDIR_AUTH_UNKNOWN)
+ snprintf(s, sizeof(s), "%2d ", int(dir->get_dir_auth().first));
+ else
+ snprintf(s, sizeof(s), "%2d,%2d", int(dir->get_dir_auth().first), int(dir->get_dir_auth().second));
+
+ // print
+ dout(ceph::dout::need_dynamic(dbl)) << indent << "|_" << pad << s
+ << " " << auth << *dir << dendl;
+
+ // cross-check the well-known inodes against the cache's own pointers
+ if (dir->ino() == MDS_INO_ROOT)
+ ceph_assert(dir->inode == root);
+ if (dir->ino() == MDS_INO_MDSDIR(mds->get_nodeid()))
+ ceph_assert(dir->inode == myin);
+ if (dir->inode->is_stray() && (MDS_INO_STRAY_OWNER(dir->ino()) == mds->get_nodeid()))
+ ceph_assert(strays[MDS_INO_STRAY_INDEX(dir->ino())] == dir->inode);
+
+ // nested items?
+ if (!subtrees[dir].empty()) {
+ // more at my level?
+ if (!q.empty() && q.front().second == d)
+ indent += "| ";
+ else
+ indent += " ";
+
+ // NOTE(review): children are queued with d+2 here (d+1 in the depth pass);
+ // in this pass d is compared against indent.size(), so it presumably counts
+ // indent characters rather than levels -- confirm
+ for (set<CDir*>::iterator p = subtrees[dir].begin();
+ p != subtrees[dir].end();
+ ++p)
+ q.push_front(pair<CDir*,int>(*p, d+2));
+ }
+ }
+
+ // verify there isn't stray crap in subtree map
+ int lost = 0;
+ for (map<CDir*, set<CDir*> >::iterator p = subtrees.begin();
+ p != subtrees.end();
+ ++p) {
+ if (subtrees_seen.count(p->first)) continue;
+ dout(10) << "*** stray/lost entry in subtree map: " << *p->first << dendl;
+ lost++;
+ }
+ ceph_assert(lost == 0);
+}
+
+/**
+ * Log every cached inode at debug level 7, together with its dirfrags and
+ * their dentries. Both inode_map and snap_inode_map are walked.
+ */
+void MDCache::show_cache()
+{
+  dout(7) << "show_cache" << dendl;
+
+  // Emits the log lines for one inode.
+  auto log_inode = [this](CInode *in) {
+    // an inode with no parent dentry is reported as unlinked
+    if (!in->parent)
+      dout(7) << " unlinked " << *in << dendl;
+
+    list<CDir*> frags;
+    in->get_dirfrags(frags);
+    for (CDir *dir : frags) {
+      dout(7) << " dirfrag " << *dir << dendl;
+
+      for (auto &item : dir->items) {
+        CDentry *dn = item.second;
+        dout(7) << " dentry " << *dn << dendl;
+
+        // only a primary linkage with an inode attached gets the extra line
+        CDentry::linkage_t *linkage = dn->get_linkage();
+        if (!linkage->is_primary() || !linkage->get_inode())
+          continue;
+        dout(7) << " inode " << *linkage->get_inode() << dendl;
+      }
+    }
+  };
+
+  for (auto &entry : inode_map)
+    log_inode(entry.second);
+  for (auto &entry : snap_inode_map)
+    log_inode(entry.second);
+}
+
+/**
+ * Dump the mds_co mempool statistics into the formatter as
+ * { "cache": { "pool": ... } }.
+ */
+void MDCache::cache_status(Formatter *f)
+{
+  f->open_object_section("cache");
+  {
+    f->open_object_section("pool");
+    auto &pool = mempool::get_pool(mempool::mds_co::id);
+    pool.dump(f);
+    f->close_section();  // pool
+  }
+  f->close_section();  // cache
+}
+
+/**
+ * Recursively dump the cached tree below an inode into a formatter.
+ * Children are dumped before the inode itself.
+ *
+ * @param in inode to start from; must not be null
+ * @param cur_depth depth of 'in' relative to the starting inode
+ * @param max_depth deepest level to dump; a negative value disables the limit
+ * @param f formatter that receives one "inode" section per inode
+ */
+void MDCache::dump_tree(CInode *in, const int cur_depth, const int max_depth, Formatter *f)
+{
+  ceph_assert(in);
+
+  const bool depth_limited = (max_depth >= 0);
+  if (depth_limited && cur_depth > max_depth)
+    return;
+
+  list<CDir*> frags;
+  in->get_dirfrags(frags);
+  for (const auto &frag : frags) {
+    for (const auto &item : frag->items) {
+      // dentries without a linked inode have nothing to descend into
+      CInode *child = item.second->get_linkage()->get_inode();
+      if (!child)
+        continue;
+      dump_tree(child, cur_depth + 1, max_depth, f);
+    }
+  }
+
+  f->open_object_section("inode");
+  in->dump(f, CInode::DUMP_DEFAULT | CInode::DUMP_DIRFRAGS);
+  f->close_section();
+}
+
+/** Dump the cache as plain text to the named file (no formatter). */
+int MDCache::dump_cache(std::string_view file_name)
+{
+  Formatter *no_formatter = nullptr;
+  return dump_cache(file_name, no_formatter);
+}
+
+/** Dump the cache into the given formatter (no output file). */
+int MDCache::dump_cache(Formatter *f)
+{
+  const std::string_view no_file("");
+  return dump_cache(no_file, f);
+}
+
+/**
+ * Dump the metadata cache, either to a Formatter, if
+ * provided, else to a plain text file.
+ *
+ * @param fn path of the text file to create; only used when f is null.
+ *           When empty, "cachedump.<epoch>.mds<rank>" is used instead.
+ * @param f formatter to dump into, or null to dump to a file
+ * @return 0 on success, a negative error code otherwise. When a formatter
+ *         is given and the cache exceeds the dump threshold, the error is
+ *         reported inside the formatter and 0 is returned.
+ */
+int MDCache::dump_cache(std::string_view fn, Formatter *f)
+{
+  int r = 0;
+
+  // dumping large caches may cause mds to hang or worse get killed.
+  // so, disallow the dump if the cache size exceeds the configured
+  // threshold, which is 1G for formatter and unlimited for file (note
+  // that this can be jacked up by the admin... and is nothing but foot
+  // shooting, but the option itself is for devs and hence dangerous to
+  // tune). TODO: remove this when fixed.
+  uint64_t threshold = f ?
+    g_conf().get_val<Option::size_t>("mds_dump_cache_threshold_formatter") :
+    g_conf().get_val<Option::size_t>("mds_dump_cache_threshold_file");
+
+  if (threshold && cache_size() > threshold) {
+    if (f) {
+      std::stringstream ss;
+      ss << "cache usage exceeds dump threshold";
+      f->open_object_section("result");
+      f->dump_string("error", ss.str());
+      f->close_section();
+    } else {
+      derr << "cache usage exceeds dump threshold" << dendl;
+      r = -EINVAL;
+    }
+    return r;
+  }
+
+  int fd = -1;
+
+  if (f) {
+    f->open_array_section("inodes");
+  } else {
+    char path[PATH_MAX] = "";
+    if (fn.length()) {
+      // a string_view need not be NUL-terminated, so bound the copy by its length
+      snprintf(path, sizeof path, "%.*s", static_cast<int>(fn.length()), fn.data());
+    } else {
+      snprintf(path, sizeof path, "cachedump.%d.mds%d", (int)mds->mdsmap->get_epoch(), int(mds->get_nodeid()));
+    }
+
+    dout(1) << "dump_cache to " << path << dendl;
+
+    // O_EXCL: never overwrite an existing file
+    fd = ::open(path, O_WRONLY|O_CREAT|O_EXCL|O_CLOEXEC, 0600);
+    if (fd < 0) {
+      // capture errno before logging, which may clobber it, and return it
+      // negated like every other error path of this function
+      const int err = errno;
+      derr << "failed to open " << path << ": " << cpp_strerror(err) << dendl;
+      return -err;
+    }
+  }
+
+  // Dumps one inode; returns 1 on success or the negative result of a
+  // failed write.
+  auto dump_func = [fd, f](CInode *in) {
+    int r;
+    if (f) {
+      f->open_object_section("inode");
+      in->dump(f, CInode::DUMP_DEFAULT | CInode::DUMP_DIRFRAGS);
+      f->close_section();
+      return 1;
+    }
+    ostringstream ss;
+    ss << *in << std::endl;
+    std::string s = ss.str();
+    r = safe_write(fd, s.c_str(), s.length());
+    if (r < 0)
+      return r;
+    list<CDir*> dfs;
+    in->get_dirfrags(dfs);
+    for (auto &dir : dfs) {
+      ostringstream tt;
+      tt << " " << *dir << std::endl;
+      std::string t = tt.str();
+      r = safe_write(fd, t.c_str(), t.length());
+      if (r < 0)
+        return r;
+      for (auto &p : dir->items) {
+        CDentry *dn = p.second;
+        ostringstream uu;
+        uu << " " << *dn << std::endl;
+        std::string u = uu.str();
+        r = safe_write(fd, u.c_str(), u.length());
+        if (r < 0)
+          return r;
+      }
+      dir->check_rstats();
+    }
+    return 1;
+  };
+
+  for (auto &p : inode_map) {
+    r = dump_func(p.second);
+    if (r < 0)
+      goto out;
+  }
+  for (auto &p : snap_inode_map) {
+    r = dump_func(p.second);
+    if (r < 0)
+      goto out;
+  }
+  r = 0;
+
+ out:
+  // always close what was opened above, also on a failed write
+  if (f) {
+    f->close_section(); // inodes
+  } else {
+    ::close(fd);
+  }
+  return r;
+}
+
+
+
+// Context that re-dispatches a request through the cache when completed.
+// Stores the cache pointer and a copy of the request reference; the base
+// MDSInternalContext is bound to the cache's MDS (c->mds).
+C_MDS_RetryRequest::C_MDS_RetryRequest(MDCache *c, MDRequestRef& r)
+ : MDSInternalContext(c->mds), cache(c), mdr(r)
+{}
+
+// Count the retry on the request, then hand it back to the cache for
+// dispatch. The completion code r is not used.
+void C_MDS_RetryRequest::finish(int r)
+{
+  ++mdr->retry;
+  cache->dispatch_request(mdr);
+}
+
+
+/**
+ * Completion for an enqueue-scrub internal request. Writes a "results"
+ * section into the formatter and then completes the wrapped finisher,
+ * unless that finisher has been taken away with take_finisher().
+ */
+class C_MDS_EnqueueScrub : public Context
+{
+  std::string tag;
+  Formatter *formatter;
+  Context *on_finish;
+public:
+  ScrubHeaderRef header;
+  C_MDS_EnqueueScrub(std::string_view tag, Formatter *f, Context *fin) :
+    tag(tag), formatter(f), on_finish(fin), header(nullptr) {}
+
+  // Hand the wrapped finisher over to the caller; afterwards finish()
+  // will no longer complete it.
+  Context *take_finisher() {
+    Context *taken = on_finish;
+    on_finish = nullptr;
+    return taken;
+  }
+
+  void finish(int r) override {
+    if (r != 0) {
+      // we failed the lookup or something; dump ourselves
+      formatter->open_object_section("results");
+      formatter->dump_int("return_code", r);
+      formatter->close_section(); // results
+      r = 0; // already dumped in formatter
+    } else if (header && header->get_recursive()) {
+      // since recursive scrub is asynchronous, dump minimal output
+      // to not upset cli tools.
+      formatter->open_object_section("results");
+      formatter->dump_int("return_code", 0);
+      formatter->dump_string("scrub_tag", tag);
+      formatter->dump_string("mode", "asynchronous");
+      formatter->close_section(); // results
+    }
+
+    if (on_finish)
+      on_finish->complete(r);
+  }
+};
+
+/**
+ * Start an internal ENQUEUE_SCRUB request for a path.
+ *
+ * @param path path to scrub; the literal "~mdsdir" selects this rank's
+ *             mdsdir inode
+ * @param tag scrub tag; when empty, a random UUID is generated and the
+ *            scrub header is created with is_internal set
+ * @param force,recursive,repair passed through to the ScrubHeader
+ * @param f formatter that receives the scrub results
+ * @param fin finisher handed to the C_MDS_EnqueueScrub completion
+ */
+void MDCache::enqueue_scrub(
+    std::string_view path,
+    std::string_view tag,
+    bool force, bool recursive, bool repair,
+    Formatter *f, Context *fin)
+{
+  dout(10) << __func__ << " " << path << dendl;
+  MDRequestRef mdr = request_start_internal(CEPH_MDS_OP_ENQUEUE_SCRUB);
+  if (path == "~mdsdir") {
+    filepath fp(MDS_INO_MDSDIR(mds->get_nodeid()));
+    mdr->set_filepath(fp);
+  } else {
+    // pass the filepath built here instead of converting 'path' a second time
+    filepath fp(path);
+    mdr->set_filepath(fp);
+  }
+
+  bool is_internal = false;
+  std::string tag_str(tag);
+  if (tag_str.empty()) {
+    uuid_d uuid_gen;
+    uuid_gen.generate_random();
+    tag_str = uuid_gen.to_string();
+    is_internal = true;
+  }
+
+  // cs is stored in the request below as its internal_op_finish
+  C_MDS_EnqueueScrub *cs = new C_MDS_EnqueueScrub(tag_str, f, fin);
+  cs->header = std::make_shared<ScrubHeader>(
+      tag_str, is_internal, force, recursive, repair, f);
+
+  mdr->internal_op_finish = cs;
+  enqueue_scrub_work(mdr);
+}
+
+/**
+ * Work function of an ENQUEUE_SCRUB internal request: resolve and lock the
+ * target inode, then queue it on the scrub stack.
+ *
+ * Responds -EBUSY if the inode already has a scrub in progress, otherwise
+ * responds 0 right after the inode has been queued. mdr->internal_op_finish
+ * is expected to be the C_MDS_EnqueueScrub installed by enqueue_scrub().
+ */
+void MDCache::enqueue_scrub_work(MDRequestRef& mdr)
+{
+ MutationImpl::LockOpVec lov;
+ CInode *in = mds->server->rdlock_path_pin_ref(mdr, 0, lov, true);
+ // NOTE(review): a null result presumably means the request was queued for
+ // retry or already answered by rdlock_path_pin_ref -- confirm
+ if (NULL == in)
+ return;
+
+ // TODO: Remove this restriction
+ ceph_assert(in->is_auth());
+
+ bool locked = mds->locker->acquire_locks(mdr, lov);
+ if (!locked)
+ return;
+
+ C_MDS_EnqueueScrub *cs = static_cast<C_MDS_EnqueueScrub*>(mdr->internal_op_finish);
+ ScrubHeaderRef header = cs->header;
+
+ // Cannot scrub same dentry twice at same time
+ if (in->scrub_is_in_progress()) {
+ mds->server->respond_to_request(mdr, -EBUSY);
+ return;
+ } else {
+ // NOTE(review): return value unused; presumably called for a side effect
+ // such as setting up the inode's scrub state -- confirm
+ in->scrub_info();
+ }
+
+ header->set_origin(in);
+
+ Context *fin;
+ if (header->get_recursive()) {
+ // recursive: pin the origin until the scrub finishes; the caller's
+ // finisher is left inside cs and is not taken here
+ header->get_origin()->get(CInode::PIN_SCRUBQUEUE);
+ fin = new MDSInternalContextWrapper(mds,
+ new FunctionContext([this, header](int r) {
+ recursive_scrub_finish(header);
+ header->get_origin()->put(CInode::PIN_SCRUBQUEUE);
+ })
+ );
+ } else {
+ // non-recursive: the caller's finisher is completed when the scrub is done
+ fin = cs->take_finisher();
+ }
+
+ // If the scrub did some repair, then flush the journal at the end of
+ // the scrub. Otherwise in the case of e.g. rewriting a backtrace
+ // the on disk state will still look damaged.
+ auto scrub_finish = new FunctionContext([this, header, fin](int r){
+ if (!header->get_repaired()) {
+ if (fin)
+ fin->complete(r);
+ return;
+ }
+
+ // runs once the journal is safe: trim everything, then complete fin after
+ // all expiring segments have expired
+ auto flush_finish = new FunctionContext([this, fin](int r){
+ dout(4) << "Expiring log segments because scrub did some repairs" << dendl;
+ mds->mdlog->trim_all();
+
+ if (fin) {
+ MDSGatherBuilder gather(g_ceph_context);
+ auto& expiring_segments = mds->mdlog->get_expiring_segments();
+ for (auto logseg : expiring_segments)
+ logseg->wait_for_expiry(gather.new_sub());
+ // NOTE(review): asserts that at least one segment is expiring after
+ // trim_all() -- confirm this always holds
+ ceph_assert(gather.has_subs());
+ gather.set_finisher(new MDSInternalContextWrapper(mds, fin));
+ gather.activate();
+ }
+ });
+
+ dout(4) << "Flushing journal because scrub did some repairs" << dendl;
+ mds->mdlog->start_new_segment();
+ mds->mdlog->flush();
+ mds->mdlog->wait_for_safe(new MDSInternalContextWrapper(mds, flush_finish));
+ });
+
+ // non-recursive scrubs go to the top of the scrub stack, recursive ones to the bottom
+ if (!header->get_recursive()) {
+ mds->scrubstack->enqueue_inode_top(in, header,
+ new MDSInternalContextWrapper(mds, scrub_finish));
+ } else {
+ mds->scrubstack->enqueue_inode_bottom(in, header,
+ new MDSInternalContextWrapper(mds, scrub_finish));
+ }
+
+ // the internal request is answered as soon as the inode is queued
+ mds->server->respond_to_request(mdr, 0);
+ return;
+}
+
+/**
+ * Called when a recursive scrub has finished. If the scrub started at a
+ * base inode with both force and repair set, and this rank is the only
+ * "in" MDS, no MDS is failed, and this rank is the table server, the base
+ * is marked as recursively scrubbed.
+ */
+void MDCache::recursive_scrub_finish(const ScrubHeaderRef& header)
+{
+  if (!header->get_origin()->is_base())
+    return;
+  if (!header->get_force() || !header->get_repair())
+    return;
+
+  // notify snapserver that base directory is recursively scrubbed.
+  // After both root and mdsdir are recursively scrubbed, snapserver
+  // knows that all old format snaprealms are converted to the new
+  // format.
+  const bool single_in_mds = mds->mdsmap->get_num_in_mds() == 1;
+  if (!single_in_mds)
+    return;
+  if (mds->mdsmap->get_num_failed_mds() != 0)
+    return;
+  if (mds->mdsmap->get_tableserver() != mds->get_nodeid())
+    return;
+
+  mds->mark_base_recursively_scrubbed(header->get_origin()->ino());
+}
+
+// Log-completion context for internal requests: applies the request's
+// projected changes and then responds to it with the completion code.
+struct C_MDC_RespondInternalRequest : public MDCacheLogContext {
+  MDRequestRef mdr;
+
+  C_MDC_RespondInternalRequest(MDCache *cache, MDRequestRef& req)
+    : MDCacheLogContext(cache), mdr(req) {}
+
+  void finish(int r) override {
+    mdr->apply();
+    get_mds()->server->respond_to_request(mdr, r);
+  }
+};
+
+// Start an internal REPAIR_FRAGSTATS request for a dirfrag. The dirfrag is
+// pinned by the request and carried in internal_op_private; completion is
+// a no-op context.
+void MDCache::repair_dirfrag_stats(CDir *dir)
+{
+  MDRequestRef req = request_start_internal(CEPH_MDS_OP_REPAIR_FRAGSTATS);
+  req->pin(dir);
+  req->internal_op_finish = new C_MDSInternalNoop;
+  req->internal_op_private = dir;
+  repair_dirfrag_stats_work(req);
+}
+
+/**
+ * Work function of a REPAIR_FRAGSTATS internal request.
+ *
+ * Recomputes the dirfrag's fragstat and rstat from its head dentries and
+ * compares them with the projected fnode. If both match, responds 0;
+ * otherwise projects and journals a corrected fnode and responds once the
+ * log entry has been submitted and completed. Responds -ESTALE when this
+ * rank is not auth for the dirfrag. May return early after queueing a
+ * C_MDS_RetryRequest (frozen dirfrag, incomplete dirfrag) or when locks
+ * could not be acquired.
+ */
+void MDCache::repair_dirfrag_stats_work(MDRequestRef& mdr)
+{
+ CDir *dir = static_cast<CDir*>(mdr->internal_op_private);
+ dout(10) << __func__ << " " << *dir << dendl;
+
+ if (!dir->is_auth()) {
+ mds->server->respond_to_request(mdr, -ESTALE);
+ return;
+ }
+
+ // cannot auth_pin right now: wait for unfreeze and retry, dropping what we hold
+ if (!mdr->is_auth_pinned(dir) && !dir->can_auth_pin()) {
+ dir->add_waiter(CDir::WAIT_UNFREEZE, new C_MDS_RetryRequest(this, mdr));
+
+ mds->locker->drop_locks(mdr.get());
+ mdr->drop_local_auth_pins();
+ if (!mdr->remote_auth_pins.empty())
+ mds->locker->notify_freeze_waiter(dir);
+ return;
+ }
+
+ mdr->auth_pin(dir);
+
+ MutationImpl::LockOpVec lov;
+ CInode *diri = dir->inode;
+ lov.add_rdlock(&diri->dirfragtreelock);
+ lov.add_wrlock(&diri->nestlock);
+ lov.add_wrlock(&diri->filelock);
+ if (!mds->locker->acquire_locks(mdr, lov))
+ return;
+
+ // the dirfrag must be complete before its dentries can be counted
+ if (!dir->is_complete()) {
+ dir->fetch(new C_MDS_RetryRequest(this, mdr));
+ return;
+ }
+
+ // recompute the stats from the dentries whose 'last' is CEPH_NOSNAP
+ frag_info_t frag_info;
+ nest_info_t nest_info;
+ for (auto it = dir->begin(); it != dir->end(); ++it) {
+ CDentry *dn = it->second;
+ if (dn->last != CEPH_NOSNAP)
+ continue;
+ CDentry::linkage_t *dnl = dn->get_projected_linkage();
+ if (dnl->is_primary()) {
+ CInode *in = dnl->get_inode();
+ nest_info.add(in->get_projected_inode()->accounted_rstat);
+ if (in->is_dir())
+ frag_info.nsubdirs++;
+ else
+ frag_info.nfiles++;
+ } else if (dnl->is_remote())
+ // remote links are counted as files
+ frag_info.nfiles++;
+ }
+
+ fnode_t *pf = dir->get_projected_fnode();
+ bool good_fragstat = frag_info.same_sums(pf->fragstat);
+ bool good_rstat = nest_info.same_sums(pf->rstat);
+ if (good_fragstat && good_rstat) {
+ dout(10) << __func__ << " no corruption found" << dendl;
+ mds->server->respond_to_request(mdr, 0);
+ return;
+ }
+
+ // mismatch: project a new fnode and journal the corrected stats
+ pf = dir->project_fnode();
+ pf->version = dir->pre_dirty();
+ mdr->add_projected_fnode(dir);
+
+ mdr->ls = mds->mdlog->get_current_segment();
+ EUpdate *le = new EUpdate(mds->mdlog, "repair_dirfrag");
+ mds->mdlog->start_entry(le);
+
+ if (!good_fragstat) {
+ // keep the newer of the stored and recomputed mtime / change_attr
+ if (pf->fragstat.mtime > frag_info.mtime)
+ frag_info.mtime = pf->fragstat.mtime;
+ if (pf->fragstat.change_attr > frag_info.change_attr)
+ frag_info.change_attr = pf->fragstat.change_attr;
+ pf->fragstat = frag_info;
+ mds->locker->mark_updated_scatterlock(&diri->filelock);
+ mdr->ls->dirty_dirfrag_dir.push_back(&diri->item_dirty_dirfrag_dir);
+ mdr->add_updated_lock(&diri->filelock);
+ }
+
+ if (!good_rstat) {
+ // keep the newer rctime
+ if (pf->rstat.rctime > nest_info.rctime)
+ nest_info.rctime = pf->rstat.rctime;
+ pf->rstat = nest_info;
+ mds->locker->mark_updated_scatterlock(&diri->nestlock);
+ mdr->ls->dirty_dirfrag_nest.push_back(&diri->item_dirty_dirfrag_nest);
+ mdr->add_updated_lock(&diri->nestlock);
+ }
+
+ le->metablob.add_dir_context(dir);
+ le->metablob.add_dir(dir, true);
+
+ // the request is applied and answered by C_MDC_RespondInternalRequest
+ mds->mdlog->submit_entry(le, new C_MDC_RespondInternalRequest(this, mdr));
+}
+
+// Start an internal REPAIR_INODESTATS request for a directory inode. The
+// inode is pinned by the request and carried in internal_op_private;
+// completion is a no-op context.
+void MDCache::repair_inode_stats(CInode *diri)
+{
+  MDRequestRef req = request_start_internal(CEPH_MDS_OP_REPAIR_INODESTATS);
+  req->pin(diri);
+  req->internal_op_finish = new C_MDSInternalNoop;
+  req->internal_op_private = diri;
+  repair_inode_stats_work(req);
+}
+
+/**
+ * Work function of a REPAIR_INODESTATS internal request.
+ *
+ * Runs in two phases. The first phase (skipped when mdr->ls is already set)
+ * takes wrlocks, makes sure every leaf dirfrag is open and fetched, and
+ * marks filelock/nestlock dirty. The second phase takes rdlocks on the same
+ * locks and then checks the sums of the dirfrags' accounted stats against
+ * the inode's dirstat/rstat, logging if they still differ.
+ *
+ * Responds -ESTALE if not auth, -ENOTDIR if the inode is not a directory,
+ * and 0 otherwise (also when the stats could not be fixed).
+ */
+void MDCache::repair_inode_stats_work(MDRequestRef& mdr)
+{
+ CInode *diri = static_cast<CInode*>(mdr->internal_op_private);
+ dout(10) << __func__ << " " << *diri << dendl;
+
+ if (!diri->is_auth()) {
+ mds->server->respond_to_request(mdr, -ESTALE);
+ return;
+ }
+ if (!diri->is_dir()) {
+ mds->server->respond_to_request(mdr, -ENOTDIR);
+ return;
+ }
+
+ MutationImpl::LockOpVec lov;
+
+ if (mdr->ls) // already marked filelock/nestlock dirty ?
+ goto do_rdlocks;
+
+ lov.add_rdlock(&diri->dirfragtreelock);
+ lov.add_wrlock(&diri->nestlock);
+ lov.add_wrlock(&diri->filelock);
+ if (!mds->locker->acquire_locks(mdr, lov))
+ return;
+
+ // Fetch all dirfrags and mark filelock/nestlock dirty. This will trigger
+ // the scatter-gather process, which will fix any fragstat/rstat errors.
+ {
+ frag_vec_t leaves;
+ diri->dirfragtree.get_leaves(leaves);
+ for (const auto& leaf : leaves) {
+ CDir *dir = diri->get_dirfrag(leaf);
+ if (!dir) {
+ ceph_assert(mdr->is_auth_pinned(diri));
+ dir = diri->get_or_open_dirfrag(this, leaf);
+ }
+ // version 0: fetch the dirfrag and retry the request afterwards
+ if (dir->get_version() == 0) {
+ ceph_assert(dir->is_auth());
+ dir->fetch(new C_MDS_RetryRequest(this, mdr));
+ return;
+ }
+ }
+ }
+
+ diri->state_set(CInode::STATE_REPAIRSTATS);
+ // setting mdr->ls also makes a retried request skip straight to do_rdlocks
+ mdr->ls = mds->mdlog->get_current_segment();
+ mds->locker->mark_updated_scatterlock(&diri->filelock);
+ mdr->ls->dirty_dirfrag_dir.push_back(&diri->item_dirty_dirfrag_dir);
+ mds->locker->mark_updated_scatterlock(&diri->nestlock);
+ mdr->ls->dirty_dirfrag_nest.push_back(&diri->item_dirty_dirfrag_nest);
+
+ mds->locker->drop_locks(mdr.get());
+
+do_rdlocks:
+ // force the scatter-gather process
+ lov.clear();
+ lov.add_rdlock(&diri->dirfragtreelock);
+ lov.add_rdlock(&diri->nestlock);
+ lov.add_rdlock(&diri->filelock);
+ if (!mds->locker->acquire_locks(mdr, lov))
+ return;
+
+ diri->state_clear(CInode::STATE_REPAIRSTATS);
+
+ // verification: sum the accounted stats of all leaf dirfrags
+ frag_info_t dir_info;
+ nest_info_t nest_info;
+ nest_info.rsubdirs = 1; // it gets one to account for self
+ if (const sr_t *srnode = diri->get_projected_srnode(); srnode)
+ nest_info.rsnaps = srnode->snaps.size();
+
+ {
+ frag_vec_t leaves;
+ diri->dirfragtree.get_leaves(leaves);
+ for (const auto& leaf : leaves) {
+ CDir *dir = diri->get_dirfrag(leaf);
+ ceph_assert(dir);
+ ceph_assert(dir->get_version() > 0);
+ dir_info.add(dir->fnode.accounted_fragstat);
+ nest_info.add(dir->fnode.accounted_rstat);
+ }
+ }
+
+ // a remaining mismatch is only logged; the request still succeeds
+ if (!dir_info.same_sums(diri->inode.dirstat) ||
+ !nest_info.same_sums(diri->inode.rstat)) {
+ dout(10) << __func__ << " failed to fix fragstat/rstat on "
+ << *diri << dendl;
+ }
+
+ mds->server->respond_to_request(mdr, 0);
+}
+
+// Start an internal UPGRADE_SNAPREALM request for an inode. The inode is
+// pinned by the request and carried in internal_op_private; completion is
+// a no-op context.
+void MDCache::upgrade_inode_snaprealm(CInode *in)
+{
+  MDRequestRef req = request_start_internal(CEPH_MDS_OP_UPGRADE_SNAPREALM);
+  req->pin(in);
+  req->internal_op_finish = new C_MDSInternalNoop;
+  req->internal_op_private = in;
+  upgrade_inode_snaprealm_work(req);
+}
+
+/**
+ * Work function of an UPGRADE_SNAPREALM internal request.
+ *
+ * Takes an xlock on the inode's snaplock (plus the snap rdlocks), projects
+ * the inode and journals it in an "upgrade_snaprealm" EUpdate. Responds
+ * -ESTALE if this rank is not auth for the inode; otherwise the response
+ * is sent by C_MDC_RespondInternalRequest when the log entry completes.
+ */
+void MDCache::upgrade_inode_snaprealm_work(MDRequestRef& mdr)
+{
+ CInode *in = static_cast<CInode*>(mdr->internal_op_private);
+ dout(10) << __func__ << " " << *in << dendl;
+
+ if (!in->is_auth()) {
+ mds->server->respond_to_request(mdr, -ESTALE);
+ return;
+ }
+
+ MutationImpl::LockOpVec lov;
+ mds->locker->include_snap_rdlocks(in, lov);
+ // the inode's own snaplock is taken as an xlock instead of an rdlock
+ lov.erase_rdlock(&in->snaplock);
+ lov.add_xlock(&in->snaplock);
+
+ if (!mds->locker->acquire_locks(mdr, lov))
+ return;
+
+ // project_snaprealm() upgrades snaprealm format
+ auto &pi = in->project_inode(false, true);
+ mdr->add_projected_inode(in);
+ pi.inode.version = in->pre_dirty();
+
+ mdr->ls = mds->mdlog->get_current_segment();
+ EUpdate *le = new EUpdate(mds->mdlog, "upgrade_snaprealm");
+ mds->mdlog->start_entry(le);
+
+ // base inodes are journaled as roots, others through their parent dentry
+ if (in->is_base()) {
+ le->metablob.add_root(true, in);
+ } else {
+ CDentry *pdn = in->get_projected_parent_dn();
+ le->metablob.add_dir_context(pdn->get_dir());
+ le->metablob.add_primary_dentry(pdn, in, true);
+ }
+
+ mds->mdlog->submit_entry(le, new C_MDC_RespondInternalRequest(this, mdr));
+}
+
+/**
+ * Start an internal FLUSH request for the given path.
+ *
+ * @param path path to flush
+ * @param fin completed with -EROFS right away when the cache is read-only;
+ *            otherwise stored as the request's internal_op_finish
+ */
+void MDCache::flush_dentry(std::string_view path, Context *fin)
+{
+  if (is_readonly()) {
+    dout(10) << __func__ << ": read-only FS" << dendl;
+    fin->complete(-EROFS);
+    return;
+  }
+
+  dout(10) << "flush_dentry " << path << dendl;
+
+  MDRequestRef req = request_start_internal(CEPH_MDS_OP_FLUSH);
+  const filepath target(path);
+  req->set_filepath(target);
+  req->internal_op_finish = fin;
+  flush_dentry_work(req);
+}
+
+// I/O completion that responds to an internal request with the result code
+// of the I/O.
+class C_FinishIOMDR : public MDSContext {
+protected:
+  MDSRank *mds;
+  MDRequestRef mdr;
+
+  MDSRank *get_mds() override {
+    return mds;
+  }
+
+public:
+  C_FinishIOMDR(MDSRank *rank, MDRequestRef& req)
+    : mds(rank), mdr(req) {}
+
+  void finish(int r) override {
+    mds->server->respond_to_request(mdr, r);
+  }
+};
+
+// Work function of a FLUSH internal request: resolve and lock the target
+// inode, then flush it. The request is answered by C_FinishIOMDR with the
+// result of the flush.
+void MDCache::flush_dentry_work(MDRequestRef& mdr)
+{
+  MutationImpl::LockOpVec lov;
+  CInode *target = mds->server->rdlock_path_pin_ref(mdr, 0, lov, true);
+  if (!target)
+    return;
+
+  // TODO: Is this necessary? Fix it if so
+  ceph_assert(target->is_auth());
+
+  if (!mds->locker->acquire_locks(mdr, lov))
+    return;
+
+  target->flush(new C_FinishIOMDR(mds, mdr));
+}
+
+
+/**
+ * Initialize performance counters with global perfcounter
+ * collection.
+ *
+ * Builds the "mds_cache" counter set, stores it in 'logger', adds it to the
+ * global perfcounters collection, and hands the same logger to
+ * recovery_queue and stray_manager.
+ */
+void MDCache::register_perfcounters()
+{
+ PerfCountersBuilder pcb(g_ceph_context, "mds_cache", l_mdc_first, l_mdc_last);
+
+ // Stray/purge statistics
+ pcb.add_u64(l_mdc_num_strays, "num_strays", "Stray dentries", "stry",
+ PerfCountersBuilder::PRIO_INTERESTING);
+ pcb.add_u64(l_mdc_num_recovering_enqueued,
+ "num_recovering_enqueued", "Files waiting for recovery", "recy",
+ PerfCountersBuilder::PRIO_INTERESTING);
+ pcb.add_u64_counter(l_mdc_recovery_completed,
+ "recovery_completed", "File recoveries completed", "recd",
+ PerfCountersBuilder::PRIO_INTERESTING);
+
+ // useful recovery queue statistics
+ // NOTE(review): presumably every counter added below without an explicit
+ // priority gets PRIO_USEFUL from this default -- confirm
+ pcb.set_prio_default(PerfCountersBuilder::PRIO_USEFUL);
+ pcb.add_u64(l_mdc_num_recovering_processing, "num_recovering_processing",
+ "Files currently being recovered");
+ pcb.add_u64(l_mdc_num_recovering_prioritized, "num_recovering_prioritized",
+ "Files waiting for recovery with elevated priority");
+ pcb.add_u64_counter(l_mdc_recovery_started, "recovery_started",
+ "File recoveries started");
+
+ // along with other stray dentries stats
+ pcb.add_u64(l_mdc_num_strays_delayed, "num_strays_delayed",
+ "Stray dentries delayed");
+ pcb.add_u64(l_mdc_num_strays_enqueuing, "num_strays_enqueuing",
+ "Stray dentries enqueuing for purge");
+ pcb.add_u64_counter(l_mdc_strays_created, "strays_created",
+ "Stray dentries created");
+ pcb.add_u64_counter(l_mdc_strays_enqueued, "strays_enqueued",
+ "Stray dentries enqueued for purge");
+ pcb.add_u64_counter(l_mdc_strays_reintegrated, "strays_reintegrated",
+ "Stray dentries reintegrated");
+ pcb.add_u64_counter(l_mdc_strays_migrated, "strays_migrated",
+ "Stray dentries migrated");
+
+ // low prio internal request stats
+ pcb.add_u64_counter(l_mdss_ireq_enqueue_scrub, "ireq_enqueue_scrub",
+ "Internal Request type enqueue scrub");
+ pcb.add_u64_counter(l_mdss_ireq_exportdir, "ireq_exportdir",
+ "Internal Request type export dir");
+ pcb.add_u64_counter(l_mdss_ireq_flush, "ireq_flush",
+ "Internal Request type flush");
+ pcb.add_u64_counter(l_mdss_ireq_fragmentdir, "ireq_fragmentdir",
+ "Internal Request type fragmentdir");
+ pcb.add_u64_counter(l_mdss_ireq_fragstats, "ireq_fragstats",
+ "Internal Request type frag stats");
+ pcb.add_u64_counter(l_mdss_ireq_inodestats, "ireq_inodestats",
+ "Internal Request type inode stats");
+
+ // create the counters, register them globally and share them with the helpers
+ logger.reset(pcb.create_perf_counters());
+ g_ceph_context->get_perfcounters_collection()->add(logger.get());
+ recovery_queue.set_logger(logger.get());
+ stray_manager.set_logger(logger.get());
+}
+
+/**
+ * Call this when putting references to an inode/dentry or
+ * when attempting to trim it.
+ *
+ * If this inode is no longer linked by anyone, and this MDS
+ * rank holds the primary dentry, and that dentry is in a stray
+ * directory, then give up the dentry to the StrayManager, never
+ * to be seen again by MDCache.
+ *
+ * Nothing happens for inodes that still have links, for base inodes,
+ * while the cache is read-only, or while the MDS state is at or below
+ * STATE_REJOIN.
+ *
+ * @param delay if true, then purgeable inodes are stashed til
+ *              the next trim(), rather than being purged right
+ *              away.
+ */
+void MDCache::maybe_eval_stray(CInode *in, bool delay) {
+  if (in->inode.nlink > 0)
+    return;
+  if (in->is_base() || is_readonly())
+    return;
+  if (mds->get_state() <= MDSMap::STATE_REJOIN)
+    return;
+
+  CDentry *parent_dn = in->get_projected_parent_dn();
+
+  /* We have already entered the purging process, no need
+   * to re-evaluate me ! */
+  if (parent_dn->state_test(CDentry::STATE_PURGING))
+    return;
+
+  // only dentries living in a stray directory are handed over
+  if (!parent_dn->get_dir()->get_inode()->is_stray())
+    return;
+
+  if (delay) {
+    stray_manager.queue_delayed(parent_dn);
+  } else {
+    stray_manager.eval_stray(parent_dn);
+  }
+}
+
+/**
+ * Clean up a directory inode that lives in a stray directory: dentries are
+ * removed from its auth dirfrags that are neither frozen nor freezing, and,
+ * if the inode has no snaprealm, its dirty rstat (auth only) and scatter
+ * dirty state are cleared.
+ */
+void MDCache::clear_dirty_bits_for_stray(CInode* diri) {
+  dout(10) << __func__ << " " << *diri << dendl;
+  ceph_assert(diri->get_projected_parent_dir()->inode->is_stray());
+
+  list<CDir*> frags;
+  diri->get_dirfrags(frags);
+  for (auto &frag : frags) {
+    if (!frag->is_auth())
+      continue;
+    if (frag->is_frozen() || frag->is_freezing())
+      continue;
+    frag->try_remove_dentries_for_stray();
+  }
+
+  // inodes with a snaprealm keep their dirty state
+  if (diri->snaprealm)
+    return;
+
+  if (diri->is_auth())
+    diri->clear_dirty_rstat();
+  diri->clear_scatter_dirty();
+}
+
+/**
+ * Dump a single cached inode, including its path, into the formatter.
+ *
+ * @param f formatter that receives the "inode" section
+ * @param number inode number to look up
+ * @return false if the inode is not in the cache (nothing is written),
+ *         true otherwise
+ */
+bool MDCache::dump_inode(Formatter *f, uint64_t number) {
+  CInode *found = get_inode(number);
+  if (found) {
+    f->open_object_section("inode");
+    found->dump(f, CInode::DUMP_DEFAULT | CInode::DUMP_PATH);
+    f->close_section();
+    return true;
+  }
+  return false;
+}
+
+/**
+ * Re-examine the delayed export pins against a new MDSMap. An inode stays
+ * queued while its export pin is at or above max_mds; otherwise its
+ * STATE_DELAYEDEXPORTPIN flag is cleared, it is removed from the queue and
+ * maybe_export_pin() is called on it.
+ */
+void MDCache::handle_mdsmap(const MDSMap &mdsmap) {
+  // process export_pin_delayed_queue whenever a new MDSMap received
+  auto &delayed = export_pin_delayed_queue;
+  auto it = delayed.begin();
+  while (it != delayed.end()) {
+    auto *in = *it;
+    mds_rank_t export_pin = in->get_export_pin(false);
+    dout(10) << " delayed export_pin=" << export_pin << " on " << *in
+             << " max_mds=" << mdsmap.get_max_mds() << dendl;
+
+    if (export_pin < mdsmap.get_max_mds()) {
+      in->state_clear(CInode::STATE_DELAYEDEXPORTPIN);
+      it = delayed.erase(it);
+      in->maybe_export_pin();
+    } else {
+      // target rank still out of range; keep waiting
+      ++it;
+    }
+  }
+}
+
diff --git a/src/mds/MDCache.h b/src/mds/MDCache.h
new file mode 100644
index 00000000..ab5adb68
--- /dev/null
+++ b/src/mds/MDCache.h
@@ -0,0 +1,1363 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+
+
+#ifndef CEPH_MDCACHE_H
+#define CEPH_MDCACHE_H
+
+#include <atomic>
+#include <string_view>
+#include <thread>
+
+#include "common/DecayCounter.h"
+#include "include/types.h"
+#include "include/filepath.h"
+#include "include/elist.h"
+
+#include "messages/MCacheExpire.h"
+#include "messages/MClientQuota.h"
+#include "messages/MClientRequest.h"
+#include "messages/MClientSnap.h"
+#include "messages/MDentryLink.h"
+#include "messages/MDentryUnlink.h"
+#include "messages/MDirUpdate.h"
+#include "messages/MDiscover.h"
+#include "messages/MDiscoverReply.h"
+#include "messages/MGatherCaps.h"
+#include "messages/MGenericMessage.h"
+#include "messages/MInodeFileCaps.h"
+#include "messages/MLock.h"
+#include "messages/MMDSCacheRejoin.h"
+#include "messages/MMDSFindIno.h"
+#include "messages/MMDSFindInoReply.h"
+#include "messages/MMDSFragmentNotify.h"
+#include "messages/MMDSFragmentNotifyAck.h"
+#include "messages/MMDSOpenIno.h"
+#include "messages/MMDSOpenInoReply.h"
+#include "messages/MMDSResolve.h"
+#include "messages/MMDSResolveAck.h"
+#include "messages/MMDSSlaveRequest.h"
+#include "messages/MMDSSnapUpdate.h"
+
+
+#include "osdc/Filer.h"
+#include "CInode.h"
+#include "CDentry.h"
+#include "CDir.h"
+#include "include/Context.h"
+#include "events/EMetaBlob.h"
+#include "RecoveryQueue.h"
+#include "StrayManager.h"
+#include "OpenFileTable.h"
+#include "MDSContext.h"
+#include "MDSMap.h"
+#include "Mutation.h"
+
+
+class PerfCounters;
+
+class MDSRank;
+class Session;
+class Migrator;
+
+class Session;
+
+class ESubtreeMap;
+
+// Counter identifiers for MDCache. The usable range is delimited by
+// l_mdc_first and l_mdc_last.
+// NOTE(review): presumably these are registered in
+// MDCache::register_perfcounters() -- confirm.
+enum {
+ l_mdc_first = 3000,
+ // How many inodes currently in stray dentries
+ l_mdc_num_strays,
+ // How many stray dentries are currently delayed for purge due to refs
+ l_mdc_num_strays_delayed,
+ // How many stray dentries are currently being enqueued for purge
+ l_mdc_num_strays_enqueuing,
+
+ // How many dentries have ever been added to stray dir
+ l_mdc_strays_created,
+ // How many dentries have been passed on to PurgeQueue
+ l_mdc_strays_enqueued,
+ // How many strays have been reintegrated?
+ l_mdc_strays_reintegrated,
+ // How many strays have been migrated?
+ l_mdc_strays_migrated,
+
+ // How many inode sizes currently being recovered
+ l_mdc_num_recovering_processing,
+ // How many inodes currently waiting to have size recovered
+ l_mdc_num_recovering_enqueued,
+ // How many inodes waiting with elevated priority for recovery
+ l_mdc_num_recovering_prioritized,
+ // How many inodes ever started size recovery
+ l_mdc_recovery_started,
+ // How many inodes ever completed size recovery
+ l_mdc_recovery_completed,
+
+ // NOTE(review): the l_mdss_ireq_* entries look like per-operation
+ // internal-request counters -- confirm where they are incremented.
+ l_mdss_ireq_enqueue_scrub,
+ l_mdss_ireq_exportdir,
+ l_mdss_ireq_flush,
+ l_mdss_ireq_fragmentdir,
+ l_mdss_ireq_fragstats,
+ l_mdss_ireq_inodestats,
+
+ l_mdc_last,
+};
+
+
+// flags for predirty_journal_parents()
+static const int PREDIRTY_PRIMARY = 1; // primary dn, adjust nested accounting
+static const int PREDIRTY_DIR = 2; // update parent dir mtime/size
+static const int PREDIRTY_SHALLOW = 4; // only go to immediate parent (for easier rollback)
+
+class MDCache {
+ public:
+ using clock = ceph::coarse_mono_clock;
+ using time = ceph::coarse_mono_time;
+
+ typedef std::map<mds_rank_t, MCacheExpire::ref> expiremap;
+
+ // my master
+ MDSRank *mds;
+
+ // -- my cache --
+ LRU lru; // dentry lru for expiring items from cache
+ LRU bottom_lru; // dentries that should be trimmed ASAP
+ protected:
+ ceph::unordered_map<inodeno_t,CInode*> inode_map; // map of head inodes by ino
+ map<vinodeno_t, CInode*> snap_inode_map; // map of snap inodes by vinodeno_t (ino + snapid)
+ CInode *root = nullptr; // root inode
+ CInode *myin = nullptr; // .ceph/mds%d dir
+
+ bool readonly = false;
+ void set_readonly() { readonly = true; }
+
+ std::array<CInode *, NUM_STRAY> strays{}; // my stray dir
+ int stray_index = 0;
+
+ // Return the stray directory inode currently selected by stray_index.
+ CInode *get_stray() {
+ return strays[stray_index];
+ }
+
+ set<CInode*> base_inodes;
+
+ std::unique_ptr<PerfCounters> logger;
+
+ Filer filer;
+
+ bool exceeded_size_limit = false;
+
+private:
+ uint64_t cache_inode_limit;
+ uint64_t cache_memory_limit;
+ double cache_reservation;
+ double cache_health_threshold;
+ bool forward_all_requests_to_auth;
+
+public:
+ uint64_t cache_limit_inodes(void) {
+ return cache_inode_limit;
+ }
+ bool forward_all_reqs_to_auth() const {
+ return forward_all_requests_to_auth;
+ }
+ uint64_t cache_limit_memory(void) {
+ return cache_memory_limit;
+ }
+ /**
+  * Ratio by which the cache exceeds its reserved limits.
+  *
+  * Each limit is scaled by (1.0 - cache_reservation). The result is the
+  * larger of the memory and inode overshoot relative to that scaled
+  * limit, clamped to be non-negative. The inode term is treated as 0.0
+  * when cache_inode_limit is 0.
+  */
+ double cache_toofull_ratio(void) const {
+   double memory_reserve = cache_memory_limit*(1.0-cache_reservation);
+   double memory_over = (cache_size()-memory_reserve)/memory_reserve;
+
+   double inode_over = 0.0;
+   if (cache_inode_limit != 0) {
+     double inode_reserve = cache_inode_limit*(1.0-cache_reservation);
+     inode_over = (CInode::count()-inode_reserve)/inode_reserve;
+   }
+
+   return fmax(0.0, fmax(memory_over, inode_over));
+ }
+ bool cache_toofull(void) const {
+ return cache_toofull_ratio() > 0.0;
+ }
+ uint64_t cache_size(void) const {
+ return mempool::get_pool(mempool::mds_co::id).allocated_bytes();
+ }
+ /**
+  * True when either the inode count (only if an inode limit is set) or
+  * the cache size exceeds its limit scaled by cache_health_threshold.
+  */
+ bool cache_overfull(void) const {
+   if (cache_inode_limit > 0 &&
+       CInode::count() > cache_inode_limit*cache_health_threshold) {
+     return true;
+   }
+   return cache_size() > cache_memory_limit*cache_health_threshold;
+ }
+
+ // Move stray_index to the next stray dir, wrapping around at NUM_STRAY.
+ void advance_stray() {
+ stray_index = (stray_index+1)%NUM_STRAY;
+ }
+
+ /**
+  * Hand a dentry that is ready for stray processing over to
+  * StrayManager (i.e. this is a stray you've just created).
+  *
+  * The dentry must live in a stray directory (asserted). Dentries that
+  * are already in the PURGING state are left untouched.
+  */
+ void notify_stray(CDentry *dn) {
+   ceph_assert(dn->get_dir()->get_inode()->is_stray());
+   if (!dn->state_test(CDentry::STATE_PURGING)) {
+     stray_manager.eval_stray(dn);
+   }
+ }
+
+ void maybe_eval_stray(CInode *in, bool delay=false);
+ void clear_dirty_bits_for_stray(CInode* diri);
+
+ bool is_readonly() { return readonly; }
+ void force_readonly();
+
+ DecayRate decayrate;
+
+ int num_shadow_inodes = 0;
+
+ int num_inodes_with_caps = 0;
+
+ unsigned max_dir_commit_size;
+
+ static file_layout_t gen_default_file_layout(const MDSMap &mdsmap);
+ static file_layout_t gen_default_log_layout(const MDSMap &mdsmap);
+
+ file_layout_t default_file_layout;
+ file_layout_t default_log_layout;
+
+ void register_perfcounters();
+
+ // -- client leases --
+public:
+ static constexpr std::size_t client_lease_pools = 3;
+ std::array<float, client_lease_pools> client_lease_durations{5.0, 30.0, 300.0};
+
+protected:
+ std::array<xlist<ClientLease*>, client_lease_pools> client_leases{};
+public:
+ // Append the lease to the back of the list for the given pool and
+ // record its new ttl.
+ // NOTE(review): pool indexes client_leases without a bounds check;
+ // presumably callers always pass a value in [0, client_lease_pools) --
+ // confirm.
+ void touch_client_lease(ClientLease *r, int pool, utime_t ttl) {
+ client_leases[pool].push_back(&r->item_lease);
+ r->ttl = ttl;
+ }
+
+ void notify_stray_removed()
+ {
+ stray_manager.notify_stray_removed();
+ }
+
+ void notify_stray_created()
+ {
+ stray_manager.notify_stray_created();
+ }
+
+ void eval_remote(CDentry *dn)
+ {
+ stray_manager.eval_remote(dn);
+ }
+
+ // -- client caps --
+ uint64_t last_cap_id = 0;
+
+ // -- discover --
+ // State of one outstanding discover. While a base inode is pinned via
+ // pin_base(), the PIN_DISCOVERBASE reference is held and is dropped
+ // again by the destructor.
+ struct discover_info_t {
+   ceph_tid_t tid = 0;
+   mds_rank_t mds = -1;
+   inodeno_t ino;
+   frag_t frag;
+   snapid_t snap = CEPH_NOSNAP;
+   filepath want_path;
+   CInode *basei = nullptr;
+   bool want_base_dir = false;
+   bool want_xlocked = false;
+
+   discover_info_t() {}
+   ~discover_info_t() {
+     // release the pin taken by pin_base(), if any
+     if (basei != nullptr)
+       basei->put(MDSCacheObject::PIN_DISCOVERBASE);
+   }
+   // Remember b as the base inode and take a PIN_DISCOVERBASE reference.
+   void pin_base(CInode *b) {
+     b->get(MDSCacheObject::PIN_DISCOVERBASE);
+     basei = b;
+   }
+ };
+
+ map<ceph_tid_t, discover_info_t> discovers;
+ ceph_tid_t discover_last_tid = 0;
+
+ void _send_discover(discover_info_t& dis);
+ // Allocate the next discover tid, create its entry in `discovers`, and
+ // return the entry with tid and target mds filled in.
+ discover_info_t& _create_discover(mds_rank_t mds) {
+   const ceph_tid_t new_tid = ++discover_last_tid;
+   discover_info_t& info = discovers[new_tid];
+   info.tid = new_tid;
+   info.mds = mds;
+   return info;
+ }
+
+ // waiters
+ map<int, map<inodeno_t, MDSContext::vec > > waiting_for_base_ino;
+
+ void discover_base_ino(inodeno_t want_ino, MDSContext *onfinish, mds_rank_t from=MDS_RANK_NONE);
+ void discover_dir_frag(CInode *base, frag_t approx_fg, MDSContext *onfinish,
+ mds_rank_t from=MDS_RANK_NONE);
+ void discover_path(CInode *base, snapid_t snap, filepath want_path, MDSContext *onfinish,
+ bool want_xlocked=false, mds_rank_t from=MDS_RANK_NONE);
+ void discover_path(CDir *base, snapid_t snap, filepath want_path, MDSContext *onfinish,
+ bool want_xlocked=false);
+ void kick_discovers(mds_rank_t who); // after a failure.
+
+
+ // -- subtrees --
+private:
+ static const unsigned int SUBTREES_COUNT_THRESHOLD = 5;
+ static const unsigned int SUBTREES_DEPTH_THRESHOLD = 5;
+protected:
+ /* subtree keys and each tree's non-recursive nested subtrees (the "bounds") */
+ map<CDir*,set<CDir*> > subtrees;
+ map<CInode*,list<pair<CDir*,CDir*> > > projected_subtree_renames; // renamed ino -> target dir
+
+ // adjust subtree auth specification
+ // dir->dir_auth
+ // imports/exports/nested_exports
+ // join/split subtrees as appropriate
+public:
+ bool is_subtrees() { return !subtrees.empty(); }
+ /**
+  * Append every subtree root to the container c (any type providing
+  * push_back()). For a std::vector<CDir*>, capacity is reserved up front.
+  */
+ template<typename T>
+ void get_subtrees(T& c) {
+   constexpr bool is_dir_vector = std::is_same_v<T, std::vector<CDir*>>;
+   if constexpr (is_dir_vector) {
+     c.reserve(c.size() + subtrees.size());
+   }
+   for (const auto& entry : subtrees) {
+     c.push_back(entry.first);
+   }
+ }
+ void adjust_subtree_auth(CDir *root, mds_authority_t auth, bool adjust_pop=true);
+ void adjust_subtree_auth(CDir *root, mds_rank_t a, mds_rank_t b=CDIR_AUTH_UNKNOWN) {
+ adjust_subtree_auth(root, mds_authority_t(a,b));
+ }
+ void adjust_bounded_subtree_auth(CDir *dir, const set<CDir*>& bounds, mds_authority_t auth);
+ void adjust_bounded_subtree_auth(CDir *dir, const set<CDir*>& bounds, mds_rank_t a) {
+ adjust_bounded_subtree_auth(dir, bounds, mds_authority_t(a, CDIR_AUTH_UNKNOWN));
+ }
+ void adjust_bounded_subtree_auth(CDir *dir, const vector<dirfrag_t>& bounds, const mds_authority_t &auth);
+ void adjust_bounded_subtree_auth(CDir *dir, const vector<dirfrag_t>& bounds, mds_rank_t a) {
+ adjust_bounded_subtree_auth(dir, bounds, mds_authority_t(a, CDIR_AUTH_UNKNOWN));
+ }
+ void map_dirfrag_set(const list<dirfrag_t>& dfs, set<CDir*>& result);
+ void try_subtree_merge(CDir *root);
+ void try_subtree_merge_at(CDir *root, set<CInode*> *to_eval, bool adjust_pop=true);
+ void subtree_merge_writebehind_finish(CInode *in, MutationRef& mut);
+ void eval_subtree_root(CInode *diri);
+ CDir *get_subtree_root(CDir *dir);
+ CDir *get_projected_subtree_root(CDir *dir);
+ // True if dir, which must be a subtree root (asserted), has no bounds,
+ // i.e. no nested subtrees.
+ bool is_leaf_subtree(CDir *dir) {
+   ceph_assert(subtrees.count(dir));
+   // The assert above guarantees the key exists.
+   auto entry = subtrees.find(dir);
+   return entry->second.empty();
+ }
+ void remove_subtree(CDir *dir);
+ bool is_subtree(CDir *root) {
+ return subtrees.count(root);
+ }
+ void get_subtree_bounds(CDir *root, set<CDir*>& bounds);
+ void get_wouldbe_subtree_bounds(CDir *root, set<CDir*>& bounds);
+ void verify_subtree_bounds(CDir *root, const set<CDir*>& bounds);
+ void verify_subtree_bounds(CDir *root, const list<dirfrag_t>& bounds);
+
+ void project_subtree_rename(CInode *diri, CDir *olddir, CDir *newdir);
+ void adjust_subtree_after_rename(CInode *diri, CDir *olddir, bool pop);
+
+ // Collect the subtree roots for which is_auth() holds.
+ auto get_auth_subtrees() {
+   std::vector<CDir*> auth_roots;
+   for (auto& entry : subtrees) {
+     CDir *dir = entry.first;
+     if (!dir->is_auth())
+       continue;
+     auth_roots.push_back(dir);
+   }
+   return auth_roots;
+ }
+
+ // Collect the subtree roots for which is_full_dir_auth() holds.
+ auto get_fullauth_subtrees() {
+   std::vector<CDir*> fullauth_roots;
+   for (auto& entry : subtrees) {
+     CDir *dir = entry.first;
+     if (!dir->is_full_dir_auth())
+       continue;
+     fullauth_roots.push_back(dir);
+   }
+   return fullauth_roots;
+ }
+ // Count the subtree roots for which is_full_dir_auth() holds.
+ auto num_subtrees_fullauth() const {
+   std::size_t total = 0;
+   for (auto& entry : subtrees) {
+     if (entry.first->is_full_dir_auth())
+       total += 1;
+   }
+   return total;
+ }
+
+ // Count the subtree roots for which is_full_dir_nonauth() holds.
+ auto num_subtrees_fullnonauth() const {
+   std::size_t total = 0;
+   for (auto& entry : subtrees) {
+     if (entry.first->is_full_dir_nonauth())
+       total += 1;
+   }
+   return total;
+ }
+
+ auto num_subtrees() const {
+ return subtrees.size();
+ }
+
+
+protected:
+ // -- requests --
+ ceph::unordered_map<metareqid_t, MDRequestRef> active_requests;
+
+public:
+ int get_num_client_requests();
+
+ MDRequestRef request_start(const MClientRequest::const_ref& req);
+ MDRequestRef request_start_slave(metareqid_t rid, __u32 attempt, const Message::const_ref &m);
+ MDRequestRef request_start_internal(int op);
+ bool have_request(metareqid_t rid) {
+ return active_requests.count(rid);
+ }
+ MDRequestRef request_get(metareqid_t rid);
+ void request_pin_ref(MDRequestRef& r, CInode *ref, vector<CDentry*>& trace);
+ void request_finish(MDRequestRef& mdr);
+ void request_forward(MDRequestRef& mdr, mds_rank_t mds, int port=0);
+ void dispatch_request(MDRequestRef& mdr);
+ void request_drop_foreign_locks(MDRequestRef& mdr);
+ void request_drop_non_rdlocks(MDRequestRef& r);
+ void request_drop_locks(MDRequestRef& r);
+ void request_cleanup(MDRequestRef& r);
+
+ void request_kill(MDRequestRef& r); // called when session closes
+
+ // journal/snap helpers
+ CInode *pick_inode_snap(CInode *in, snapid_t follows);
+ CInode *cow_inode(CInode *in, snapid_t last);
+ void journal_cow_dentry(MutationImpl *mut, EMetaBlob *metablob, CDentry *dn,
+ snapid_t follows=CEPH_NOSNAP,
+ CInode **pcow_inode=0, CDentry::linkage_t *dnl=0);
+ void journal_cow_inode(MutationRef& mut, EMetaBlob *metablob, CInode *in, snapid_t follows=CEPH_NOSNAP,
+ CInode **pcow_inode=0);
+ void journal_dirty_inode(MutationImpl *mut, EMetaBlob *metablob, CInode *in, snapid_t follows=CEPH_NOSNAP);
+
+ void project_rstat_inode_to_frag(CInode *cur, CDir *parent, snapid_t first,
+ int linkunlink, SnapRealm *prealm);
+ void _project_rstat_inode_to_frag(CInode::mempool_inode & inode, snapid_t ofirst, snapid_t last,
+ CDir *parent, int linkunlink, bool update_inode);
+ void project_rstat_frag_to_inode(nest_info_t& rstat, nest_info_t& accounted_rstat,
+ snapid_t ofirst, snapid_t last,
+ CInode *pin, bool cow_head);
+ void broadcast_quota_to_client(CInode *in, client_t exclude_ct = -1, bool quota_change = false);
+ void predirty_journal_parents(MutationRef mut, EMetaBlob *blob,
+ CInode *in, CDir *parent,
+ int flags, int linkunlink=0,
+ snapid_t follows=CEPH_NOSNAP);
+
+ // slaves
+ // Record (creating the entry if needed) the log segment, slave set and
+ // safe flag of an uncommitted master request.
+ void add_uncommitted_master(metareqid_t reqid, LogSegment *ls, set<mds_rank_t> &slaves, bool safe=false) {
+   auto& entry = uncommitted_masters[reqid];
+   entry.ls = ls;
+   entry.slaves = slaves;
+   entry.safe = safe;
+ }
+ void wait_for_uncommitted_master(metareqid_t reqid, MDSContext *c) {
+ uncommitted_masters[reqid].waiters.push_back(c);
+ }
+ // True if reqid is a tracked uncommitted master request whose slave
+ // set contains `from`.
+ bool have_uncommitted_master(metareqid_t reqid, mds_rank_t from) {
+   auto entry = uncommitted_masters.find(reqid);
+   if (entry == uncommitted_masters.end())
+     return false;
+   return entry->second.slaves.count(from) > 0;
+ }
+ void log_master_commit(metareqid_t reqid);
+ void logged_master_update(metareqid_t reqid);
+ void _logged_master_commit(metareqid_t reqid);
+ void committed_master_slave(metareqid_t r, mds_rank_t from);
+ void finish_committed_masters();
+
+ void add_uncommitted_slave(metareqid_t reqid, LogSegment*, mds_rank_t, MDSlaveUpdate *su=nullptr);
+ void wait_for_uncommitted_slave(metareqid_t reqid, MDSContext *c) {
+ uncommitted_slaves.at(reqid).waiters.push_back(c);
+ }
+ void finish_uncommitted_slave(metareqid_t reqid, bool assert_exist=true);
+ MDSlaveUpdate* get_uncommitted_slave(metareqid_t reqid, mds_rank_t master);
+ void _logged_slave_commit(mds_rank_t from, metareqid_t reqid);
+
+ // -- recovery --
+protected:
+ set<mds_rank_t> recovery_set;
+
+public:
+ void set_recovery_set(set<mds_rank_t>& s);
+ void handle_mds_failure(mds_rank_t who);
+ void handle_mds_recovery(mds_rank_t who);
+
+protected:
+ // [resolve]
+ // from EImportStart w/o EImportFinish during journal replay
+ map<dirfrag_t, vector<dirfrag_t> > my_ambiguous_imports;
+ // from MMDSResolves
+ map<mds_rank_t, map<dirfrag_t, vector<dirfrag_t> > > other_ambiguous_imports;
+
+ map<CInode*, int> uncommitted_slave_rename_olddir; // slave: preserve the non-auth dir until seeing commit.
+ map<CInode*, int> uncommitted_slave_unlink; // slave: preserve the unlinked inode until seeing commit.
+
+ // track master requests whose slaves haven't acknowledged commit
+ // Per-request bookkeeping for a master request whose slaves have not
+ // all acknowledged the commit yet.
+ struct umaster {
+   set<mds_rank_t> slaves;
+   LogSegment *ls = nullptr;
+   MDSContext::vec waiters;
+   bool safe = false;
+   bool committing = false;
+   bool recovering = false;
+   umaster() {}
+ };
+ map<metareqid_t, umaster> uncommitted_masters; // master: req -> slave set
+
+ // Per-request bookkeeping for a slave request that is preserved until
+ // its commit is seen.
+ struct uslave {
+   uslave() {}
+   // Rank of the master for this request. Initialized explicitly so that
+   // a default-constructed entry (e.g. one created through
+   // map::operator[]) never holds an indeterminate value.
+   mds_rank_t master = MDS_RANK_NONE;
+   LogSegment *ls = nullptr;
+   MDSlaveUpdate *su = nullptr;
+   MDSContext::vec waiters;
+ };
+ map<metareqid_t, uslave> uncommitted_slaves; // slave: preserve the slave req until seeing commit.
+
+ set<metareqid_t> pending_masters;
+ map<int, set<metareqid_t> > ambiguous_slave_updates;
+
+ friend class ESlaveUpdate;
+ friend class ECommitted;
+
+ bool resolves_pending = false;
+ set<mds_rank_t> resolve_gather; // nodes i need resolves from
+ set<mds_rank_t> resolve_ack_gather; // nodes i need a resolve_ack from
+ set<version_t> resolve_snapclient_commits;
+ map<metareqid_t, mds_rank_t> resolve_need_rollback; // rollbacks i'm writing to the journal
+ map<mds_rank_t, MMDSResolve::const_ref> delayed_resolve;
+
+ void handle_resolve(const MMDSResolve::const_ref &m);
+ void handle_resolve_ack(const MMDSResolveAck::const_ref &m);
+ void process_delayed_resolve();
+ void discard_delayed_resolve(mds_rank_t who);
+ void maybe_resolve_finish();
+ void disambiguate_my_imports();
+ void disambiguate_other_imports();
+ void trim_unlinked_inodes();
+
+ void send_slave_resolves();
+ void send_subtree_resolves();
+ void maybe_finish_slave_resolve();
+
+public:
+ void recalc_auth_bits(bool replay);
+ void remove_inode_recursive(CInode *in);
+
+ // True if reqid is recorded as an ambiguous slave update for `master`.
+ bool is_ambiguous_slave_update(metareqid_t reqid, mds_rank_t master) {
+   auto per_master = ambiguous_slave_updates.find(master);
+   if (per_master == ambiguous_slave_updates.end())
+     return false;
+   return per_master->second.count(reqid) > 0;
+ }
+ void add_ambiguous_slave_update(metareqid_t reqid, mds_rank_t master) {
+ ambiguous_slave_updates[master].insert(reqid);
+ }
+ /**
+  * Forget an ambiguous slave update previously recorded for `master`.
+  *
+  * Both the per-master entry and the request id must exist (asserted).
+  * The per-master entry is dropped once its set becomes empty.
+  */
+ void remove_ambiguous_slave_update(metareqid_t reqid, mds_rank_t master) {
+   auto p = ambiguous_slave_updates.find(master);
+   // Dereferencing end() would be undefined behaviour; fail loudly instead.
+   ceph_assert(p != ambiguous_slave_updates.end());
+   auto q = p->second.find(reqid);
+   ceph_assert(q != p->second.end());
+   p->second.erase(q);
+   if (p->second.empty())
+     ambiguous_slave_updates.erase(p);
+ }
+
+ void add_rollback(metareqid_t reqid, mds_rank_t master) {
+ resolve_need_rollback[reqid] = master;
+ }
+ void finish_rollback(metareqid_t reqid, MDRequestRef& mdr);
+
+ // ambiguous imports
+ void add_ambiguous_import(dirfrag_t base, const vector<dirfrag_t>& bounds);
+ void add_ambiguous_import(CDir *base, const set<CDir*>& bounds);
+ bool have_ambiguous_import(dirfrag_t base) {
+ return my_ambiguous_imports.count(base);
+ }
+ // Copy the bounds recorded for the ambiguous import `base` into
+ // `bounds`. The import must exist (asserted).
+ void get_ambiguous_import_bounds(dirfrag_t base, vector<dirfrag_t>& bounds) {
+   ceph_assert(my_ambiguous_imports.count(base));
+   // The assert above guarantees the key exists.
+   auto entry = my_ambiguous_imports.find(base);
+   bounds = entry->second;
+ }
+ void cancel_ambiguous_import(CDir *);
+ void finish_ambiguous_import(dirfrag_t dirino);
+ void resolve_start(MDSContext *resolve_done_);
+ void send_resolves();
+ void maybe_send_pending_resolves() {
+ if (resolves_pending)
+ send_subtree_resolves();
+ }
+
+ void _move_subtree_map_bound(dirfrag_t df, dirfrag_t oldparent, dirfrag_t newparent,
+ map<dirfrag_t,vector<dirfrag_t> >& subtrees);
+ ESubtreeMap *create_subtree_map();
+
+
+ void clean_open_file_lists();
+ void dump_openfiles(Formatter *f);
+ bool dump_inode(Formatter *f, uint64_t number);
+protected:
+ // [rejoin]
+ bool rejoins_pending = false;
+ set<mds_rank_t> rejoin_gather; // nodes from whom i need a rejoin
+ set<mds_rank_t> rejoin_sent; // nodes i sent a rejoin to
+ set<mds_rank_t> rejoin_ack_sent; // nodes i sent a rejoin ack to
+ set<mds_rank_t> rejoin_ack_gather; // nodes from whom i need a rejoin ack
+ map<mds_rank_t,map<inodeno_t,map<client_t,Capability::Import> > > rejoin_imported_caps;
+ map<inodeno_t,pair<mds_rank_t,map<client_t,Capability::Export> > > rejoin_slave_exports;
+
+ map<client_t,entity_inst_t> rejoin_client_map;
+ map<client_t,client_metadata_t> rejoin_client_metadata_map;
+ map<client_t,pair<Session*,uint64_t> > rejoin_session_map;
+
+ map<inodeno_t,pair<mds_rank_t,map<client_t,cap_reconnect_t> > > cap_exports; // ino -> target, client -> capex
+
+ map<inodeno_t,map<client_t,map<mds_rank_t,cap_reconnect_t> > > cap_imports; // ino -> client -> frommds -> capex
+ set<inodeno_t> cap_imports_missing;
+ map<inodeno_t, MDSContext::vec > cap_reconnect_waiters;
+ int cap_imports_num_opening = 0;
+
+ set<CInode*> rejoin_undef_inodes;
+ set<CInode*> rejoin_potential_updated_scatterlocks;
+ set<CDir*> rejoin_undef_dirfrags;
+ map<mds_rank_t, set<CInode*> > rejoin_unlinked_inodes;
+
+ vector<CInode*> rejoin_recover_q, rejoin_check_q;
+ list<SimpleLock*> rejoin_eval_locks;
+ MDSContext::vec rejoin_waiters;
+
+ void rejoin_walk(CDir *dir, const MMDSCacheRejoin::ref &rejoin);
+ void handle_cache_rejoin(const MMDSCacheRejoin::const_ref &m);
+ void handle_cache_rejoin_weak(const MMDSCacheRejoin::const_ref &m);
+ CInode* rejoin_invent_inode(inodeno_t ino, snapid_t last);
+ CDir* rejoin_invent_dirfrag(dirfrag_t df);
+ void handle_cache_rejoin_strong(const MMDSCacheRejoin::const_ref &m);
+ void rejoin_scour_survivor_replicas(mds_rank_t from, const MMDSCacheRejoin::const_ref &ack,
+ set<vinodeno_t>& acked_inodes,
+ set<SimpleLock *>& gather_locks);
+ void handle_cache_rejoin_ack(const MMDSCacheRejoin::const_ref &m);
+ void rejoin_send_acks();
+ void rejoin_trim_undef_inodes();
+ void maybe_send_pending_rejoins() {
+ if (rejoins_pending)
+ rejoin_send_rejoins();
+ }
+ std::unique_ptr<MDSContext> rejoin_done;
+ std::unique_ptr<MDSContext> resolve_done;
+public:
+ void rejoin_start(MDSContext *rejoin_done_);
+ void rejoin_gather_finish();
+ void rejoin_send_rejoins();
+ // Record a cap export for (ino, client): set the export target and
+ // store a copy of icr, optionally with its path cleared.
+ void rejoin_export_caps(inodeno_t ino, client_t client, const cap_reconnect_t& icr,
+                         int target=-1, bool drop_path=false) {
+   auto& export_entry = cap_exports[ino];
+   export_entry.first = target;
+
+   auto& stored = export_entry.second[client];
+   stored = icr;
+   if (drop_path)
+     stored.path.clear();
+ }
+ // Record a recovered cap import for (ino, client, frommds) by storing
+ // a copy of icr, optionally with its path cleared.
+ void rejoin_recovered_caps(inodeno_t ino, client_t client, const cap_reconnect_t& icr,
+                            mds_rank_t frommds=MDS_RANK_NONE, bool drop_path=false) {
+   auto& stored = cap_imports[ino][client][frommds];
+   stored = icr;
+   if (drop_path)
+     stored.path.clear();
+ }
+ void rejoin_recovered_client(client_t client, const entity_inst_t& inst) {
+ rejoin_client_map.emplace(client, inst);
+ }
+ bool rejoin_has_cap_reconnect(inodeno_t ino) const {
+ return cap_imports.count(ino);
+ }
+ void add_replay_ino_alloc(inodeno_t ino) {
+ cap_imports_missing.insert(ino); // avoid opening ino during cache rejoin
+ }
+ // Look up the cap import recorded for (ino, client) under MDS_RANK_NONE.
+ // Returns a pointer into cap_imports, or null if any level of the
+ // lookup is missing. Nothing is inserted.
+ const cap_reconnect_t *get_replay_cap_reconnect(inodeno_t ino, client_t client) {
+   auto by_ino = cap_imports.find(ino);
+   if (by_ino == cap_imports.end())
+     return nullptr;
+
+   auto by_client = by_ino->second.find(client);
+   if (by_client == by_ino->second.end())
+     return nullptr;
+
+   auto by_rank = by_client->second.find(MDS_RANK_NONE);
+   if (by_rank == by_client->second.end())
+     return nullptr;
+
+   return &by_rank->second;
+ }
+ // Drop the whole cap_imports entry for ino. Asserts that the entry
+ // holds exactly one client and that `client` maps to exactly one
+ // record. Note that the operator[] lookups inside the asserts would
+ // insert empty entries for missing keys, but in that case the size
+ // checks fail.
+ void remove_replay_cap_reconnect(inodeno_t ino, client_t client) {
+ ceph_assert(cap_imports[ino].size() == 1);
+ ceph_assert(cap_imports[ino][client].size() == 1);
+ cap_imports.erase(ino);
+ }
+ void wait_replay_cap_reconnect(inodeno_t ino, MDSContext *c) {
+ cap_reconnect_waiters[ino].push_back(c);
+ }
+
+ // [reconnect/rejoin caps]
+ struct reconnected_cap_info_t {
+ inodeno_t realm_ino;
+ snapid_t snap_follows;
+ int dirty_caps;
+ bool snapflush;
+ reconnected_cap_info_t() :
+ realm_ino(0), snap_follows(0), dirty_caps(0), snapflush(false) {}
+ };
+ map<inodeno_t,map<client_t, reconnected_cap_info_t> > reconnected_caps; // inode -> client -> snap_follows,realmino
+ map<inodeno_t,map<client_t, snapid_t> > reconnected_snaprealms; // realmino -> client -> realmseq
+
+ void add_reconnected_cap(client_t client, inodeno_t ino, const cap_reconnect_t& icr) {
+ reconnected_cap_info_t &info = reconnected_caps[ino][client];
+ info.realm_ino = inodeno_t(icr.capinfo.snaprealm);
+ info.snap_follows = icr.snap_follows;
+ }
+ // Merge dirty cap bits into the reconnected-cap record for
+ // (ino, client), creating it if needed. snapflush is sticky: once set
+ // it is never cleared here.
+ void set_reconnected_dirty_caps(client_t client, inodeno_t ino, int dirty, bool snapflush) {
+   auto& record = reconnected_caps[ino][client];
+   record.dirty_caps |= dirty;
+   record.snapflush = record.snapflush || snapflush;
+ }
+ void add_reconnected_snaprealm(client_t client, inodeno_t ino, snapid_t seq) {
+ reconnected_snaprealms[ino][client] = seq;
+ }
+
+ friend class C_MDC_RejoinOpenInoFinish;
+ friend class C_MDC_RejoinSessionsOpened;
+ void rejoin_open_ino_finish(inodeno_t ino, int ret);
+ void rejoin_prefetch_ino_finish(inodeno_t ino, int ret);
+ void rejoin_open_sessions_finish(map<client_t,pair<Session*,uint64_t> >& session_map);
+ bool process_imported_caps();
+ void choose_lock_states_and_reconnect_caps();
+ void prepare_realm_split(SnapRealm *realm, client_t client, inodeno_t ino,
+ map<client_t,MClientSnap::ref>& splits);
+ void prepare_realm_merge(SnapRealm *realm, SnapRealm *parent_realm, map<client_t,MClientSnap::ref>& splits);
+ void send_snaps(map<client_t,MClientSnap::ref>& splits);
+ Capability* rejoin_import_cap(CInode *in, client_t client, const cap_reconnect_t& icr, mds_rank_t frommds);
+ void finish_snaprealm_reconnect(client_t client, SnapRealm *realm, snapid_t seq,
+ map<client_t,MClientSnap::ref>& updates);
+ Capability* try_reconnect_cap(CInode *in, Session *session);
+ void export_remaining_imported_caps();
+
+ // realm inodes
+ set<CInode*> rejoin_pending_snaprealms;
+ // cap imports. delayed snap parent opens.
+ map<client_t,set<CInode*> > delayed_imported_caps;
+
+ void do_cap_import(Session *session, CInode *in, Capability *cap,
+ uint64_t p_cap_id, ceph_seq_t p_seq, ceph_seq_t p_mseq,
+ int peer, int p_flags);
+ void do_delayed_cap_imports();
+ void rebuild_need_snapflush(CInode *head_in, SnapRealm *realm, client_t client,
+ snapid_t snap_follows);
+ void open_snaprealms();
+
+ bool open_undef_inodes_dirfrags();
+ void opened_undef_inode(CInode *in);
+ void opened_undef_dirfrag(CDir *dir) {
+ rejoin_undef_dirfrags.erase(dir);
+ }
+
+ void reissue_all_caps();
+
+
+ friend class Locker;
+ friend class Migrator;
+ friend class MDBalancer;
+
+ // StrayManager needs to be able to remove_inode() from us
+ // when it is done purging
+ friend class StrayManager;
+
+ // File size recovery
+private:
+ RecoveryQueue recovery_queue;
+ void identify_files_to_recover();
+public:
+ void start_files_to_recover();
+ void do_file_recover();
+ void queue_file_recover(CInode *in);
+ void _queued_file_recover_cow(CInode *in, MutationRef& mut);
+
+ // subsystems
+ std::unique_ptr<Migrator> migrator;
+
+ public:
+ explicit MDCache(MDSRank *m, PurgeQueue &purge_queue_);
+ ~MDCache();
+ void handle_conf_change(const std::set<std::string>& changed, const MDSMap& mds_map);
+
+ // debug
+ void log_stat();
+
+ // root inode
+ CInode *get_root() { return root; }
+ CInode *get_myin() { return myin; }
+
+ size_t get_cache_size() { return lru.lru_get_size(); }
+
+ // trimming
+ std::pair<bool, uint64_t> trim(uint64_t count=0);
+private:
+ std::pair<bool, uint64_t> trim_lru(uint64_t count, expiremap& expiremap);
+ bool trim_dentry(CDentry *dn, expiremap& expiremap);
+ void trim_dirfrag(CDir *dir, CDir *con, expiremap& expiremap);
+ bool trim_inode(CDentry *dn, CInode *in, CDir *con, expiremap&);
+ void send_expire_messages(expiremap& expiremap);
+ void trim_non_auth(); // trim out trimmable non-auth items
+public:
+ bool trim_non_auth_subtree(CDir *directory);
+ void standby_trim_segment(LogSegment *ls);
+ void try_trim_non_auth_subtree(CDir *dir);
+ // A non-auth dirfrag may be trimmed only if it is not one of my
+ // ambiguous imports and its inode is not tracked in
+ // uncommitted_slave_rename_olddir.
+ bool can_trim_non_auth_dirfrag(CDir *dir) {
+   if (my_ambiguous_imports.count((dir)->dirfrag()) != 0)
+     return false;
+   return uncommitted_slave_rename_olddir.count(dir->inode) == 0;
+ }
+
+ /**
+ * For all unreferenced inodes, dirs, dentries below an inode, compose
+ * expiry messages. This is used when giving up all replicas of entities
+ * for an MDS peer in the 'stopping' state, such that the peer can
+ * empty its cache and finish shutting down.
+ *
+ * We have to make sure we're only expiring un-referenced items to
+ * avoid interfering with ongoing stray-movement (we can't distinguish
+ * between the "moving my strays" and "waiting for my cache to empty"
+ * phases within 'stopping')
+ *
+ * @return false if we completed cleanly, true if caller should stop
+ * expiring because we hit something with refs.
+ */
+ bool expire_recursive(CInode *in, expiremap& expiremap);
+
+ void trim_client_leases();
+ void check_memory_usage();
+
+ // shutdown
+private:
+ set<inodeno_t> shutdown_exporting_strays;
+ pair<dirfrag_t, string> shutdown_export_next;
+public:
+ void shutdown_start();
+ void shutdown_check();
+ bool shutdown_pass();
+ bool shutdown(); // clear cache (ie at shutdown)
+ bool shutdown_export_strays();
+ // If ino was tracked in shutdown_exporting_strays, remove it and run
+ // another shutdown_export_strays() pass; otherwise do nothing.
+ void shutdown_export_stray_finish(inodeno_t ino) {
+ if (shutdown_exporting_strays.erase(ino))
+ shutdown_export_strays();
+ }
+
+ bool did_shutdown_log_cap = false;
+
+ // inode_map
+ // True if the inode is cached: head inodes (snapid == CEPH_NOSNAP) are
+ // looked up in inode_map, all others in snap_inode_map.
+ bool have_inode(vinodeno_t vino) {
+   if (vino.snapid == CEPH_NOSNAP)
+     return inode_map.count(vino.ino) > 0;
+   return snap_inode_map.count(vino) > 0;
+ }
+ bool have_inode(inodeno_t ino, snapid_t snap=CEPH_NOSNAP) {
+ return have_inode(vinodeno_t(ino, snap));
+ }
+ // Return the cached inode for vino, or null if it is not in the cache.
+ // Head inodes (snapid == CEPH_NOSNAP) come from inode_map, all others
+ // from snap_inode_map.
+ CInode* get_inode(vinodeno_t vino) {
+   if (vino.snapid == CEPH_NOSNAP) {
+     auto head = inode_map.find(vino.ino);
+     return head != inode_map.end() ? head->second : nullptr;
+   }
+   auto snapped = snap_inode_map.find(vino);
+   return snapped != snap_inode_map.end() ? snapped->second : nullptr;
+ }
+ CInode* get_inode(inodeno_t ino, snapid_t s=CEPH_NOSNAP) {
+ return get_inode(vinodeno_t(ino, s));
+ }
+ // Find the snap inode covering vino: the first snap_inode_map entry not
+ // ordered before vino, provided it has the same ino and its `first`
+ // is <= vino.snapid. Returns null otherwise.
+ CInode* lookup_snap_inode(vinodeno_t vino) {
+   auto pos = snap_inode_map.lower_bound(vino);
+   if (pos != snap_inode_map.end()) {
+     CInode *candidate = pos->second;
+     if (candidate->ino() == vino.ino && candidate->first <= vino.snapid)
+       return candidate;
+   }
+   return NULL;
+ }
+
+ // Return the cached dirfrag df, or null if its inode is not cached
+ // (or the inode has no such dirfrag).
+ CDir* get_dirfrag(dirfrag_t df) {
+   if (CInode *diri = get_inode(df.ino))
+     return diri->get_dirfrag(df.frag);
+   return NULL;
+ }
+ // Return the cached dirfrag of inode `ino` that pick_dirfrag() selects
+ // for the dentry name dn, or null if the inode is not cached.
+ CDir* get_dirfrag(inodeno_t ino, std::string_view dn) {
+   CInode *diri = get_inode(ino);
+   if (diri == nullptr)
+     return NULL;
+   return diri->get_dirfrag(diri->pick_dirfrag(dn));
+ }
+ // Return dirfrag df of a cached inode, first trying
+ // force_dir_fragment() and falling back to a plain lookup if that
+ // yields null. Returns null if the inode itself is not cached.
+ CDir* get_force_dirfrag(dirfrag_t df, bool replay) {
+   CInode *diri = get_inode(df.ino);
+   if (!diri)
+     return NULL;
+   if (CDir *forced = force_dir_fragment(diri, df.frag, replay))
+     return forced;
+   return diri->get_dirfrag(df.frag);
+ }
+
+ MDSCacheObject *get_object(const MDSCacheObjectInfo &info);
+
+
+
+ public:
+ void add_inode(CInode *in);
+
+ void remove_inode(CInode *in);
+ protected:
+ // Touch the dentry of an inode in the LRU, if the inode has a parent.
+ // NOTE(review): the guard checks get_parent_dn() while the dentry that
+ // is touched comes from get_projected_parent_dn(); presumably the
+ // projected parent is non-null whenever the parent is -- confirm.
+ void touch_inode(CInode *in) {
+ if (in->get_parent_dn())
+ touch_dentry(in->get_projected_parent_dn());
+ }
+public:
+ // Refresh a dentry's LRU position. BOTTOMLRU dentries are mid-touched
+ // in bottom_lru; otherwise auth dentries are touched and non-auth
+ // dentries mid-touched in the main lru.
+ void touch_dentry(CDentry *dn) {
+   if (dn->state_test(CDentry::STATE_BOTTOMLRU)) {
+     bottom_lru.lru_midtouch(dn);
+     return;
+   }
+   if (dn->is_auth()) {
+     lru.lru_touch(dn);
+   } else {
+     lru.lru_midtouch(dn);
+   }
+ }
+ // Bottom-touch a dentry in the main lru, unless it carries the
+ // BOTTOMLRU state, in which case nothing is done.
+ void touch_dentry_bottom(CDentry *dn) {
+   if (!dn->state_test(CDentry::STATE_BOTTOMLRU)) {
+     lru.lru_bottouch(dn);
+   }
+ }
+protected:
+
+ void inode_remove_replica(CInode *in, mds_rank_t rep, bool rejoin,
+ set<SimpleLock *>& gather_locks);
+ void dentry_remove_replica(CDentry *dn, mds_rank_t rep, set<SimpleLock *>& gather_locks);
+
+ void rename_file(CDentry *srcdn, CDentry *destdn);
+
+ public:
+ // truncate
+ void truncate_inode(CInode *in, LogSegment *ls);
+ void _truncate_inode(CInode *in, LogSegment *ls);
+ void truncate_inode_finish(CInode *in, LogSegment *ls);
+ void truncate_inode_logged(CInode *in, MutationRef& mut);
+
+ void add_recovered_truncate(CInode *in, LogSegment *ls);
+ void remove_recovered_truncate(CInode *in, LogSegment *ls);
+ void start_recovered_truncates();
+
+
+ public:
+ CDir *get_auth_container(CDir *in);
+ CDir *get_export_container(CDir *dir);
+ void find_nested_exports(CDir *dir, set<CDir*>& s);
+ void find_nested_exports_under(CDir *import, CDir *dir, set<CDir*>& s);
+
+
+private:
+ bool opening_root = false, open = false;
+ MDSContext::vec waiting_for_open;
+
+public:
+ void init_layouts();
+ void create_unlinked_system_inode(CInode *in, inodeno_t ino,
+ int mode) const;
+ CInode *create_system_inode(inodeno_t ino, int mode);
+ CInode *create_root_inode();
+
+ void create_empty_hierarchy(MDSGather *gather);
+ void create_mydir_hierarchy(MDSGather *gather);
+
+  // Returns the `open` flag.  Const-qualified like the other read-only
+  // accessors of this class so it can be called through a const MDCache.
+  bool is_open() const { return open; }
+  // Queue `c` on the waiting_for_open list.
+  void wait_for_open(MDSContext *c) {
+    waiting_for_open.emplace_back(c);
+  }
+
+ void open_root_inode(MDSContext *c);
+ void open_root();
+ void open_mydir_inode(MDSContext *c);
+ void open_mydir_frag(MDSContext *c);
+ void populate_mydir();
+
+ void _create_system_file(CDir *dir, std::string_view name, CInode *in, MDSContext *fin);
+ void _create_system_file_finish(MutationRef& mut, CDentry *dn,
+ version_t dpv, MDSContext *fin);
+
+ void open_foreign_mdsdir(inodeno_t ino, MDSContext *c);
+ CDir *get_stray_dir(CInode *in);
+ CDentry *get_or_create_stray_dentry(CInode *in);
+
+ /**
+ * Find the given dentry (and whether it exists or not), its ancestors,
+ * and get them all into memory and usable on this MDS. This function
+ * makes a best-effort attempt to load everything; if it needs to
+ * go away and do something then it will put the request on a waitlist.
+ * It prefers the mdr, then the req, then the fin. (At least one of these
+ * must be non-null.)
+ *
+ * At least one of the params mdr, req, and fin must be non-null.
+ *
+ * @param mdr The MDRequest associated with the path. Can be null.
+ * @param cf A MDSContextFactory for waiter building.
+ * @param path The path to traverse to.
+ * @param pdnvec Data return parameter -- on success, contains a
+ * vector of dentries. On failure, is either empty or contains the
+ * full trace of traversable dentries.
+ * @param pin Data return parameter -- if successful, points to the inode
+ * associated with filepath. If unsuccessful, is null.
+ * @param onfail Specifies different lookup failure behaviors. If set to
+ * MDS_TRAVERSE_DISCOVERXLOCK, path_traverse will succeed on null
+ * dentries (instead of returning -ENOENT). If set to
+ * MDS_TRAVERSE_FORWARD, it will forward the request to the auth
+ * MDS if that becomes appropriate (ie, if it doesn't know the contents
+ * of a directory). If set to MDS_TRAVERSE_DISCOVER, it
+ * will attempt to look up the path from a different MDS (and bring them
+ * into its cache as replicas).
+ *
+ * @returns 0 on success, 1 on "not done yet", 2 on "forwarding", -errno otherwise.
+ * If it returns 1, the requester associated with this call has been placed
+ * on the appropriate waitlist, and it should unwind itself and back out.
+ * If it returns 2 the request has been forwarded, and again the requester
+ * should unwind itself and back out.
+ */
+ int path_traverse(MDRequestRef& mdr, MDSContextFactory& cf, const filepath& path,
+ vector<CDentry*> *pdnvec, CInode **pin, int onfail);
+
+ CInode *cache_traverse(const filepath& path);
+
+ void open_remote_dirfrag(CInode *diri, frag_t fg, MDSContext *fin);
+ CInode *get_dentry_inode(CDentry *dn, MDRequestRef& mdr, bool projected=false);
+
+ bool parallel_fetch(map<inodeno_t,filepath>& pathmap, set<inodeno_t>& missing);
+ bool parallel_fetch_traverse_dir(inodeno_t ino, filepath& path,
+ set<CDir*>& fetch_queue, set<inodeno_t>& missing,
+ C_GatherBuilder &gather_bld);
+
+ void open_remote_dentry(CDentry *dn, bool projected, MDSContext *fin,
+ bool want_xlocked=false);
+ void _open_remote_dentry_finish(CDentry *dn, inodeno_t ino, MDSContext *fin,
+ bool want_xlocked, int r);
+
+ void make_trace(vector<CDentry*>& trace, CInode *in);
+
+protected:
+  // Per-inode bookkeeping for an in-flight open_ino operation (entries of
+  // the opening_inodes map).  Defaults are given inline; the constructor
+  // adds nothing beyond them.
+  struct open_ino_info_t {
+    vector<inode_backpointer_t> ancestors;
+    set<mds_rank_t> checked;
+    mds_rank_t checking = MDS_RANK_NONE;
+    mds_rank_t auth_hint = MDS_RANK_NONE;
+    bool check_peers = true;
+    bool fetch_backtrace = true;
+    bool discover = false;
+    bool want_replica = false;
+    bool want_xlocked = false;
+    version_t tid = 0;
+    int64_t pool = -1;
+    int last_err = 0;
+    MDSContext::vec waiters;
+    open_ino_info_t() {}
+  };
+ ceph_tid_t open_ino_last_tid = 0;
+ map<inodeno_t,open_ino_info_t> opening_inodes;
+
+ void _open_ino_backtrace_fetched(inodeno_t ino, bufferlist& bl, int err);
+ void _open_ino_parent_opened(inodeno_t ino, int ret);
+ void _open_ino_traverse_dir(inodeno_t ino, open_ino_info_t& info, int err);
+ void _open_ino_fetch_dir(inodeno_t ino, const MMDSOpenIno::const_ref &m, CDir *dir, bool parent);
+ int open_ino_traverse_dir(inodeno_t ino, const MMDSOpenIno::const_ref &m,
+ const vector<inode_backpointer_t>& ancestors,
+ bool discover, bool want_xlocked, mds_rank_t *hint);
+ void open_ino_finish(inodeno_t ino, open_ino_info_t& info, int err);
+ void do_open_ino(inodeno_t ino, open_ino_info_t& info, int err);
+ void do_open_ino_peer(inodeno_t ino, open_ino_info_t& info);
+ void handle_open_ino(const MMDSOpenIno::const_ref &m, int err=0);
+ void handle_open_ino_reply(const MMDSOpenInoReply::const_ref &m);
+ friend class C_IO_MDC_OpenInoBacktraceFetched;
+ friend struct C_MDC_OpenInoTraverseDir;
+ friend struct C_MDC_OpenInoParentOpened;
+
+public:
+ void kick_open_ino_peers(mds_rank_t who);
+ void open_ino(inodeno_t ino, int64_t pool, MDSContext *fin,
+ bool want_replica=true, bool want_xlocked=false);
+
+ // -- find_ino_peer --
+  // State of one find_ino_peers() query (entries of the find_ino_peer map,
+  // keyed by tid).  Defaults are given inline.
+  struct find_ino_peer_info_t {
+    inodeno_t ino;
+    ceph_tid_t tid = 0;
+    MDSContext *fin = NULL;
+    mds_rank_t hint = MDS_RANK_NONE;
+    mds_rank_t checking = MDS_RANK_NONE;
+    set<mds_rank_t> checked;
+
+    find_ino_peer_info_t() {}
+  };
+
+ map<ceph_tid_t, find_ino_peer_info_t> find_ino_peer;
+ ceph_tid_t find_ino_peer_last_tid = 0;
+
+ void find_ino_peers(inodeno_t ino, MDSContext *c, mds_rank_t hint=MDS_RANK_NONE);
+ void _do_find_ino_peer(find_ino_peer_info_t& fip);
+ void handle_find_ino(const MMDSFindIno::const_ref &m);
+ void handle_find_ino_reply(const MMDSFindInoReply::const_ref &m);
+ void kick_find_ino_peers(mds_rank_t who);
+
+ // -- snaprealms --
+private:
+ SnapRealm *global_snaprealm = nullptr;
+public:
+ SnapRealm *get_global_snaprealm() const { return global_snaprealm; }
+ void create_global_snaprealm();
+ void do_realm_invalidate_and_update_notify(CInode *in, int snapop, bool notify_clients=true);
+ void send_snap_update(CInode *in, version_t stid, int snap_op);
+ void handle_snap_update(const MMDSSnapUpdate::const_ref &m);
+ void notify_global_snaprealm_update(int snap_op);
+
+ // -- stray --
+public:
+ void fetch_backtrace(inodeno_t ino, int64_t pool, bufferlist& bl, Context *fin);
+ uint64_t get_num_strays() const { return stray_manager.get_num_strays(); }
+
+protected:
+ void scan_stray_dir(dirfrag_t next=dirfrag_t());
+ StrayManager stray_manager;
+ friend struct C_MDC_RetryScanStray;
+
+ // == messages ==
+ public:
+ void dispatch(const Message::const_ref &m);
+
+ protected:
+ // -- replicas --
+ void handle_discover(const MDiscover::const_ref &dis);
+ void handle_discover_reply(const MDiscoverReply::const_ref &m);
+ friend class C_MDC_Join;
+
+public:
+ void replicate_dir(CDir *dir, mds_rank_t to, bufferlist& bl);
+ void replicate_dentry(CDentry *dn, mds_rank_t to, bufferlist& bl);
+ void replicate_inode(CInode *in, mds_rank_t to, bufferlist& bl,
+ uint64_t features);
+
+ CDir* add_replica_dir(bufferlist::const_iterator& p, CInode *diri, mds_rank_t from, MDSContext::vec& finished);
+ CDentry *add_replica_dentry(bufferlist::const_iterator& p, CDir *dir, MDSContext::vec& finished);
+ CInode *add_replica_inode(bufferlist::const_iterator& p, CDentry *dn, MDSContext::vec& finished);
+
+ void replicate_stray(CDentry *straydn, mds_rank_t who, bufferlist& bl);
+ CDentry *add_replica_stray(const bufferlist &bl, mds_rank_t from);
+
+ // -- namespace --
+public:
+ void send_dentry_link(CDentry *dn, MDRequestRef& mdr);
+ void send_dentry_unlink(CDentry *dn, CDentry *straydn, MDRequestRef& mdr);
+protected:
+ void handle_dentry_link(const MDentryLink::const_ref &m);
+ void handle_dentry_unlink(const MDentryUnlink::const_ref &m);
+
+
+ // -- fragmenting --
+private:
+  // Record kept per entry of uncommitted_fragments.  Defaults are given
+  // inline.
+  struct ufragment {
+    int bits = 0;
+    bool committed = false;
+    LogSegment *ls = NULL;
+    MDSContext::vec waiters;
+    frag_vec_t old_frags;
+    bufferlist rollback;
+    ufragment() {}
+  };
+ map<dirfrag_t, ufragment> uncommitted_fragments;
+
+  // Bookkeeping for one in-progress fragment operation (entries of the
+  // `fragments` map, keyed by dirfrag).
+  struct fragment_info_t {
+    int bits;
+    list<CDir*> dirs;
+    list<CDir*> resultfrags;
+    MDRequestRef mdr;
+    set<mds_rank_t> notify_ack_waiting;
+    bool finishing = false;
+
+    // for deadlock detection
+    bool all_frozen = false;
+    utime_t last_cum_auth_pins_change;
+    int last_cum_auth_pins = 0;
+    int num_remote_waiters = 0;	// number of remote authpin waiters
+    fragment_info_t() {}
+    // True once result fragments have been recorded.  Read-only, hence const.
+    bool is_fragmenting() const { return !resultfrags.empty(); }
+    // tid of the associated request, or 0 when no request is attached.
+    // Read-only, hence const.
+    uint64_t get_tid() const { return mdr ? mdr->reqid.tid : 0; }
+  };
+ map<dirfrag_t,fragment_info_t> fragments;
+ typedef map<dirfrag_t,fragment_info_t>::iterator fragment_info_iterator;
+
+ void adjust_dir_fragments(CInode *diri, frag_t basefrag, int bits,
+ list<CDir*>& frags, MDSContext::vec& waiters, bool replay);
+ void adjust_dir_fragments(CInode *diri,
+ list<CDir*>& srcfrags,
+ frag_t basefrag, int bits,
+ list<CDir*>& resultfrags,
+ MDSContext::vec& waiters,
+ bool replay);
+ CDir *force_dir_fragment(CInode *diri, frag_t fg, bool replay=true);
+ void get_force_dirfrag_bound_set(const vector<dirfrag_t>& dfs, set<CDir*>& bounds);
+
+ bool can_fragment(CInode *diri, list<CDir*>& dirs);
+ void fragment_freeze_dirs(list<CDir*>& dirs);
+ void fragment_mark_and_complete(MDRequestRef& mdr);
+ void fragment_frozen(MDRequestRef& mdr, int r);
+ void fragment_unmark_unfreeze_dirs(list<CDir*>& dirs);
+ void fragment_drop_locks(fragment_info_t &info);
+ void fragment_maybe_finish(const fragment_info_iterator& it);
+ void dispatch_fragment_dir(MDRequestRef& mdr);
+ void _fragment_logged(MDRequestRef& mdr);
+ void _fragment_stored(MDRequestRef& mdr);
+ void _fragment_committed(dirfrag_t f, const MDRequestRef& mdr);
+ void _fragment_old_purged(dirfrag_t f, int bits, const MDRequestRef& mdr);
+
+ friend class EFragment;
+ friend class C_MDC_FragmentFrozen;
+ friend class C_MDC_FragmentMarking;
+ friend class C_MDC_FragmentPrep;
+ friend class C_MDC_FragmentStore;
+ friend class C_MDC_FragmentCommit;
+ friend class C_IO_MDC_FragmentPurgeOld;
+
+ void handle_fragment_notify(const MMDSFragmentNotify::const_ref &m);
+ void handle_fragment_notify_ack(const MMDSFragmentNotifyAck::const_ref &m);
+
+ void add_uncommitted_fragment(dirfrag_t basedirfrag, int bits, const frag_vec_t& old_frag,
+ LogSegment *ls, bufferlist *rollback=NULL);
+ void finish_uncommitted_fragment(dirfrag_t basedirfrag, int op);
+ void rollback_uncommitted_fragment(dirfrag_t basedirfrag, frag_vec_t&& old_frags);
+
+
+ DecayCounter trim_counter;
+
+public:
+  // Queue `c` on the waiter list of the uncommitted fragment `dirfrag`.
+  // Uses map::at(), so the entry must already exist.
+  void wait_for_uncommitted_fragment(dirfrag_t dirfrag, MDSContext *c) {
+    ufragment& uf = uncommitted_fragments.at(dirfrag);
+    uf.waiters.push_back(c);
+  }
+  // True when at least one uncommitted fragment is being tracked.
+  bool is_any_uncommitted_fragment() const {
+    return uncommitted_fragments.size() > 0;
+  }
+ void wait_for_uncommitted_fragments(MDSContext* finisher);
+ void rollback_uncommitted_fragments();
+
+ void split_dir(CDir *dir, int byn);
+ void merge_dir(CInode *diri, frag_t fg);
+
+ void find_stale_fragment_freeze();
+ void fragment_freeze_inc_num_waiters(CDir *dir);
+ bool fragment_are_all_frozen(CDir *dir);
+  // Number of entries in the `fragments` map.  Const-qualified like the
+  // other read-only accessors of this class.
+  int get_num_fragmenting_dirs() const { return fragments.size(); }
+
+ // -- updates --
+ //int send_inode_updates(CInode *in);
+ //void handle_inode_update(MInodeUpdate *m);
+
+ int send_dir_updates(CDir *in, bool bcast=false);
+ void handle_dir_update(const MDirUpdate::const_ref &m);
+
+ // -- cache expiration --
+ void handle_cache_expire(const MCacheExpire::const_ref &m);
+ // delayed cache expire
+ map<CDir*, expiremap> delayed_expire; // subtree root -> expire msg
+ void process_delayed_expire(CDir *dir);
+ void discard_delayed_expire(CDir *dir);
+
+ // -- mdsmap --
+ void handle_mdsmap(const MDSMap &mdsmap);
+
+protected:
+ int dump_cache(std::string_view fn, Formatter *f);
+public:
+  // Dump with no file name and no formatter.  An explicitly empty
+  // string_view is passed: building a std::string_view from a null pointer
+  // (which is what NULL converts to for that parameter) is undefined
+  // behaviour.
+  int dump_cache() { return dump_cache(std::string_view(), nullptr); }
+ int dump_cache(std::string_view filename);
+ int dump_cache(Formatter *f);
+ void dump_tree(CInode *in, const int cur_depth, const int max_depth, Formatter *f);
+
+ void cache_status(Formatter *f);
+
+ void dump_resolve_status(Formatter *f) const;
+ void dump_rejoin_status(Formatter *f) const;
+
+ // == crap fns ==
+ public:
+ void show_cache();
+ void show_subtrees(int dbl=10, bool force_print=false);
+
+  // Return the inode found by stepping a rand()-chosen number of entries
+  // into inode_map.  The map must not be empty (asserted).
+  CInode *hack_pick_random_inode() {
+    ceph_assert(!inode_map.empty());
+    int skip = rand() % inode_map.size();
+    auto it = inode_map.begin();
+    for (; skip > 0; --skip)
+      ++it;
+    return it->second;
+  }
+
+protected:
+ void flush_dentry_work(MDRequestRef& mdr);
+ /**
+ * Resolve path to a dentry and pass it onto the ScrubStack.
+ *
+ * TODO: return enough information to the original mdr formatter
+ * and completion that they can subsequently check the progress of
+ * this scrub (we won't block them on a whole scrub as it can take a very
+ * long time)
+ */
+ void enqueue_scrub_work(MDRequestRef& mdr);
+ void recursive_scrub_finish(const ScrubHeaderRef& header);
+ void repair_inode_stats_work(MDRequestRef& mdr);
+ void repair_dirfrag_stats_work(MDRequestRef& mdr);
+ void upgrade_inode_snaprealm_work(MDRequestRef& mdr);
+ friend class C_MDC_RespondInternalRequest;
+public:
+ void flush_dentry(std::string_view path, Context *fin);
+ /**
+ * Create and start an OP_ENQUEUE_SCRUB
+ */
+ void enqueue_scrub(std::string_view path, std::string_view tag,
+ bool force, bool recursive, bool repair,
+ Formatter *f, Context *fin);
+ void repair_inode_stats(CInode *diri);
+ void repair_dirfrag_stats(CDir *dir);
+ void upgrade_inode_snaprealm(CInode *in);
+
+public:
+ /* Because exports may fail, this set lets us keep track of inodes that need exporting. */
+ std::set<CInode *> export_pin_queue;
+ std::set<CInode *> export_pin_delayed_queue;
+
+ OpenFileTable open_file_table;
+
+private:
+ std::thread upkeeper;
+ ceph::mutex upkeep_mutex = ceph::make_mutex("MDCache::upkeep_mutex");
+ ceph::condition_variable upkeep_cvar;
+ time upkeep_last_trim = time::min();
+ time upkeep_last_release = time::min();
+ std::atomic<bool> upkeep_trim_shutdown{false};
+};
+
+// Internal MDS context that carries an MDCache pointer and an MDRequestRef.
+// The constructor and finish() are defined out of line.
+// NOTE(review): the name suggests finish() re-dispatches the request --
+// confirm against the definition.
+class C_MDS_RetryRequest : public MDSInternalContext {
+ MDCache *cache;
+ MDRequestRef mdr;
+ public:
+ C_MDS_RetryRequest(MDCache *c, MDRequestRef& r);
+ void finish(int r) override;
+};
+
+#endif
diff --git a/src/mds/MDLog.cc b/src/mds/MDLog.cc
new file mode 100644
index 00000000..5277f9af
--- /dev/null
+++ b/src/mds/MDLog.cc
@@ -0,0 +1,1530 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#include "MDSRank.h"
+#include "MDLog.h"
+#include "MDCache.h"
+#include "LogEvent.h"
+#include "MDSContext.h"
+
+#include "osdc/Journaler.h"
+#include "mds/JournalPointer.h"
+
+#include "common/entity_name.h"
+#include "common/perf_counters.h"
+#include "common/Cond.h"
+
+#include "events/ESubtreeMap.h"
+
+#include "common/config.h"
+#include "common/errno.h"
+#include "include/ceph_assert.h"
+
+#define dout_context g_ceph_context
+#define dout_subsys ceph_subsys_mds
+#undef dout_prefix
+#define dout_prefix *_dout << "mds." << mds->get_nodeid() << ".log "
+
+// cons/des
+// Release the journaler, and unregister and free the perf counters.
+MDLog::~MDLog()
+{
+  if (journaler != nullptr) {
+    delete journaler;
+    journaler = nullptr;
+  }
+  if (logger != nullptr) {
+    g_ceph_context->get_perfcounters_collection()->remove(logger);
+    delete logger;
+    logger = nullptr;
+  }
+}
+
+
+/**
+ * Build the "mds_log" perf counter set and register it with the global
+ * perf counter collection.  Counters are declared in three priority groups:
+ * explicitly INTERESTING ones first, then USEFUL by default, then DEBUGONLY
+ * by default for the raw journaler positions.
+ */
+void MDLog::create_logger()
+{
+  PerfCountersBuilder plb(g_ceph_context, "mds_log", l_mdl_first, l_mdl_last);
+
+  plb.add_u64_counter(l_mdl_evadd, "evadd", "Events submitted", "subm",
+                      PerfCountersBuilder::PRIO_INTERESTING);
+  plb.add_u64(l_mdl_ev, "ev", "Events", "evts",
+              PerfCountersBuilder::PRIO_INTERESTING);
+  plb.add_u64(l_mdl_seg, "seg", "Segments", "segs",
+              PerfCountersBuilder::PRIO_INTERESTING);
+
+  plb.set_prio_default(PerfCountersBuilder::PRIO_USEFUL);
+  plb.add_u64(l_mdl_evexg, "evexg", "Expiring events");
+  plb.add_u64(l_mdl_evexd, "evexd", "Current expired events");
+  plb.add_u64(l_mdl_segexg, "segexg", "Expiring segments");
+  plb.add_u64(l_mdl_segexd, "segexd", "Current expired segments");
+  plb.add_u64_counter(l_mdl_replayed, "replayed", "Events replayed",
+                      "repl", PerfCountersBuilder::PRIO_INTERESTING);
+  plb.add_time_avg(l_mdl_jlat, "jlat", "Journaler flush latency");
+  plb.add_u64_counter(l_mdl_evex, "evex", "Total expired events");
+  plb.add_u64_counter(l_mdl_evtrm, "evtrm", "Trimmed events");
+  plb.add_u64_counter(l_mdl_segadd, "segadd", "Segments added");
+  plb.add_u64_counter(l_mdl_segex, "segex", "Total expired segments");
+  plb.add_u64_counter(l_mdl_segtrm, "segtrm", "Trimmed segments");
+
+  plb.set_prio_default(PerfCountersBuilder::PRIO_DEBUGONLY);
+  // Description previously read "xpire" (typo).
+  plb.add_u64(l_mdl_expos, "expos", "Journaler expire position");
+  plb.add_u64(l_mdl_wrpos, "wrpos", "Journaler write position");
+  plb.add_u64(l_mdl_rdpos, "rdpos", "Journaler read position");
+
+  // logger
+  logger = plb.create_perf_counters();
+  g_ceph_context->get_perfcounters_collection()->add(logger);
+}
+
+// Forward the write I/O hint flags to the journaler.
+void MDLog::set_write_iohint(unsigned iohint_flags)
+{
+ journaler->set_write_iohint(iohint_flags);
+}
+
+// Journaler write-error handler (installed by MDLog::create()); finish()
+// receives the error code.
+class C_MDL_WriteError : public MDSIOContextBase {
+ protected:
+ MDLog *mdlog;
+ MDSRank *get_mds() override {return mdlog->mds;}
+
+ // r == -EBLACKLISTED: log and respawn the MDS.  Any other value: log,
+ // report to the cluster log, mark the rank damaged and abort.
+ void finish(int r) override {
+ MDSRank *mds = get_mds();
+ // assume journal is reliable, so don't choose action based on
+ // g_conf()->mds_action_on_write_error.
+ if (r == -EBLACKLISTED) {
+ derr << "we have been blacklisted (fenced), respawning..." << dendl;
+ mds->respawn();
+ } else {
+ derr << "unhandled error " << cpp_strerror(r) << ", shutting down..." << dendl;
+ // Although it's possible that this could be something transient,
+ // it's severe and scary, so disable this rank until an administrator
+ // intervenes.
+ mds->clog->error() << "Unhandled journal write error on MDS rank " <<
+ mds->get_nodeid() << ": " << cpp_strerror(r) << ", shutting down.";
+ mds->damaged();
+ ceph_abort(); // damaged should never return
+ }
+ }
+
+ public:
+ explicit C_MDL_WriteError(MDLog *m) :
+ MDSIOContextBase(false), mdlog(m) {}
+ // Short label used when this context is printed.
+ void print(ostream& out) const override {
+ out << "mdlog_write_error";
+ }
+};
+
+
+// Ask the journaler to write its head.  A non-null `c` is wrapped in a
+// C_IO_Wrapper; a null `c` results in a null completion being passed on.
+void MDLog::write_head(MDSContext *c)
+{
+  Context *wrapped = c ? new C_IO_Wrapper(mds, c) : nullptr;
+  journaler->write_head(wrapped);
+}
+
+// Current read position as reported by the journaler.
+uint64_t MDLog::get_read_pos() const
+{
+ return journaler->get_read_pos();
+}
+
+// Current write position as reported by the journaler.
+uint64_t MDLog::get_write_pos() const
+{
+ return journaler->get_write_pos();
+}
+
+// The journaler's write-safe position (Journaler::get_write_safe_pos()).
+uint64_t MDLog::get_safe_pos() const
+{
+ return journaler->get_write_safe_pos();
+}
+
+
+
+/**
+ * Create a new, empty journal for this rank and start the submit thread.
+ *
+ * A Journaler is instantiated for inode MDS_INO_LOG_OFFSET + rank, made
+ * writeable and created, and its head is written.  A JournalPointer whose
+ * front is that inode is saved as well.  `c` is wrapped in a C_IO_Wrapper
+ * and set as the finisher of the gather covering both writes.
+ *
+ * Requires that no journaler exists yet (asserted).
+ */
+void MDLog::create(MDSContext *c)
+{
+ dout(5) << "create empty log" << dendl;
+
+ C_GatherBuilder gather(g_ceph_context);
+ // This requires an OnFinisher wrapper because Journaler will call back the completion for write_head inside its own lock
+ // XXX but maybe that should be handled inside Journaler?
+ gather.set_finisher(new C_IO_Wrapper(mds, c));
+
+ // The inode of the default Journaler we will create
+ ino = MDS_INO_LOG_OFFSET + mds->get_nodeid();
+
+ // Instantiate Journaler and start async write to RADOS
+ ceph_assert(journaler == NULL);
+ journaler = new Journaler("mdlog", ino, mds->mdsmap->get_metadata_pool(),
+ CEPH_FS_ONDISK_MAGIC, mds->objecter, logger,
+ l_mdl_jlat, mds->finisher);
+ ceph_assert(journaler->is_readonly());
+ journaler->set_write_error_handler(new C_MDL_WriteError(this));
+ journaler->set_writeable();
+ journaler->create(&mds->mdcache->default_log_layout, g_conf()->mds_journal_format);
+ journaler->write_head(gather.new_sub());
+
+ // Async write JournalPointer to RADOS
+ JournalPointer jp(mds->get_nodeid(), mds->mdsmap->get_metadata_pool());
+ jp.front = ino;
+ jp.back = 0;
+ jp.save(mds->objecter, gather.new_sub());
+
+ gather.activate();
+
+ // Publish the initial journaler positions to the perf counters.
+ logger->set(l_mdl_expos, journaler->get_expire_pos());
+ logger->set(l_mdl_wrpos, journaler->get_write_pos());
+
+ submit_thread.create("md_submit");
+}
+
+/**
+ * Start the recovery thread, with `c` as its completion, and the submit
+ * thread.  The recovery thread must not already be running (asserted).
+ */
+void MDLog::open(MDSContext *c)
+{
+ dout(5) << "open discovering log bounds" << dendl;
+
+ ceph_assert(!recovery_thread.is_started());
+ recovery_thread.set_completion(c);
+ recovery_thread.create("md_recov_open");
+
+ submit_thread.create("md_submit");
+ // either append() or replay() will follow.
+}
+
+/**
+ * Final part of reopen() procedure, after recovery_thread
+ * has done its thing we call append()
+ */
+class C_ReopenComplete : public MDSInternalContext {
+  MDLog *mdlog;
+  MDSContext *on_complete;
+public:
+  C_ReopenComplete(MDLog *mdlog_, MDSContext *on_complete_)
+    : MDSInternalContext(mdlog_->mds),
+      mdlog(mdlog_),
+      on_complete(on_complete_)
+  {}
+
+  // Call append() on the log first, then complete the caller's context
+  // with the result code that was passed in.
+  void finish(int r) override {
+    mdlog->append();
+    on_complete->complete(r);
+  }
+};
+
+/**
+ * Given that open() has been called in the past, go through the journal
+ * recovery procedure again, potentially reformatting the journal if it
+ * was in an old format.
+ */
+// `c` is completed by C_ReopenComplete after append() has been called.
+void MDLog::reopen(MDSContext *c)
+{
+ dout(5) << "reopen" << dendl;
+
+ // Because we will call append() at the completion of this, check that we have already
+ // read the whole journal.
+ ceph_assert(journaler != NULL);
+ ceph_assert(journaler->get_read_pos() == journaler->get_write_pos());
+
+ delete journaler;
+ journaler = NULL;
+
+ // recovery_thread was started at some point in the past. Although
+ // it has called its completion if we made it back here, it might
+ // still not have been cleaned up: join it.
+ recovery_thread.join();
+
+ recovery_thread.set_completion(new C_ReopenComplete(this, c));
+ recovery_thread.create("md_recov_reopen");
+}
+
+// Move the journaler's read and expire positions up to its write position,
+// mark it writeable, and publish the write position as the expire-position
+// perf counter.
+void MDLog::append()
+{
+ dout(5) << "append positioning at end and marking writeable" << dendl;
+ journaler->set_read_pos(journaler->get_write_pos());
+ journaler->set_expire_pos(journaler->get_write_pos());
+
+ journaler->set_writeable();
+
+ logger->set(l_mdl_expos, journaler->get_write_pos());
+}
+
+
+
+// -------------------------------------------------
+
+/**
+ * Begin a new log entry: record `e` as the current event and advance
+ * event_seq.  If the event has a metablob, stamp it with the new sequence
+ * number and the last segment's sequence number.
+ *
+ * Caller must hold submit_mutex, and no other entry may be in progress
+ * (both asserted).
+ */
+void MDLog::_start_entry(LogEvent *e)
+{
+  ceph_assert(submit_mutex.is_locked_by_me());
+  ceph_assert(cur_event == NULL);
+
+  cur_event = e;
+  event_seq++;
+
+  if (EMetaBlob *blob = e->get_metablob()) {
+    blob->event_seq = event_seq;
+    blob->last_subtree_map = get_last_segment_seq();
+  }
+}
+
+// Abandon the entry currently in progress: `le` must be the current event
+// (asserted); it is forgotten and deleted.
+void MDLog::cancel_entry(LogEvent *le)
+{
+  ceph_assert(cur_event == le);
+
+  cur_event = NULL;
+  delete le;
+}
+
+/**
+ * Queue the current event `le` (with optional completion `c`) for the
+ * submit thread, attaching it to the newest log segment.
+ *
+ * Afterwards, unless `le` is itself a subtree map (or an import-finish
+ * during resolve), a new segment is started when the current one crosses a
+ * layout-period boundary or reaches mds_log_events_per_segment events.
+ * With mds_debug_subtrees set, a test subtree map is journaled after every
+ * other event instead.
+ *
+ * Caller must hold submit_mutex; the MDS must not be replaying and the log
+ * must not be capped (all asserted).
+ */
+void MDLog::_submit_entry(LogEvent *le, MDSLogContextBase *c)
+{
+ ceph_assert(submit_mutex.is_locked_by_me());
+ ceph_assert(!mds->is_any_replay());
+ ceph_assert(!capped);
+
+ ceph_assert(le == cur_event);
+ cur_event = NULL;
+
+ // let the event register itself in the segment
+ ceph_assert(!segments.empty());
+ LogSegment *ls = segments.rbegin()->second;
+ ls->num_events++;
+
+ le->_segment = ls;
+ le->update_segment();
+ le->set_stamp(ceph_clock_now());
+
+ // Snapshot the up-features; _submit_thread reads this when encoding.
+ mdsmap_up_features = mds->mdsmap->get_up_features();
+ pending_events[ls->seq].push_back(PendingEvent(le, c));
+ num_events++;
+
+ if (logger) {
+ logger->inc(l_mdl_evadd);
+ logger->set(l_mdl_ev, num_events);
+ }
+
+ unflushed++;
+
+ uint64_t period = journaler->get_layout_period();
+ // start a new segment?
+ if (le->get_type() == EVENT_SUBTREEMAP ||
+ (le->get_type() == EVENT_IMPORTFINISH && mds->is_resolve())) {
+ // avoid infinite loop when ESubtreeMap is very large.
+ // do not insert ESubtreeMap among the EImportFinish events that finish
+ // disambiguating imports, because the ESubtreeMap reflects the subtree
+ // state when all EImportFinish events are replayed.
+ } else if (ls->end/period != ls->offset/period ||
+ ls->num_events >= g_conf()->mds_log_events_per_segment) {
+ dout(10) << "submit_entry also starting new segment: last = "
+ << ls->seq << "/" << ls->offset << ", event seq = " << event_seq << dendl;
+ _start_new_segment();
+ } else if (g_conf()->mds_debug_subtrees &&
+ le->get_type() != EVENT_SUBTREEMAP_TEST) {
+ // debug: journal this every time to catch subtree replay bugs.
+ // use a different event id so it doesn't get interpreted as a
+ // LogSegment boundary on replay.
+ LogEvent *sle = mds->mdcache->create_subtree_map();
+ sle->set_type(EVENT_SUBTREEMAP_TEST);
+ _submit_entry(sle, NULL);
+ }
+}
+
+/**
+ * Invoked on the flush after each entry submitted
+ */
+class C_MDL_Flushed : public MDSLogContextBase {
+protected:
+  MDLog *mdlog;
+  MDSRank *get_mds() override { return mdlog->mds; }
+  MDSContext *wrapped;
+
+  // Pass the result on to the wrapped context, when there is one.
+  void finish(int r) override {
+    if (wrapped == NULL)
+      return;
+    wrapped->complete(r);
+  }
+
+public:
+  // Wrap `w` (which may be NULL).
+  C_MDL_Flushed(MDLog *m, MDSContext *w)
+    : mdlog(m), wrapped(w)
+  {}
+
+  // No wrapped context; only record write position `wp`.
+  C_MDL_Flushed(MDLog *m, uint64_t wp)
+    : mdlog(m), wrapped(NULL)
+  {
+    set_write_pos(wp);
+  }
+};
+
+/**
+ * Body of the submit thread: drains pending_events in map order and hands
+ * each entry to the journaler until the daemon is stopping.
+ *
+ * submit_mutex is held while inspecting and popping the queue and is
+ * dropped while talking to the journaler.  An entry with an event is
+ * encoded, appended and deleted; an entry without one only carries a
+ * completion and/or a flush request.
+ */
+void MDLog::_submit_thread()
+{
+ dout(10) << "_submit_thread start" << dendl;
+
+ submit_mutex.Lock();
+
+ while (!mds->is_daemon_stopping()) {
+ // Paused by configuration: sleep until signalled, then re-check.
+ if (g_conf()->mds_log_pause) {
+ submit_cond.Wait(submit_mutex);
+ continue;
+ }
+
+ map<uint64_t,list<PendingEvent> >::iterator it = pending_events.begin();
+ if (it == pending_events.end()) {
+ submit_cond.Wait(submit_mutex);
+ continue;
+ }
+
+ // Drop per-segment lists that have been fully drained.
+ if (it->second.empty()) {
+ pending_events.erase(it);
+ continue;
+ }
+
+ // Copy what we need while still holding the lock.
+ int64_t features = mdsmap_up_features;
+ PendingEvent data = it->second.front();
+ it->second.pop_front();
+
+ submit_mutex.Unlock();
+
+ if (data.le) {
+ LogEvent *le = data.le;
+ LogSegment *ls = le->_segment;
+ // encode it, with event type
+ bufferlist bl;
+ le->encode_with_header(bl, features);
+
+ uint64_t write_pos = journaler->get_write_pos();
+
+ le->set_start_off(write_pos);
+ // A subtree map event sets its segment's offset.
+ if (le->get_type() == EVENT_SUBTREEMAP)
+ ls->offset = write_pos;
+
+ dout(5) << "_submit_thread " << write_pos << "~" << bl.length()
+ << " : " << *le << dendl;
+
+ // journal it.
+ const uint64_t new_write_pos = journaler->append_entry(bl); // bl is destroyed.
+ ls->end = new_write_pos;
+
+ // Use the caller's completion when given (it must be an
+ // MDSLogContextBase); otherwise a bare C_MDL_Flushed carrying only the
+ // write position.
+ MDSLogContextBase *fin;
+ if (data.fin) {
+ fin = dynamic_cast<MDSLogContextBase*>(data.fin);
+ ceph_assert(fin);
+ fin->set_write_pos(new_write_pos);
+ } else {
+ fin = new C_MDL_Flushed(this, new_write_pos);
+ }
+
+ journaler->wait_for_flush(fin);
+
+ if (data.flush)
+ journaler->flush();
+
+ if (logger)
+ logger->set(l_mdl_wrpos, ls->end);
+
+ delete le;
+ } else {
+ // Marker entry (no event): queued by wait_for_safe() or flush().
+ if (data.fin) {
+ MDSContext* fin =
+ dynamic_cast<MDSContext*>(data.fin);
+ ceph_assert(fin);
+ C_MDL_Flushed *fin2 = new C_MDL_Flushed(this, fin);
+ fin2->set_write_pos(journaler->get_write_pos());
+ journaler->wait_for_flush(fin2);
+ }
+ if (data.flush)
+ journaler->flush();
+ }
+
+ submit_mutex.Lock();
+ if (data.flush)
+ unflushed = 0;
+ else if (data.le)
+ unflushed++;
+ }
+
+ submit_mutex.Unlock();
+}
+
+/**
+ * Arrange for `c` to be completed once the journal has been flushed.
+ *
+ * If events are still queued for the submit thread, a marker entry carrying
+ * `c` is appended to the last pending list and the submit thread is woken.
+ * Otherwise, when `c` is non-null, it is wrapped in a C_IO_Wrapper and
+ * handed directly to the journaler's flush waiters.
+ */
+void MDLog::wait_for_safe(MDSContext *c)
+{
+  bool no_pending = true;
+  {
+    // Scoped lock (as in kick_submitter()) so submit_mutex is released on
+    // every exit path, including an exception while queuing the marker.
+    std::lock_guard l(submit_mutex);
+    if (!pending_events.empty()) {
+      pending_events.rbegin()->second.push_back(PendingEvent(NULL, c));
+      no_pending = false;
+      submit_cond.Signal();
+    }
+  }
+
+  // The journaler is called without submit_mutex held.
+  if (no_pending && c)
+    journaler->wait_for_flush(new C_IO_Wrapper(mds, c));
+}
+
+/**
+ * Request a journal flush.
+ *
+ * The unflushed counter is reset.  If events are still queued for the
+ * submit thread, a flush marker is appended behind them and the submit
+ * thread is woken, so the flush happens after those events are appended.
+ * Otherwise the journaler is flushed directly, but only if something was
+ * unflushed.
+ */
+void MDLog::flush()
+{
+  bool do_flush;
+  {
+    // Scoped lock (as in kick_submitter()) so submit_mutex is released on
+    // every exit path, including an exception while queuing the marker.
+    std::lock_guard l(submit_mutex);
+
+    do_flush = unflushed > 0;
+    unflushed = 0;
+    if (!pending_events.empty()) {
+      pending_events.rbegin()->second.push_back(PendingEvent(NULL, NULL, true));
+      do_flush = false;
+      submit_cond.Signal();
+    }
+  }
+
+  // The journaler is called without submit_mutex held.
+  if (do_flush)
+    journaler->flush();
+}
+
+// Signal submit_cond while holding submit_mutex.
+void MDLog::kick_submitter()
+{
+ std::lock_guard l(submit_mutex);
+ submit_cond.Signal();
+}
+
+// Mark the log as capped.  _submit_entry() asserts the log is not capped.
+void MDLog::cap()
+{
+ dout(5) << "cap" << dendl;
+ capped = true;
+}
+
+/**
+ * Stop the log's helper threads and shut down the journaler.
+ *
+ * Must be called with mds_lock held (asserted).  The lock is dropped
+ * temporarily around waking the submit thread and around joining the
+ * replay and recovery threads.  A thread is never joined from itself.
+ */
+void MDLog::shutdown()
+{
+ ceph_assert(mds->mds_lock.is_locked_by_me());
+
+ dout(5) << "shutdown" << dendl;
+ if (submit_thread.is_started()) {
+ ceph_assert(mds->is_daemon_stopping());
+
+ if (submit_thread.am_self()) {
+ // Called suicide from the thread: trust it to do no work after
+ // returning from suicide, and subsequently respect mds->is_daemon_stopping()
+ // and fall out of its loop.
+ } else {
+ mds->mds_lock.Unlock();
+ // Because MDS::stopping is true, it's safe to drop mds_lock: nobody else
+ // picking it up will do anything with it.
+
+ // Wake the submit thread so it re-checks is_daemon_stopping().
+ submit_mutex.Lock();
+ submit_cond.Signal();
+ submit_mutex.Unlock();
+
+ mds->mds_lock.Lock();
+
+ submit_thread.join();
+ }
+ }
+
+ // Replay thread can be stuck inside e.g. Journaler::wait_for_readable,
+ // so we need to shutdown the journaler first.
+ if (journaler) {
+ journaler->shutdown();
+ }
+
+ if (replay_thread.is_started() && !replay_thread.am_self()) {
+ mds->mds_lock.Unlock();
+ replay_thread.join();
+ mds->mds_lock.Lock();
+ }
+
+ if (recovery_thread.is_started() && !recovery_thread.am_self()) {
+ mds->mds_lock.Unlock();
+ recovery_thread.join();
+ mds->mds_lock.Lock();
+ }
+}
+
+
+// -----------------------------
+// segments
+
+// Create a new segment and journal its subtree map, with no completion.
+void MDLog::_start_new_segment()
+{
+ _prepare_new_segment();
+ _journal_segment_subtree_map(NULL);
+}
+
+/**
+ * Register a new LogSegment whose sequence number is event_seq + 1, update
+ * the segment perf counters and advance the cache to its next stray
+ * directory.  Caller must hold submit_mutex (asserted).
+ */
+void MDLog::_prepare_new_segment()
+{
+  ceph_assert(submit_mutex.is_locked_by_me());
+
+  const uint64_t next_seq = event_seq + 1;
+  dout(7) << __func__ << " seq " << next_seq << dendl;
+
+  LogSegment *fresh = new LogSegment(next_seq);
+  segments[next_seq] = fresh;
+
+  logger->inc(l_mdl_segadd);
+  logger->set(l_mdl_seg, segments.size());
+
+  // Adjust to next stray dir
+  dout(10) << "Advancing to next stray directory on mds " << mds->get_nodeid()
+           << dendl;
+  mds->mdcache->advance_stray();
+}
+
+/**
+ * Build a subtree map event from the cache, tag it with the last segment's
+ * sequence number and submit it.  `onsync` (may be NULL) is wrapped in a
+ * C_MDL_Flushed and used as the entry's completion.  Caller must hold
+ * submit_mutex (asserted).
+ */
+void MDLog::_journal_segment_subtree_map(MDSContext *onsync)
+{
+  ceph_assert(submit_mutex.is_locked_by_me());
+
+  dout(7) << __func__ << dendl;
+
+  ESubtreeMap *map_event = mds->mdcache->create_subtree_map();
+  map_event->event_seq = get_last_segment_seq();
+
+  MDSLogContextBase *on_flushed = new C_MDL_Flushed(this, onsync);
+  _submit_entry(map_event, on_flushed);
+}
+
+// Completion handed to OpenFileTable::commit() by MDLog::trim(): on finish
+// it asks the log to trim expired segments.  The result code is ignored and
+// `seq` is stored but not read in this class.
+class C_OFT_Committed : public MDSInternalContext {
+  MDLog *mdlog;
+  uint64_t seq;
+public:
+  C_OFT_Committed(MDLog *l, uint64_t s)
+    : MDSInternalContext(l->mds),
+      mdlog(l),
+      seq(s)
+  {}
+
+  void finish(int ret) override {
+    mdlog->trim_expired_segments();
+  }
+};
+
+/**
+ * Try to expire old log segments until the log fits within the configured
+ * segment/event limits.
+ *
+ * @param m  when >= 0, overrides mds_log_max_events for this call.
+ *
+ * Does nothing on a read-only filesystem or when there are no segments.
+ * Walks segments from the oldest, stopping at the first one that still has
+ * pending events or is not yet safe, and calls try_expire() on each segment
+ * that is neither expiring nor expired.  submit_mutex is dropped around
+ * try_expire() and around the open file table commit.  Finishes by calling
+ * _trim_expired_segments(), which also releases submit_mutex.
+ */
+void MDLog::trim(int m)
+{
+ unsigned max_segments = g_conf()->mds_log_max_segments;
+ int max_events = g_conf()->mds_log_max_events;
+ if (m >= 0)
+ max_events = m;
+
+ if (mds->mdcache->is_readonly()) {
+ dout(10) << "trim, ignoring read-only FS" << dendl;
+ return;
+ }
+
+ // Clamp max_events to not be smaller than events per segment
+ if (max_events > 0 && max_events <= g_conf()->mds_log_events_per_segment) {
+ max_events = g_conf()->mds_log_events_per_segment + 1;
+ }
+
+ submit_mutex.Lock();
+
+ // trim!
+ dout(10) << "trim "
+ << segments.size() << " / " << max_segments << " segments, "
+ << num_events << " / " << max_events << " events"
+ << ", " << expiring_segments.size() << " (" << expiring_events << ") expiring"
+ << ", " << expired_segments.size() << " (" << expired_events << ") expired"
+ << dendl;
+
+ if (segments.empty()) {
+ submit_mutex.Unlock();
+ return;
+ }
+
+ // hack: only trim for a few seconds at a time
+ utime_t stop = ceph_clock_now();
+ stop += 2.0;
+
+ // Scale the expire op priority from LOW towards HIGH as the number of
+ // expiring segments approaches max_segments; capped at HIGH.
+ int op_prio = CEPH_MSG_PRIO_LOW +
+ (CEPH_MSG_PRIO_HIGH - CEPH_MSG_PRIO_LOW) *
+ expiring_segments.size() / max_segments;
+ if (op_prio > CEPH_MSG_PRIO_HIGH)
+ op_prio = CEPH_MSG_PRIO_HIGH;
+
+ unsigned new_expiring_segments = 0;
+
+ // Cap on concurrently expiring segments; 0 means no cap.  Only applied
+ // when pre_segments_size is set.
+ unsigned max_expiring_segments = 0;
+ if (pre_segments_size > 0){
+ max_expiring_segments = max_segments/2;
+ assert(segments.size() >= pre_segments_size);
+ max_expiring_segments = std::max<unsigned>(max_expiring_segments,segments.size() - pre_segments_size);
+ }
+
+ map<uint64_t,LogSegment*>::iterator p = segments.begin();
+ while (p != segments.end()) {
+ if (stop < ceph_clock_now())
+ break;
+
+ // Stop once both the segment and the event limits are satisfied.
+ unsigned num_remaining_segments = (segments.size() - expired_segments.size() - expiring_segments.size());
+ if ((num_remaining_segments <= max_segments) &&
+ (max_events < 0 || num_events - expiring_events - expired_events <= max_events))
+ break;
+
+ // Do not trim too many segments at once for peak workload. If mds keeps creating N segments each tick,
+ // the upper bound of 'num_remaining_segments - max_segments' is '2 * N'
+ if (new_expiring_segments * 2 > num_remaining_segments)
+ break;
+
+ if (max_expiring_segments > 0 &&
+ expiring_segments.size() >= max_expiring_segments)
+ break;
+
+ // look at first segment
+ LogSegment *ls = p->second;
+ ceph_assert(ls);
+ ++p;
+
+ // NOTE(review): the condition compares against the safe_pos member while
+ // the message prints journaler->get_write_safe_pos() -- confirm these
+ // are meant to be the same value.
+ if (pending_events.count(ls->seq) ||
+ ls->end > safe_pos) {
+ dout(5) << "trim segment " << ls->seq << "/" << ls->offset << ", not fully flushed yet, safe "
+ << journaler->get_write_safe_pos() << " < end " << ls->end << dendl;
+ break;
+ }
+
+ if (expiring_segments.count(ls)) {
+ dout(5) << "trim already expiring segment " << ls->seq << "/" << ls->offset
+ << ", " << ls->num_events << " events" << dendl;
+ } else if (expired_segments.count(ls)) {
+ dout(5) << "trim already expired segment " << ls->seq << "/" << ls->offset
+ << ", " << ls->num_events << " events" << dendl;
+ } else {
+ ceph_assert(expiring_segments.count(ls) == 0);
+ new_expiring_segments++;
+ expiring_segments.insert(ls);
+ expiring_events += ls->num_events;
+ submit_mutex.Unlock();
+
+ uint64_t last_seq = ls->seq;
+ try_expire(ls, op_prio);
+
+ // The lock was dropped, so re-seek the iterator by sequence number
+ // rather than trusting the old one.
+ submit_mutex.Lock();
+ p = segments.lower_bound(last_seq + 1);
+ }
+ }
+
+ // Commit the open file table when it is dirty or behind the last segment,
+ // unless the log is capped or a commit is already in flight.
+ if (!capped &&
+ !mds->mdcache->open_file_table.is_any_committing()) {
+ uint64_t last_seq = get_last_segment_seq();
+ if (mds->mdcache->open_file_table.is_any_dirty() ||
+ last_seq > mds->mdcache->open_file_table.get_committed_log_seq()) {
+ submit_mutex.Unlock();
+ mds->mdcache->open_file_table.commit(new C_OFT_Committed(this, last_seq),
+ last_seq, CEPH_MSG_PRIO_HIGH);
+ submit_mutex.Lock();
+ }
+ }
+
+ // discard expired segments and unlock submit_mutex
+ _trim_expired_segments();
+}
+
+class C_MaybeExpiredSegment : public MDSInternalContext { // finisher installed on a segment's expiry gather by MDLog::try_expire
+ MDLog *mdlog; // log that owns the segment
+ LogSegment *ls; // segment whose expiry is re-attempted
+ int op_prio; // priority forwarded to the re-attempt
+ public:
+ C_MaybeExpiredSegment(MDLog *log, LogSegment *segment, int prio)
+ : MDSInternalContext(log->mds), mdlog(log), ls(segment), op_prio(prio) {}
+ void finish(int res) override {
+ const bool failed = res < 0;
+ if (failed) mdlog->mds->handle_write_error(res); // reported first; the re-check below is still issued if this returns
+ mdlog->_maybe_expired(ls, op_prio);
+ }
+};
+
+/**
+ * Like MDLog::trim, but instead of trimming to max_segments, try to expire every flushed
+ * segment older than the latest one. Returns 0, or -EAGAIN if a segment has pending events.
+ */
+int MDLog::trim_all()
+{
+ submit_mutex.Lock(); // released on every return path, either here or inside _trim_expired_segments()
+
+ dout(10) << __func__ << ": "
+ << segments.size()
+ << "/" << expiring_segments.size()
+ << "/" << expired_segments.size() << dendl;
+
+ uint64_t last_seq = 0; // stays 0 when there are no segments, which makes the loop below a no-op
+ if (!segments.empty()) {
+ last_seq = get_last_segment_seq();
+ if (!capped &&
+ !mds->mdcache->open_file_table.is_any_committing() &&
+ last_seq > mds->mdcache->open_file_table.get_committing_log_seq()) { // NOTE(review): trim() compares against get_committed_log_seq() and also checks is_any_dirty() - confirm the difference is intended
+ submit_mutex.Unlock(); // the commit is issued without submit_mutex held
+ mds->mdcache->open_file_table.commit(new C_OFT_Committed(this, last_seq),
+ last_seq, CEPH_MSG_PRIO_DEFAULT);
+ submit_mutex.Lock();
+ }
+ }
+
+ map<uint64_t,LogSegment*>::iterator p = segments.begin();
+ while (p != segments.end() &&
+ p->first < last_seq &&
+ p->second->end < safe_pos) { // next segment should have been started
+ LogSegment *ls = p->second;
+ ++p; // advance now; p is re-derived with lower_bound() if the lock is dropped below
+
+ // Caller should have flushed journaler before calling this
+ if (pending_events.count(ls->seq)) {
+ dout(5) << __func__ << ": segment " << ls->seq << " has pending events" << dendl;
+ submit_mutex.Unlock();
+ return -EAGAIN;
+ }
+
+ if (expiring_segments.count(ls)) {
+ dout(5) << "trim already expiring segment " << ls->seq << "/" << ls->offset
+ << ", " << ls->num_events << " events" << dendl;
+ } else if (expired_segments.count(ls)) {
+ dout(5) << "trim already expired segment " << ls->seq << "/" << ls->offset
+ << ", " << ls->num_events << " events" << dendl;
+ } else {
+ ceph_assert(expiring_segments.count(ls) == 0);
+ expiring_segments.insert(ls);
+ expiring_events += ls->num_events;
+ submit_mutex.Unlock(); // try_expire() takes submit_mutex itself when the segment expires immediately
+
+ uint64_t next_seq = ls->seq + 1; // read before try_expire() so ls is not touched afterwards
+ try_expire(ls, CEPH_MSG_PRIO_DEFAULT);
+
+ submit_mutex.Lock();
+ p = segments.lower_bound(next_seq); // segments may have changed while unlocked; look the next one up again
+ }
+ }
+
+ _trim_expired_segments(); // drops submit_mutex before returning
+
+ return 0;
+}
+
+
+void MDLog::try_expire(LogSegment *ls, int op_prio)
+{
+ MDSGatherBuilder pending(g_ceph_context); // collects whatever the segment still has to wait for
+ ls->try_to_expire(mds, pending, op_prio);
+
+ if (!pending.has_subs()) { // nothing outstanding: move the segment from expiring to expired right away
+ dout(10) << "try_expire expired segment " << ls->seq << "/" << ls->offset << dendl;
+ submit_mutex.Lock();
+ ceph_assert(expiring_segments.count(ls));
+ expiring_segments.erase(ls);
+ expiring_events -= ls->num_events;
+ _expired(ls);
+ submit_mutex.Unlock();
+ } else { // still waiting: C_MaybeExpiredSegment calls _maybe_expired() when the gather finishes
+ dout(5) << "try_expire expiring segment " << ls->seq << "/" << ls->offset << dendl;
+ pending.set_finisher(new C_MaybeExpiredSegment(this, ls, op_prio));
+ pending.activate();
+ }
+
+ logger->set(l_mdl_segexg, expiring_segments.size()); // gauges are refreshed on both paths, outside submit_mutex
+ logger->set(l_mdl_evexg, expiring_events);
+}
+
+void MDLog::_maybe_expired(LogSegment *ls, int op_prio)
+{
+ const bool fs_readonly = mds->mdcache->is_readonly();
+ if (!fs_readonly) {
+ dout(10) << "_maybe_expired segment " << ls->seq << "/" << ls->offset
+ << ", " << ls->num_events << " events" << dendl;
+ try_expire(ls, op_prio); // run the expiry attempt again for this segment
+ } else {
+ dout(10) << "_maybe_expired, ignoring read-only FS" << dendl;
+ }
+}
+
+void MDLog::_trim_expired_segments()
+{ // Delete expired segments from the front of the log. Must be entered with submit_mutex held; the mutex is released before returning.
+ ceph_assert(submit_mutex.is_locked_by_me());
+
+ uint64_t oft_committed_seq = mds->mdcache->open_file_table.get_committed_log_seq();
+
+ // trim expired segments?
+ bool trimmed = false;
+ while (!segments.empty()) {
+ LogSegment *ls = segments.begin()->second; // always the oldest remaining segment
+ if (!expired_segments.count(ls)) {
+ dout(10) << "_trim_expired_segments waiting for " << ls->seq << "/" << ls->offset
+ << " to expire" << dendl;
+ break;
+ }
+
+ if (!capped && ls->seq >= oft_committed_seq) { // unless capped, only trim segments below the open file table's committed log seq
+ dout(10) << "_trim_expired_segments open file table committedseq " << oft_committed_seq
+ << " <= " << ls->seq << "/" << ls->offset << dendl;
+ break;
+ }
+
+ dout(10) << "_trim_expired_segments trimming expired "
+ << ls->seq << "/0x" << std::hex << ls->offset << std::dec << dendl;
+ expired_events -= ls->num_events;
+ expired_segments.erase(ls);
+ if (pre_segments_size > 0)
+ pre_segments_size--;
+ num_events -= ls->num_events;
+
+ // this was the oldest segment, adjust expire pos
+ if (journaler->get_expire_pos() < ls->end) {
+ journaler->set_expire_pos(ls->end);
+ logger->set(l_mdl_expos, ls->end);
+ } else {
+ logger->set(l_mdl_expos, ls->offset); // NOTE(review): only the perf counter is updated here, the journaler's expire_pos is left as is
+ }
+
+ logger->inc(l_mdl_segtrm);
+ logger->inc(l_mdl_evtrm, ls->num_events);
+
+ segments.erase(ls->seq);
+ delete ls;
+ trimmed = true;
+ }
+
+ submit_mutex.Unlock(); // callers rely on this function dropping the lock
+
+ if (trimmed)
+ journaler->write_head(0); // rewrite the journal head after trimming; 0 is passed where other callers pass a completion context
+}
+
+void MDLog::trim_expired_segments()
+{ // Locking wrapper around _trim_expired_segments() for callers that do not hold submit_mutex.
+ submit_mutex.Lock();
+ _trim_expired_segments(); // releases submit_mutex before it returns, hence no Unlock() here
+}
+
+void MDLog::_expired(LogSegment *ls)
+{
+ ceph_assert(submit_mutex.is_locked_by_me());
+
+ dout(5) << "_expired segment " << ls->seq << "/" << ls->offset
+ << ", " << ls->num_events << " events" << dendl;
+
+ // While the log is not capped, the current segment is never marked expired.
+ const bool is_open_tail = !capped && (ls == peek_current_segment());
+ if (!is_open_tail) {
+ // Record the segment as expired and account for its events.
+ expired_segments.insert(ls);
+ expired_events += ls->num_events;
+ finish_contexts(g_ceph_context, ls->expiry_waiters); // complete everything queued on this segment's expiry
+ logger->inc(l_mdl_evex, ls->num_events);
+ logger->inc(l_mdl_segex);
+ } else {
+ dout(5) << "_expired not expiring " << ls->seq << "/" << ls->offset
+ << ", last one and !capped" << dendl;
+ }
+
+ // Refresh the gauges on both paths.
+ logger->set(l_mdl_ev, num_events);
+ logger->set(l_mdl_evexd, expired_events);
+ logger->set(l_mdl_seg, segments.size());
+ logger->set(l_mdl_segexd, expired_segments.size());
+}
+
+
+
+void MDLog::replay(MDSContext *c)
+{
+ ceph_assert(journaler->is_active());
+ ceph_assert(journaler->is_readonly());
+
+ // Nothing between read_pos and write_pos: finish immediately, no thread needed.
+ const bool journal_empty = (journaler->get_read_pos() == journaler->get_write_pos());
+ if (journal_empty) {
+ dout(10) << "replay - journal empty, done." << dendl;
+ mds->mdcache->trim();
+ if (mds->is_standby_replay()) {
+ mds->update_mlogger();
+ }
+ if (c)
+ c->complete(0);
+ return;
+ }
+
+ // Otherwise queue the caller's context; _replay_thread completes waitfor_replay at the end.
+ if (c) {
+ waitfor_replay.push_back(c);
+ }
+
+ dout(10) << "replay start, from " << journaler->get_read_pos()
+ << " to " << journaler->get_write_pos() << dendl;
+
+ // Events may already be counted only if an earlier replay ran.
+ ceph_assert(num_events == 0 || already_replayed);
+ if (already_replayed)
+ replay_thread.join(); // a previous ReplayThread must be joined before another one is created
+ already_replayed = true;
+
+ // Hand the actual work to the replay thread.
+ replay_thread.create("md_log_replay");
+}
+
+
+/**
+ * Resolve the JournalPointer object to a journal file, and
+ * instantiate a Journaler object. This may re-write the journal
+ * if the journal in RADOS appears to be in an old format.
+ *
+ * This runs in a separate thread because it is started from inside the mds
+ * lock, which is also the global objecter lock: rather than split it up into
+ * hard-to-read async operations linked up by contexts, it blocks on each step.
+ *
+ * On return `completion` has been completed under mds_lock (0, -EAGAIN or -EINVAL)
+ * unless the daemon is stopping; on 0 `journaler` points at the usable journal.
+ */
+void MDLog::_recovery_thread(MDSContext *completion)
+{
+ ceph_assert(journaler == NULL);
+ if (g_conf()->mds_journal_format > JOURNAL_FORMAT_MAX) {
+ dout(0) << "Configuration value for mds_journal_format is out of bounds, max is "
+ << JOURNAL_FORMAT_MAX << dendl;
+
+ // Oh dear, something unreadable in the store for this rank: require
+ // operator intervention.
+ mds->damaged();
+ ceph_abort(); // damaged should not return
+ }
+
+ // First, read the pointer object.
+ // If the pointer object is not present, then create it with
+ // front = default ino and back = null
+ JournalPointer jp(mds->get_nodeid(), mds->mdsmap->get_metadata_pool());
+ const int read_result = jp.load(mds->objecter);
+ if (read_result == -ENOENT) {
+ inodeno_t const default_log_ino = MDS_INO_LOG_OFFSET + mds->get_nodeid();
+ jp.front = default_log_ino;
+ int write_result = jp.save(mds->objecter);
+ // Nothing graceful we can do for this
+ ceph_assert(write_result >= 0);
+ } else if (read_result == -EBLACKLISTED) {
+ derr << "Blacklisted during JournalPointer read! Respawning..." << dendl;
+ mds->respawn();
+ ceph_abort(); // Should be unreachable because respawn calls execv
+ } else if (read_result != 0) {
+ mds->clog->error() << "failed to read JournalPointer: " << read_result
+ << " (" << cpp_strerror(read_result) << ")";
+ mds->damaged_unlocked();
+ ceph_abort(); // Should be unreachable because damaged() calls respawn()
+ }
+
+ // If the back pointer is non-null, that means that a journal
+ // rewrite failed part way through. Erase the back journal
+ // to clean up.
+ if (jp.back) {
+ if (mds->is_standby_replay()) {
+ dout(1) << "Journal " << jp.front << " is being rewritten, "
+ << "cannot replay in standby until an active MDS completes rewrite" << dendl;
+ std::lock_guard l(mds->mds_lock);
+ if (mds->is_daemon_stopping()) {
+ return;
+ }
+ completion->complete(-EAGAIN);
+ return;
+ }
+ dout(1) << "Erasing journal " << jp.back << dendl;
+ C_SaferCond erase_waiter;
+ Journaler back("mdlog", jp.back, mds->mdsmap->get_metadata_pool(),
+ CEPH_FS_ONDISK_MAGIC, mds->objecter, logger, l_mdl_jlat,
+ mds->finisher);
+
+ // Read all about this journal (header + extents)
+ C_SaferCond recover_wait;
+ back.recover(&recover_wait);
+ int recovery_result = recover_wait.wait();
+ if (recovery_result == -EBLACKLISTED) {
+ derr << "Blacklisted during journal recovery! Respawning..." << dendl;
+ mds->respawn();
+ ceph_abort(); // Should be unreachable because respawn calls execv
+ } else if (recovery_result != 0) {
+ // Journaler.recover succeeds if no journal objects are present: an error
+ // means something worse like a corrupt header, which we can't handle here.
+ mds->clog->error() << "Error recovering journal " << jp.back << ": " // it is the back journal that failed, so name it (was jp.front)
+ << cpp_strerror(recovery_result);
+ mds->damaged_unlocked();
+ ceph_assert(recovery_result == 0); // Unreachable because damaged() calls respawn()
+ }
+
+ // We could read journal, so we can erase it.
+ back.erase(&erase_waiter);
+ int erase_result = erase_waiter.wait();
+
+ // If we are successful, or find no data, we can update the JournalPointer to
+ // reflect that the back journal is gone.
+ if (erase_result != 0 && erase_result != -ENOENT) {
+ derr << "Failed to erase journal " << jp.back << ": " << cpp_strerror(erase_result) << dendl; // not fatal: jp.back stays set and startup continues with the front journal
+ } else {
+ dout(1) << "Successfully erased journal, updating journal pointer" << dendl;
+ jp.back = 0;
+ int write_result = jp.save(mds->objecter);
+ // Nothing graceful we can do for this
+ ceph_assert(write_result >= 0);
+ }
+ }
+
+ /* Read the header from the front journal */
+ Journaler *front_journal = new Journaler("mdlog", jp.front,
+ mds->mdsmap->get_metadata_pool(), CEPH_FS_ONDISK_MAGIC, mds->objecter,
+ logger, l_mdl_jlat, mds->finisher);
+
+ // Assign to ::journaler so that we can be aborted by ::shutdown while
+ // waiting for journaler recovery
+ {
+ std::lock_guard l(mds->mds_lock);
+ journaler = front_journal;
+ }
+
+ C_SaferCond recover_wait;
+ front_journal->recover(&recover_wait);
+ dout(4) << "Waiting for journal " << jp.front << " to recover..." << dendl;
+ int recovery_result = recover_wait.wait();
+ dout(4) << "Journal " << jp.front << " recovered." << dendl;
+
+ if (recovery_result == -EBLACKLISTED) {
+ derr << "Blacklisted during journal recovery! Respawning..." << dendl;
+ mds->respawn();
+ ceph_abort(); // Should be unreachable because respawn calls execv
+ } else if (recovery_result != 0) {
+ mds->clog->error() << "Error recovering journal " << jp.front << ": "
+ << cpp_strerror(recovery_result);
+ mds->damaged_unlocked();
+ ceph_assert(recovery_result == 0); // Unreachable because damaged() calls respawn()
+ }
+
+ /* Check whether the front journal format is acceptable or needs re-write */
+ if (front_journal->get_stream_format() > JOURNAL_FORMAT_MAX) {
+ dout(0) << "Journal " << jp.front << " is in unknown format " << front_journal->get_stream_format()
+ << ", does this MDS daemon require upgrade?" << dendl;
+ {
+ std::lock_guard l(mds->mds_lock);
+ if (mds->is_daemon_stopping()) {
+ journaler = NULL;
+ delete front_journal;
+ return;
+ }
+ completion->complete(-EINVAL);
+ }
+ } else if (mds->is_standby_replay() || front_journal->get_stream_format() >= g_conf()->mds_journal_format) {
+ /* The journal is of configured format, or we are in standbyreplay and will
+ * tolerate replaying old journals until we have to go active. Use front_journal as
+ * our journaler attribute and complete */
+ dout(4) << "Recovered journal " << jp.front << " in format " << front_journal->get_stream_format() << dendl;
+ journaler->set_write_error_handler(new C_MDL_WriteError(this));
+ {
+ std::lock_guard l(mds->mds_lock);
+ if (mds->is_daemon_stopping()) {
+ return;
+ }
+ completion->complete(0);
+ }
+ } else {
+ /* Hand off to reformat routine, which will ultimately set the
+ * completion when it has done its thing */
+ dout(1) << "Journal " << jp.front << " has old format "
+ << front_journal->get_stream_format() << ", it will now be updated" << dendl;
+ _reformat_journal(jp, front_journal, completion);
+ }
+}
+
+/**
+ * Blocking rewrite of the journal to a new file, followed by
+ * swap of journal pointer to point to the new one.
+ *
+ * We write the new journal to the 'back' journal from the JournalPointer,
+ * swapping pointers to make that one the front journal only when we have
+ * safely completed.
+ */
+void MDLog::_reformat_journal(JournalPointer const &jp_in, Journaler *old_journal, MDSContext *completion)
+{
+ ceph_assert(!jp_in.is_null());
+ ceph_assert(completion != NULL);
+ ceph_assert(old_journal != NULL);
+
+ JournalPointer jp = jp_in; // local copy that is modified and saved step by step; jp_in itself is left untouched
+
+ /* Set JournalPointer.back to the location we will write the new journal */
+ inodeno_t primary_ino = MDS_INO_LOG_OFFSET + mds->get_nodeid();
+ inodeno_t secondary_ino = MDS_INO_LOG_BACKUP_OFFSET + mds->get_nodeid();
+ jp.back = (jp.front == primary_ino ? secondary_ino : primary_ino); // write to whichever of the two inos the front journal is not using
+ int write_result = jp.save(mds->objecter);
+ ceph_assert(write_result == 0);
+
+ /* Create the new Journaler file */
+ Journaler *new_journal = new Journaler("mdlog", jp.back,
+ mds->mdsmap->get_metadata_pool(), CEPH_FS_ONDISK_MAGIC, mds->objecter, logger, l_mdl_jlat, mds->finisher);
+ dout(4) << "Writing new journal header " << jp.back << dendl;
+ file_layout_t new_layout = old_journal->get_layout();
+ new_journal->set_writeable();
+ new_journal->create(&new_layout, g_conf()->mds_journal_format);
+
+ /* Write the new journal header to RADOS */
+ C_SaferCond write_head_wait;
+ new_journal->write_head(&write_head_wait);
+ write_head_wait.wait(); // NOTE(review): the result of the header write is discarded
+
+ // Read in the old journal, and whenever we have readable events,
+ // write them to the new journal.
+ int r = 0; // journaler error from the transcription loop; asserted to be 0 after the flush below
+
+ // In old format journals before event_seq was introduced, the serialized
+ // offset of a SubtreeMap message in the log is used as the unique ID for
+ // a log segment. Because we change serialization, this will end up changing
+ // for us, so we have to explicitly update the fields that point back to that
+ // log segment.
+ std::map<LogSegment::seq_t, LogSegment::seq_t> segment_pos_rewrite;
+
+ // The logic in here borrowed from replay_thread expects mds_lock to be held,
+ // e.g. between checking readable and doing wait_for_readable so that journaler
+ // state doesn't change in between.
+ uint32_t events_transcribed = 0;
+ while (1) {
+ while (!old_journal->is_readable() &&
+ old_journal->get_read_pos() < old_journal->get_write_pos() &&
+ !old_journal->get_error()) {
+
+ // Issue a journal prefetch
+ C_SaferCond readable_waiter;
+ old_journal->wait_for_readable(&readable_waiter);
+
+ // Wait for a journal prefetch to complete
+ readable_waiter.wait();
+ }
+ if (old_journal->get_error()) {
+ r = old_journal->get_error();
+ dout(0) << "_replay journaler got error " << r << ", aborting" << dendl;
+ break;
+ }
+
+ if (!old_journal->is_readable() &&
+ old_journal->get_read_pos() == old_journal->get_write_pos())
+ break; // reached the end of the old journal
+
+ // Read one serialized LogEvent
+ ceph_assert(old_journal->is_readable());
+ bufferlist bl;
+ uint64_t le_pos = old_journal->get_read_pos();
+ bool r = old_journal->try_read_entry(bl); // NOTE(review): shadows the outer int r for the rest of this iteration
+ if (!r && old_journal->get_error())
+ continue; // loop around so the get_error() check above records the error and breaks
+ ceph_assert(r);
+
+ // Update segment_pos_rewrite
+ auto le = LogEvent::decode_event(bl.cbegin());
+ if (le) {
+ bool modified = false; // set when the event must be re-encoded before it is written out
+
+ if (le->get_type() == EVENT_SUBTREEMAP ||
+ le->get_type() == EVENT_RESETJOURNAL) {
+ auto sle = dynamic_cast<ESubtreeMap*>(le.get());
+ if (sle == NULL || sle->event_seq == 0) {
+ // A non-explicit event seq: the effective sequence number
+ // of this segment is its position in the old journal and
+ // the new effective sequence number will be its position
+ // in the new journal.
+ segment_pos_rewrite[le_pos] = new_journal->get_write_pos();
+ dout(20) << __func__ << " discovered segment seq mapping "
+ << le_pos << " -> " << new_journal->get_write_pos() << dendl;
+ }
+ } else {
+ event_seq++; // NOTE(review): advances the MDLog member event_seq while transcribing - confirm this is intended
+ }
+
+ // Rewrite segment references if necessary
+ EMetaBlob *blob = le->get_metablob();
+ if (blob) {
+ modified = blob->rewrite_truncate_finish(mds, segment_pos_rewrite);
+ }
+
+ // Zero-out expire_pos in subtreemap because offsets have changed
+ // (expire_pos is just an optimization so it's safe to eliminate it)
+ if (le->get_type() == EVENT_SUBTREEMAP
+ || le->get_type() == EVENT_SUBTREEMAP_TEST) {
+ auto& sle = dynamic_cast<ESubtreeMap&>(*le);
+ dout(20) << __func__ << " zeroing expire_pos in subtreemap event at "
+ << le_pos << " seq=" << sle.event_seq << dendl;
+ sle.expire_pos = 0;
+ modified = true;
+ }
+
+ if (modified) {
+ bl.clear();
+ le->encode_with_header(bl, mds->mdsmap->get_up_features());
+ }
+ } else {
+ // Failure from LogEvent::decode, our job is to change the journal wrapper,
+ // not validate the contents, so pass it through.
+ dout(1) << __func__ << " transcribing un-decodable LogEvent at old position "
+ << old_journal->get_read_pos() << ", new position " << new_journal->get_write_pos()
+ << dendl;
+ }
+
+ // Write (buffered, synchronous) one serialized LogEvent
+ events_transcribed += 1;
+ new_journal->append_entry(bl);
+ }
+
+ dout(1) << "Transcribed " << events_transcribed << " events, flushing new journal" << dendl;
+ C_SaferCond flush_waiter;
+ new_journal->flush(&flush_waiter);
+ flush_waiter.wait(); // NOTE(review): the flush result is discarded
+
+ // If failed to rewrite journal, leave the part written journal
+ // as garbage to be cleaned up next startup.
+ ceph_assert(r == 0);
+
+ /* Now that the new journal is safe, we can flip the pointers */
+ inodeno_t const tmp = jp.front;
+ jp.front = jp.back;
+ jp.back = tmp;
+ write_result = jp.save(mds->objecter);
+ ceph_assert(write_result == 0);
+
+ /* Delete the old journal to free space */
+ dout(1) << "New journal flushed, erasing old journal" << dendl;
+ C_SaferCond erase_waiter;
+ old_journal->erase(&erase_waiter);
+ int erase_result = erase_waiter.wait();
+ ceph_assert(erase_result == 0);
+ {
+ std::lock_guard l(mds->mds_lock);
+ if (mds->is_daemon_stopping()) {
+ delete new_journal; // old_journal is still referenced by `journaler` and is not deleted on this path
+ return;
+ }
+ ceph_assert(journaler == old_journal);
+ journaler = NULL;
+ delete old_journal;
+ }
+
+ /* Update the pointer to reflect we're back in clean single journal state. */
+ jp.back = 0;
+ write_result = jp.save(mds->objecter);
+ ceph_assert(write_result == 0);
+
+ /* Reset the Journaler object to its default state */
+ dout(1) << "Journal rewrite complete, continuing with normal startup" << dendl;
+ {
+ std::lock_guard l(mds->mds_lock);
+ if (mds->is_daemon_stopping()) {
+ delete new_journal;
+ return;
+ }
+ journaler = new_journal;
+ journaler->set_readonly();
+ journaler->set_write_error_handler(new C_MDL_WriteError(this));
+ }
+
+ /* Trigger completion */
+ {
+ std::lock_guard l(mds->mds_lock);
+ if (mds->is_daemon_stopping()) {
+ return; // completion is not invoked when the daemon is stopping
+ }
+ completion->complete(0);
+ }
+}
+
+
+// i am a separate thread: read the journal from read_pos to write_pos, replay each event, then complete waitfor_replay
+void MDLog::_replay_thread()
+{
+ dout(10) << "_replay_thread start" << dendl;
+
+ // loop
+ int r = 0; // result passed to the waitfor_replay contexts at the end
+ while (1) {
+ // wait for read?
+ while (!journaler->is_readable() &&
+ journaler->get_read_pos() < journaler->get_write_pos() &&
+ !journaler->get_error()) {
+ C_SaferCond readable_waiter;
+ journaler->wait_for_readable(&readable_waiter);
+ r = readable_waiter.wait();
+ }
+ if (journaler->get_error()) {
+ r = journaler->get_error();
+ dout(0) << "_replay journaler got error " << r << ", aborting" << dendl;
+ if (r == -ENOENT) {
+ if (mds->is_standby_replay()) {
+ // journal has been trimmed by somebody else
+ r = -EAGAIN;
+ } else {
+ mds->clog->error() << "missing journal object";
+ mds->damaged_unlocked();
+ ceph_abort(); // Should be unreachable because damaged() calls respawn()
+ }
+ } else if (r == -EINVAL) {
+ if (journaler->get_read_pos() < journaler->get_expire_pos()) {
+ // this should only happen if you're following somebody else
+ if(journaler->is_readonly()) {
+ dout(0) << "expire_pos is higher than read_pos, returning EAGAIN" << dendl;
+ r = -EAGAIN;
+ } else {
+ mds->clog->error() << "invalid journaler offsets";
+ mds->damaged_unlocked();
+ ceph_abort(); // Should be unreachable because damaged() calls respawn()
+ }
+ } else {
+ /* re-read head and check it
+ * Given that replay happens in a separate thread and
+ * the MDS is going to either shut down or restart when
+ * we return this error, doing it synchronously is fine
+ * -- as long as we drop the main mds lock--. */
+ C_SaferCond reread_fin;
+ journaler->reread_head(&reread_fin);
+ int err = reread_fin.wait();
+ if (err) {
+ if (err == -ENOENT && mds->is_standby_replay()) {
+ r = -EAGAIN;
+ dout(1) << "Journal header went away while in standby replay, journal rewritten?"
+ << dendl;
+ break;
+ } else {
+ dout(0) << "got error while reading head: " << cpp_strerror(err)
+ << dendl;
+
+ mds->clog->error() << "error reading journal header";
+ mds->damaged_unlocked();
+ ceph_abort(); // Should be unreachable because damaged() calls
+ // respawn()
+ }
+ }
+ standby_trim_segments();
+ if (journaler->get_read_pos() < journaler->get_expire_pos()) {
+ dout(0) << "expire_pos is higher than read_pos, returning EAGAIN" << dendl;
+ r = -EAGAIN;
+ }
+ }
+ }
+ break; // any journaler error ends the replay loop; r carries the (possibly remapped) error
+ }
+
+ if (!journaler->is_readable() &&
+ journaler->get_read_pos() == journaler->get_write_pos())
+ break; // reached the end of the journal
+
+ ceph_assert(journaler->is_readable() || mds->is_daemon_stopping());
+
+ // read it
+ uint64_t pos = journaler->get_read_pos(); // start offset of the entry about to be read
+ bufferlist bl;
+ bool r = journaler->try_read_entry(bl); // NOTE(review): shadows the outer int r for the rest of this iteration
+ if (!r && journaler->get_error())
+ continue; // loop around so the error branch above handles it
+ ceph_assert(r);
+
+ // unpack event
+ auto le = LogEvent::decode_event(bl.cbegin());
+ if (!le) {
+ dout(0) << "_replay " << pos << "~" << bl.length() << " / " << journaler->get_write_pos()
+ << " -- unable to decode event" << dendl;
+ dout(0) << "dump of unknown or corrupt event:\n";
+ bl.hexdump(*_dout);
+ *_dout << dendl;
+
+ mds->clog->error() << "corrupt journal event at " << pos << "~"
+ << bl.length() << " / "
+ << journaler->get_write_pos();
+ if (g_conf()->mds_log_skip_corrupt_events) {
+ continue; // skip the undecodable entry and keep replaying
+ } else {
+ mds->damaged_unlocked();
+ ceph_abort(); // Should be unreachable because damaged() calls
+ // respawn()
+ }
+
+ }
+ le->set_start_off(pos);
+
+ // new segment?
+ if (le->get_type() == EVENT_SUBTREEMAP ||
+ le->get_type() == EVENT_RESETJOURNAL) {
+ auto sle = dynamic_cast<ESubtreeMap*>(le.get());
+ if (sle && sle->event_seq > 0)
+ event_seq = sle->event_seq;
+ else
+ event_seq = pos; // no explicit sequence in the event: the journal offset is used as the sequence
+ segments[event_seq] = new LogSegment(event_seq, pos);
+ logger->set(l_mdl_seg, segments.size());
+ } else {
+ event_seq++;
+ }
+
+ // have we seen an import map yet?
+ if (segments.empty()) { // events read before the first segment exists are logged and skipped, not replayed
+ dout(10) << "_replay " << pos << "~" << bl.length() << " / " << journaler->get_write_pos()
+ << " " << le->get_stamp() << " -- waiting for subtree_map. (skipping " << *le << ")" << dendl;
+ } else {
+ dout(10) << "_replay " << pos << "~" << bl.length() << " / " << journaler->get_write_pos()
+ << " " << le->get_stamp() << ": " << *le << dendl;
+ le->_segment = get_current_segment(); // replay may need this
+ le->_segment->num_events++;
+ le->_segment->end = journaler->get_read_pos();
+ num_events++;
+
+ {
+ std::lock_guard l(mds->mds_lock); // the event is replayed with mds_lock held
+ if (mds->is_daemon_stopping()) {
+ return;
+ }
+ logger->inc(l_mdl_replayed);
+ le->replay(mds);
+ }
+ }
+
+ logger->set(l_mdl_rdpos, pos);
+ }
+
+ // done!
+ if (r == 0) {
+ ceph_assert(journaler->get_read_pos() == journaler->get_write_pos());
+ dout(10) << "_replay - complete, " << num_events
+ << " events" << dendl;
+
+ logger->set(l_mdl_expos, journaler->get_expire_pos());
+ }
+
+ safe_pos = journaler->get_write_safe_pos();
+
+ dout(10) << "_replay_thread kicking waiters" << dendl;
+ {
+ std::lock_guard l(mds->mds_lock);
+ if (mds->is_daemon_stopping()) {
+ return; // waiters are not completed when the daemon is stopping
+ }
+ pre_segments_size = segments.size(); // get num of logs when replay is finished
+ finish_contexts(g_ceph_context, waitfor_replay, r);
+ }
+
+ dout(10) << "_replay_thread finish" << dendl;
+}
+
+void MDLog::standby_trim_segments()
+{
+ dout(10) << "standby_trim_segments" << dendl;
+ uint64_t journal_expire_pos = journaler->get_expire_pos();
+ dout(10) << " expire_pos=" << journal_expire_pos << dendl;
+
+ mds->mdcache->open_file_table.trim_destroyed_inos(journal_expire_pos);
+
+ // Drop segments from the old end while they end at or below expire_pos, always keeping one.
+ unsigned dropped = 0;
+ while (have_any_segments()) {
+ LogSegment *oldest = get_oldest_segment();
+ dout(10) << " segment seq=" << oldest->seq << " " << oldest->offset <<
+ "~" << oldest->end - oldest->offset << dendl;
+
+ if (oldest->end > journal_expire_pos) {
+ dout(10) << " won't remove, not expired!" << dendl;
+ break;
+ }
+ if (segments.size() == 1) {
+ dout(10) << " won't remove, last segment!" << dendl;
+ break;
+ }
+
+ dout(10) << " removing segment" << dendl;
+ mds->mdcache->standby_trim_segment(oldest);
+ remove_oldest_segment();
+ ++dropped;
+ }
+
+ if (dropped == 0) {
+ dout(20) << " removed no segments!" << dendl;
+ } else { // at least one segment went away: let the cache trim as well
+ dout(20) << " calling mdcache->trim!" << dendl;
+ mds->mdcache->trim();
+ }
+}
+
+void MDLog::dump_replay_status(Formatter *f) const
+{ // Emit a "replay_status" section; the three journal positions read as 0 while no journaler exists.
+ f->open_object_section("replay_status");
+ f->dump_unsigned("journal_read_pos", !journaler ? 0 : journaler->get_read_pos());
+ f->dump_unsigned("journal_write_pos", !journaler ? 0 : journaler->get_write_pos());
+ f->dump_unsigned("journal_expire_pos", !journaler ? 0 : journaler->get_expire_pos());
+ f->dump_unsigned("num_events", get_num_events());
+ f->dump_unsigned("num_segments", get_num_segments());
+ f->close_section();
+}
diff --git a/src/mds/MDLog.h b/src/mds/MDLog.h
new file mode 100644
index 00000000..ea74180b
--- /dev/null
+++ b/src/mds/MDLog.h
@@ -0,0 +1,337 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+
+#ifndef CEPH_MDLOG_H
+#define CEPH_MDLOG_H
+
+enum { // NOTE(review): presumably the PerfCounters indices used with MDLog::logger -- confirm against MDLog::create_logger()
+ l_mdl_first = 5000, // base value; the enumerators below count up from 5001
+ l_mdl_evadd,
+ l_mdl_evex,
+ l_mdl_evtrm,
+ l_mdl_ev,
+ l_mdl_evexg,
+ l_mdl_evexd,
+ l_mdl_segadd,
+ l_mdl_segex,
+ l_mdl_segtrm,
+ l_mdl_seg,
+ l_mdl_segexg,
+ l_mdl_segexd,
+ l_mdl_expos,
+ l_mdl_wrpos,
+ l_mdl_rdpos,
+ l_mdl_jlat,
+ l_mdl_replayed,
+ l_mdl_last, // one past the final real enumerator
+};
+
+#include "include/types.h"
+#include "include/Context.h"
+
+#include "MDSContext.h"
+#include "common/Thread.h"
+#include "common/Cond.h"
+
+#include "LogSegment.h"
+
+#include <list>
+
+class Journaler;
+class JournalPointer;
+class LogEvent;
+class MDSRank;
+class LogSegment;
+class ESubtreeMap;
+
+class PerfCounters;
+
+#include <map>
+using std::map;
+
+#include "common/Finisher.h"
+
+
+class MDLog { // Per-MDSRank metadata log: tracks log segments and events and hosts the replay, recovery and submit threads.
+public:
+ MDSRank *mds; // rank passed to the constructor
+protected:
+ int num_events; // event count, as reported by get_num_events()
+
+ int unflushed; // is_flushed() is true exactly when this is 0
+
+ bool capped; // value returned by is_capped()
+
+ // Log position which is persistent *and* for which
+ // submit_entry wait_for_safe callbacks have already
+ // been called.
+ uint64_t safe_pos;
+
+ inodeno_t ino; // NOTE(review): absent from the constructor's initializer list -- presumably assigned when the journal is created/opened; confirm in MDLog.cc
+ Journaler *journaler; // null (0) until set; dump_replay_status() tolerates null
+
+ PerfCounters *logger; // null (0) after construction; NOTE(review): presumably set by create_logger() -- confirm
+
+
+ // -- replay --
+ class ReplayThread : public Thread { // thread whose body is MDLog::_replay_thread()
+ MDLog *log;
+ public:
+ explicit ReplayThread(MDLog *l) : log(l) {}
+ void* entry() override {
+ log->_replay_thread();
+ return 0;
+ }
+ } replay_thread;
+ bool already_replayed;
+
+ friend class ReplayThread;
+ friend class C_MDL_Replay;
+
+ MDSContext::vec waitfor_replay; // contexts completed by _replay_thread() once replay ends
+
+ void _replay(); // old way
+ void _replay_thread(); // new way
+
+ // Journal recovery/rewrite logic
+ class RecoveryThread : public Thread { // thread whose body is MDLog::_recovery_thread(completion)
+ MDLog *log;
+ MDSContext *completion; // NULL until set_completion() is called
+ public:
+ void set_completion(MDSContext *c) {completion = c;}
+ explicit RecoveryThread(MDLog *l) : log(l), completion(NULL) {}
+ void* entry() override {
+ log->_recovery_thread(completion);
+ return 0;
+ }
+ } recovery_thread;
+ void _recovery_thread(MDSContext *completion);
+ void _reformat_journal(JournalPointer const &jp, Journaler *old_journal, MDSContext *completion);
+
+ // -- segments --
+ map<uint64_t,LogSegment*> segments; // ordered by key: begin() is the oldest segment, rbegin() the current one
+ set<LogSegment*> expiring_segments;
+ set<LogSegment*> expired_segments;
+ std::size_t pre_segments_size = 0; // the num of segments when the mds finished replay-journal, to calc the num of segments growing
+ uint64_t event_seq;
+ int expiring_events;
+ int expired_events;
+
+ struct PendingEvent { // one queued submission: the event, an optional completion and a flush flag
+ LogEvent *le;
+ MDSContext *fin;
+ bool flush;
+ PendingEvent(LogEvent *e, MDSContext *c, bool f=false) : le(e), fin(c), flush(f) {}
+ };
+
+ int64_t mdsmap_up_features;
+ map<uint64_t,list<PendingEvent> > pending_events; // log segment -> event list
+ Mutex submit_mutex; // taken by the inline start/submit/segment wrappers below and by set_safe_pos()
+ Cond submit_cond; // signalled after an entry is submitted
+
+ void set_safe_pos(uint64_t pos) // advance safe_pos under submit_mutex; asserts it never moves backwards
+ {
+ std::lock_guard l(submit_mutex);
+ ceph_assert(pos >= safe_pos);
+ safe_pos = pos;
+ }
+ friend class MDSLogContextBase;
+
+ void _submit_thread();
+ class SubmitThread : public Thread { // thread whose body is MDLog::_submit_thread()
+ MDLog *log;
+ public:
+ explicit SubmitThread(MDLog *l) : log(l) {}
+ void* entry() override {
+ log->_submit_thread();
+ return 0;
+ }
+ } submit_thread;
+ friend class SubmitThread;
+
+public:
+ const std::set<LogSegment*> &get_expiring_segments() const
+ {
+ return expiring_segments;
+ }
+protected:
+
+ // -- subtreemaps --
+ friend class ESubtreeMap;
+ friend class MDCache;
+
+ uint64_t get_last_segment_seq() const { // key of the newest segment; asserts that one exists
+ ceph_assert(!segments.empty());
+ return segments.rbegin()->first;
+ }
+ LogSegment *get_oldest_segment() { // no emptiness check: callers must ensure have_any_segments()
+ return segments.begin()->second;
+ }
+ void remove_oldest_segment() { // deletes the oldest LogSegment and erases its map entry; same precondition as above
+ map<uint64_t, LogSegment*>::iterator p = segments.begin();
+ delete p->second;
+ segments.erase(p);
+ }
+
+public:
+ void create_logger();
+
+ // replay state
+ map<inodeno_t, set<inodeno_t> > pending_exports;
+
+ void set_write_iohint(unsigned iohint_flags);
+
+public:
+ explicit MDLog(MDSRank *m) : mds(m), // counters start at zero and pointers at null; the three worker thread objects are bound to this log
+ num_events(0),
+ unflushed(0),
+ capped(false),
+ safe_pos(0),
+ journaler(0),
+ logger(0),
+ replay_thread(this),
+ already_replayed(false),
+ recovery_thread(this),
+ event_seq(0), expiring_events(0), expired_events(0),
+ mdsmap_up_features(0),
+ submit_mutex("MDLog::submit_mutex"),
+ submit_thread(this),
+ cur_event(NULL) { }
+ ~MDLog();
+
+
+private:
+ // -- segments --
+ void _start_new_segment();
+ void _prepare_new_segment();
+ void _journal_segment_subtree_map(MDSContext *onsync);
+public:
+ void start_new_segment() { // locked wrapper around _start_new_segment()
+ std::lock_guard l(submit_mutex);
+ _start_new_segment();
+ }
+ void prepare_new_segment() { // locked wrapper around _prepare_new_segment()
+ std::lock_guard l(submit_mutex);
+ _prepare_new_segment();
+ }
+ void journal_segment_subtree_map(MDSContext *onsync=NULL) {
+ submit_mutex.Lock(); // manual lock: it is released before flush() below is called
+ _journal_segment_subtree_map(onsync);
+ submit_mutex.Unlock();
+ if (onsync) // flush only when the caller supplied a completion
+ flush();
+ }
+
+ LogSegment *peek_current_segment() { // newest segment, or NULL when there are none
+ return segments.empty() ? NULL : segments.rbegin()->second;
+ }
+
+ LogSegment *get_current_segment() { // newest segment; asserts that one exists
+ ceph_assert(!segments.empty());
+ return segments.rbegin()->second;
+ }
+
+ LogSegment *get_segment(LogSegment::seq_t seq) { // segment keyed by seq, or NULL if unknown
+ if (segments.count(seq))
+ return segments[seq];
+ return NULL;
+ }
+
+ bool have_any_segments() const {
+ return !segments.empty();
+ }
+
+ void flush_logger();
+
+ size_t get_num_events() const { return num_events; }
+ size_t get_num_segments() const { return segments.size(); }
+
+ uint64_t get_read_pos() const;
+ uint64_t get_write_pos() const;
+ uint64_t get_safe_pos() const;
+ Journaler *get_journaler() { return journaler; }
+ bool empty() const { return segments.empty(); }
+
+ bool is_capped() const { return capped; }
+ void cap();
+
+ void kick_submitter();
+ void shutdown();
+
+ // -- events --
+private:
+ LogEvent *cur_event; // non-NULL while an entry is open (see entry_is_open())
+public:
+ void _start_entry(LogEvent *e);
+ void start_entry(LogEvent *e) { // locked wrapper around _start_entry()
+ std::lock_guard l(submit_mutex);
+ _start_entry(e);
+ }
+ void cancel_entry(LogEvent *e);
+ void _submit_entry(LogEvent *e, MDSLogContextBase *c);
+ void submit_entry(LogEvent *e, MDSLogContextBase *c = 0) { // submit under submit_mutex, then signal submit_cond
+ std::lock_guard l(submit_mutex);
+ _submit_entry(e, c);
+ submit_cond.Signal();
+ }
+ void start_submit_entry(LogEvent *e, MDSLogContextBase *c = 0) { // start and submit in a single critical section
+ std::lock_guard l(submit_mutex);
+ _start_entry(e);
+ _submit_entry(e, c);
+ submit_cond.Signal();
+ }
+ bool entry_is_open() const { return cur_event != NULL; }
+
+ void wait_for_safe( MDSContext *c );
+ void flush();
+ bool is_flushed() const {
+ return unflushed == 0;
+ }
+
+private:
+ void try_expire(LogSegment *ls, int op_prio);
+ void _maybe_expired(LogSegment *ls, int op_prio);
+ void _expired(LogSegment *ls);
+ void _trim_expired_segments();
+
+ friend class C_MaybeExpiredSegment;
+ friend class C_MDL_Flushed;
+ friend class C_OFT_Committed;
+
+public:
+ void trim_expired_segments();
+ void trim(int max=-1);
+ int trim_all();
+ bool expiry_done() const // true when no segment is expiring or expired-but-untrimmed
+ {
+ return expiring_segments.empty() && expired_segments.empty();
+ };
+
+private:
+ void write_head(MDSContext *onfinish);
+
+public:
+ void create(MDSContext *onfinish); // fresh, empty log!
+ void open(MDSContext *onopen); // append() or replay() to follow!
+ void reopen(MDSContext *onopen);
+ void append();
+ void replay(MDSContext *onfinish);
+
+ void standby_trim_segments();
+
+ void dump_replay_status(Formatter *f) const;
+};
+
+#endif
diff --git a/src/mds/MDSAuthCaps.cc b/src/mds/MDSAuthCaps.cc
new file mode 100644
index 00000000..949ac62c
--- /dev/null
+++ b/src/mds/MDSAuthCaps.cc
@@ -0,0 +1,434 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2014 Red Hat
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#include <string_view>
+
+#include <errno.h>
+#include <fcntl.h>
+
+#include <boost/spirit/include/qi.hpp>
+#include <boost/spirit/include/phoenix_operator.hpp>
+#include <boost/spirit/include/phoenix.hpp>
+
+#include "common/debug.h"
+#include "MDSAuthCaps.h"
+#include "include/ipaddr.h"
+
+#define dout_subsys ceph_subsys_mds
+
+#undef dout_prefix
+#define dout_prefix *_dout << "MDSAuthCap "
+
+using std::ostream;
+using std::string;
+namespace qi = boost::spirit::qi;
+namespace ascii = boost::spirit::ascii;
+namespace phoenix = boost::phoenix;
+
+template <typename Iterator>
+struct MDSCapParser : qi::grammar<Iterator, MDSAuthCaps()> // Boost.Spirit Qi grammar turning an MDS cap string into an MDSAuthCaps
+{
+ MDSCapParser() : MDSCapParser::base_type(mdscaps)
+ {
+ using qi::char_;
+ using qi::int_;
+ using qi::uint_;
+ using qi::lexeme;
+ using qi::alnum;
+ using qi::_val;
+ using qi::_1;
+ using qi::_2;
+ using qi::_3;
+ using qi::eps;
+ using qi::lit;
+
+ spaces = +(lit(' ') | lit('\n') | lit('\t')); // one or more blanks, newlines or tabs
+
+ quoted_path %= // path wrapped in double or single quotes; the quotes are not part of the value
+ lexeme[lit("\"") >> *(char_ - '"') >> '"'] |
+ lexeme[lit("'") >> *(char_ - '\'') >> '\''];
+ unquoted_path %= +char_("a-zA-Z0-9_./-");
+ network_str %= +char_("/.:a-fA-F0-9][");
+
+ // match := [path=<path>] [uid=<uid> [gids=<gid>[,<gid>...]]]
+ path %= (spaces >> lit("path") >> lit('=') >> (quoted_path | unquoted_path));
+ uid %= (spaces >> lit("uid") >> lit('=') >> uint_);
+ uintlist %= (uint_ % lit(','));
+ gidlist %= -(spaces >> lit("gids") >> lit('=') >> uintlist); // optional as a whole
+ match = -( // optional; alternatives are tried in the order written
+ (uid >> gidlist)[_val = phoenix::construct<MDSCapMatch>(_1, _2)] |
+ (path >> uid >> gidlist)[_val = phoenix::construct<MDSCapMatch>(_1, _2, _3)] |
+ (path)[_val = phoenix::construct<MDSCapMatch>(_1)]);
+
+ // capspec = * | all | rwps | rwp | rws | rw | r  (longer literals are listed before their prefixes)
+ capspec = spaces >> (
+ lit("*")[_val = MDSCapSpec(MDSCapSpec::ALL)]
+ |
+ lit("all")[_val = MDSCapSpec(MDSCapSpec::ALL)]
+ |
+ (lit("rwps"))[_val = MDSCapSpec(MDSCapSpec::RWPS)]
+ |
+ (lit("rwp"))[_val = MDSCapSpec(MDSCapSpec::RWP)]
+ |
+ (lit("rws"))[_val = MDSCapSpec(MDSCapSpec::RWS)]
+ |
+ (lit("rw"))[_val = MDSCapSpec(MDSCapSpec::RW)]
+ |
+ (lit("r"))[_val = MDSCapSpec(MDSCapSpec::READ)]
+ );
+
+ grant = lit("allow") >> (capspec >> match >> // grant := allow <capspec> [match] [network <network_str>]
+ -(spaces >> lit("network") >> spaces >> network_str))
+ [_val = phoenix::construct<MDSCapGrant>(_1, _2, _3)];
+ grants %= (grant % (*lit(' ') >> (lit(';') | lit(',')) >> *lit(' '))); // grants separated by ';' or ',' with optional blanks around
+ mdscaps = grants [_val = phoenix::construct<MDSAuthCaps>(_1)];
+ }
+ qi::rule<Iterator> spaces;
+ qi::rule<Iterator, string()> quoted_path, unquoted_path, network_str;
+ qi::rule<Iterator, MDSCapSpec()> capspec;
+ qi::rule<Iterator, string()> path;
+ qi::rule<Iterator, uint32_t()> uid;
+ qi::rule<Iterator, std::vector<uint32_t>() > uintlist;
+ qi::rule<Iterator, std::vector<uint32_t>() > gidlist;
+ qi::rule<Iterator, MDSCapMatch()> match;
+ qi::rule<Iterator, MDSCapGrant()> grant;
+ qi::rule<Iterator, std::vector<MDSCapGrant>()> grants;
+ qi::rule<Iterator, MDSAuthCaps()> mdscaps;
+};
+
+// Canonicalise the grant path by removing every leading '/'.
+void MDSCapMatch::normalize_path()
+{
+  // find_first_not_of() yields npos for an empty or all-slash path, in
+  // which case erase(0, npos) simply leaves the path empty.
+  const std::string::size_type first_kept = path.find_first_not_of('/');
+  path.erase(0, first_kept);
+
+  // Not handled here: duplicate "//", "." and ".." components.
+}
+
+// Decide whether this match clause applies to the given caller and path.
+//
+// When the clause names a uid, the caller's uid must equal it and, if the
+// clause also lists gids, the caller's primary gid or one of the gids in
+// caller_gid_list (may be null) must be among them. The path is checked
+// last, via match_path().
+bool MDSCapMatch::match(std::string_view target_path,
+                        const int caller_uid,
+                        const int caller_gid,
+                        const vector<uint64_t> *caller_gid_list) const
+{
+  if (uid != MDS_AUTH_UID_ANY) {
+    if (uid != caller_uid)
+      return false;
+
+    if (!gids.empty()) {
+      // Generic so each candidate is compared with its own type, exactly
+      // as a direct std::find call would.
+      auto is_listed = [this](auto candidate) {
+        return std::find(gids.begin(), gids.end(), candidate) != gids.end();
+      };
+
+      bool group_ok = is_listed(caller_gid);
+      if (!group_ok && caller_gid_list) {
+        group_ok = std::any_of(caller_gid_list->begin(),
+                               caller_gid_list->end(), is_listed);
+      }
+      if (!group_ok)
+        return false;
+    }
+  }
+
+  return match_path(target_path);
+}
+
+// Report whether target_path lies at or below this clause's path.
+//
+// An empty clause path matches everything. Otherwise target_path must start
+// with the clause path and, unless the clause path already ends in '/', the
+// prefix must end on a component boundary.
+//
+// @param target_path path to test (the header documents it as having no
+//                    leading '/')
+// @return true when the path is covered by this clause
+bool MDSCapMatch::match_path(std::string_view target_path) const
+{
+  if (path.empty())
+    return true;
+
+  // Prefix test. compare() inspects at most path.length() characters,
+  // whereas find() would scan the whole target looking for a later
+  // occurrence before reporting a non-zero position.
+  if (target_path.compare(0, path.length(), path) != 0)
+    return false;
+
+  // if path doesn't already have a trailing /, make sure the target
+  // does so that path=/foo doesn't match target_path=/food
+  if (target_path.length() > path.length() &&
+      path[path.length() - 1] != '/' &&
+      target_path[path.length()] != '/')
+    return false;
+
+  return true;
+}
+
+// Parse the textual `network` restriction into network_parsed and
+// network_prefix, recording in network_valid whether parsing succeeded.
+void MDSCapGrant::parse_network()
+{
+  const bool parsed_ok = ::parse_network(network.c_str(),
+                                         &network_parsed,
+                                         &network_prefix);
+  network_valid = parsed_ok;
+}
+
+/**
+ * Could the client *possibly* reach this path?  Only the path component of
+ * each grant is consulted; whether access is really permitted still
+ * depends on the uid/mode checks performed by is_capable().
+ */
+bool MDSAuthCaps::path_capable(std::string_view inode_path) const
+{
+  return std::any_of(grants.begin(), grants.end(),
+                     [inode_path](const MDSCapGrant &g) {
+                       return g.match.match_path(inode_path);
+                     });
+}
+
+/**
+ * For a given filesystem path, query whether this capability carries
+ * authorization to read or write.
+ *
+ * This is true if any of the 'grant' clauses in the capability match the
+ * requested path + op.
+ *
+ * @param inode_path      path of the inode (logged with a '/' prepended)
+ * @param inode_uid       owner uid of the inode
+ * @param inode_gid       owner gid of the inode
+ * @param inode_mode      unix mode bits of the inode
+ * @param caller_uid      uid of the requester
+ * @param caller_gid      primary gid of the requester
+ * @param caller_gid_list supplementary gids of the requester; may be null
+ *                        and is not required to be sorted
+ * @param mask            MAY_* bits being requested
+ * @param new_uid         target uid, consulted only for MAY_CHOWN
+ * @param new_gid         target gid, consulted only for MAY_CHGRP
+ * @param addr            client address, checked against a grant's network
+ * @return true if at least one grant authorizes the request
+ */
+bool MDSAuthCaps::is_capable(std::string_view inode_path,
+                             uid_t inode_uid, gid_t inode_gid,
+                             unsigned inode_mode,
+                             uid_t caller_uid, gid_t caller_gid,
+                             const vector<uint64_t> *caller_gid_list,
+                             unsigned mask,
+                             uid_t new_uid, gid_t new_gid,
+                             const entity_addr_t& addr) const
+{
+  if (cct)
+    ldout(cct, 10) << __func__ << " inode(path /" << inode_path
+                   << " owner " << inode_uid << ":" << inode_gid
+                   << " mode 0" << std::oct << inode_mode << std::dec
+                   << ") by caller " << caller_uid << ":" << caller_gid
+//                 << "[" << caller_gid_list << "]";
+                   << " mask " << mask
+                   << " new " << new_uid << ":" << new_gid
+                   << " cap: " << *this << dendl;
+
+  for (std::vector<MDSCapGrant>::const_iterator i = grants.begin();
+       i != grants.end();
+       ++i) {
+    // A grant restricted to a network applies only when that network
+    // parsed successfully and contains the client address.
+    if (i->network.size() &&
+        (!i->network_valid ||
+         !network_contains(i->network_parsed,
+                           i->network_prefix,
+                           addr))) {
+      continue;
+    }
+
+    if (i->match.match(inode_path, caller_uid, caller_gid, caller_gid_list) &&
+        i->spec.allows(mask & (MAY_READ|MAY_EXECUTE), mask & MAY_WRITE)) {
+      // we have a match; narrow down GIDs to those specifically allowed here
+      vector<uint64_t> gids;
+      if (std::find(i->match.gids.begin(), i->match.gids.end(), caller_gid) !=
+          i->match.gids.end()) {
+        gids.push_back(caller_gid);
+      }
+      if (caller_gid_list) {
+        // Membership test per gid: unlike std::set_intersection this does
+        // not require either list to be sorted, so no caller gid is missed
+        // when the client sends its groups in arbitrary order.
+        for (const auto &gid : *caller_gid_list) {
+          if (std::find(i->match.gids.begin(), i->match.gids.end(), gid) !=
+              i->match.gids.end()) {
+            gids.push_back(gid);
+          }
+        }
+        std::sort(gids.begin(), gids.end());
+      }
+
+
+      // Spec is non-allowing if caller asked for set pool but spec forbids it
+      if (mask & MAY_SET_VXATTR) {
+        if (!i->spec.allow_set_vxattr()) {
+          continue;
+        }
+      }
+
+      if (mask & MAY_SNAPSHOT) {
+        if (!i->spec.allow_snapshot()) {
+          continue;
+        }
+      }
+
+      // check unix permissions?
+      // A grant not tied to a uid skips the unix permission checks below.
+      if (i->match.uid == MDSCapMatch::MDS_AUTH_UID_ANY) {
+        return true;
+      }
+
+      // chown/chgrp
+      if (mask & MAY_CHOWN) {
+        if (new_uid != caller_uid ||   // you can't chown to someone else
+            inode_uid != caller_uid) { // you can't chown from someone else
+          continue;
+        }
+      }
+      if (mask & MAY_CHGRP) {
+        // you can only chgrp *to* one of your groups... if you own the file.
+        if (inode_uid != caller_uid ||
+            std::find(gids.begin(), gids.end(), new_gid) ==
+            gids.end()) {
+          continue;
+        }
+      }
+
+      // Classic owner / group / other mode-bit check.
+      if (inode_uid == caller_uid) {
+        if ((!(mask & MAY_READ) || (inode_mode & S_IRUSR)) &&
+            (!(mask & MAY_WRITE) || (inode_mode & S_IWUSR)) &&
+            (!(mask & MAY_EXECUTE) || (inode_mode & S_IXUSR))) {
+          return true;
+        }
+      } else if (std::find(gids.begin(), gids.end(),
+                           inode_gid) != gids.end()) {
+        if ((!(mask & MAY_READ) || (inode_mode & S_IRGRP)) &&
+            (!(mask & MAY_WRITE) || (inode_mode & S_IWGRP)) &&
+            (!(mask & MAY_EXECUTE) || (inode_mode & S_IXGRP))) {
+          return true;
+        }
+      } else {
+        if ((!(mask & MAY_READ) || (inode_mode & S_IROTH)) &&
+            (!(mask & MAY_WRITE) || (inode_mode & S_IWOTH)) &&
+            (!(mask & MAY_EXECUTE) || (inode_mode & S_IXOTH))) {
+          return true;
+        }
+      }
+    }
+  }
+
+  return false;
+}
+
+// Replace every grant with a single unrestricted "allow *" grant.
+void MDSAuthCaps::set_allow_all()
+{
+  const MDSCapGrant unrestricted =
+    MDSCapGrant(MDSCapSpec(MDSCapSpec::ALL), MDSCapMatch(), {});
+  grants.assign(1, unrestricted);
+}
+
+// Parse a capability string into this object.
+//
+// Returns true on success. On failure the grants are cleared and, when err
+// is non-null, a message naming the unparsed remainder is written to it.
+bool MDSAuthCaps::parse(CephContext *c, std::string_view str, ostream *err)
+{
+  // Special case for legacy caps
+  if (str == "allow") {
+    grants.assign(1, MDSCapGrant(MDSCapSpec(MDSCapSpec::RWPS), MDSCapMatch(),
+                                 {}));
+    return true;
+  }
+
+  auto cursor = str.begin();
+  auto last = str.end();
+  MDSCapParser<decltype(cursor)> grammar;
+
+  const bool parsed = qi::phrase_parse(cursor, last, grammar, ascii::space,
+                                       *this);
+  cct = c;  // set after parser self-assignment
+
+  if (!parsed || cursor != last) {
+    // Make sure no grants are kept after parsing failed!
+    grants.clear();
+
+    if (err) {
+      *err << "mds capability parse failed, stopped at '"
+           << std::string(cursor, last)
+           << "' of '" << str << "'";
+    }
+    return false;
+  }
+
+  // Post-process each grant: sorted gid list and parsed network.
+  for (auto& g : grants) {
+    std::sort(g.match.gids.begin(), g.match.gids.end());
+    g.parse_network();
+  }
+  return true;
+}
+
+
+// True when at least one grant both matches everything and permits
+// everything.
+bool MDSAuthCaps::allow_all() const
+{
+  for (const MDSCapGrant &g : grants) {
+    if (g.match.is_match_all() && g.spec.allow_all())
+      return true;
+  }
+  return false;
+}
+
+
+// Print a match clause as: path="/<path>" uid=<uid> gids=<g1>,<g2>,...
+// Each part appears only when it is set; gids are shown only with a uid.
+ostream &operator<<(ostream &out, const MDSCapMatch &match)
+{
+  const bool has_uid = (match.uid != MDSCapMatch::MDS_AUTH_UID_ANY);
+
+  if (match.path.length()) {
+    out << "path=\"/" << match.path << "\"";
+    if (has_uid)
+      out << " ";
+  }
+
+  if (!has_uid)
+    return out;
+
+  out << "uid=" << match.uid;
+  if (match.gids.empty())
+    return out;
+
+  out << " gids=";
+  bool need_comma = false;
+  for (const gid_t &g : match.gids) {
+    if (need_comma)
+      out << ',';
+    out << g;
+    need_comma = true;
+  }
+  return out;
+}
+
+
+// Print a cap spec: "*" for an allow-all spec, otherwise the subset of the
+// letters r, w, p, s (in that order) that the spec grants.
+ostream &operator<<(ostream &out, const MDSCapSpec &spec)
+{
+  if (spec.allow_all())
+    return out << "*";
+
+  if (spec.allow_read())
+    out << "r";
+  if (spec.allow_write())
+    out << "w";
+  if (spec.allow_set_vxattr())
+    out << "p";
+  if (spec.allow_snapshot())
+    out << "s";
+  return out;
+}
+
+
+// Print a grant as: allow <spec>[ <match>][ network <network>]
+ostream &operator<<(ostream &out, const MDSCapGrant &grant)
+{
+  out << "allow " << grant.spec;
+
+  const bool restricted = !grant.match.is_match_all();
+  if (restricted)
+    out << " " << grant.match;
+
+  if (!grant.network.empty())
+    out << " network " << grant.network;
+
+  return out;
+}
+
+
+// Print the whole capability as: MDSAuthCaps[<grant>, <grant>, ...]
+ostream &operator<<(ostream &out, const MDSAuthCaps &cap)
+{
+  out << "MDSAuthCaps[";
+  bool first = true;
+  for (const MDSCapGrant &g : cap.grants) {
+    if (!first)
+      out << ", ";
+    out << g;
+    first = false;
+  }
+  return out << "]";
+}
+
diff --git a/src/mds/MDSAuthCaps.h b/src/mds/MDSAuthCaps.h
new file mode 100644
index 00000000..cc1006cd
--- /dev/null
+++ b/src/mds/MDSAuthCaps.h
@@ -0,0 +1,191 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2014 Red Hat
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+
+#ifndef MDS_AUTH_CAPS_H
+#define MDS_AUTH_CAPS_H
+
+#include <sstream>
+#include <string>
+#include <string_view>
+#include <vector>
+
+#include "include/types.h"
+#include "common/debug.h"
+
+// unix-style capabilities: bit flags OR-ed together into the `mask` argument of MDSAuthCaps::is_capable()
+enum {
+ MAY_READ = (1 << 0),
+ MAY_WRITE = (1 << 1),
+ MAY_EXECUTE = (1 << 2), // note: bit 3 is not assigned in this enum
+ MAY_CHOWN = (1 << 4),
+ MAY_CHGRP = (1 << 5),
+ MAY_SET_VXATTR = (1 << 6),
+ MAY_SNAPSHOT = (1 << 7),
+};
+
+class CephContext;
+
+// what we can do
+struct MDSCapSpec {
+  // Individual permission bits.
+  static const unsigned ALL = (1 << 0);
+  static const unsigned READ = (1 << 1);
+  static const unsigned WRITE = (1 << 2);
+  // permission to set vxattrs (layout, quota, etc)
+  static const unsigned SET_VXATTR = (1 << 3);
+  // permission to mksnap/rmsnap
+  static const unsigned SNAPSHOT = (1 << 4);
+
+  // Combinations accepted by the cap parser.
+  static const unsigned RW = (READ|WRITE);
+  static const unsigned RWP = (READ|WRITE|SET_VXATTR);
+  static const unsigned RWS = (READ|WRITE|SNAPSHOT);
+  static const unsigned RWPS = (READ|WRITE|SET_VXATTR|SNAPSHOT);
+
+  MDSCapSpec() = default;
+
+  // ALL implies every individual permission, so fold RWPS in when it is set.
+  MDSCapSpec(unsigned _caps)
+    : caps((_caps & ALL) ? (_caps | RWPS) : _caps) {}
+
+  bool allow_all() const { return (caps & ALL) != 0; }
+  bool allow_read() const { return (caps & READ) != 0; }
+  bool allow_write() const { return (caps & WRITE) != 0; }
+
+  // Does this spec cover a request that needs read (r) and/or write (w)?
+  bool allows(bool r, bool w) const {
+    return allow_all() ||
+           ((!r || allow_read()) && (!w || allow_write()));
+  }
+
+  bool allow_snapshot() const { return (caps & SNAPSHOT) != 0; }
+  bool allow_set_vxattr() const { return (caps & SET_VXATTR) != 0; }
+
+private:
+  unsigned caps = 0;  // bitwise OR of the constants above
+};
+
+// conditions before we are allowed to do it
+struct MDSCapMatch {
+  // Sentinel uid meaning "no uid restriction".
+  static const int64_t MDS_AUTH_UID_ANY = -1;
+
+  int64_t uid;              // required caller uid, unless MDS_AUTH_UID_ANY
+  std::vector<gid_t> gids;  // gids this clause accepts
+  std::string path;         // required ancestor path ("" or "/" means any)
+
+  // Unrestricted clause.
+  MDSCapMatch() : uid(MDS_AUTH_UID_ANY) {}
+
+  // uid (+ gids) restriction only.
+  MDSCapMatch(int64_t uid_, std::vector<gid_t>& gids_)
+    : uid(uid_), gids(gids_) {}
+
+  // Path restriction only; the path is normalized on construction.
+  explicit MDSCapMatch(const std::string &path_)
+    : uid(MDS_AUTH_UID_ANY), path(path_)
+  {
+    normalize_path();
+  }
+
+  // Path and uid (+ gids) restriction; the path is normalized on construction.
+  MDSCapMatch(const std::string& path_, int64_t uid_,
+              std::vector<gid_t>& gids_)
+    : uid(uid_), gids(gids_), path(path_)
+  {
+    normalize_path();
+  }
+
+  void normalize_path();
+
+  // True when neither a uid nor a path restriction is present.
+  bool is_match_all() const
+  {
+    return path.empty() && uid == MDS_AUTH_UID_ANY;
+  }
+
+  // Does this clause apply to the given file path and caller uid:gid(s)?
+  bool match(std::string_view target_path,
+             const int caller_uid,
+             const int caller_gid,
+             const vector<uint64_t> *caller_gid_list) const;
+
+  /**
+   * Path-only test: reports whether the path *might* be accessible; the
+   * real decision is made by the stronger check in match().
+   *
+   * @param target_path filesystem path without leading '/'
+   */
+  bool match_path(std::string_view target_path) const;
+};
+
+// One "allow ..." clause: what is permitted (spec), to whom and where
+// (match), and optionally from which network.
+struct MDSCapGrant {
+  MDSCapSpec spec;
+  MDSCapMatch match;
+
+  std::string network;          // textual network restriction; empty if none
+
+  entity_addr_t network_parsed; // filled in by parse_network()
+  unsigned network_prefix = 0;  // filled in by parse_network()
+  bool network_valid = true;    // result of the last parse_network()
+
+  // Build a grant; when a network string is supplied it is stored and
+  // parsed right away.
+  MDSCapGrant(const MDSCapSpec &spec_, const MDSCapMatch &match_,
+              boost::optional<std::string> n)
+    : spec(spec_), match(match_)
+  {
+    if (!n)
+      return;
+    network = *n;
+    parse_network();
+  }
+  MDSCapGrant() {}
+
+  void parse_network();
+};
+
+class MDSAuthCaps // Set of "allow ..." grants; a request is authorized when any one grant permits it (see is_capable())
+{
+ CephContext *cct = nullptr; // used only for debug logging in is_capable(); may stay null
+ std::vector<MDSCapGrant> grants;
+
+public:
+ MDSAuthCaps() = default;
+ explicit MDSAuthCaps(CephContext *cct_) : cct(cct_) {}
+
+ // this ctor is used by spirit/phoenix; doesn't need cct.
+ explicit MDSAuthCaps(const std::vector<MDSCapGrant>& grants_) : grants(grants_) {}
+
+ void clear() { // drop every grant; cct is left untouched
+ grants.clear();
+ }
+
+ void set_allow_all(); // replace all grants with one unrestricted "allow *" grant
+ bool parse(CephContext *cct, std::string_view str, std::ostream *err); // false on parse failure; grants are cleared and *err (if non-null) receives a message
+
+ bool allow_all() const; // true if some grant matches everything and allows everything
+ bool is_capable(std::string_view inode_path,
+ uid_t inode_uid, gid_t inode_gid, unsigned inode_mode,
+ uid_t uid, gid_t gid, const vector<uint64_t> *caller_gid_list,
+ unsigned mask, uid_t new_uid, gid_t new_gid,
+ const entity_addr_t& addr) const;
+ bool path_capable(std::string_view inode_path) const; // path-only pre-check; weaker than is_capable()
+
+ friend std::ostream &operator<<(std::ostream &out, const MDSAuthCaps &cap);
+};
+
+
+std::ostream &operator<<(std::ostream &out, const MDSCapMatch &match);
+std::ostream &operator<<(std::ostream &out, const MDSCapSpec &spec);
+std::ostream &operator<<(std::ostream &out, const MDSCapGrant &grant);
+std::ostream &operator<<(std::ostream &out, const MDSAuthCaps &cap);
+
+#endif // MDS_AUTH_CAPS_H
diff --git a/src/mds/MDSCacheObject.cc b/src/mds/MDSCacheObject.cc
new file mode 100644
index 00000000..3ad8190b
--- /dev/null
+++ b/src/mds/MDSCacheObject.cc
@@ -0,0 +1,71 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "MDSCacheObject.h"
+#include "MDSContext.h"
+#include "common/Formatter.h"
+
+uint64_t MDSCacheObject::last_wait_seq = 0;
+
+// Collect the waiters selected by `mask` and complete each of them with
+// `result`.
+void MDSCacheObject::finish_waiting(uint64_t mask, int result)
+{
+  MDSContext::vec ready;
+  take_waiting(mask, ready);
+  finish_contexts(g_ceph_context, ready, result);
+}
+
+// Dump the generic cache-object state: auth flag, replica map, authority
+// pair and nonce, auth-pin count, freeze state and reference counts.
+void MDSCacheObject::dump(Formatter *f) const
+{
+  f->dump_bool("is_auth", is_auth());
+
+  // Fields only meaningful for auth
+  f->open_object_section("auth_state");
+  f->open_object_section("replicas");
+  for (const auto &replica : get_replicas()) {
+    std::ostringstream key;
+    key << replica.first;
+    f->dump_int(key.str().c_str(), replica.second);
+  }
+  f->close_section(); // replicas
+  f->close_section(); // auth_state
+
+  // Fields only meaningful for replica
+  f->open_object_section("replica_state");
+  const mds_authority_t auth = authority();
+  f->open_array_section("authority");
+  f->dump_int("first", auth.first);
+  f->dump_int("second", auth.second);
+  f->close_section(); // authority
+  f->dump_unsigned("replica_nonce", get_replica_nonce());
+  f->close_section(); // replica_state
+
+  f->dump_int("auth_pins", auth_pins);
+  f->dump_bool("is_frozen", is_frozen());
+  f->dump_bool("is_freezing", is_freezing());
+
+#ifdef MDS_REF_SET
+  // Per-pin-type reference counts, keyed by pin name.
+  f->open_object_section("pins");
+  for (const auto &pin : ref_map) {
+    f->dump_int(pin_name(pin.first).data(), pin.second);
+  }
+  f->close_section();
+#endif
+  f->dump_int("nref", ref);
+}
+
+/*
+ * Emit one "state" string per generic state bit that is set.
+ * Subclasses call this when dumping their own specialized states.
+ */
+void MDSCacheObject::dump_states(Formatter *f) const
+{
+  auto emit_if_set = [this, f](unsigned bit, const char *label) {
+    if (state_test(bit))
+      f->dump_string("state", label);
+  };
+
+  emit_if_set(STATE_AUTH, "auth");
+  emit_if_set(STATE_DIRTY, "dirty");
+  emit_if_set(STATE_NOTIFYREF, "notifyref");
+  emit_if_set(STATE_REJOINING, "rejoining");
+  emit_if_set(STATE_REJOINUNDEF, "rejoinundef");
+}
+
diff --git a/src/mds/MDSCacheObject.h b/src/mds/MDSCacheObject.h
new file mode 100644
index 00000000..e17089bb
--- /dev/null
+++ b/src/mds/MDSCacheObject.h
@@ -0,0 +1,415 @@
+#ifndef CEPH_MDSCACHEOBJECT_H
+#define CEPH_MDSCACHEOBJECT_H
+
+#include <ostream>
+#include <string_view>
+
+#include "common/config.h"
+
+#include "include/Context.h"
+#include "include/ceph_assert.h"
+#include "include/mempool.h"
+#include "include/types.h"
+#include "include/xlist.h"
+
+#include "mdstypes.h"
+#include "MDSContext.h"
+
+#define MDS_REF_SET // define me for improved debug output, sanity checking
+//#define MDS_AUTHPIN_SET // define me for debugging auth pin leaks
+//#define MDS_VERIFY_FRAGSTAT // do (slow) sanity checking on frags
+
+
+class MLock;
+class SimpleLock;
+class MDSCacheObject;
+class MDSContext;
+
+/*
+ * for metadata leases to clients: one lease held by `client` on the cache object `parent`
+ */
+struct ClientLease {
+ client_t client; // lease holder
+ MDSCacheObject *parent; // object the lease is on
+
+ ceph_seq_t seq = 0;
+ utime_t ttl;
+ xlist<ClientLease*>::item item_session_lease; // per-session list
+ xlist<ClientLease*>::item item_lease; // global list
+
+ ClientLease(client_t c, MDSCacheObject *p) : // both list items are constructed pointing back at this lease
+ client(c), parent(p),
+ item_session_lease(this),
+ item_lease(this) { }
+ ClientLease() = delete; // a lease always needs a client and a parent
+};
+
+
+// print hack: wrapper around an MDSCacheObject* with its own operator<< (declared just below this struct)
+struct mdsco_db_line_prefix {
+ MDSCacheObject *object; // wrapped object; not owned (this struct has no destructor)
+ explicit mdsco_db_line_prefix(MDSCacheObject *o) : object(o) {}
+};
+std::ostream& operator<<(std::ostream& out, const mdsco_db_line_prefix& o);
+
+// printer
+std::ostream& operator<<(std::ostream& out, const MDSCacheObject &o);
+
+class MDSCacheObject {
+ public:
+ // -- pins --
+ const static int PIN_REPLICATED = 1000;
+ const static int PIN_DIRTY = 1001;
+ const static int PIN_LOCK = -1002;
+ const static int PIN_REQUEST = -1003;
+ const static int PIN_WAITER = 1004;
+ const static int PIN_DIRTYSCATTERED = -1005;
+ static const int PIN_AUTHPIN = 1006;
+ static const int PIN_PTRWAITER = -1007;
+ const static int PIN_TEMPEXPORTING = 1008; // temp pin between encode_ and finish_export
+ static const int PIN_CLIENTLEASE = 1009;
+ static const int PIN_DISCOVERBASE = 1010;
+
+ std::string_view generic_pin_name(int p) const {
+ switch (p) {
+ case PIN_REPLICATED: return "replicated";
+ case PIN_DIRTY: return "dirty";
+ case PIN_LOCK: return "lock";
+ case PIN_REQUEST: return "request";
+ case PIN_WAITER: return "waiter";
+ case PIN_DIRTYSCATTERED: return "dirtyscattered";
+ case PIN_AUTHPIN: return "authpin";
+ case PIN_PTRWAITER: return "ptrwaiter";
+ case PIN_TEMPEXPORTING: return "tempexporting";
+ case PIN_CLIENTLEASE: return "clientlease";
+ case PIN_DISCOVERBASE: return "discoverbase";
+ default: ceph_abort(); return std::string_view();
+ }
+ }
+
+ // -- state --
+ const static int STATE_AUTH = (1<<30);
+ const static int STATE_DIRTY = (1<<29);
+ const static int STATE_NOTIFYREF = (1<<28); // notify dropping ref drop through _put()
+ const static int STATE_REJOINING = (1<<27); // replica has not joined w/ primary copy
+ const static int STATE_REJOINUNDEF = (1<<26); // contents undefined.
+
+
+ // -- wait --
+ const static uint64_t WAIT_ORDERED = (1ull<<61);
+ const static uint64_t WAIT_SINGLEAUTH = (1ull<<60);
+ const static uint64_t WAIT_UNFREEZE = (1ull<<59); // pka AUTHPINNABLE
+
+
+ // ============================================
+ // cons
+ public:
+ MDSCacheObject() {}
+ virtual ~MDSCacheObject() {}
+
+ // printing
+ virtual void print(std::ostream& out) = 0;
+ virtual std::ostream& print_db_line_prefix(std::ostream& out) {
+ return out << "mdscacheobject(" << this << ") ";
+ }
+
+ // --------------------------------------------
+ // state
+ protected:
+ __u32 state = 0; // state bits
+
+ public:
+ unsigned get_state() const { return state; }
+ unsigned state_test(unsigned mask) const { return (state & mask); }
+ void state_clear(unsigned mask) { state &= ~mask; }
+ void state_set(unsigned mask) { state |= mask; }
+ void state_reset(unsigned s) { state = s; }
+
+ bool is_auth() const { return state_test(STATE_AUTH); }
+ bool is_dirty() const { return state_test(STATE_DIRTY); }
+ bool is_clean() const { return !is_dirty(); }
+ bool is_rejoining() const { return state_test(STATE_REJOINING); }
+
+ // --------------------------------------------
+ // authority
+ virtual mds_authority_t authority() const = 0;
+ bool is_ambiguous_auth() const {
+ return authority().second != CDIR_AUTH_UNKNOWN;
+ }
+
+ // --------------------------------------------
+ // pins
+protected:
+ __s32 ref = 0; // reference count
+#ifdef MDS_REF_SET
+ mempool::mds_co::flat_map<int,int> ref_map;
+#endif
+
+ public:
+ int get_num_ref(int by = -1) const {
+#ifdef MDS_REF_SET
+ if (by >= 0) {
+ if (ref_map.find(by) == ref_map.end()) {
+ return 0;
+ } else {
+ return ref_map.find(by)->second;
+ }
+ }
+#endif
+ return ref;
+ }
+ virtual std::string_view pin_name(int by) const = 0;
+ //bool is_pinned_by(int by) { return ref_set.count(by); }
+ //multiset<int>& get_ref_set() { return ref_set; }
+
+ virtual void last_put() {}
+ virtual void bad_put(int by) {
+#ifdef MDS_REF_SET
+ ceph_assert(ref_map[by] > 0);
+#endif
+ ceph_assert(ref > 0);
+ }
+ virtual void _put() {}
+ void put(int by) {
+#ifdef MDS_REF_SET
+ if (ref == 0 || ref_map[by] == 0) {
+#else
+ if (ref == 0) {
+#endif
+ bad_put(by);
+ } else {
+ ref--;
+#ifdef MDS_REF_SET
+ ref_map[by]--;
+#endif
+ if (ref == 0)
+ last_put();
+ if (state_test(STATE_NOTIFYREF))
+ _put();
+ }
+ }
+
+ virtual void first_get() {}
+ virtual void bad_get(int by) {
+#ifdef MDS_REF_SET
+ ceph_assert(by < 0 || ref_map[by] == 0);
+#endif
+ ceph_abort();
+ }
+ void get(int by) {
+ if (ref == 0)
+ first_get();
+ ref++;
+#ifdef MDS_REF_SET
+ if (ref_map.find(by) == ref_map.end())
+ ref_map[by] = 0;
+ ref_map[by]++;
+#endif
+ }
+
+ void print_pin_set(std::ostream& out) const {
+#ifdef MDS_REF_SET
+ for(auto const &p : ref_map) {
+ out << " " << pin_name(p.first) << "=" << p.second;
+ }
+#else
+ out << " nref=" << ref;
+#endif
+ }
+
+protected:
+ int auth_pins = 0; // value reported by get_num_auth_pins()
+#ifdef MDS_AUTHPIN_SET
+ mempool::mds_co::multiset<void*> auth_pin_set; // only built with MDS_AUTHPIN_SET; presumably holds the `who` pointers given to auth_pin() -- confirm in subclasses
+#endif
+
+public:
+ int get_num_auth_pins() const { return auth_pins; }
+#ifdef MDS_AUTHPIN_SET
+ void print_authpin_set(std::ostream& out) const { // writes " (<auth_pin_set>)" to out
+ out << " (" << auth_pin_set << ")";
+ }
+#endif
+
+ void dump_states(Formatter *f) const; // declared here, defined out of line
+ void dump(Formatter *f) const; // declared here, defined out of line
+
+ // --------------------------------------------
+ // auth pins
+ enum {
+ // can_auth_pin() error codes
+ ERR_NOT_AUTH = 1,
+ ERR_EXPORTING_TREE,
+ ERR_FRAGMENTING_DIR,
+ ERR_EXPORTING_INODE,
+ };
+ virtual bool can_auth_pin(int *err_code=nullptr) const = 0; // NOTE(review): err_code presumably receives one of the ERR_* codes above on refusal -- confirm in implementations
+ virtual void auth_pin(void *who) = 0;
+ virtual void auth_unpin(void *who) = 0;
+ virtual bool is_frozen() const = 0;
+ virtual bool is_freezing() const = 0;
+ virtual bool is_freezing_or_frozen() const { // default combines the two queries; virtual, so a subclass may replace it
+ return is_frozen() || is_freezing();
+ }
+
+
+ // --------------------------------------------
+ // replication (across mds cluster)
+ protected:
+ unsigned replica_nonce = 0; // [replica] defined on replica; read and written through get_replica_nonce()/set_replica_nonce()
+ typedef mempool::mds_co::compact_map<mds_rank_t,unsigned> replica_map_type; // rank -> nonce container type
+ replica_map_type replica_map; // [auth] mds -> nonce
+
+ public:
+ bool is_replicated() const { return !get_replicas().empty(); } // true when at least one rank is recorded in the replica map
+ bool is_replica(mds_rank_t mds) const { return get_replicas().count(mds); } // true when `mds` has an entry in the replica map
+ int num_replicas() const { return get_replicas().size(); } // number of ranks in the replica map
+ // Record `mds` as a replica and return its nonce: an existing entry has its
+ // nonce incremented, a new entry starts at 1.  The first replica added takes
+ // the PIN_REPLICATED reference.
+ unsigned add_replica(mds_rank_t mds) {
+   replica_map_type& replicas = get_replicas();
+   if (replicas.count(mds))
+     return ++replicas[mds];
+   if (replicas.empty())
+     get(PIN_REPLICATED);
+   return replicas[mds] = 1;
+ }
+ // Record `mds` as a replica with an explicit nonce, overwriting any existing
+ // entry.  PIN_REPLICATED is taken when the map was empty beforehand.
+ void add_replica(mds_rank_t mds, unsigned nonce) {
+   replica_map_type& replicas = get_replicas();
+   if (replicas.empty())
+     get(PIN_REPLICATED);
+   replicas[mds] = nonce;
+ }
+ // Nonce recorded for replica `mds`; asserts that the rank is a known replica.
+ unsigned get_replica_nonce(mds_rank_t mds) {
+   ceph_assert(get_replicas().count(mds));
+   replica_map_type& replicas = get_replicas();
+   return replicas[mds];
+ }
+ // Forget replica `mds` (which must be present); dropping the last replica
+ // releases the PIN_REPLICATED reference.
+ void remove_replica(mds_rank_t mds) {
+   ceph_assert(get_replicas().count(mds));
+   replica_map_type& replicas = get_replicas();
+   replicas.erase(mds);
+   if (replicas.empty())
+     put(PIN_REPLICATED);
+ }
+ // Forget every replica, releasing PIN_REPLICATED if any were recorded.
+ void clear_replica_map() {
+   const bool had_replicas = !get_replicas().empty();
+   if (had_replicas)
+     put(PIN_REPLICATED);
+   replica_map.clear();
+ }
+ replica_map_type& get_replicas() { return replica_map; } // mutable view of the rank -> nonce map
+ const replica_map_type& get_replicas() const { return replica_map; } // read-only view of the same map
+ void list_replicas(std::set<mds_rank_t>& ls) const { // inserts every replica rank into ls; existing contents of ls are kept
+ for (const auto &p : get_replicas()) {
+ ls.insert(p.first);
+ }
+ }
+
+ unsigned get_replica_nonce() const { return replica_nonce; } // this object's own nonce (the no-argument overload; the mds_rank_t overload looks a rank up in the map)
+ void set_replica_nonce(unsigned n) { replica_nonce = n; }
+
+
+ // ---------------------------------------------
+ // waiting
+ private:
+ mempool::mds_co::compact_multimap<uint64_t, std::pair<uint64_t, MDSContext*>> waiting; // wait tag (mask) -> (sequence number, context); add_waiter() stores sequence 0 for unordered waiters
+ static uint64_t last_wait_seq; // shared counter; add_waiter() increments it to number WAIT_ORDERED waiters
+
+ public:
+ // Report whether any queued waiter has a tag that shares a bit with `mask`.
+ // The scan starts at tag `min`; when `min` is 0 it defaults to the highest
+ // bit set in `mask`.
+ bool is_waiter_for(uint64_t mask, uint64_t min=0) {
+   if (min == 0) {
+     // strip the lowest set bit repeatedly until at most one bit is left
+     for (min = mask; (min & (min - 1)) != 0; )
+       min &= min - 1;
+   }
+   auto p = waiting.lower_bound(min);
+   while (p != waiting.end()) {
+     const uint64_t tag = p->first;
+     if (tag & mask)
+       return true;
+     // tags are visited in ascending order; stop once one exceeds mask
+     if (tag > mask)
+       return false;
+     ++p;
+   }
+   return false;
+ }
+ // Queue context `c` to be woken for the events in `mask`.
+ // If WAIT_ORDERED is set in `mask` the waiter is given the next sequence
+ // number and the flag is removed from the stored tag; otherwise the sequence
+ // is 0.  The first waiter queued takes the PIN_WAITER reference.
+ virtual void add_waiter(uint64_t mask, MDSContext *c) {
+   if (waiting.empty())
+     get(PIN_WAITER);
+
+   const bool ordered = (mask & WAIT_ORDERED) != 0;
+   const uint64_t seq = ordered ? ++last_wait_seq : 0;
+   if (ordered)
+     mask &= ~WAIT_ORDERED;
+   waiting.insert(std::make_pair(mask, std::make_pair(seq, c)));
+ }
+ virtual void take_waiting(uint64_t mask, MDSContext::vec& ls) { // moves every waiter whose tag intersects mask from `waiting` onto ls
+ if (waiting.empty()) return;
+
+ // process ordered waiters in the same order that they were added.
+ std::map<uint64_t, MDSContext*> ordered_waiters; // sequence number -> context, so iteration yields ascending sequence order
+
+ for (auto it = waiting.begin(); it != waiting.end(); ) {
+ if (it->first & mask) {
+ if (it->second.first > 0) { // non-zero sequence: the waiter was added with WAIT_ORDERED
+ ordered_waiters.insert(it->second);
+ } else {
+ ls.push_back(it->second.second); // unordered waiters go straight onto ls
+ }
+// pdout(10,g_conf()->debug_mds) << (mdsco_db_line_prefix(this))
+// << "take_waiting mask " << hex << mask << dec << " took " << it->second
+// << " tag " << hex << it->first << dec
+// << " on " << *this
+// << dendl;
+ waiting.erase(it++); // advance before erasing so the loop iterator is not the erased one
+ } else {
+// pdout(10,g_conf()->debug_mds) << "take_waiting mask " << hex << mask << dec << " SKIPPING " << it->second
+// << " tag " << hex << it->first << dec
+// << " on " << *this
+// << dendl;
+ ++it;
+ }
+ }
+ for (auto it = ordered_waiters.begin(); it != ordered_waiters.end(); ++it) { // ordered waiters are appended after all unordered ones
+ ls.push_back(it->second);
+ }
+ if (waiting.empty()) { // last waiter gone: release the PIN_WAITER reference taken by add_waiter()
+ put(PIN_WAITER);
+ waiting.clear();
+ }
+ }
+ void finish_waiting(uint64_t mask, int result = 0); // declared here, defined out of line
+
+ // ---------------------------------------------
+ // locking
+ // these defaults abort; subclasses that support locks must override them.
+ virtual SimpleLock* get_lock(int type) { ceph_abort(); return 0; }
+ virtual void set_object_info(MDSCacheObjectInfo &info) { ceph_abort(); }
+ virtual void encode_lock_state(int type, bufferlist& bl) { ceph_abort(); }
+ virtual void decode_lock_state(int type, const bufferlist& bl) { ceph_abort(); }
+ virtual void finish_lock_waiters(int type, uint64_t mask, int r=0) { ceph_abort(); }
+ virtual void add_lock_waiter(int type, uint64_t mask, MDSContext *c) { ceph_abort(); }
+ virtual bool is_lock_waiting(int type, uint64_t mask) { ceph_abort(); return false; }
+
+ virtual void clear_dirty_scattered(int type) { ceph_abort(); } // same pattern: aborts unless overridden
+
+ // ---------------------------------------------
+ // ordering
+ virtual bool is_lt(const MDSCacheObject *r) const = 0; // strict "less than" against r, supplied by each subclass
+ struct ptr_lt { // comparator over MDSCacheObject pointers that defers to is_lt()
+ bool operator()(const MDSCacheObject* l, const MDSCacheObject* r) const {
+ return l->is_lt(r);
+ }
+ };
+
+};
+
+inline std::ostream& operator<<(std::ostream& out, MDSCacheObject &o) { // stream an object by delegating to its print()
+ o.print(out);
+ return out;
+}
+
+inline std::ostream& operator<<(std::ostream& out, const mdsco_db_line_prefix& o) { // stream the wrapped object's print_db_line_prefix() output
+ o.object->print_db_line_prefix(out);
+ return out;
+}
+
+#endif
diff --git a/src/mds/MDSContext.cc b/src/mds/MDSContext.cc
new file mode 100644
index 00000000..b5b76847
--- /dev/null
+++ b/src/mds/MDSContext.cc
@@ -0,0 +1,140 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2012 Red Hat
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+
+#include "MDSRank.h"
+
+#include "MDSContext.h"
+
+#include "common/dout.h"
+#define dout_context g_ceph_context
+#define dout_subsys ceph_subsys_mds
+
+// Run this completion with result `r`.
+// Asserts that get_mds() is non-null and that the calling thread holds
+// mds_lock, logs the dynamic type at level 10, then hands over to
+// Context::complete().
+void MDSContext::complete(int r) {
+  MDSRank *mds = get_mds();
+  ceph_assert(mds != nullptr);
+  ceph_assert(mds->mds_lock.is_locked_by_me());
+  dout(10) << "MDSContext::complete: " << typeid(*this).name() << dendl;
+  Context::complete(r);
+}
+
+void MDSInternalContextWrapper::finish(int r) // forwards the result to the wrapped Context
+{
+ fin->complete(r);
+}
+
+struct MDSIOContextList { // list of tracked MDSIOContextBase objects plus the spinlock that guards it
+ elist<MDSIOContextBase*> list; // links objects through their list_item member
+ ceph::spinlock lock; // taken around every access to `list` in this file
+ MDSIOContextList() : list(member_offset(MDSIOContextBase, list_item)) {}
+ ~MDSIOContextList() {
+ list.clear(); // avoid assertion in elist's destructor
+ }
+} ioctx_list; // the single file-scope instance used by MDSIOContextBase
+
+// Record the creation time and, when `track` is true, append this context to
+// the global ioctx_list that check_ios_in_flight() scans.
+MDSIOContextBase::MDSIOContextBase(bool track)
+{
+  created_at = ceph::coarse_mono_clock::now();
+  if (track) {
+    // scoped guard: the spinlock is released on every path out of this block
+    std::lock_guard l(ioctx_list.lock);
+    ioctx_list.list.push_back(&list_item);
+  }
+}
+
+// Unlink this context from the global ioctx_list under its spinlock.
+// remove_myself() is called whether or not the constructor tracked the object.
+MDSIOContextBase::~MDSIOContextBase()
+{
+  // scoped guard instead of a manual lock()/unlock() pair
+  std::lock_guard l(ioctx_list.lock);
+  list_item.remove_myself();
+}
+
+// Count tracked I/O contexts created before `cutoff`.
+//
+// @param cutoff      contexts created at or after this time are not slow
+// @param slow_count  set only when something is slow: the count as text, or
+//                    "100+" once more than MAX_COUNT entries were seen
+// @param oldest      set only when something is slow: creation time of the
+//                    first slow entry encountered
+// @return true if at least one slow context was found
+bool MDSIOContextBase::check_ios_in_flight(ceph::coarse_mono_time cutoff,
+                                           std::string& slow_count,
+                                           ceph::coarse_mono_time& oldest)
+{
+  static const unsigned MAX_COUNT = 100;
+  unsigned slow = 0;
+
+  {
+    // hold the spinlock only for the list walk; the guard releases it on
+    // every exit from this scope
+    std::lock_guard l(ioctx_list.lock);
+    for (elist<MDSIOContextBase*>::iterator p = ioctx_list.list.begin(); !p.end(); ++p) {
+      MDSIOContextBase *c = *p;
+      // entries are appended at construction, so the walk stops at the first
+      // one that is not older than the cutoff
+      if (c->created_at >= cutoff)
+        break;
+      ++slow;
+      if (slow > MAX_COUNT)
+        break;
+      if (slow == 1)
+        oldest = c->created_at;
+    }
+  }
+
+  if (slow == 0)
+    return false;
+
+  if (slow > MAX_COUNT)
+    slow_count = std::to_string(MAX_COUNT) + "+";
+  else
+    slow_count = std::to_string(slow);
+  return true;
+}
+
+// Completion entry point for I/O callbacks.
+// Unlike MDSContext::complete(), which expects mds_lock to be held already,
+// this acquires mds_lock itself.  The completion is dropped (finish is not
+// run) when the daemon is stopping, and a -EBLACKLISTED result triggers
+// respawn() instead of running the completion.
+void MDSIOContextBase::complete(int r) {
+  MDSRank *mds = get_mds();
+
+  dout(10) << "MDSIOContextBase::complete: " << typeid(*this).name() << dendl;
+  ceph_assert(mds != NULL);
+  // MDSIOContext objects are handed outside the MDS, so the lock is taken
+  // here rather than by the caller.
+  std::lock_guard l(mds->mds_lock);
+
+  if (mds->is_daemon_stopping()) {
+    dout(4) << "MDSIOContextBase::complete: dropping for stopping "
+            << typeid(*this).name() << dendl;
+    return;
+  }
+
+  if (r == -EBLACKLISTED) {
+    derr << "MDSIOContextBase: blacklisted! Restarting..." << dendl;
+    mds->respawn();
+    return;
+  }
+
+  MDSContext::complete(r);
+}
+
+void MDSLogContextBase::complete(int r) { // runs pre_finish(), the I/O completion, then publishes the safe position to the log
+ MDLog *mdlog = get_mds()->mdlog;
+ uint64_t safe_pos = write_pos;
+ pre_finish(r);
+ // MDSIOContextBase::complete() frees this, which is why mdlog and safe_pos were copied to locals above
+ MDSIOContextBase::complete(r);
+ mdlog->set_safe_pos(safe_pos);
+}
+
+void MDSIOContextWrapper::finish(int r) // forwards the result to the wrapped Context
+{
+ fin->complete(r);
+}
+
+// Two-step completion: while `async` is set, the flag is cleared and this
+// object is queued on the MDS finisher with the same result; once the flag is
+// clear, the call goes through MDSIOContext::complete().
+void C_IO_Wrapper::complete(int r)
+{
+  if (!async) {
+    MDSIOContext::complete(r);
+    return;
+  }
+  async = false;
+  get_mds()->finisher->queue(this, r);
+}
diff --git a/src/mds/MDSContext.h b/src/mds/MDSContext.h
new file mode 100644
index 00000000..24269008
--- /dev/null
+++ b/src/mds/MDSContext.h
@@ -0,0 +1,212 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2012 Red Hat
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+
+#ifndef MDS_CONTEXT_H
+#define MDS_CONTEXT_H
+
+#include <vector>
+#include <deque>
+
+#include "include/Context.h"
+#include "include/elist.h"
+#include "include/spinlock.h"
+#include "common/ceph_time.h"
+
+class MDSRank;
+
+/**
+ * Completion which has access to a reference to the global MDS instance.
+ *
+ * This class exists so that Context subclasses can provide the MDS pointer
+ * from a pointer they already had, e.g. MDCache or Locker, rather than
+ * necessarily having to carry around an extra MDS* pointer.
+ */
+class MDSContext : public Context
+{
+public:
+template<template<typename> class A>
+ using vec_alloc = std::vector<MDSContext*, A<MDSContext*>>; // vector of MDSContext* using allocator template A
+ using vec = vec_alloc<std::allocator>; // the same vector with std::allocator
+
+template<template<typename> class A>
+ using que_alloc = std::deque<MDSContext*, A<MDSContext*>>; // deque of MDSContext* using allocator template A
+ using que = que_alloc<std::allocator>; // the same deque with std::allocator
+
+ void complete(int r) override; // defined in MDSContext.cc
+ virtual MDSRank *get_mds() = 0; // subclasses supply the MDSRank this completion belongs to
+};
+
+/* Children of this could have used multiple inheritance with MDSHolder and
+ * MDSContext but then get_mds() would be ambiguous.
+ */
+template<class T>
+class MDSHolder : public T // mixin: derives from T and implements get_mds() from a stored pointer
+{
+public:
+ MDSRank* get_mds() override {
+ return mds;
+ }
+
+protected:
+ MDSHolder() = delete; // a rank pointer must always be supplied
+ MDSHolder(MDSRank* mds) : mds(mds) {
+ ceph_assert(mds != nullptr); // a null rank is rejected at construction
+ }
+
+ MDSRank* mds; // non-null after construction (asserted above); nothing in this class deletes it
+};
+
+/**
+ * General purpose, lets you pass in an MDS pointer.
+ */
+class MDSInternalContext : public MDSHolder<MDSContext>
+{
+public:
+ MDSInternalContext() = delete; // must be built with an MDSRank
+
+protected:
+ explicit MDSInternalContext(MDSRank *mds_) : MDSHolder(mds_) {} // protected: only subclasses can construct one
+};
+
+/**
+ * Wrap a regular Context up as an Internal context. Useful
+ * if you're trying to work with one of our more generic frameworks.
+ */
+class MDSInternalContextWrapper : public MDSInternalContext
+{
+protected:
+ Context *fin = nullptr; // the wrapped context; finish() calls its complete()
+ void finish(int r) override; // defined in MDSContext.cc
+public:
+ MDSInternalContextWrapper(MDSRank *m, Context *c) : MDSInternalContext(m), fin(c) {}
+};
+
+class MDSIOContextBase : public MDSContext // base for I/O completions; instances can be tracked in a global list for slow-I/O checks
+{
+public:
+ MDSIOContextBase(bool track=true); // track=true links the object into the global list (see MDSContext.cc)
+ virtual ~MDSIOContextBase();
+ MDSIOContextBase(const MDSIOContextBase&) = delete; // non-copyable
+ MDSIOContextBase& operator=(const MDSIOContextBase&) = delete; // non-assignable
+
+ void complete(int r) override; // takes mds_lock itself before completing (see MDSContext.cc)
+
+ virtual void print(ostream& out) const = 0; // subclasses write a short description of themselves to out
+
+ static bool check_ios_in_flight(ceph::coarse_mono_time cutoff,
+ std::string& slow_count,
+ ceph::coarse_mono_time& oldest); // reports tracked contexts created before cutoff
+private:
+ ceph::coarse_mono_time created_at; // set in the constructor
+ elist<MDSIOContextBase*>::item list_item; // link into the global list
+
+ friend struct MDSIOContextList; // the list type needs the offset of the private list_item
+};
+
+/**
+ * Completion for a log operation; takes the big MDSRank lock
+ * before executing the finish function. Updates the log's safe pos
+ * after the finish function returns.
+ */
+class MDSLogContextBase : public MDSIOContextBase
+{
+protected:
+ uint64_t write_pos = 0; // passed to MDLog::set_safe_pos() by complete()
+public:
+ MDSLogContextBase() = default;
+ void complete(int r) final; // defined in MDSContext.cc
+ void set_write_pos(uint64_t wp) { write_pos = wp; }
+ virtual void pre_finish(int r) {} // hook run by complete() before the I/O completion; does nothing by default
+ void print(ostream& out) const override {
+ out << "log_event(" << write_pos << ")";
+ }
+};
+
+/**
+ * Completion for an I/O operation, takes big MDSRank lock
+ * before executing finish function.
+ */
+class MDSIOContext : public MDSHolder<MDSIOContextBase>
+{
+public:
+ explicit MDSIOContext(MDSRank *mds_) : MDSHolder(mds_) {} // mds_ must be non-null (asserted in MDSHolder)
+};
+
+/**
+ * Wrap a regular Context up as an IO Context. Useful
+ * if you're trying to work with one of our more generic frameworks.
+ */
+class MDSIOContextWrapper : public MDSHolder<MDSIOContextBase>
+{
+protected:
+ Context *fin; // the wrapped context; always set by the constructor
+public:
+ MDSIOContextWrapper(MDSRank *m, Context *c) : MDSHolder(m), fin(c) {}
+ void finish(int r) override; // defined in MDSContext.cc
+ void print(ostream& out) const override { // prints the wrapped pointer value
+ out << "io_context_wrapper(" << fin << ")";
+ }
+};
+
+/**
+ * No-op for callers expecting MDSInternalContext
+ */
+class C_MDSInternalNoop : public MDSContext
+{
+public:
+ void finish(int r) override {} // nothing to do
+ void complete(int r) override { delete this; } // skips MDSContext::complete() entirely and just frees the object
+protected:
+ MDSRank* get_mds() override final {ceph_abort();} // this class's own complete() never calls it; aborts if anything does
+};
+
+
+/**
+ * This class is used where you have an MDSInternalContext but
+ * you sometimes want to call it back from an I/O completion.
+ */
+class C_IO_Wrapper : public MDSIOContext
+{
+protected:
+  bool async;
+  MDSContext *wrapped;
+  // Complete the wrapped context, then drop the pointer so the destructor
+  // does not delete it as well.
+  void finish(int r) override {
+    wrapped->complete(r);
+    wrapped = nullptr;
+  }
+public:
+  // `wrapped_` must be non-null; the wrapper starts in async mode.
+  C_IO_Wrapper(MDSRank *mds_, MDSContext *wrapped_)
+    : MDSIOContext(mds_),
+      async(true),
+      wrapped(wrapped_)
+  {
+    ceph_assert(wrapped != NULL);
+  }
+
+  // If finish() never ran, the wrapped context is still held here and is
+  // deleted; deleting a null pointer is a no-op.
+  ~C_IO_Wrapper() override {
+    delete wrapped;
+    wrapped = nullptr;
+  }
+  void complete(int r) final;
+  void print(ostream& out) const override {
+    out << "io_wrapper(" << wrapped << ")";
+  }
+};
+
+using MDSGather = C_GatherBase<MDSContext, C_MDSInternalNoop>; // gather template instantiated with MDSContext and the no-op context above
+using MDSGatherBuilder = C_GatherBuilderBase<MDSContext, MDSGather>; // builder template instantiated for MDSGather
+
+using MDSContextFactory = ContextFactory<MDSContext>; // factory template instantiated for MDSContext
+
+#endif // MDS_CONTEXT_H
diff --git a/src/mds/MDSContinuation.h b/src/mds/MDSContinuation.h
new file mode 100644
index 00000000..97bae912
--- /dev/null
+++ b/src/mds/MDSContinuation.h
@@ -0,0 +1,33 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2014 Red Hat
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#include "common/Continuation.h"
+#include "mds/Mutation.h"
+#include "mds/Server.h"
+
+#include "MDSContext.h"
+
+class MDSContinuation : public Continuation { // Continuation whose stage callbacks are wrapped as MDS contexts
+protected:
+ Server *server; // supplies the MDSRank (server->mds) for the wrappers below
+ MDSContext *get_internal_callback(int stage) { // wraps get_callback(stage) in a newly allocated MDSInternalContextWrapper
+ return new MDSInternalContextWrapper(server->mds, get_callback(stage));
+ }
+ MDSIOContextBase *get_io_callback(int stage) { // wraps get_callback(stage) in a newly allocated MDSIOContextWrapper
+ return new MDSIOContextWrapper(server->mds, get_callback(stage));
+ }
+public:
+ MDSContinuation(Server *s) :
+ Continuation(NULL), server(s) {} // the base Continuation is constructed with a null argument
+};
diff --git a/src/mds/MDSDaemon.cc b/src/mds/MDSDaemon.cc
new file mode 100644
index 00000000..4ef06740
--- /dev/null
+++ b/src/mds/MDSDaemon.cc
@@ -0,0 +1,1268 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#include <unistd.h>
+
+#include "include/compat.h"
+#include "include/types.h"
+#include "include/str_list.h"
+
+#include "common/Clock.h"
+#include "common/HeartbeatMap.h"
+#include "common/Timer.h"
+#include "common/ceph_argparse.h"
+#include "common/config.h"
+#include "common/entity_name.h"
+#include "common/errno.h"
+#include "common/perf_counters.h"
+#include "common/signal.h"
+#include "common/version.h"
+
+#include "global/signal_handler.h"
+
+#include "msg/Messenger.h"
+#include "mon/MonClient.h"
+
+#include "osdc/Objecter.h"
+
+#include "MDSMap.h"
+
+#include "MDSDaemon.h"
+#include "Server.h"
+#include "Locker.h"
+
+#include "SnapServer.h"
+#include "SnapClient.h"
+
+#include "events/ESession.h"
+#include "events/ESubtreeMap.h"
+
+#include "auth/AuthAuthorizeHandler.h"
+#include "auth/RotatingKeyRing.h"
+#include "auth/KeyRing.h"
+
+#include "perfglue/cpu_profiler.h"
+#include "perfglue/heap_profiler.h"
+
+#define dout_context g_ceph_context
+#define dout_subsys ceph_subsys_mds
+#undef dout_prefix
+#define dout_prefix *_dout << "mds." << name << ' '
+
+// cons/des
+MDSDaemon::MDSDaemon(std::string_view n, Messenger *m, MonClient *mc) : // n: daemon name; m: messenger (also the source of the CephContext); mc: monitor client
+ Dispatcher(m->cct),
+ mds_lock("MDSDaemon::mds_lock"),
+ stopping(false),
+ timer(m->cct, mds_lock), // the timer is constructed with mds_lock
+ gss_ktfile_client(m->cct->_conf.get_val<std::string>("gss_ktab_client_file")),
+ beacon(m->cct, mc, n),
+ name(n),
+ messenger(m),
+ monc(mc),
+ mgrc(m->cct, m),
+ log_client(m->cct, messenger, &mc->monmap, LogClient::NO_FLAGS),
+ mds_rank(NULL), // no rank object exists yet
+ asok_hook(NULL), // created later by set_up_admin_socket()
+ starttime(mono_clock::now())
+{
+ orig_argc = 0;
+ orig_argv = NULL;
+
+ clog = log_client.create_channel();
+ if (!gss_ktfile_client.empty()) {
+ // Assert we can export environment variable
+ /*
+ The default client keytab is used, if it is present and readable,
+ to automatically obtain initial credentials for GSSAPI client
+ applications. The principal name of the first entry in the client
+ keytab is used by default when obtaining initial credentials.
+ 1. The KRB5_CLIENT_KTNAME environment variable.
+ 2. The default_client_keytab_name profile variable in [libdefaults].
+ 3. The hardcoded default, DEFCKTNAME.
+ */
+ const int32_t set_result(setenv("KRB5_CLIENT_KTNAME",
+ gss_ktfile_client.c_str(), 1)); // final argument 1: overwrite an existing value
+ ceph_assert(set_result == 0);
+ }
+
+ monc->set_messenger(messenger);
+
+ mdsmap.reset(new MDSMap); // start with a default-constructed map
+}
+
+MDSDaemon::~MDSDaemon() { // destroys the rank object, if any, while holding mds_lock
+ std::lock_guard lock(mds_lock);
+
+ delete mds_rank;
+ mds_rank = NULL;
+}
+
+// Admin socket hook that routes every command to MDSDaemon::asok_command().
+class MDSSocketHook : public AdminSocketHook {
+  MDSDaemon *mds;
+public:
+  explicit MDSSocketHook(MDSDaemon *m) : mds(m) {}
+  // Run `command` through the daemon and append whatever it wrote to `out`.
+  // Returns the daemon's "handled" flag.
+  bool call(std::string_view command, const cmdmap_t& cmdmap,
+            std::string_view format, bufferlist& out) override {
+    stringstream reply;
+    const bool handled = mds->asok_command(command, cmdmap, format, reply);
+    out.append(reply);
+    return handled;
+  }
+};
+
+// Execute one admin socket command and write its formatted output to `ss`.
+//
+// "status" is answered by the daemon itself; every other command is passed
+// to the rank object, or answered with an "mds_not_active" error when no rank
+// exists.  A bad_cmd_get thrown while handling is reported as text in `ss`.
+//
+// @param command  command name
+// @param cmdmap   parsed command arguments
+// @param format   requested output format (falls back to "json-pretty")
+// @param ss       receives the command output
+// @return true if the command was handled
+bool MDSDaemon::asok_command(std::string_view command, const cmdmap_t& cmdmap,
+                             std::string_view format, std::ostream& ss)
+{
+  dout(1) << "asok_command: " << command << " (starting...)" << dendl;
+
+  // Own the formatter so it is released on every exit path, including an
+  // exception other than bad_cmd_get leaving handle_asok_command().
+  std::unique_ptr<Formatter> f(Formatter::create(format, "json-pretty", "json-pretty"));
+  bool handled = false;
+  if (command == "status") {
+    dump_status(f.get());
+    handled = true;
+  } else {
+    if (mds_rank == NULL) {
+      dout(1) << "Can't run that command on an inactive MDS!" << dendl;
+      f->dump_string("error", "mds_not_active");
+    } else {
+      try {
+        handled = mds_rank->handle_asok_command(command, cmdmap, f.get(), ss);
+      } catch (const bad_cmd_get& e) {
+        ss << e.what();
+      }
+    }
+  }
+  f->flush(ss);
+
+  dout(1) << "asok_command: " << command << " (complete)" << dendl;
+
+  return handled;
+}
+
+// Emit the "status" object to formatter `f`: cluster fsid, rank, global id,
+// wanted and current state, rank-specific status (when a rank exists), map
+// epochs and uptime.
+void MDSDaemon::dump_status(Formatter *f)
+{
+  f->open_object_section("status");
+  f->dump_stream("cluster_fsid") << monc->get_fsid();
+  f->dump_int("whoami", mds_rank ? mds_rank->get_nodeid() : MDS_RANK_NONE);
+
+  f->dump_int("id", monc->get_global_id());
+  f->dump_string("want_state", ceph_mds_state_name(beacon.get_want_state()));
+  const auto current_state =
+    mdsmap->get_state_gid(mds_gid_t(monc->get_global_id()));
+  f->dump_string("state", ceph_mds_state_name(current_state));
+  if (mds_rank) {
+    // the rank's own dump runs under mds_lock
+    std::lock_guard l(mds_lock);
+    mds_rank->dump_status(f);
+  }
+
+  f->dump_unsigned("mdsmap_epoch", mdsmap->get_epoch());
+  if (!mds_rank) {
+    // no rank: report zero for both OSD map epochs
+    f->dump_unsigned("osdmap_epoch", 0);
+    f->dump_unsigned("osdmap_epoch_barrier", 0);
+  } else {
+    f->dump_unsigned("osdmap_epoch", mds_rank->get_osd_epoch());
+    f->dump_unsigned("osdmap_epoch_barrier", mds_rank->get_osd_epoch_barrier());
+  }
+
+  f->dump_float("uptime", get_uptime().count());
+
+  f->close_section(); // status
+}
+
+// Create the admin socket hook and register every MDS admin socket command
+// against it.  Must be called at most once before clean_up_admin_socket():
+// it asserts that no hook exists yet and that every registration succeeds.
+void MDSDaemon::set_up_admin_socket()
+{
+  AdminSocket *admin_socket = g_ceph_context->get_admin_socket();
+  ceph_assert(asok_hook == nullptr);
+  asok_hook = new MDSSocketHook(this);
+
+  // Register one command and insist that it succeeded.  Using ceph_assert
+  // for every command keeps the check uniform; the "session config"
+  // registration used to be checked with plain assert() only.
+  auto reg = [&](const char *command, const char *cmddesc, const char *help) {
+    const int r = admin_socket->register_command(command, cmddesc, asok_hook, help);
+    ceph_assert(r == 0);
+  };
+
+  reg("status", "status",
+      "high-level status of MDS");
+  reg("dump_ops_in_flight",
+      "dump_ops_in_flight",
+      "show the ops currently in flight");
+  reg("ops",
+      "ops",
+      "show the ops currently in flight");
+  reg("dump_blocked_ops", "dump_blocked_ops",
+      "show the blocked ops currently in flight");
+  reg("dump_historic_ops", "dump_historic_ops",
+      "show recent ops");
+  reg("dump_historic_ops_by_duration", "dump_historic_ops_by_duration",
+      "show recent ops, sorted by op duration");
+  reg("scrub_path",
+      "scrub_path name=path,type=CephString "
+      "name=scrubops,type=CephChoices,"
+      "strings=force|recursive|repair,n=N,req=false",
+      "scrub an inode and output results");
+  reg("tag path",
+      "tag path name=path,type=CephString"
+      " name=tag,type=CephString",
+      "Apply scrub tag recursively");
+  reg("flush_path",
+      "flush_path name=path,type=CephString",
+      "flush an inode (and its dirfrags)");
+  reg("export dir",
+      "export dir "
+      "name=path,type=CephString "
+      "name=rank,type=CephInt",
+      "migrate a subtree to named MDS");
+  reg("dump cache",
+      "dump cache name=path,type=CephString,req=false",
+      "dump metadata cache (optionally to a file)");
+  reg("cache status",
+      "cache status",
+      "show cache status");
+  reg("dump tree",
+      "dump tree "
+      "name=root,type=CephString,req=true "
+      "name=depth,type=CephInt,req=false ",
+      "dump metadata cache for subtree");
+  reg("dump loads",
+      "dump loads",
+      "dump metadata loads");
+  reg("dump snaps",
+      "dump snaps name=server,type=CephChoices,strings=--server,req=false",
+      "dump snapshots");
+  reg("session evict",
+      "session evict name=client_id,type=CephString",
+      "Evict a CephFS client");
+  reg("session ls",
+      "session ls",
+      "Enumerate connected CephFS clients");
+  reg("session config",
+      "session config name=client_id,type=CephInt,req=true "
+      "name=option,type=CephString,req=true "
+      "name=value,type=CephString,req=false ",
+      "Config a CephFS client session");
+  reg("osdmap barrier",
+      "osdmap barrier name=target_epoch,type=CephInt",
+      "Wait until the MDS has this OSD map epoch");
+  reg("flush journal",
+      "flush journal",
+      "Flush the journal to the backing store");
+  reg("force_readonly",
+      "force_readonly",
+      "Force MDS to read-only mode");
+  reg("get subtrees",
+      "get subtrees",
+      "Return the subtree map");
+  reg("dirfrag split",
+      "dirfrag split "
+      "name=path,type=CephString,req=true "
+      "name=frag,type=CephString,req=true "
+      "name=bits,type=CephInt,req=true ",
+      "Fragment directory by path");
+  reg("dirfrag merge",
+      "dirfrag merge "
+      "name=path,type=CephString,req=true "
+      "name=frag,type=CephString,req=true",
+      "De-fragment directory by path");
+  reg("dirfrag ls",
+      "dirfrag ls "
+      "name=path,type=CephString,req=true",
+      "List fragments in directory");
+  reg("openfiles ls",
+      "openfiles ls",
+      "List the opening files and their caps");
+  reg("dump inode",
+      "dump inode "
+      "name=number,type=CephInt,req=true",
+      "dump inode by inode number");
+}
+
+// Unregister every command bound to the hook, then destroy the hook.
+void MDSDaemon::clean_up_admin_socket()
+{
+  AdminSocket *admin_socket = g_ceph_context->get_admin_socket();
+  admin_socket->unregister_commands(asok_hook);
+  delete asok_hook;
+  asok_hook = nullptr;
+}
+
+int MDSDaemon::init() // brings up dispatchers, monitor client, auth, admin socket, timer and beacon; returns 0 or a negative errno
+{
+ dout(10) << sizeof(MDSCacheObject) << "\tMDSCacheObject" << dendl; // this run of dout(10) lines only logs the sizes of core MDS types
+ dout(10) << sizeof(CInode) << "\tCInode" << dendl;
+ dout(10) << sizeof(elist<void*>::item) << "\t elist<>::item *7=" << 7*sizeof(elist<void*>::item) << dendl;
+ dout(10) << sizeof(CInode::mempool_inode) << "\t inode " << dendl;
+ dout(10) << sizeof(CInode::mempool_old_inode) << "\t old_inode " << dendl;
+ dout(10) << sizeof(nest_info_t) << "\t nest_info_t " << dendl;
+ dout(10) << sizeof(frag_info_t) << "\t frag_info_t " << dendl;
+ dout(10) << sizeof(SimpleLock) << "\t SimpleLock *5=" << 5*sizeof(SimpleLock) << dendl;
+ dout(10) << sizeof(ScatterLock) << "\t ScatterLock *3=" << 3*sizeof(ScatterLock) << dendl;
+ dout(10) << sizeof(CDentry) << "\tCDentry" << dendl;
+ dout(10) << sizeof(elist<void*>::item) << "\t elist<>::item" << dendl;
+ dout(10) << sizeof(SimpleLock) << "\t SimpleLock" << dendl;
+ dout(10) << sizeof(CDir) << "\tCDir " << dendl;
+ dout(10) << sizeof(elist<void*>::item) << "\t elist<>::item *2=" << 2*sizeof(elist<void*>::item) << dendl;
+ dout(10) << sizeof(fnode_t) << "\t fnode_t " << dendl;
+ dout(10) << sizeof(nest_info_t) << "\t nest_info_t *2" << dendl;
+ dout(10) << sizeof(frag_info_t) << "\t frag_info_t *2" << dendl;
+ dout(10) << sizeof(Capability) << "\tCapability " << dendl;
+ dout(10) << sizeof(xlist<void*>::item) << "\t xlist<>::item *2=" << 2*sizeof(xlist<void*>::item) << dendl;
+
+ messenger->add_dispatcher_tail(&beacon); // the beacon is added before the daemon itself
+ messenger->add_dispatcher_tail(this);
+
+ // init monc
+ monc->set_messenger(messenger);
+
+ monc->set_want_keys(CEPH_ENTITY_TYPE_MON | CEPH_ENTITY_TYPE_OSD |
+ CEPH_ENTITY_TYPE_MDS | CEPH_ENTITY_TYPE_MGR);
+ int r = 0;
+ r = monc->init();
+ if (r < 0) {
+ derr << "ERROR: failed to init monc: " << cpp_strerror(-r) << dendl;
+ mds_lock.Lock(); // suicide() is always called with mds_lock held in this function
+ suicide();
+ mds_lock.Unlock();
+ return r;
+ }
+
+ messenger->set_auth_client(monc);
+ messenger->set_auth_server(monc);
+ monc->set_handle_authentication_dispatcher(this);
+
+ // tell monc about log_client so it will know about mon session resets
+ monc->set_log_client(&log_client);
+
+ r = monc->authenticate();
+ if (r < 0) {
+ derr << "ERROR: failed to authenticate: " << cpp_strerror(-r) << dendl;
+ mds_lock.Lock();
+ suicide();
+ mds_lock.Unlock();
+ return r;
+ }
+
+ int rotating_auth_attempts = 0;
+ auto rotating_auth_timeout =
+ g_conf().get_val<int64_t>("rotating_keys_bootstrap_timeout");
+ while (monc->wait_auth_rotating(rotating_auth_timeout) < 0) { // retry until the keys arrive or the attempt limit is exceeded
+ if (++rotating_auth_attempts <= g_conf()->max_rotating_auth_attempts) {
+ derr << "unable to obtain rotating service keys; retrying" << dendl;
+ continue;
+ }
+ derr << "ERROR: failed to refresh rotating keys, "
+ << "maximum retry time reached." << dendl;
+ mds_lock.Lock();
+ suicide();
+ mds_lock.Unlock();
+ return -ETIMEDOUT;
+ }
+
+ mds_lock.Lock();
+ if (beacon.get_want_state() == CEPH_MDS_STATE_DNE) { // want_state already DNE: nothing more to set up, and this still returns 0
+ dout(4) << __func__ << ": terminated already, dropping out" << dendl;
+ mds_lock.Unlock();
+ return 0;
+ }
+
+ monc->sub_want("mdsmap", 0, 0);
+ monc->renew_subs();
+
+ mds_lock.Unlock();
+
+ // Set up admin socket before taking mds_lock, so that ordering
+ // is consistent (later we take mds_lock within asok callbacks)
+ set_up_admin_socket();
+ mds_lock.Lock();
+ if (beacon.get_want_state() == MDSMap::STATE_DNE) { // re-check after the lock was dropped for the admin socket setup
+ suicide(); // we could do something more graceful here
+ dout(4) << __func__ << ": terminated already, dropping out" << dendl;
+ mds_lock.Unlock();
+ return 0;
+ }
+
+ timer.init();
+
+ beacon.init(*mdsmap);
+ messenger->set_myname(entity_name_t::MDS(MDS_RANK_NONE)); // no rank is held yet
+
+ // schedule tick
+ reset_tick();
+ mds_lock.Unlock();
+
+ return 0;
+}
+
+// (Re)arm the periodic tick: cancel a pending tick event, if any, and schedule
+// a new one mds_tick_interval from now.
+void MDSDaemon::reset_tick()
+{
+  if (tick_event)
+    timer.cancel_event(tick_event);
+
+  // the callback insists that mds_lock is held when the timer fires it
+  Context *on_tick = new FunctionContext([this](int) {
+      ceph_assert(mds_lock.is_locked_by_me());
+      tick();
+    });
+  tick_event = timer.add_event_after(g_conf()->mds_tick_interval, on_tick);
+}
+
+void MDSDaemon::tick() // periodic timer callback; the scheduling lambda in reset_tick() asserts mds_lock is held
+{
+ // reschedule
+ reset_tick();
+
+ // Call through to subsystems' tick functions
+ if (mds_rank) { // nothing to tick while no rank object exists
+ mds_rank->tick();
+ }
+}
+
+void MDSDaemon::send_command_reply(const MCommand::const_ref &m, MDSRank *mds_rank, // mds_rank may be null; the parameter shadows the member of the same name
+ int r, bufferlist outbl,
+ std::string_view outs) // r: result code, outbl: reply payload, outs: status text
+{
+ auto priv = m->get_connection()->get_priv();
+ auto session = static_cast<Session *>(priv.get());
+ ceph_assert(session != NULL);
+ // If someone is using a closed session for sending commands (e.g.
+ // the ceph CLI) then we should feel free to clean up this connection
+ // as soon as we've sent them a response.
+ const bool live_session =
+ session->get_state_seq() > 0 &&
+ mds_rank &&
+ mds_rank->sessionmap.get_session(session->info.inst.name); // live only if the rank's session map still knows this session
+
+ if (!live_session) {
+ // This session only existed to issue commands, so terminate it
+ // as soon as we can.
+ ceph_assert(session->is_closed());
+ session->get_connection()->mark_disposable();
+ }
+ priv.reset(); // release the session reference before building the reply
+
+ auto reply = MCommandReply::create(r, outs);
+ reply->set_tid(m->get_tid()); // the reply carries the request's tid
+ reply->set_data(outbl);
+ m->get_connection()->send_message2(reply);
+}
+
+// Handle an MCommand sent to this daemon.
+//
+// The sender's session must carry allow_all() capabilities.  The command is
+// parsed from JSON and passed to _handle_command(); unless the handler says
+// no reply is needed, a reply with the result code, output buffer and status
+// text is sent.  Any context returned through run_after is completed last,
+// after the reply has been sent.
+void MDSDaemon::handle_command(const MCommand::const_ref &m)
+{
+  auto priv = m->get_connection()->get_priv();
+  auto session = static_cast<Session *>(priv.get());
+  ceph_assert(session != NULL);
+
+  int r = 0;
+  cmdmap_t cmdmap;
+  std::stringstream ss;
+  std::string outs;
+  bufferlist outbl;
+  Context *run_after = NULL;
+  bool need_reply = true;
+
+  if (!session->auth_caps.allow_all()) {
+    dout(1) << __func__
+      << ": received command from client without `tell` capability: "
+      << *m->get_connection()->peer_addrs << dendl;
+
+    ss << "permission denied";
+    r = -EPERM;
+    // Copy the message into the reply text like the other error branches;
+    // it used to stay in `ss`, so the client got -EPERM with an empty string.
+    outs = ss.str();
+  } else if (m->cmd.empty()) {
+    r = -EINVAL;
+    ss << "no command given";
+    outs = ss.str();
+  } else if (!cmdmap_from_json(m->cmd, &cmdmap, ss)) {
+    r = -EINVAL;
+    outs = ss.str();
+  } else {
+    try {
+      r = _handle_command(cmdmap, m, &outbl, &outs, &run_after, &need_reply);
+    } catch (const bad_cmd_get& e) {
+      outs = e.what();
+      r = -EINVAL;
+    }
+  }
+  // release the session reference before replying
+  priv.reset();
+
+  if (need_reply) {
+    send_command_reply(m, mds_rank, r, outbl, outs);
+  }
+
+  if (run_after) {
+    run_after->complete(0);
+  }
+}
+
+/**
+ * Table of the commands advertised through "get_command_descriptions".
+ *
+ * Each entry pairs a command signature with its help text. The table is
+ * built once, on first use, and returned by const reference.
+ */
+const std::vector<MDSDaemon::MDSCommand>& MDSDaemon::get_commands()
+{
+  static const std::vector<MDSCommand> table = {
+    // daemon configuration and lifecycle
+    MDSCommand("injectargs name=injected_args,type=CephString,n=N",
+               "inject configuration arguments into running MDS"),
+    MDSCommand("config set name=key,type=CephString name=value,type=CephString",
+               "Set a configuration option at runtime (not persistent)"),
+    MDSCommand("config unset name=key,type=CephString",
+               "Unset a configuration option at runtime (not persistent)"),
+    MDSCommand("exit",
+               "Terminate this MDS"),
+    MDSCommand("respawn",
+               "Restart this MDS"),
+    MDSCommand("session kill name=session_id,type=CephInt",
+               "End a client session"),
+    MDSCommand("cpu_profiler name=arg,type=CephChoices,strings=status|flush",
+               "run cpu profiling on daemon"),
+
+    // session / client management
+    MDSCommand("session ls name=filters,type=CephString,n=N,req=false",
+               "List client sessions"),
+    MDSCommand("client ls name=filters,type=CephString,n=N,req=false",
+               "List client sessions"),
+    MDSCommand("session evict name=filters,type=CephString,n=N,req=false",
+               "Evict client session(s)"),
+    MDSCommand("client evict name=filters,type=CephString,n=N,req=false",
+               "Evict client session(s)"),
+    MDSCommand("session config name=client_id,type=CephInt name=option,type=CephString name=value,type=CephString,req=false",
+               "Config a client session"),
+    MDSCommand("client config name=client_id,type=CephInt name=option,type=CephString name=value,type=CephString,req=false",
+               "Config a client session"),
+
+    // damage table, version, heap and cache
+    MDSCommand("damage ls",
+               "List detected metadata damage"),
+    MDSCommand("damage rm name=damage_id,type=CephInt",
+               "Remove a damage table entry"),
+    MDSCommand("version",
+               "report version of MDS"),
+    MDSCommand("heap "
+               "name=heapcmd,type=CephChoices,strings=dump|start_profiler|stop_profiler|release|stats",
+               "show heap usage info (available only if compiled with tcmalloc)"),
+    MDSCommand("cache drop name=timeout,type=CephInt,range=0,req=false",
+               "trim cache and optionally request client to release all caps and flush the journal"),
+
+    // scrub
+    MDSCommand("scrub start name=path,type=CephString name=scrubops,type=CephChoices,strings=force|recursive|repair,n=N,req=false name=tag,type=CephString,req=false",
+               "scrub an inode and output results"),
+    MDSCommand("scrub abort",
+               "Abort in progress scrub operation(s)"),
+    MDSCommand("scrub pause",
+               "Pause in progress scrub operation(s)"),
+    MDSCommand("scrub resume",
+               "Resume paused scrub operation(s)"),
+    MDSCommand("scrub status",
+               "Status of scrub operation"),
+  };
+  return table;
+}
+
+/**
+ * Execute one parsed `tell` command.
+ *
+ * Daemon-level commands are handled here; any other prefix is offered to
+ * the rank (when one is held) through its handle_command().
+ *
+ * @param cmdmap     parsed command arguments; "prefix" selects the command
+ * @param m          the originating message
+ * @param outbl      receives the data output; must not be null
+ * @param outs       receives the status/error text; must not be null
+ * @param run_later  set to a Context the caller completes after replying
+ *                   ("exit" and "respawn" use this); also handed to the rank
+ * @param need_reply handed to the rank's handler, which may change it
+ * @return 0 on success, otherwise a negative errno value
+ */
+int MDSDaemon::_handle_command(
+    const cmdmap_t &cmdmap,
+    const MCommand::const_ref &m,
+    bufferlist *outbl,
+    std::string *outs,
+    Context **run_later,
+    bool *need_reply)
+{
+  ceph_assert(outbl != NULL);
+  ceph_assert(outs != NULL);
+
+  // Deferred action for "exit": runs after the reply has been sent.
+  class SuicideLater : public Context
+  {
+    MDSDaemon *mds;
+
+  public:
+    explicit SuicideLater(MDSDaemon *mds_) : mds(mds_) {}
+    void finish(int r) override {
+      // Wait a little to improve chances of caller getting
+      // our response before seeing us disappear from mdsmap
+      sleep(1);
+
+      mds->suicide();
+    }
+  };
+
+  // Deferred action for "respawn": runs after the reply has been sent.
+  class RespawnLater : public Context
+  {
+    MDSDaemon *mds;
+
+  public:
+    explicit RespawnLater(MDSDaemon *mds_) : mds(mds_) {}
+    void finish(int r) override {
+      // Wait a little to improve chances of caller getting
+      // our response before seeing us disappear from mdsmap
+      sleep(1);
+
+      mds->respawn();
+    }
+  };
+
+  std::stringstream ds;  // data output, appended to *outbl at `out`
+  std::stringstream ss;  // status text, copied to *outs at `out`
+  std::string prefix;
+  std::string format;
+  // NOTE(review): the formatter is created while `format` is still empty;
+  // the "format" argument is only read further down and never reaches it.
+  // Confirm whether that is intended before changing the order.
+  std::unique_ptr<Formatter> f(Formatter::create(format));
+  cmd_getval(cct, cmdmap, "prefix", prefix);
+
+  int r = 0;
+
+  if (prefix == "get_command_descriptions") {
+    // Dump every entry of get_commands() as "cmdNNN" JSON sections.
+    int cmdnum = 0;
+    std::unique_ptr<JSONFormatter> f(std::make_unique<JSONFormatter>());
+    f->open_object_section("command_descriptions");
+    for (auto& c : get_commands()) {
+      ostringstream secname;
+      secname << "cmd" << setfill('0') << std::setw(3) << cmdnum;
+      dump_cmddesc_to_json(f.get(), m->get_connection()->get_features(),
+                           secname.str(), c.cmdstring, c.helpstring,
+                           c.module, "*", 0);
+      cmdnum++;
+    }
+    f->close_section(); // command_descriptions
+
+    f->flush(ds);
+    goto out;
+  }
+
+  cmd_getval(cct, cmdmap, "format", format);
+  if (prefix == "version") {
+    if (f) {
+      f->open_object_section("version");
+      f->dump_string("version", pretty_version_to_str());
+      f->close_section();
+      f->flush(ds);
+    } else {
+      ds << pretty_version_to_str();
+    }
+  } else if (prefix == "injectargs") {
+    vector<string> argsvec;
+    cmd_getval(cct, cmdmap, "injected_args", argsvec);
+
+    if (argsvec.empty()) {
+      r = -EINVAL;
+      ss << "ignoring empty injectargs";
+      goto out;
+    }
+    // Join the individual arguments with single spaces.
+    string args = argsvec.front();
+    for (vector<string>::iterator a = ++argsvec.begin(); a != argsvec.end(); ++a)
+      args += " " + *a;
+    r = cct->_conf.injectargs(args, &ss);
+  } else if (prefix == "config set") {
+    std::string key;
+    cmd_getval(cct, cmdmap, "key", key);
+    std::string val;
+    cmd_getval(cct, cmdmap, "value", val);
+    r = cct->_conf.set_val(key, val, &ss);
+    if (r == 0) {
+      cct->_conf.apply_changes(nullptr);
+    }
+  } else if (prefix == "config unset") {
+    std::string key;
+    cmd_getval(cct, cmdmap, "key", key);
+    r = cct->_conf.rm_val(key);
+    if (r == 0) {
+      cct->_conf.apply_changes(nullptr);
+    }
+    if (r == -ENOENT) {
+      r = 0; // idempotent
+    }
+  } else if (prefix == "exit") {
+    // We will send response before executing
+    ss << "Exiting...";
+    *run_later = new SuicideLater(this);
+  } else if (prefix == "respawn") {
+    // We will send response before executing
+    ss << "Respawning...";
+    *run_later = new RespawnLater(this);
+  } else if (prefix == "session kill") {
+    if (mds_rank == NULL) {
+      r = -EINVAL;
+      ss << "MDS not active";
+      goto out;
+    }
+    // FIXME harmonize `session kill` with admin socket session evict
+    int64_t session_id = 0;
+    if (!cmd_getval(cct, cmdmap, "session_id", session_id)) {
+      // The argument comes from the client; reject the command instead of
+      // asserting, so a malformed request cannot abort the daemon.
+      r = -EINVAL;
+      ss << "missing required parameter 'session_id'";
+    } else {
+      const bool killed = mds_rank->evict_client(session_id, false,
+          g_conf()->mds_session_blacklist_on_evict,
+          ss);
+      if (!killed)
+        r = -ENOENT;
+    }
+  } else if (prefix == "heap") {
+    if (!ceph_using_tcmalloc()) {
+      r = -EOPNOTSUPP;
+      ss << "could not issue heap profiler command -- not using tcmalloc!";
+    } else {
+      string heapcmd;
+      cmd_getval(cct, cmdmap, "heapcmd", heapcmd);
+      vector<string> heapcmd_vec;
+      get_str_vec(heapcmd, heapcmd_vec);
+      string value;
+      if (cmd_getval(cct, cmdmap, "value", value))
+        heapcmd_vec.push_back(value);
+      ceph_heap_profiler_handle_command(heapcmd_vec, ds);
+    }
+  } else if (prefix == "cpu_profiler") {
+    string arg;
+    cmd_getval(cct, cmdmap, "arg", arg);
+    vector<string> argvec;
+    get_str_vec(arg, argvec);
+    cpu_profiler_handle_command(argvec, ds);
+  } else {
+    // Give MDSRank a shot at the command
+    if (!mds_rank) {
+      ss << "MDS not active";
+      r = -EINVAL;
+    }
+    else {
+      bool handled = false;
+      try {
+        handled = mds_rank->handle_command(cmdmap, m, &r, &ds, &ss,
+                                           run_later, need_reply);
+        if (!handled) {
+          // MDSDaemon doesn't know this command
+          ss << "unrecognized command! " << prefix;
+          r = -EINVAL;
+        }
+      } catch (const bad_cmd_get& e) {
+        ss << e.what();
+        r = -EINVAL;
+      }
+    }
+  }
+
+out:
+  *outs = ss.str();
+  outbl->append(ds);
+  return r;
+}
+
+void MDSDaemon::handle_mds_map(const MMDSMap::const_ref &m)
+{ /*
+ * Process an MDSMap message.
+ *
+ * A map whose epoch is not newer than the current one is discarded.
+ * Otherwise the new map is decoded into `mdsmap` while the previous one is
+ * kept in `oldmap` for comparison. The function then:
+ *   - kills the daemon if the map's compat set is not writeable by us,
+ *   - respawns if the new map no longer lists this gid,
+ *   - initialises the MgrClient the first time this gid appears in a map,
+ *   - marks down the addresses of peers that vanished from the map,
+ *   - either tracks standby state (no rank held) or creates the rank
+ *     dispatcher on first assignment and lets it process the map,
+ *   - finally notifies the beacon of the new map.
+ */
+ version_t epoch = m->get_epoch();
+
+ // is it new?
+ if (epoch <= mdsmap->get_epoch()) {
+ dout(5) << "handle_mds_map old map epoch " << epoch << " <= "
+ << mdsmap->get_epoch() << ", discarding" << dendl;
+ return;
+ }
+
+ dout(1) << "Updating MDS map to version " << epoch << " from " << m->get_source() << dendl;
+
+ // keep old map, for a moment
+ std::unique_ptr<MDSMap> oldmap;
+ oldmap.swap(mdsmap); // mdsmap is empty until the reset() below
+
+ // decode and process
+ mdsmap.reset(new MDSMap);
+ mdsmap->decode(m->get_encoded());
+
+ monc->sub_got("mdsmap", mdsmap->get_epoch());
+
+ // verify compatset
+ CompatSet mdsmap_compat(MDSMap::get_compat_set_all());
+ dout(10) << " my compat " << mdsmap_compat << dendl;
+ dout(10) << " mdsmap compat " << mdsmap->compat << dendl;
+ if (!mdsmap_compat.writeable(mdsmap->compat)) {
+ dout(0) << "handle_mds_map mdsmap compatset " << mdsmap->compat
+ << " not writeable with daemon features " << mdsmap_compat
+ << ", killing myself" << dendl;
+ suicide();
+ return;
+ }
+
+ // Calculate my effective rank (either my owned rank or the rank I'm following if STATE_STANDBY_REPLAY)
+ const auto addrs = messenger->get_myaddrs();
+ const auto myid = monc->get_global_id();
+ const auto mygid = mds_gid_t(myid);
+ const auto whoami = mdsmap->get_rank_gid(mygid);
+ const auto old_state = oldmap->get_state_gid(mygid);
+ const auto new_state = mdsmap->get_state_gid(mygid);
+ const auto incarnation = mdsmap->get_inc_gid(mygid);
+ dout(10) << "my gid is " << myid << dendl;
+ dout(10) << "map says I am mds." << whoami << "." << incarnation
+ << " state " << ceph_mds_state_name(new_state) << dendl;
+ dout(10) << "msgr says I am " << addrs << dendl;
+
+ // If we're removed from the MDSMap, stop all processing.
+ using DS = MDSMap::DaemonState;
+ if (old_state != DS::STATE_NULL && new_state == DS::STATE_NULL) {
+ const auto& oldinfo = oldmap->get_info_gid(mygid);
+ dout(1) << "Map removed me " << oldinfo
+ << " from cluster; respawning! See cluster/monitor logs for details." << dendl;
+ respawn(); // execs a new process or aborts; does not return
+ }
+
+ if (old_state == DS::STATE_NULL && new_state != DS::STATE_NULL) {
+ /* The MDS has been added to the FSMap, now we can init the MgrClient */
+ mgrc.init();
+ messenger->add_dispatcher_tail(&mgrc);
+ monc->sub_want("mgrmap", 0, 0);
+ monc->renew_subs(); /* MgrMap receipt drives connection to ceph-mgr */
+ }
+
+ // mark down any failed peers
+ for (const auto& [gid, info] : oldmap->get_mds_info()) {
+ if (mdsmap->get_mds_info().count(gid) == 0) {
+ dout(10) << " peer mds gid " << gid << " removed from map" << dendl;
+ messenger->mark_down_addrs(info.addrs);
+ }
+ }
+
+ if (whoami == MDS_RANK_NONE) {
+ // We do not hold a rank:
+ dout(10) << __func__ << ": handling map in rankless mode" << dendl;
+
+ if (new_state == DS::STATE_STANDBY) {
+ /* Note: STATE_BOOT is never an actual state in the FSMap. The Monitors
+ * generally mark a new MDS as STANDBY (although it's possible to
+ * immediately be assigned a rank).
+ */
+ if (old_state == DS::STATE_NULL) {
+ dout(1) << "Monitors have assigned me to become a standby." << dendl;
+ beacon.set_want_state(*mdsmap, new_state);
+ } else if (old_state == DS::STATE_STANDBY) {
+ dout(5) << "I am still standby" << dendl;
+ }
+ } else if (new_state == DS::STATE_NULL) {
+ /* We are not in the MDSMap yet! Keep waiting: */
+ ceph_assert(beacon.get_want_state() == DS::STATE_BOOT);
+ dout(10) << "not in map yet" << dendl;
+ } else {
+ /* We moved to standby somehow from another state */
+ ceph_abort("invalid transition to standby"); // rankless, but state is neither STANDBY nor NULL
+ }
+ } else {
+ // Did we already hold a different rank? MDSMonitor shouldn't try
+ // to change that out from under me!
+ if (mds_rank && whoami != mds_rank->get_nodeid()) {
+ derr << "Invalid rank transition " << mds_rank->get_nodeid() << "->"
+ << whoami << dendl;
+ respawn(); // execs a new process or aborts; does not return
+ }
+
+ // Did I previously not hold a rank? Initialize!
+ if (mds_rank == NULL) {
+ mds_rank = new MDSRankDispatcher(whoami, mds_lock, clog,
+ timer, beacon, mdsmap, messenger, monc, &mgrc,
+ new FunctionContext([this](int r){respawn();}),
+ new FunctionContext([this](int r){suicide();}));
+ dout(10) << __func__ << ": initializing MDS rank "
+ << mds_rank->get_nodeid() << dendl;
+ mds_rank->init();
+ }
+
+ // MDSRank is active: let him process the map, we have no say.
+ dout(10) << __func__ << ": handling map as rank "
+ << mds_rank->get_nodeid() << dendl;
+ mds_rank->handle_mds_map(m, *oldmap);
+ }
+
+ beacon.notify_mdsmap(*mdsmap);
+}
+
+/**
+ * React to SIGINT/SIGTERM by shutting the daemon down, unless a shutdown is
+ * already in progress.
+ *
+ * @param signum the delivered signal; must be SIGINT or SIGTERM
+ */
+void MDSDaemon::handle_signal(int signum)
+{
+  ceph_assert(signum == SIGINT || signum == SIGTERM);
+  derr << "*** got signal " << sig_str(signum) << " ***" << dendl;
+
+  // suicide() requires mds_lock and must not be entered twice.
+  std::lock_guard l(mds_lock);
+  if (!stopping) {
+    suicide();
+  }
+}
+
+/**
+ * Shut the daemon down in place.
+ *
+ * Must be called with mds_lock held and at most once. Sets `stopping`,
+ * cancels the tick, removes the admin socket hooks, announces STATE_DNE via
+ * the beacon, then shuts down the beacon, the mgr client and either the
+ * rank or (when rankless) the timer, monitor client and messenger.
+ */
+void MDSDaemon::suicide()
+{
+  ceph_assert(mds_lock.is_locked());
+
+  // make sure we don't suicide twice
+  ceph_assert(stopping == false);
+  stopping = true;
+
+  dout(1) << "suicide! Wanted state "
+          << ceph_mds_state_name(beacon.get_want_state()) << dendl;
+
+  // No more ticks from here on.
+  if (tick_event) {
+    timer.cancel_event(tick_event);
+    tick_event = nullptr;
+  }
+
+  clean_up_admin_socket();
+
+  // Announce that we are going away, then shut down the beacon.
+  beacon.set_want_state(*mdsmap, MDSMap::STATE_DNE);
+  const auto my_gid = mds_gid_t(monc->get_global_id());
+  const bool in_map = !mdsmap->is_dne_gid(my_gid);
+  if (in_map) {
+    // Notify the MDSMonitor that we're dying, so that it doesn't have to
+    // wait for us to go laggy.  Only do this if we're actually in the
+    // MDSMap, because otherwise the MDSMonitor will drop our message.
+    beacon.send_and_wait(1);
+  }
+  beacon.shutdown();
+
+  if (mgrc.is_initialized()) {
+    mgrc.shutdown();
+  }
+
+  // With a rank, only the rank is shut down here.
+  if (mds_rank) {
+    mds_rank->shutdown();
+    return;
+  }
+
+  // Rankless: tear the shared services down ourselves.
+  timer.shutdown();
+  monc->shutdown();
+  messenger->shutdown();
+}
+
+/**
+ * Replace this process with a fresh instance of the daemon, started with
+ * the original command line (orig_argc / orig_argv).
+ *
+ * Never returns: on success execv() replaces the process image, and if
+ * execv() fails the function aborts.
+ */
+void MDSDaemon::respawn()
+{
+  // --- WARNING TO FUTURE COPY/PASTERS ---
+  // You must also add a call like
+  //
+  //   ceph_pthread_setname(pthread_self(), "ceph-mds");
+  //
+  // to main() so that /proc/$pid/stat field 2 contains "(ceph-mds)"
+  // instead of "(exe)", so that killall (and log rotation) will work.
+
+  dout(1) << "respawn!" << dendl;
+
+  /* Dump recent in case the MDS was stuck doing something which caused it to
+   * be removed from the MDSMap leading to respawn. */
+  g_ceph_context->_log->dump_recent();
+
+  // execv() takes a NULL-terminated array of char*. A std::vector replaces
+  // the former variable-length array, which is not standard C++.
+  std::vector<char *> new_argv;
+  dout(1) << " e: '" << orig_argv[0] << "'" << dendl;
+  for (int i=0; i<orig_argc; i++) {
+    new_argv.push_back(const_cast<char *>(orig_argv[i]));
+    dout(1) << " " << i << ": '" << orig_argv[i] << "'" << dendl;
+  }
+  new_argv.push_back(nullptr);
+
+  /* Determine the path to our executable, test if Linux /proc/self/exe exists.
+   * This allows us to exec the same executable even if it has since been
+   * unlinked.
+   */
+  char exe_path[PATH_MAX] = "";
+#ifdef PROCPREFIX
+  if (readlink(PROCPREFIX "/proc/self/exe", exe_path, PATH_MAX-1) != -1) {
+    // The link target is only logged; the exec goes through the link itself.
+    dout(1) << "respawning with exe " << exe_path << dendl;
+    strcpy(exe_path, PROCPREFIX "/proc/self/exe");
+  } else {
+#else
+  {
+#endif
+    /* Print CWD for the user's interest */
+    char buf[PATH_MAX];
+    char *cwd = getcwd(buf, sizeof(buf));
+    ceph_assert(cwd);
+    dout(1) << " cwd " << cwd << dendl;
+
+    /* Fall back to a best-effort: just running in our CWD */
+    // exe_path was zero-initialised, so copying at most PATH_MAX-1 bytes
+    // leaves it NUL-terminated.
+    strncpy(exe_path, orig_argv[0], PATH_MAX-1);
+  }
+
+  dout(1) << " exe_path " << exe_path << dendl;
+
+  unblock_all_signals(NULL);
+  execv(exe_path, new_argv.data());
+
+  // Only reached when execv() failed.
+  dout(0) << "respawn execv " << orig_argv[0]
+          << " failed with " << cpp_strerror(errno) << dendl;
+
+  // We have to assert out here, because suicide() returns, and callers
+  // to respawn expect it never to return.
+  ceph_abort();
+}
+
+
+
+/**
+ * Messenger dispatch entry point.
+ *
+ * Takes mds_lock, then tries the message as a daemon ("core") message and,
+ * failing that, hands it to the rank.
+ *
+ * @return true if the message was consumed (or deliberately discarded),
+ *         false if nobody here handled it
+ */
+bool MDSDaemon::ms_dispatch2(const Message::ref &m)
+{
+  std::lock_guard l(mds_lock);
+  if (stopping) {
+    return false;
+  }
+
+  // Drop out early if shutting down
+  if (beacon.get_want_state() == CEPH_MDS_STATE_DNE) {
+    dout(10) << " stopping, discarding " << *m << dendl;
+    return true;
+  }
+
+  // Daemon-level messages take precedence.
+  if (handle_core_message(m)) {
+    return true;
+  }
+
+  // Not core: only a rank could still want it.
+  if (!mds_rank) {
+    return false;
+  }
+  return mds_rank->ms_dispatch(m);
+}
+
+/**
+ * Provide an authorizer for an outgoing connection to `dest_type`.
+ *
+ * @param dest_type  entity type of the peer being connected to
+ * @param authorizer receives the authorizer built by the monitor client;
+ *                   left untouched for monitor peers
+ * @return true for monitor peers, otherwise whether an authorizer was built
+ */
+bool MDSDaemon::ms_get_authorizer(int dest_type, AuthAuthorizer **authorizer)
+{
+  dout(10) << "MDSDaemon::ms_get_authorizer type="
+           << ceph_entity_type_name(dest_type) << dendl;
+
+  if (dest_type != CEPH_ENTITY_TYPE_MON) {
+    *authorizer = monc->build_authorizer(dest_type);
+    return *authorizer != NULL;
+  }
+
+  /* monitor authorization is being handled on different layer */
+  return true;
+}
+
+
+/*
+ * high priority messages we always process
+ *
+ * ALLOW_MESSAGES_FROM(peers) filters a message by the type of the peer that
+ * sent it. It expects a message named `m` in the enclosing scope. When the
+ * message has a connection whose peer type shares no bit with `peers`, the
+ * macro logs the rejection at level 0 and makes the ENCLOSING FUNCTION
+ * return true, i.e. the message is reported as handled and dropped.
+ * Messages without a connection are let through.
+ */
+
+#define ALLOW_MESSAGES_FROM(peers) \
+ do { \
+ if (m->get_connection() && (m->get_connection()->get_peer_type() & (peers)) == 0) { \
+ dout(0) << __FILE__ << "." << __LINE__ << ": filtered out request, peer=" \
+ << m->get_connection()->get_peer_type() << " allowing=" \
+ << #peers << " message=" << *m << dendl; \
+ return true; \
+ } \
+ } while (0)
+
+/**
+ * Handle the message types the daemon processes itself, independent of any
+ * rank.
+ *
+ * Note that ALLOW_MESSAGES_FROM() returns true from this function when the
+ * sender's peer type is not in the allowed set.
+ *
+ * @return true if the message type is a core type (handled or filtered),
+ *         false if it should be offered to the rank instead
+ */
+bool MDSDaemon::handle_core_message(const Message::const_ref &m)
+{
+  switch (m->get_type()) {
+  // Commands: no peer-type filter here.
+  case MSG_COMMAND:
+    handle_command(MCommand::msgref_cast(m));
+    return true;
+
+  case MSG_MON_COMMAND:
+    ALLOW_MESSAGES_FROM(CEPH_ENTITY_TYPE_MON);
+    clog->warn() << "dropping `mds tell` command from legacy monitor";
+    return true;
+
+  // Cluster maps
+  case CEPH_MSG_MON_MAP:
+    ALLOW_MESSAGES_FROM(CEPH_ENTITY_TYPE_MON);
+    return true;
+
+  case CEPH_MSG_MDS_MAP:
+    ALLOW_MESSAGES_FROM(CEPH_ENTITY_TYPE_MON | CEPH_ENTITY_TYPE_MDS);
+    handle_mds_map(MMDSMap::msgref_cast(m));
+    return true;
+
+  case CEPH_MSG_OSD_MAP:
+    ALLOW_MESSAGES_FROM(CEPH_ENTITY_TYPE_MON | CEPH_ENTITY_TYPE_OSD);
+    if (mds_rank) {
+      mds_rank->handle_osd_map();
+    }
+    return true;
+
+  default:
+    return false;
+  }
+}
+
+/**
+ * Dispatcher hook for an established outgoing connection; intentionally a
+ * no-op for the MDS daemon.
+ */
+void MDSDaemon::ms_handle_connect(Connection *con)
+{
+  // nothing to do
+}
+
+/**
+ * Dispatcher hook for a reset connection.
+ *
+ * Only client connections are considered. A connection without a Session
+ * is marked down; one whose Session is closed is marked down and detached
+ * from the Session.
+ *
+ * @return always false
+ */
+bool MDSDaemon::ms_handle_reset(Connection *con)
+{
+  if (con->get_peer_type() != CEPH_ENTITY_TYPE_CLIENT) {
+    return false;
+  }
+
+  std::lock_guard l(mds_lock);
+  if (stopping) {
+    return false;
+  }
+  dout(5) << "ms_handle_reset on " << con->get_peer_socket_addr() << dendl;
+  if (beacon.get_want_state() == CEPH_MDS_STATE_DNE) {
+    return false;
+  }
+
+  auto priv = con->get_priv();
+  auto *session = static_cast<Session *>(priv.get());
+
+  // No session attached: just drop the connection.
+  if (!session) {
+    con->mark_down();
+    return false;
+  }
+
+  if (session->is_closed()) {
+    dout(3) << "ms_handle_reset closing connection for session " << session->info.inst << dendl;
+    con->mark_down();
+    con->set_priv(nullptr);
+  }
+  return false;
+}
+
+
+/**
+ * Dispatcher hook for a connection reset by the remote side.
+ *
+ * Only client connections are considered; if the attached Session is
+ * closed, the connection is marked down and detached from the Session.
+ */
+void MDSDaemon::ms_handle_remote_reset(Connection *con)
+{
+  if (con->get_peer_type() != CEPH_ENTITY_TYPE_CLIENT) {
+    return;
+  }
+
+  std::lock_guard l(mds_lock);
+  if (stopping) {
+    return;
+  }
+
+  dout(5) << "ms_handle_remote_reset on " << con->get_peer_socket_addr() << dendl;
+  if (beacon.get_want_state() == CEPH_MDS_STATE_DNE) {
+    return;
+  }
+
+  auto priv = con->get_priv();
+  auto *session = static_cast<Session *>(priv.get());
+  if (!session || !session->is_closed()) {
+    // Nothing attached, or the session is still in use.
+    return;
+  }
+
+  dout(3) << "ms_handle_remote_reset closing connection for session " << session->info.inst << dendl;
+  con->mark_down();
+  con->set_priv(nullptr);
+}
+
+/**
+ * Dispatcher hook for a refused connection attempt.
+ *
+ * @return always false; refusals are currently ignored
+ */
+bool MDSDaemon::ms_handle_refused(Connection *con)
+{
+  // Intentionally unhandled for now.
+  return false;
+}
+
+/**
+ * @return the monitor client's rotating secrets, exposed as a KeyStore
+ */
+KeyStore *MDSDaemon::ms_get_auth1_authorizer_keystore()
+{
+  KeyStore *keystore = monc->rotating_secrets.get();
+  return keystore;
+}
+
+/**
+ * Turn the capability info of an authenticated peer into MDSAuthCaps.
+ *
+ * @param info capability info; either the allow_all flag or an encoded
+ *             capability string
+ * @param caps cleared first, then filled in
+ * @return true on success, false if the capability string could not be
+ *         decoded or parsed
+ */
+bool MDSDaemon::parse_caps(const AuthCapsInfo& info, MDSAuthCaps& caps)
+{
+  caps.clear();
+
+  // Blanket permission: no string to parse.
+  if (info.allow_all) {
+    caps.set_allow_all();
+    return true;
+  }
+
+  auto it = info.caps.begin();
+  string auth_cap_str;
+  try {
+    decode(auth_cap_str, it);
+  } catch (const buffer::error& e) {
+    dout(1) << __func__ << ": cannot decode auth caps buffer of length " << info.caps.length() << dendl;
+    return false;
+  }
+
+  dout(10) << __func__ << ": parsing auth_cap_str='" << auth_cap_str << "'" << dendl;
+  CachedStackStringStream cs;
+  if (!caps.parse(g_ceph_context, auth_cap_str, cs.get())) {
+    dout(1) << __func__ << ": auth cap parse error: " << cs->strv() << " parsing '" << auth_cap_str << "'" << dendl;
+    return false;
+  }
+  return true;
+}
+
+/**
+ * Validate the capabilities presented by a newly authenticated peer.
+ *
+ * The parsed caps are discarded; only whether they parse matters here.
+ *
+ * @return 0 if the peer's caps parse, -1 otherwise
+ */
+int MDSDaemon::ms_handle_authentication(Connection *con)
+{
+  /* N.B. without mds_lock! */
+  MDSAuthCaps caps;
+  if (parse_caps(con->get_peer_caps_info(), caps)) {
+    return 0;
+  }
+  return -1;
+}
+
+/**
+ * Dispatcher hook for an accepted incoming connection.
+ *
+ * Attaches a Session to the connection (re-using the rank's existing
+ * Session for that peer when there is one), parses the peer's caps into
+ * the Session and, if the Session was bound to a different connection,
+ * rebinds it and flushes its queued pre-open messages.
+ */
+void MDSDaemon::ms_handle_accept(Connection *con)
+{
+  entity_name_t peer(con->get_peer_type(), con->get_peer_global_id());
+  std::lock_guard l(mds_lock);
+  if (stopping) {
+    return;
+  }
+
+  // We allow connections and assign Session instances to connections
+  // even if we have not been assigned a rank, because clients with
+  // "allow *" are allowed to connect and do 'tell' operations before
+  // we have a rank.
+  Session *sess = NULL;
+  if (mds_rank) {
+    // If we do hold a rank, see if this is an existing client establishing
+    // a new connection, rather than a new client
+    sess = mds_rank->sessionmap.get_session(peer);
+  }
+
+  // Wire up a Session* to this connection
+  // It doesn't go into a SessionMap instance until it sends an explicit
+  // request to open a session (initial state of Session is `closed`)
+  if (sess) {
+    dout(10) << " existing session " << sess << " for " << sess->info.inst
+             << " existing con " << sess->get_connection()
+             << ", new/authorizing con " << con << dendl;
+    con->set_priv(RefCountedPtr{sess});
+  } else {
+    sess = new Session(con);
+    dout(10) << " new session " << sess << " for " << sess->info.inst
+             << " con " << con << dendl;
+    con->set_priv(RefCountedPtr{sess, false});
+    if (mds_rank) {
+      mds_rank->kick_waiters_for_any_client_connection();
+    }
+  }
+
+  parse_caps(con->get_peer_caps_info(), sess->auth_caps);
+
+  dout(10) << "ms_handle_accept " << con->get_peer_socket_addr() << " con " << con << " session " << sess << dendl;
+
+  // Already bound to this connection: nothing left to do.
+  if (sess->get_connection() == con) {
+    return;
+  }
+
+  dout(10) << " session connection " << sess->get_connection()
+           << " -> " << con << dendl;
+  sess->set_connection(con);
+
+  // send out any queued messages
+  for (auto &pending = sess->preopen_out_queue; !pending.empty(); pending.pop_front()) {
+    con->send_message2(pending.front());
+  }
+}
+
+/**
+ * @return true when no rank is held, otherwise whether the rank reports
+ *         itself stopped
+ */
+bool MDSDaemon::is_clean_shutdown()
+{
+  if (!mds_rank) {
+    return true;
+  }
+  return mds_rank->is_stopped();
+}
diff --git a/src/mds/MDSDaemon.h b/src/mds/MDSDaemon.h
new file mode 100644
index 00000000..8add46d6
--- /dev/null
+++ b/src/mds/MDSDaemon.h
@@ -0,0 +1,176 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#ifndef CEPH_MDS_H
+#define CEPH_MDS_H
+
+#include <string_view>
+
+#include "messages/MCommand.h"
+#include "messages/MCommandReply.h"
+#include "messages/MGenericMessage.h"
+#include "messages/MMDSMap.h"
+#include "messages/MMonCommand.h"
+
+#include "common/LogClient.h"
+#include "common/Mutex.h"
+#include "common/Timer.h"
+#include "include/Context.h"
+#include "include/types.h"
+#include "mgr/MgrClient.h"
+#include "msg/Dispatcher.h"
+
+#include "Beacon.h"
+#include "MDSMap.h"
+#include "MDSRank.h"
+
+#define CEPH_MDS_PROTOCOL 34 /* cluster internal */
+
+class Messenger;
+class MonClient;
+
+class MDSDaemon : public Dispatcher { /*
+ * Top-level object of the ceph-mds process.
+ *
+ * It receives messages as a Dispatcher, handles the daemon-level ones
+ * itself (MDS map updates, `tell` commands) and forwards everything else
+ * to the MDSRankDispatcher it creates once an MDS map assigns it a rank.
+ * It also owns the periodic tick and the suicide()/respawn() paths.
+ */
+ public:
+ /* Global MDS lock: every time someone takes this, they must
+ * also check the `stopping` flag. If stopping is true, you
+ * must either do nothing and immediately drop the lock, or
+ * never drop the lock again (i.e. call respawn()) */
+ Mutex mds_lock;
+ bool stopping; // set to true by suicide()
+
+ SafeTimer timer; // carries tick_event, see reset_tick()
+ std::string gss_ktfile_client{};
+
+ mono_time get_starttime() const {
+ return starttime;
+ }
+ chrono::duration<double> get_uptime() const { // time elapsed since starttime
+ mono_time now = mono_clock::now();
+ return chrono::duration<double>(now-starttime);
+ }
+
+ protected:
+ Beacon beacon;
+
+ std::string name;
+
+ Messenger *messenger;
+ MonClient *monc;
+ MgrClient mgrc; // initialised in handle_mds_map() once this daemon appears in the map
+ std::unique_ptr<MDSMap> mdsmap; // most recently decoded MDS map
+ LogClient log_client;
+ LogChannelRef clog;
+
+ MDSRankDispatcher *mds_rank; // created by handle_mds_map() when a rank is assigned
+
+ public:
+ MDSDaemon(std::string_view n, Messenger *m, MonClient *mc);
+ ~MDSDaemon() override;
+ int orig_argc; // original command line, re-used by respawn()
+ const char **orig_argv;
+
+ // handle a signal (e.g., SIGTERM)
+ void handle_signal(int signum);
+
+ int init();
+
+ /**
+ * Hint at whether we were shutdown gracefully (i.e. we were only
+ * in standby, or our rank was stopped). Should be removed once
+ * we handle shutdown properly (e.g. clear out all message queues)
+ * such that deleting xlists doesn't assert.
+ */
+ bool is_clean_shutdown();
+ protected:
+ // tick and other timer fun
+ Context *tick_event = nullptr; // currently queued tick, if any
+ void reset_tick(); // cancel the queued tick and queue a new one
+
+ void wait_for_omap_osds();
+
+ private:
+ bool ms_dispatch2(const Message::ref &m) override; // Dispatcher interface
+ bool ms_get_authorizer(int dest_type, AuthAuthorizer **authorizer) override;
+ int ms_handle_authentication(Connection *con) override;
+ KeyStore *ms_get_auth1_authorizer_keystore() override;
+ void ms_handle_accept(Connection *con) override;
+ void ms_handle_connect(Connection *con) override;
+ bool ms_handle_reset(Connection *con) override;
+ void ms_handle_remote_reset(Connection *con) override;
+ bool ms_handle_refused(Connection *con) override;
+
+ protected:
+ // admin socket handling
+ friend class MDSSocketHook;
+ class MDSSocketHook *asok_hook;
+ void set_up_admin_socket();
+ void clean_up_admin_socket();
+ void check_ops_in_flight(); // send off any slow ops to monitor
+ bool asok_command(std::string_view command, const cmdmap_t& cmdmap,
+ std::string_view format, ostream& ss);
+
+ void dump_status(Formatter *f);
+
+ /**
+ * Terminate this daemon process.
+ *
+ * This function will return, but once it does so the calling thread
+ * must do no more work as all subsystems will have been shut down.
+ */
+ void suicide();
+
+ /**
+ * Start a new daemon process with the same command line parameters that
+ * this process was run with, then terminate this process
+ */
+ void respawn();
+
+ void tick(); // re-arms the tick, then ticks the rank if one is held
+
+protected:
+ bool handle_core_message(const Message::const_ref &m); // true if m was a daemon-level message
+
+ // special message types
+ friend class C_MDS_Send_Command_Reply;
+ static void send_command_reply(const MCommand::const_ref &m, MDSRank* mds_rank, int r,
+ bufferlist outbl, std::string_view outs);
+ int _handle_command(
+ const cmdmap_t &cmdmap,
+ const MCommand::const_ref &m,
+ bufferlist *outbl,
+ std::string *outs,
+ Context **run_later,
+ bool *need_reply);
+ void handle_command(const MCommand::const_ref &m);
+ void handle_mds_map(const MMDSMap::const_ref &m);
+
+private:
+ struct MDSCommand { // one entry of the table returned by get_commands()
+ MDSCommand(std::string_view signature, std::string_view help)
+ : cmdstring(signature), helpstring(help)
+ {}
+
+ std::string cmdstring; // command signature
+ std::string helpstring; // help text
+ std::string module = "mds";
+ };
+
+ static const std::vector<MDSCommand>& get_commands();
+
+ bool parse_caps(const AuthCapsInfo&, MDSAuthCaps&); // false if the caps cannot be decoded/parsed
+
+ mono_time starttime = mono_clock::zero();
+};
+
+#endif
diff --git a/src/mds/MDSMap.cc b/src/mds/MDSMap.cc
new file mode 100644
index 00000000..27753c6e
--- /dev/null
+++ b/src/mds/MDSMap.cc
@@ -0,0 +1,930 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#include "common/debug.h"
+#include "mon/health_check.h"
+
+#include "MDSMap.h"
+
+#include <sstream>
+using std::stringstream;
+
+#define dout_context g_ceph_context
+#define dout_subsys ceph_subsys_
+
+// features
+CompatSet MDSMap::get_compat_set_all() {
+ // The compat and ro_compat sets stay empty; all ten incompat features
+ // declared for the MDS are inserted, MDS_FEATURE_INCOMPAT_INLINE included.
+ CompatSet::FeatureSet compat, ro_compat, incompat;
+ incompat.insert(MDS_FEATURE_INCOMPAT_BASE);
+ incompat.insert(MDS_FEATURE_INCOMPAT_CLIENTRANGES);
+ incompat.insert(MDS_FEATURE_INCOMPAT_FILELAYOUT);
+ incompat.insert(MDS_FEATURE_INCOMPAT_DIRINODE);
+ incompat.insert(MDS_FEATURE_INCOMPAT_ENCODING);
+ incompat.insert(MDS_FEATURE_INCOMPAT_OMAPDIRFRAG);
+ incompat.insert(MDS_FEATURE_INCOMPAT_INLINE);
+ incompat.insert(MDS_FEATURE_INCOMPAT_NOANCHOR);
+ incompat.insert(MDS_FEATURE_INCOMPAT_FILE_LAYOUT_V2);
+ incompat.insert(MDS_FEATURE_INCOMPAT_SNAPREALM_V2);
+
+ return CompatSet(compat, ro_compat, incompat);
+}
+
+CompatSet MDSMap::get_compat_set_default() {
+ // Same incompat features as get_compat_set_all() except that
+ // MDS_FEATURE_INCOMPAT_INLINE is left out; compat and ro_compat are empty.
+ CompatSet::FeatureSet compat, ro_compat, incompat;
+ incompat.insert(MDS_FEATURE_INCOMPAT_BASE);
+ incompat.insert(MDS_FEATURE_INCOMPAT_CLIENTRANGES);
+ incompat.insert(MDS_FEATURE_INCOMPAT_FILELAYOUT);
+ incompat.insert(MDS_FEATURE_INCOMPAT_DIRINODE);
+ incompat.insert(MDS_FEATURE_INCOMPAT_ENCODING);
+ incompat.insert(MDS_FEATURE_INCOMPAT_OMAPDIRFRAG);
+ incompat.insert(MDS_FEATURE_INCOMPAT_NOANCHOR);
+ incompat.insert(MDS_FEATURE_INCOMPAT_FILE_LAYOUT_V2);
+ incompat.insert(MDS_FEATURE_INCOMPAT_SNAPREALM_V2);
+
+ return CompatSet(compat, ro_compat, incompat);
+}
+
+// base (pre v0.20)
+CompatSet MDSMap::get_compat_set_base() {
+ // Only MDS_FEATURE_INCOMPAT_BASE is set; the compat and ro_compat sets
+ // are passed through empty.
+ CompatSet::FeatureSet no_compat, no_ro_compat, base_incompat;
+ base_incompat.insert(MDS_FEATURE_INCOMPAT_BASE);
+
+ return CompatSet(no_compat, no_ro_compat, base_incompat);
+}
+
+void MDSMap::mds_info_t::dump(Formatter *f) const // structured dump of one daemon entry, fixed key order
+{
+ f->dump_unsigned("gid", global_id);
+ f->dump_string("name", name);
+ f->dump_int("rank", rank);
+ f->dump_int("incarnation", inc);
+ f->dump_stream("state") << ceph_mds_state_name(state);
+ f->dump_int("state_seq", state_seq);
+ f->dump_stream("addr") << addrs.get_legacy_str(); // legacy single-address string
+ f->dump_object("addrs", addrs); // full address vector
+ if (laggy_since != utime_t()) { // key is omitted while laggy_since equals a default utime_t
+ f->dump_stream("laggy_since") << laggy_since;
+ }
+
+ f->open_array_section("export_targets");
+ for (const auto& target : export_targets) { // one "mds" element per target rank
+ f->dump_int("mds", target);
+ }
+ f->close_section();
+ f->dump_unsigned("features", mds_features);
+ f->dump_unsigned("flags", flags);
+}
+
+void MDSMap::mds_info_t::dump(std::ostream& o) const // single-line, bracketed text form of this daemon entry
+{
+ o << "[mds." << name << "{" << rank << ":" << global_id << "}" // name, then rank and gid inside braces
+ << " state " << ceph_mds_state_name(state)
+ << " seq " << state_seq;
+ if (laggy()) { // optional section, printed only when laggy() is true
+ o << " laggy since " << laggy_since;
+ }
+ if (!export_targets.empty()) { // optional section, printed only when there are export targets
+ o << " export targets " << export_targets;
+ }
+ if (is_frozen()) { // optional marker, printed only when is_frozen() is true
+ o << " frozen";
+ }
+ o << " addr " << addrs << "]"; // addresses always come last, followed by the closing bracket
+}
+
+void MDSMap::mds_info_t::generate_test_instances(list<mds_info_t*>& ls)
+{
+ // Appends two instances allocated with new (not freed here): a
+ // default-constructed one, then one with gid, name and rank filled in.
+ ls.push_back(new mds_info_t());
+ mds_info_t *named = new mds_info_t();
+ named->global_id = 1; named->rank = 0;
+ named->name = "test_instance";
+ ls.push_back(named);
+}
+
+// Dump every MDSMap field to the structured formatter `f`, in a fixed key
+// order. Per-rank "up" keys are "mds_<rank>", per-daemon "info" keys are
+// "gid_<gid>"; both are built with snprintf in buffers sized for the widest
+// possible value, so an unexpectedly large rank cannot overrun the stack.
+void MDSMap::dump(Formatter *f) const
+{
+ f->dump_int("epoch", epoch);
+ f->dump_unsigned("flags", flags);
+ f->dump_unsigned("ever_allowed_features", ever_allowed_features);
+ f->dump_unsigned("explicitly_allowed_features", explicitly_allowed_features);
+ f->dump_stream("created") << created;
+ f->dump_stream("modified") << modified;
+ f->dump_int("tableserver", tableserver);
+ f->dump_int("root", root);
+ f->dump_int("session_timeout", session_timeout);
+ f->dump_int("session_autoclose", session_autoclose);
+ f->dump_stream("min_compat_client") << (int)min_compat_client << " ("
+ << ceph_release_name(min_compat_client) << ")";
+ f->dump_int("max_file_size", max_file_size);
+ f->dump_int("last_failure", last_failure);
+ f->dump_int("last_failure_osd_epoch", last_failure_osd_epoch);
+ f->open_object_section("compat");
+ compat.dump(f);
+ f->close_section();
+ f->dump_int("max_mds", max_mds);
+ f->open_array_section("in");
+ for (const auto& r : in) f->dump_int("mds", r);
+ f->close_section();
+ f->open_object_section("up");
+ for (const auto& [up_rank, up_gid] : up) {
+ char s[16]; // 'mds_' + len(str(INT_MIN)) + '\0'
+ snprintf(s, sizeof(s), "mds_%d", int(up_rank));
+ f->dump_int(s, up_gid);
+ }
+ f->close_section();
+ f->open_array_section("failed");
+ for (const auto& r : failed) f->dump_int("mds", r);
+ f->close_section();
+ f->open_array_section("damaged");
+ for (const auto& r : damaged) f->dump_int("mds", r);
+ f->close_section();
+ f->open_array_section("stopped");
+ for (const auto& r : stopped) f->dump_int("mds", r);
+ f->close_section();
+ f->open_object_section("info");
+ for (const auto& [gid, info] : mds_info) {
+ char s[25]; // 'gid_' + len(str(ULLONG_MAX)) + '\0'
+ snprintf(s, sizeof(s), "gid_%llu", (long long unsigned)gid);
+ f->open_object_section(s);
+ info.dump(f);
+ f->close_section();
+ }
+ f->close_section();
+ f->open_array_section("data_pools");
+ for (const auto& p: data_pools)
+ f->dump_int("pool", p);
+ f->close_section();
+ f->dump_int("metadata_pool", metadata_pool);
+ f->dump_bool("enabled", enabled);
+ f->dump_string("fs_name", fs_name);
+ f->dump_string("balancer", balancer);
+ f->dump_int("standby_count_wanted", std::max(0, standby_count_wanted)); // negative values are reported as 0
+}
+
+void MDSMap::generate_test_instances(list<MDSMap*>& ls)
+{
+ MDSMap *sample = new MDSMap(); // allocated with new and handed to ls; not freed here
+ sample->compat = get_compat_set_all();
+ sample->max_mds = 1;
+ sample->data_pools.push_back(0);
+ sample->metadata_pool = 1;
+ sample->cas_pool = 2;
+
+ // NB: the three values below are intentionally not the defaults
+ sample->session_timeout = 61;
+ sample->session_autoclose = 301;
+ sample->max_file_size = 1 << 24;
+ ls.push_back(sample);
+}
+
+void MDSMap::print(ostream& out) const // plain-text dump: one tab-separated key/value line per field, then one line per daemon
+{
+ out << "fs_name\t" << fs_name << "\n";
+ out << "epoch\t" << epoch << "\n";
+ out << "flags\t" << hex << flags << dec << "\n"; // flags are written in hex; the stream is switched back to decimal right after
+ out << "created\t" << created << "\n";
+ out << "modified\t" << modified << "\n";
+ out << "tableserver\t" << tableserver << "\n";
+ out << "root\t" << root << "\n";
+ out << "session_timeout\t" << session_timeout << "\n"
+ << "session_autoclose\t" << session_autoclose << "\n";
+ out << "max_file_size\t" << max_file_size << "\n";
+ out << "min_compat_client\t" << (int)min_compat_client << " (" // numeric value followed by the release name in parentheses
+ << ceph_release_name(min_compat_client) << ")\n";
+ out << "last_failure\t" << last_failure << "\n"
+ << "last_failure_osd_epoch\t" << last_failure_osd_epoch << "\n";
+ out << "compat\t" << compat << "\n";
+ out << "max_mds\t" << max_mds << "\n";
+ out << "in\t" << in << "\n"
+ << "up\t" << up << "\n"
+ << "failed\t" << failed << "\n"
+ << "damaged\t" << damaged << "\n"
+ << "stopped\t" << stopped << "\n";
+ out << "data_pools\t" << data_pools << "\n";
+ out << "metadata_pool\t" << metadata_pool << "\n";
+ out << "inline_data\t" << (inline_data_enabled ? "enabled" : "disabled") << "\n";
+ out << "balancer\t" << balancer << "\n";
+ out << "standby_count_wanted\t" << std::max(0, standby_count_wanted) << "\n"; // negative values are shown as 0
+
+ multimap< pair<mds_rank_t, unsigned>, mds_gid_t > foo; // gids keyed by (rank, inc-1): fixes the order of the per-daemon lines
+ for (const auto &p : mds_info) {
+ foo.insert(std::make_pair(
+ std::make_pair(p.second.rank, p.second.inc-1), p.first)); // NOTE(review): inc-1 becomes unsigned in the key, so -1 would sort last; confirm intended
+ }
+
+ for (const auto &p : foo) {
+ out << mds_info.at(p.second) << "\n"; // relies on an operator<< for mds_info_t that is defined outside this chunk
+ }
+}
+
+void MDSMap::print_summary(Formatter *f, ostream *out) const // structured output to f when f is non-null, otherwise text to *out (out must then be non-null)
+{
+ map<mds_rank_t,string> by_rank; // text mode only: rank mapped to name=state
+ map<string,int> by_state; // number of daemons per state string for entries not listed by rank
+
+ if (f) {
+ f->dump_unsigned("epoch", get_epoch());
+ f->dump_unsigned("up", up.size());
+ f->dump_unsigned("in", in.size());
+ f->dump_unsigned("max", max_mds);
+ } else {
+ *out << "e" << get_epoch() << ": " << up.size() << "/" << in.size() << "/" << max_mds << " up"; // e<epoch>: up/in/max up
+ }
+
+ if (f)
+ f->open_array_section("by_rank");
+ for (const auto &p : mds_info) {
+ string s = ceph_mds_state_name(p.second.state);
+ if (p.second.laggy())
+ s += "(laggy or crashed)"; // the suffix becomes part of the state string and of the by_state key
+
+ if (p.second.rank >= 0 && p.second.state != MDSMap::STATE_STANDBY_REPLAY) { // rank holders other than standby-replay are listed individually
+ if (f) {
+ f->open_object_section("mds");
+ f->dump_unsigned("rank", p.second.rank);
+ f->dump_string("name", p.second.name);
+ f->dump_string("status", s);
+ f->close_section();
+ } else {
+ by_rank[p.second.rank] = p.second.name + "=" + s; // a later daemon with the same rank overwrites the earlier entry
+ }
+ } else {
+ by_state[s]++; // everything else is only counted per state
+ }
+ }
+ if (f) {
+ f->close_section();
+ } else {
+ if (!by_rank.empty())
+ *out << " " << by_rank;
+ }
+
+ for (map<string,int>::reverse_iterator p = by_state.rbegin(); p != by_state.rend(); ++p) { // states are emitted in reverse key order
+ if (f) {
+ f->dump_unsigned(p->first.c_str(), p->second); // the state string itself is used as the key
+ } else {
+ *out << ", " << p->second << " " << p->first;
+ }
+ }
+
+ if (!failed.empty()) { // count of failed ranks, omitted when zero
+ if (f) {
+ f->dump_unsigned("failed", failed.size());
+ } else {
+ *out << ", " << failed.size() << " failed";
+ }
+ }
+
+ if (!damaged.empty()) { // count of damaged ranks, omitted when zero
+ if (f) {
+ f->dump_unsigned("damaged", damaged.size());
+ } else {
+ *out << ", " << damaged.size() << " damaged";
+ }
+ }
+ //if (stopped.size())
+ //out << ", " << stopped.size() << " stopped";
+}
+
+void MDSMap::get_health(list<pair<health_status_t,string> >& summary, // appends (severity, message) pairs; always receives at least one entry
+ list<pair<health_status_t,string> > *detail) const // optional: per-item lines are appended only when non-null
+{
+ if (!failed.empty()) { // one HEALTH_ERR summary naming all failed ranks, plus one detail line per rank
+ std::ostringstream oss;
+ oss << "mds rank"
+ << ((failed.size() > 1) ? "s ":" ") // singular/plural wording
+ << failed
+ << ((failed.size() > 1) ? " have":" has")
+ << " failed";
+ summary.push_back(make_pair(HEALTH_ERR, oss.str()));
+ if (detail) {
+ for (set<mds_rank_t>::const_iterator p = failed.begin(); p != failed.end(); ++p) {
+ std::ostringstream oss; // shadows the outer oss; a fresh buffer for each rank
+ oss << "mds." << *p << " has failed";
+ detail->push_back(make_pair(HEALTH_ERR, oss.str()));
+ }
+ }
+ }
+
+ if (!damaged.empty()) { // same shape as the failed section, for damaged ranks
+ std::ostringstream oss;
+ oss << "mds rank"
+ << ((damaged.size() > 1) ? "s ":" ")
+ << damaged
+ << ((damaged.size() > 1) ? " are":" is")
+ << " damaged";
+ summary.push_back(make_pair(HEALTH_ERR, oss.str()));
+ if (detail) {
+ for (set<mds_rank_t>::const_iterator p = damaged.begin(); p != damaged.end(); ++p) {
+ std::ostringstream oss; // shadows the outer oss; a fresh buffer for each rank
+ oss << "mds." << *p << " is damaged";
+ detail->push_back(make_pair(HEALTH_ERR, oss.str()));
+ }
+ }
+ }
+
+ if (is_degraded()) {
+ summary.push_back(make_pair(HEALTH_WARN, "mds cluster is degraded"));
+ if (detail) {
+ detail->push_back(make_pair(HEALTH_WARN, "mds cluster is degraded"));
+ for (mds_rank_t i = mds_rank_t(0); i< get_max_mds(); i++) { // only ranks below max_mds are inspected
+ if (!is_up(i))
+ continue;
+ mds_gid_t gid = up.find(i)->second; // dereferenced without an end() check; relies on is_up(i) above
+ const auto& info = mds_info.at(gid);
+ stringstream ss;
+ if (is_resolve(i))
+ ss << "mds." << info.name << " at " << info.addrs
+ << " rank " << i << " is resolving";
+ if (is_replay(i))
+ ss << "mds." << info.name << " at " << info.addrs
+ << " rank " << i << " is replaying journal";
+ if (is_rejoin(i))
+ ss << "mds." << info.name << " at " << info.addrs
+ << " rank " << i << " is rejoining";
+ if (is_reconnect(i))
+ ss << "mds." << info.name << " at " << info.addrs
+ << " rank " << i << " is reconnecting to clients";
+ if (ss.str().length()) // a rank in none of the four states above leaves ss empty and adds nothing
+ detail->push_back(make_pair(HEALTH_WARN, ss.str()));
+ }
+ }
+ }
+
+ { // NOTE(review): this block adds a HEALTH_WARN with fs_name and max_mds on every call, unconditionally; looks like leftover debugging, confirm
+ stringstream ss;
+ ss << fs_name << " max_mds " << max_mds;
+ summary.push_back(make_pair(HEALTH_WARN, ss.str()));
+ }
+
+ if ((mds_rank_t)up.size() < max_mds) { // fewer up ranks than max_mds
+ stringstream ss;
+ ss << fs_name << " has " << up.size()
+ << " active MDS(s), but has max_mds of " << max_mds;
+ summary.push_back(make_pair(HEALTH_WARN, ss.str()));
+ }
+
+ set<string> laggy; // names of up daemons whose laggy() is true
+ for (const auto &u : up) {
+ const auto& info = mds_info.at(u.second); // at() throws if an up gid has no mds_info entry
+ if (info.laggy()) {
+ laggy.insert(info.name);
+ if (detail) {
+ std::ostringstream oss;
+ oss << "mds." << info.name << " at " << info.addrs
+ << " is laggy/unresponsive";
+ detail->push_back(make_pair(HEALTH_WARN, oss.str()));
+ }
+ }
+ }
+
+ if (!laggy.empty()) { // single summary line listing every laggy daemon name
+ std::ostringstream oss;
+ oss << "mds " << laggy
+ << ((laggy.size() > 1) ? " are":" is")
+ << " laggy";
+ summary.push_back(make_pair(HEALTH_WARN, oss.str()));
+ }
+
+ if (get_max_mds() > 1 && // more than one rank wanted ...
+ was_snaps_ever_allowed() && !allows_multimds_snaps()) { // ... while snapshots were once allowed and multi-MDS snapshots are not
+ std::ostringstream oss;
+ oss << "multi-active mds while there are snapshots possibly created by pre-mimic MDS";
+ summary.push_back(make_pair(HEALTH_WARN, oss.str()));
+ }
+}
+
+void MDSMap::get_health_checks(health_check_map_t *checks) const // adds named health checks for this filesystem to *checks
+{
+ // MDS_DAMAGE
+ if (!damaged.empty()) {
+ health_check_t& check = checks->get_or_add("MDS_DAMAGE", HEALTH_ERR,
+ "%num% mds daemon%plurals% damaged");
+ for (auto p : damaged) { // one detail line per damaged rank
+ std::ostringstream oss;
+ oss << "fs " << fs_name << " mds." << p << " is damaged";
+ check.detail.push_back(oss.str());
+ }
+ }
+
+ // FS_DEGRADED
+ if (is_degraded()) {
+ health_check_t& fscheck = checks->get_or_add(
+ "FS_DEGRADED", HEALTH_WARN,
+ "%num% filesystem%plurals% %isorare% degraded");
+ ostringstream ss;
+ ss << "fs " << fs_name << " is degraded";
+ fscheck.detail.push_back(ss.str()); // the only detail line this section actually reports
+
+ list<string> detail; // NOTE(review): filled by the loop below but never read or added to fscheck; per-rank lines are discarded
+ for (mds_rank_t i = mds_rank_t(0); i< get_max_mds(); i++) {
+ if (!is_up(i))
+ continue;
+ mds_gid_t gid = up.find(i)->second; // dereferenced without an end() check; relies on is_up(i) above
+ const auto& info = mds_info.at(gid);
+ stringstream ss; // shadows the outer ss
+ ss << "fs " << fs_name << " mds." << info.name << " at "
+ << info.addrs << " rank " << i;
+ if (is_resolve(i))
+ ss << " is resolving";
+ if (is_replay(i))
+ ss << " is replaying journal";
+ if (is_rejoin(i))
+ ss << " is rejoining";
+ if (is_reconnect(i))
+ ss << " is reconnecting to clients";
+ if (ss.str().length()) // never empty here: the prefix above is always written
+ detail.push_back(ss.str());
+ }
+ }
+
+ // MDS_UP_LESS_THAN_MAX
+ if ((mds_rank_t)get_num_in_mds() < get_max_mds()) { // compares the number of in ranks, not up ranks, with max_mds
+ health_check_t& check = checks->add(
+ "MDS_UP_LESS_THAN_MAX", HEALTH_WARN,
+ "%num% filesystem%plurals% %isorare% online with fewer MDS than max_mds");
+ stringstream ss;
+ ss << "fs " << fs_name << " has " << get_num_in_mds()
+ << " MDS online, but wants " << get_max_mds();
+ check.detail.push_back(ss.str());
+ }
+
+ // MDS_ALL_DOWN
+ if ((mds_rank_t)get_num_up_mds() == 0 && get_max_mds() > 0) { // nothing is up although at least one rank is wanted
+ health_check_t &check = checks->add(
+ "MDS_ALL_DOWN", HEALTH_ERR,
+ "%num% filesystem%plurals% %isorare% offline");
+ stringstream ss;
+ ss << "fs " << fs_name << " is offline because no MDS is active for it.";
+ check.detail.push_back(ss.str());
+ }
+
+ if (get_max_mds() > 1 && // MULTIMDS_WITH_OLDSNAPS: more than one rank wanted ...
+ was_snaps_ever_allowed() && !allows_multimds_snaps()) { // ... while snapshots were once allowed and multi-MDS snapshots are not
+ health_check_t &check = checks->add(
+ "MULTIMDS_WITH_OLDSNAPS", HEALTH_ERR,
+ "%num% filesystem%plurals% %isorare% multi-active mds with old snapshots");
+ stringstream ss;
+ ss << "multi-active mds while there are snapshots possibly created by pre-mimic MDS";
+ check.detail.push_back(ss.str());
+ }
+}
+
+void MDSMap::mds_info_t::encode_versioned(bufferlist& bl, uint64_t features) const // versioned wire encoding; the layout depends on the peer feature bits
+{
+ __u8 v = 9; // newest struct version written by this function
+ if (!HAVE_FEATURE(features, SERVER_NAUTILUS)) {
+ v = 7; // peers without SERVER_NAUTILUS get the v7 layout: legacy address, no flags
+ }
+ ENCODE_START(v, 4, bl);
+ encode(global_id, bl);
+ encode(name, bl);
+ encode(rank, bl);
+ encode(inc, bl);
+ encode((int32_t)state, bl); // state always travels as a 32-bit integer
+ encode(state_seq, bl);
+ if (v < 8) {
+ encode(addrs.legacy_addr(), bl, features); // single legacy address for pre-v8 layouts
+ } else {
+ encode(addrs, bl, features); // full address vector from v8 on
+ }
+ encode(laggy_since, bl);
+ encode(MDS_RANK_NONE, bl); /* standby_for_rank */
+ encode(std::string(), bl); /* standby_for_name */
+ encode(export_targets, bl);
+ encode(mds_features, bl);
+ encode(FS_CLUSTER_ID_NONE, bl); /* standby_for_fscid */
+ encode(false, bl); // placeholder for the bool that decode() reads as standby_replay at struct_v >= 7 and discards
+ if (v >= 9) {
+ encode(flags, bl); // flags exist on the wire only from v9 on
+ }
+ ENCODE_FINISH(bl);
+}
+
+void MDSMap::mds_info_t::encode_unversioned(bufferlist& bl) const // fixed struct_v 3 layout written without the ENCODE_START/ENCODE_FINISH wrapper
+{
+ __u8 struct_v = 3;
+ using ceph::encode;
+ encode(struct_v, bl); // the version byte is written by hand
+ encode(global_id, bl);
+ encode(name, bl);
+ encode(rank, bl);
+ encode(inc, bl);
+ encode((int32_t)state, bl); // state always travels as a 32-bit integer
+ encode(state_seq, bl);
+ encode(addrs.legacy_addr(), bl, 0); // legacy single address, encoded with a feature mask of 0
+ encode(laggy_since, bl);
+ encode(MDS_RANK_NONE, bl); // same position as the standby_for_rank placeholder in encode_versioned
+ encode(std::string(), bl); // same position as the standby_for_name placeholder in encode_versioned
+ encode(export_targets, bl); // last field of this layout: no mds_features, no flags
+}
+
+void MDSMap::mds_info_t::decode(bufferlist::const_iterator& bl) // reads the layouts written by encode_versioned and encode_unversioned
+{
+ DECODE_START_LEGACY_COMPAT_LEN(9, 4, 4, bl); // struct_v used below is not declared here; presumably introduced by this macro
+ decode(global_id, bl);
+ decode(name, bl);
+ decode(rank, bl);
+ decode(inc, bl);
+ int32_t raw_state;
+ decode(raw_state, bl);
+ state = (MDSMap::DaemonState)raw_state; // wire value is a 32-bit integer, cast back without range checking
+ decode(state_seq, bl);
+ decode(addrs, bl);
+ decode(laggy_since, bl);
+ {
+ mds_rank_t standby_for_rank; // read to keep the stream aligned, then discarded
+ decode(standby_for_rank, bl);
+ }
+ {
+ std::string standby_for_name; // read to keep the stream aligned, then discarded
+ decode(standby_for_name, bl);
+ }
+ if (struct_v >= 2)
+ decode(export_targets, bl);
+ if (struct_v >= 5)
+ decode(mds_features, bl);
+ if (struct_v >= 6) {
+ fs_cluster_id_t standby_for_fscid; // read and discarded
+ decode(standby_for_fscid, bl);
+ }
+ if (struct_v >= 7) {
+ bool standby_replay; // read and discarded
+ decode(standby_replay, bl);
+ }
+ if (struct_v >= 9) {
+ decode(flags, bl); // for older versions flags keeps whatever value it had before the call
+ }
+ DECODE_FINISH(bl);
+}
+
+std::string MDSMap::mds_info_t::human_name() const
+{
+ // e.g. "daemon mds.myhost", usable inside sentences such as "... restarted"
+ std::string label("daemon mds.");
+ label += name;
+ return label;
+}
+
+void MDSMap::encode(bufferlist& bl, uint64_t features) const // picks one of three wire layouts from the peer feature bits
+{
+ std::map<mds_rank_t,int32_t> inc; // Legacy field, fake it so that
+ // old-mon peers have something sane
+ // during upgrade
+ for (const auto rank : in) {
+ inc.insert(std::make_pair(rank, epoch)); // every in rank gets the current epoch as its fake incarnation
+ }
+
+ using ceph::encode;
+ if ((features & CEPH_FEATURE_PGID64) == 0) { // layout 1: peers without PGID64 get v2, with no extended section
+ __u16 v = 2;
+ encode(v, bl);
+ encode(epoch, bl);
+ encode(flags, bl);
+ encode(last_failure, bl);
+ encode(root, bl);
+ encode(session_timeout, bl);
+ encode(session_autoclose, bl);
+ encode(max_file_size, bl);
+ encode(max_mds, bl);
+ __u32 n = mds_info.size();
+ encode(n, bl);
+ for (map<mds_gid_t, mds_info_t>::const_iterator i = mds_info.begin();
+ i != mds_info.end(); ++i) {
+ encode(i->first, bl);
+ encode(i->second, bl, features);
+ }
+ n = data_pools.size();
+ encode(n, bl);
+ for (const auto p: data_pools) {
+ n = p; // each pool id is narrowed to __u32 in this layout
+ encode(n, bl);
+ }
+
+ int32_t m = cas_pool; // cas_pool is written as a 32-bit value in this layout
+ encode(m, bl);
+ return;
+ } else if ((features & CEPH_FEATURE_MDSENC) == 0) { // layout 2: PGID64 but no MDSENC gets v3 plus extended section version 5
+ __u16 v = 3;
+ encode(v, bl);
+ encode(epoch, bl);
+ encode(flags, bl);
+ encode(last_failure, bl);
+ encode(root, bl);
+ encode(session_timeout, bl);
+ encode(session_autoclose, bl);
+ encode(max_file_size, bl);
+ encode(max_mds, bl);
+ __u32 n = mds_info.size();
+ encode(n, bl);
+ for (map<mds_gid_t, mds_info_t>::const_iterator i = mds_info.begin();
+ i != mds_info.end(); ++i) {
+ encode(i->first, bl);
+ encode(i->second, bl, features);
+ }
+ encode(data_pools, bl);
+ encode(cas_pool, bl);
+
+ // kclient ignores everything from here
+ __u16 ev = 5;
+ encode(ev, bl);
+ encode(compat, bl);
+ encode(metadata_pool, bl);
+ encode(created, bl);
+ encode(modified, bl);
+ encode(tableserver, bl);
+ encode(in, bl);
+ encode(inc, bl);
+ encode(up, bl);
+ encode(failed, bl);
+ encode(stopped, bl);
+ encode(last_failure_osd_epoch, bl); // last field of the ev 5 layout
+ return;
+ }
+
+ ENCODE_START(5, 4, bl); // layout 3: current format, struct version 5 with compat version 4
+ encode(epoch, bl);
+ encode(flags, bl);
+ encode(last_failure, bl);
+ encode(root, bl);
+ encode(session_timeout, bl);
+ encode(session_autoclose, bl);
+ encode(max_file_size, bl);
+ encode(max_mds, bl);
+ encode(mds_info, bl, features);
+ encode(data_pools, bl);
+ encode(cas_pool, bl);
+
+ // kclient ignores everything from here
+ __u16 ev = 14; // extended section version; decode() gates the trailing fields on it
+ encode(ev, bl);
+ encode(compat, bl);
+ encode(metadata_pool, bl);
+ encode(created, bl);
+ encode(modified, bl);
+ encode(tableserver, bl);
+ encode(in, bl);
+ encode(inc, bl);
+ encode(up, bl);
+ encode(failed, bl);
+ encode(stopped, bl);
+ encode(last_failure_osd_epoch, bl);
+ encode(ever_allowed_features, bl);
+ encode(explicitly_allowed_features, bl);
+ encode(inline_data_enabled, bl);
+ encode(enabled, bl);
+ encode(fs_name, bl);
+ encode(damaged, bl);
+ encode(balancer, bl);
+ encode(standby_count_wanted, bl);
+ encode(old_max_mds, bl);
+ encode(min_compat_client, bl);
+ ENCODE_FINISH(bl);
+}
+
+void MDSMap::sanitize(const std::function<bool(int64_t pool)>& pool_exists)
+{
+ /* Before stricter checking existed, a data pool could be removed without
+ * also being dropped from the MDSMap. Walk data_pools and erase every id
+ * that pool_exists() rejects, logging each removal at debug level 0.
+ */
+ auto cur = data_pools.begin();
+ while (cur != data_pools.end()) {
+ if (pool_exists(*cur)) {
+ ++cur; // pool is still there: keep it and move on
+ continue;
+ }
+ dout(0) << "removed non-existant data pool " << *cur << " from MDSMap" << dendl;
+ cur = data_pools.erase(cur); // erase() returns the element after the removed one
+ }
+}
+
+void MDSMap::decode(bufferlist::const_iterator& p) // reads any of the layouts produced by encode(), gating fields on struct_v and ev
+{
+ std::map<mds_rank_t,int32_t> inc; // Legacy field, parse and drop
+
+ cached_up_features = 0; // reset before the members are overwritten by the decode below
+ DECODE_START_LEGACY_COMPAT_LEN_16(5, 4, 4, p); // struct_v used below is not declared here; presumably introduced by this macro
+ decode(epoch, p);
+ decode(flags, p);
+ decode(last_failure, p);
+ decode(root, p);
+ decode(session_timeout, p);
+ decode(session_autoclose, p);
+ decode(max_file_size, p);
+ decode(max_mds, p);
+ decode(mds_info, p);
+ if (struct_v < 3) { // old layout: a count followed by 32-bit pool ids, then a 32-bit cas_pool
+ __u32 n;
+ decode(n, p);
+ while (n--) {
+ __u32 m;
+ decode(m, p);
+ data_pools.push_back(m); // appends; data_pools is not cleared first on this path
+ }
+ __s32 s;
+ decode(s, p);
+ cas_pool = s;
+ } else {
+ decode(data_pools, p);
+ decode(cas_pool, p);
+ }
+
+ // kclient ignores everything from here
+ __u16 ev = 1; // extended section version; stays 1 when struct_v < 2
+ if (struct_v >= 2)
+ decode(ev, p);
+ if (ev >= 3)
+ decode(compat, p);
+ else
+ compat = get_compat_set_base(); // older maps carry no compat set, so fall back to the base set
+ if (ev < 5) {
+ __u32 n;
+ decode(n, p);
+ metadata_pool = n; // 32-bit metadata pool id before ev 5
+ } else {
+ decode(metadata_pool, p);
+ }
+ decode(created, p);
+ decode(modified, p);
+ decode(tableserver, p);
+ decode(in, p);
+ decode(inc, p); // parsed into the local map and dropped
+ decode(up, p);
+ decode(failed, p);
+ decode(stopped, p);
+ if (ev >= 4)
+ decode(last_failure_osd_epoch, p);
+ if (ev >= 6) {
+ if (ev < 10) {
+ // previously this was a bool about snaps, not a flag map
+ bool flag;
+ decode(flag, p);
+ ever_allowed_features = flag ? CEPH_MDSMAP_ALLOW_SNAPS : 0;
+ decode(flag, p);
+ explicitly_allowed_features = flag ? CEPH_MDSMAP_ALLOW_SNAPS : 0;
+ } else {
+ decode(ever_allowed_features, p);
+ decode(explicitly_allowed_features, p);
+ }
+ } else {
+ ever_allowed_features = 0;
+ explicitly_allowed_features = 0;
+ }
+ if (ev >= 7)
+ decode(inline_data_enabled, p);
+
+ if (ev >= 8) {
+ ceph_assert(struct_v >= 5); // an ev of 8 or more is only accepted together with struct_v 5 or more
+ decode(enabled, p);
+ decode(fs_name, p);
+ } else {
+ if (epoch > 1) {
+ // If an MDS has ever been started, epoch will be greater than 1,
+ // assume filesystem is enabled.
+ enabled = true;
+ } else {
+ // Upgrading from a cluster that never used an MDS, switch off
+ // filesystem until it's explicitly enabled.
+ enabled = false;
+ }
+ }
+
+ if (ev >= 9) {
+ decode(damaged, p);
+ }
+
+ if (ev >= 11) {
+ decode(balancer, p);
+ }
+
+ if (ev >= 12) {
+ decode(standby_count_wanted, p);
+ }
+
+ if (ev >= 13) {
+ decode(old_max_mds, p);
+ }
+
+ if (ev >= 14) {
+ decode(min_compat_client, p);
+ }
+
+ DECODE_FINISH(p);
+}
+
+MDSMap::availability_t MDSMap::is_cluster_available() const
+{
+ // An epoch of 0 means this MDSMap instance was never initialized from the
+ // mons (the situation a client may find itself in): callers should wait.
+ if (epoch == 0) {
+ return TRANSIENT_UNAVAILABLE;
+ }
+
+ // A rank marked damaged stays unavailable until an operator intervenes.
+ if (!damaged.empty()) {
+ return STUCK_UNAVAILABLE;
+ }
+
+ // No ranks have been created yet (filesystem not initialized).
+ if (in.empty()) {
+ return STUCK_UNAVAILABLE;
+ }
+
+ for (const auto rank : in) {
+ const auto holder = up.find(rank);
+ if (holder == up.end()) {
+ continue; // this rank has no entry in up
+ }
+ if (mds_info.at(holder->second).laggy()) {
+ // This might only be transient, but because standbys are not visible
+ // here there is no way of knowing whether one could replace the
+ // laggy daemon.
+ return STUCK_UNAVAILABLE;
+ }
+ }
+
+ if (get_num_mds(CEPH_MDS_STATE_ACTIVE) > 0) {
+ // Nobody looks stuck, so tell the client to go ahead and try mounting
+ // if anybody is active. This may include e.g. one MDS failing over
+ // while another is active: the client should start talking to the
+ // active one and let the transiently-unavailable one catch up later.
+ return AVAILABLE;
+ }
+
+ // Nothing indicates we are stuck, but nobody is active (yet). That would
+ // ideally be TRANSIENT_UNAVAILABLE, but since standbys are no longer part
+ // of the MDSMap, transient vs. stuck cannot be told apart reliably, so
+ // always say stuck so that the client doesn't block.
+ return STUCK_UNAVAILABLE;
+}
+
+bool MDSMap::state_transition_valid(DaemonState prev, DaemonState next)
+{
+ if (next == prev) {
+ return true; // staying in the same state is always acceptable
+ }
+
+ if (prev == MDSMap::STATE_REPLAY) {
+ // From replay only resolve and reconnect are accepted.
+ return next == MDSMap::STATE_RESOLVE || next == MDSMap::STATE_RECONNECT;
+ }
+
+ if (prev == MDSMap::STATE_REJOIN) {
+ // From rejoin only active, clientreplay and stopped are accepted.
+ return next == MDSMap::STATE_ACTIVE ||
+ next == MDSMap::STATE_CLIENTREPLAY ||
+ next == MDSMap::STATE_STOPPED;
+ }
+
+ if (prev >= MDSMap::STATE_RESOLVE && prev < MDSMap::STATE_ACTIVE) {
+ // Any other state in [RESOLVE, ACTIVE) may only move to the state
+ // whose enum value is exactly one greater.
+ return next == prev + 1;
+ }
+ return true; // every other starting state is unrestricted here
+}
+
+bool MDSMap::check_health(mds_rank_t standby_daemon_count)
+{
+ std::set<mds_rank_t> replaying;
+ get_standby_replay_mds_set(replaying);
+ std::set<mds_rank_t> active;
+ get_active_mds_set(active);
+ const mds_rank_t standbys_avail = (mds_rank_t)replaying.size() + standby_daemon_count;
+
+ /* standby_count_wanted == -1 means "unset" (the default). It is set to 1
+ * the first time a health check by the mons sees at least one active
+ * plus at least one standby or standby-replay daemon. During initial
+ * creation of the FS there are no actives, so the default is left alone
+ * until then. Returns true only when the value was changed.
+ */
+ if (standby_count_wanted != -1 || active.empty() || standbys_avail <= 0) {
+ return false;
+ }
+ set_standby_count_wanted(1);
+ return true;
+}
diff --git a/src/mds/MDSMap.h b/src/mds/MDSMap.h
new file mode 100644
index 00000000..031319da
--- /dev/null
+++ b/src/mds/MDSMap.h
@@ -0,0 +1,686 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+
+#ifndef CEPH_MDSMAP_H
+#define CEPH_MDSMAP_H
+
+#include <algorithm>
+#include <map>
+#include <set>
+#include <string>
+#include <string_view>
+
+#include <errno.h>
+
+#include "include/types.h"
+#include "common/Clock.h"
+#include "include/health.h"
+
+#include "common/config.h"
+
+#include "include/CompatSet.h"
+#include "include/ceph_features.h"
+#include "common/Formatter.h"
+#include "mds/mdstypes.h"
+
+class CephContext;
+class health_check_map_t;
+
+#define MDS_FEATURE_INCOMPAT_BASE CompatSet::Feature(1, "base v0.20")
+#define MDS_FEATURE_INCOMPAT_CLIENTRANGES CompatSet::Feature(2, "client writeable ranges")
+#define MDS_FEATURE_INCOMPAT_FILELAYOUT CompatSet::Feature(3, "default file layouts on dirs")
+#define MDS_FEATURE_INCOMPAT_DIRINODE CompatSet::Feature(4, "dir inode in separate object")
+#define MDS_FEATURE_INCOMPAT_ENCODING CompatSet::Feature(5, "mds uses versioned encoding")
+#define MDS_FEATURE_INCOMPAT_OMAPDIRFRAG CompatSet::Feature(6, "dirfrag is stored in omap")
+#define MDS_FEATURE_INCOMPAT_INLINE CompatSet::Feature(7, "mds uses inline data")
+#define MDS_FEATURE_INCOMPAT_NOANCHOR CompatSet::Feature(8, "no anchor table")
+#define MDS_FEATURE_INCOMPAT_FILE_LAYOUT_V2 CompatSet::Feature(9, "file layout v2")
+#define MDS_FEATURE_INCOMPAT_SNAPREALM_V2 CompatSet::Feature(10, "snaprealm v2")
+
+#define MDS_FS_NAME_DEFAULT "cephfs"
+
+class MDSMap {
+public:
+ /* These states are the union of the set of possible states of an MDS daemon,
+ * and the set of possible states of an MDS rank. See
+ * doc/cephfs/mds-states.rst for state descriptions,
+ * doc/cephfs/mds-state-diagram.svg for a visual state diagram, and
+ * doc/cephfs/mds-state-diagram.dot to update mds-state-diagram.svg.
+ */
+ typedef enum {
+ // States of an MDS daemon not currently holding a rank
+ // ====================================================
+ STATE_NULL = CEPH_MDS_STATE_NULL, // null value for fns returning this type.
+ STATE_BOOT = CEPH_MDS_STATE_BOOT, // up, boot announcement. destiny unknown.
+ STATE_STANDBY = CEPH_MDS_STATE_STANDBY, // up, idle. waiting for assignment by monitor.
+ STATE_STANDBY_REPLAY = CEPH_MDS_STATE_STANDBY_REPLAY, // up, replaying active node, ready to take over.
+
+ // States of an MDS rank, and of any MDS daemon holding that rank
+ // ==============================================================
+ STATE_STOPPED = CEPH_MDS_STATE_STOPPED, // down, once existed, but no subtrees. empty log. may not be held by a daemon.
+
+ STATE_CREATING = CEPH_MDS_STATE_CREATING, // up, creating MDS instance (new journal, idalloc..).
+ STATE_STARTING = CEPH_MDS_STATE_STARTING, // up, starting prior stopped MDS instance.
+
+ STATE_REPLAY = CEPH_MDS_STATE_REPLAY, // up, starting prior failed instance. scanning journal.
+ STATE_RESOLVE = CEPH_MDS_STATE_RESOLVE, // up, disambiguating distributed operations (import, rename, etc.)
+ STATE_RECONNECT = CEPH_MDS_STATE_RECONNECT, // up, reconnect to clients
+ STATE_REJOIN = CEPH_MDS_STATE_REJOIN, // up, replayed journal, rejoining distributed cache
+ STATE_CLIENTREPLAY = CEPH_MDS_STATE_CLIENTREPLAY, // up, replaying client operations
+ STATE_ACTIVE = CEPH_MDS_STATE_ACTIVE, // up, active
+ STATE_STOPPING = CEPH_MDS_STATE_STOPPING, // up, exporting metadata (-> standby or out)
+ STATE_DNE = CEPH_MDS_STATE_DNE, // down, rank does not exist
+
+ // State which a daemon may send to MDSMonitor in its beacon
+ // to indicate that offline repair is required. Daemon must stop
+ // immediately after indicating this state.
+ STATE_DAMAGED = CEPH_MDS_STATE_DAMAGED
+
+ /*
+ * In addition to explicit states, an MDS rank is implicitly in state:
+ * - STOPPED if it is not currently associated with an MDS daemon gid but it
+ * is in MDSMap::stopped
+ * - FAILED if it is not currently associated with an MDS daemon gid but it
+ * is in MDSMap::failed
+ * - DNE if it is not currently associated with an MDS daemon gid and it is
+ * missing from both MDSMap::failed and MDSMap::stopped
+ */
+ } DaemonState;
+
+ struct mds_info_t { // per-daemon record stored in MDSMap::mds_info, keyed there by gid
+ mds_gid_t global_id = MDS_GID_NONE;
+ std::string name;
+ mds_rank_t rank = MDS_RANK_NONE;
+ int32_t inc = 0;
+ MDSMap::DaemonState state = STATE_STANDBY;
+ version_t state_seq = 0;
+ entity_addrvec_t addrs;
+ utime_t laggy_since;
+ std::set<mds_rank_t> export_targets;
+ uint64_t mds_features = 0;
+ uint64_t flags = 0; // bitmask of mds_flags values
+ enum mds_flags : uint64_t {
+ FROZEN = 1 << 0,
+ };
+
+ mds_info_t() = default;
+
+ bool laggy() const { return !(laggy_since == utime_t()); } // laggy iff laggy_since differs from a default-constructed utime_t
+ void clear_laggy() { laggy_since = utime_t(); }
+
+ bool is_degraded() const { // true while state lies within [STATE_REPLAY, STATE_CLIENTREPLAY]
+ return STATE_REPLAY <= state && state <= STATE_CLIENTREPLAY;
+ }
+
+ void freeze() { flags |= mds_flags::FROZEN; }
+ void unfreeze() { flags &= ~mds_flags::FROZEN; }
+ bool is_frozen() const { return flags&mds_flags::FROZEN; }
+
+ const entity_addrvec_t& get_addrs() const {
+ return addrs;
+ }
+
+ void encode(bufferlist& bl, uint64_t features) const { // picks the encoding based on whether the peer advertises CEPH_FEATURE_MDSENC
+ if ((features & CEPH_FEATURE_MDSENC) == 0 ) encode_unversioned(bl);
+ else encode_versioned(bl, features);
+ }
+ void decode(bufferlist::const_iterator& p);
+ void dump(Formatter *f) const;
+ void dump(std::ostream&) const;
+
+ // The long form name for use in cluster log messages
+ std::string human_name() const;
+
+ static void generate_test_instances(list<mds_info_t*>& ls);
+ private:
+ void encode_versioned(bufferlist& bl, uint64_t features) const;
+ void encode_unversioned(bufferlist& bl) const;
+ };
+
+ static CompatSet get_compat_set_all();
+ static CompatSet get_compat_set_default();
+ static CompatSet get_compat_set_base(); // pre v0.20
+
+protected:
+ // base map
+ epoch_t epoch = 0;
+ bool enabled = false;
+ std::string fs_name = MDS_FS_NAME_DEFAULT;
+ uint32_t flags = CEPH_MDSMAP_DEFAULTS; // flags
+ epoch_t last_failure = 0; // mds epoch of last failure
+ epoch_t last_failure_osd_epoch = 0; // osd epoch of last failure; any mds entering replay needs
+ // at least this osdmap to ensure the blacklist propagates.
+ utime_t created;
+ utime_t modified;
+
+ mds_rank_t tableserver = 0; // which MDS has snaptable
+ mds_rank_t root = 0; // which MDS has root directory
+
+ __u32 session_timeout = 60;
+ __u32 session_autoclose = 300;
+ uint64_t max_file_size = 1ULL<<40; /* 1TB */
+
+ int8_t min_compat_client = -1;
+
+ std::vector<int64_t> data_pools; // file data pools available to clients (via an ioctl). first is the default.
+ int64_t cas_pool = -1; // where CAS objects go
+ int64_t metadata_pool = -1; // where fs metadata objects go
+
+ /*
+ * in: the set of logical mds #'s that define the cluster. this is the set
+ * of mds's the metadata may be distributed over.
+ * up: map from logical mds #'s to the addrs filling those roles.
+ * failed: subset of @in that are failed.
+ * stopped: set of nodes that have been initialized, but are not active.
+ *
+ * @up + @failed = @in. @in * @stopped = {}.
+ */
+
+ mds_rank_t max_mds = 1; /* The maximum number of active MDSes. Also, the maximum rank. */
+ mds_rank_t old_max_mds = 0; /* Value to restore when MDS cluster is marked up */
+ mds_rank_t standby_count_wanted = -1;
+ string balancer; /* The name/version of the mantle balancer (i.e. the rados obj name) */
+
+ std::set<mds_rank_t> in; // currently defined cluster
+
+ // which ranks are failed, stopped, damaged (i.e. not held by a daemon)
+ std::set<mds_rank_t> failed, stopped, damaged;
+ std::map<mds_rank_t, mds_gid_t> up; // who is in those roles
+ std::map<mds_gid_t, mds_info_t> mds_info;
+
+ uint8_t ever_allowed_features = 0; //< bitmap of features the cluster has allowed
+ uint8_t explicitly_allowed_features = 0; //< bitmap of features explicitly enabled
+
+ bool inline_data_enabled = false;
+
+ uint64_t cached_up_features = 0;
+
+public:
+ CompatSet compat;
+
+ friend class MDSMonitor;
+ friend class Filesystem;
+ friend class FSMap;
+
+public:
+ bool get_inline_data_enabled() const { return inline_data_enabled; }
+ void set_inline_data_enabled(bool enabled) { inline_data_enabled = enabled; }
+
+ utime_t get_session_timeout() const {
+ return utime_t(session_timeout,0);
+ }
+ void set_session_timeout(uint32_t t) {
+ session_timeout = t;
+ }
+
+ utime_t get_session_autoclose() const {
+ return utime_t(session_autoclose, 0);
+ }
+ void set_session_autoclose(uint32_t t) {
+ session_autoclose = t;
+ }
+
+ uint64_t get_max_filesize() const { return max_file_size; }
+ void set_max_filesize(uint64_t m) { max_file_size = m; }
+
+ uint8_t get_min_compat_client() const { return min_compat_client; }
+ void set_min_compat_client(uint8_t version) { min_compat_client = version; }
+
+ int get_flags() const { return flags; }
+ bool test_flag(int f) const { return flags & f; }
+ void set_flag(int f) { flags |= f; }
+ void clear_flag(int f) { flags &= ~f; }
+
+ std::string_view get_fs_name() const {return fs_name;}
+
+ void set_snaps_allowed() {
+ set_flag(CEPH_MDSMAP_ALLOW_SNAPS);
+ ever_allowed_features |= CEPH_MDSMAP_ALLOW_SNAPS;
+ explicitly_allowed_features |= CEPH_MDSMAP_ALLOW_SNAPS;
+ }
+ void clear_snaps_allowed() { clear_flag(CEPH_MDSMAP_ALLOW_SNAPS); }
+ bool allows_snaps() const { return test_flag(CEPH_MDSMAP_ALLOW_SNAPS); }
+ bool was_snaps_ever_allowed() const { return ever_allowed_features & CEPH_MDSMAP_ALLOW_SNAPS; }
+
+ void set_standby_replay_allowed() {
+ set_flag(CEPH_MDSMAP_ALLOW_STANDBY_REPLAY);
+ ever_allowed_features |= CEPH_MDSMAP_ALLOW_STANDBY_REPLAY;
+ explicitly_allowed_features |= CEPH_MDSMAP_ALLOW_STANDBY_REPLAY;
+ }
+ void clear_standby_replay_allowed() { clear_flag(CEPH_MDSMAP_ALLOW_STANDBY_REPLAY); }
+ bool allows_standby_replay() const { return test_flag(CEPH_MDSMAP_ALLOW_STANDBY_REPLAY); }
+ bool was_standby_replay_ever_allowed() const { return ever_allowed_features & CEPH_MDSMAP_ALLOW_STANDBY_REPLAY; }
+
+ void set_multimds_snaps_allowed() {
+ set_flag(CEPH_MDSMAP_ALLOW_MULTIMDS_SNAPS);
+ ever_allowed_features |= CEPH_MDSMAP_ALLOW_MULTIMDS_SNAPS;
+ explicitly_allowed_features |= CEPH_MDSMAP_ALLOW_MULTIMDS_SNAPS;
+ }
+ void clear_multimds_snaps_allowed() { clear_flag(CEPH_MDSMAP_ALLOW_MULTIMDS_SNAPS); }
+ bool allows_multimds_snaps() const { return test_flag(CEPH_MDSMAP_ALLOW_MULTIMDS_SNAPS); }
+
+ epoch_t get_epoch() const { return epoch; }
+ void inc_epoch() { epoch++; }
+
+ bool get_enabled() const { return enabled; }
+
+ const utime_t& get_created() const { return created; }
+ void set_created(utime_t ct) { modified = created = ct; }
+ const utime_t& get_modified() const { return modified; }
+ void set_modified(utime_t mt) { modified = mt; }
+
+ epoch_t get_last_failure() const { return last_failure; }
+ epoch_t get_last_failure_osd_epoch() const { return last_failure_osd_epoch; }
+
+ mds_rank_t get_max_mds() const { return max_mds; }
+ void set_max_mds(mds_rank_t m) { max_mds = m; }
+ void set_old_max_mds() { old_max_mds = max_mds; }
+ mds_rank_t get_old_max_mds() const { return old_max_mds; }
+
+ mds_rank_t get_standby_count_wanted(mds_rank_t standby_daemon_count) const { // returns how many more standbys are wanted than are available; never negative
+ ceph_assert(standby_daemon_count >= 0);
+ std::set<mds_rank_t> s;
+ get_standby_replay_mds_set(s);
+ mds_rank_t standbys_avail = (mds_rank_t)s.size()+standby_daemon_count; // standby-replay ranks in this map plus the caller-supplied count
+ mds_rank_t wanted = std::max(0, standby_count_wanted); // a negative (unset, default -1) standby_count_wanted counts as 0
+ return wanted > standbys_avail ? wanted - standbys_avail : 0;
+ }
+ void set_standby_count_wanted(mds_rank_t n) { standby_count_wanted = n; }
+ bool check_health(mds_rank_t standby_daemon_count);
+
+ const std::string get_balancer() const { return balancer; }
+ void set_balancer(std::string val) { balancer.assign(val); }
+
+ mds_rank_t get_tableserver() const { return tableserver; }
+ mds_rank_t get_root() const { return root; }
+
+ const std::vector<int64_t> &get_data_pools() const { return data_pools; }
+ int64_t get_first_data_pool() const { return *data_pools.begin(); }
+ int64_t get_metadata_pool() const { return metadata_pool; }
+ bool is_data_pool(int64_t poolid) const {
+ auto p = std::find(data_pools.begin(), data_pools.end(), poolid);
+ if (p == data_pools.end())
+ return false;
+ return true;
+ }
+
+ bool pool_in_use(int64_t poolid) const {
+ return get_enabled() && (is_data_pool(poolid) || metadata_pool == poolid);
+ }
+
+ const auto& get_mds_info() const { return mds_info; }
+ const auto& get_mds_info_gid(mds_gid_t gid) const {
+ return mds_info.at(gid);
+ }
+ const mds_info_t& get_mds_info(mds_rank_t m) const {
+ ceph_assert(up.count(m) && mds_info.count(up.at(m)));
+ return mds_info.at(up.at(m));
+ }
+ mds_gid_t find_mds_gid_by_name(std::string_view s) const {
+ for (const auto& [gid, info] : mds_info) {
+ if (info.name == s) {
+ return gid;
+ }
+ }
+ return MDS_GID_NONE;
+ }
+
+ // counts
+ unsigned get_num_in_mds() const {
+ return in.size();
+ }
+ unsigned get_num_up_mds() const {
+ return up.size();
+ }
+ mds_rank_t get_last_in_mds() const {
+ auto p = in.rbegin();
+ return p == in.rend() ? MDS_RANK_NONE : *p;
+ }
+ int get_num_failed_mds() const {
+ return failed.size();
+ }
+ unsigned get_num_mds(int state) const {
+ unsigned n = 0;
+ for (std::map<mds_gid_t,mds_info_t>::const_iterator p = mds_info.begin();
+ p != mds_info.end();
+ ++p)
+ if (p->second.state == state) ++n;
+ return n;
+ }
+
+ // data pools
+ void add_data_pool(int64_t poolid) {
+ data_pools.push_back(poolid);
+ }
+ int remove_data_pool(int64_t poolid) {
+ std::vector<int64_t>::iterator p = std::find(data_pools.begin(), data_pools.end(), poolid);
+ if (p == data_pools.end())
+ return -ENOENT;
+ data_pools.erase(p);
+ return 0;
+ }
+
+ // sets
+ void get_mds_set(std::set<mds_rank_t>& s) const {
+ s = in;
+ }
+ void get_up_mds_set(std::set<mds_rank_t>& s) const {
+ for (std::map<mds_rank_t, mds_gid_t>::const_iterator p = up.begin();
+ p != up.end();
+ ++p)
+ s.insert(p->first);
+ }
+ void get_active_mds_set(std::set<mds_rank_t>& s) const {
+ get_mds_set(s, MDSMap::STATE_ACTIVE);
+ }
+ void get_standby_replay_mds_set(std::set<mds_rank_t>& s) const {
+ get_mds_set(s, MDSMap::STATE_STANDBY_REPLAY);
+ }
+ void get_failed_mds_set(std::set<mds_rank_t>& s) const {
+ s = failed;
+ }
+
+ // features
+ uint64_t get_up_features() { // bitwise AND of mds_features over the daemons holding each up rank; the result is cached in cached_up_features
+ if (!cached_up_features) { // 0 doubles as "not computed yet", so a result of 0 is recomputed on every call
+ bool first = true;
+ for (std::map<mds_rank_t, mds_gid_t>::const_iterator p = up.begin();
+ p != up.end();
+ ++p) {
+ std::map<mds_gid_t, mds_info_t>::const_iterator q =
+ mds_info.find(p->second);
+ ceph_assert(q != mds_info.end()); // every gid in `up` must have an mds_info entry
+ if (first) {
+ cached_up_features = q->second.mds_features; // seed with the first daemon's features
+ first = false;
+ } else {
+ cached_up_features &= q->second.mds_features; // keep only features common to all so far
+ }
+ }
+ }
+ return cached_up_features;
+ }
+
+ /**
+ * Get MDS ranks which are in but not up.
+ */
+ void get_down_mds_set(std::set<mds_rank_t> *s) const
+ {
+ ceph_assert(s != NULL);
+ s->insert(failed.begin(), failed.end());
+ s->insert(damaged.begin(), damaged.end());
+ }
+
+ int get_failed() const {
+ if (!failed.empty()) return *failed.begin();
+ return -1;
+ }
+ void get_stopped_mds_set(std::set<mds_rank_t>& s) const {
+ s = stopped;
+ }
+ void get_recovery_mds_set(std::set<mds_rank_t>& s) const {
+ s = failed;
+ for (const auto& p : damaged)
+ s.insert(p);
+ for (const auto& p : mds_info)
+ if (p.second.state >= STATE_REPLAY && p.second.state <= STATE_STOPPING)
+ s.insert(p.second.rank);
+ }
+
+ void get_mds_set_lower_bound(std::set<mds_rank_t>& s, DaemonState first) const {
+ for (std::map<mds_gid_t, mds_info_t>::const_iterator p = mds_info.begin();
+ p != mds_info.end();
+ ++p)
+ if (p->second.state >= first && p->second.state <= STATE_STOPPING)
+ s.insert(p->second.rank);
+ }
+ void get_mds_set(std::set<mds_rank_t>& s, DaemonState state) const {
+ for (std::map<mds_gid_t, mds_info_t>::const_iterator p = mds_info.begin();
+ p != mds_info.end();
+ ++p)
+ if (p->second.state == state)
+ s.insert(p->second.rank);
+ }
+
+ void get_health(list<pair<health_status_t,std::string> >& summary,
+ list<pair<health_status_t,std::string> > *detail) const;
+
+ void get_health_checks(health_check_map_t *checks) const;
+
+ typedef enum
+ {
+ AVAILABLE = 0,
+ TRANSIENT_UNAVAILABLE = 1,
+ STUCK_UNAVAILABLE = 2
+
+ } availability_t;
+
+ /**
+ * Return indication of whether cluster is available. This is a
+ * heuristic for clients to see if they should bother waiting to talk to
+ * MDSs, or whether they should error out at startup/mount.
+ *
+ * A TRANSIENT_UNAVAILABLE result indicates that the cluster is in a
+ * transition state like replaying, or is potentially about to fail over.
+ * Clients should wait for an updated map before making a final decision
+ * about whether the filesystem is mountable.
+ *
+ * A STUCK_UNAVAILABLE result indicates that we can't see a way that
+ * the cluster is about to recover on its own, so it'll probably require
+ * administrator intervention: clients should probably not bother trying
+ * to mount.
+ */
+ availability_t is_cluster_available() const;
+
+ /**
+ * Return whether this MDSMap is suitable for resizing based on the state
+ * of the ranks.
+ */
+ bool is_resizeable() const {
+ return !is_degraded() &&
+ get_num_mds(CEPH_MDS_STATE_CREATING) == 0 &&
+ get_num_mds(CEPH_MDS_STATE_STARTING) == 0 &&
+ get_num_mds(CEPH_MDS_STATE_STOPPING) == 0;
+ }
+
+ // mds states
+ bool is_down(mds_rank_t m) const { return up.count(m) == 0; }
+ bool is_up(mds_rank_t m) const { return up.count(m); }
+ bool is_in(mds_rank_t m) const { return up.count(m) || failed.count(m); }
+ bool is_out(mds_rank_t m) const { return !is_in(m); }
+
+ bool is_failed(mds_rank_t m) const { return failed.count(m); }
+ bool is_stopped(mds_rank_t m) const { return stopped.count(m); }
+
+ bool is_dne(mds_rank_t m) const { return in.count(m) == 0; }
+ bool is_dne_gid(mds_gid_t gid) const { return mds_info.count(gid) == 0; }
+
+ /**
+ * Get MDS daemon status by GID
+ */
+ auto get_state_gid(mds_gid_t gid) const {
+ auto it = mds_info.find(gid);
+ if (it == mds_info.end())
+ return STATE_NULL;
+ return it->second.state;
+ }
+
+ /**
+ * Get MDS rank state if the rank is up, else STATE_NULL
+ */
+ auto get_state(mds_rank_t m) const {
+ auto it = up.find(m);
+ if (it == up.end())
+ return STATE_NULL;
+ return get_state_gid(it->second);
+ }
+
+ const auto& get_info(mds_rank_t m) const {
+ return mds_info.at(up.at(m));
+ }
+ const auto& get_info_gid(mds_gid_t gid) const {
+ return mds_info.at(gid);
+ }
+
+ bool is_boot(mds_rank_t m) const { return get_state(m) == STATE_BOOT; }
+ bool is_creating(mds_rank_t m) const { return get_state(m) == STATE_CREATING; }
+ bool is_starting(mds_rank_t m) const { return get_state(m) == STATE_STARTING; }
+ bool is_replay(mds_rank_t m) const { return get_state(m) == STATE_REPLAY; }
+ bool is_resolve(mds_rank_t m) const { return get_state(m) == STATE_RESOLVE; }
+ bool is_reconnect(mds_rank_t m) const { return get_state(m) == STATE_RECONNECT; }
+ bool is_rejoin(mds_rank_t m) const { return get_state(m) == STATE_REJOIN; }
+ bool is_clientreplay(mds_rank_t m) const { return get_state(m) == STATE_CLIENTREPLAY; }
+ bool is_active(mds_rank_t m) const { return get_state(m) == STATE_ACTIVE; }
+ bool is_stopping(mds_rank_t m) const { return get_state(m) == STATE_STOPPING; }
+ bool is_active_or_stopping(mds_rank_t m) const {
+ return is_active(m) || is_stopping(m);
+ }
+ bool is_clientreplay_or_active_or_stopping(mds_rank_t m) const {
+ return is_clientreplay(m) || is_active(m) || is_stopping(m);
+ }
+
+ mds_gid_t get_standby_replay(mds_rank_t r) const {
+ for (auto& [gid,info] : mds_info) {
+ if (info.rank == r && info.state == STATE_STANDBY_REPLAY) {
+ return gid;
+ }
+ }
+ return MDS_GID_NONE;
+ }
+ bool has_standby_replay(mds_rank_t r) const {
+ return get_standby_replay(r) != MDS_GID_NONE;
+ }
+
+ bool is_followable(mds_rank_t r) const {
+ if (auto it1 = up.find(r); it1 != up.end()) {
+ if (auto it2 = mds_info.find(it1->second); it2 != mds_info.end()) {
+ auto& info = it2->second;
+ if (!info.is_degraded() && !has_standby_replay(r)) {
+ return true;
+ }
+ }
+ }
+ return false;
+ }
+
+ bool is_laggy_gid(mds_gid_t gid) const {
+ auto it = mds_info.find(gid);
+ return it == mds_info.end() ? false : it->second.laggy();
+ }
+
+ // degraded = some recovery in process. fixes active membership and
+ // recovery_set.
+ bool is_degraded() const {
+ if (!failed.empty() || !damaged.empty())
+ return true;
+ for (const auto& p : mds_info) {
+ if (p.second.is_degraded())
+ return true;
+ }
+ return false;
+ }
+ bool is_any_failed() const {
+ return failed.size();
+ }
+ bool is_resolving() const {
+ return
+ get_num_mds(STATE_RESOLVE) > 0 &&
+ get_num_mds(STATE_REPLAY) == 0 &&
+ failed.empty() && damaged.empty();
+ }
+ bool is_rejoining() const {
+ // nodes are rejoining cache state
+ return
+ get_num_mds(STATE_REJOIN) > 0 &&
+ get_num_mds(STATE_REPLAY) == 0 &&
+ get_num_mds(STATE_RECONNECT) == 0 &&
+ get_num_mds(STATE_RESOLVE) == 0 &&
+ failed.empty() && damaged.empty();
+ }
+ bool is_stopped() const {
+ return up.empty();
+ }
+
+ /**
+ * Get whether a rank is 'up', i.e. has
+ * an MDS daemon gid associated with it
+ * in the `up` map.
+ */
+ bool have_inst(mds_rank_t m) const {
+ return up.count(m);
+ }
+
+ /**
+ * Get the MDS daemon's addresses (entity_addrvec_t) for a rank
+ * known to be up; the .at() lookups throw std::out_of_range otherwise.
+ */
+ entity_addrvec_t get_addrs(mds_rank_t m) const {
+ return mds_info.at(up.at(m)).get_addrs();
+ }
+
+ mds_rank_t get_rank_gid(mds_gid_t gid) const {
+ if (mds_info.count(gid)) {
+ return mds_info.at(gid).rank;
+ } else {
+ return MDS_RANK_NONE;
+ }
+ }
+
+ /**
+ * Get MDS rank incarnation if the rank is up, else MDS_GID_NONE
+ */
+ mds_gid_t get_incarnation(mds_rank_t m) const {
+ std::map<mds_rank_t, mds_gid_t>::const_iterator u = up.find(m);
+ if (u == up.end())
+ return MDS_GID_NONE;
+ return (mds_gid_t)get_inc_gid(u->second); // get_inc_gid() yields -1 for a gid missing from mds_info; that value is cast to mds_gid_t here
+ }
+
+ int get_inc_gid(mds_gid_t gid) const {
+ auto mds_info_entry = mds_info.find(gid);
+ if (mds_info_entry != mds_info.end())
+ return mds_info_entry->second.inc;
+ return -1;
+ }
+ void encode(bufferlist& bl, uint64_t features) const;
+ void decode(bufferlist::const_iterator& p);
+ void decode(const bufferlist& bl) {
+ auto p = bl.cbegin();
+ decode(p);
+ }
+ void sanitize(const std::function<bool(int64_t pool)>& pool_exists);
+
+ void print(ostream& out) const;
+ void print_summary(Formatter *f, ostream *out) const;
+
+ void dump(Formatter *f) const;
+ static void generate_test_instances(list<MDSMap*>& ls);
+
+ static bool state_transition_valid(DaemonState prev, DaemonState next);
+};
+WRITE_CLASS_ENCODER_FEATURES(MDSMap::mds_info_t)
+WRITE_CLASS_ENCODER_FEATURES(MDSMap)
+
+inline ostream& operator<<(ostream &out, const MDSMap &m) {
+ m.print_summary(NULL, &out);
+ return out;
+}
+
+inline std::ostream& operator<<(std::ostream& o, const MDSMap::mds_info_t& info) {
+ info.dump(o);
+ return o;
+}
+
+#endif
diff --git a/src/mds/MDSRank.cc b/src/mds/MDSRank.cc
new file mode 100644
index 00000000..83a9d127
--- /dev/null
+++ b/src/mds/MDSRank.cc
@@ -0,0 +1,3824 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2015 Red Hat
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#include <string_view>
+
+#include "common/debug.h"
+#include "common/errno.h"
+
+#include "messages/MClientRequestForward.h"
+#include "messages/MMDSLoadTargets.h"
+#include "messages/MMDSTableRequest.h"
+
+#include "mgr/MgrClient.h"
+
+#include "MDSDaemon.h"
+#include "MDSMap.h"
+#include "SnapClient.h"
+#include "SnapServer.h"
+#include "MDBalancer.h"
+#include "Migrator.h"
+#include "Locker.h"
+#include "InoTable.h"
+#include "mon/MonClient.h"
+#include "common/HeartbeatMap.h"
+#include "ScrubStack.h"
+
+
+#include "MDSRank.h"
+
+#define dout_context g_ceph_context
+#define dout_subsys ceph_subsys_mds
+#undef dout_prefix
+#define dout_prefix *_dout << "mds." << whoami << '.' << incarnation << ' '
+
+class C_Flush_Journal : public MDSInternalContext {
+public:
+ C_Flush_Journal(MDCache *mdcache, MDLog *mdlog, MDSRank *mds,
+ std::ostream *ss, Context *on_finish)
+ : MDSInternalContext(mds),
+ mdcache(mdcache), mdlog(mdlog), ss(ss), on_finish(on_finish),
+ whoami(mds->whoami), incarnation(mds->incarnation) {
+ }
+
+ void send() { // entry point: checks preconditions, then starts the journal flush with flush_mdlog()
+ ceph_assert(mds->mds_lock.is_locked()); // caller must hold mds_lock; uses ceph_assert like the other checks in this file
+
+ dout(20) << __func__ << dendl;
+
+ if (mdcache->is_readonly()) {
+ dout(5) << __func__ << ": read-only FS" << dendl;
+ complete(-EROFS); // a read-only FS is reported to on_finish as -EROFS
+ return;
+ }
+
+ if (!mds->is_active()) {
+ dout(5) << __func__ << ": MDS not active, no-op" << dendl;
+ complete(0); // completes with success without flushing anything
+ return;
+ }
+
+ flush_mdlog();
+ }
+
+private:
+
+ void flush_mdlog() { // first step: start a new log segment, flush, and wait for the log to become safe
+ dout(20) << __func__ << dendl;
+
+ // I need to seal off the current segment, and then mark all
+ // previous segments for expiry
+ mdlog->start_new_segment();
+
+ Context *ctx = new FunctionContext([this](int r) {
+ handle_flush_mdlog(r);
+ });
+
+ // Flush initially so that all the segments older than our new one
+ // will be eligible for expiry
+ mdlog->flush();
+ mdlog->wait_for_safe(new MDSInternalContextWrapper(mds, ctx)); // continues in handle_flush_mdlog() once the wait completes
+ }
+
+ void handle_flush_mdlog(int r) {
+ dout(20) << __func__ << ": r=" << r << dendl;
+
+ if (r != 0) {
+ *ss << "Error " << r << " (" << cpp_strerror(r) << ") while flushing journal";
+ complete(r);
+ return;
+ }
+
+ clear_mdlog();
+ }
+
+ void clear_mdlog() {
+ dout(20) << __func__ << dendl;
+
+ Context *ctx = new FunctionContext([this](int r) {
+ handle_clear_mdlog(r);
+ });
+
+ // Because we may not be the last wait_for_safe context on MDLog,
+ // and subsequent contexts might wake up in the middle of our
+ // later trim_all and interfere with expiry (by e.g. marking
+ // dirs/dentries dirty on previous log segments), we run a second
+ // wait_for_safe here. See #10368
+ mdlog->wait_for_safe(new MDSInternalContextWrapper(mds, ctx));
+ }
+
+ void handle_clear_mdlog(int r) {
+ dout(20) << __func__ << ": r=" << r << dendl;
+
+ if (r != 0) {
+ *ss << "Error " << r << " (" << cpp_strerror(r) << ") while flushing journal";
+ complete(r);
+ return;
+ }
+
+ trim_mdlog();
+ }
+
+ void trim_mdlog() {
+ // Put all the old log segments into expiring or expired state
+ dout(5) << __func__ << ": beginning segment expiry" << dendl;
+
+ int ret = mdlog->trim_all();
+ if (ret != 0) {
+ *ss << "Error " << ret << " (" << cpp_strerror(ret) << ") while trimming log";
+ complete(ret);
+ return;
+ }
+
+ expire_segments();
+ }
+
+ void expire_segments() {
+ dout(20) << __func__ << dendl;
+
+ // Attach contexts to wait for all expiring segments to expire
+ MDSGatherBuilder expiry_gather(g_ceph_context);
+
+ const auto &expiring_segments = mdlog->get_expiring_segments();
+ for (auto p : expiring_segments) {
+ p->wait_for_expiry(expiry_gather.new_sub());
+ }
+ dout(5) << __func__ << ": waiting for " << expiry_gather.num_subs_created()
+ << " segments to expire" << dendl;
+
+ if (!expiry_gather.has_subs()) {
+ trim_segments();
+ return;
+ }
+
+ Context *ctx = new FunctionContext([this](int r) {
+ handle_expire_segments(r);
+ });
+ expiry_gather.set_finisher(new MDSInternalContextWrapper(mds, ctx));
+ expiry_gather.activate();
+ }
+
+ void handle_expire_segments(int r) {
+ dout(20) << __func__ << ": r=" << r << dendl;
+
+ ceph_assert(r == 0); // MDLog is not allowed to raise errors via
+ // wait_for_expiry
+ trim_segments();
+ }
+
+ void trim_segments() { // hands trim_expired_segments() to mds->finisher instead of calling it inline
+ dout(20) << __func__ << dendl;
+
+ Context *ctx = new C_OnFinisher(new FunctionContext([this](int _) {
+ std::lock_guard locker(mds->mds_lock); // the callback takes mds_lock itself before trimming
+ trim_expired_segments();
+ }), mds->finisher);
+ ctx->complete(0); // presumably queues the wrapped context on the finisher -- confirm against C_OnFinisher
+ }
+
+ void trim_expired_segments() {
+ dout(5) << __func__ << ": expiry complete, expire_pos/trim_pos is now "
+ << std::hex << mdlog->get_journaler()->get_expire_pos() << "/"
+ << mdlog->get_journaler()->get_trimmed_pos() << dendl;
+
+ // Now everyone I'm interested in is expired
+ mdlog->trim_expired_segments();
+
+ dout(5) << __func__ << ": trim complete, expire_pos/trim_pos is now "
+ << std::hex << mdlog->get_journaler()->get_expire_pos() << "/"
+ << mdlog->get_journaler()->get_trimmed_pos() << dendl;
+
+ write_journal_head();
+ }
+
+ void write_journal_head() {
+ dout(20) << __func__ << dendl;
+
+ Context *ctx = new FunctionContext([this](int r) {
+ std::lock_guard locker(mds->mds_lock);
+ handle_write_head(r);
+ });
+ // Flush the journal header so that readers will start from after
+ // the flushed region
+ mdlog->get_journaler()->write_head(ctx);
+ }
+
+ void handle_write_head(int r) {
+ if (r != 0) {
+ *ss << "Error " << r << " (" << cpp_strerror(r) << ") while writing header";
+ } else {
+ dout(5) << __func__ << ": write_head complete, all done!" << dendl;
+ }
+
+ complete(r);
+ }
+
+ void finish(int r) override {
+ dout(20) << __func__ << ": r=" << r << dendl;
+ on_finish->complete(r);
+ }
+
+ MDCache *mdcache;
+ MDLog *mdlog;
+ std::ostream *ss;
+ Context *on_finish;
+
+ // so as to use dout
+ mds_rank_t whoami;
+ int incarnation;
+};
+
+class C_Drop_Cache : public MDSInternalContext {
+public:
+ C_Drop_Cache(Server *server, MDCache *mdcache, MDLog *mdlog,
+ MDSRank *mds, uint64_t recall_timeout,
+ Formatter *f, Context *on_finish)
+ : MDSInternalContext(mds),
+ server(server), mdcache(mdcache), mdlog(mdlog),
+ recall_timeout(recall_timeout), recall_start(mono_clock::now()),
+ f(f), on_finish(on_finish),
+ whoami(mds->whoami), incarnation(mds->incarnation) {
+ }
+
+ void send() { // entry point: opens the "result" section on the formatter and starts client state recall
+ // not really a hard requirement here, but lets ensure this in
+ // case we change the logic here.
+ ceph_assert(mds->mds_lock.is_locked()); // uses ceph_assert like the other checks in this file
+
+ dout(20) << __func__ << dendl;
+ f->open_object_section("result");
+ recall_client_state();
+ }
+
+private:
+ // context which completes itself (with -ETIMEDOUT) after a specified
+ // timeout or when explicitly completed, whichever comes first. Note
+ // that the context does not destroy itself after completion -- it
+ // needs to be explicitly freed.
+ class C_ContextTimeout : public MDSInternalContext {
+ public:
+ C_ContextTimeout(MDSRank *mds, uint64_t timeout, Context *on_finish)
+ : MDSInternalContext(mds),
+ timeout(timeout),
+ lock("mds::context::timeout", false, true),
+ on_finish(on_finish) {
+ }
+ ~C_ContextTimeout() {
+ ceph_assert(timer_task == nullptr); // must not be destroyed while a timer event is still registered
+ }
+
+ void start_timer() {
+ if (!timeout) { // a timeout of 0 means no timer is armed at all
+ return;
+ }
+
+ timer_task = new FunctionContext([this](int _) {
+ timer_task = nullptr; // the event has fired, so there is nothing left to cancel
+ complete(-ETIMEDOUT);
+ });
+ mds->timer.add_event_after(timeout, timer_task);
+ }
+
+ void finish(int r) override {
+ Context *ctx = nullptr;
+ {
+ std::lock_guard locker(lock);
+ std::swap(on_finish, ctx); // take on_finish under the lock so it is completed at most once
+ }
+ if (ctx != nullptr) {
+ ctx->complete(r);
+ }
+ }
+ void complete(int r) override { // cancels a pending timer, then finishes; this override does not delete the object
+ if (timer_task != nullptr) {
+ mds->timer.cancel_event(timer_task);
+ }
+
+ finish(r);
+ }
+
+ uint64_t timeout; // passed to mds->timer.add_event_after(); 0 disables the timer
+ Mutex lock; // guards the hand-off of on_finish in finish()
+ Context *on_finish = nullptr;
+ Context *timer_task = nullptr; // non-null only while a timer event is pending
+ };
+
+ auto do_trim() { // calls mdcache->trim(UINT64_MAX), accumulates the count, and returns the (throttled, count) pair
+ auto [throttled, count] = mdcache->trim(UINT64_MAX);
+ dout(10) << __func__
+ << (throttled ? " (throttled)" : "")
+ << " trimmed " << count << " dentries" << dendl; // the count is added to dentries_trimmed below, so log it as dentries
+ dentries_trimmed += count;
+ return std::make_pair(throttled, count);
+ }
+
+ void recall_client_state() { // issues one round of client recall, then either re-arms itself or moves on to handle_recall_client_state()
+ dout(20) << __func__ << dendl;
+ auto now = mono_clock::now();
+ auto duration = std::chrono::duration<double>(now-recall_start).count(); // seconds elapsed since recall_start
+
+ MDSGatherBuilder gather(g_ceph_context);
+ auto flags = Server::RecallFlags::STEADY|Server::RecallFlags::TRIM;
+ auto [throttled, count] = server->recall_client_state(&gather, flags);
+ dout(10) << __func__
+ << (throttled ? " (throttled)" : "")
+ << " recalled " << count << " caps" << dendl;
+
+ caps_recalled += count;
+ if ((throttled || count > 0) && (recall_timeout == 0 || duration < recall_timeout)) { // progress was made and time remains: run another round
+ C_ContextTimeout *ctx = new C_ContextTimeout(
+ mds, 1, new FunctionContext([this](int r) {
+ recall_client_state();
+ }));
+ ctx->start_timer();
+ gather.set_finisher(new MDSInternalContextWrapper(mds, ctx));
+ gather.activate();
+ mdlog->flush(); /* use down-time to incrementally flush log */
+ do_trim(); /* use down-time to incrementally trim cache */
+ } else {
+ if (!gather.has_subs()) { // nothing outstanding to wait for
+ return handle_recall_client_state(0);
+ } else if (recall_timeout > 0 && duration > recall_timeout) { // out of time: stop waiting on the gather and report a timeout
+ gather.set_finisher(new C_MDSInternalNoop);
+ gather.activate();
+ return handle_recall_client_state(-ETIMEDOUT);
+ } else {
+ uint64_t remaining = (recall_timeout == 0 ? 0 : std::max<uint64_t>(1, static_cast<uint64_t>(recall_timeout-duration))); // clamp to >= 1: a sub-second remainder would truncate to 0, which C_ContextTimeout treats as "no timeout"
+ C_ContextTimeout *ctx = new C_ContextTimeout(
+ mds, remaining, new FunctionContext([this](int r) {
+ handle_recall_client_state(r);
+ }));
+
+ ctx->start_timer();
+ gather.set_finisher(new MDSInternalContextWrapper(mds, ctx));
+ gather.activate();
+ }
+ }
+ }
+
+ // Report the outcome of the client recall phase in a "client_recall"
+ // section of the formatter output, then start the journal flush.
+ void handle_recall_client_state(int r) {
+   dout(20) << __func__ << ": r=" << r << dendl;
+
+   f->open_object_section("client_recall");
+   {
+     f->dump_int("return_code", r);
+     f->dump_string("message", cpp_strerror(r));
+     f->dump_int("recalled", caps_recalled);
+   }
+   f->close_section();
+
+   // a failed recall (e.g. a timeout) is reported but does not stop the
+   // remaining steps
+   flush_journal();
+ }
+
+ // Start a journal flush; handle_flush_journal() is called with its result.
+ void flush_journal() {
+   dout(20) << __func__ << dendl;
+
+   Context *on_flushed = new FunctionContext([this](int r) {
+       handle_flush_journal(r);
+     });
+
+   // any message produced by the flush is collected in ss
+   auto *journal_flush = new C_Flush_Journal(mdcache, mdlog, mds, &ss, on_flushed);
+   journal_flush->send();
+ }
+
+ // Continuation of flush_journal(). On success the result is recorded in a
+ // "flush_journal" section and the cache trim starts; on failure the output
+ // is replaced by an error object and the whole operation completes with r.
+ void handle_flush_journal(int r) {
+   dout(20) << __func__ << ": r=" << r << dendl;
+
+   if (r == 0) {
+     // journal flush section
+     f->open_object_section("flush_journal");
+     f->dump_int("return_code", r);
+     f->dump_string("message", ss.str());
+     f->close_section();
+
+     trim_cache();
+     return;
+   }
+
+   cmd_err(f, ss.str());
+   complete(r);
+ }
+
+ // Trim the cache, repeating after a timer delay of 1.0 for as long as a pass
+ // is throttled but still makes progress; then report the cache status.
+ void trim_cache() {
+   dout(20) << __func__ << dendl;
+
+   auto [throttled, count] = do_trim();
+   const bool try_again = throttled && count > 0;
+   if (!try_again) {
+     cache_status();
+     return;
+   }
+
+   auto retry = new FunctionContext([this](int _) {
+       trim_cache();
+     });
+   mds->timer.add_event_after(1.0, retry);
+ }
+
+ // Final step: report how much was trimmed, dump the cache status and
+ // complete the operation successfully.
+ void cache_status() {
+   dout(20) << __func__ << dendl;
+
+   // trim summary section
+   f->open_object_section("trim_cache");
+   f->dump_int("trimmed", dentries_trimmed);
+   f->close_section();
+
+   // cache status section, written by the cache itself
+   mdcache->cache_status(f);
+
+   complete(0);
+ }
+
+ // Append the elapsed time (seconds since recall_start) to the output,
+ // close the enclosing section and pass the result on to on_finish.
+ void finish(int r) override {
+   dout(20) << __func__ << ": r=" << r << dendl;
+
+   const std::chrono::duration<double> elapsed(mono_clock::now() - recall_start);
+   f->dump_float("duration", elapsed.count());
+
+   f->close_section();
+   on_finish->complete(r);
+ }
+
+ Server *server; // issues the client cap recall
+ MDCache *mdcache; // cache that gets trimmed and whose status is dumped
+ MDLog *mdlog; // flushed between recall rounds and handed to C_Flush_Journal
+ uint64_t recall_timeout; // limit for the recall phase, compared against elapsed seconds; 0 = no limit
+ mono_time recall_start; // reference point for the recall timeout and the reported duration
+ Formatter *f; // receives one output section per phase
+ Context *on_finish; // completed with the final result in finish()
+
+ int retval = 0; // not referenced in the part of the class visible here
+ std::stringstream ss; // handed to C_Flush_Journal; its text is reported as the flush message or error
+ uint64_t caps_recalled = 0; // total over all recall rounds
+ uint64_t dentries_trimmed = 0; // total over all do_trim() calls
+
+ // so as to use dout (presumably read by the log prefix macro, which is not visible here)
+ mds_rank_t whoami;
+ int incarnation;
+
+ // Reset the formatter and emit a single "result" object that carries the
+ // given error text.
+ void cmd_err(Formatter *f, std::string_view err) {
+   f->reset();
+
+   f->open_object_section("result");
+   {
+     f->dump_string("error", err);
+   }
+   f->close_section();
+ }
+};
+
+ /**
+  * Construct the rank and its subsystems.
+  *
+  * The initializer list only records the collaborators passed in and nulls
+  * the subsystem pointers (nullptr, matching the rest of the file); the
+  * subsystems themselves are created in the body. The hooks are completed by
+  * respawn() / suicide() and deleted by the destructor.
+  */
+ MDSRank::MDSRank(
+     mds_rank_t whoami_,
+     Mutex &mds_lock_,
+     LogChannelRef &clog_,
+     SafeTimer &timer_,
+     Beacon &beacon_,
+     std::unique_ptr<MDSMap>& mdsmap_,
+     Messenger *msgr,
+     MonClient *monc_,
+     MgrClient *mgrc,
+     Context *respawn_hook_,
+     Context *suicide_hook_)
+   :
+     whoami(whoami_), incarnation(0),
+     mds_lock(mds_lock_), cct(msgr->cct), clog(clog_), timer(timer_),
+     mdsmap(mdsmap_),
+     objecter(new Objecter(g_ceph_context, msgr, monc_, nullptr, 0, 0)),
+     server(nullptr), mdcache(nullptr), locker(nullptr), mdlog(nullptr),
+     balancer(nullptr), scrubstack(nullptr),
+     damage_table(whoami_),
+     inotable(nullptr), snapserver(nullptr), snapclient(nullptr),
+     sessionmap(this), logger(nullptr), mlogger(nullptr),
+     op_tracker(g_ceph_context, g_conf()->mds_enable_op_tracker,
+                g_conf()->osd_num_op_tracker_shard),
+     last_state(MDSMap::STATE_BOOT),
+     state(MDSMap::STATE_BOOT),
+     cluster_degraded(false), stopping(false),
+     purge_queue(g_ceph_context, whoami_,
+       mdsmap_->get_metadata_pool(), objecter,
+       new FunctionContext(
+         [this](int r){
+           // Purge Queue operates inside mds_lock when we're calling into
+           // it, and outside when in background, so must handle both cases.
+           if (mds_lock.is_locked_by_me()) {
+             handle_write_error(r);
+           } else {
+             std::lock_guard l(mds_lock);
+             handle_write_error(r);
+           }
+         }
+       )
+     ),
+     progress_thread(this), dispatch_depth(0),
+     hb(nullptr), last_tid(0), osd_epoch_barrier(0), beacon(beacon_),
+     mds_slow_req_count(0),
+     last_client_mdsmap_bcast(0),
+     messenger(msgr), monc(monc_), mgrc(mgrc),
+     respawn_hook(respawn_hook_),
+     suicide_hook(suicide_hook_),
+     standby_replaying(false),
+     starttime(mono_clock::now())
+ {
+   // register the constructing thread with the heartbeat map
+   hb = g_ceph_context->get_heartbeat_map()->add_worker("MDSRank", pthread_self());
+
+   purge_queue.update_op_limit(*mdsmap);
+
+   objecter->unset_honor_osdmap_full();
+
+   finisher = new Finisher(cct);
+
+   mdcache = new MDCache(this, purge_queue);
+   mdlog = new MDLog(this);
+   balancer = new MDBalancer(this, messenger, monc);
+
+   scrubstack = new ScrubStack(mdcache, clog, finisher);
+
+   inotable = new InoTable(this);
+   snapserver = new SnapServer(this, monc);
+   snapclient = new SnapClient(this);
+
+   server = new Server(this);
+   locker = new Locker(this, mdcache);
+
+   op_tracker.set_complaint_and_threshold(cct->_conf->mds_op_complaint_time,
+                                          cct->_conf->mds_op_log_threshold);
+   op_tracker.set_history_size_and_duration(cct->_conf->mds_op_history_size,
+                                            cct->_conf->mds_op_history_duration);
+
+   schedule_update_timer_task();
+ }
+
+ /**
+  * Release everything the rank owns, in the same order as before.
+  *
+  * Deleting a null pointer is a no-op, so the pointers are deleted without a
+  * prior check; the perf counter loggers still need one because they are
+  * also passed to the collection's remove().
+  */
+ MDSRank::~MDSRank()
+ {
+   // hb is already null if shutdown() removed the worker
+   if (hb) {
+     g_ceph_context->get_heartbeat_map()->remove_worker(hb);
+   }
+
+   delete scrubstack;
+   scrubstack = nullptr;
+   delete mdcache;
+   mdcache = nullptr;
+   delete mdlog;
+   mdlog = nullptr;
+   delete balancer;
+   balancer = nullptr;
+   delete inotable;
+   inotable = nullptr;
+   delete snapserver;
+   snapserver = nullptr;
+   delete snapclient;
+   snapclient = nullptr;
+
+   delete server;
+   server = nullptr;
+   delete locker;
+   locker = nullptr;
+
+   if (logger) {
+     g_ceph_context->get_perfcounters_collection()->remove(logger);
+     delete logger;
+     logger = nullptr;
+   }
+   if (mlogger) {
+     g_ceph_context->get_perfcounters_collection()->remove(mlogger);
+     delete mlogger;
+     mlogger = nullptr;
+   }
+
+   delete finisher;
+   finisher = nullptr;
+
+   delete suicide_hook;
+   suicide_hook = nullptr;
+
+   delete respawn_hook;
+   respawn_hook = nullptr;
+
+   delete objecter;
+   objecter = nullptr;
+ }
+
+ void MDSRankDispatcher::init()
+ { // Start the rank's helpers in order: objecter, logging, OSD map, progress thread, purge queue, finisher.
+ objecter->init();
+ messenger->add_dispatcher_head(objecter); // let the objecter receive messages from the messenger
+
+ objecter->start();
+
+ update_log_config();
+ create_logger();
+
+ // Expose the OSDMap (already populated during MDS::init) to anyone
+ // who is interested in it.
+ handle_osd_map();
+
+ progress_thread.create("mds_rank_progr"); // the string is the thread's name
+
+ purge_queue.init();
+
+ finisher->start(); // stopped again in shutdown()
+ }
+
+ /**
+  * Drop export targets whose counter has decayed to (almost) nothing and tell
+  * the monitors whenever the remaining set differs from the export_targets
+  * recorded for this rank in the MDSMap.
+  */
+ void MDSRank::update_targets()
+ {
+   // get MonMap's idea of my export_targets
+   const set<mds_rank_t>& map_targets = mdsmap->get_mds_info(get_nodeid()).export_targets;
+
+   dout(20) << "updating export targets, currently " << map_targets.size() << " ranks are targets" << dendl;
+
+   bool send = false;
+   set<mds_rank_t> new_map_targets;
+
+   for (auto it = export_targets.begin(); it != export_targets.end(); ) {
+     const mds_rank_t rank = it->first;
+     auto &counter = it->second;
+     dout(20) << "export target mds." << rank << " is " << counter << dendl;
+
+     const double val = counter.get();
+     if (val <= 0.01) {
+       dout(15) << "export target mds." << rank << " is no longer an export target" << dendl;
+       it = export_targets.erase(it);
+       send = true;
+       continue;
+     }
+     if (map_targets.count(rank) == 0) {
+       dout(15) << "export target mds." << rank << " not in map's export_targets" << dendl;
+       send = true;
+     }
+     new_map_targets.insert(rank);
+     ++it;
+   }
+
+   // the map still lists ranks that are no longer targets here
+   if (new_map_targets.size() < map_targets.size()) {
+     dout(15) << "export target map holds stale targets, sending update" << dendl;
+     send = true;
+   }
+
+   if (!send) {
+     return;
+   }
+
+   dout(15) << "updating export_targets, now " << new_map_targets.size() << " ranks are targets" << dendl;
+   auto m = MMDSLoadTargets::create(mds_gid_t(monc->get_global_id()), new_map_targets);
+   monc->send_mon_message(m.detach());
+ }
+
+ /**
+  * Add `amount` to the decaying counter kept for export target `rank`,
+  * creating the counter on first use. A negative amount selects a default
+  * derived from the mds_bal_target_decay setting.
+  */
+ void MDSRank::hit_export_target(mds_rank_t rank, double amount)
+ {
+   const double rate = g_conf()->mds_bal_target_decay;
+   if (amount < 0.0) {
+     /* a good default for "i am trying to keep this export_target active" */
+     amount = 100.0/g_conf()->mds_bal_target_decay;
+   }
+
+   auto inserted = export_targets.emplace(std::piecewise_construct,
+                                          std::forward_as_tuple(rank),
+                                          std::forward_as_tuple(DecayRate(rate)));
+   const bool is_new = inserted.second;
+   auto &counter = inserted.first->second;
+   counter.hit(amount);
+
+   const char *what = is_new ? "hit export target (new) is " : "hit export target is ";
+   dout(15) << what << counter << dendl;
+ }
+
+ // Completion for a monitor command: remembers the command text and exposes
+ // a string (outs) that the command's sender can have filled in, then passes
+ // both on to MDSRank::_mon_command_finish().
+ class C_MDS_MonCommand : public MDSInternalContext {
+ private:
+   std::string cmd;
+ public:
+   std::string outs;
+
+   C_MDS_MonCommand(MDSRank *m, std::string_view c)
+     : MDSInternalContext(m),
+       cmd(c)
+   {}
+
+   void finish(int r) override {
+     mds->_mon_command_finish(r, cmd, outs);
+   }
+ };
+
+ // Log the outcome of a monitor command: failures (r < 0) at level 0 together
+ // with the command's output string, successes at level 1.
+ void MDSRank::_mon_command_finish(int r, std::string_view cmd, std::string_view outs)
+ {
+   if (r >= 0) {
+     dout(1) << __func__ << ": mon command " << cmd << " succeed" << dendl;
+     return;
+   }
+
+   dout(0) << __func__ << ": mon command " << cmd << " failed with errno " << r
+           << " (" << outs << ")" << dendl;
+ }
+
+ /**
+  * Ask the monitors to set allow_multimds_snaps on this file system.
+  * The command is sent at most once per process (guarded by a function-local
+  * static flag); its result is only logged.
+  */
+ void MDSRank::set_mdsmap_multimds_snaps_allowed()
+ {
+   static bool already_sent = false;
+   if (already_sent) {
+     return;
+   }
+
+   // the command is a JSON object assembled by hand
+   stringstream json;
+   json << "{\"prefix\":\"fs set\", \"fs_name\":\"" << mdsmap->get_fs_name() << "\", "
+        << "\"var\":\"allow_multimds_snaps\", \"val\":\"true\", "
+        << "\"confirm\":\"--yes-i-am-really-a-mds\"}";
+   std::vector<std::string> cmd = {json.str()};
+
+   dout(0) << __func__ << ": sending mon command: " << cmd[0] << dendl;
+
+   auto *on_reply = new C_MDS_MonCommand(this, cmd[0]);
+   monc->start_mon_command(cmd, {}, nullptr, &on_reply->outs, new C_IO_Wrapper(this, on_reply));
+
+   already_sent = true;
+ }
+
+ // Forward the notification to the snap server, but only when this rank is
+ // the table server named in the MDSMap.
+ void MDSRank::mark_base_recursively_scrubbed(inodeno_t ino)
+ {
+   const bool is_tableserver = (mdsmap->get_tableserver() == whoami);
+   if (!is_tableserver) {
+     return;
+   }
+   snapserver->mark_base_recursively_scrubbed(ino);
+ }
+
+ void MDSRankDispatcher::tick()
+ { // Periodic upkeep; which parts run depends on the rank's state, and everything is skipped while laggy.
+ heartbeat_reset();
+
+ if (beacon.is_laggy()) {
+ dout(1) << "skipping upkeep work because connection to Monitors appears laggy" << dendl;
+ return;
+ }
+
+ check_ops_in_flight();
+
+ // Wake up the progress thread in case we used to be laggy and have
+ // waiting_for_nolaggy messages to progress.
+ progress_thread.signal();
+
+ // make sure the mds log flushes periodically (trimming follows below)
+ mdlog->flush();
+
+ // update average session uptime
+ sessionmap.update_average_session_age();
+
+ if (is_active() || is_stopping()) {
+ mdlog->trim(); // NOT during recovery!
+ }
+
+ // session and lock upkeep, only while the cache is trimmable
+ if (is_cache_trimmable()) {
+ server->find_idle_sessions();
+ server->evict_cap_revoke_non_responders();
+ locker->tick();
+ }
+
+ // refresh perf counters (logger may not exist)
+ if (logger) {
+ logger->set(l_mds_subtrees, mdcache->num_subtrees());
+ mdcache->log_stat();
+ }
+
+ if (is_reconnect())
+ server->reconnect_tick();
+
+ if (is_active()) {
+ balancer->tick();
+ mdcache->find_stale_fragment_freeze();
+ mdcache->migrator->find_stale_export_freeze();
+
+ if (mdsmap->get_tableserver() == whoami) { // only the rank acting as table server
+ snapserver->check_osd_map(false);
+ // Filesystem was created by pre-mimic mds. Allow multi-active mds after
+ // all old snapshots are deleted.
+ if (!mdsmap->allows_multimds_snaps() &&
+ snapserver->can_allow_multimds_snaps()) {
+ set_mdsmap_multimds_snaps_allowed();
+ }
+ }
+ }
+
+ if (is_active() || is_stopping()) {
+ update_targets();
+ }
+
+ // shutting down: keep trimming, then wait for cache and purge queue to drain
+ if (is_stopping()) {
+ mdlog->trim(); // second trim in this tick when stopping (see above)
+ if (mdcache->shutdown_pass()) {
+ uint64_t pq_progress = 0 ;
+ uint64_t pq_total = 0;
+ size_t pq_in_flight = 0;
+ if (!purge_queue.drain(&pq_progress, &pq_total, &pq_in_flight)) {
+ dout(7) << "shutdown_pass=true, but still waiting for purge queue"
+ << dendl;
+ // This takes unbounded time, so we must indicate progress
+ // to the administrator: we do it in a slightly imperfect way
+ // by sending periodic (tick frequency) clog messages while
+ // in this state.
+ clog->info() << "MDS rank " << whoami << " waiting for purge queue ("
+ << std::dec << pq_progress << "/" << pq_total << " " << pq_in_flight
+ << " files purging" << ")";
+ } else {
+ dout(7) << "shutdown_pass=true, finished w/ shutdown, moving to "
+ "down:stopped" << dendl;
+ stopping_done();
+ }
+ }
+ else {
+ dout(7) << "shutdown_pass=false" << dendl;
+ }
+ }
+
+ // Expose ourselves to Beacon to update health indicators
+ beacon.notify_health(this);
+ }
+
+ void MDSRankDispatcher::shutdown()
+ { // Stop the rank's subsystems in dependency order; the Unlock/Lock pairs below imply mds_lock is held on entry.
+ // It should never be possible for shutdown to get called twice, because
+ // anyone picking up mds_lock checks if stopping is true and drops
+ // out if it is.
+ ceph_assert(stopping == false);
+ stopping = true;
+
+ dout(1) << __func__ << ": shutting down rank " << whoami << dendl;
+
+ g_conf().remove_observer(this);
+
+ timer.shutdown();
+
+ // MDLog has to shut down before the finisher, because some of its
+ // threads block on IOs that require finisher to complete.
+ mdlog->shutdown();
+
+ // shut down cache
+ mdcache->shutdown();
+
+ purge_queue.shutdown();
+
+ mds_lock.Unlock(); // the finisher is stopped without holding mds_lock
+ finisher->stop(); // no flushing
+ mds_lock.Lock();
+
+ if (objecter->initialized)
+ objecter->shutdown();
+
+ monc->shutdown();
+
+ op_tracker.on_shutdown();
+
+ progress_thread.shutdown(); // asserts that stopping is set and mds_lock is held
+
+ // release mds_lock for finisher/messenger threads (e.g.
+ // MDSDaemon::ms_handle_reset called from Messenger).
+ mds_lock.Unlock();
+
+ // shut down messenger
+ messenger->shutdown();
+
+ mds_lock.Lock();
+
+ // Workaround unclean shutdown: HeartbeatMap will assert if
+ // worker is not removed (as we do in ~MDSRank), but the destructor is
+ // not always called after suicide.
+ if (hb) {
+ g_ceph_context->get_heartbeat_map()->remove_worker(hb);
+ hb = NULL; // heartbeat_reset() treats a null hb as "stopping" and returns early
+ }
+ }
+
+/**
+ * Helper for simple callbacks that call a void fn with no args.
+ */
+class C_MDS_VoidFn : public MDSInternalContext
+{
+ typedef void (MDSRank::*fn_ptr)();
+ protected:
+ fn_ptr fn;
+ public:
+ C_MDS_VoidFn(MDSRank *mds_, fn_ptr fn_)
+ : MDSInternalContext(mds_), fn(fn_)
+ {
+ ceph_assert(mds_);
+ ceph_assert(fn_);
+ }
+
+ void finish(int r) override
+ {
+ (mds->*fn)();
+ }
+};
+
+ // Metadata pool id as recorded in the current MDSMap.
+ int64_t MDSRank::get_metadata_pool()
+ {
+   const int64_t pool = mdsmap->get_metadata_pool();
+   return pool;
+ }
+
+ // Table client for table type `t`: none for TABLE_ANCHOR, the snap client
+ // for TABLE_SNAP. Any other value aborts.
+ MDSTableClient *MDSRank::get_table_client(int t)
+ {
+   if (t == TABLE_ANCHOR) {
+     return nullptr;
+   }
+   if (t == TABLE_SNAP) {
+     return snapclient;
+   }
+   ceph_abort();
+ }
+
+ // Table server for table type `t`: none for TABLE_ANCHOR, the snap server
+ // for TABLE_SNAP. Any other value aborts.
+ MDSTableServer *MDSRank::get_table_server(int t)
+ {
+   if (t == TABLE_ANCHOR) {
+     return nullptr;
+   }
+   if (t == TABLE_SNAP) {
+     return snapserver;
+   }
+   ceph_abort();
+ }
+
+ // Fire the suicide hook (at most once; the pointer is cleared afterwards).
+ void MDSRank::suicide()
+ {
+   if (!suicide_hook) {
+     return;
+   }
+   suicide_hook->complete(0);
+   suicide_hook = NULL;
+ }
+
+ // Fire the respawn hook (at most once; the pointer is cleared afterwards).
+ void MDSRank::respawn()
+ {
+   if (!respawn_hook) {
+     return;
+   }
+   respawn_hook->complete(0);
+   respawn_hook = NULL;
+ }
+
+ void MDSRank::damaged()
+ { // Report this rank as DAMAGED to the monitors, then respawn.
+ ceph_assert(whoami != MDS_RANK_NONE); // only meaningful while holding a rank
+ ceph_assert(mds_lock.is_locked_by_me()); // callers without the lock use damaged_unlocked()
+
+ beacon.set_want_state(*mdsmap, MDSMap::STATE_DAMAGED);
+ monc->flush_log(); // Flush any clog error from before we were called
+ beacon.notify_health(this); // Include latest status in our swan song
+ beacon.send_and_wait(g_conf()->mds_mon_shutdown_timeout); // waits no longer than the configured timeout
+
+ // It's okay if we timed out and the mon didn't get our beacon, because
+ // another daemon (or ourselves after respawn) will eventually take the
+ // rank and report DAMAGED again when it hits the same problem we did.
+
+ respawn(); // Respawn into standby in case mon has other work for us
+ }
+
+ // damaged() for callers that do not hold mds_lock: the lock is taken for
+ // the duration of the call.
+ void MDSRank::damaged_unlocked()
+ {
+   std::lock_guard locker(mds_lock);
+   damaged();
+ }
+
+ /**
+  * React to a failed write.
+  *
+  * Being blacklisted always leads to a respawn. For any other error the
+  * mds_action_on_write_error setting decides: >= 2 respawns, 1 forces the
+  * cache read-only, anything else only logs the error.
+  */
+ void MDSRank::handle_write_error(int err)
+ {
+   if (err == -EBLACKLISTED) {
+     derr << "we have been blacklisted (fenced), respawning..." << dendl;
+     respawn();
+     return;
+   }
+
+   const auto action = g_conf()->mds_action_on_write_error;
+   if (action >= 2) {
+     derr << "unhandled write error " << cpp_strerror(err) << ", suicide..." << dendl;
+     respawn();
+     return;
+   }
+   if (action == 1) {
+     derr << "unhandled write error " << cpp_strerror(err) << ", force readonly..." << dendl;
+     mdcache->force_readonly();
+     return;
+   }
+
+   // ignore;
+   derr << "unhandled write error " << cpp_strerror(err) << ", ignore..." << dendl;
+ }
+
+ /**
+  * Thread body: holding mds_lock, sleep on the condition variable until
+  * there is queued work that can be processed, advance the queues, and
+  * repeat until the rank is stopping.
+  */
+ void *MDSRank::ProgressThread::entry()
+ {
+   std::lock_guard l(mds->mds_lock);
+
+   // true while there is nothing to do: no finished contexts, and deferred
+   // messages are either absent or blocked because the beacon is laggy
+   const auto idle = [this] {
+     return !mds->stopping &&
+            mds->finished_queue.empty() &&
+            (mds->waiting_for_nolaggy.empty() || mds->beacon.is_laggy());
+   };
+
+   for (;;) {
+     while (idle()) {
+       cond.Wait(mds->mds_lock);
+     }
+
+     if (mds->stopping) {
+       break;
+     }
+
+     mds->_advance_queues();
+   }
+
+   return NULL;
+ }
+
+
+ /**
+  * Stop the progress thread. Requires mds_lock to be held and the stopping
+  * flag to be set. When called from another thread, the progress thread is
+  * woken and joined with mds_lock temporarily released.
+  */
+ void MDSRank::ProgressThread::shutdown()
+ {
+   ceph_assert(mds->mds_lock.is_locked_by_me());
+   ceph_assert(mds->stopping);
+
+   if (am_self()) {
+     // Stopping is set, we will fall out of our main loop naturally
+     return;
+   }
+
+   // Kick the thread to notice mds->stopping, and join it
+   cond.Signal();
+   mds->mds_lock.Unlock();
+   if (is_started()) {
+     join();
+   }
+   mds->mds_lock.Lock();
+ }
+
+ /**
+  * Messenger entry point for a newly received message. Refreshes the
+  * last_seen time of the sending client's session (if the connection has
+  * one) and dispatches the message with the dispatch depth raised.
+  * Returns what _dispatch() returns.
+  */
+ bool MDSRankDispatcher::ms_dispatch(const Message::const_ref &m)
+ {
+   if (m->get_source().is_client()) {
+     auto *session = static_cast<Session*>(m->get_connection()->get_priv().get());
+     if (session) {
+       session->last_seen = Session::clock::now();
+     }
+   }
+
+   inc_dispatch_depth();
+   const bool handled = _dispatch(m, true);
+   dec_dispatch_depth();
+   return handled;
+ }
+
+ bool MDSRank::_dispatch(const Message::const_ref &m, bool new_msg)
+ { // Handle or defer one message; returns false only for message types this rank cannot handle.
+ if (is_stale_message(m)) {
+ return true; // dropped, but counts as handled
+ }
+ // do not proceed if this message cannot be handled
+ if (!is_valid_message(m)) {
+ return false;
+ }
+
+ if (beacon.is_laggy()) {
+ dout(5) << " laggy, deferring " << *m << dendl;
+ waiting_for_nolaggy.push_back(m);
+ } else if (new_msg && !waiting_for_nolaggy.empty()) { // queue new messages behind the ones already deferred
+ dout(5) << " there are deferred messages, deferring " << *m << dendl;
+ waiting_for_nolaggy.push_back(m);
+ } else {
+ handle_message(m);
+ heartbeat_reset();
+ }
+
+ if (dispatch_depth > 1) // nested dispatch: leave the housekeeping to the outermost call
+ return true;
+
+ // finish any triggered contexts
+ _advance_queues();
+
+ if (beacon.is_laggy()) {
+ // We've gone laggy during dispatch, don't do any
+ // more housekeeping
+ return true;
+ }
+
+ // hack: thrash exports (runs mds_thrash_exports passes; start/now only serve the disabled check below)
+ static utime_t start;
+ utime_t now = ceph_clock_now();
+ if (start == utime_t())
+ start = now;
+ /*double el = now - start;
+ if (el > 30.0 &&
+ el < 60.0)*/
+ for (int i=0; i<g_conf()->mds_thrash_exports; i++) {
+ set<mds_rank_t> s;
+ if (!is_active()) break;
+ mdsmap->get_mds_set(s, MDSMap::STATE_ACTIVE);
+ if (s.size() < 2 || CInode::count() < 10)
+ break; // need peers for this to work.
+ if (mdcache->migrator->get_num_exporting() > g_conf()->mds_thrash_exports * 5 ||
+ mdcache->migrator->get_export_queue_size() > g_conf()->mds_thrash_exports * 10)
+ break; // enough exports in flight or queued already
+
+ dout(7) << "mds thrashing exports pass " << (i+1) << "/" << g_conf()->mds_thrash_exports << dendl;
+
+ // pick a random dir inode
+ CInode *in = mdcache->hack_pick_random_inode();
+
+ list<CDir*> ls;
+ in->get_dirfrags(ls);
+ if (!ls.empty()) { // must be an open dir.
+ list<CDir*>::iterator p = ls.begin();
+ int n = rand() % ls.size(); // pick one of its dirfrags at random
+ while (n--)
+ ++p;
+ CDir *dir = *p;
+ if (!dir->get_parent_dir()) continue; // must be linked.
+ if (!dir->is_auth()) continue; // must be auth.
+
+ mds_rank_t dest;
+ do { // pick a random active rank other than ourselves (s holds at least two ranks here)
+ int k = rand() % s.size();
+ set<mds_rank_t>::iterator p = s.begin();
+ while (k--) ++p;
+ dest = *p;
+ } while (dest == whoami);
+ mdcache->migrator->export_dir_nicely(dir,dest);
+ }
+ }
+ // hack: thrash fragments (runs mds_thrash_fragments passes)
+ for (int i=0; i<g_conf()->mds_thrash_fragments; i++) {
+ if (!is_active()) break;
+ if (mdcache->get_num_fragmenting_dirs() > 5 * g_conf()->mds_thrash_fragments) break;
+ dout(7) << "mds thrashing fragments pass " << (i+1) << "/" << g_conf()->mds_thrash_fragments << dendl;
+
+ // pick a random dir inode
+ CInode *in = mdcache->hack_pick_random_inode();
+
+ list<CDir*> ls;
+ in->get_dirfrags(ls);
+ if (ls.empty()) continue; // must be an open dir.
+ CDir *dir = ls.front();
+ if (!dir->get_parent_dir()) continue; // must be linked.
+ if (!dir->is_auth()) continue; // must be auth.
+ frag_t fg = dir->get_frag();
+ if ((fg == frag_t() || (rand() % (1 << fg.bits()) == 0))) { // always split the root frag; deeper frags with probability 1/2^bits
+ mdcache->split_dir(dir, 1);
+ } else {
+ balancer->queue_merge(dir);
+ }
+ }
+
+ // hack: force hash root?
+ /*
+ if (false &&
+ mdcache->get_root() &&
+ mdcache->get_root()->dir &&
+ !(mdcache->get_root()->dir->is_hashed() ||
+ mdcache->get_root()->dir->is_hashing())) {
+ dout(0) << "hashing root" << dendl;
+ mdcache->migrator->hash_dir(mdcache->get_root()->dir);
+ }
+ */
+
+ update_mlogger();
+ return true;
+ }
+
+ // Publish the object counters of the cache classes (inodes, dirs, dentries,
+ // caps) through the mlogger perf counters, if that logger exists.
+ void MDSRank::update_mlogger()
+ {
+   if (!mlogger) {
+     return;
+   }
+
+   // current counts
+   mlogger->set(l_mdm_ino, CInode::count());
+   mlogger->set(l_mdm_dir, CDir::count());
+   mlogger->set(l_mdm_dn, CDentry::count());
+   mlogger->set(l_mdm_cap, Capability::count());
+
+   // increments / decrements per class
+   mlogger->set(l_mdm_inoa, CInode::increments());
+   mlogger->set(l_mdm_inos, CInode::decrements());
+   mlogger->set(l_mdm_dira, CDir::increments());
+   mlogger->set(l_mdm_dirs, CDir::decrements());
+   mlogger->set(l_mdm_dna, CDentry::increments());
+   mlogger->set(l_mdm_dns, CDentry::decrements());
+   mlogger->set(l_mdm_capa, Capability::increments());
+   mlogger->set(l_mdm_caps, Capability::decrements());
+ }
+
+ // Tell whether the message is of a type the mds can handle: anything on the
+ // cache or migrator port, plus the individual types listed below.
+ bool MDSRank::is_valid_message(const Message::const_ref &m) {
+   const int type = m->get_type();
+   const int port = type & 0xff00;
+
+   if (port == MDS_PORT_CACHE || port == MDS_PORT_MIGRATOR) {
+     return true;
+   }
+
+   switch (type) {
+   case CEPH_MSG_CLIENT_SESSION:
+   case CEPH_MSG_CLIENT_RECONNECT:
+   case CEPH_MSG_CLIENT_RECLAIM:
+   case CEPH_MSG_CLIENT_REQUEST:
+   case MSG_MDS_SLAVE_REQUEST:
+   case MSG_MDS_HEARTBEAT:
+   case MSG_MDS_TABLE_REQUEST:
+   case MSG_MDS_LOCK:
+   case MSG_MDS_INODEFILECAPS:
+   case CEPH_MSG_CLIENT_CAPS:
+   case CEPH_MSG_CLIENT_CAPRELEASE:
+   case CEPH_MSG_CLIENT_LEASE:
+     return true;
+   default:
+     return false;
+   }
+ }
+
+/*
+ * lower priority messages we defer if we seem laggy
+ */
+
+#define ALLOW_MESSAGES_FROM(peers) \
+ do { \
+ if (m->get_connection() && (m->get_connection()->get_peer_type() & (peers)) == 0) { \
+ dout(0) << __FILE__ << "." << __LINE__ << ": filtered out request, peer=" << m->get_connection()->get_peer_type() \
+ << " allowing=" << #peers << " message=" << *m << dendl; \
+ return; \
+ } \
+ } while (0)
+
+ /**
+  * Route a message to the subsystem responsible for it. Cache and migrator
+  * traffic is recognised by port, everything else by message type. Each
+  * route first checks the sender's peer type with ALLOW_MESSAGES_FROM, which
+  * returns from this function when the peer is not allowed; client requests
+  * are the one type dispatched without such a check.
+  */
+ void MDSRank::handle_message(const Message::const_ref &m)
+ {
+   const int port = m->get_type() & 0xff00;
+
+   if (port == MDS_PORT_CACHE) {
+     ALLOW_MESSAGES_FROM(CEPH_ENTITY_TYPE_MDS);
+     mdcache->dispatch(m);
+     return;
+   }
+
+   if (port == MDS_PORT_MIGRATOR) {
+     ALLOW_MESSAGES_FROM(CEPH_ENTITY_TYPE_MDS);
+     mdcache->migrator->dispatch(m);
+     return;
+   }
+
+   switch (m->get_type()) {
+     // SERVER
+   case CEPH_MSG_CLIENT_SESSION:
+   case CEPH_MSG_CLIENT_RECONNECT:
+   case CEPH_MSG_CLIENT_RECLAIM:
+     ALLOW_MESSAGES_FROM(CEPH_ENTITY_TYPE_CLIENT);
+     // fall-thru
+   case CEPH_MSG_CLIENT_REQUEST:
+     server->dispatch(m);
+     break;
+   case MSG_MDS_SLAVE_REQUEST:
+     ALLOW_MESSAGES_FROM(CEPH_ENTITY_TYPE_MDS);
+     server->dispatch(m);
+     break;
+
+   case MSG_MDS_HEARTBEAT:
+     ALLOW_MESSAGES_FROM(CEPH_ENTITY_TYPE_MDS);
+     balancer->proc_message(m);
+     break;
+
+   case MSG_MDS_TABLE_REQUEST:
+     ALLOW_MESSAGES_FROM(CEPH_ENTITY_TYPE_MDS);
+     {
+       const MMDSTableRequest::const_ref &req = MMDSTableRequest::msgref_cast(m);
+       // a negative op goes to the table client, anything else to the table server
+       if (req->op < 0) {
+         MDSTableClient *table_client = get_table_client(req->table);
+         table_client->handle_request(req);
+       } else {
+         MDSTableServer *table_server = get_table_server(req->table);
+         table_server->handle_request(req);
+       }
+     }
+     break;
+
+   case MSG_MDS_LOCK:
+   case MSG_MDS_INODEFILECAPS:
+     ALLOW_MESSAGES_FROM(CEPH_ENTITY_TYPE_MDS);
+     locker->dispatch(m);
+     break;
+
+   case CEPH_MSG_CLIENT_CAPS:
+   case CEPH_MSG_CLIENT_CAPRELEASE:
+   case CEPH_MSG_CLIENT_LEASE:
+     ALLOW_MESSAGES_FROM(CEPH_ENTITY_TYPE_CLIENT);
+     locker->dispatch(m);
+     break;
+
+   default:
+     derr << "unrecognized message " << *m << dendl;
+   }
+ }
+
+/**
+ * Advance finished_queue and waiting_for_nolaggy.
+ *
+ * Usually drain both queues, but may not drain waiting_for_nolaggy
+ * if beacon is currently laggy.
+ */
+void MDSRank::_advance_queues()
+{
+ ceph_assert(mds_lock.is_locked_by_me());
+
+ if (!finished_queue.empty()) {
+ dout(7) << "mds has " << finished_queue.size() << " queued contexts" << dendl;
+ while (!finished_queue.empty()) {
+ auto fin = finished_queue.front();
+ finished_queue.pop_front();
+
+ dout(10) << " finish " << fin << dendl;
+ fin->complete(0);
+
+ heartbeat_reset();
+ }
+ }
+
+ while (!waiting_for_nolaggy.empty()) {
+ // stop if we're laggy now!
+ if (beacon.is_laggy())
+ break;
+
+ Message::const_ref old = waiting_for_nolaggy.front();
+ waiting_for_nolaggy.pop_front();
+
+ if (!is_stale_message(old)) {
+ dout(7) << " processing laggy deferred " << *old << dendl;
+ ceph_assert(is_valid_message(old));
+ handle_message(old);
+ }
+
+ heartbeat_reset();
+ }
+}
+
+/**
+ * Call this when you take mds_lock, or periodically if you're going to
+ * hold the lock for a long time (e.g. iterating over clients/inodes)
+ */
+void MDSRank::heartbeat_reset()
+{
+ // Any thread might jump into mds_lock and call us immediately
+ // after a call to suicide() completes, in which case MDSRank::hb
+ // has been freed and we are a no-op.
+ if (!hb) {
+ ceph_assert(stopping);
+ return;
+ }
+
+ // NB not enabling suicide grace, because the mon takes care of killing us
+ // (by blacklisting us) when we fail to send beacons, and it's simpler to
+ // only have one way of dying.
+ auto grace = g_conf().get_val<double>("mds_heartbeat_grace");
+ g_ceph_context->get_heartbeat_map()->reset_timeout(hb, grace, 0);
+}
+
+ /**
+  * Decide whether a message must be dropped because its sender is an MDS
+  * that is down according to the MDSMap, or because it did not arrive on the
+  * connection we have to that rank. MDS maps are always let through, and so
+  * are cache expires whose source address matches the map. Messages from
+  * non-MDS sources are never stale.
+  */
+ bool MDSRank::is_stale_message(const Message::const_ref &m) const
+ {
+   if (!m->get_source().is_mds()) {
+     return false;
+   }
+
+   // from bad mds?
+   const mds_rank_t from = mds_rank_t(m->get_source().num());
+   bool bad = mdsmap->is_down(from);
+   if (!bad) {
+     // FIXME: this is a convoluted check. we should be maintaining a nice
+     // clean map of current ConnectionRefs for current mdses!!!
+     auto c = messenger->connect_to(CEPH_ENTITY_TYPE_MDS,
+                                    mdsmap->get_addrs(from));
+     if (c != m->get_connection()) {
+       bad = true;
+       dout(5) << " mds." << from << " should be " << c << " "
+               << c->get_peer_addrs() << " but this message is "
+               << m->get_connection() << " " << m->get_source_addrs()
+               << dendl;
+     }
+   }
+   if (!bad) {
+     return false;
+   }
+
+   // bogus mds?
+   if (m->get_type() == CEPH_MSG_MDS_MAP) {
+     dout(5) << "got " << *m << " from old/bad/imposter mds " << m->get_source()
+             << ", but it's an mdsmap, looking at it" << dendl;
+     return false;
+   }
+   if (m->get_type() == MSG_MDS_CACHEEXPIRE &&
+       mdsmap->get_addrs(from) == m->get_source_addrs()) {
+     dout(5) << "got " << *m << " from down mds " << m->get_source()
+             << ", but it's a cache_expire, looking at it" << dendl;
+     return false;
+   }
+
+   dout(5) << "got " << *m << " from down/old/bad/imposter mds " << m->get_source()
+           << ", dropping" << dendl;
+   return true;
+ }
+
+ /**
+  * Find the session attached to the message's connection (null if none).
+  *
+  * If that session is still closed but the session map holds a different
+  * session for the same entity (one imported in the meantime), the imported
+  * session takes over the connection: it inherits auth name, auth caps,
+  * last_seen and any messages queued before open, is installed as the
+  * connection's private data, and is returned instead.
+  */
+ Session *MDSRank::get_session(const Message::const_ref &m)
+ {
+   // do not carry ref
+   auto session = static_cast<Session *>(m->get_connection()->get_priv().get());
+   if (!session) {
+     dout(20) << "get_session dne for " << m->get_source_inst() << dendl;
+     return session;
+   }
+
+   dout(20) << "get_session have " << session << " " << session->info.inst
+            << " state " << session->get_state_name() << dendl;
+
+   // Check if we've imported an open session since (new sessions start closed)
+   if (!session->is_closed()) {
+     return session;
+   }
+   Session *imported_session = sessionmap.get_session(session->info.inst.name);
+   if (!imported_session || imported_session == session) {
+     return session;
+   }
+
+   dout(10) << __func__ << " replacing connection bootstrap session "
+            << session << " with imported session " << imported_session
+            << dendl;
+   imported_session->info.auth_name = session->info.auth_name;
+   //assert(session->info.auth_name == imported_session->info.auth_name);
+   ceph_assert(session->info.inst == imported_session->info.inst);
+   imported_session->set_connection(session->get_connection().get());
+   // send out any queued messages
+   while (!session->preopen_out_queue.empty()) {
+     imported_session->get_connection()->send_message2(std::move(session->preopen_out_queue.front()));
+     session->preopen_out_queue.pop_front();
+   }
+   imported_session->auth_caps = session->auth_caps;
+   imported_session->last_seen = session->last_seen;
+   ceph_assert(session->get_nref() == 1);
+   imported_session->get_connection()->set_priv(imported_session->get());
+   return imported_session;
+ }
+
+ // Send a message over the given connection, which must not be null.
+ void MDSRank::send_message(const Message::ref& m, const ConnectionRef& c)
+ {
+   ceph_assert(c);
+
+   c->send_message2(m);
+ }
+
+
+ /**
+  * Send a message to another rank. The message is dropped (with a log line)
+  * if the rank is not up. A peer that has not been sent our current MDSMap
+  * epoch yet gets the map first.
+  */
+ void MDSRank::send_message_mds(const Message::ref& m, mds_rank_t mds)
+ {
+   if (!mdsmap->is_up(mds)) {
+     dout(10) << "send_message_mds mds." << mds << " not up, dropping " << *m << dendl;
+     return;
+   }
+
+   // send mdsmap first?
+   if (mds != whoami && peer_mdsmap_epoch[mds] < mdsmap->get_epoch()) {
+     auto map_msg = MMDSMap::create(monc->get_fsid(), *mdsmap);
+     messenger->send_to_mds(map_msg.detach(), mdsmap->get_addrs(mds));
+     // remember which epoch this peer has been sent
+     peer_mdsmap_epoch[mds] = mdsmap->get_epoch();
+   }
+
+   // send message
+   messenger->send_to_mds(Message::ref(m).detach(), mdsmap->get_addrs(mds));
+ }
+
+ /**
+  * Redirect a client request to another rank. The request itself is not
+  * relayed: the client is sent an MClientRequestForward naming the rank and
+  * telling it to resend there.
+  */
+ void MDSRank::forward_message_mds(const MClientRequest::const_ref& m, mds_rank_t mds)
+ {
+   ceph_assert(mds != whoami);
+
+   /*
+    * don't actually forward if non-idempotent!
+    * client has to do it. although the MDS will ignore duplicate requests,
+    * the affected metadata may migrate, in which case the new authority
+    * won't have the metareq_id in the completed request map.
+    */
+   // NEW: always make the client resend!
+   const bool client_must_resend = true; //!creq->can_forward();
+
+   // tell the client where it should go
+   auto session = get_session(m);
+   auto fwd = MClientRequestForward::create(m->get_tid(), mds, m->get_num_fwd()+1, client_must_resend);
+   send_message_client(fwd, session);
+ }
+
+ // Counted send addressed by client id; the message is dropped (with a log
+ // line) when the session map has no session for that client.
+ void MDSRank::send_message_client_counted(const Message::ref& m, client_t client)
+ {
+   Session *session = sessionmap.get_session(entity_name_t::CLIENT(client.v));
+   if (!session) {
+     dout(10) << "send_message_client_counted no session for client." << client << " " << *m << dendl;
+     return;
+   }
+   send_message_client_counted(m, session);
+ }
+
+ // Counted send addressed by connection; the message is dropped (with a log
+ // line) when the connection carries no session.
+ void MDSRank::send_message_client_counted(const Message::ref& m, const ConnectionRef& connection)
+ {
+   // do not carry ref
+   auto session = static_cast<Session *>(connection->get_priv().get());
+   if (!session) {
+     dout(10) << "send_message_client_counted has no session for " << m->get_source_inst() << dendl;
+     // another Connection took over the Session
+     return;
+   }
+   send_message_client_counted(m, session);
+ }
+
+ // Counted send to a session: bumps the session's push sequence, then sends
+ // over the session's connection, or queues the message on the session if it
+ // has no connection yet.
+ void MDSRank::send_message_client_counted(const Message::ref& m, Session* session)
+ {
+   const version_t seq = session->inc_push_seq();
+   dout(10) << "send_message_client_counted " << session->info.inst.name << " seq "
+            << seq << " " << *m << dendl;
+
+   if (!session->get_connection()) {
+     session->preopen_out_queue.push_back(m);
+     return;
+   }
+   session->get_connection()->send_message2(m);
+ }
+
+ // Send to a session without touching its push sequence: over the session's
+ // connection if it has one, otherwise queued on the session.
+ void MDSRank::send_message_client(const Message::ref& m, Session* session)
+ {
+   dout(10) << "send_message_client " << session->info.inst << " " << *m << dendl;
+
+   if (!session->get_connection()) {
+     session->preopen_out_queue.push_back(m);
+     return;
+   }
+   session->get_connection()->send_message2(m);
+ }
+
+/**
+ * This is used whenever a RADOS operation has been cancelled
+ * or a RADOS client has been blacklisted, to cause the MDS and
+ * any clients to wait for this OSD epoch before using any new caps.
+ *
+ * See doc/cephfs/eviction
+ */
+void MDSRank::set_osd_epoch_barrier(epoch_t e)
+{
+ dout(4) << __func__ << ": epoch=" << e << dendl;
+ osd_epoch_barrier = e;
+}
+
+ // Dispatch a message again. It is not treated as new, so it is not queued
+ // behind messages that are already deferred.
+ void MDSRank::retry_dispatch(const Message::const_ref &m)
+ {
+   inc_dispatch_depth();
+   {
+     _dispatch(m, false);
+   }
+   dec_dispatch_depth();
+ }
+
+/**
+ * Return the messenger's dispatch queue max age relative to `now`.
+ */
+double MDSRank::get_dispatch_queue_max_age(utime_t now) const
+{
+  const double max_age = messenger->get_dispatch_queue_max_age(now);
+  return max_age;
+}
+
+/**
+ * Report the value of the `stopping` flag.
+ */
+bool MDSRank::is_daemon_stopping() const
+{
+  return stopping;
+}
+
+/**
+ * Ask to move to daemon state `s`.
+ *
+ * The wanted state is stored in the beacon and a beacon is sent
+ * immediately.
+ */
+void MDSRank::request_state(MDSMap::DaemonState s)
+{
+  dout(3) << "request_state " << ceph_mds_state_name(s) << dendl;
+  beacon.set_want_state(*mdsmap, s);
+  beacon.send();
+}
+
+
+/**
+ * Completion context that re-enters MDSRank::boot_start() with a fixed
+ * next step, forwarding the completion code it was finished with.
+ */
+class C_MDS_BootStart : public MDSInternalContext {
+  MDSRank::BootStep next_step;
+public:
+  C_MDS_BootStart(MDSRank *rank, MDSRank::BootStep step)
+    : MDSInternalContext(rank), next_step(step) {}
+
+  void finish(int r) override {
+    mds->boot_start(next_step, r);
+  }
+};
+
+
+/**
+ * Run one step of the boot / journal-replay sequence.
+ *
+ * Each asynchronous step hands a C_MDS_BootStart context (directly or via
+ * an MDSGatherBuilder) to the work it starts; when that work completes,
+ * this function is re-entered with the following step and the result code.
+ *
+ * @param step which phase to run now
+ * @param r    result of the previous phase; negative values are handled
+ *             as errors before any work for `step` is started
+ */
+void MDSRank::boot_start(BootStep step, int r)
+{
+ // Handle errors from previous step
+ if (r < 0) {
+ if (is_standby_replay() && (r == -EAGAIN)) {
+ dout(0) << "boot_start encountered an error EAGAIN"
+ << ", respawning since we fell behind journal" << dendl;
+ respawn();
+ } else if (r == -EINVAL || r == -ENOENT) {
+ // Invalid or absent data, indicates damaged on-disk structures
+ clog->error() << "Error loading MDS rank " << whoami << ": "
+ << cpp_strerror(r);
+ damaged();
+ ceph_assert(r == 0); // Unreachable, damaged() calls respawn()
+ } else if (r == -EROFS) {
+ // Only logged here; execution continues with the requested step below.
+ dout(0) << "boot error forcing transition to read-only; MDS will try to continue" << dendl;
+ } else {
+ // Completely unexpected error, give up and die
+ dout(0) << "boot_start encountered an error, failing" << dendl;
+ suicide();
+ return;
+ }
+ }
+
+ ceph_assert(is_starting() || is_any_replay());
+
+ switch(step) {
+ case MDS_BOOT_INITIAL:
+ {
+ mdcache->init_layouts();
+
+ // Everything loaded through gather.new_sub() must finish before the
+ // MDS_BOOT_OPEN_ROOT step runs.
+ MDSGatherBuilder gather(g_ceph_context,
+ new C_MDS_BootStart(this, MDS_BOOT_OPEN_ROOT));
+ dout(2) << "Booting: " << step << ": opening inotable" << dendl;
+ inotable->set_rank(whoami);
+ inotable->load(gather.new_sub());
+
+ dout(2) << "Booting: " << step << ": opening sessionmap" << dendl;
+ sessionmap.set_rank(whoami);
+ sessionmap.load(gather.new_sub());
+
+ dout(2) << "Booting: " << step << ": opening mds log" << dendl;
+ mdlog->open(gather.new_sub());
+
+ if (is_starting()) {
+ // When starting, the purge queue open is part of the gather.
+ dout(2) << "Booting: " << step << ": opening purge queue" << dendl;
+ purge_queue.open(new C_IO_Wrapper(this, gather.new_sub()));
+ } else if (!standby_replaying) {
+ // Replay (not standby-replay): open without a completion; the
+ // MDS_BOOT_PREPARE_LOG step waits for purge queue recovery instead.
+ dout(2) << "Booting: " << step << ": opening purge queue (async)" << dendl;
+ purge_queue.open(NULL);
+ dout(2) << "Booting: " << step << ": loading open file table (async)" << dendl;
+ mdcache->open_file_table.load(nullptr);
+ }
+
+ // The snap table is only loaded on the rank that is the table server.
+ if (mdsmap->get_tableserver() == whoami) {
+ dout(2) << "Booting: " << step << ": opening snap table" << dendl;
+ snapserver->set_rank(whoami);
+ snapserver->load(gather.new_sub());
+ }
+
+ gather.activate();
+ }
+ break;
+ case MDS_BOOT_OPEN_ROOT:
+ {
+ dout(2) << "Booting: " << step << ": loading/discovering base inodes" << dendl;
+
+ MDSGatherBuilder gather(g_ceph_context,
+ new C_MDS_BootStart(this, MDS_BOOT_PREPARE_LOG));
+
+ if (is_starting()) {
+ // load mydir frag for the first log segment (creating subtree map)
+ mdcache->open_mydir_frag(gather.new_sub());
+ } else {
+ mdcache->open_mydir_inode(gather.new_sub());
+ }
+
+ mdcache->create_global_snaprealm();
+
+ if (whoami == mdsmap->get_root()) { // load root inode off disk if we are auth
+ mdcache->open_root_inode(gather.new_sub());
+ } else if (is_any_replay()) {
+ // replay. make up fake root inode to start with
+ mdcache->create_root_inode();
+ }
+ gather.activate();
+ }
+ break;
+ case MDS_BOOT_PREPARE_LOG:
+ if (is_any_replay()) {
+ dout(2) << "Booting: " << step << ": replaying mds log" << dendl;
+ MDSGatherBuilder gather(g_ceph_context,
+ new C_MDS_BootStart(this, MDS_BOOT_REPLAY_DONE));
+
+ if (!standby_replaying) {
+ dout(2) << "Booting: " << step << ": waiting for purge queue recovered" << dendl;
+ purge_queue.wait_for_recovery(new C_IO_Wrapper(this, gather.new_sub()));
+ }
+
+ mdlog->replay(gather.new_sub());
+ gather.activate();
+ } else {
+ // Starting (no replay): nothing asynchronous left, finish right here.
+ dout(2) << "Booting: " << step << ": positioning at end of old mds log" << dendl;
+ mdlog->append();
+ starting_done();
+ }
+ break;
+ case MDS_BOOT_REPLAY_DONE:
+ ceph_assert(is_any_replay());
+
+ // Sessiontable and inotable should be in sync after replay, validate
+ // that they are consistent.
+ validate_sessions();
+
+ replay_done();
+ break;
+ }
+}
+
+/**
+ * Check every loaded session against the inotable.
+ *
+ * A session whose preallocated inodes intersect the inotable's free set is
+ * reported to the cluster log. If any such session is found the rank is
+ * marked damaged. Must be called with mds_lock held.
+ */
+void MDSRank::validate_sessions()
+{
+  ceph_assert(mds_lock.is_locked_by_me());
+  bool valid = true;
+
+  // Identify any sessions which have state inconsistent with other,
+  // after they have been loaded from rados during startup.
+  // Mitigate bugs like: http://tracker.ceph.com/issues/16842
+  for (const auto &i : sessionmap.get_sessions()) {
+    Session *session = i.second;
+    interval_set<inodeno_t> badones;
+    if (inotable->intersects_free(session->info.prealloc_inos, &badones)) {
+      // Leading space keeps the session description and the explanation
+      // from running together in the cluster log.
+      clog->error() << "client " << *session
+                    << " loaded with preallocated inodes that are inconsistent with inotable";
+      valid = false;
+    }
+  }
+
+  if (!valid) {
+    damaged();
+    ceph_assert(valid);
+  }
+}
+
+/**
+ * Finish the "starting" boot path: request the active state, start a new
+ * log segment and sync the snap client.
+ */
+void MDSRank::starting_done()
+{
+  dout(3) << "starting_done" << dendl;
+  ceph_assert(is_starting());
+  request_state(MDSMap::STATE_ACTIVE);
+
+  mdlog->start_new_segment();
+
+  // sync snaptable cache
+  snapclient->sync(new C_MDSInternalNoop);
+}
+
+
+/**
+ * Compute the recovery set from the MDSMap, excluding this rank, and
+ * hand it to the cache.
+ */
+void MDSRank::calc_recovery_set()
+{
+  // initialize gather sets
+  set<mds_rank_t> recovering;
+  mdsmap->get_recovery_mds_set(recovering);
+  recovering.erase(whoami);  // never wait on ourselves
+  mdcache->set_recovery_set(recovering);
+
+  dout(1) << " recovery set is " << recovering << dendl;
+}
+
+
+/**
+ * Begin journal replay.
+ *
+ * Boot starts immediately if the objecter already has the OSD map epoch
+ * of the last failure; otherwise it is deferred to the completion passed
+ * to wait_for_map().
+ */
+void MDSRank::replay_start()
+{
+  dout(1) << "replay_start" << dendl;
+
+  if (is_standby_replay())
+    standby_replaying = true;
+
+  // Check if we need to wait for a newer OSD map before starting
+  Context *on_map = new C_IO_Wrapper(this, new C_MDS_BootStart(this, MDS_BOOT_INITIAL));
+  bool const have_map = objecter->wait_for_map(
+      mdsmap->get_last_failure_osd_epoch(),
+      on_map);
+
+  if (!have_map) {
+    dout(1) << " waiting for osdmap " << mdsmap->get_last_failure_osd_epoch()
+            << " (which blacklists prior instance)" << dendl;
+    return;
+  }
+
+  // Map already present: the completion is not needed, boot right away.
+  delete on_map;
+  boot_start();
+}
+
+
+/**
+ * IO completion for a standby-replay restart: remembers a journal read
+ * position and passes it, with the completion code, to
+ * MDSRank::_standby_replay_restart_finish().
+ */
+class MDSRank::C_MDS_StandbyReplayRestartFinish : public MDSIOContext {
+  uint64_t saved_read_pos;
+public:
+  C_MDS_StandbyReplayRestartFinish(MDSRank *rank, uint64_t read_pos)
+    : MDSIOContext(rank), saved_read_pos(read_pos) {}
+
+  void finish(int r) override {
+    mds->_standby_replay_restart_finish(r, saved_read_pos);
+  }
+
+  void print(ostream& out) const override {
+    out << "standby_replay_restart";
+  }
+};
+
+/**
+ * Continue a standby-replay pass.
+ *
+ * If the remembered read position is behind the journaler's trimmed
+ * position the daemon respawns; otherwise segments are trimmed and boot
+ * resumes at MDS_BOOT_PREPARE_LOG.
+ *
+ * @param r            result code, forwarded to boot_start()
+ * @param old_read_pos journal read position remembered by the caller
+ */
+void MDSRank::_standby_replay_restart_finish(int r, uint64_t old_read_pos)
+{
+  const bool fell_behind =
+    old_read_pos < mdlog->get_journaler()->get_trimmed_pos();
+
+  if (!fell_behind) {
+    mdlog->standby_trim_segments();
+    boot_start(MDS_BOOT_PREPARE_LOG, r);
+    return;
+  }
+
+  dout(0) << "standby MDS fell behind active MDS journal's expire_pos, restarting" << dendl;
+  /* we're too far back, and this is easier than
+     trying to reset everything in the cache, etc */
+  respawn();
+}
+
+/**
+ * Internal completion that starts another standby-replay pass. It must be
+ * finished with a zero result.
+ */
+class MDSRank::C_MDS_StandbyReplayRestart : public MDSInternalContext {
+public:
+  explicit C_MDS_StandbyReplayRestart(MDSRank *rank) : MDSInternalContext(rank) {}
+
+  void finish(int r) override {
+    ceph_assert(r == 0);
+    mds->standby_replay_restart();
+  }
+};
+
+/**
+ * Start another replay pass over the journal.
+ *
+ * While still standby-replaying, the journal head is simply re-read. When
+ * leaving standby, the final pass first needs the OSD map epoch of the
+ * last failure; once available the head is re-read and the purge queue
+ * and open file table are opened without completions.
+ */
+void MDSRank::standby_replay_restart()
+{
+  if (standby_replaying) {
+    /* Go around for another pass of replaying in standby */
+    dout(5) << "Restarting replay as standby-replay" << dendl;
+    mdlog->get_journaler()->reread_head_and_probe(
+      new C_MDS_StandbyReplayRestartFinish(
+        this,
+        mdlog->get_journaler()->get_read_pos()));
+    return;
+  }
+
+  /* We are transitioning out of standby: wait for OSD map update
+     before making final pass */
+  dout(1) << "standby_replay_restart (final takeover pass)" << dendl;
+  Context *on_map = new C_IO_Wrapper(this, new C_MDS_StandbyReplayRestart(this));
+  bool have_map = objecter->wait_for_map(mdsmap->get_last_failure_osd_epoch(), on_map);
+  if (!have_map) {
+    dout(1) << " waiting for osdmap " << mdsmap->get_last_failure_osd_epoch()
+            << " (which blacklists prior instance)" << dendl;
+    return;
+  }
+
+  // Map already present: drop the unused completion and proceed inline.
+  delete on_map;
+  mdlog->get_journaler()->reread_head_and_probe(
+    new C_MDS_StandbyReplayRestartFinish(
+      this,
+      mdlog->get_journaler()->get_read_pos()));
+
+  dout(1) << " opening purge_queue (async)" << dendl;
+  purge_queue.open(NULL);
+  dout(1) << " opening open_file_table (async)" << dendl;
+  mdcache->open_file_table.load(nullptr);
+}
+
+/**
+ * Called when a journal replay pass has finished.
+ *
+ * Three cases are distinguished:
+ *  - still in standby-replay: schedule the next pass on the timer;
+ *  - just left standby-replay: clear the flag and make one final pass;
+ *  - normal replay: optionally reformat the journal, then make it
+ *    writeable, apply the configured wipe/skip options and request the
+ *    next state (reconnect when alone, resolve otherwise).
+ */
+void MDSRank::replay_done()
+{
+ if (!standby_replaying) {
+ dout(1) << "Finished replaying journal" << dendl;
+ } else {
+ dout(5) << "Finished replaying journal as standby-replay" << dendl;
+ }
+
+ if (is_standby_replay()) {
+ // The replay was done in standby state, and we are still in that state
+ ceph_assert(standby_replaying);
+ dout(10) << "setting replay timer" << dendl;
+ timer.add_event_after(g_conf()->mds_replay_interval,
+ new C_MDS_StandbyReplayRestart(this));
+ return;
+ } else if (standby_replaying) {
+ // The replay was done in standby state, we have now _left_ that state
+ dout(10) << " last replay pass was as a standby; making final pass" << dendl;
+ standby_replaying = false;
+ standby_replay_restart();
+ return;
+ } else {
+ // Replay is complete, journal read should be up to date
+ ceph_assert(mdlog->get_journaler()->get_read_pos() == mdlog->get_journaler()->get_write_pos());
+ ceph_assert(!is_standby_replay());
+
+ // Reformat and come back here
+ // (the reopen completion re-enters boot_start at MDS_BOOT_REPLAY_DONE)
+ if (mdlog->get_journaler()->get_stream_format() < g_conf()->mds_journal_format) {
+ dout(4) << "reformatting journal on standby-replay->replay transition" << dendl;
+ mdlog->reopen(new C_MDS_BootStart(this, MDS_BOOT_REPLAY_DONE));
+ return;
+ }
+ }
+
+ dout(1) << "making mds journal writeable" << dendl;
+ mdlog->get_journaler()->set_writeable();
+ mdlog->get_journaler()->trim_tail();
+
+ // Only the table server rank upgrades and saves the snap table.
+ if (mdsmap->get_tableserver() == whoami &&
+ snapserver->upgrade_format()) {
+ dout(1) << "upgrading snaptable format" << dendl;
+ snapserver->save(new C_MDSInternalNoop);
+ }
+
+ // The three blocks below are driven purely by configuration options.
+ if (g_conf()->mds_wipe_sessions) {
+ dout(1) << "wiping out client sessions" << dendl;
+ sessionmap.wipe();
+ sessionmap.save(new C_MDSInternalNoop);
+ }
+ if (g_conf()->mds_wipe_ino_prealloc) {
+ dout(1) << "wiping out ino prealloc from sessions" << dendl;
+ sessionmap.wipe_ino_prealloc();
+ sessionmap.save(new C_MDSInternalNoop);
+ }
+ if (g_conf()->mds_skip_ino) {
+ inodeno_t i = g_conf()->mds_skip_ino;
+ dout(1) << "skipping " << i << " inodes" << dendl;
+ inotable->skip_inos(i);
+ inotable->save(new C_MDSInternalNoop);
+ }
+
+ // With a single "in" rank and no failed ranks, resolve is skipped.
+ if (mdsmap->get_num_in_mds() == 1 &&
+ mdsmap->get_num_failed_mds() == 0) { // just me!
+ dout(2) << "i am alone, moving to state reconnect" << dendl;
+ request_state(MDSMap::STATE_RECONNECT);
+ // sync snaptable cache
+ snapclient->sync(new C_MDSInternalNoop);
+ } else {
+ dout(2) << "i am not alone, moving to state resolve" << dendl;
+ request_state(MDSMap::STATE_RESOLVE);
+ }
+}
+
+/**
+ * Log the call and roll back uncommitted fragments in the cache.
+ */
+void MDSRank::reopen_log()
+{
+  dout(1) << "reopen_log" << dendl;
+  mdcache->rollback_uncommitted_fragments();
+}
+
+/**
+ * Enter the resolve state: reopen the log, compute the recovery set,
+ * start cache resolution (completing in resolve_done()) and release
+ * everything waiting for resolve.
+ */
+void MDSRank::resolve_start()
+{
+  dout(1) << "resolve_start" << dendl;
+
+  reopen_log();
+  calc_recovery_set();
+
+  mdcache->resolve_start(new C_MDS_VoidFn(this, &MDSRank::resolve_done));
+  finish_contexts(g_ceph_context, waiting_for_resolve);
+}
+
+/**
+ * Resolve has finished: request the reconnect state and sync the snap
+ * client.
+ */
+void MDSRank::resolve_done()
+{
+  dout(1) << "resolve_done" << dendl;
+  request_state(MDSMap::STATE_RECONNECT);
+
+  // sync snaptable cache
+  snapclient->sync(new C_MDSInternalNoop);
+}
+
+/**
+ * Enter the reconnect state.
+ *
+ * Sessions of blacklisted clients are killed first; if any were killed
+ * the OSD epoch barrier is raised to the current OSD map epoch. Client
+ * reconnect then starts and completes in reconnect_done().
+ */
+void MDSRank::reconnect_start()
+{
+  dout(1) << "reconnect_start" << dendl;
+
+  if (last_state == MDSMap::STATE_REPLAY) {
+    reopen_log();
+  }
+
+  // Drop any blacklisted clients from the SessionMap before going
+  // into reconnect, so that we don't wait for them.
+  objecter->enable_blacklist_events();
+  std::set<entity_addr_t> blacklist;
+  epoch_t epoch = 0;
+  // Snapshot blacklist and epoch from the same OSD map.
+  objecter->with_osdmap([&blacklist, &epoch](const OSDMap& osdmap) {
+      osdmap.get_blacklist(&blacklist);
+      epoch = osdmap.get_epoch();
+  });
+  auto killed = server->apply_blacklist(blacklist);
+  dout(4) << "reconnect_start: killed " << killed << " blacklisted sessions ("
+          << blacklist.size() << " blacklist entries, "
+          << sessionmap.get_sessions().size() << ")" << dendl;
+  if (killed) {
+    set_osd_epoch_barrier(epoch);
+  }
+
+  server->reconnect_clients(new C_MDS_VoidFn(this, &MDSRank::reconnect_done));
+  finish_contexts(g_ceph_context, waiting_for_reconnect);
+}
+/**
+ * Reconnect has finished: request the rejoin state.
+ */
+void MDSRank::reconnect_done()
+{
+  dout(1) << "reconnect_done" << dendl;
+  // move to rejoin state
+  request_state(MDSMap::STATE_REJOIN);
+}
+
+/**
+ * Have the cache send its rejoin messages.
+ */
+void MDSRank::rejoin_joint_start()
+{
+  dout(1) << "rejoin_joint_start" << dendl;
+  mdcache->rejoin_send_rejoins();
+}
+/**
+ * Enter the rejoin state: start cache rejoin (completing in
+ * rejoin_done()) and release everything waiting for rejoin.
+ */
+void MDSRank::rejoin_start()
+{
+  dout(1) << "rejoin_start" << dendl;
+  mdcache->rejoin_start(new C_MDS_VoidFn(this, &MDSRank::rejoin_done));
+  finish_contexts(g_ceph_context, waiting_for_rejoin);
+}
+/**
+ * Rejoin has finished; decide which state to request next.
+ *
+ * Re-arms itself while uncommitted fragments remain. With no subtrees,
+ * rank 0 is marked damaged and any other rank requests STOPPED.
+ * Otherwise ACTIVE is requested when there is nothing to replay or
+ * reclaim, and CLIENTREPLAY if there is.
+ */
+void MDSRank::rejoin_done()
+{
+  dout(1) << "rejoin_done" << dendl;
+  mdcache->show_subtrees();
+  mdcache->show_cache();
+
+  if (mdcache->is_any_uncommitted_fragment()) {
+    dout(1) << " waiting for uncommitted fragments" << dendl;
+    mdcache->wait_for_uncommitted_fragments(new C_MDS_VoidFn(this, &MDSRank::rejoin_done));
+    return;
+  }
+
+  // funny case: is our cache empty? no subtrees?
+  if (!mdcache->is_subtrees()) {
+    if (whoami != 0) {
+      dout(1) << " empty cache, no subtrees, leaving cluster" << dendl;
+      request_state(MDSMap::STATE_STOPPED);
+    } else {
+      // The root should always have a subtree!
+      clog->error() << "No subtrees found for root MDS rank!";
+      damaged();
+      ceph_assert(mdcache->is_subtrees());
+    }
+    return;
+  }
+
+  if (!replay_queue.empty() || server->get_num_pending_reclaim()) {
+    // requests to replay and/or sessions to reclaim remain
+    replaying_requests_done = replay_queue.empty();
+    request_state(MDSMap::STATE_CLIENTREPLAY);
+  } else {
+    request_state(MDSMap::STATE_ACTIVE);
+  }
+}
+
+/**
+ * Enter the clientreplay state: release everything waiting for replay
+ * and queue the first replayed request.
+ */
+void MDSRank::clientreplay_start()
+{
+  dout(1) << "clientreplay_start" << dendl;
+  // kick waiters
+  finish_contexts(g_ceph_context, waiting_for_replay);
+  queue_one_replay();
+}
+
+/**
+ * Queue the next pending replay request, if any.
+ *
+ * When the queue is empty, the log is flushed the first time this is
+ * noticed and maybe_clientreplay_done() is consulted.
+ *
+ * @return true if a request was queued, false if the queue was empty
+ */
+bool MDSRank::queue_one_replay()
+{
+  if (replay_queue.empty()) {
+    if (!replaying_requests_done) {
+      replaying_requests_done = true;
+      mdlog->flush();
+    }
+    maybe_clientreplay_done();
+    return false;
+  }
+
+  queue_waiter(replay_queue.front());
+  replay_queue.pop_front();
+  return true;
+}
+
+/**
+ * While in (and still wanting) clientreplay, check whether it can end.
+ *
+ * Once all requests are replayed and no session awaits reclaim,
+ * clientreplay_done() is scheduled for when the log is safe; otherwise
+ * the outstanding work is logged.
+ */
+void MDSRank::maybe_clientreplay_done()
+{
+  if (!is_clientreplay() || get_want_state() != MDSMap::STATE_CLIENTREPLAY) {
+    return;
+  }
+
+  // don't go to active if there are session waiting for being reclaimed
+  if (replaying_requests_done && !server->get_num_pending_reclaim()) {
+    mdlog->wait_for_safe(new C_MDS_VoidFn(this, &MDSRank::clientreplay_done));
+    return;
+  }
+
+  dout(1) << " still have " << replay_queue.size() + (int)!replaying_requests_done
+          << " requests need to be replayed, " << server->get_num_pending_reclaim()
+          << " sessions need to be reclaimed" << dendl;
+}
+
+/**
+ * Client replay has finished: request the active state.
+ */
+void MDSRank::clientreplay_done()
+{
+  dout(1) << "clientreplay_done" << dendl;
+  request_state(MDSMap::STATE_ACTIVE);
+}
+
+/**
+ * Enter the active state.
+ *
+ * The root is opened when arriving from creating or starting. The cache
+ * then tidies its open file lists and remaining imported caps, reissues
+ * all caps, and waiters for replay and for active are released.
+ */
+void MDSRank::active_start()
+{
+  dout(1) << "active_start" << dendl;
+
+  const bool fresh_boot = last_state == MDSMap::STATE_CREATING ||
+                          last_state == MDSMap::STATE_STARTING;
+  if (fresh_boot) {
+    mdcache->open_root();
+  }
+
+  mdcache->clean_open_file_lists();
+  mdcache->export_remaining_imported_caps();
+  // kick waiters
+  finish_contexts(g_ceph_context, waiting_for_replay);
+
+  mdcache->reissue_all_caps();
+
+  // kick waiters
+  finish_contexts(g_ceph_context, waiting_for_active);
+}
+
+/**
+ * Post-recovery work, run once the rank is clientreplay or active.
+ *
+ * Nothing further is done when coming from the creating state; otherwise
+ * recovered truncates and file recovery are started and mydir is
+ * populated.
+ *
+ * @param oldstate the state the rank was in before this transition
+ */
+void MDSRank::recovery_done(int oldstate)
+{
+  dout(1) << "recovery_done -- successful recovery!" << dendl;
+  ceph_assert(is_clientreplay() || is_active());
+
+  if (oldstate == MDSMap::STATE_CREATING) {
+    return;
+  }
+
+  mdcache->start_recovered_truncates();
+  mdcache->start_files_to_recover();
+
+  // tell connected clients
+  //bcast_mds_map(); // not anymore, they get this from the monitor
+
+  mdcache->populate_mydir();
+}
+
+/**
+ * Creation has finished: request the active state and sync the snap
+ * client.
+ */
+void MDSRank::creating_done()
+{
+  dout(1)<< "creating_done" << dendl;
+  request_state(MDSMap::STATE_ACTIVE);
+
+  // sync snaptable cache
+  snapclient->sync(new C_MDSInternalNoop);
+}
+
+/**
+ * Create the on-disk state for a brand-new rank.
+ *
+ * A fresh journal, hierarchy, inotable, sessionmap, purge queue and (on
+ * the table server rank) snaptable are created. All of the writes are
+ * gathered in `fin`, whose completion calls creating_done().
+ */
+void MDSRank::boot_create()
+{
+ dout(3) << "boot_create" << dendl;
+
+ // Gather for every sub-operation started below; nothing completes it
+ // until fin.activate() at the end of this function.
+ MDSGatherBuilder fin(g_ceph_context, new C_MDS_VoidFn(this, &MDSRank::creating_done));
+
+ mdcache->init_layouts();
+
+ inotable->set_rank(whoami);
+ sessionmap.set_rank(whoami);
+
+ // start with a fresh journal
+ dout(10) << "boot_create creating fresh journal" << dendl;
+ mdlog->create(fin.new_sub());
+
+ // open new journal segment, but do not journal subtree map (yet)
+ mdlog->prepare_new_segment();
+
+ // Only the rank that holds the root creates the top-level hierarchy.
+ if (whoami == mdsmap->get_root()) {
+ dout(3) << "boot_create creating fresh hierarchy" << dendl;
+ mdcache->create_empty_hierarchy(fin.get());
+ }
+
+ dout(3) << "boot_create creating mydir hierarchy" << dendl;
+ mdcache->create_mydir_hierarchy(fin.get());
+
+ dout(3) << "boot_create creating global snaprealm" << dendl;
+ mdcache->create_global_snaprealm();
+
+ // fixme: fake out inotable (reset, pretend loaded)
+ dout(10) << "boot_create creating fresh inotable table" << dendl;
+ inotable->reset();
+ inotable->save(fin.new_sub());
+
+ // write empty sessionmap
+ sessionmap.save(fin.new_sub());
+
+ // Create empty purge queue
+ purge_queue.create(new C_IO_Wrapper(this, fin.new_sub()));
+
+ // initialize tables
+ if (mdsmap->get_tableserver() == whoami) {
+ dout(10) << "boot_create creating fresh snaptable" << dendl;
+ snapserver->set_rank(whoami);
+ snapserver->reset();
+ snapserver->save(fin.new_sub());
+ }
+
+ // Aborts here when the mds_kill_create_at option is set to 1.
+ ceph_assert(g_conf()->mds_kill_create_at != 1);
+
+ // ok now journal it
+ mdlog->journal_segment_subtree_map(fin.new_sub());
+ mdlog->flush();
+
+ // Usually we do this during reconnect, but creation skips that.
+ objecter->enable_blacklist_events();
+
+ fin.activate();
+}
+
+/**
+ * Enter the stopping state.
+ *
+ * When this is the only "in" rank and sessions exist, every client
+ * session is evicted first. Cache shutdown is then started.
+ */
+void MDSRank::stopping_start()
+{
+  dout(2) << "Stopping..." << dendl;
+
+  if (mdsmap->get_num_in_mds() == 1 && !sessionmap.empty()) {
+    // Collect the client sessions up front, then evict them.
+    std::vector<Session*> victims;
+    for (const auto& entry : sessionmap.get_sessions()) {
+      if (entry.first.is_client()) {
+        victims.push_back(entry.second);
+      }
+    }
+
+    dout(20) << __func__ << " matched " << victims.size() << " sessions" << dendl;
+    ceph_assert(!victims.empty());
+
+    C_GatherBuilder gather(g_ceph_context, new C_MDSInternalNoop);
+    for (const auto &victim : victims) {
+      std::stringstream ss;
+      evict_client(victim->get_client().v, false,
+                   g_conf()->mds_session_blacklist_on_evict, ss, gather.new_sub());
+    }
+    gather.activate();
+  }
+
+  mdcache->shutdown_start();
+}
+
+/**
+ * Stopping has finished: request the stopped state.
+ */
+void MDSRank::stopping_done()
+{
+  dout(2) << "Finished stopping..." << dendl;
+
+  // tell monitor we shut down cleanly.
+  request_state(MDSMap::STATE_STOPPED);
+}
+
+/**
+ * React to a new MDSMap while this daemon holds a rank.
+ *
+ * The daemon's own state is refreshed from the map and, if it changed,
+ * the matching *_start() handler is run. The new map is then compared
+ * against `oldmap` to find peers that failed, restarted, recovered or
+ * stopped, and the cache, migrator, snap server/client and waiter lists
+ * are notified accordingly.
+ *
+ * @param m      the message that carried the map (used for its epoch and
+ *               source)
+ * @param oldmap the map that was in effect before this one
+ */
+void MDSRankDispatcher::handle_mds_map(
+    const MMDSMap::const_ref &m,
+    const MDSMap &oldmap)
+{
+  // I am only to be passed MDSMaps in which I hold a rank
+  ceph_assert(whoami != MDS_RANK_NONE);
+
+  MDSMap::DaemonState oldstate = state;
+  mds_gid_t mds_gid = mds_gid_t(monc->get_global_id());
+  state = mdsmap->get_state_gid(mds_gid);
+  if (state != oldstate) {
+    last_state = oldstate;
+    incarnation = mdsmap->get_inc_gid(mds_gid);
+  }
+
+  version_t epoch = m->get_epoch();
+
+  // note source's map version
+  if (m->get_source().is_mds() &&
+      peer_mdsmap_epoch[mds_rank_t(m->get_source().num())] < epoch) {
+    dout(15) << " peer " << m->get_source()
+             << " has mdsmap epoch >= " << epoch
+             << dendl;
+    peer_mdsmap_epoch[mds_rank_t(m->get_source().num())] = epoch;
+  }
+
+  // Validate state transitions while I hold a rank
+  if (!MDSMap::state_transition_valid(oldstate, state)) {
+    derr << "Invalid state transition " << ceph_mds_state_name(oldstate)
+         << "->" << ceph_mds_state_name(state) << dendl;
+    respawn();
+  }
+
+  if (oldstate != state) {
+    // update messenger.
+    if (state == MDSMap::STATE_STANDBY_REPLAY) {
+      dout(1) << "handle_mds_map i am now mds." << mds_gid << "." << incarnation
+              << " replaying mds." << whoami << "." << incarnation << dendl;
+      messenger->set_myname(entity_name_t::MDS(mds_gid));
+    } else {
+      dout(1) << "handle_mds_map i am now mds." << whoami << "." << incarnation << dendl;
+      messenger->set_myname(entity_name_t::MDS(whoami));
+    }
+  }
+
+  // tell objecter my incarnation
+  if (objecter->get_client_incarnation() != incarnation)
+    objecter->set_client_incarnation(incarnation);
+
+  if (oldmap.get_min_compat_client() != mdsmap->get_min_compat_client())
+    server->update_required_client_features();
+
+  // for debug
+  if (g_conf()->mds_dump_cache_on_map)
+    mdcache->dump_cache();
+
+  cluster_degraded = mdsmap->is_degraded();
+
+  // mdsmap and oldmap can be discontinuous. failover might happen in the missing mdsmap.
+  // the 'restart' set tracks ranks that have restarted since the old mdsmap
+  set<mds_rank_t> restart;
+  // replaying mds does not communicate with other ranks
+  if (state >= MDSMap::STATE_RESOLVE) {
+    // did someone fail?
+    //   new down?
+    set<mds_rank_t> olddown, down;
+    oldmap.get_down_mds_set(&olddown);
+    mdsmap->get_down_mds_set(&down);
+    for (const auto& r : down) {
+      if (oldmap.have_inst(r) && olddown.count(r) == 0) {
+        messenger->mark_down_addrs(oldmap.get_addrs(r));
+        handle_mds_failure(r);
+      }
+    }
+
+    // did someone fail?
+    //   did their addr/inst change?
+    set<mds_rank_t> up;
+    mdsmap->get_up_mds_set(up);
+    for (const auto& r : up) {
+      auto& info = mdsmap->get_info(r);
+      if (oldmap.have_inst(r)) {
+        auto& oldinfo = oldmap.get_info(r);
+        if (info.inc != oldinfo.inc) {
+          messenger->mark_down_addrs(oldinfo.get_addrs());
+          if (info.state == MDSMap::STATE_REPLAY ||
+              info.state == MDSMap::STATE_RESOLVE) {
+            restart.insert(r);
+            handle_mds_failure(r);
+          } else {
+            ceph_assert(info.state == MDSMap::STATE_STARTING ||
+                        info.state == MDSMap::STATE_ACTIVE);
+            // -> stopped (missing) -> starting -> active
+            restart.insert(r);
+            mdcache->migrator->handle_mds_failure_or_stop(r);
+            if (mdsmap->get_tableserver() == whoami)
+              snapserver->handle_mds_failure_or_stop(r);
+          }
+        }
+      } else {
+        if (info.state == MDSMap::STATE_REPLAY ||
+            info.state == MDSMap::STATE_RESOLVE) {
+          // -> starting/creating (missing) -> active (missing) -> replay -> resolve
+          restart.insert(r);
+          handle_mds_failure(r);
+        } else {
+          ceph_assert(info.state == MDSMap::STATE_CREATING ||
+                      info.state == MDSMap::STATE_STARTING ||
+                      info.state == MDSMap::STATE_ACTIVE);
+        }
+      }
+    }
+  }
+
+  // did it change?
+  if (oldstate != state) {
+    dout(1) << "handle_mds_map state change "
+            << ceph_mds_state_name(oldstate) << " --> "
+            << ceph_mds_state_name(state) << dendl;
+    beacon.set_want_state(*mdsmap, state);
+
+    if (oldstate == MDSMap::STATE_STANDBY_REPLAY) {
+      dout(10) << "Monitor activated us! Deactivating replay loop" << dendl;
+      // ceph_assert, like every other check in this function, so the
+      // invariant is enforced regardless of NDEBUG.
+      ceph_assert(state == MDSMap::STATE_REPLAY);
+    } else {
+      // did i just recover?
+      if ((is_active() || is_clientreplay()) &&
+          (oldstate == MDSMap::STATE_CREATING ||
+           oldstate == MDSMap::STATE_REJOIN ||
+           oldstate == MDSMap::STATE_RECONNECT))
+        recovery_done(oldstate);
+
+      // Run the entry handler for whichever state we are in now.
+      if (is_active()) {
+        active_start();
+      } else if (is_any_replay()) {
+        replay_start();
+      } else if (is_resolve()) {
+        resolve_start();
+      } else if (is_reconnect()) {
+        reconnect_start();
+      } else if (is_rejoin()) {
+        rejoin_start();
+      } else if (is_clientreplay()) {
+        clientreplay_start();
+      } else if (is_creating()) {
+        boot_create();
+      } else if (is_starting()) {
+        boot_start();
+      } else if (is_stopping()) {
+        ceph_assert(oldstate == MDSMap::STATE_ACTIVE);
+        stopping_start();
+      }
+    }
+  }
+
+  // RESOLVE
+  // is someone else newly resolving?
+  if (state >= MDSMap::STATE_RESOLVE) {
+    // recover snaptable
+    if (mdsmap->get_tableserver() == whoami) {
+      if (oldstate < MDSMap::STATE_RESOLVE) {
+        set<mds_rank_t> s;
+        mdsmap->get_mds_set_lower_bound(s, MDSMap::STATE_RESOLVE);
+        snapserver->finish_recovery(s);
+      } else {
+        set<mds_rank_t> old_set, new_set;
+        oldmap.get_mds_set_lower_bound(old_set, MDSMap::STATE_RESOLVE);
+        mdsmap->get_mds_set_lower_bound(new_set, MDSMap::STATE_RESOLVE);
+        for (const auto& r : new_set) {
+          if (r == whoami)
+            continue; // not me
+          if (!old_set.count(r) || restart.count(r)) { // newly so?
+            snapserver->handle_mds_recovery(r);
+          }
+        }
+      }
+    }
+
+    if ((!oldmap.is_resolving() || !restart.empty()) && mdsmap->is_resolving()) {
+      set<mds_rank_t> resolve;
+      mdsmap->get_mds_set(resolve, MDSMap::STATE_RESOLVE);
+      dout(10) << " resolve set is " << resolve << dendl;
+      calc_recovery_set();
+      mdcache->send_resolves();
+    }
+  }
+
+  // REJOIN
+  // is everybody finally rejoining?
+  if (state >= MDSMap::STATE_REJOIN) {
+    // did we start?
+    if (!oldmap.is_rejoining() && mdsmap->is_rejoining())
+      rejoin_joint_start();
+
+    // did we finish?
+    if (g_conf()->mds_dump_cache_after_rejoin &&
+        oldmap.is_rejoining() && !mdsmap->is_rejoining())
+      mdcache->dump_cache(); // for DEBUG only
+
+    if (oldstate >= MDSMap::STATE_REJOIN ||
+        oldstate == MDSMap::STATE_STARTING) {
+      // ACTIVE|CLIENTREPLAY|REJOIN => we can discover from them.
+      set<mds_rank_t> olddis, dis;
+      oldmap.get_mds_set_lower_bound(olddis, MDSMap::STATE_REJOIN);
+      mdsmap->get_mds_set_lower_bound(dis, MDSMap::STATE_REJOIN);
+      for (const auto& r : dis) {
+        if (r == whoami)
+          continue; // not me
+        if (!olddis.count(r) || restart.count(r)) { // newly so?
+          mdcache->kick_discovers(r);
+          mdcache->kick_open_ino_peers(r);
+        }
+      }
+    }
+  }
+
+  if (oldmap.is_degraded() && !cluster_degraded && state >= MDSMap::STATE_ACTIVE) {
+    dout(1) << "cluster recovered." << dendl;
+    // Waiters keyed by MDS_RANK_NONE are released when the cluster as a
+    // whole stops being degraded.
+    auto it = waiting_for_active_peer.find(MDS_RANK_NONE);
+    if (it != waiting_for_active_peer.end()) {
+      queue_waiters(it->second);
+      waiting_for_active_peer.erase(it);
+    }
+  }
+
+  // did someone go active?
+  if (state >= MDSMap::STATE_CLIENTREPLAY &&
+      oldstate >= MDSMap::STATE_CLIENTREPLAY) {
+    set<mds_rank_t> oldactive, active;
+    oldmap.get_mds_set_lower_bound(oldactive, MDSMap::STATE_CLIENTREPLAY);
+    mdsmap->get_mds_set_lower_bound(active, MDSMap::STATE_CLIENTREPLAY);
+    for (const auto& r : active) {
+      if (r == whoami)
+        continue; // not me
+      if (!oldactive.count(r) || restart.count(r)) // newly so?
+        handle_mds_recovery(r);
+    }
+  }
+
+  if (is_clientreplay() || is_active() || is_stopping()) {
+    // did anyone stop?
+    set<mds_rank_t> oldstopped, stopped;
+    oldmap.get_stopped_mds_set(oldstopped);
+    mdsmap->get_stopped_mds_set(stopped);
+    for (const auto& r : stopped)
+      if (oldstopped.count(r) == 0) { // newly so?
+        mdcache->migrator->handle_mds_failure_or_stop(r);
+        if (mdsmap->get_tableserver() == whoami)
+          snapserver->handle_mds_failure_or_stop(r);
+      }
+  }
+
+  {
+    // Release everything that was waiting for an epoch we have now reached.
+    map<epoch_t,MDSContext::vec >::iterator p = waiting_for_mdsmap.begin();
+    while (p != waiting_for_mdsmap.end() && p->first <= mdsmap->get_epoch()) {
+      MDSContext::vec ls;
+      ls.swap(p->second);
+      waiting_for_mdsmap.erase(p++);
+      queue_waiters(ls);
+    }
+  }
+
+  if (is_active()) {
+    // Before going active, set OSD epoch barrier to latest (so that
+    // we don't risk handing out caps to clients with old OSD maps that
+    // might not include barriers from the previous incarnation of this MDS)
+    set_osd_epoch_barrier(objecter->with_osdmap(
+                            std::mem_fn(&OSDMap::get_epoch)));
+
+    /* Now check if we should hint to the OSD that a read may follow */
+    if (mdsmap->has_standby_replay(whoami))
+      mdlog->set_write_iohint(0);
+    else
+      mdlog->set_write_iohint(CEPH_OSD_OP_FLAG_FADVISE_DONTNEED);
+  }
+
+  if (oldmap.get_max_mds() != mdsmap->get_max_mds()) {
+    purge_queue.update_op_limit(*mdsmap);
+  }
+
+  // A running scrub is aborted as soon as max_mds exceeds 1.
+  if (scrubstack->is_scrubbing()) {
+    if (mdsmap->get_max_mds() > 1) {
+      auto c = new C_MDSInternalNoop;
+      scrubstack->scrub_abort(c);
+    }
+  }
+  mdcache->handle_mdsmap(*mdsmap);
+}
+
+/**
+ * A peer rank has recovered: tell the cache and release any contexts
+ * that were waiting for that peer to become active.
+ *
+ * @param who rank of the recovered peer
+ */
+void MDSRank::handle_mds_recovery(mds_rank_t who)
+{
+  dout(5) << "handle_mds_recovery mds." << who << dendl;
+
+  mdcache->handle_mds_recovery(who);
+
+  // Look the rank up instead of using operator[], which would insert an
+  // empty waiter list for `who` only for it to be erased again.
+  auto it = waiting_for_active_peer.find(who);
+  if (it != waiting_for_active_peer.end()) {
+    queue_waiters(it->second);
+    waiting_for_active_peer.erase(it);
+  }
+}
+
+/**
+ * A peer rank has failed: notify the cache, the snap server (only when
+ * this rank is the table server) and the snap client. A failure reported
+ * for this rank itself is ignored.
+ *
+ * @param who rank of the failed peer
+ */
+void MDSRank::handle_mds_failure(mds_rank_t who)
+{
+  if (who == whoami) {
+    dout(5) << "handle_mds_failure for myself; not doing anything" << dendl;
+    return;
+  }
+  dout(5) << "handle_mds_failure mds." << who << dendl;
+
+  mdcache->handle_mds_failure(who);
+
+  const bool i_am_tableserver = (mdsmap->get_tableserver() == whoami);
+  if (i_am_tableserver) {
+    snapserver->handle_mds_failure_or_stop(who);
+  }
+
+  snapclient->handle_mds_failure(who);
+}
+
+bool MDSRankDispatcher::handle_asok_command(std::string_view command,
+ const cmdmap_t& cmdmap,
+ Formatter *f,
+ std::ostream& ss)
+{
+ if (command == "dump_ops_in_flight" ||
+ command == "ops") {
+ if (!op_tracker.dump_ops_in_flight(f)) {
+ ss << "op_tracker tracking is not enabled now, so no ops are tracked currently, even those get stuck. \
+ please enable \"mds_enable_op_tracker\", and the tracker will start to track new ops received afterwards.";
+ }
+ } else if (command == "dump_blocked_ops") {
+ if (!op_tracker.dump_ops_in_flight(f, true)) {
+ ss << "op_tracker tracking is not enabled now, so no ops are tracked currently, even those get stuck. \
+ Please enable \"mds_enable_op_tracker\", and the tracker will start to track new ops received afterwards.";
+ }
+ } else if (command == "dump_historic_ops") {
+ if (!op_tracker.dump_historic_ops(f)) {
+ ss << "op_tracker tracking is not enabled now, so no ops are tracked currently, even those get stuck. \
+ please enable \"mds_enable_op_tracker\", and the tracker will start to track new ops received afterwards.";
+ }
+ } else if (command == "dump_historic_ops_by_duration") {
+ if (!op_tracker.dump_historic_ops(f, true)) {
+ ss << "op_tracker tracking is not enabled now, so no ops are tracked currently, even those get stuck. \
+ please enable \"mds_enable_op_tracker\", and the tracker will start to track new ops received afterwards.";
+ }
+ } else if (command == "osdmap barrier") {
+ int64_t target_epoch = 0;
+ bool got_val = cmd_getval(g_ceph_context, cmdmap, "target_epoch", target_epoch);
+
+ if (!got_val) {
+ ss << "no target epoch given";
+ return true;
+ }
+
+ mds_lock.Lock();
+ set_osd_epoch_barrier(target_epoch);
+ mds_lock.Unlock();
+
+ C_SaferCond cond;
+ bool already_got = objecter->wait_for_map(target_epoch, &cond);
+ if (!already_got) {
+ dout(4) << __func__ << ": waiting for OSD epoch " << target_epoch << dendl;
+ cond.wait();
+ }
+ } else if (command == "session ls") {
+ std::lock_guard l(mds_lock);
+
+ heartbeat_reset();
+
+ dump_sessions(SessionFilter(), f);
+ } else if (command == "session evict") {
+ std::string client_id;
+ const bool got_arg = cmd_getval(g_ceph_context, cmdmap, "client_id", client_id);
+ if(!got_arg) {
+ ss << "Invalid client_id specified";
+ return true;
+ }
+
+ mds_lock.Lock();
+ std::stringstream dss;
+ bool evicted = evict_client(strtol(client_id.c_str(), 0, 10), true,
+ g_conf()->mds_session_blacklist_on_evict, dss);
+ if (!evicted) {
+ dout(15) << dss.str() << dendl;
+ ss << dss.str();
+ }
+ mds_lock.Unlock();
+ } else if (command == "session config") {
+ int64_t client_id;
+ std::string option;
+ std::string value;
+
+ cmd_getval(g_ceph_context, cmdmap, "client_id", client_id);
+ cmd_getval(g_ceph_context, cmdmap, "option", option);
+ bool got_value = cmd_getval(g_ceph_context, cmdmap, "value", value);
+
+ mds_lock.Lock();
+ config_client(client_id, !got_value, option, value, ss);
+ mds_lock.Unlock();
+ } else if (command == "scrub_path") {
+ string path;
+ vector<string> scrubop_vec;
+ cmd_getval(g_ceph_context, cmdmap, "scrubops", scrubop_vec);
+ cmd_getval(g_ceph_context, cmdmap, "path", path);
+
+ /* Multiple MDS scrub is not currently supported. See also: https://tracker.ceph.com/issues/12274 */
+ if (mdsmap->get_max_mds() > 1) {
+ ss << "Scrub is not currently supported for multiple active MDS. Please reduce max_mds to 1 and then scrub.";
+ return true;
+ }
+
+ C_SaferCond cond;
+ command_scrub_start(f, path, "", scrubop_vec, &cond);
+ cond.wait();
+ } else if (command == "tag path") {
+ string path;
+ cmd_getval(g_ceph_context, cmdmap, "path", path);
+ string tag;
+ cmd_getval(g_ceph_context, cmdmap, "tag", tag);
+ command_tag_path(f, path, tag);
+ } else if (command == "flush_path") {
+ string path;
+ cmd_getval(g_ceph_context, cmdmap, "path", path);
+ command_flush_path(f, path);
+ } else if (command == "flush journal") {
+ command_flush_journal(f);
+ } else if (command == "get subtrees") {
+ command_get_subtrees(f);
+ } else if (command == "export dir") {
+ string path;
+ if(!cmd_getval(g_ceph_context, cmdmap, "path", path)) {
+ ss << "malformed path";
+ return true;
+ }
+ int64_t rank;
+ if(!cmd_getval(g_ceph_context, cmdmap, "rank", rank)) {
+ ss << "malformed rank";
+ return true;
+ }
+ command_export_dir(f, path, (mds_rank_t)rank);
+ } else if (command == "dump cache") {
+ std::lock_guard l(mds_lock);
+ string path;
+ int r;
+ if(!cmd_getval(g_ceph_context, cmdmap, "path", path)) {
+ r = mdcache->dump_cache(f);
+ } else {
+ r = mdcache->dump_cache(path);
+ }
+
+ if (r != 0) {
+ ss << "Failed to dump cache: " << cpp_strerror(r);
+ f->reset();
+ }
+ } else if (command == "cache status") {
+ std::lock_guard l(mds_lock);
+ mdcache->cache_status(f);
+ } else if (command == "dump tree") {
+ command_dump_tree(cmdmap, ss, f);
+ } else if (command == "dump loads") {
+ std::lock_guard l(mds_lock);
+ int r = balancer->dump_loads(f);
+ if (r != 0) {
+ ss << "Failed to dump loads: " << cpp_strerror(r);
+ f->reset();
+ }
+ } else if (command == "dump snaps") {
+ std::lock_guard l(mds_lock);
+ string server;
+ cmd_getval(g_ceph_context, cmdmap, "server", server);
+ if (server == "--server") {
+ if (mdsmap->get_tableserver() == whoami) {
+ snapserver->dump(f);
+ } else {
+ ss << "Not snapserver";
+ }
+ } else {
+ int r = snapclient->dump_cache(f);
+ if (r != 0) {
+ ss << "Failed to dump snapclient: " << cpp_strerror(r);
+ f->reset();
+ }
+ }
+ } else if (command == "force_readonly") {
+ std::lock_guard l(mds_lock);
+ mdcache->force_readonly();
+ } else if (command == "dirfrag split") {
+ command_dirfrag_split(cmdmap, ss);
+ } else if (command == "dirfrag merge") {
+ command_dirfrag_merge(cmdmap, ss);
+ } else if (command == "dirfrag ls") {
+ command_dirfrag_ls(cmdmap, ss, f);
+ } else if (command == "openfiles ls") {
+ command_openfiles_ls(f);
+ } else if (command == "dump inode") {
+ command_dump_inode(f, cmdmap, ss);
+ } else {
+ return false;
+ }
+
+ return true;
+}
+
+/**
+ * Completion context that answers the command message `m` through
+ * MDSDaemon::send_command_reply().
+ */
+class C_MDS_Send_Command_Reply : public MDSInternalContext {
+protected:
+  MCommand::const_ref m;
+
+public:
+  C_MDS_Send_Command_Reply(MDSRank *_mds, const MCommand::const_ref &_m)
+    : MDSInternalContext(_mds), m(_m) {}
+
+  // Reply with return code `r`, status text `ss`, and the contents of `ds`
+  // as the data payload.
+  void send(int r, std::string_view ss, std::stringstream &ds) {
+    bufferlist payload;
+    payload.append(ds);
+    MDSDaemon::send_command_reply(m, mds, r, payload, ss);
+  }
+
+  // Reply with return code `r` and status text `ss`; the payload stream is
+  // left empty.
+  void send(int r, std::string_view ss) {
+    std::stringstream no_data;
+    send(r, ss, no_data);
+  }
+
+  // Context completion: reply with `r` and an empty status string.
+  void finish(int r) override {
+    send(r, "");
+  }
+};
+
+/**
+ * Reply context that owns a JSON formatter `f`. Derived classes implement
+ * exec(); when the context completes, whatever was written to `f` is sent
+ * back with the reply.
+ */
+class C_ExecAndReply : public C_MDS_Send_Command_Reply {
+public:
+  C_ExecAndReply(MDSRank *mds, const MCommand::const_ref &m)
+    : C_MDS_Send_Command_Reply(mds, m), f(true) {}
+
+  // On failure (r != 0) the formatter output becomes the status string,
+  // otherwise it becomes the data payload.
+  void finish(int r) override {
+    std::stringstream ds;
+    std::stringstream ss;
+    std::stringstream &sink = (r != 0) ? ss : ds;
+    f.flush(sink);
+
+    send(r, ss.str(), ds);
+  }
+
+  virtual void exec() = 0;
+
+protected:
+  JSONFormatter f;
+};
+
+// Deferred "cache drop": exec() forwards the stored timeout, the formatter
+// and this context (as the completion) to MDSRank::command_cache_drop().
+class C_CacheDropExecAndReply : public C_ExecAndReply {
+public:
+  C_CacheDropExecAndReply(MDSRank *mds, const MCommand::const_ref &m,
+                          uint64_t drop_timeout)
+    : C_ExecAndReply(mds, m), timeout(drop_timeout) {}
+
+  void exec() override {
+    mds->command_cache_drop(timeout, &f, this);
+  }
+
+private:
+  uint64_t timeout;
+};
+
+// Deferred "scrub start": keeps copies of the path, tag and scrub options
+// and hands them to MDSRank::command_scrub_start() from exec().
+class C_ScrubExecAndReply : public C_ExecAndReply {
+public:
+  C_ScrubExecAndReply(MDSRank *mds, const MCommand::const_ref &m,
+                      const std::string &scrub_path,
+                      const std::string &scrub_tag,
+                      const std::vector<std::string> &scrub_ops)
+    : C_ExecAndReply(mds, m),
+      path(scrub_path),
+      tag(scrub_tag),
+      scrubop(scrub_ops) {}
+
+  void exec() override {
+    mds->command_scrub_start(&f, path, tag, scrubop, this);
+  }
+
+private:
+  std::string path;
+  std::string tag;
+  std::vector<std::string> scrubop;
+};
+
+// Deferred scrub control command. `command` must be "abort" or "pause";
+// any other value aborts the process in exec().
+class C_ScrubControlExecAndReply : public C_ExecAndReply {
+public:
+  C_ScrubControlExecAndReply(MDSRank *mds, const MCommand::const_ref &m,
+                             const std::string &control_command)
+    : C_ExecAndReply(mds, m), command(control_command) {}
+
+  void exec() override {
+    if (command == "abort") {
+      mds->command_scrub_abort(&f, this);
+      return;
+    }
+    if (command == "pause") {
+      mds->command_scrub_pause(&f, this);
+      return;
+    }
+    ceph_abort();
+  }
+
+  // Record the return code in the formatter before the base class flushes
+  // it into the reply.
+  void finish(int r) override {
+    f.open_object_section("result");
+    f.dump_int("return_code", r);
+    f.close_section();
+    C_ExecAndReply::finish(r);
+  }
+
+private:
+  std::string command;
+};
+
+/**
+ * Evict every client session accepted by `filter`; the reply to `m` is
+ * attached as the finisher of the gather over all evictions.
+ *
+ * This function drops the mds_lock, so don't do anything with
+ * MDSRank after calling it (we could have gone into shutdown): just
+ * send your result back to the calling client and finish.
+ */
+void MDSRankDispatcher::evict_clients(const SessionFilter &filter, const MCommand::const_ref &m)
+{
+  auto *reply = new C_MDS_Send_Command_Reply(this, m);
+
+  // For the paths that start nothing asynchronous: answer immediately and
+  // free the reply context ourselves.
+  const auto reply_now = [reply](int rc, std::string_view msg) {
+    reply->send(rc, msg);
+    delete reply;
+  };
+
+  if (is_any_replay()) {
+    reply_now(-EAGAIN, "MDS is replaying log");
+    return;
+  }
+
+  std::vector<Session*> victims;
+  for (const auto& [name, session] : sessionmap.get_sessions()) {
+    if (name.is_client() &&
+        filter.match(*session, std::bind(&Server::waiting_for_reconnect, server, std::placeholders::_1))) {
+      victims.push_back(session);
+    }
+  }
+
+  dout(20) << __func__ << " matched " << victims.size() << " sessions" << dendl;
+
+  if (victims.empty()) {
+    reply_now(0, "");
+    return;
+  }
+
+  // One sub-context per victim; `reply` is the gather's finisher.
+  C_GatherBuilder gather(g_ceph_context, reply);
+  for (const auto victim : victims) {
+    std::stringstream discarded_err;
+    evict_client(victim->get_client().v, false,
+                 g_conf()->mds_session_blacklist_on_evict, discarded_err, gather.new_sub());
+  }
+  gather.activate();
+}
+
+/**
+ * Write every client session accepted by `filter` into `f` as an array
+ * section named "sessions".
+ */
+void MDSRankDispatcher::dump_sessions(const SessionFilter &filter, Formatter *f) const
+{
+  // Dump sessions, decorated with recovery/replay status
+  f->open_array_section("sessions");
+  for (auto& [name, s] : sessionmap.get_sessions()) {
+    // Non-client entities are skipped before the filter is consulted.
+    const bool wanted = name.is_client() &&
+      filter.match(*s, std::bind(&Server::waiting_for_reconnect, server, std::placeholders::_1));
+    if (wanted) {
+      f->dump_object("session", *s);
+    }
+  }
+  f->close_section(); // sessions
+}
+
+/**
+ * Translate the textual scrub options into flags and enqueue a scrub of
+ * `path` under mds_lock. Recognised options are "force", "recursive" and
+ * "repair"; other entries in `scrubop_vec` are ignored.
+ */
+void MDSRank::command_scrub_start(Formatter *f,
+                                  std::string_view path, std::string_view tag,
+                                  const vector<string>& scrubop_vec, Context *on_finish)
+{
+  bool want_force = false;
+  bool want_recursive = false;
+  bool want_repair = false;
+  for (const auto &option : scrubop_vec) {
+    if (option == "force") {
+      want_force = true;
+      continue;
+    }
+    if (option == "recursive") {
+      want_recursive = true;
+      continue;
+    }
+    if (option == "repair") {
+      want_repair = true;
+    }
+  }
+
+  std::lock_guard guard(mds_lock);
+  mdcache->enqueue_scrub(path, tag, want_force, want_recursive, want_repair, f, on_finish);
+  // scrub_dentry() finishers will dump the data for us; we're done!
+}
+
+/**
+ * Enqueue a scrub of `path` carrying `tag` and block until it completes.
+ * mds_lock is held only while enqueueing, not while waiting.
+ */
+void MDSRank::command_tag_path(Formatter *f,
+                               std::string_view path, std::string_view tag)
+{
+  // Same argument order as the enqueue_scrub() call in command_scrub_start().
+  const bool force = true;
+  const bool recursive = true;
+  const bool repair = false;
+
+  C_SaferCond done;
+  {
+    std::lock_guard guard(mds_lock);
+    mdcache->enqueue_scrub(path, tag, force, recursive, repair, f, &done);
+  }
+  done.wait();
+}
+
+// Ask the scrub stack to abort, under mds_lock; `on_finish` is handed to
+// scrub_abort(). The formatter argument is not used here.
+void MDSRank::command_scrub_abort(Formatter *f, Context *on_finish)
+{
+  std::lock_guard guard(mds_lock);
+  scrubstack->scrub_abort(on_finish);
+}
+
+// Ask the scrub stack to pause, under mds_lock; `on_finish` is handed to
+// scrub_pause(). The formatter argument is not used here.
+void MDSRank::command_scrub_pause(Formatter *f, Context *on_finish)
+{
+  std::lock_guard guard(mds_lock);
+  scrubstack->scrub_pause(on_finish);
+}
+
+// Resume scrubbing and report scrub_resume()'s return value in a "result"
+// section of `f`. No lock is taken here.
+void MDSRank::command_scrub_resume(Formatter *f)
+{
+  const int rc = scrubstack->scrub_resume();
+
+  f->open_object_section("result");
+  f->dump_int("return_code", rc);
+  f->close_section();
+}
+
+// Let the scrub stack write its status into `f`. No lock is taken here.
+void MDSRank::command_scrub_status(Formatter *f)
+{
+  scrubstack->scrub_status(f);
+}
+
+/**
+ * Flush the dentry at `path` and report the completion code in a
+ * "results" section of `f`. mds_lock is released before waiting.
+ */
+void MDSRank::command_flush_path(Formatter *f, std::string_view path)
+{
+  C_SaferCond flushed;
+  {
+    std::lock_guard guard(mds_lock);
+    mdcache->flush_dentry(path, &flushed);
+  }
+  const int rc = flushed.wait();
+
+  f->open_object_section("results");
+  f->dump_int("return_code", rc);
+  f->close_section(); // results
+}
+
+// synchronous wrapper around "journal flush" asynchronous context
+// execution: starts a C_Flush_Journal under mds_lock, waits for it with the
+// lock released, then reports its message and return code in `f`.
+void MDSRank::command_flush_journal(Formatter *f)
+{
+  ceph_assert(f != nullptr);
+
+  C_SaferCond done;
+  std::stringstream message;
+  {
+    std::lock_guard guard(mds_lock);
+    auto *request = new C_Flush_Journal(mdcache, mdlog, this, &message, &done);
+    request->send();
+  }
+  const int rc = done.wait();
+
+  f->open_object_section("result");
+  f->dump_string("message", message.str());
+  f->dump_int("return_code", rc);
+  f->close_section();
+}
+
+/**
+ * Dump the subtrees reported by the cache as an array section "subtrees",
+ * one "subtree" object per directory. Runs under mds_lock.
+ */
+void MDSRank::command_get_subtrees(Formatter *f)
+{
+  ceph_assert(f != nullptr);
+  std::lock_guard guard(mds_lock);
+
+  std::vector<CDir*> subtree_dirs;
+  mdcache->get_subtrees(subtree_dirs);
+
+  f->open_array_section("subtrees");
+  for (const auto& dir : subtree_dirs) {
+    const auto auth = dir->get_dir_auth();
+
+    f->open_object_section("subtree");
+    f->dump_bool("is_auth", dir->is_auth());
+    f->dump_int("auth_first", auth.first);
+    f->dump_int("auth_second", auth.second);
+    f->dump_int("export_pin", dir->inode->get_export_pin());
+
+    f->open_object_section("dir");
+    dir->dump(f);
+    f->close_section(); // dir
+
+    f->close_section(); // subtree
+  }
+  f->close_section(); // subtrees
+}
+
+
+// Run _command_export_dir() and report its return code in a "results"
+// section of `f`.
+void MDSRank::command_export_dir(Formatter *f,
+                                 std::string_view path,
+                                 mds_rank_t target)
+{
+  const int rc = _command_export_dir(path, target);
+
+  f->open_object_section("results");
+  f->dump_int("return_code", rc);
+  f->close_section(); // results
+}
+
+/**
+ * Start exporting the root dirfrag of the directory at `path` to rank
+ * `target`. Runs under mds_lock.
+ *
+ * @param path   path looked up with MDCache::cache_traverse()
+ * @param target destination rank; must differ from this rank and be both
+ *               up and in according to the MDSMap
+ * @return 0 once the export has been handed to the migrator,
+ *         -ENOENT for a bad target or a path that cache_traverse() does
+ *         not resolve, -EINVAL when the frag_t() dirfrag is missing or
+ *         not auth
+ */
+int MDSRank::_command_export_dir(
+    std::string_view path,
+    mds_rank_t target)
+{
+  std::lock_guard l(mds_lock);
+  filepath fp(path);
+
+  if (target == whoami || !mdsmap->is_up(target) || !mdsmap->is_in(target)) {
+    derr << "bad MDS target " << target << dendl;
+    return -ENOENT;
+  }
+
+  CInode *in = mdcache->cache_traverse(fp);
+  if (!in) {
+    derr << "Bad path '" << path << "'" << dendl;
+    return -ENOENT;
+  }
+  CDir *dir = in->get_dirfrag(frag_t());
+  if (!dir || !(dir->is_auth())) {
+    derr << "bad export_dir path dirfrag frag_t() or dir not auth" << dendl;
+    return -EINVAL;
+  }
+
+  mdcache->migrator->export_dir(dir, target);
+  return 0;
+}
+
+/**
+ * Dump the cached tree below the "root" argument into an "inodes" array.
+ * "depth" defaults to -1 when it is not supplied. If the root inode is not
+ * in cache, a message is written to `ss` and nothing is dumped.
+ */
+void MDSRank::command_dump_tree(const cmdmap_t &cmdmap, std::ostream &ss, Formatter *f)
+{
+  std::string root;
+  cmd_getval(g_ceph_context, cmdmap, "root", root);
+
+  int64_t depth;
+  const bool have_depth = cmd_getval(g_ceph_context, cmdmap, "depth", depth);
+  if (!have_depth) {
+    depth = -1;
+  }
+
+  std::lock_guard guard(mds_lock);
+  CInode *root_in = mdcache->cache_traverse(filepath(root.c_str()));
+  if (root_in == nullptr) {
+    ss << "root inode is not in cache";
+    return;
+  }
+
+  f->open_array_section("inodes");
+  mdcache->dump_tree(root_in, 0, depth, f);
+  f->close_section();
+}
+
+/**
+ * Resolve the "path" and "frag" command arguments to a cached dirfrag this
+ * rank is auth for.
+ *
+ * @return the dirfrag, or nullptr with an explanation written to `ss` when
+ *         an argument is missing, the inode or dirfrag is not in cache, the
+ *         frag string does not parse, or the dirfrag is not auth
+ */
+CDir *MDSRank::_command_dirfrag_get(
+    const cmdmap_t &cmdmap,
+    std::ostream &ss)
+{
+  std::string path;
+  if (!cmd_getval(g_ceph_context, cmdmap, "path", path)) {
+    ss << "missing path argument";
+    return nullptr;
+  }
+
+  std::string frag_str;
+  if (!cmd_getval(g_ceph_context, cmdmap, "frag", frag_str)) {
+    ss << "missing frag argument";
+    return nullptr;
+  }
+
+  CInode *diri = mdcache->cache_traverse(filepath(path.c_str()));
+  if (diri == nullptr) {
+    // TODO really we should load something in if it's not in cache,
+    // but the infrastructure is harder, and we might still be unable
+    // to act on it if someone else is auth.
+    ss << "directory '" << path << "' inode not in cache";
+    return nullptr;
+  }
+
+  frag_t fg;
+  if (!fg.parse(frag_str.c_str())) {
+    ss << "frag " << frag_str << " failed to parse";
+    return nullptr;
+  }
+
+  CDir *dirfrag = diri->get_dirfrag(fg);
+  if (dirfrag == nullptr) {
+    ss << "frag " << diri->ino() << "/" << fg << " not in cache ("
+          "use `dirfrag ls` to see if it should exist)";
+    return nullptr;
+  }
+
+  if (!dirfrag->is_auth()) {
+    ss << "frag " << dirfrag->dirfrag() << " not auth (auth = "
+       << dirfrag->authority() << ")";
+    return nullptr;
+  }
+
+  return dirfrag;
+}
+
+/**
+ * Split the dirfrag named by the command arguments by "bits" bits.
+ * Runs under mds_lock.
+ *
+ * @return true when split_dir() was called; false (with a message in `ss`)
+ *         when "bits" is missing or not positive, or the dirfrag lookup fails
+ */
+bool MDSRank::command_dirfrag_split(
+    cmdmap_t cmdmap,
+    std::ostream &ss)
+{
+  std::lock_guard guard(mds_lock);
+
+  int64_t bits = 0;
+  if (!cmd_getval(g_ceph_context, cmdmap, "bits", bits)) {
+    ss << "missing bits argument";
+    return false;
+  }
+  if (bits <= 0) {
+    ss << "must split by >0 bits";
+    return false;
+  }
+
+  CDir *target = _command_dirfrag_get(cmdmap, ss);
+  if (target == nullptr) {
+    return false;
+  }
+
+  mdcache->split_dir(target, bits);
+  return true;
+}
+
+/**
+ * Merge the dirfrags of the directory at "path" under the frag given by
+ * "frag". Runs under mds_lock.
+ *
+ * @return true when merge_dir() was called; false (with a message in `ss`)
+ *         when an argument is missing, the inode is not in cache, or the
+ *         frag string does not parse
+ */
+bool MDSRank::command_dirfrag_merge(
+    cmdmap_t cmdmap,
+    std::ostream &ss)
+{
+  std::lock_guard guard(mds_lock);
+
+  std::string path;
+  if (!cmd_getval(g_ceph_context, cmdmap, "path", path)) {
+    ss << "missing path argument";
+    return false;
+  }
+
+  std::string frag_str;
+  if (!cmd_getval(g_ceph_context, cmdmap, "frag", frag_str)) {
+    ss << "missing frag argument";
+    return false;
+  }
+
+  CInode *diri = mdcache->cache_traverse(filepath(path.c_str()));
+  if (diri == nullptr) {
+    ss << "directory '" << path << "' inode not in cache";
+    return false;
+  }
+
+  frag_t fg;
+  if (!fg.parse(frag_str.c_str())) {
+    ss << "frag " << frag_str << " failed to parse";
+    return false;
+  }
+
+  mdcache->merge_dir(diri, fg);
+  return true;
+}
+
+/**
+ * List the leaf frags of the directory at "path" as an array section
+ * "frags"; each entry carries "value", "bits" and a "str" rendering
+ * (hex value, '/', decimal bits). Runs under mds_lock.
+ *
+ * @return false (with a message in `ss`) when "path" is missing or its
+ *         inode is not in cache; true otherwise
+ */
+bool MDSRank::command_dirfrag_ls(
+    cmdmap_t cmdmap,
+    std::ostream &ss,
+    Formatter *f)
+{
+  std::lock_guard guard(mds_lock);
+
+  std::string path;
+  if (!cmd_getval(g_ceph_context, cmdmap, "path", path)) {
+    ss << "missing path argument";
+    return false;
+  }
+
+  CInode *diri = mdcache->cache_traverse(filepath(path.c_str()));
+  if (diri == nullptr) {
+    ss << "directory inode not in cache";
+    return false;
+  }
+
+  f->open_array_section("frags");
+  frag_vec_t leaves;
+  // NB using get_leaves_under instead of get_dirfrags to give
+  // you the list of what dirfrags may exist, not which are in cache
+  diri->dirfragtree.get_leaves_under(frag_t(), leaves);
+  for (const auto& leaf : leaves) {
+    CachedStackStringStream rendered;
+    *rendered << std::hex << leaf.value() << "/" << std::dec << leaf.bits();
+
+    f->open_object_section("frag");
+    f->dump_int("value", leaf.value());
+    f->dump_int("bits", leaf.bits());
+    f->dump_string("str", rendered->strv());
+    f->close_section(); // frag
+  }
+  f->close_section(); // frags
+
+  return true;
+}
+
+// Dump the cache's open-file information into `f` while holding mds_lock.
+void MDSRank::command_openfiles_ls(Formatter *f)
+{
+  std::lock_guard guard(mds_lock);
+
+  mdcache->dump_openfiles(f);
+}
+
+/**
+ * Dump the inode identified by the "number" argument into `f`. A message
+ * is written to `ss` when the argument is missing or MDCache::dump_inode()
+ * reports failure. Runs under mds_lock.
+ */
+void MDSRank::command_dump_inode(Formatter *f, const cmdmap_t &cmdmap, std::ostream &ss)
+{
+  std::lock_guard guard(mds_lock);
+
+  int64_t number;
+  if (!cmd_getval(g_ceph_context, cmdmap, "number", number)) {
+    ss << "missing inode number";
+    return;
+  }
+
+  if (!mdcache->dump_inode(f, number)) {
+    ss << "dump inode failed, wrong inode number or the inode is not cached";
+  }
+}
+
+/**
+ * Dump state-specific status for the replay, resolve, reconnect, rejoin
+ * and clientreplay states (nothing extra for any other state), followed by
+ * "rank_uptime".
+ */
+void MDSRank::dump_status(Formatter *f) const
+{
+  switch (state) {
+  case MDSMap::STATE_REPLAY:
+  case MDSMap::STATE_STANDBY_REPLAY:
+    mdlog->dump_replay_status(f);
+    break;
+  case MDSMap::STATE_RESOLVE:
+    mdcache->dump_resolve_status(f);
+    break;
+  case MDSMap::STATE_RECONNECT:
+    server->dump_reconnect_status(f);
+    break;
+  case MDSMap::STATE_REJOIN:
+    mdcache->dump_rejoin_status(f);
+    break;
+  case MDSMap::STATE_CLIENTREPLAY:
+    dump_clientreplay_status(f);
+    break;
+  default:
+    break;
+  }
+
+  f->dump_float("rank_uptime", get_uptime().count());
+}
+
+// Report the replay queue length and the cache's client request count in a
+// "clientreplay_status" section.
+void MDSRank::dump_clientreplay_status(Formatter *f) const
+{
+  const auto queued = replay_queue.size();
+  const auto active = mdcache->get_num_client_requests();
+
+  f->open_object_section("clientreplay_status");
+  f->dump_unsigned("clientreplay_queue", queued);
+  f->dump_unsigned("active_replay", active);
+  f->close_section();
+}
+
+/**
+ * Re-read the log client options and, when parsing succeeds (returns 0),
+ * push them to the cluster log client. The parsed log_to_monitors map is
+ * logged either way.
+ */
+void MDSRankDispatcher::update_log_config()
+{
+  map<string,string> log_to_monitors;
+  map<string,string> log_to_syslog;
+  map<string,string> log_channel;
+  map<string,string> log_prio;
+  map<string,string> log_to_graylog;
+  map<string,string> log_to_graylog_host;
+  map<string,string> log_to_graylog_port;
+  uuid_d fsid;
+  string host;
+
+  const int parsed = parse_log_client_options(
+    g_ceph_context, log_to_monitors, log_to_syslog,
+    log_channel, log_prio, log_to_graylog,
+    log_to_graylog_host, log_to_graylog_port,
+    fsid, host);
+  if (parsed == 0) {
+    clog->update_config(log_to_monitors, log_to_syslog,
+                        log_channel, log_prio, log_to_graylog,
+                        log_to_graylog_host, log_to_graylog_port,
+                        fsid, host);
+  }
+
+  dout(10) << __func__ << " log_to_monitors " << log_to_monitors << dendl;
+}
+
+// Build the "mds" and "mds_mem" perf counter sets, store them in `logger`
+// and `mlogger`, add both to the context's perf counter collection, and
+// then ask the log, server, purge queue, session map and cache to set up
+// their own counters.
+void MDSRank::create_logger()
+{
+ dout(10) << "create_logger" << dendl;
+ {
+ // "mds" counters, indexed by the l_mds_* enum between l_mds_first and l_mds_last
+ PerfCountersBuilder mds_plb(g_ceph_context, "mds", l_mds_first, l_mds_last);
+
+ // super useful (high prio) perf stats
+ mds_plb.add_u64_counter(l_mds_request, "request", "Requests", "req",
+ PerfCountersBuilder::PRIO_CRITICAL);
+ mds_plb.add_time_avg(l_mds_reply_latency, "reply_latency", "Reply latency", "rlat",
+ PerfCountersBuilder::PRIO_CRITICAL);
+ mds_plb.add_u64(l_mds_inodes, "inodes", "Inodes", "inos",
+ PerfCountersBuilder::PRIO_CRITICAL);
+ mds_plb.add_u64_counter(l_mds_forward, "forward", "Forwarding request", "fwd",
+ PerfCountersBuilder::PRIO_INTERESTING);
+ mds_plb.add_u64(l_mds_caps, "caps", "Capabilities", "caps",
+ PerfCountersBuilder::PRIO_INTERESTING);
+ mds_plb.add_u64_counter(l_mds_exported_inodes, "exported_inodes", "Exported inodes",
+ "exi", PerfCountersBuilder::PRIO_INTERESTING);
+ mds_plb.add_u64_counter(l_mds_imported_inodes, "imported_inodes", "Imported inodes",
+ "imi", PerfCountersBuilder::PRIO_INTERESTING);
+
+ // useful dir/inode/subtree stats
+ mds_plb.set_prio_default(PerfCountersBuilder::PRIO_USEFUL);
+ mds_plb.add_u64(l_mds_root_rfiles, "root_rfiles", "root inode rfiles");
+ mds_plb.add_u64(l_mds_root_rbytes, "root_rbytes", "root inode rbytes");
+ mds_plb.add_u64(l_mds_root_rsnaps, "root_rsnaps", "root inode rsnaps");
+ mds_plb.add_u64_counter(l_mds_dir_fetch, "dir_fetch", "Directory fetch");
+ mds_plb.add_u64_counter(l_mds_dir_commit, "dir_commit", "Directory commit");
+ mds_plb.add_u64_counter(l_mds_dir_split, "dir_split", "Directory split");
+ mds_plb.add_u64_counter(l_mds_dir_merge, "dir_merge", "Directory merge");
+ mds_plb.add_u64(l_mds_inode_max, "inode_max", "Max inodes, cache size");
+ mds_plb.add_u64(l_mds_inodes_pinned, "inodes_pinned", "Inodes pinned");
+ mds_plb.add_u64(l_mds_inodes_expired, "inodes_expired", "Inodes expired");
+ mds_plb.add_u64(l_mds_inodes_with_caps, "inodes_with_caps",
+ "Inodes with capabilities");
+ mds_plb.add_u64(l_mds_subtrees, "subtrees", "Subtrees");
+ mds_plb.add_u64(l_mds_load_cent, "load_cent", "Load per cent");
+ mds_plb.add_u64_counter(l_mds_openino_dir_fetch, "openino_dir_fetch",
+ "OpenIno incomplete directory fetchings");
+
+ // low prio stats
+ mds_plb.set_prio_default(PerfCountersBuilder::PRIO_DEBUGONLY);
+ mds_plb.add_u64_counter(l_mds_reply, "reply", "Replies");
+ mds_plb.add_u64(l_mds_inodes_top, "inodes_top", "Inodes on top");
+ mds_plb.add_u64(l_mds_inodes_bottom, "inodes_bottom", "Inodes on bottom");
+ mds_plb.add_u64(
+ l_mds_inodes_pin_tail, "inodes_pin_tail", "Inodes on pin tail");
+ mds_plb.add_u64_counter(l_mds_traverse, "traverse", "Traverses");
+ mds_plb.add_u64_counter(l_mds_traverse_hit, "traverse_hit", "Traverse hits");
+ mds_plb.add_u64_counter(l_mds_traverse_forward, "traverse_forward",
+ "Traverse forwards");
+ mds_plb.add_u64_counter(l_mds_traverse_discover, "traverse_discover",
+ "Traverse directory discovers");
+ mds_plb.add_u64_counter(l_mds_traverse_dir_fetch, "traverse_dir_fetch",
+ "Traverse incomplete directory content fetchings");
+ mds_plb.add_u64_counter(l_mds_traverse_remote_ino, "traverse_remote_ino",
+ "Traverse remote dentries");
+ mds_plb.add_u64_counter(l_mds_traverse_lock, "traverse_lock",
+ "Traverse locks");
+ mds_plb.add_u64(l_mds_dispatch_queue_len, "q", "Dispatch queue length");
+ mds_plb.add_u64_counter(l_mds_exported, "exported", "Exports");
+ mds_plb.add_u64_counter(l_mds_imported, "imported", "Imports");
+ mds_plb.add_u64_counter(l_mds_openino_backtrace_fetch, "openino_backtrace_fetch",
+ "OpenIno backtrace fetchings");
+ mds_plb.add_u64_counter(l_mds_openino_peer_discover, "openino_peer_discover",
+ "OpenIno peer inode discovers");
+
+ logger = mds_plb.create_perf_counters();
+ g_ceph_context->get_perfcounters_collection()->add(logger);
+ }
+
+ {
+ // "mds_mem" counters, indexed by the l_mdm_* enum between l_mdm_first and l_mdm_last
+ PerfCountersBuilder mdm_plb(g_ceph_context, "mds_mem", l_mdm_first, l_mdm_last);
+ mdm_plb.add_u64(l_mdm_ino, "ino", "Inodes", "ino",
+ PerfCountersBuilder::PRIO_INTERESTING);
+ mdm_plb.add_u64(l_mdm_dn, "dn", "Dentries", "dn",
+ PerfCountersBuilder::PRIO_INTERESTING);
+
+ mdm_plb.set_prio_default(PerfCountersBuilder::PRIO_USEFUL);
+ mdm_plb.add_u64_counter(l_mdm_inoa, "ino+", "Inodes opened");
+ mdm_plb.add_u64_counter(l_mdm_inos, "ino-", "Inodes closed");
+ mdm_plb.add_u64(l_mdm_dir, "dir", "Directories");
+ mdm_plb.add_u64_counter(l_mdm_dira, "dir+", "Directories opened");
+ mdm_plb.add_u64_counter(l_mdm_dirs, "dir-", "Directories closed");
+ mdm_plb.add_u64_counter(l_mdm_dna, "dn+", "Dentries opened");
+ mdm_plb.add_u64_counter(l_mdm_dns, "dn-", "Dentries closed");
+ mdm_plb.add_u64(l_mdm_cap, "cap", "Capabilities");
+ mdm_plb.add_u64_counter(l_mdm_capa, "cap+", "Capabilities added");
+ mdm_plb.add_u64_counter(l_mdm_caps, "cap-", "Capabilities removed");
+ mdm_plb.add_u64(l_mdm_heap, "heap", "Heap size");
+
+ mdm_plb.set_prio_default(PerfCountersBuilder::PRIO_DEBUGONLY);
+ mdm_plb.add_u64(l_mdm_rss, "rss", "RSS");
+
+ mlogger = mdm_plb.create_perf_counters();
+ g_ceph_context->get_perfcounters_collection()->add(mlogger);
+ }
+
+ // counters owned by the other MDS components
+ mdlog->create_logger();
+ server->create_logger();
+ purge_queue.create_logger();
+ sessionmap.register_perfcounters();
+ mdcache->register_perfcounters();
+}
+
+/**
+ * Query the op tracker; when it reports something, send its summary and
+ * each warning to the cluster log. The slow-op count it filled in is
+ * stored in mds_slow_req_count in either case.
+ */
+void MDSRank::check_ops_in_flight()
+{
+  string summary;
+  vector<string> warnings;
+  int slow_ops = 0;
+
+  const bool have_report = op_tracker.check_ops_in_flight(&summary, warnings, &slow_ops);
+  if (have_report) {
+    clog->warn() << summary;
+    for (const auto& line : warnings) {
+      clog->warn() << line;
+    }
+  }
+
+  // set mds slow request count
+  mds_slow_req_count = slow_ops;
+}
+
+/**
+ * React to a new OSDMap: notify the snap server (only when active and this
+ * rank is the table server), the server and the purge queue, apply newly
+ * blacklisted addresses, and request further map updates.
+ */
+void MDSRankDispatcher::handle_osd_map()
+{
+  const bool is_active_tableserver =
+    is_active() && mdsmap->get_tableserver() == whoami;
+  if (is_active_tableserver) {
+    snapserver->check_osd_map(true);
+  }
+
+  server->handle_osd_map();
+
+  purge_queue.update_op_limit(*mdsmap);
+
+  std::set<entity_addr_t> fresh_blacklist;
+  objecter->consume_blacklist_events(&fresh_blacklist);
+  auto osd_epoch = objecter->with_osdmap([](const OSDMap &o){return o.get_epoch();});
+  dout(4) << "handle_osd_map epoch " << osd_epoch << ", "
+          << fresh_blacklist.size() << " new blacklist entries" << dendl;
+
+  // Raise the OSD epoch barrier only if applying the blacklist hit something.
+  auto victims = server->apply_blacklist(fresh_blacklist);
+  if (victims) {
+    set_osd_epoch_barrier(osd_epoch);
+  }
+
+  // By default the objecter only requests OSDMap updates on use,
+  // we would like to always receive the latest maps in order to
+  // apply policy based on the FULL flag.
+  objecter->maybe_request_map();
+}
+
+/**
+ * Set or remove a per-session config option stored in the session's
+ * client metadata. Only the "timeout" option is recognised.
+ *
+ * @param session_id numeric id of the client session
+ * @param remove     true to erase the option, false to store `value`
+ * @param option     option name
+ * @param value      new value; for "timeout" it must parse completely as
+ *                   an unsigned integer (base auto-detected by strtoul)
+ * @param ss         receives a human-readable message on failure
+ * @return 0 on success, -ENOENT if the session is unknown, -ENODATA when
+ *         removing an option that is not set, -EINVAL for an unknown
+ *         option or an invalid value
+ */
+int MDSRank::config_client(int64_t session_id, bool remove,
+                           const std::string& option, const std::string& value,
+                           std::ostream& ss)
+{
+  Session *session = sessionmap.get_session(entity_name_t(CEPH_ENTITY_TYPE_CLIENT, session_id));
+  if (!session) {
+    ss << "session " << session_id << " not in sessionmap!";
+    return -ENOENT;
+  }
+
+  if (option == "timeout") {
+    if (remove) {
+      auto it = session->info.client_metadata.find("timeout");
+      if (it == session->info.client_metadata.end()) {
+        ss << "Nonexistent config: " << option;
+        return -ENODATA;
+      }
+      session->info.client_metadata.erase(it);
+    } else {
+      char *end = nullptr;
+      strtoul(value.c_str(), &end, 0);
+      // strtoul() leaves `end` at the start of the input when it consumed
+      // no digits, so checking only for trailing characters would accept
+      // an empty or whitespace-only value. Reject both cases.
+      if (end == value.c_str() || *end) {
+        ss << "Invalid config for timeout: " << value;
+        return -EINVAL;
+      }
+      session->info.client_metadata[option] = value;
+    }
+    //sessionmap._mark_dirty(session, true);
+  } else {
+    ss << "Invalid config option: " << option;
+    return -EINVAL;
+  }
+
+  return 0;
+}
+
+// Evict the client session with id `session_id`, optionally blacklisting
+// its address first by sending an "osd blacklist" mon command.
+//
+// The caller must hold mds_lock (asserted). `wait` and `on_killed` are
+// mutually exclusive (asserted): with `wait` the call blocks, releasing
+// mds_lock while it waits; otherwise the session kill is started
+// (after the blacklist callbacks, if blacklisting) and `on_killed` is passed
+// to kill_session() or completed with 0 if the session is already gone.
+//
+// Returns false, with the reason in `err_ss`, when the MDS is replaying
+// or the session is not in the session map; returns true otherwise.
+bool MDSRank::evict_client(int64_t session_id,
+ bool wait, bool blacklist, std::ostream& err_ss,
+ Context *on_killed)
+{
+ ceph_assert(mds_lock.is_locked_by_me());
+
+ // Mutually exclusive args
+ ceph_assert(!(wait && on_killed != nullptr));
+
+ if (is_any_replay()) {
+ err_ss << "MDS is replaying log";
+ return false;
+ }
+
+ Session *session = sessionmap.get_session(
+ entity_name_t(CEPH_ENTITY_TYPE_CLIENT, session_id));
+ if (!session) {
+ err_ss << "session " << session_id << " not in sessionmap!";
+ return false;
+ }
+
+ auto& addr = session->info.inst.addr;
+ {
+ // Announce the eviction in both the debug log and the cluster log.
+ CachedStackStringStream css;
+ *css << "Evicting " << (blacklist ? "(and blacklisting) " : "")
+ << "client session " << session_id << " (" << addr << ")";
+ dout(1) << css->strv() << dendl;
+ clog->info() << css->strv();
+ }
+
+ // Build the JSON mon command that adds the client's address to the blacklist.
+ dout(4) << "Preparing blacklist command... (wait=" << wait << ")" << dendl;
+ stringstream ss;
+ ss << "{\"prefix\":\"osd blacklist\", \"blacklistop\":\"add\",";
+ ss << "\"addr\":\"";
+ ss << addr;
+ ss << "\"}";
+ std::string tmp = ss.str();
+ std::vector<std::string> cmd = {tmp};
+
+ // Kills the session if it still exists. The session is looked up again by
+ // id because this may run after mds_lock was released and re-taken.
+ auto kill_client_session = [this, session_id, wait, on_killed](){
+ ceph_assert(mds_lock.is_locked_by_me());
+ Session *session = sessionmap.get_session(
+ entity_name_t(CEPH_ENTITY_TYPE_CLIENT, session_id));
+ if (session) {
+ if (on_killed || !wait) {
+ server->kill_session(session, on_killed);
+ } else {
+ // Synchronous mode: release mds_lock while waiting for the kill.
+ C_SaferCond on_safe;
+ server->kill_session(session, &on_safe);
+
+ mds_lock.Unlock();
+ on_safe.wait();
+ mds_lock.Lock();
+ }
+ } else {
+ dout(1) << "session " << session_id << " was removed while we waited "
+ "for blacklist" << dendl;
+
+ // Even though it wasn't us that removed it, kick our completion
+ // as the session has been removed.
+ if (on_killed) {
+ on_killed->complete(0);
+ }
+ }
+ };
+
+ // Sends the blacklist mon command. When it completes, waits for the latest
+ // OSDMap, then (on the finisher, under mds_lock) sets the OSD epoch barrier
+ // to that map's epoch and runs `fn`.
+ auto apply_blacklist = [this, cmd](std::function<void ()> fn){
+ ceph_assert(mds_lock.is_locked_by_me());
+
+ Context *on_blacklist_done = new FunctionContext([this, fn](int r) {
+ objecter->wait_for_latest_osdmap(
+ new C_OnFinisher(
+ new FunctionContext([this, fn](int r) {
+ std::lock_guard l(mds_lock);
+ auto epoch = objecter->with_osdmap([](const OSDMap &o){
+ return o.get_epoch();
+ });
+
+ set_osd_epoch_barrier(epoch);
+
+ fn();
+ }), finisher)
+ );
+ });
+
+ dout(4) << "Sending mon blacklist command: " << cmd[0] << dendl;
+ monc->start_mon_command(cmd, {}, nullptr, nullptr, on_blacklist_done);
+ };
+
+ if (wait) {
+ if (blacklist) {
+ // Block until the blacklist callback chain has run, with mds_lock released.
+ C_SaferCond inline_ctx;
+ apply_blacklist([&inline_ctx](){inline_ctx.complete(0);});
+ mds_lock.Unlock();
+ inline_ctx.wait();
+ mds_lock.Lock();
+ }
+
+ // We dropped mds_lock, so check that session still exists
+ session = sessionmap.get_session(entity_name_t(CEPH_ENTITY_TYPE_CLIENT,
+ session_id));
+ if (!session) {
+ dout(1) << "session " << session_id << " was removed while we waited "
+ "for blacklist" << dendl;
+ return true;
+ }
+ kill_client_session();
+ } else {
+ if (blacklist) {
+ // Asynchronous: the kill runs as the tail of the blacklist callback chain.
+ apply_blacklist(kill_client_session);
+ } else {
+ kill_client_session();
+ }
+ }
+
+ return true;
+}
+
+/**
+ * Send the current MDSMap to every client session returned by the session
+ * map and remember the broadcast epoch in last_client_mdsmap_bcast.
+ */
+void MDSRank::bcast_mds_map()
+{
+  dout(7) << "bcast_mds_map " << mdsmap->get_epoch() << dendl;
+
+  // share the map with mounted clients
+  set<Session*> recipients;
+  sessionmap.get_client_session_set(recipients);
+  for (const auto &session : recipients) {
+    // A fresh message is created for each recipient.
+    session->get_connection()->send_message2(
+      MMDSMap::create(monc->get_fsid(), *mdsmap));
+  }
+
+  last_client_mdsmap_bcast = mdsmap->get_epoch();
+}
+
+// Wrap `ctx` in a context that calls ctx->exec() when completed; the
+// wrapper is itself wrapped in a C_OnFinisher bound to `finisher`.
+Context *MDSRank::create_async_exec_context(C_ExecAndReply *ctx)
+{
+  auto *run_exec = new FunctionContext([ctx](int) {
+    ctx->exec();
+  });
+  return new C_OnFinisher(run_exec, finisher);
+}
+
+/**
+ * Forward every argument unchanged to the MDSRank constructor, then
+ * register this object as a configuration observer.
+ */
+MDSRankDispatcher::MDSRankDispatcher(
+    mds_rank_t whoami_,
+    Mutex &mds_lock_,
+    LogChannelRef &clog_,
+    SafeTimer &timer_,
+    Beacon &beacon_,
+    std::unique_ptr<MDSMap> &mdsmap_,
+    Messenger *msgr,
+    MonClient *monc_,
+    MgrClient *mgrc,
+    Context *respawn_hook_,
+    Context *suicide_hook_)
+  : MDSRank(whoami_,
+            mds_lock_,
+            clog_,
+            timer_,
+            beacon_,
+            mdsmap_,
+            msgr,
+            monc_,
+            mgrc,
+            respawn_hook_,
+            suicide_hook_)
+{
+  g_conf().add_observer(this);
+}
+
+/**
+ * Dispatch a rank-level command selected by the "prefix" argument.
+ *
+ * @param cmdmap     parsed command arguments
+ * @param m          originating command message, kept by the contexts that
+ *                   reply later
+ * @param r          out: return code; failures are reported as negative
+ *                   errno values. Not written on every path.
+ * @param ds         out: data payload
+ * @param ss         out: status / error text
+ * @param run_later  out: for "cache drop" and the "scrub start/abort/pause"
+ *                   commands, a context that performs the command
+ *                   (NOTE(review): presumably run by the caller after it
+ *                   releases mds_lock - confirm against the caller)
+ * @param need_reply out: false when the reply is sent elsewhere (by
+ *                   evict_clients() or by the `run_later` context)
+ * @return true if the prefix was recognised, false otherwise
+ */
+bool MDSRankDispatcher::handle_command(
+    const cmdmap_t &cmdmap,
+    const MCommand::const_ref &m,
+    int *r,
+    std::stringstream *ds,
+    std::stringstream *ss,
+    Context **run_later,
+    bool *need_reply)
+{
+  ceph_assert(r != nullptr);
+  ceph_assert(ds != nullptr);
+  ceph_assert(ss != nullptr);
+  // need_reply is written unconditionally below.
+  ceph_assert(need_reply != nullptr);
+
+  *need_reply = true;
+
+  std::string prefix;
+  cmd_getval(g_ceph_context, cmdmap, "prefix", prefix);
+
+  if (prefix == "session ls" || prefix == "client ls") {
+    std::vector<std::string> filter_args;
+    cmd_getval(g_ceph_context, cmdmap, "filters", filter_args);
+
+    SessionFilter filter;
+    *r = filter.parse(filter_args, ss);
+    if (*r != 0) {
+      return true;
+    }
+
+    JSONFormatter f(true);
+    dump_sessions(filter, &f);
+    f.flush(*ds);
+    return true;
+  } else if (prefix == "session evict" || prefix == "client evict") {
+    std::vector<std::string> filter_args;
+    cmd_getval(g_ceph_context, cmdmap, "filters", filter_args);
+
+    SessionFilter filter;
+    *r = filter.parse(filter_args, ss);
+    if (*r != 0) {
+      return true;
+    }
+
+    // evict_clients() sends the reply itself.
+    evict_clients(filter, m);
+
+    *need_reply = false;
+    return true;
+  } else if (prefix == "session config" || prefix == "client config") {
+    int64_t client_id = 0;
+    std::string option;
+    std::string value;
+
+    // Reject a missing client_id instead of passing an indeterminate
+    // value on to config_client().
+    if (!cmd_getval(g_ceph_context, cmdmap, "client_id", client_id)) {
+      *ss << "Invalid client_id specified";
+      *r = -EINVAL;
+      return true;
+    }
+    cmd_getval(g_ceph_context, cmdmap, "option", option);
+    // An absent "value" means "remove the option".
+    bool got_value = cmd_getval(g_ceph_context, cmdmap, "value", value);
+
+    *r = config_client(client_id, !got_value, option, value, *ss);
+    return true;
+  } else if (prefix == "damage ls") {
+    JSONFormatter f(true);
+    damage_table.dump(&f);
+    f.flush(*ds);
+    return true;
+  } else if (prefix == "damage rm") {
+    damage_entry_id_t id = 0;
+    bool got = cmd_getval(g_ceph_context, cmdmap, "damage_id", (int64_t&)id);
+    if (!got) {
+      *r = -EINVAL;
+      return true;
+    }
+
+    damage_table.erase(id);
+    return true;
+  } else if (prefix == "cache drop") {
+    int64_t timeout;
+    if (!cmd_getval(g_ceph_context, cmdmap, "timeout", timeout)) {
+      timeout = 0;
+    }
+
+    *need_reply = false;
+    *run_later = create_async_exec_context(new C_CacheDropExecAndReply
+                                           (this, m, (uint64_t)timeout));
+    return true;
+  } else if (prefix == "scrub start") {
+    string path;
+    string tag;
+    vector<string> scrubop_vec;
+    cmd_getval(g_ceph_context, cmdmap, "scrubops", scrubop_vec);
+    cmd_getval(g_ceph_context, cmdmap, "path", path);
+    cmd_getval(g_ceph_context, cmdmap, "tag", tag);
+
+    /* Multiple MDS scrub is not currently supported. See also: https://tracker.ceph.com/issues/12274 */
+    if (mdsmap->get_max_mds() > 1) {
+      *ss << "Scrub is not currently supported for multiple active MDS. Please reduce max_mds to 1 and then scrub.";
+      // Negative errno, consistent with every other error returned here.
+      *r = -ENOTSUP;
+      return true;
+    }
+
+    *need_reply = false;
+    *run_later = create_async_exec_context(new C_ScrubExecAndReply
+                                           (this, m, path, tag, scrubop_vec));
+    return true;
+  } else if (prefix == "scrub abort") {
+    *need_reply = false;
+    *run_later = create_async_exec_context(new C_ScrubControlExecAndReply
+                                           (this, m, "abort"));
+    return true;
+  } else if (prefix == "scrub pause") {
+    *need_reply = false;
+    *run_later = create_async_exec_context(new C_ScrubControlExecAndReply
+                                           (this, m, "pause"));
+    return true;
+  } else if (prefix == "scrub resume") {
+    JSONFormatter f(true);
+    command_scrub_resume(&f);
+    f.flush(*ds);
+    return true;
+  } else if (prefix == "scrub status") {
+    JSONFormatter f(true);
+    command_scrub_status(&f);
+    f.flush(*ds);
+    return true;
+  } else {
+    return false;
+  }
+}
+
+// Create a C_Drop_Cache request with the given timeout, formatter and
+// completion, and start it while holding mds_lock.
+void MDSRank::command_cache_drop(uint64_t timeout, Formatter *f, Context *on_finish)
+{
+  dout(20) << __func__ << dendl;
+
+  std::lock_guard guard(mds_lock);
+  auto *drop_request = new C_Drop_Cache(server, mdcache, mdlog, this,
+                                        timeout, f, on_finish);
+  drop_request->send();
+}
+
+// Return the epoch of the objecter's current OSDMap.
+epoch_t MDSRank::get_osd_epoch() const
+{
+  return objecter->with_osdmap([](const OSDMap &o) {
+    return o.get_epoch();
+  });
+}
+
+// Return the null-terminated list of configuration keys this observer
+// tracks.
+const char** MDSRankDispatcher::get_tracked_conf_keys() const
+{
+  static const char* tracked_keys[] = {
+    "clog_to_graylog",
+    "clog_to_graylog_host",
+    "clog_to_graylog_port",
+    "clog_to_monitors",
+    "clog_to_syslog",
+    "clog_to_syslog_facility",
+    "clog_to_syslog_level",
+    "fsid",
+    "host",
+    "mds_bal_fragment_dirs",
+    "mds_bal_fragment_interval",
+    "mds_cache_memory_limit",
+    "mds_cache_mid",
+    "mds_cache_reservation",
+    "mds_cache_size",
+    "mds_cache_trim_decay_rate",
+    "mds_cap_revoke_eviction_timeout",
+    "mds_dump_cache_threshold_file",
+    "mds_dump_cache_threshold_formatter",
+    "mds_enable_op_tracker",
+    "mds_health_cache_threshold",
+    "mds_inject_migrator_session_race",
+    "mds_log_pause",
+    "mds_max_export_size",
+    "mds_max_purge_files",
+    "mds_forward_all_requests_to_auth",
+    "mds_max_purge_ops",
+    "mds_max_purge_ops_per_pg",
+    "mds_max_snaps_per_dir",
+    "mds_op_complaint_time",
+    "mds_op_history_duration",
+    "mds_op_history_size",
+    "mds_op_log_threshold",
+    "mds_recall_max_decay_rate",
+    "mds_recall_warning_decay_rate",
+    "mds_request_load_average_decay_rate",
+    "mds_session_cache_liveness_decay_rate",
+    "mds_replay_unsafe_with_closed_session",
+    "mds_session_cap_acquisition_decay_rate",
+    "mds_max_caps_per_client",
+    "mds_session_cap_acquisition_throttle",
+    "mds_session_max_caps_throttle_ratio",
+    "mds_cap_acquisition_throttle_retry_request_time",
+    nullptr
+  };
+  return tracked_keys;
+}
+
+/**
+ * Configuration observer callback. Op tracker and cluster log settings are
+ * applied immediately; everything that needs mds_lock is queued on the
+ * finisher with a copy of the changed-key set.
+ */
+void MDSRankDispatcher::handle_conf_change(const ConfigProxy& conf, const std::set<std::string>& changed)
+{
+  // XXX with or without mds_lock!
+
+  const auto touched = [&changed](const char *key) {
+    return changed.count(key) != 0;
+  };
+
+  if (touched("mds_op_complaint_time") || touched("mds_op_log_threshold")) {
+    op_tracker.set_complaint_and_threshold(conf->mds_op_complaint_time, conf->mds_op_log_threshold);
+  }
+  if (touched("mds_op_history_size") || touched("mds_op_history_duration")) {
+    op_tracker.set_history_size_and_duration(conf->mds_op_history_size, conf->mds_op_history_duration);
+  }
+  if (touched("mds_enable_op_tracker")) {
+    op_tracker.set_tracking(conf->mds_enable_op_tracker);
+  }
+
+  // Any of these keys requires the cluster log client to be reconfigured.
+  static const char *const log_client_keys[] = {
+    "clog_to_monitors",
+    "clog_to_syslog",
+    "clog_to_syslog_level",
+    "clog_to_syslog_facility",
+    "clog_to_graylog",
+    "clog_to_graylog_host",
+    "clog_to_graylog_port",
+    "host",
+    "fsid",
+  };
+  for (const char *key : log_client_keys) {
+    if (touched(key)) {
+      update_log_config();
+      break;
+    }
+  }
+
+  finisher->queue(new FunctionContext([this, changed](int) {
+    std::scoped_lock lock(mds_lock);
+
+    if (changed.count("mds_log_pause") && !g_conf()->mds_log_pause) {
+      mdlog->kick_submitter();
+    }
+    sessionmap.handle_conf_change(changed);
+    server->handle_conf_change(changed);
+    mdcache->handle_conf_change(changed, *mdsmap);
+    purge_queue.handle_conf_change(changed, *mdsmap);
+  }));
+}
+
+// Add the scrub stack's summary to `status` under SCRUB_STATUS_KEY.
+void MDSRank::get_task_status(std::map<std::string, std::string> *status)
+{
+  dout(20) << __func__ << dendl;
+
+  // scrub summary for now..
+  std::string_view summary = scrubstack->scrub_summary();
+  status->emplace(SCRUB_STATUS_KEY, summary);
+}
+
+// Arm a timer event that calls send_task_status() after the interval
+// configured as "mds_task_status_update_interval".
+void MDSRank::schedule_update_timer_task()
+{
+  dout(20) << __func__ << dendl;
+
+  const auto interval = g_conf().get_val<double>("mds_task_status_update_interval");
+  auto *tick = new FunctionContext([this](int) {
+    send_task_status();
+  });
+  timer.add_event_after(interval, tick);
+}
+
+/**
+ * Collect the task status map and, when it is not empty, pass it to the
+ * mgr client. A failed update is logged but not propagated. The next
+ * update is scheduled in every case.
+ */
+void MDSRank::send_task_status() {
+  std::map<std::string, std::string> status;
+  get_task_status(&status);
+
+  if (!status.empty()) {
+    dout(20) << __func__ << ": updating " << status.size() << " status keys" << dendl;
+
+    // `status` is moved from here and not used afterwards.
+    int r = mgrc->service_daemon_update_task_status(std::move(status));
+    if (r < 0) {
+      // Prefix with the function name, matching the debug message above.
+      derr << __func__ << ": failed to update service daemon status: " << cpp_strerror(r) << dendl;
+    }
+  }
+
+  schedule_update_timer_task();
+}
diff --git a/src/mds/MDSRank.h b/src/mds/MDSRank.h
new file mode 100644
index 00000000..c10bad23
--- /dev/null
+++ b/src/mds/MDSRank.h
@@ -0,0 +1,673 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2015 Red Hat
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#ifndef MDS_RANK_H_
+#define MDS_RANK_H_
+
+#include <string_view>
+
+#include "common/DecayCounter.h"
+#include "common/LogClient.h"
+#include "common/Timer.h"
+#include "common/TrackedOp.h"
+
+#include "messages/MClientRequest.h"
+#include "messages/MCommand.h"
+#include "messages/MMDSMap.h"
+
+#include "Beacon.h"
+#include "DamageTable.h"
+#include "MDSMap.h"
+#include "SessionMap.h"
+#include "MDCache.h"
+#include "MDLog.h"
+#include "MDSContext.h"
+#include "PurgeQueue.h"
+#include "Server.h"
+#include "osdc/Journaler.h"
+
+// Full .h import instead of forward declaration for PerfCounter, for the
+// benefit of those including this header and using MDSRank::logger
+#include "common/perf_counters.h"
+
+ // Indices of the per-rank MDS performance counters; they are passed to the
+ // PerfCounters object behind MDSRank::logger (e.g. get_num_requests() reads
+ // logger->get(l_mds_request)). Numbering starts at l_mds_first (2000).
+ // NOTE(review): l_mds_first/l_mds_last look like range sentinels rather than
+ // real counters -- confirm against create_logger().
+ enum {
+ l_mds_first = 2000,
+ l_mds_request,
+ l_mds_reply,
+ l_mds_reply_latency,
+ l_mds_forward,
+ l_mds_dir_fetch,
+ l_mds_dir_commit,
+ l_mds_dir_split,
+ l_mds_dir_merge,
+ l_mds_inode_max,
+ l_mds_inodes,
+ l_mds_inodes_top,
+ l_mds_inodes_bottom,
+ l_mds_inodes_pin_tail,
+ l_mds_inodes_pinned,
+ l_mds_inodes_expired,
+ l_mds_inodes_with_caps,
+ l_mds_caps,
+ l_mds_subtrees,
+ l_mds_traverse,
+ l_mds_traverse_hit,
+ l_mds_traverse_forward,
+ l_mds_traverse_discover,
+ l_mds_traverse_dir_fetch,
+ l_mds_traverse_remote_ino,
+ l_mds_traverse_lock,
+ l_mds_load_cent,
+ l_mds_dispatch_queue_len,
+ l_mds_exported,
+ l_mds_exported_inodes,
+ l_mds_imported,
+ l_mds_imported_inodes,
+ l_mds_openino_dir_fetch,
+ l_mds_openino_backtrace_fetch,
+ l_mds_openino_peer_discover,
+ l_mds_root_rfiles,
+ l_mds_root_rbytes,
+ l_mds_root_rsnaps,
+ l_mds_last,
+ };
+
+ // Memory utilization counter indices. Numbering starts at l_mdm_first
+ // (2500), separate from the l_mds_* range above.
+ // NOTE(review): presumably these are served by MDSRank::mlogger -- confirm.
+ enum {
+ l_mdm_first = 2500,
+ l_mdm_ino,
+ l_mdm_inoa,
+ l_mdm_inos,
+ l_mdm_dir,
+ l_mdm_dira,
+ l_mdm_dirs,
+ l_mdm_dn,
+ l_mdm_dna,
+ l_mdm_dns,
+ l_mdm_cap,
+ l_mdm_capa,
+ l_mdm_caps,
+ l_mdm_rss,
+ l_mdm_heap,
+ l_mdm_last,
+ };
+
+namespace ceph {
+ struct heartbeat_handle_d;
+}
+
+class Locker;
+class MDCache;
+class MDLog;
+class MDBalancer;
+class InoTable;
+class SnapServer;
+class SnapClient;
+class MDSTableServer;
+class MDSTableClient;
+class Messenger;
+class Objecter;
+class MonClient;
+class MgrClient;
+class Finisher;
+class ScrubStack;
+class C_MDS_Send_Command_Reply;
+class C_ExecAndReply;
+
+/**
+ * The public part of this class's interface is what's exposed to all
+ * the various subsystems (server, mdcache, etc), such as pointers
+ * to the other subsystems, and message-sending calls.
+ */
+class MDSRank {
+ protected:
+ const mds_rank_t whoami;
+
+ // Incarnation as seen in MDSMap at the point where a rank is
+ // assigned.
+ int incarnation;
+
+ public:
+
+ friend class C_Flush_Journal;
+ friend class C_Drop_Cache;
+
+ friend class C_CacheDropExecAndReply;
+ friend class C_ScrubExecAndReply;
+ friend class C_ScrubControlExecAndReply;
+
+ mds_rank_t get_nodeid() const { return whoami; }
+ int64_t get_metadata_pool();
+
+ // Reference to global MDS::mds_lock, so that users of MDSRank don't
+ // carry around references to the outer MDS, and we can substitute
+ // a separate lock here in future potentially.
+ Mutex &mds_lock;
+
+ mono_time get_starttime() const {
+ return starttime;
+ }
+ // Time elapsed on the monotonic clock since `starttime`, as a
+ // floating-point duration.
+ chrono::duration<double> get_uptime() const {
+   return chrono::duration<double>(mono_clock::now() - starttime);
+ }
+
+ class CephContext *cct;
+
+ bool is_daemon_stopping() const;
+
+ // Reference to global cluster log client, just to avoid initialising
+ // a separate one here.
+ LogChannelRef &clog;
+
+ // Reference to global timer utility, because MDSRank and MDSDaemon
+ // currently both use the same mds_lock, so it makes sense for them
+ // to share a timer.
+ SafeTimer &timer;
+
+ std::unique_ptr<MDSMap> &mdsmap; /* MDSDaemon::mdsmap */
+
+ Objecter *objecter;
+
+ // sub systems
+ Server *server;
+ MDCache *mdcache;
+ Locker *locker;
+ MDLog *mdlog;
+ MDBalancer *balancer;
+ ScrubStack *scrubstack;
+ DamageTable damage_table;
+
+
+ InoTable *inotable;
+
+ SnapServer *snapserver;
+ SnapClient *snapclient;
+
+ MDSTableClient *get_table_client(int t);
+ MDSTableServer *get_table_server(int t);
+
+ SessionMap sessionmap;
+ Session *get_session(client_t client) {
+ return sessionmap.get_session(entity_name_t::CLIENT(client.v));
+ }
+ Session *get_session(const Message::const_ref &m);
+
+ PerfCounters *logger, *mlogger;
+ OpTracker op_tracker;
+
+ // The last different state I held before current
+ MDSMap::DaemonState last_state;
+ // The state assigned to me by the MDSMap
+ MDSMap::DaemonState state;
+
+ bool cluster_degraded;
+
+ MDSMap::DaemonState get_state() const { return state; }
+ MDSMap::DaemonState get_want_state() const { return beacon.get_want_state(); }
+
+ bool is_creating() const { return state == MDSMap::STATE_CREATING; }
+ bool is_starting() const { return state == MDSMap::STATE_STARTING; }
+ bool is_standby() const { return state == MDSMap::STATE_STANDBY; }
+ bool is_replay() const { return state == MDSMap::STATE_REPLAY; }
+ bool is_standby_replay() const { return state == MDSMap::STATE_STANDBY_REPLAY; }
+ bool is_resolve() const { return state == MDSMap::STATE_RESOLVE; }
+ bool is_reconnect() const { return state == MDSMap::STATE_RECONNECT; }
+ bool is_rejoin() const { return state == MDSMap::STATE_REJOIN; }
+ bool is_clientreplay() const { return state == MDSMap::STATE_CLIENTREPLAY; }
+ bool is_active() const { return state == MDSMap::STATE_ACTIVE; }
+ bool is_stopping() const { return state == MDSMap::STATE_STOPPING; }
+ bool is_any_replay() const { return (is_replay() || is_standby_replay()); }
+ bool is_stopped() const { return mdsmap->is_stopped(whoami); }
+ bool is_cluster_degraded() const { return cluster_degraded; }
+ bool allows_multimds_snaps() const { return mdsmap->allows_multimds_snaps(); }
+
+ bool is_cache_trimmable() const {
+ return is_clientreplay() || is_active() || is_stopping();
+ }
+
+ void handle_write_error(int err);
+
+ void update_mlogger();
+ protected:
+ // Flag to indicate we entered shutdown: anyone seeing this to be true
+ // after taking mds_lock must drop out.
+ bool stopping;
+
+ // PurgeQueue is only used by StrayManager, but it is owned by MDSRank
+ // because its init/shutdown happens at the top level.
+ PurgeQueue purge_queue;
+
+ // Thread owned by MDSRank (member `progress_thread`). It keeps a
+ // back-pointer to the rank and a condition variable; the queue_waiter*()
+ // helpers call signal() after adding work to finished_queue.
+ // entry() and shutdown() are defined out of line.
+ class ProgressThread : public Thread {
+ MDSRank *mds;
+ Cond cond;
+ public:
+ explicit ProgressThread(MDSRank *mds_) : mds(mds_) {}
+ void * entry() override;
+ void shutdown();
+ // Signal the condition variable.
+ void signal() {cond.Signal();}
+ } progress_thread;
+
+ list<Message::const_ref> waiting_for_nolaggy;
+ MDSContext::que finished_queue;
+ // Dispatch, retry, queues
+ int dispatch_depth;
+ void inc_dispatch_depth() { ++dispatch_depth; }
+ void dec_dispatch_depth() { --dispatch_depth; }
+ void retry_dispatch(const Message::const_ref &m);
+ bool is_valid_message(const Message::const_ref &m);
+ void handle_message(const Message::const_ref &m);
+ void _advance_queues();
+ bool _dispatch(const Message::const_ref &m, bool new_msg);
+
+ ceph::heartbeat_handle_d *hb; // Heartbeat for threads using mds_lock
+
+ bool is_stale_message(const Message::const_ref &m) const;
+
+ map<mds_rank_t, version_t> peer_mdsmap_epoch;
+
+ ceph_tid_t last_tid; // for mds-initiated requests (e.g. stray rename)
+
+ MDSContext::vec waiting_for_active, waiting_for_replay, waiting_for_rejoin,
+ waiting_for_reconnect, waiting_for_resolve;
+ MDSContext::vec waiting_for_any_client_connection;
+ MDSContext::que replay_queue;
+ bool replaying_requests_done = false;
+
+ map<mds_rank_t, MDSContext::vec > waiting_for_active_peer;
+ map<epoch_t, MDSContext::vec > waiting_for_mdsmap;
+
+ epoch_t osd_epoch_barrier;
+
+ // Reference to the beacon (not const-qualified) so that we can behave
+ // differently when it's laggy.
+ Beacon &beacon;
+
+ /**
+ * Emit clog warnings for any ops reported as warnings by optracker
+ */
+ void check_ops_in_flight();
+
+ int mds_slow_req_count;
+
+ /**
+ * Share MDSMap with clients
+ */
+ void bcast_mds_map(); // to mounted clients
+ epoch_t last_client_mdsmap_bcast;
+
+ map<mds_rank_t,DecayCounter> export_targets; /* targets this MDS is exporting to or wants/tries to */
+
+ void create_logger();
+ public:
+ void queue_waiter(MDSContext *c) {
+ finished_queue.push_back(c);
+ progress_thread.signal();
+ }
+ void queue_waiter_front(MDSContext *c) {
+ finished_queue.push_front(c);
+ progress_thread.signal();
+ }
+ // Move every context out of `ls` (leaving it empty) and append them, in
+ // order, to the back of finished_queue; then wake the progress thread.
+ void queue_waiters(MDSContext::vec& ls) {
+   MDSContext::vec taken;
+   taken.swap(ls);
+   for (auto it = taken.begin(); it != taken.end(); ++it) {
+     finished_queue.push_back(*it);
+   }
+   progress_thread.signal();
+ }
+ // Move every context out of `ls` (leaving it empty) and place them at the
+ // front of finished_queue. Pushing in reverse keeps their relative order,
+ // so ls[0] ends up first in the queue. Then wake the progress thread.
+ void queue_waiters_front(MDSContext::vec& ls) {
+   MDSContext::vec taken;
+   taken.swap(ls);
+   for (auto it = taken.rbegin(); it != taken.rend(); ++it) {
+     finished_queue.push_front(*it);
+   }
+   progress_thread.signal();
+ }
+
+ MDSRank(
+ mds_rank_t whoami_,
+ Mutex &mds_lock_,
+ LogChannelRef &clog_,
+ SafeTimer &timer_,
+ Beacon &beacon_,
+ std::unique_ptr<MDSMap> & mdsmap_,
+ Messenger *msgr,
+ MonClient *monc_,
+ MgrClient *mgrc,
+ Context *respawn_hook_,
+ Context *suicide_hook_);
+
+ protected:
+ ~MDSRank();
+
+ public:
+
+ // Daemon lifetime functions: these guys break the abstraction
+ // and call up into the parent MDSDaemon instance. It's kind
+ // of unavoidable: if we want any depth into our calls
+ // to be able to e.g. tear down the whole process, we have to
+ // have a reference going all the way down.
+ // >>>
+ void suicide();
+ void respawn();
+ // <<<
+
+ /**
+ * Call this periodically if inside a potentially long running piece
+ * of code while holding the mds_lock
+ */
+ void heartbeat_reset();
+
+ /**
+ * Report state DAMAGED to the mon, and then pass on to respawn(). Call
+ * this when an unrecoverable error is encountered while attempting
+ * to load an MDS rank's data structures. This is *not* for use with
+ * errors affecting normal dirfrag/inode objects -- they should be handled
+ * through cleaner scrub/repair mechanisms.
+ *
+ * Callers must already hold mds_lock.
+ */
+ void damaged();
+
+ /**
+ * Wrapper around `damaged` for users who are not
+ * already holding mds_lock.
+ *
+ * Callers must not already hold mds_lock.
+ */
+ void damaged_unlocked();
+
+ double last_cleared_laggy() const {
+ return beacon.last_cleared_laggy();
+ }
+
+ double get_dispatch_queue_max_age(utime_t now) const;
+
+ void send_message_mds(const Message::ref& m, mds_rank_t mds);
+ void forward_message_mds(const MClientRequest::const_ref& req, mds_rank_t mds);
+ void send_message_client_counted(const Message::ref& m, client_t client);
+ void send_message_client_counted(const Message::ref& m, Session* session);
+ void send_message_client_counted(const Message::ref& m, const ConnectionRef& connection);
+ void send_message_client(const Message::ref& m, Session* session);
+ void send_message(const Message::ref& m, const ConnectionRef& c);
+
+ void wait_for_active_peer(mds_rank_t who, MDSContext *c) {
+ waiting_for_active_peer[who].push_back(c);
+ }
+ void wait_for_cluster_recovered(MDSContext *c) {
+ ceph_assert(cluster_degraded);
+ waiting_for_active_peer[MDS_RANK_NONE].push_back(c);
+ }
+
+ void wait_for_any_client_connection(MDSContext *c) {
+ waiting_for_any_client_connection.push_back(c);
+ }
+ void kick_waiters_for_any_client_connection(void) {
+ finish_contexts(g_ceph_context, waiting_for_any_client_connection);
+ }
+ void wait_for_active(MDSContext *c) {
+ waiting_for_active.push_back(c);
+ }
+ void wait_for_replay(MDSContext *c) {
+ waiting_for_replay.push_back(c);
+ }
+ void wait_for_rejoin(MDSContext *c) {
+ waiting_for_rejoin.push_back(c);
+ }
+ void wait_for_reconnect(MDSContext *c) {
+ waiting_for_reconnect.push_back(c);
+ }
+ void wait_for_resolve(MDSContext *c) {
+ waiting_for_resolve.push_back(c);
+ }
+ void wait_for_mdsmap(epoch_t e, MDSContext *c) {
+ waiting_for_mdsmap[e].push_back(c);
+ }
+ void enqueue_replay(MDSContext *c) {
+ replay_queue.push_back(c);
+ }
+
+ bool queue_one_replay();
+ void maybe_clientreplay_done();
+
+ void set_osd_epoch_barrier(epoch_t e);
+ epoch_t get_osd_epoch_barrier() const {return osd_epoch_barrier;}
+ epoch_t get_osd_epoch() const;
+
+ ceph_tid_t issue_tid() { return ++last_tid; }
+
+ Finisher *finisher;
+
+ MDSMap *get_mds_map() { return mdsmap.get(); }
+
+ uint64_t get_num_requests() const { return logger->get(l_mds_request); }
+
+ int get_mds_slow_req_count() const { return mds_slow_req_count; }
+
+ void dump_status(Formatter *f) const;
+
+ void hit_export_target(mds_rank_t rank, double amount=-1.0);
+ // True when `rank` is listed in the export_targets that the current MDSMap
+ // records for this rank.
+ bool is_export_target(mds_rank_t rank) {
+   const auto& my_info = mdsmap->get_mds_info(get_nodeid());
+   return my_info.export_targets.count(rank) != 0;
+ }
+
+ bool evict_client(int64_t session_id, bool wait, bool blacklist,
+ std::ostream& ss, Context *on_killed=nullptr);
+ int config_client(int64_t session_id, bool remove,
+ const std::string& option, const std::string& value,
+ std::ostream& ss);
+
+ void mark_base_recursively_scrubbed(inodeno_t ino);
+
+ protected:
+ void dump_clientreplay_status(Formatter *f) const;
+ void command_scrub_start(Formatter *f,
+ std::string_view path, std::string_view tag,
+ const vector<string>& scrubop_vec, Context *on_finish);
+ void command_tag_path(Formatter *f, std::string_view path,
+ std::string_view tag);
+ // scrub control commands
+ void command_scrub_abort(Formatter *f, Context *on_finish);
+ void command_scrub_pause(Formatter *f, Context *on_finish);
+ void command_scrub_resume(Formatter *f);
+ void command_scrub_status(Formatter *f);
+
+ void command_flush_path(Formatter *f, std::string_view path);
+ void command_flush_journal(Formatter *f);
+ void command_get_subtrees(Formatter *f);
+ void command_export_dir(Formatter *f,
+ std::string_view path, mds_rank_t dest);
+ bool command_dirfrag_split(
+ cmdmap_t cmdmap,
+ std::ostream &ss);
+ bool command_dirfrag_merge(
+ cmdmap_t cmdmap,
+ std::ostream &ss);
+ bool command_dirfrag_ls(
+ cmdmap_t cmdmap,
+ std::ostream &ss,
+ Formatter *f);
+ int _command_export_dir(std::string_view path, mds_rank_t dest);
+ CDir *_command_dirfrag_get(
+ const cmdmap_t &cmdmap,
+ std::ostream &ss);
+ void command_openfiles_ls(Formatter *f);
+ void command_dump_tree(const cmdmap_t &cmdmap, std::ostream &ss, Formatter *f);
+ void command_dump_inode(Formatter *f, const cmdmap_t &cmdmap, std::ostream &ss);
+ void command_cache_drop(uint64_t timeout, Formatter *f, Context *on_finish);
+
+ protected:
+ Messenger *messenger;
+ MonClient *monc;
+ MgrClient *mgrc;
+
+ Context *respawn_hook;
+ Context *suicide_hook;
+
+ // Friended to access retry_dispatch
+ friend class C_MDS_RetryMessage;
+
+ // FIXME the state machine logic should be separable from the dispatch
+ // logic that calls it.
+ // >>>
+ void calc_recovery_set();
+ void request_state(MDSMap::DaemonState s);
+
+ bool standby_replaying; // true if current replay pass is in standby-replay mode
+
+ typedef enum {
+ // The MDSMap is available, configure default layouts and structures
+ MDS_BOOT_INITIAL = 0,
+ // We are ready to open some inodes
+ MDS_BOOT_OPEN_ROOT,
+ // We are ready to do a replay if needed
+ MDS_BOOT_PREPARE_LOG,
+ // Replay is complete
+ MDS_BOOT_REPLAY_DONE
+ } BootStep;
+ friend class C_MDS_BootStart;
+ friend class C_MDS_InternalBootStart;
+ void boot_create(); // i am new mds.
+ void boot_start(BootStep step=MDS_BOOT_INITIAL, int r=0); // starting|replay
+
+ void replay_start();
+ void creating_done();
+ void starting_done();
+ void replay_done();
+ void standby_replay_restart();
+ void _standby_replay_restart_finish(int r, uint64_t old_read_pos);
+ class C_MDS_StandbyReplayRestart;
+ class C_MDS_StandbyReplayRestartFinish;
+
+ void reopen_log();
+
+ void resolve_start();
+ void resolve_done();
+ void reconnect_start();
+ void reconnect_done();
+ void rejoin_joint_start();
+ void rejoin_start();
+ void rejoin_done();
+ void recovery_done(int oldstate);
+ void clientreplay_start();
+ void clientreplay_done();
+ void active_start();
+ void stopping_start();
+ void stopping_done();
+
+ void validate_sessions();
+ // <<<
+
+ // >>>
+ void handle_mds_recovery(mds_rank_t who);
+ void handle_mds_failure(mds_rank_t who);
+ // <<<
+
+ /* Update MDSMap export_targets for this rank. Called on ::tick(). */
+ void update_targets();
+
+ friend class C_MDS_MonCommand;
+ void _mon_command_finish(int r, std::string_view cmd, std::string_view outs);
+ void set_mdsmap_multimds_snaps_allowed();
+private:
+ mono_time starttime = mono_clock::zero();
+
+ // "task" string that gets displayed in ceph status
+ inline static const std::string SCRUB_STATUS_KEY = "scrub status";
+
+ void get_task_status(std::map<std::string, std::string> *status);
+ void schedule_update_timer_task();
+ void send_task_status();
+
+protected:
+ Context *create_async_exec_context(C_ExecAndReply *ctx);
+};
+
+ /* Context that re-dispatches a message at a later point.
+  *
+  * It keeps its own Message::const_ref to the message and, when finished,
+  * passes it to MDSRank::retry_dispatch(). The result code given to
+  * finish() is not used.
+  *
+  * Original note: this expects to be given a reference which it is
+  * responsible for; the finish function calls functions which will put
+  * the Message exactly once.
+  */
+ class C_MDS_RetryMessage : public MDSInternalContext {
+ public:
+ C_MDS_RetryMessage(MDSRank *mds, const Message::const_ref &m)
+ : MDSInternalContext(mds), m(m) {}
+ void finish(int r) override {
+ get_mds()->retry_dispatch(m);
+ }
+ protected:
+ Message::const_ref m;
+ };
+
+ // Factory for C_MDS_RetryMessage contexts: every build() call allocates a
+ // new context bound to the same rank and message captured at construction.
+ class CF_MDS_RetryMessageFactory : public MDSContextFactory {
+ public:
+ CF_MDS_RetryMessageFactory(MDSRank *mds, const Message::const_ref &m)
+ : mds(mds), m(m) {}
+
+ // Returns a context allocated with new; the factory keeps no pointer to it.
+ MDSContext *build() {
+ return new C_MDS_RetryMessage(mds, m);
+ }
+
+ private:
+ MDSRank *mds;
+ Message::const_ref m;
+ };
+
+/**
+ * The aspect of MDSRank exposed to MDSDaemon but not subsystems: i.e.
+ * the service/dispatcher stuff like init/shutdown that subsystems should
+ * never touch.
+ */
+class MDSRankDispatcher : public MDSRank, public md_config_obs_t
+{
+public:
+ void init();
+ void tick();
+ void shutdown();
+ bool handle_asok_command(std::string_view command, const cmdmap_t& cmdmap,
+ Formatter *f, std::ostream& ss);
+ void handle_mds_map(const MMDSMap::const_ref &m, const MDSMap &oldmap);
+ void handle_osd_map();
+ void update_log_config();
+
+ const char** get_tracked_conf_keys() const override final;
+ void handle_conf_change(const ConfigProxy& conf, const std::set<std::string>& changed) override;
+
+ bool handle_command(
+ const cmdmap_t &cmdmap,
+ const MCommand::const_ref &m,
+ int *r,
+ std::stringstream *ds,
+ std::stringstream *ss,
+ Context **run_later,
+ bool *need_reply);
+
+ void dump_sessions(const SessionFilter &filter, Formatter *f) const;
+ void evict_clients(const SessionFilter &filter, const MCommand::const_ref &m);
+
+ // Call into me from MDS::ms_dispatch
+ bool ms_dispatch(const Message::const_ref &m);
+
+ MDSRankDispatcher(
+ mds_rank_t whoami_,
+ Mutex &mds_lock_,
+ LogChannelRef &clog_,
+ SafeTimer &timer_,
+ Beacon &beacon_,
+ std::unique_ptr<MDSMap> &mdsmap_,
+ Messenger *msgr,
+ MonClient *monc_,
+ MgrClient *mgrc,
+ Context *respawn_hook_,
+ Context *suicide_hook_);
+};
+
+#endif // MDS_RANK_H_
+
diff --git a/src/mds/MDSTable.cc b/src/mds/MDSTable.cc
new file mode 100644
index 00000000..b0809f50
--- /dev/null
+++ b/src/mds/MDSTable.cc
@@ -0,0 +1,201 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#include "MDSTable.h"
+
+#include "MDSRank.h"
+#include "MDLog.h"
+
+#include "osdc/Filer.h"
+
+#include "include/types.h"
+
+#include "common/config.h"
+#include "common/errno.h"
+#include "common/Finisher.h"
+
+#include "include/ceph_assert.h"
+
+
+#define dout_context g_ceph_context
+#define dout_subsys ceph_subsys_mds
+#undef dout_prefix
+#define dout_prefix *_dout << "mds." << rank << "." << table_name << ": "
+
+
+ // Base for the table I/O completions below: remembers the MDSTable the
+ // operation belongs to and resolves the owning rank through it.
+ class MDSTableIOContext : public MDSIOContextBase
+ {
+ protected:
+   MDSTable *ida;
+   MDSRank *get_mds() override { return ida->mds; }
+ public:
+   explicit MDSTableIOContext(MDSTable *table) : ida(table) {
+     ceph_assert(ida != nullptr);
+   }
+ };
+
+
+ // Completion for MDSTable::save(): forwards the result code and the
+ // version that was written to MDSTable::save_2().
+ class C_IO_MT_Save : public MDSTableIOContext {
+   version_t saved_version;
+ public:
+   C_IO_MT_Save(MDSTable *table, version_t ver)
+     : MDSTableIOContext(table), saved_version(ver) {}
+   void finish(int r) override {
+     ida->save_2(r, saved_version);
+   }
+   void print(ostream& out) const override {
+     out << "table_save(" << ida->table_name << ")";
+   }
+ };
+
+ // Write the current table version and state to the table object.
+ //
+ // onfinish: optional context, queued in waitfor_save until the matching
+ //           version has been saved.
+ // v:        version the caller needs on disk. When non-zero and already
+ //           covered by the save in flight, no new write is issued.
+ void MDSTable::save(MDSContext *onfinish, version_t v)
+ {
+   const bool already_saving = (v > 0) && (v <= committing_version);
+   if (already_saving) {
+     dout(10) << "save v " << version << " - already saving "
+              << committing_version << " >= needed " << v << dendl;
+     if (onfinish != nullptr) {
+       waitfor_save[v].push_back(onfinish);
+     }
+     return;
+   }
+
+   dout(10) << "save v " << version << dendl;
+   ceph_assert(is_active());
+
+   // Payload is the version followed by the subclass-encoded state.
+   bufferlist payload;
+   encode(version, payload);
+   encode_state(payload);
+
+   committing_version = version;
+
+   if (onfinish != nullptr) {
+     waitfor_save[version].push_back(onfinish);
+   }
+
+   // write (async); the completion is wrapped with mds->finisher and ends
+   // in save_2()
+   auto *on_written = new C_OnFinisher(new C_IO_MT_Save(this, version),
+                                       mds->finisher);
+   SnapContext snapc;
+   object_t oid = get_object_name();
+   object_locator_t oloc(mds->mdsmap->get_metadata_pool());
+   mds->objecter->write_full(oid, oloc, snapc, payload,
+                             ceph::real_clock::now(), 0, on_written);
+ }
+
+ // Completion of save().
+ //
+ // r: result of the object write. On failure the error is logged and handed
+ //    to MDSRank::handle_write_error(); no waiter is completed.
+ // v: the version that was written. On success it becomes committed_version
+ //    and every waiter registered for a version <= v is finished with 0.
+ void MDSTable::save_2(int r, version_t v)
+ {
+   if (r < 0) {
+     dout(1) << "save error " << r << " v " << v << dendl;
+     mds->clog->error() << "failed to store table " << table_name << " object,"
+                        << " errno " << r;
+     mds->handle_write_error(r);
+     return;
+   }
+
+   dout(10) << "save_2 v " << v << dendl;
+   committed_version = v;
+
+   // waitfor_save is ordered by version, so everything before
+   // upper_bound(v) is satisfied by this save. The waiter list gets its own
+   // name so that it does not shadow the version parameter `v`.
+   MDSContext::vec ls;
+   const auto last = waitfor_save.upper_bound(v);
+   for (auto it = waitfor_save.begin(); it != last; ++it) {
+     auto& waiters = it->second;
+     ls.insert(ls.end(), waiters.begin(), waiters.end());
+   }
+   waitfor_save.erase(waitfor_save.begin(), last);
+   finish_contexts(g_ceph_context, ls, 0);
+ }
+
+
+ // Reinitialise the table in memory: let the subclass reset its state, then
+ // align projected_version with version and mark the table active.
+ void MDSTable::reset()
+ {
+ reset_state();
+ // taken after reset_state() so it reflects whatever version that left
+ projected_version = version;
+ state = STATE_ACTIVE;
+ }
+
+
+
+// -----------------------
+
+ // Completion for MDSTable::load(): owns the buffer the read is directed
+ // into and passes it, with the result code and the caller's context, to
+ // MDSTable::load_2().
+ class C_IO_MT_Load : public MDSTableIOContext {
+ public:
+ Context *onfinish;
+ // destination buffer; load() hands &bl to the read request
+ bufferlist bl;
+ C_IO_MT_Load(MDSTable *i, Context *o) : MDSTableIOContext(i), onfinish(o) {}
+ void finish(int r) override {
+ ida->load_2(r, bl, onfinish);
+ }
+ void print(ostream& out) const override {
+ out << "table_load(" << ida->table_name << ")";
+ }
+ };
+
+ // Name of the object that stores this table:
+ //   "mds<rank>_<table_name>" for per-rank tables,
+ //   "mds_<table_name>"       otherwise.
+ //
+ // The name is assembled in a std::string rather than a fixed-size buffer,
+ // so a long table name can never be silently truncated.
+ object_t MDSTable::get_object_name() const
+ {
+   std::string name("mds");
+   if (per_mds)
+     name += std::to_string(int(rank));
+   name += '_';
+   name += table_name;
+   return object_t(name.c_str());
+ }
+
+void MDSTable::load(MDSContext *onfinish)
+{
+ dout(10) << "load" << dendl;
+
+ ceph_assert(is_undef());
+ state = STATE_OPENING;
+
+ C_IO_MT_Load *c = new C_IO_MT_Load(this, onfinish);
+ object_t oid = get_object_name();
+ object_locator_t oloc(mds->mdsmap->get_metadata_pool());
+ mds->objecter->read_full(oid, oloc, CEPH_NOSNAP, &c->bl, 0,
+ new C_OnFinisher(c, mds->finisher));
+}
+
+ // Completion of load().
+ //
+ // r:        result of the object read. -EBLACKLISTED triggers a respawn;
+ //           any other negative value marks the rank damaged.
+ // bl:       the bytes read; decoded as the version followed by the
+ //           subclass state.
+ // onfinish: optional context, completed with 0 once decoding succeeded.
+ void MDSTable::load_2(int r, bufferlist& bl, Context *onfinish)
+ {
+   ceph_assert(is_opening());
+   state = STATE_ACTIVE;
+   if (r == -EBLACKLISTED) {
+     mds->respawn();
+     return;
+   }
+   if (r < 0) {
+     derr << "load_2 could not read table: " << r << dendl;
+     mds->clog->error() << "error reading table object '" << get_object_name()
+                        << "' " << r << " (" << cpp_strerror(r) << ")";
+     mds->damaged();
+     ceph_assert(r >= 0); // Should be unreachable because damaged() calls respawn()
+   }
+
+   dout(10) << "load_2 got " << bl.length() << " bytes" << dendl;
+   auto p = bl.cbegin();
+
+   try {
+     decode(version, p);
+     projected_version = committed_version = version;
+     dout(10) << "load_2 loaded v" << version << dendl;
+     decode_state(p);
+   } catch (const buffer::error &e) {
+     mds->clog->error() << "error decoding table object '" << get_object_name()
+                        << "': " << e.what();
+     mds->damaged();
+     // Should be unreachable because damaged() calls respawn(). r is
+     // non-negative on this path, so an assertion on r could never fire;
+     // abort explicitly rather than continue with a half-decoded table.
+     ceph_abort_msg("damaged() returned after table decode failure");
+   }
+
+   if (onfinish) {
+     onfinish->complete(0);
+   }
+ }
diff --git a/src/mds/MDSTable.h b/src/mds/MDSTable.h
new file mode 100644
index 00000000..6ad52b20
--- /dev/null
+++ b/src/mds/MDSTable.h
@@ -0,0 +1,95 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#ifndef CEPH_MDSTABLE_H
+#define CEPH_MDSTABLE_H
+
+#include "mdstypes.h"
+#include "mds_table_types.h"
+#include "include/buffer_fwd.h"
+
+#include "MDSContext.h"
+
+class MDSRank;
+
+ // Base class for tables that are persisted as a single object: it tracks
+ // the table's versions and load state, while subclasses supply the
+ // encoding of the actual contents.
+ class MDSTable {
+ public:
+   MDSRank *mds;
+ protected:
+   std::string table_name;
+   bool per_mds;      // object name carries the rank when true
+   mds_rank_t rank;
+
+   // load state
+   static const int STATE_UNDEF = 0;
+   static const int STATE_OPENING = 1;
+   static const int STATE_ACTIVE = 2;
+   //static const int STATE_COMMITTING = 3;
+   int state;
+
+   version_t version, committing_version, committed_version, projected_version;
+
+   // contexts waiting for a given version to be saved
+   map<version_t, MDSContext::vec > waitfor_save;
+
+ public:
+   MDSTable(MDSRank *owner, std::string_view name, bool is_per_mds) :
+     mds(owner), table_name(name), per_mds(is_per_mds), rank(MDS_RANK_NONE),
+     state(STATE_UNDEF),
+     version(0), committing_version(0), committed_version(0), projected_version(0) {}
+   virtual ~MDSTable() = default;
+
+   void set_rank(mds_rank_t r) { rank = r; }
+
+   version_t get_version() const { return version; }
+   version_t get_committed_version() const { return committed_version; }
+   version_t get_committing_version() const { return committing_version; }
+   version_t get_projected_version() const { return projected_version; }
+
+   void force_replay_version(version_t v) {
+     version = projected_version = v;
+   }
+
+   //version_t project_version() { return ++projected_version; }
+   //version_t inc_version() { return ++version; }
+
+   // load/save from disk (hack)
+   bool is_undef() const { return state == STATE_UNDEF; }
+   bool is_active() const { return state == STATE_ACTIVE; }
+   bool is_opening() const { return state == STATE_OPENING; }
+
+   void reset();
+   void save(MDSContext *onfinish=nullptr, version_t need=0);
+   void save_2(int r, version_t v);
+
+   // saves the table (without a completion) if it is active
+   void shutdown() {
+     if (is_active()) save(nullptr);
+   }
+
+   object_t get_object_name() const;
+   void load(MDSContext *onfinish);
+   void load_2(int, bufferlist&, Context *onfinish);
+
+   // subclasses must override these
+   virtual void reset_state() = 0;
+   virtual void decode_state(bufferlist::const_iterator& p) = 0;
+   virtual void encode_state(bufferlist& bl) const = 0;
+
+   friend class C_IO_MT_Load;
+   friend class C_IO_MT_Save;
+ };
+
+#endif
diff --git a/src/mds/MDSTableClient.cc b/src/mds/MDSTableClient.cc
new file mode 100644
index 00000000..6418b130
--- /dev/null
+++ b/src/mds/MDSTableClient.cc
@@ -0,0 +1,264 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#include <iostream>
+
+#include "MDSMap.h"
+
+#include "MDSContext.h"
+#include "msg/Messenger.h"
+
+#include "MDSRank.h"
+#include "MDLog.h"
+#include "LogSegment.h"
+
+#include "MDSTableClient.h"
+#include "events/ETableClient.h"
+
+#include "common/config.h"
+
+#define dout_context g_ceph_context
+#define dout_subsys ceph_subsys_mds
+#undef dout_prefix
+#define dout_prefix *_dout << "mds." << mds->get_nodeid() << ".tableclient(" << get_mdstable_name(table) << ") "
+
+
+ // Log completion for a journaled ACK: once the entry is logged it calls
+ // MDSTableClient::_logged_ack() for the acknowledged tid.
+ class C_LoggedAck : public MDSLogContextBase {
+   MDSTableClient *client;
+   version_t acked_tid;
+   MDSRank *get_mds() override { return client->mds; }
+ public:
+   C_LoggedAck(MDSTableClient *a, version_t t) : client(a), acked_tid(t) {}
+   void finish(int r) override {
+     client->_logged_ack(acked_tid);
+   }
+ };
+
+
+ // Dispatch a table message from the table server, keyed on m->op.
+ // Messages that arrive before this rank reaches RESOLVE are either parked
+ // until resolve (when RESOLVE is the wanted state) or dropped.
+ void MDSTableClient::handle_request(const MMDSTableRequest::const_ref &m)
+ {
+   dout(10) << "handle_request " << *m << dendl;
+   ceph_assert(m->table == table);
+
+   if (mds->get_state() < MDSMap::STATE_RESOLVE) {
+     if (mds->get_want_state() == CEPH_MDS_STATE_RESOLVE) {
+       mds->wait_for_resolve(new C_MDS_RetryMessage(mds, m));
+     }
+     return;
+   }
+
+   version_t tid = m->get_tid();
+   uint64_t reqid = m->reqid;
+
+   switch (m->op) {
+   case TABLESERVER_OP_QUERY_REPLY:
+     handle_query_result(m);
+     break;
+
+   case TABLESERVER_OP_NOTIFY_PREP:
+     ceph_assert(g_conf()->mds_kill_mdstable_at != 9);
+     handle_notify_prep(m);
+     break;
+
+   case TABLESERVER_OP_AGREE:
+     if (auto pp = pending_prepare.find(reqid); pp != pending_prepare.end()) {
+       dout(10) << "got agree on " << reqid << " atid " << tid << dendl;
+
+       ceph_assert(g_conf()->mds_kill_mdstable_at != 3);
+
+       // Hand the results back through the caller's pointers, then move
+       // the request from "pending prepare" to "prepared".
+       _pending_prepare &prep = pp->second;
+       MDSContext *onfinish = prep.onfinish;
+       *prep.ptid = tid;
+       if (prep.pbl)
+         *prep.pbl = m->bl;
+       pending_prepare.erase(pp);
+       prepared_update[tid] = reqid;
+       if (onfinish) {
+         onfinish->complete(0);
+       }
+     }
+     else if (prepared_update.count(tid)) {
+       dout(10) << "got duplicated agree on " << reqid << " atid " << tid << dendl;
+       ceph_assert(prepared_update[tid] == reqid);
+       ceph_assert(!server_ready);
+     }
+     else if (pending_commit.count(tid)) {
+       dout(10) << "stray agree on " << reqid << " tid " << tid
+                << ", already committing, will resend COMMIT" << dendl;
+       ceph_assert(!server_ready);
+       // will re-send commit when receiving the server ready message
+     }
+     else {
+       dout(10) << "stray agree on " << reqid << " tid " << tid
+                << ", sending ROLLBACK" << dendl;
+       ceph_assert(!server_ready);
+       auto req = MMDSTableRequest::create(table, TABLESERVER_OP_ROLLBACK, 0, tid);
+       mds->send_message_mds(req, mds->get_mds_map()->get_tableserver());
+     }
+     break;
+
+   case TABLESERVER_OP_ACK:
+     if (auto pc = pending_commit.find(tid);
+         pc != pending_commit.end() &&
+         pc->second->pending_commit_tids[table].count(tid)) {
+       dout(10) << "got ack on tid " << tid << ", logging" << dendl;
+
+       ceph_assert(g_conf()->mds_kill_mdstable_at != 7);
+
+       // remove from committing list
+       pc->second->pending_commit_tids[table].erase(tid);
+       pending_commit.erase(pc);
+
+       // log ACK.
+       mds->mdlog->start_submit_entry(new ETableClient(table, TABLESERVER_OP_ACK, tid),
+                                      new C_LoggedAck(this, tid));
+     } else {
+       dout(10) << "got stray ack on tid " << tid << ", ignoring" << dendl;
+     }
+     break;
+
+   case TABLESERVER_OP_SERVER_READY:
+     ceph_assert(!server_ready);
+     server_ready = true;
+
+     // first contact: adopt the request id carried by the message
+     if (last_reqid == ~0ULL)
+       last_reqid = reqid;
+
+     resend_queries();
+     resend_prepares();
+     resend_commits();
+     break;
+
+   default:
+     ceph_abort_msg("unrecognized mds_table_client request op");
+   }
+ }
+
+
+ // Called once the ACK for `tid` has been journaled: requeue whoever was
+ // waiting on that ack and forget the waiter list.
+ void MDSTableClient::_logged_ack(version_t tid)
+ {
+   dout(10) << "_logged_ack " << tid << dendl;
+   // kick any waiters (LogSegment trim)
+   auto waiters = ack_waiters.find(tid);
+   if (waiters == ack_waiters.end())
+     return;
+
+   dout(15) << "kicking ack waiters on tid " << tid << dendl;
+   mds->queue_waiters(waiters->second);
+   ack_waiters.erase(waiters);
+ }
+
+ // Register a prepare for `mutation`.
+ //
+ // ptid/pbl: where the agreed tid and the reply payload are stored when the
+ //           server agrees (see handle_request).
+ // onfinish: completed at that point.
+ //
+ // While no request id is known yet the prepare is parked in
+ // waiting_for_reqid; while the server is not ready it is recorded but not
+ // sent.
+ void MDSTableClient::_prepare(bufferlist& mutation, version_t *ptid, bufferlist *pbl,
+                               MDSContext *onfinish)
+ {
+   if (last_reqid == ~0ULL) {
+     dout(10) << "tableserver is not ready yet, waiting for request id" << dendl;
+     waiting_for_reqid.emplace_back(onfinish, ptid, pbl, mutation);
+     return;
+   }
+
+   uint64_t reqid = ++last_reqid;
+   dout(10) << "_prepare " << reqid << dendl;
+
+   _pending_prepare &entry = pending_prepare[reqid];
+   entry.mutation = mutation;
+   entry.ptid = ptid;
+   entry.pbl = pbl;
+   entry.onfinish = onfinish;
+
+   if (!server_ready) {
+     dout(10) << "tableserver is not ready yet, deferring request" << dendl;
+     return;
+   }
+
+   // send message
+   auto req = MMDSTableRequest::create(table, TABLESERVER_OP_PREPARE, reqid);
+   req->bl = mutation;
+   mds->send_message_mds(req, mds->get_mds_map()->get_tableserver());
+ }
+
+ // Commit a previously prepared update: move `tid` from prepared_update to
+ // pending_commit (tracked on log segment `ls`), notify, and send COMMIT to
+ // the table server if it is ready.
+ void MDSTableClient::commit(version_t tid, LogSegment *ls)
+ {
+   dout(10) << "commit " << tid << dendl;
+
+   ceph_assert(prepared_update.count(tid));
+   prepared_update.erase(tid);
+
+   ceph_assert(pending_commit.count(tid) == 0);
+   pending_commit[tid] = ls;
+   ls->pending_commit_tids[table].insert(tid);
+
+   notify_commit(tid);
+
+   ceph_assert(g_conf()->mds_kill_mdstable_at != 4);
+
+   if (!server_ready) {
+     dout(10) << "tableserver is not ready yet, deferring request" << dendl;
+     return;
+   }
+
+   // send message
+   auto req = MMDSTableRequest::create(table, TABLESERVER_OP_COMMIT, 0, tid);
+   mds->send_message_mds(req, mds->get_mds_map()->get_tableserver());
+ }
+
+
+
+// recovery
+
+ // Recovery: an agree for `tid` was found in the journal. Track it as a
+ // pending commit on `ls` and notify.
+ void MDSTableClient::got_journaled_agree(version_t tid, LogSegment *ls)
+ {
+   dout(10) << "got_journaled_agree " << tid << dendl;
+
+   pending_commit[tid] = ls;
+   ls->pending_commit_tids[table].insert(tid);
+
+   notify_commit(tid);
+ }
+
+ // Recovery: an ack for `tid` was found in the journal. Drop the matching
+ // pending commit, if there is one.
+ void MDSTableClient::got_journaled_ack(version_t tid)
+ {
+   dout(10) << "got_journaled_ack " << tid << dendl;
+
+   auto pc = pending_commit.find(tid);
+   if (pc == pending_commit.end())
+     return;
+
+   pc->second->pending_commit_tids[table].erase(tid);
+   pending_commit.erase(pc);
+ }
+
+ // Send a COMMIT to the table server for every tid in pending_commit.
+ void MDSTableClient::resend_commits()
+ {
+   for (const auto& commit_entry : pending_commit) {
+     dout(10) << "resending commit on " << commit_entry.first << dendl;
+     auto req = MMDSTableRequest::create(table, TABLESERVER_OP_COMMIT, 0, commit_entry.first);
+     mds->send_message_mds(req, mds->get_mds_map()->get_tableserver());
+   }
+ }
+
+ // Assign request ids to the prepares parked in waiting_for_reqid, then
+ // send a PREPARE for every entry in pending_prepare.
+ void MDSTableClient::resend_prepares()
+ {
+   for (; !waiting_for_reqid.empty(); waiting_for_reqid.pop_front()) {
+     pending_prepare[++last_reqid] = waiting_for_reqid.front();
+   }
+
+   for (const auto& prep_entry : pending_prepare) {
+     dout(10) << "resending prepare on " << prep_entry.first << dendl;
+     auto req = MMDSTableRequest::create(table, TABLESERVER_OP_PREPARE, prep_entry.first);
+     req->bl = prep_entry.second.mutation;
+     mds->send_message_mds(req, mds->get_mds_map()->get_tableserver());
+   }
+ }
+
+ // React to the failure of rank `who`: only the table server matters here,
+ // in which case the server is marked not ready.
+ void MDSTableClient::handle_mds_failure(mds_rank_t who)
+ {
+   const bool is_tableserver = (who == mds->get_mds_map()->get_tableserver());
+   if (is_tableserver) {
+     dout(7) << "tableserver mds." << who << " fails" << dendl;
+     server_ready = false;
+   }
+   // any other rank: do nothing.
+ }
diff --git a/src/mds/MDSTableClient.h b/src/mds/MDSTableClient.h
new file mode 100644
index 00000000..f2bf461a
--- /dev/null
+++ b/src/mds/MDSTableClient.h
@@ -0,0 +1,103 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#ifndef CEPH_MDSTABLECLIENT_H
+#define CEPH_MDSTABLECLIENT_H
+
+#include "include/types.h"
+#include "MDSContext.h"
+#include "mds_table_types.h"
+
+#include "messages/MMDSTableRequest.h"
+
+class MDSRank;
+class LogSegment;
+
+// Base class for an MDS-side client of a table server.  It tracks prepares
+// that are in flight (keyed by request id), commits that are still pending
+// (keyed by tid) and contexts waiting for an ack.  Subclasses supply the
+// table-specific query/notify handling.
+class MDSTableClient {
+protected:
+  MDSRank *mds;
+  int table;
+
+  uint64_t last_reqid;
+
+  bool server_ready;
+
+  // prepares
+  struct _pending_prepare {
+    MDSContext *onfinish;
+    version_t *ptid;
+    bufferlist *pbl;
+    bufferlist mutation;
+
+    _pending_prepare() : onfinish(nullptr), ptid(nullptr), pbl(nullptr) {}
+    _pending_prepare(MDSContext *c, version_t *pt, bufferlist *pb, bufferlist& m) :
+      onfinish(c), ptid(pt), pbl(pb), mutation(m) {}
+  };
+
+  map<uint64_t, _pending_prepare> pending_prepare;  // reqid -> prepare
+  map<version_t, uint64_t> prepared_update;
+  list<_pending_prepare> waiting_for_reqid;         // prepares with no reqid yet
+
+  // pending commits
+  map<version_t, LogSegment*> pending_commit;       // tid -> journaling segment
+  map<version_t, MDSContext::vec > ack_waiters;     // tid -> waiters
+
+  void handle_reply(class MMDSTableQuery *m);
+  void _logged_ack(version_t tid);
+  friend class C_LoggedAck;
+
+public:
+  // last_reqid starts at ~0ULL, so the first pre-increment yields 0
+  MDSTableClient(MDSRank *m, int tab) :
+    mds(m), table(tab), last_reqid(~0ULL), server_ready(false) {}
+  virtual ~MDSTableClient() {}
+
+  void handle_request(const MMDSTableRequest::const_ref &m);
+
+  void _prepare(bufferlist& mutation, version_t *ptid, bufferlist *pbl, MDSContext *onfinish);
+  void commit(version_t tid, LogSegment *ls);
+
+  void resend_commits();
+  void resend_prepares();
+
+  // for recovery (by me)
+  void got_journaled_agree(version_t tid, LogSegment *ls);
+  void got_journaled_ack(version_t tid);
+
+  // true when @tid is not (or no longer) in pending_commit
+  bool has_committed(version_t tid) const {
+    return pending_commit.find(tid) == pending_commit.end();
+  }
+  // queue @c on the waiter list for @tid
+  void wait_for_ack(version_t tid, MDSContext *c) {
+    ack_waiters[tid].push_back(c);
+  }
+
+  // snapshot of all tids currently in pending_commit
+  set<version_t> get_journaled_tids() const {
+    set<version_t> tids;
+    for (const auto& entry : pending_commit)
+      tids.insert(entry.first);
+    return tids;
+  }
+
+  void handle_mds_failure(mds_rank_t mds);
+
+  // child must implement
+  virtual void resend_queries() = 0;
+  virtual void handle_query_result(const MMDSTableRequest::const_ref &m) = 0;
+  virtual void handle_notify_prep(const MMDSTableRequest::const_ref &m) = 0;
+  virtual void notify_commit(version_t tid) = 0;
+
+  // and friendly front-end for _prepare.
+
+};
+
+#endif
diff --git a/src/mds/MDSTableServer.cc b/src/mds/MDSTableServer.cc
new file mode 100644
index 00000000..cd7724f5
--- /dev/null
+++ b/src/mds/MDSTableServer.cc
@@ -0,0 +1,373 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#include "MDSTableServer.h"
+#include "MDSRank.h"
+#include "MDLog.h"
+#include "msg/Messenger.h"
+
+#include "events/ETableServer.h"
+
+#define dout_context g_ceph_context
+#define dout_subsys ceph_subsys_mds
+#undef dout_prefix
+#define dout_prefix *_dout << "mds." << rank << ".tableserver(" << get_mdstable_name(table) << ") "
+
+// Entry point for table requests: route @req to the handler matching its op.
+// An op that is not handled here aborts the daemon.
+void MDSTableServer::handle_request(const MMDSTableRequest::const_ref &req)
+{
+  ceph_assert(req->op >= 0);
+
+  switch (req->op) {
+  case TABLESERVER_OP_QUERY:
+    handle_query(req);
+    break;
+  case TABLESERVER_OP_PREPARE:
+    handle_prepare(req);
+    break;
+  case TABLESERVER_OP_COMMIT:
+    handle_commit(req);
+    break;
+  case TABLESERVER_OP_ROLLBACK:
+    handle_rollback(req);
+    break;
+  case TABLESERVER_OP_NOTIFY_ACK:
+    handle_notify_ack(req);
+    break;
+  default:
+    ceph_abort_msg("unrecognized mds_table_server request op");
+  }
+}
+
+// Completion context handed to MDLog::submit_entry() by handle_prepare().
+// finish() forwards the original request and the tid chosen at submit time
+// to MDSTableServer::_prepare_logged().
+class C_Prepare : public MDSLogContextBase {
+  MDSTableServer *server;
+  MMDSTableRequest::const_ref req;
+  version_t tid;
+  MDSRank *get_mds() override { return server->mds; }
+public:
+
+  // The request ref is taken by const reference (as C_Commit and C_Rollback
+  // do) so that constructing the context costs one refcount copy, not two.
+  C_Prepare(MDSTableServer *s, const MMDSTableRequest::const_ref &r, version_t v) : server(s), req(r), tid(v) {}
+  void finish(int r) override {
+    server->_prepare_logged(req, tid);
+  }
+};
+
+// prepare
+// PREPARE: reserve the next projected version, journal an ETableServer entry
+// carrying the requested mutation, and flush the log.  The table itself is
+// only updated from _prepare_logged() via the C_Prepare context.
+void MDSTableServer::handle_prepare(const MMDSTableRequest::const_ref &req)
+{
+  dout(7) << "handle_prepare " << *req << dendl;
+  const mds_rank_t from = mds_rank_t(req->get_source().num());
+
+  ceph_assert(g_conf()->mds_kill_mdstable_at != 1);
+
+  // the same value serves as both tid and version of the journal entry
+  const version_t pv = ++projected_version;
+
+  ETableServer *le = new ETableServer(table, TABLESERVER_OP_PREPARE,
+                                      req->reqid, from, pv, pv);
+  mds->mdlog->start_entry(le);
+  le->mutation = req->bl;
+
+  MDSLogContextBase *fin = new C_Prepare(this, req, pv);
+  mds->mdlog->submit_entry(le, fin);
+  mds->mdlog->flush();
+}
+
+// Called (via C_Prepare) with the request passed to handle_prepare() and the
+// tid chosen there.  Applies the prepare to the table and builds the AGREE
+// reply.  If _notify_prep() returns true the reply is parked in
+// pending_notifies until all active clients have acked; otherwise it is sent
+// to the requester right away.
+void MDSTableServer::_prepare_logged(const MMDSTableRequest::const_ref &req, version_t tid)
+{
+  // the log tag previously read "_create_logged", which did not match this function
+  dout(7) << "_prepare_logged " << *req << " tid " << tid << dendl;
+  mds_rank_t from = mds_rank_t(req->get_source().num());
+
+  ceph_assert(g_conf()->mds_kill_mdstable_at != 2);
+
+  // _note_prepare() bumps `version`; it must land exactly on the tid that
+  // was journaled.
+  _note_prepare(from, req->reqid);
+  bufferlist out;
+  _prepare(req->bl, req->reqid, from, out);
+  ceph_assert(version == tid);
+
+  auto reply = MMDSTableRequest::create(table, TABLESERVER_OP_AGREE, req->reqid, tid);
+  reply->bl = std::move(out);
+
+  if (_notify_prep(tid)) {
+    // hold the reply back until every active client acked the notify
+    auto& p = pending_notifies[tid];
+    p.notify_ack_gather = active_clients;
+    p.mds = from;
+    p.reply = reply;
+  } else {
+    mds->send_message_mds(reply, from);
+  }
+}
+
+// NOTIFY_ACK: remove the sender from the ack gather set of the matching
+// pending notify.  When the set becomes empty, either run the stored
+// completion or send the held-back reply, then forget the entry.
+void MDSTableServer::handle_notify_ack(const MMDSTableRequest::const_ref &m)
+{
+  dout(7) << __func__ << " " << *m << dendl;
+  const mds_rank_t from = mds_rank_t(m->get_source().num());
+  const version_t tid = m->get_tid();
+
+  auto it = pending_notifies.find(tid);
+  if (it == pending_notifies.end())
+    return;  // no notify pending for this tid; silently ignored
+
+  auto& info = it->second;
+  if (!info.notify_ack_gather.erase(from)) {
+    dout(0) << "got unexpected notify ack for tid " << tid << " from mds." << from << dendl;
+    return;
+  }
+
+  if (!info.notify_ack_gather.empty())
+    return;  // still waiting for other ranks
+
+  if (info.onfinish)
+    info.onfinish->complete(0);
+  else
+    mds->send_message_mds(info.reply, info.mds);
+  pending_notifies.erase(it);
+}
+
+// Completion context passed to the MDLog by handle_commit(); finish() hands
+// the original request to MDSTableServer::_commit_logged().
+class C_Commit : public MDSLogContextBase {
+  MDSTableServer *server;
+  MMDSTableRequest::const_ref req;
+
+  MDSRank *get_mds() override {
+    return server->mds;
+  }
+
+public:
+  C_Commit(MDSTableServer *s, const MMDSTableRequest::const_ref &r)
+    : server(s), req(r) {}
+
+  void finish(int r) override {
+    server->_commit_logged(req);
+  }
+};
+
+// commit
+// COMMIT: three cases, depending on what we know about the tid:
+//  - still pending: journal the commit (unless one is already in flight);
+//  - not pending and tid <= version: already committed, just re-ack;
+//  - not pending and tid > version: should not happen, assert.
+void MDSTableServer::handle_commit(const MMDSTableRequest::const_ref &req)
+{
+  dout(7) << "handle_commit " << *req << dendl;
+
+  const version_t tid = req->get_tid();
+
+  if (!pending_for_mds.count(tid)) {
+    if (tid <= version) {
+      dout(0) << "got commit for tid " << tid << " <= " << version
+              << ", already committed, sending ack." << dendl;
+      auto reply = MMDSTableRequest::create(table, TABLESERVER_OP_ACK, req->reqid, tid);
+      mds->send_message(reply, req->get_connection());
+      return;
+    }
+
+    // wtf.
+    dout(0) << "got commit for tid " << tid << " > " << version << dendl;
+    ceph_assert(tid <= version);
+    return;
+  }
+
+  if (committing_tids.count(tid)) {
+    dout(0) << "got commit for tid " << tid << ", already committing, waiting." << dendl;
+    return;
+  }
+
+  ceph_assert(g_conf()->mds_kill_mdstable_at != 5);
+
+  projected_version++;
+  committing_tids.insert(tid);
+
+  ETableServer *le = new ETableServer(table, TABLESERVER_OP_COMMIT, 0, MDS_RANK_NONE,
+                                      tid, projected_version);
+  mds->mdlog->start_submit_entry(le, new C_Commit(this, req));
+}
+
+// Called (via C_Commit) with the request passed to handle_commit(): drop the
+// tid from the pending/committing sets, apply the commit to the table and
+// ACK the requester.
+void MDSTableServer::_commit_logged(const MMDSTableRequest::const_ref &req)
+{
+  dout(7) << "_commit_logged, sending ACK" << dendl;
+
+  ceph_assert(g_conf()->mds_kill_mdstable_at != 6);
+  const version_t tid = req->get_tid();
+
+  pending_for_mds.erase(tid);
+  committing_tids.erase(tid);
+
+  _commit(tid, req);
+  _note_commit(tid);
+
+  const mds_rank_t requester = mds_rank_t(req->get_source().num());
+  auto ack = MMDSTableRequest::create(table, TABLESERVER_OP_ACK, req->reqid, tid);
+  mds->send_message_mds(ack, requester);
+}
+
+// Completion context passed to the MDLog by handle_rollback(); finish() hands
+// the original request to MDSTableServer::_rollback_logged().
+class C_Rollback : public MDSLogContextBase {
+  MDSTableServer *server;
+  MMDSTableRequest::const_ref req;
+
+  MDSRank *get_mds() override {
+    return server->mds;
+  }
+
+public:
+  C_Rollback(MDSTableServer *s, const MMDSTableRequest::const_ref &r)
+    : server(s), req(r) {}
+
+  void finish(int r) override {
+    server->_rollback_logged(req);
+  }
+};
+
+// ROLLBACK
+// ROLLBACK: the tid must be pending and must not already be committing.
+// Journal the rollback; the table is updated from _rollback_logged().
+void MDSTableServer::handle_rollback(const MMDSTableRequest::const_ref &req)
+{
+  dout(7) << "handle_rollback " << *req << dendl;
+
+  ceph_assert(g_conf()->mds_kill_mdstable_at != 8);
+
+  const version_t tid = req->get_tid();
+  ceph_assert(pending_for_mds.count(tid));
+  ceph_assert(!committing_tids.count(tid));
+
+  projected_version++;
+  committing_tids.insert(tid);
+
+  ETableServer *le = new ETableServer(table, TABLESERVER_OP_ROLLBACK, 0, MDS_RANK_NONE,
+                                      tid, projected_version);
+  mds->mdlog->start_submit_entry(le, new C_Rollback(this, req));
+}
+
+// Called (via C_Rollback) with the request passed to handle_rollback(): drop
+// the tid from the pending/committing sets and undo it in the table.
+// No reply is sent.
+void MDSTableServer::_rollback_logged(const MMDSTableRequest::const_ref &req)
+{
+  dout(7) << "_rollback_logged " << *req << dendl;
+
+  const version_t tid = req->get_tid();
+
+  pending_for_mds.erase(tid);
+  committing_tids.erase(tid);
+
+  _rollback(tid);
+  _note_rollback(tid);
+}
+
+
+
+// SERVER UPDATE
+// Completion context passed to the MDLog by do_server_update().  It keeps its
+// own copy of the update payload and passes it to _server_update_logged().
+class C_ServerUpdate : public MDSLogContextBase {
+  MDSTableServer *server;
+  bufferlist bl;
+
+  MDSRank *get_mds() override {
+    return server->mds;
+  }
+
+public:
+  C_ServerUpdate(MDSTableServer *s, bufferlist &b)
+    : server(s), bl(b) {}
+
+  void finish(int r) override {
+    server->_server_update_logged(bl);
+  }
+};
+
+// Journal a server-initiated update carrying @bl.  The update is applied to
+// the table from _server_update_logged().
+void MDSTableServer::do_server_update(bufferlist& bl)
+{
+  dout(10) << "do_server_update len " << bl.length() << dendl;
+
+  projected_version++;
+
+  ETableServer *le = new ETableServer(table, TABLESERVER_OP_SERVER_UPDATE,
+                                      0, MDS_RANK_NONE, 0, projected_version);
+  mds->mdlog->start_entry(le);
+  le->mutation = bl;
+
+  MDSLogContextBase *fin = new C_ServerUpdate(this, bl);
+  mds->mdlog->submit_entry(le, fin);
+}
+
+// Called (via C_ServerUpdate) with the payload given to do_server_update():
+// apply the update, then bump the version bookkeeping.
+void MDSTableServer::_server_update_logged(bufferlist& bl)
+{
+  dout(10) << "_server_update_logged len " << bl.length() << dendl;
+
+  _server_update(bl);       // subclass applies the payload
+  _note_server_update(bl);  // advances `version`
+}
+
+// recovery
+
+// Completion stored in a pending notify by finish_recovery(); when it fires
+// it runs MDSTableServer::_do_server_recovery().
+class C_ServerRecovery : public MDSContext {
+  MDSTableServer *server;
+
+  MDSRank *get_mds() override {
+    return server->mds;
+  }
+
+public:
+  C_ServerRecovery(MDSTableServer *s)
+    : server(s) {}
+
+  void finish(int r) override {
+    server->_do_server_recovery();
+  }
+};
+
+// Finish server recovery: re-send an AGREE for every pending prepare whose
+// owner is an active client, then tell each active client the server is
+// ready, passing the next request id it should use (one past the highest
+// pending reqid seen for that rank, or 0 if none).
+void MDSTableServer::_do_server_recovery()
+{
+  dout(7) << __func__ << " " << active_clients << dendl;
+  map<mds_rank_t, uint64_t> next_reqids;
+
+  for (const auto& entry : pending_for_mds) {
+    const auto& pending = entry.second;
+    const mds_rank_t who = pending.mds;
+    if (!active_clients.count(who))
+      continue;
+
+    uint64_t& next = next_reqids[who];
+    if (pending.reqid >= next)
+      next = pending.reqid + 1;
+
+    const version_t tid = pending.tid;
+    auto agree = MMDSTableRequest::create(table, TABLESERVER_OP_AGREE, pending.reqid, tid);
+    _get_reply_buffer(tid, &agree->bl);
+    mds->send_message_mds(agree, who);
+  }
+
+  for (const mds_rank_t who : active_clients) {
+    auto ready = MMDSTableRequest::create(table, TABLESERVER_OP_SERVER_READY, next_reqids[who]);
+    mds->send_message_mds(ready, who);
+  }
+  recovered = true;
+}
+
+// Adopt @active as the set of active clients and complete recovery.  If there
+// are pending prepares and _notify_prep() reports that a notify is needed,
+// recovery is deferred until every active client has acked it; otherwise it
+// completes immediately.
+void MDSTableServer::finish_recovery(set<mds_rank_t>& active)
+{
+  dout(7) << __func__ << dendl;
+
+  active_clients = active;
+
+  // don't know if survivor mds have received all 'notify prep' messages.
+  // so we need to send 'notify prep' again.
+  const bool wait_for_acks = !pending_for_mds.empty() && _notify_prep(version);
+  if (!wait_for_acks) {
+    _do_server_recovery();
+    return;
+  }
+
+  auto& notify = pending_notifies[version];
+  notify.notify_ack_gather = active_clients;
+  notify.mds = MDS_RANK_NONE;
+  notify.onfinish = new C_ServerRecovery(this);
+}
+
+// Rank @who has recovered: add it to the active clients.  If this server has
+// itself finished recovery, re-send the AGREEs for that rank's pending
+// prepares followed by SERVER_READY with the next request id to use.
+void MDSTableServer::handle_mds_recovery(mds_rank_t who)
+{
+  dout(7) << "handle_mds_recovery mds." << who << dendl;
+
+  active_clients.insert(who);
+  if (!recovered) {
+    dout(7) << " still not recovered, delaying" << dendl;
+    return;
+  }
+
+  uint64_t next_reqid = 0;
+  // resend agrees for recovered mds
+  for (auto p = pending_for_mds.begin(); p != pending_for_mds.end(); ++p) {
+    if (p->second.mds == who) {
+      ceph_assert(!pending_notifies.count(p->second.tid));
+
+      const uint64_t reqid = p->second.reqid;
+      if (reqid >= next_reqid)
+        next_reqid = reqid + 1;
+
+      auto agree = MMDSTableRequest::create(table, TABLESERVER_OP_AGREE, reqid, p->second.tid);
+      _get_reply_buffer(p->second.tid, &agree->bl);
+      mds->send_message_mds(agree, who);
+    }
+  }
+
+  auto ready = MMDSTableRequest::create(table, TABLESERVER_OP_SERVER_READY, next_reqid);
+  mds->send_message_mds(ready, who);
+}
+
+// Rank @who failed or stopped: remove it from the active clients and clean up
+// pending notifies.  Notifies whose requester was @who are rolled back;
+// for the others @who is simply dropped from the ack gather set, which may
+// complete the notify.
+void MDSTableServer::handle_mds_failure_or_stop(mds_rank_t who)
+{
+  dout(7) << __func__ << " mds." << who << dendl;
+
+  active_clients.erase(who);
+
+  list<MMDSTableRequest::ref> rollback;
+  auto next = pending_notifies.begin();
+  while (next != pending_notifies.end()) {
+    // step past the current entry first: it may be erased below
+    auto cur = next++;
+    auto& info = cur->second;
+
+    if (info.mds == who) {
+      // haven't sent reply yet.
+      rollback.push_back(info.reply);
+      pending_notifies.erase(cur);
+      continue;
+    }
+
+    if (!info.notify_ack_gather.erase(who))
+      continue;
+
+    // the failed mds will reload snaptable when it recovers.
+    // so we can remove it from the gather set.
+    if (info.notify_ack_gather.empty()) {
+      if (info.onfinish)
+        info.onfinish->complete(0);
+      else
+        mds->send_message_mds(info.reply, info.mds);
+      pending_notifies.erase(cur);
+    }
+  }
+
+  // turn each unsent reply into a rollback request and process it locally
+  for (auto &req : rollback) {
+    req->op = TABLESERVER_OP_ROLLBACK;
+    handle_rollback(req);
+  }
+}
diff --git a/src/mds/MDSTableServer.h b/src/mds/MDSTableServer.h
new file mode 100644
index 00000000..83f10315
--- /dev/null
+++ b/src/mds/MDSTableServer.h
@@ -0,0 +1,126 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#ifndef CEPH_MDSTABLESERVER_H
+#define CEPH_MDSTABLESERVER_H
+
+#include "MDSTable.h"
+#include "MDSContext.h"
+
+#include "messages/MMDSTableRequest.h"
+
+// Server side of an MDS table.  Handles prepare/commit/rollback requests by
+// journaling them and applying them through the table-specific virtuals that
+// a subclass provides, and tracks prepares that are still awaiting a commit
+// or rollback (pending_for_mds).
+class MDSTableServer : public MDSTable {
+protected:
+  int table;
+  bool recovered;
+  set<mds_rank_t> active_clients;
+private:
+  map<version_t,mds_table_pending_t> pending_for_mds;  // ** child should encode this! **
+  set<version_t> committing_tids;
+
+  // State for a notify that is waiting for acks from other ranks.
+  struct notify_info_t {
+    set<mds_rank_t> notify_ack_gather;  // ranks that still have to ack
+    mds_rank_t mds;                     // requester, or MDS_RANK_NONE
+    MMDSTableRequest::ref reply;        // reply held back until all acks are in
+    MDSContext *onfinish;               // run instead of sending reply, if set
+    // `mds` is now initialised too, so a default-constructed entry (e.g. one
+    // created by map::operator[]) never holds an indeterminate rank.
+    notify_info_t() : mds(MDS_RANK_NONE), reply(nullptr), onfinish(nullptr) {}
+  };
+  map<version_t, notify_info_t> pending_notifies;
+
+  void handle_prepare(const MMDSTableRequest::const_ref &m);
+  void _prepare_logged(const MMDSTableRequest::const_ref &m, version_t tid);
+  friend class C_Prepare;
+
+  void handle_commit(const MMDSTableRequest::const_ref &m);
+  void _commit_logged(const MMDSTableRequest::const_ref &m);
+  friend class C_Commit;
+
+  void handle_rollback(const MMDSTableRequest::const_ref &m);
+  void _rollback_logged(const MMDSTableRequest::const_ref &m);
+  friend class C_Rollback;
+
+  void _server_update_logged(bufferlist& bl);
+  friend class C_ServerUpdate;
+
+  void handle_notify_ack(const MMDSTableRequest::const_ref &m);
+
+public:
+  // table-specific behaviour supplied by the subclass
+  virtual void handle_query(const MMDSTableRequest::const_ref &m) = 0;
+  virtual void _prepare(const bufferlist &bl, uint64_t reqid, mds_rank_t bymds, bufferlist& out) = 0;
+  virtual void _get_reply_buffer(version_t tid, bufferlist *pbl) const = 0;
+  virtual void _commit(version_t tid, MMDSTableRequest::const_ref req) = 0;
+  virtual void _rollback(version_t tid) = 0;
+  virtual void _server_update(bufferlist& bl) { ceph_abort(); }
+  virtual bool _notify_prep(version_t tid) { return false; }
+
+  // Bookkeeping helpers.  Each one advances `version`; with replay=true the
+  // projected version is snapped to it as well.
+  void _note_prepare(mds_rank_t mds, uint64_t reqid, bool replay=false) {
+    version++;
+    if (replay)
+      projected_version = version;
+    pending_for_mds[version].mds = mds;
+    pending_for_mds[version].reqid = reqid;
+    pending_for_mds[version].tid = version;
+  }
+  void _note_commit(uint64_t tid, bool replay=false) {
+    version++;
+    if (replay)
+      projected_version = version;
+    pending_for_mds.erase(tid);
+  }
+  void _note_rollback(uint64_t tid, bool replay=false) {
+    version++;
+    if (replay)
+      projected_version = version;
+    pending_for_mds.erase(tid);
+  }
+  void _note_server_update(bufferlist& bl, bool replay=false) {
+    version++;
+    if (replay)
+      projected_version = version;
+  }
+
+  MDSTableServer(MDSRank *m, int tab) :
+    MDSTable(m, get_mdstable_name(tab), false), table(tab), recovered(false) {}
+  ~MDSTableServer() override {}
+
+  void reset_state() override {
+    pending_for_mds.clear();
+    ++version;
+  }
+
+  void handle_request(const MMDSTableRequest::const_ref &m);
+  void do_server_update(bufferlist& bl);
+
+  virtual void encode_server_state(bufferlist& bl) const = 0;
+  virtual void decode_server_state(bufferlist::const_iterator& bl) = 0;
+
+  // on-disk state = subclass state followed by pending_for_mds
+  void encode_state(bufferlist& bl) const override {
+    encode_server_state(bl);
+    encode(pending_for_mds, bl);
+  }
+  void decode_state(bufferlist::const_iterator& bl) override {
+    decode_server_state(bl);
+    decode(pending_for_mds, bl);
+  }
+
+  // recovery
+  void finish_recovery(set<mds_rank_t>& active);
+  void _do_server_recovery();
+  friend class C_ServerRecovery;
+
+  void handle_mds_recovery(mds_rank_t who);
+  void handle_mds_failure_or_stop(mds_rank_t who);
+};
+
+#endif
diff --git a/src/mds/Mantle.cc b/src/mds/Mantle.cc
new file mode 100644
index 00000000..15d325e8
--- /dev/null
+++ b/src/mds/Mantle.cc
@@ -0,0 +1,139 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2016 Michael Sevilla <mikesevilla3@gmail.com>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#include "mdstypes.h"
+#include "MDSRank.h"
+#include "Mantle.h"
+#include "msg/Messenger.h"
+#include "common/Clock.h"
+#include "CInode.h"
+
+#include <fstream>
+
+#define dout_context g_ceph_context
+#undef dout_prefix
+#define dout_prefix *_dout << "mds.mantle "
+#define mantle_dout(lvl) \
+ do {\
+ auto subsys = ceph_subsys_mds;\
+ if ((dout_context)->_conf->subsys.should_gather(ceph_subsys_mds_balancer, lvl)) {\
+ subsys = ceph_subsys_mds_balancer;\
+ }\
+ dout_impl(dout_context, ceph::dout::need_dynamic(subsys), lvl) dout_prefix
+
+#define mantle_dendl dendl; } while (0)
+
+
+/* Lua C function (registered as BAL_LOG in the Mantle constructor).
+ * Argument 1 is the debug level; the remaining arguments are concatenated
+ * and written to the log at that level.  Returns no values to Lua. */
+static int dout_wrapper(lua_State *L)
+{
+  const int level = luaL_checkinteger(L, 1);
+
+  /* fold everything after the level into one value at stack index 2 */
+  const int nparts = lua_gettop(L) - 1;
+  lua_concat(L, nparts);
+
+  mantle_dout(ceph::dout::need_dynamic(level)) << lua_tostring(L, 2)
+                                               << mantle_dendl;
+  return 0;
+}
+
+/* Run a Lua balancer script.
+ *
+ * @param script      Lua source of the balancer
+ * @param whoami      rank making the decision; exposed to Lua as `whoami`
+ * @param metrics     one metric dictionary per index; exposed to Lua as the
+ *                    global table `mds`, indexed by position in the vector
+ * @param my_targets  filled with rank -> value pairs from the table the
+ *                    script returns
+ * @return 0 on success, -EINVAL if the script cannot be loaded or run, or if
+ *         it returns something other than a table of integer -> number.
+ *         On a malformed entry, pairs parsed before it remain in my_targets.
+ */
+int Mantle::balance(std::string_view script,
+                    mds_rank_t whoami,
+                    const std::vector<std::map<std::string, double>> &metrics,
+                    std::map<mds_rank_t, double> &my_targets)
+{
+  lua_settop(L, 0); /* clear the stack */
+
+  /* luaL_loadstring() expects a NUL-terminated C string, which
+   * std::string_view::data() does not guarantee; use an owned copy. */
+  const std::string script_str(script);
+
+  /* load the balancer */
+  if (luaL_loadstring(L, script_str.c_str())) {
+    mantle_dout(0) << "WARNING: mantle could not load balancer: "
+                   << lua_tostring(L, -1) << mantle_dendl;
+    return -EINVAL;
+  }
+
+  /* tell the balancer which mds is making the decision */
+  lua_pushinteger(L, (lua_Integer)whoami);
+  lua_setglobal(L, "whoami");
+
+  /* global mds metrics to hold all dictionaries */
+  lua_newtable(L);
+
+  /* push name of mds (i) and its metrics onto Lua stack */
+  for (size_t i=0; i < metrics.size(); i++) {
+    lua_newtable(L);
+
+    /* push values into this mds's table; setfield assigns key/pops val */
+    for (const auto &it : metrics[i]) {
+      lua_pushnumber(L, it.second);
+      lua_setfield(L, -2, it.first.c_str());
+    }
+
+    /* store this mds's table (stack top) at index i of the global mds
+     * table just below it; lua_seti pops the value */
+    lua_seti(L, -2, i);
+  }
+
+  /* set the name of the global mds table */
+  lua_setglobal(L, "mds");
+
+  /* only the loaded chunk should be left on the stack */
+  ceph_assert(lua_gettop(L) == 1);
+  if (lua_pcall(L, 0, 1, 0) != LUA_OK) {
+    mantle_dout(0) << "WARNING: mantle could not execute script: "
+                   << lua_tostring(L, -1) << mantle_dendl;
+    return -EINVAL;
+  }
+
+  /* parse response by iterating over Lua stack */
+  if (lua_istable(L, -1) == 0) {
+    mantle_dout(0) << "WARNING: mantle script returned a malformed response" << mantle_dendl;
+    return -EINVAL;
+  }
+
+  /* fill in return value */
+  for (lua_pushnil(L); lua_next(L, -2); lua_pop(L, 1)) {
+    if (!lua_isinteger(L, -2) || !lua_isnumber(L, -1)) {
+      mantle_dout(0) << "WARNING: mantle script returned a malformed response" << mantle_dendl;
+      return -EINVAL;
+    }
+    mds_rank_t rank(lua_tointeger(L, -2));
+    my_targets[rank] = lua_tonumber(L, -1);
+  }
+
+  return 0;
+}
+
+/* Create the Lua state, load the basic libraries that balancer policies may
+ * use, and register the BAL_LOG logging function.  Throws std::bad_alloc if
+ * the Lua state cannot be created. */
+Mantle::Mantle()
+{
+  /* build lua vm state */
+  L = luaL_newstate();
+  if (L == nullptr) {
+    mantle_dout(0) << "WARNING: mantle could not load Lua state" << mantle_dendl;
+    throw std::bad_alloc();
+  }
+
+  /* balancer policies can use basic Lua functions */
+  static const luaL_Reg loadedlibs[] = {
+    {"_G", luaopen_base},
+    {LUA_COLIBNAME, luaopen_coroutine},
+    {LUA_STRLIBNAME, luaopen_string},
+    {LUA_MATHLIBNAME, luaopen_math},
+    {LUA_TABLIBNAME, luaopen_table},
+    {LUA_UTF8LIBNAME, luaopen_utf8},
+    {NULL, NULL}  /* sentinel */
+  };
+
+  for (const luaL_Reg *entry = loadedlibs; entry->func; ++entry) {
+    luaL_requiref(L, entry->name, entry->func, 1);
+    lua_pop(L, 1); /* remove lib */
+  }
+
+  /* setup debugging */
+  lua_register(L, "BAL_LOG", dout_wrapper);
+}
diff --git a/src/mds/Mantle.h b/src/mds/Mantle.h
new file mode 100644
index 00000000..ffc1843a
--- /dev/null
+++ b/src/mds/Mantle.h
@@ -0,0 +1,40 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2016 Michael Sevilla <mikesevilla3@gmail.com>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#ifndef CEPH_MANTLE_H
+#define CEPH_MANTLE_H
+
+#include <string_view>
+
+#include <lua.hpp>
+#include <vector>
+#include <map>
+#include <string>
+
+#include "mdstypes.h"
+
+/* Wrapper around a Lua state used to run balancer scripts.  The state is
+ * created in the constructor and closed in the destructor. */
+class Mantle {
+  public:
+    Mantle();
+    ~Mantle() { if (L) lua_close(L); }
+
+    /* The Lua state is owned by exactly one Mantle; a copy would close the
+     * same state twice, so copying is disabled. */
+    Mantle(const Mantle&) = delete;
+    Mantle& operator=(const Mantle&) = delete;
+
+    /* Run `script`; returns 0 on success or a negative errno value, and
+     * fills my_targets from the table the script returns. */
+    int balance(std::string_view script,
+                mds_rank_t whoami,
+                const std::vector <std::map<std::string, double>> &metrics,
+                std::map<mds_rank_t,double> &my_targets);
+
+  protected:
+    lua_State *L;
+};
+
+#endif
diff --git a/src/mds/Migrator.cc b/src/mds/Migrator.cc
new file mode 100644
index 00000000..98bf78c3
--- /dev/null
+++ b/src/mds/Migrator.cc
@@ -0,0 +1,3611 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#include "MDSRank.h"
+#include "MDCache.h"
+#include "CInode.h"
+#include "CDir.h"
+#include "CDentry.h"
+#include "Migrator.h"
+#include "Locker.h"
+#include "Server.h"
+
+#include "MDBalancer.h"
+#include "MDLog.h"
+#include "MDSMap.h"
+#include "Mutation.h"
+
+#include "include/filepath.h"
+#include "common/likely.h"
+
+#include "events/EExport.h"
+#include "events/EImportStart.h"
+#include "events/EImportFinish.h"
+#include "events/ESessions.h"
+
+#include "msg/Messenger.h"
+
+#include "messages/MClientCaps.h"
+
+/*
+ * this is what the dir->dir_auth values look like
+ *
+ * dir_auth authbits
+ * export
+ * me me - before
+ * me, me me - still me, but preparing for export
+ * me, them me - send MExportDir (peer is preparing)
+ * them, me me - journaled EExport
+ * them them - done
+ *
+ * import:
+ * them them - before
+ * me, them me - journaled EImportStart
+ * me me - done
+ *
+ * which implies:
+ * - auth bit is set if i am listed as first _or_ second dir_auth.
+ */
+
+#include "common/config.h"
+
+
+#define dout_context g_ceph_context
+#define dout_subsys ceph_subsys_mds
+#undef dout_prefix
+#define dout_prefix *_dout << "mds." << mds->get_nodeid() << ".migrator "
+
+
+// Base for Migrator completions: remembers the owning Migrator (which must
+// not be null) and resolves the MDSRank through it.
+class MigratorContext : public MDSContext {
+protected:
+  Migrator *mig;
+
+  MDSRank *get_mds() override {
+    return mig->mds;
+  }
+
+public:
+  explicit MigratorContext(Migrator *mig_)
+    : mig(mig_)
+  {
+    ceph_assert(mig != NULL);
+  }
+};
+
+// Same as MigratorContext, but derived from MDSLogContextBase: remembers the
+// owning Migrator (which must not be null) and resolves the MDSRank through it.
+class MigratorLogContext : public MDSLogContextBase {
+protected:
+  Migrator *mig;
+
+  MDSRank *get_mds() override {
+    return mig->mds;
+  }
+
+public:
+  explicit MigratorLogContext(Migrator *mig_)
+    : mig(mig_)
+  {
+    ceph_assert(mig != NULL);
+  }
+};
+
+// Route an incoming message to its handler by message type.  Unknown types
+// are logged and abort the daemon.
+void Migrator::dispatch(const Message::const_ref &m)
+{
+  const auto type = m->get_type();
+
+  switch (type) {
+  // ---- import ----
+  case MSG_MDS_EXPORTDIRDISCOVER:
+    handle_export_discover(MExportDirDiscover::msgref_cast(m));
+    break;
+
+  case MSG_MDS_EXPORTDIRPREP:
+    handle_export_prep(MExportDirPrep::msgref_cast(m));
+    break;
+
+  case MSG_MDS_EXPORTDIR:
+    if (unlikely(inject_session_race)) {
+      // injected race: retry this message once a client connection shows up
+      dout(0) << "waiting for inject_session_race" << dendl;
+      mds->wait_for_any_client_connection(new C_MDS_RetryMessage(mds, m));
+    } else {
+      handle_export_dir(MExportDir::msgref_cast(m));
+    }
+    break;
+
+  case MSG_MDS_EXPORTDIRFINISH:
+    handle_export_finish(MExportDirFinish::msgref_cast(m));
+    break;
+
+  case MSG_MDS_EXPORTDIRCANCEL:
+    handle_export_cancel(MExportDirCancel::msgref_cast(m));
+    break;
+
+  // ---- export ----
+  case MSG_MDS_EXPORTDIRDISCOVERACK:
+    handle_export_discover_ack(MExportDirDiscoverAck::msgref_cast(m));
+    break;
+
+  case MSG_MDS_EXPORTDIRPREPACK:
+    handle_export_prep_ack(MExportDirPrepAck::msgref_cast(m));
+    break;
+
+  case MSG_MDS_EXPORTDIRACK:
+    handle_export_ack(MExportDirAck::msgref_cast(m));
+    break;
+
+  case MSG_MDS_EXPORTDIRNOTIFYACK:
+    handle_export_notify_ack(MExportDirNotifyAck::msgref_cast(m));
+    break;
+
+  // ---- export 3rd party (dir_auth adjustments) ----
+  case MSG_MDS_EXPORTDIRNOTIFY:
+    handle_export_notify(MExportDirNotify::msgref_cast(m));
+    break;
+
+  // ---- caps ----
+  case MSG_MDS_EXPORTCAPS:
+    handle_export_caps(MExportCaps::msgref_cast(m));
+    break;
+
+  case MSG_MDS_EXPORTCAPSACK:
+    handle_export_caps_ack(MExportCapsAck::msgref_cast(m));
+    break;
+
+  case MSG_MDS_GATHERCAPS:
+    handle_gather_caps(MGatherCaps::msgref_cast(m));
+    break;
+
+  default:
+    derr << "migrator unknown message " << type << dendl;
+    ceph_abort_msg("migrator unknown message");
+  }
+}
+
+
+// Deferred call to Migrator::export_empty_import(dir).  The dir is pinned
+// with PIN_PTRWAITER from construction until finish() has run.
+class C_MDC_EmptyImport : public MigratorContext {
+  CDir *dir;
+
+public:
+  C_MDC_EmptyImport(Migrator *m, CDir *d)
+    : MigratorContext(m), dir(d)
+  {
+    dir->get(CDir::PIN_PTRWAITER);
+  }
+
+  void finish(int r) override {
+    mig->export_empty_import(dir);
+    dir->put(CDir::PIN_PTRWAITER);
+  }
+};
+
+
+// Export an empty imported subtree to the rank that is auth for its inode.
+// @dir must be a subtree root.  Nothing happens if the inode is auth here,
+// the dir is not auth here, it is freezing/frozen, it still has head items,
+// or its inode is the root.
+void Migrator::export_empty_import(CDir *dir)
+{
+  dout(7) << "export_empty_import " << *dir << dendl;
+  ceph_assert(dir->is_subtree_root());
+
+  if (dir->inode->is_auth()) {
+    dout(7) << " inode is auth" << dendl;
+    return;
+  }
+  if (!dir->is_auth()) {
+    dout(7) << " not auth" << dendl;
+    return;
+  }
+  if (dir->is_freezing() || dir->is_frozen()) {
+    dout(7) << " freezing or frozen" << dendl;
+    return;
+  }
+  if (dir->get_num_head_items() > 0) {
+    dout(7) << " not actually empty" << dendl;
+    return;
+  }
+  if (dir->inode->is_root()) {
+    dout(7) << " root" << dendl;
+    return;
+  }
+
+  mds_rank_t dest = dir->inode->authority().first;
+  //if (mds->is_shutting_down()) dest = 0;  // this is more efficient.
+
+  dout(7) << " really empty, exporting to " << dest << dendl;
+  // use ceph_assert like the rest of this file instead of plain assert()
+  ceph_assert(dest != mds->get_nodeid());
+
+  dout(7) << "exporting to mds." << dest
+          << " empty import " << *dir << dendl;
+  export_dir( dir, dest );
+}
+
+// Scan exports that are in the DISCOVERING or FREEZING state and cancel those
+// whose freeze has made no progress (no change in the freeze-tree auth pin
+// count) for longer than mds_freeze_tree_timeout, provided that either remote
+// waiters exist or the parent dir is itself freezing.
+void Migrator::find_stale_export_freeze()
+{
+ utime_t now = ceph_clock_now();
+ // anything whose auth pin count last changed before `cutoff` is stale
+ utime_t cutoff = now;
+ cutoff -= g_conf()->mds_freeze_tree_timeout;
+
+
+ /*
+ * We could have situations like:
+ *
+ * - mds.0 authpins an item in subtree A
+ * - mds.0 sends request to mds.1 to authpin an item in subtree B
+ * - mds.0 freezes subtree A
+ * - mds.1 authpins an item in subtree B
+ * - mds.1 sends request to mds.0 to authpin an item in subtree A
+ * - mds.1 freezes subtree B
+ * - mds.1 receives the remote authpin request from mds.0
+ * (wait because subtree B is freezing)
+ * - mds.0 receives the remote authpin request from mds.1
+ * (wait because subtree A is freezing)
+ *
+ *
+ * - client request authpins items in subtree B
+ * - freeze subtree B
+ * - import subtree A which is parent of subtree B
+ * (authpins parent inode of subtree B, see CDir::set_dir_auth())
+ * - freeze subtree A
+ * - client request tries authpinning items in subtree A
+ * (wait because subtree A is freezing)
+ */
+ for (map<CDir*,export_state_t>::iterator p = export_state.begin();
+ p != export_state.end(); ) {
+ CDir* dir = p->first;
+ export_state_t& stat = p->second;
+ // advance now: export_try_cancel() below may erase this entry from
+ // export_state (via export_cancel_finish())
+ ++p;
+ if (stat.state != EXPORT_DISCOVERING && stat.state != EXPORT_FREEZING)
+ continue;
+ ceph_assert(dir->freeze_tree_state);
+ // the auth pin count changed since the last scan: record the new value and
+ // restart the staleness clock
+ if (stat.last_cum_auth_pins != dir->freeze_tree_state->auth_pins) {
+ stat.last_cum_auth_pins = dir->freeze_tree_state->auth_pins;
+ stat.last_cum_auth_pins_change = now;
+ continue;
+ }
+ // last change is still within the timeout window
+ if (stat.last_cum_auth_pins_change >= cutoff)
+ continue;
+ // stale: cancel only if remote waiters exist or the parent dir is freezing
+ if (stat.num_remote_waiters > 0 ||
+ (!dir->inode->is_root() && dir->get_parent_dir()->is_freezing())) {
+ export_try_cancel(dir);
+ }
+ }
+}
+
+// Try to cancel the in-progress export of @dir, which must have an entry in
+// export_state.  What gets undone depends on the export's current state; in
+// the LOGGINGFINISH/NOTIFYING states the export is left alone.
+//
+// @param dir          directory being exported
+// @param notify_peer  if true, send MExportDirCancel to the peer from the
+//                     states that do so, but only when the cluster is not
+//                     degraded or the peer is clientreplay/active/stopping
+void Migrator::export_try_cancel(CDir *dir, bool notify_peer)
+{
+ dout(10) << "export_try_cancel " << *dir << dendl;
+
+ map<CDir*,export_state_t>::iterator it = export_state.find(dir);
+ ceph_assert(it != export_state.end());
+
+ // remember the state on entry; it->second.state is overwritten below and the
+ // clean-up code needs to know where we started from
+ int state = it->second.state;
+ switch (state) {
+ case EXPORT_LOCKING:
+ dout(10) << "export state=locking : dropping locks and removing auth_pin" << dendl;
+ num_locking_exports--;
+ it->second.state = EXPORT_CANCELLED;
+ dir->auth_unpin(this);
+ break;
+ case EXPORT_DISCOVERING:
+ dout(10) << "export state=discovering : canceling freeze and removing auth_pin" << dendl;
+ it->second.state = EXPORT_CANCELLED;
+ dir->unfreeze_tree(); // cancel the freeze
+ dir->auth_unpin(this);
+ if (notify_peer &&
+ (!mds->is_cluster_degraded() ||
+ mds->mdsmap->is_clientreplay_or_active_or_stopping(it->second.peer))) // tell them.
+ mds->send_message_mds(MExportDirCancel::create(dir->dirfrag(), it->second.tid), it->second.peer);
+ break;
+
+ case EXPORT_FREEZING:
+ dout(10) << "export state=freezing : canceling freeze" << dendl;
+ it->second.state = EXPORT_CANCELLED;
+ dir->unfreeze_tree(); // cancel the freeze
+ if (dir->is_subtree_root())
+ cache->try_subtree_merge(dir);
+ if (notify_peer &&
+ (!mds->is_cluster_degraded() ||
+ mds->mdsmap->is_clientreplay_or_active_or_stopping(it->second.peer))) // tell them.
+ mds->send_message_mds(MExportDirCancel::create(dir->dirfrag(), it->second.tid), it->second.peer);
+ break;
+
+ // NOTE: state order reversal, warning comes after prepping
+ case EXPORT_WARNING:
+ dout(10) << "export state=warning : unpinning bounds, unfreezing, notifying" << dendl;
+ // CANCELLING rather than CANCELLED: export_cancel_finish() is not called
+ // from this function for this state (see the clean-up section below)
+ it->second.state = EXPORT_CANCELLING;
+ // fall-thru
+
+ case EXPORT_PREPPING:
+ // reached directly for PREPPING, or by fall-through from WARNING
+ if (state != EXPORT_WARNING) {
+ dout(10) << "export state=prepping : unpinning bounds, unfreezing" << dendl;
+ it->second.state = EXPORT_CANCELLED;
+ }
+
+ {
+ // unpin bounds
+ set<CDir*> bounds;
+ cache->get_subtree_bounds(dir, bounds);
+ for (set<CDir*>::iterator q = bounds.begin();
+ q != bounds.end();
+ ++q) {
+ CDir *bd = *q;
+ bd->put(CDir::PIN_EXPORTBOUND);
+ bd->state_clear(CDir::STATE_EXPORTBOUND);
+ }
+ if (state == EXPORT_WARNING) {
+ // notify bystanders
+ export_notify_abort(dir, it->second, bounds);
+ // process delayed expires
+ cache->process_delayed_expire(dir);
+ }
+ }
+ dir->unfreeze_tree();
+ cache->try_subtree_merge(dir);
+ if (notify_peer &&
+ (!mds->is_cluster_degraded() ||
+ mds->mdsmap->is_clientreplay_or_active_or_stopping(it->second.peer))) // tell them.
+ mds->send_message_mds(MExportDirCancel::create(dir->dirfrag(), it->second.tid), it->second.peer);
+ break;
+
+ case EXPORT_EXPORTING:
+ dout(10) << "export state=exporting : reversing, and unfreezing" << dendl;
+ it->second.state = EXPORT_CANCELLING;
+ export_reverse(dir, it->second);
+ break;
+
+ case EXPORT_LOGGINGFINISH:
+ case EXPORT_NOTIFYING:
+ dout(10) << "export state=loggingfinish|notifying : ignoring dest failure, we were successful." << dendl;
+ // leave export_state, don't clean up now.
+ break;
+ case EXPORT_CANCELLING:
+ break;
+
+ default:
+ ceph_abort();
+ }
+
+ // finish clean-up?
+ if (it->second.state == EXPORT_CANCELLING ||
+ it->second.state == EXPORT_CANCELLED) {
+ // take the mutation out of the export state before the entry can be erased
+ MutationRef mut;
+ mut.swap(it->second.mut);
+
+ if (it->second.state == EXPORT_CANCELLED) {
+ // erases the export_state entry; `it` must not be used after this
+ export_cancel_finish(it);
+ }
+
+ // drop locks
+ if (state == EXPORT_LOCKING || state == EXPORT_DISCOVERING) {
+ // in these two states the mutation is treated as an MDRequestImpl and the
+ // request is killed rather than just dropping its locks
+ MDRequestRef mdr = static_cast<MDRequestImpl*>(mut.get());
+ ceph_assert(mdr);
+ mds->mdcache->request_kill(mdr);
+ } else if (mut) {
+ mds->locker->drop_locks(mut.get());
+ mut->cleanup();
+ }
+
+ cache->show_subtrees();
+
+ maybe_do_queued_export();
+ }
+}
+
+ /*
+  * Final step of a cancelled export: forget the export_state entry,
+  * clear the exporting mark on the dirfrag and release what the cancel
+  * path was still holding.
+  *
+  * `it` is erased from export_state here and must not be used afterwards.
+  */
+ void Migrator::export_cancel_finish(export_state_iterator& it)
+ {
+   CDir *dir = it->first;
+
+   // Copy out everything still needed from the entry before it goes away.
+   const bool had_abort_pin = (it->second.state == EXPORT_CANCELLING);
+   auto base = std::move(it->second.parent);
+   total_exporting_size -= it->second.approx_size;
+
+   export_state.erase(it);
+
+   ceph_assert(dir->state_test(CDir::STATE_EXPORTING));
+   dir->clear_exporting();
+
+   // pinned by Migrator::export_notify_abort()
+   if (had_abort_pin)
+     dir->auth_unpin(this);
+
+   // send pending import_maps? (these need to go out when all exports have finished.)
+   cache->maybe_send_pending_resolves();
+
+   // this export was one piece of a split export: report it as failed
+   if (base)
+     child_export_finish(base, false);
+ }
+
+// ==========================================================
+// mds failure handling
+
+ void Migrator::handle_mds_failure_or_stop(mds_rank_t who)
+ { /*
+    * Clean up every in-flight migration that involves rank `who`, which
+    * has failed or is stopping.
+    *
+    * 1. Every export in EXPORT_FREEZING gets a temporary auth_pin; the
+    *    pins are dropped again at the very end of this function.
+    * 2. Exports are cancelled through export_try_cancel() when `who` is
+    *    the peer (unless the export is already cancelling), or when the
+    *    export is still locking/discovering/freezing/prepping, whoever
+    *    the peer is.  For the remaining exports `who` is removed from the
+    *    warning/notify ack sets and the export proceeds once a set it was
+    *    waiting on becomes empty.
+    * 3. Imports whose exporter is `who` are handled according to their
+    *    state (see the switch below); for the other imports `who` is
+    *    dropped from the bystander set.
+    */
+ dout(5) << "handle_mds_failure_or_stop mds." << who << dendl;
+
+ // check my exports
+
+ // first add an extra auth_pin on any freezes, so that canceling a
+ // nested freeze doesn't complete one further up the hierarchy and
+ // leave us badly confused. we'll remove it after canceling the
+ // freeze. this way no freeze completions run before we want them
+ // to.
+ list<CDir*> pinned_dirs;
+ for (map<CDir*,export_state_t>::iterator p = export_state.begin();
+ p != export_state.end();
+ ++p) {
+ if (p->second.state == EXPORT_FREEZING) {
+ CDir *dir = p->first;
+ dout(10) << "adding temp auth_pin on freezing " << *dir << dendl;
+ dir->auth_pin(this);
+ pinned_dirs.push_back(dir);
+ }
+ }
+
+ map<CDir*,export_state_t>::iterator p = export_state.begin();
+ while (p != export_state.end()) {
+ map<CDir*,export_state_t>::iterator next = p; // successor is saved up front: the entry at p may be erased below
+ ++next;
+ CDir *dir = p->first;
+
+ // abort exports:
+ // - that are going to the failed node
+ // - that aren't frozen yet (to avoid auth_pin deadlock)
+ // - they haven't prepped yet (they may need to discover bounds to do that)
+ if ((p->second.peer == who &&
+ p->second.state != EXPORT_CANCELLING) ||
+ p->second.state == EXPORT_LOCKING ||
+ p->second.state == EXPORT_DISCOVERING ||
+ p->second.state == EXPORT_FREEZING ||
+ p->second.state == EXPORT_PREPPING) {
+ // the guy i'm exporting to failed, or we're just freezing.
+ dout(10) << "cleaning up export state (" << p->second.state << ")"
+ << get_export_statename(p->second.state) << " of " << *dir << dendl;
+ export_try_cancel(dir);
+ } else if (p->second.peer != who) {
+ // bystander failed.
+ if (p->second.warning_ack_waiting.erase(who)) { // erase() returns non-zero only if `who` was being waited on
+ if (p->second.state == EXPORT_WARNING) {
+ p->second.notify_ack_waiting.erase(who); // they won't get a notify either.
+ // exporter waiting for warning acks, let's fake theirs.
+ dout(10) << "faking export_warning_ack from mds." << who
+ << " on " << *dir << " to mds." << p->second.peer
+ << dendl;
+ if (p->second.warning_ack_waiting.empty())
+ export_go(dir);
+ }
+ }
+ if (p->second.notify_ack_waiting.erase(who)) {
+ // exporter is waiting for notify acks, fake it
+ dout(10) << "faking export_notify_ack from mds." << who
+ << " on " << *dir << " to mds." << p->second.peer
+ << dendl;
+ if (p->second.state == EXPORT_NOTIFYING) {
+ if (p->second.notify_ack_waiting.empty())
+ export_finish(dir);
+ } else if (p->second.state == EXPORT_CANCELLING) {
+ if (p->second.notify_ack_waiting.empty()) {
+ export_cancel_finish(p); // erases p from export_state
+ }
+ }
+ }
+ }
+
+ // next!
+ p = next;
+ }
+
+
+ // check my imports
+ map<dirfrag_t,import_state_t>::iterator q = import_state.begin();
+ while (q != import_state.end()) {
+ map<dirfrag_t,import_state_t>::iterator next = q; // same pattern as the export loop: advance via a saved successor
+ ++next;
+ dirfrag_t df = q->first;
+ CInode *diri = mds->mdcache->get_inode(df.ino); // may be null; asserted in the states that need it
+ CDir *dir = mds->mdcache->get_dirfrag(df); // may be null; asserted in the states that need it
+
+ if (q->second.peer == who) {
+ if (dir)
+ dout(10) << "cleaning up import state (" << q->second.state << ")"
+ << get_import_statename(q->second.state) << " of " << *dir << dendl;
+ else
+ dout(10) << "cleaning up import state (" << q->second.state << ")"
+ << get_import_statename(q->second.state) << " of " << df << dendl;
+
+ switch (q->second.state) {
+ case IMPORT_DISCOVERING:
+ dout(10) << "import state=discovering : clearing state" << dendl;
+ import_reverse_discovering(df);
+ break;
+
+ case IMPORT_DISCOVERED:
+ ceph_assert(diri);
+ dout(10) << "import state=discovered : unpinning inode " << *diri << dendl;
+ import_reverse_discovered(df, diri);
+ break;
+
+ case IMPORT_PREPPING:
+ ceph_assert(dir);
+ dout(10) << "import state=prepping : unpinning base+bounds " << *dir << dendl;
+ import_reverse_prepping(dir, q->second);
+ break;
+
+ case IMPORT_PREPPED:
+ ceph_assert(dir);
+ dout(10) << "import state=prepped : unpinning base+bounds, unfreezing " << *dir << dendl;
+ {
+ set<CDir*> bounds;
+ cache->get_subtree_bounds(dir, bounds);
+ import_remove_pins(dir, bounds);
+
+ // adjust auth back to the exporter
+ cache->adjust_subtree_auth(dir, q->second.peer);
+
+ // notify bystanders ; wait in aborting state
+ q->second.state = IMPORT_ABORTING;
+ import_notify_abort(dir, bounds);
+ ceph_assert(g_conf()->mds_kill_import_at != 10);
+ }
+ break;
+
+ case IMPORT_LOGGINGSTART:
+ ceph_assert(dir);
+ dout(10) << "import state=loggingstart : reversing import on " << *dir << dendl;
+ import_reverse(dir);
+ break;
+
+ case IMPORT_ACKING:
+ ceph_assert(dir);
+ // hrm. make this an ambiguous import, and wait for exporter recovery to disambiguate
+ dout(10) << "import state=acking : noting ambiguous import " << *dir << dendl;
+ {
+ set<CDir*> bounds;
+ cache->get_subtree_bounds(dir, bounds);
+ cache->add_ambiguous_import(dir, bounds);
+ }
+ break;
+
+ case IMPORT_FINISHING:
+ ceph_assert(dir);
+ dout(10) << "import state=finishing : finishing import on " << *dir << dendl;
+ import_finish(dir, true);
+ break;
+
+ case IMPORT_ABORTING:
+ ceph_assert(dir);
+ dout(10) << "import state=aborting : ignoring repeat failure " << *dir << dendl;
+ break;
+ } // no default: states not listed above are left untouched
+ } else {
+ auto bystanders_entry = q->second.bystanders.find(who);
+ if (bystanders_entry != q->second.bystanders.end()) {
+ q->second.bystanders.erase(bystanders_entry);
+ if (q->second.state == IMPORT_ABORTING) {
+ ceph_assert(dir);
+ dout(10) << "faking export_notify_ack from mds." << who
+ << " on aborting import " << *dir << " from mds." << q->second.peer
+ << dendl;
+ if (q->second.bystanders.empty())
+ import_reverse_unfreeze(dir);
+ }
+ }
+ }
+
+ // next!
+ q = next;
+ }
+
+ while (!pinned_dirs.empty()) { // drop the temporary auth_pins taken at the top
+ CDir *dir = pinned_dirs.front();
+ dout(10) << "removing temp auth_pin on " << *dir << dendl;
+ dir->auth_unpin(this);
+ pinned_dirs.pop_front();
+ }
+ }
+
+
+
+ /*
+  * Log (at debug level 10) one line per entry of import_state: the
+  * exporting peer, the numeric state and its name, the dirfrag id and,
+  * when the dirfrag is present in the cache, the dirfrag itself.
+  */
+ void Migrator::show_importing()
+ {
+   dout(10) << "show_importing" << dendl;
+   for (const auto& entry : import_state) {
+     const auto& df = entry.first;
+     const auto& stat = entry.second;
+
+     CDir *dir = mds->mdcache->get_dirfrag(df);
+     if (!dir) {
+       // dirfrag not in cache: only the id can be shown
+       dout(10) << " importing from " << stat.peer
+                << ": (" << stat.state << ") " << get_import_statename(stat.state)
+                << " " << df << dendl;
+       continue;
+     }
+     dout(10) << " importing from " << stat.peer
+              << ": (" << stat.state << ") " << get_import_statename(stat.state)
+              << " " << df << " " << *dir << dendl;
+   }
+ }
+
+ /*
+  * Log (at debug level 10) one line per entry of export_state: the
+  * destination peer, the numeric state and its name, and the dirfrag.
+  */
+ void Migrator::show_exporting()
+ {
+   dout(10) << "show_exporting" << dendl;
+   for (const auto& entry : export_state) {
+     CDir *dir = entry.first;
+     const auto& stat = entry.second;
+     dout(10) << " exporting to " << stat.peer
+              << ": (" << stat.state << ") " << get_export_statename(stat.state)
+              << " " << dir->dirfrag() << " " << *dir << dendl;
+   }
+ }
+
+
+
+ /*
+  * Consistency check of the import and export tables.  Does nothing
+  * unless the mds subsystem gathers log output at level 5.
+  *
+  * Each table is dumped first and then every entry is checked against
+  * the dir_auth state its migration state implies; a violation trips a
+  * ceph_assert.
+  */
+ void Migrator::audit()
+ {
+   if (!g_conf()->subsys.should_gather<ceph_subsys_mds, 5>())
+     return;  // hrm.
+
+   // import_state
+   show_importing();
+   for (const auto& entry : import_state) {
+     const auto& df = entry.first;
+     const auto state = entry.second.state;
+
+     // nothing to check yet
+     if (state == IMPORT_DISCOVERING)
+       continue;
+
+     // only the inode is expected to exist at this point
+     if (state == IMPORT_DISCOVERED) {
+       CInode *in = cache->get_inode(df.ino);
+       ceph_assert(in);
+       continue;
+     }
+
+     // every later state requires the dirfrag itself
+     CDir *dir = cache->get_dirfrag(df);
+     ceph_assert(dir);
+
+     if (state == IMPORT_PREPPING)
+       continue;
+
+     if (state == IMPORT_ABORTING) {
+       ceph_assert(!dir->is_ambiguous_dir_auth());
+       ceph_assert(dir->get_dir_auth().first != mds->get_nodeid());
+       continue;
+     }
+
+     ceph_assert(dir->is_ambiguous_dir_auth());
+     ceph_assert(dir->authority().first == mds->get_nodeid() ||
+                 dir->authority().second == mds->get_nodeid());
+   }
+
+   // export_state
+   show_exporting();
+   for (const auto& entry : export_state) {
+     CDir *dir = entry.first;
+
+     switch (entry.second.state) {
+     case EXPORT_LOCKING:
+     case EXPORT_DISCOVERING:
+     case EXPORT_FREEZING:
+     case EXPORT_CANCELLING:
+       continue;  // these states are not checked; go to the next export
+     default:
+       break;
+     }
+
+     ceph_assert(dir->is_ambiguous_dir_auth());
+     ceph_assert(dir->authority().first == mds->get_nodeid() ||
+                 dir->authority().second == mds->get_nodeid());
+   }
+
+   // ambiguous+me subtrees should be importing|exporting
+
+   // write me
+ }
+
+
+
+
+
+// ==========================================================
+// EXPORT
+
+ /*
+  * Queue an export of `dir` to rank `dest` instead of starting it right
+  * away, then give the queue a chance to run.
+  */
+ void Migrator::export_dir_nicely(CDir *dir, mds_rank_t dest)
+ {
+   dout(7) << "export_dir_nicely " << *dir << " to " << dest << dendl;
+
+   // enqueue (the queue holds the dirfrag id, not the CDir pointer)
+   export_queue.emplace_back(dir->dirfrag(), dest);
+
+   maybe_do_queued_export();
+ }
+
+ /*
+  * Start queued exports for as long as the size budget allows.
+  *
+  * The budget is twice max_export_size.  An entry is taken from the
+  * queue only while the size not yet used by running exports still
+  * covers max_export_size for every export in the locking state plus
+  * the one about to start.  Queue entries whose dirfrag is no longer in
+  * the cache, or is no longer auth here, are dropped.
+  */
+ void Migrator::maybe_do_queued_export()
+ {
+   // guard against nested invocation while the loop below is running
+   static bool running;
+   if (running)
+     return;
+   running = true;
+
+   uint64_t max_total_size = max_export_size * 2;
+
+   while (!export_queue.empty()) {
+     // budget already used up?
+     if (max_total_size <= total_exporting_size)
+       break;
+     // not enough room left for one more export on top of the locking ones?
+     if (max_total_size - total_exporting_size <
+         max_export_size * (num_locking_exports + 1))
+       break;
+
+     const auto job = export_queue.front();
+     export_queue.pop_front();
+
+     CDir *dir = mds->mdcache->get_dirfrag(job.first);
+     if (!dir || !dir->is_auth())
+       continue;
+
+     dout(0) << "nicely exporting to mds." << job.second << " " << *dir << dendl;
+
+     export_dir(dir, job.second);
+   }
+
+   running = false;
+ }
+
+
+
+
+ /*
+  * Waiter context that calls Migrator::export_frozen() for a dirfrag
+  * when it completes with a non-negative result.  The dirfrag is kept
+  * pinned (PIN_PTRWAITER) for the lifetime of the wait.
+  */
+ class C_MDC_ExportFreeze : public MigratorContext {
+   CDir *exporting;      // dir i'm exporting
+   uint64_t export_tid;  // tid of the export this waiter belongs to
+ public:
+   C_MDC_ExportFreeze(Migrator *m, CDir *d, uint64_t t)
+     : MigratorContext(m), exporting(d), export_tid(t)
+   {
+     exporting->get(CDir::PIN_PTRWAITER);
+   }
+
+   void finish(int r) override
+   {
+     // a negative result means the wait did not succeed: just drop the pin
+     if (r >= 0) {
+       mig->export_frozen(exporting, export_tid);
+     }
+     exporting->put(CDir::PIN_PTRWAITER);
+   }
+ };
+
+
+ /*
+  * Fill `lov` with the rdlocks an export of `dir` needs:
+  *  - the dentry lock of every dentry on the trace to dir's inode,
+  *  - the dirfragtreelock of dir's inode,
+  *  - the dirfragtreelock of the inode of every would-be subtree bound.
+  * Duplicates are merged before returning.
+  */
+ void Migrator::get_export_lock_set(CDir *dir, MutationImpl::LockOpVec& lov)
+ {
+   // path
+   vector<CDentry*> path_dentries;
+   cache->make_trace(path_dentries, dir->inode);
+
+   set<CDir*> bounds;
+   cache->get_wouldbe_subtree_bounds(dir, bounds);
+
+   lov.reserve(path_dentries.size() + bounds.size() + 8);
+
+   for (CDentry *dn : path_dentries)
+     lov.add_rdlock(&dn->lock);
+
+   // prevent scatter gather race
+   lov.add_rdlock(&dir->get_inode()->dirfragtreelock);
+
+   // bound dftlocks:
+   // NOTE: We need to take an rdlock on bounding dirfrags during
+   //  migration for a rather irritating reason: when we export the
+   //  bound inode, we need to send scatterlock state for the dirfrags
+   //  as well, so that the new auth also gets the correct info.  If we
+   //  race with a refragment, this info is useless, as we can't
+   //  redivvy it up.  And it's needed for the scatterlocks to work
+   //  properly: when the auth is in a sync/lock state it keeps each
+   //  dirfrag's portion in the local (auth OR replica) dirfrag.
+   for (CDir *bound : bounds)
+     lov.add_rdlock(&bound->get_inode()->dirfragtreelock);
+
+   // above code may add duplicated locks
+   lov.sort_and_merge();
+ }
+
+
+ /** export_dir(dir, dest)
+  * public method to initiate an export of `dir` to rank `dest`.
+  *
+  * Returns without starting anything (after logging why) when the dir
+  * is export pinned, this MDS is neither active nor stopping, the FS is
+  * read-only, dest is not active, the cluster is degraded, the dir is a
+  * system dir or lives in a stray dir not belonging to dest, or the dir
+  * is freezing, frozen or already exporting.
+  *
+  * Otherwise the dir is auth-pinned and marked exporting, an internal
+  * EXPORTDIR request is created and dispatched, and an export_state
+  * entry in EXPORT_LOCKING is recorded for it.
+  */
+ void Migrator::export_dir(CDir *dir, mds_rank_t dest)
+ {
+   dout(7) << "export_dir " << *dir << " to " << dest << dendl;
+   ceph_assert(dir->is_auth());
+   ceph_assert(dest != mds->get_nodeid());
+
+   if (!mds->is_stopping() && !dir->inode->is_exportable(dest)) {
+     dout(25) << "dir is export pinned" << dendl;
+     return;
+   }
+
+   const bool active_or_stopping = mds->is_active() || mds->is_stopping();
+   if (!active_or_stopping) {
+     dout(7) << "i'm not active, no exports for now" << dendl;
+     return;
+   }
+   if (mds->mdcache->is_readonly()) {
+     dout(7) << "read-only FS, no exports for now" << dendl;
+     return;
+   }
+   if (!mds->mdsmap->is_active(dest)) {
+     dout(7) << "dest not active, no exports for now" << dendl;
+     return;
+   }
+   if (mds->is_cluster_degraded()) {
+     dout(7) << "cluster degraded, no exports for now" << dendl;
+     return;
+   }
+   if (dir->inode->is_system()) {
+     dout(7) << "i won't export system dirs (root, mdsdirs, stray, /.ceph, etc.)" << dendl;
+     //ceph_abort();
+     return;
+   }
+
+   // a dir under a stray dir may only go to the rank owning that mdsdir
+   CDir *pdir = dir->inode->get_projected_parent_dir();
+   if (pdir && pdir->inode->is_stray() &&
+       pdir->get_parent_dir()->ino() != MDS_INO_MDSDIR(dest)) {
+     dout(7) << "i won't export anything in stray" << dendl;
+     return;
+   }
+
+   if (dir->is_frozen() || dir->is_freezing()) {
+     dout(7) << " can't export, freezing|frozen. wait for other exports to finish first." << dendl;
+     return;
+   }
+   if (dir->state_test(CDir::STATE_EXPORTING)) {
+     dout(7) << "already exporting" << dendl;
+     return;
+   }
+
+   if (g_conf()->mds_thrash_exports) {
+     // create random subtree bound (which will not be exported)
+     list<CDir*> ls;
+     for (auto& p : *dir) {
+       CDentry::linkage_t *dnl = p.second->get_linkage();
+       if (!dnl->is_primary())
+         continue;
+       CInode *in = dnl->get_inode();
+       if (in->is_dir())
+         in->get_nested_dirfrags(ls);
+     }
+     if (!ls.empty()) {
+       // walk to a randomly chosen candidate
+       auto pick = ls.begin();
+       for (int n = rand() % ls.size(); n > 0; --n)
+         ++pick;
+       CDir *bd = *pick;
+       if (!(bd->is_frozen() || bd->is_freezing())) {
+         ceph_assert(bd->is_auth());
+         // NOTE(review): the flag and the auth adjustment are applied to
+         // `dir`, while the log line names `bd` as the new aux subtree --
+         // confirm which of the two is intended.
+         dir->state_set(CDir::STATE_AUXSUBTREE);
+         mds->mdcache->adjust_subtree_auth(dir, mds->get_nodeid());
+         dout(0) << "export_dir: create aux subtree " << *bd << " under " << *dir << dendl;
+       }
+     }
+   }
+
+   mds->hit_export_target(dest, -1);
+
+   dir->auth_pin(this);
+   dir->mark_exporting();
+
+   MDRequestRef mdr = mds->mdcache->request_start_internal(CEPH_MDS_OP_EXPORTDIR);
+   mdr->more()->export_dir = dir;
+   mdr->pin(dir);
+
+   // register the export; the request's tid doubles as the export tid
+   ceph_assert(export_state.count(dir) == 0);
+   export_state_t& stat = export_state[dir];
+   num_locking_exports++;
+   stat.state = EXPORT_LOCKING;
+   stat.peer = dest;
+   stat.tid = mdr->reqid.tid;
+   stat.mut = mdr;
+
+   mds->mdcache->dispatch_request(mdr);
+ }
+
+/*
+ * check if directory is too large to be export in whole. If it is,
+ * choose some subdirs, whose total size is suitable.
+ */
+void Migrator::maybe_split_export(CDir* dir, uint64_t max_size, bool null_okay,
+ vector<pair<CDir*, size_t> >& results)
+{
+ static const unsigned frag_size = 800;
+ static const unsigned inode_size = 1000;
+ static const unsigned cap_size = 80;
+ static const unsigned remote_size = 10;
+ static const unsigned null_size = 1;
+
+ // state for depth-first search
+ struct LevelData {
+ CDir *dir;
+ CDir::dentry_key_map::iterator iter;
+ size_t dirfrag_size = frag_size;
+ size_t subdirs_size = 0;
+ bool complete = true;
+ vector<CDir*> siblings;
+ vector<pair<CDir*, size_t> > subdirs;
+ LevelData(const LevelData&) = default;
+ LevelData(CDir *d) :
+ dir(d), iter(d->begin()) {}
+ };
+
+ vector<LevelData> stack;
+ stack.emplace_back(dir);
+
+ size_t found_size = 0;
+ size_t skipped_size = 0;
+
+ for (;;) {
+ auto& data = stack.back();
+ CDir *cur = data.dir;
+ auto& it = data.iter;
+ auto& dirfrag_size = data.dirfrag_size;
+
+ while(it != cur->end()) {
+ CDentry *dn = it->second;
+ ++it;
+
+ dirfrag_size += dn->name.size();
+ if (dn->get_linkage()->is_null()) {
+ dirfrag_size += null_size;
+ continue;
+ }
+ if (dn->get_linkage()->is_remote()) {
+ dirfrag_size += remote_size;
+ continue;
+ }
+
+ CInode *in = dn->get_linkage()->get_inode();
+ dirfrag_size += inode_size;
+ dirfrag_size += in->get_client_caps().size() * cap_size;
+
+ if (in->is_dir()) {
+ vector<CDir*> ls;
+ in->get_nested_dirfrags(ls);
+ std::reverse(ls.begin(), ls.end());
+
+ bool complete = true;
+ for (auto p = ls.begin(); p != ls.end(); ) {
+ if ((*p)->state_test(CDir::STATE_EXPORTING) ||
+ (*p)->is_freezing_dir() || (*p)->is_frozen_dir()) {
+ complete = false;
+ p = ls.erase(p);
+ } else {
+ ++p;
+ }
+ }
+ if (!complete) {
+ // skip exporting dir's ancestors. because they can't get
+ // frozen (exporting dir's parent inode is auth pinned).
+ for (auto p = stack.rbegin(); p < stack.rend(); ++p) {
+ if (!p->complete)
+ break;
+ p->complete = false;
+ }
+ }
+ if (!ls.empty()) {
+ stack.emplace_back(ls.back());
+ ls.pop_back();
+ stack.back().siblings.swap(ls);
+ break;
+ }
+ }
+ }
+ // did above loop push new dirfrag into the stack?
+ if (stack.back().dir != cur)
+ continue;
+
+ if (data.complete) {
+ auto cur_size = data.subdirs_size + dirfrag_size;
+ // we can do nothing with large dirfrag
+ if (cur_size >= max_size && found_size * 2 > max_size)
+ break;
+
+ found_size += dirfrag_size;
+
+ if (stack.size() > 1) {
+ auto& parent = stack[stack.size() - 2];
+ parent.subdirs.emplace_back(cur, cur_size);
+ parent.subdirs_size += cur_size;
+ }
+ } else {
+ // can't merge current dirfrag to its parent if there is skipped subdir
+ results.insert(results.end(), data.subdirs.begin(), data.subdirs.end());
+ skipped_size += dirfrag_size;
+ }
+
+ vector<CDir*> ls;
+ ls.swap(data.siblings);
+
+ stack.pop_back();
+ if (stack.empty())
+ break;
+
+ if (found_size >= max_size)
+ break;
+
+ // next dirfrag
+ if (!ls.empty()) {
+ stack.emplace_back(ls.back());
+ ls.pop_back();
+ stack.back().siblings.swap(ls);
+ }
+ }
+
+ for (auto& p : stack)
+ results.insert(results.end(), p.subdirs.begin(), p.subdirs.end());
+
+ if (results.empty() && (!skipped_size || !null_okay))
+ results.emplace_back(dir, found_size + skipped_size);
+}
+
+ /*
+  * Waiter context that re-enters Migrator::dispatch_export_dir() with
+  * the stored request and retry count when it completes.  The result
+  * code passed to finish() is ignored.
+  */
+ class C_M_ExportDirWait : public MigratorContext {
+   MDRequestRef req;  // the internal export request to re-dispatch
+   int attempt;       // value handed on as dispatch_export_dir()'s `count`
+ public:
+   C_M_ExportDirWait(Migrator *m, MDRequestRef mdr, int count)
+     : MigratorContext(m), req(mdr), attempt(count)
+   {}
+
+   void finish(int r) override
+   {
+     mig->dispatch_export_dir(req, attempt);
+   }
+ };
+
+ void Migrator::dispatch_export_dir(MDRequestRef& mdr, int count)
+ { /*
+    * Drive the EXPORT_LOCKING stage of the export attached to `mdr`.
+    *
+    * mdr   - internal export request; mdr->more()->export_dir is the
+    *         dirfrag being exported
+    * count - retry counter used while waiting for the destination to
+    *         become an export target
+    *
+    * Outcomes, in the order they are checked:
+    *  - no export_state entry with this request's tid: the export is
+    *    treated as aborted (the request is killed if flagged aborted);
+    *  - destination not yet an export target: drop locks and retry on a
+    *    later mdsmap, or cancel once count exceeds 3;
+    *  - dir's inode has no parent dentry yet: wait for WAIT_CREATED;
+    *  - request aborted, or dir freezing/frozen: cancel;
+    *  - locks not acquired: return (cancel if the request was aborted);
+    *  - maybe_split_export() returns just `dir`: move to
+    *    EXPORT_DISCOVERING, send the discover message and start the
+    *    freeze;
+    *  - otherwise: start one sub-export per returned dirfrag (possibly
+    *    none) and cancel this export.
+    */
+ CDir *dir = mdr->more()->export_dir;
+ dout(7) << "dispatch_export_dir " << *mdr << " " << *dir << dendl;
+
+ map<CDir*,export_state_t>::iterator it = export_state.find(dir);
+ if (it == export_state.end() || it->second.tid != mdr->reqid.tid) {
+ // export must have aborted.
+ dout(7) << "export must have aborted " << *mdr << dendl;
+ ceph_assert(mdr->killed || mdr->aborted);
+ if (mdr->aborted) {
+ mdr->aborted = false;
+ mds->mdcache->request_kill(mdr);
+ }
+ return;
+ }
+ ceph_assert(it->second.state == EXPORT_LOCKING);
+
+ mds_rank_t dest = it->second.peer;
+
+ if (!mds->is_export_target(dest)) {
+ dout(7) << "dest is not yet an export target" << dendl;
+ if (count > 3) {
+ dout(5) << "dest has not been added as export target after three MDSMap epochs, canceling export" << dendl;
+ export_try_cancel(dir);
+ return;
+ }
+
+ mds->locker->drop_locks(mdr.get());
+ mdr->drop_local_auth_pins();
+
+ mds->wait_for_mdsmap(mds->mdsmap->get_epoch(), new C_M_ExportDirWait(this, mdr, count+1)); // retry with an incremented counter
+ return;
+ }
+
+ if (!dir->inode->get_parent_dn()) {
+ dout(7) << "waiting for dir to become stable before export: " << *dir << dendl;
+ dir->add_waiter(CDir::WAIT_CREATED, new C_M_ExportDirWait(this, mdr, 1)); // note: the retry counter restarts at 1 here
+ return;
+ }
+
+ if (mdr->aborted || dir->is_frozen() || dir->is_freezing()) {
+ dout(7) << "wouldblock|freezing|frozen, canceling export" << dendl;
+ export_try_cancel(dir);
+ return;
+ }
+
+ // locks?
+ MutationImpl::LockOpVec lov;
+ get_export_lock_set(dir, lov);
+ // If auth MDS of the subtree root inode is neither the exporter MDS
+ // nor the importer MDS and it gathers subtree root's fragstat/neststat
+ // while the subtree is exporting. It's possible that the exporter MDS
+ // and the importer MDS both are auth MDS of the subtree root or both
+ // are not auth MDS of the subtree root at the time they receive the
+ // lock messages. So the auth MDS of the subtree root inode may get no
+ // or duplicated fragstat/neststat for the subtree root dirfrag.
+ lov.add_wrlock(&dir->get_inode()->filelock);
+ lov.add_wrlock(&dir->get_inode()->nestlock);
+ if (dir->get_inode()->is_auth()) {
+ dir->get_inode()->filelock.set_scatter_wanted();
+ dir->get_inode()->nestlock.set_scatter_wanted();
+ }
+
+ if (!mds->locker->acquire_locks(mdr, lov, NULL, true)) {
+ if (mdr->aborted)
+ export_try_cancel(dir);
+ return;
+ }
+
+ ceph_assert(g_conf()->mds_kill_export_at != 1);
+
+ auto parent = it->second.parent; // non-null when this export is itself a piece of a split export
+
+ vector<pair<CDir*, size_t> > results;
+ maybe_split_export(dir, max_export_size, (bool)parent, results);
+
+ if (results.size() == 1 && results.front().first == dir) { // no split needed: export `dir` as a whole
+ num_locking_exports--;
+ it->second.state = EXPORT_DISCOVERING;
+ // send ExportDirDiscover (ask target)
+ filepath path;
+ dir->inode->make_path(path);
+ auto discover = MExportDirDiscover::create(dir->dirfrag(), path,
+ mds->get_nodeid(), it->second.tid);
+ mds->send_message_mds(discover, dest);
+ ceph_assert(g_conf()->mds_kill_export_at != 2);
+
+ it->second.last_cum_auth_pins_change = ceph_clock_now();
+ it->second.approx_size = results.front().second;
+ total_exporting_size += it->second.approx_size;
+
+ // start the freeze, but hold it up with an auth_pin.
+ dir->freeze_tree();
+ ceph_assert(dir->is_freezing_tree());
+ dir->add_waiter(CDir::WAIT_FROZEN, new C_MDC_ExportFreeze(this, dir, it->second.tid));
+ return;
+ }
+
+ if (parent) {
+ parent->pending_children += results.size();
+ } else {
+ parent = std::make_shared<export_base_t>(dir->dirfrag(), dest,
+ results.size(), export_queue_gen);
+ }
+
+ if (results.empty()) {
+ dout(7) << "subtree's children all are under exporting, retry rest parts of parent export "
+ << parent->dirfrag << dendl;
+ parent->restart = true;
+ } else {
+ dout(7) << "subtree is too large, splitting it into: " << dendl;
+ }
+
+ for (auto& p : results) { // same set-up sequence as export_dir(), plus the shared parent record
+ CDir *sub = p.first;
+ ceph_assert(sub != dir);
+ dout(7) << " sub " << *sub << dendl;
+
+ sub->auth_pin(this);
+ sub->mark_exporting();
+
+ MDRequestRef _mdr = mds->mdcache->request_start_internal(CEPH_MDS_OP_EXPORTDIR);
+ _mdr->more()->export_dir = sub;
+ _mdr->pin(sub);
+
+ ceph_assert(export_state.count(sub) == 0);
+ auto& stat = export_state[sub];
+ num_locking_exports++;
+ stat.state = EXPORT_LOCKING;
+ stat.peer = dest;
+ stat.tid = _mdr->reqid.tid;
+ stat.mut = _mdr;
+ stat.parent = parent;
+ mds->mdcache->dispatch_request(_mdr);
+ }
+
+ // cancel the original one
+ export_try_cancel(dir);
+ }
+
+ /*
+  * Account for one finished piece of a split export.
+  *
+  * A successful piece flags the parent for restart.  When the last
+  * pending piece is done, the parent is flagged for restart and the
+  * export queue generation is unchanged, the original dirfrag is put
+  * back at the front of the export queue -- provided it is still in the
+  * cache and still auth here.
+  */
+ void Migrator::child_export_finish(std::shared_ptr<export_base_t>& parent, bool success)
+ {
+   if (success)
+     parent->restart = true;
+
+   // still waiting for other pieces
+   if (--parent->pending_children != 0)
+     return;
+
+   if (!parent->restart || parent->export_queue_gen != export_queue_gen)
+     return;
+
+   CDir *origin = mds->mdcache->get_dirfrag(parent->dirfrag);
+   if (!origin || !origin->is_auth())
+     return;
+
+   dout(7) << "child_export_finish requeue " << *origin << dendl;
+   export_queue.emplace_front(origin->dirfrag(), parent->dest);
+ }
+
+/*
+ * called on receipt of MExportDirDiscoverAck
+ * the importer now has the directory's _inode_ in memory, and pinned.
+ */
+void Migrator::handle_export_discover_ack(const MExportDirDiscoverAck::const_ref &m)
+{
+ CDir *dir = cache->get_dirfrag(m->get_dirfrag());
+ mds_rank_t dest(m->get_source().num());
+ ceph_assert(dir);
+
+ dout(7) << "export_discover_ack from " << m->get_source()
+ << " on " << *dir << dendl;
+
+ mds->hit_export_target(dest, -1);
+
+ map<CDir*,export_state_t>::iterator it = export_state.find(dir);
+ if (it == export_state.end() ||
+ it->second.tid != m->get_tid() ||
+ it->second.peer != dest) {
+ dout(7) << "must have aborted" << dendl;
+ } else {
+ ceph_assert(it->second.state == EXPORT_DISCOVERING);
+
+ if (m->is_success()) {
+ // release locks to avoid deadlock
+ MDRequestRef mdr = static_cast<MDRequestImpl*>(it->second.mut.get());
+ ceph_assert(mdr);
+ mds->mdcache->request_finish(mdr);
+ it->second.mut.reset();
+ // freeze the subtree
+ it->second.state = EXPORT_FREEZING;
+ dir->auth_unpin(this);
+ ceph_assert(g_conf()->mds_kill_export_at != 3);
+
+ } else {
+ dout(7) << "peer failed to discover (not active?), canceling" << dendl;
+ export_try_cancel(dir, false);
+ }
+ }
+}
+
+ /*
+  * Completion context that calls Migrator::export_sessions_flushed()
+  * for a dirfrag.  The dirfrag is kept pinned (PIN_PTRWAITER) until the
+  * context has run; the result code is ignored.
+  */
+ class C_M_ExportSessionsFlushed : public MigratorContext {
+   CDir *exporting;      // dirfrag whose export is waiting for the flush
+   uint64_t export_tid;  // tid of that export
+ public:
+   C_M_ExportSessionsFlushed(Migrator *m, CDir *d, uint64_t t)
+     : MigratorContext(m), exporting(d), export_tid(t)
+   {
+     exporting->get(CDir::PIN_PTRWAITER);
+   }
+
+   void finish(int r) override
+   {
+     mig->export_sessions_flushed(exporting, export_tid);
+     exporting->put(CDir::PIN_PTRWAITER);
+   }
+ };
+
+ /*
+  * Called when the client-session flush requested for the export of
+  * `dir` has completed.
+  *
+  * dir - dirfrag being exported
+  * tid - tid of the export the flush belonged to
+  *
+  * Does nothing if the export is gone, is being cancelled, or carries a
+  * different tid.  Otherwise the MDS_RANK_NONE placeholder is removed
+  * from warning_ack_waiting and, if the export is in EXPORT_WARNING and
+  * nothing else is being waited on, the export proceeds.
+  */
+ void Migrator::export_sessions_flushed(CDir *dir, uint64_t tid)
+ {
+   dout(7) << "export_sessions_flushed " << *dir << dendl;
+
+   map<CDir*,export_state_t>::iterator it = export_state.find(dir);
+   if (it == export_state.end() ||
+       it->second.state == EXPORT_CANCELLING ||
+       it->second.tid != tid) {
+     // export must have aborted.
+     // (log the dirfrag itself, not the pointer value)
+     dout(7) << "export must have aborted on " << *dir << dendl;
+     return;
+   }
+
+   ceph_assert(it->second.state == EXPORT_PREPPING || it->second.state == EXPORT_WARNING);
+   // the placeholder stands for "session flush still outstanding"
+   ceph_assert(it->second.warning_ack_waiting.count(MDS_RANK_NONE) > 0);
+   it->second.warning_ack_waiting.erase(MDS_RANK_NONE);
+   if (it->second.state == EXPORT_WARNING && it->second.warning_ack_waiting.empty())
+     export_go(dir);  // start export.
+ }
+
+ /*
+  * Continue the export of `dir` once its subtree is frozen.
+  *
+  * dir - root dirfrag of the export
+  * tid - tid of the export that requested the freeze; if the current
+  *       export_state entry is missing or has another tid the call is
+  *       ignored
+  *
+  * Steps:
+  *  1. Check that all export locks can be taken right now; if not (or
+  *     if dir's inode is auth and frozen) the export is cancelled.
+  *  2. Take the locks into a fresh mutation stored in the export state.
+  *  3. Pin and flag every subtree bound, and build the MExportDirPrep
+  *     message: bystanders, the base dirfrag, and one trace per bound.
+  *  4. Switch to EXPORT_PREPPING and send the message to the peer.
+  *  5. Flush the sessions of all clients holding caps in the subtree.
+  *     If there is anything to wait for, MDS_RANK_NONE is inserted into
+  *     warning_ack_waiting as a placeholder until the flush completes.
+  */
+ void Migrator::export_frozen(CDir *dir, uint64_t tid)
+ {
+   dout(7) << "export_frozen on " << *dir << dendl;
+
+   map<CDir*,export_state_t>::iterator it = export_state.find(dir);
+   if (it == export_state.end() || it->second.tid != tid) {
+     dout(7) << "export must have aborted" << dendl;
+     return;
+   }
+
+   ceph_assert(it->second.state == EXPORT_FREEZING);
+   ceph_assert(dir->is_frozen_tree_root());
+
+   CInode *diri = dir->get_inode();
+
+   // ok, try to grab all my locks.
+   MutationImpl::LockOpVec lov;
+   get_export_lock_set(dir, lov);
+   if ((diri->is_auth() && diri->is_frozen()) ||
+       !mds->locker->can_rdlock_set(lov) ||
+       !diri->filelock.can_wrlock(-1) ||
+       !diri->nestlock.can_wrlock(-1)) {
+     dout(7) << "export_dir couldn't acquire all needed locks, failing. "
+             << *dir << dendl;
+     export_try_cancel(dir);
+     return;
+   }
+
+   it->second.mut = new MutationImpl();
+   if (diri->is_auth())
+     it->second.mut->auth_pin(diri);
+   mds->locker->rdlock_take_set(lov, it->second.mut);
+   mds->locker->wrlock_force(&diri->filelock, it->second.mut);
+   mds->locker->wrlock_force(&diri->nestlock, it->second.mut);
+
+   cache->show_subtrees();
+
+   // CDir::_freeze_tree() should have forced it into subtree.
+   ceph_assert(dir->get_dir_auth() == mds_authority_t(mds->get_nodeid(), mds->get_nodeid()));
+   // note the bounds.
+   set<CDir*> bounds;
+   cache->get_subtree_bounds(dir, bounds);
+
+   // generate prep message, log entry.
+   auto prep = MExportDirPrep::create(dir->dirfrag(), it->second.tid);
+
+   // include list of bystanders
+   for (const auto &p : dir->get_replicas()) {
+     if (p.first != it->second.peer) {
+       dout(10) << "bystander mds." << p.first << dendl;
+       prep->add_bystander(p.first);
+     }
+   }
+
+   // include base dirfrag
+   cache->replicate_dir(dir, it->second.peer, prep->basedir);
+
+   /*
+    * include spanning tree for all nested exports.
+    * these need to be on the destination _before_ the final export so that
+    * dir_auth updates on any nested exports are properly absorbed.
+    * this includes inodes and dirfrags included in the subtree, but
+    * only the inodes at the bounds.
+    *
+    * each trace is: df ('-' | ('f' dir | 'd') dentry inode (dir dentry inode)*)
+    */
+   set<inodeno_t> inodes_added;
+   set<dirfrag_t> dirfrags_added;
+
+   // check bounds
+   for (set<CDir*>::iterator p = bounds.begin();
+        p != bounds.end();
+        ++p) {
+     CDir *bound = *p;
+
+     // pin it.
+     bound->get(CDir::PIN_EXPORTBOUND);
+     bound->state_set(CDir::STATE_EXPORTBOUND);
+
+     dout(7) << " export bound " << *bound << dendl;
+     prep->add_bound( bound->dirfrag() );
+
+     // trace to bound; built back to front by prepending at each step
+     bufferlist tracebl;
+     CDir *cur = bound;
+
+     char start = '-';
+     while (1) {
+       // don't repeat inodes
+       if (inodes_added.count(cur->inode->ino()))
+         break;
+       inodes_added.insert(cur->inode->ino());
+
+       // prepend dentry + inode
+       ceph_assert(cur->inode->is_auth());
+       bufferlist bl;
+       cache->replicate_dentry(cur->inode->parent, it->second.peer, bl);
+       dout(7) << " added " << *cur->inode->parent << dendl;
+       cache->replicate_inode(cur->inode, it->second.peer, bl,
+                              mds->mdsmap->get_up_features());
+       dout(7) << " added " << *cur->inode << dendl;
+       bl.claim_append(tracebl);
+       tracebl.claim(bl);
+
+       cur = cur->get_parent_dir();
+
+       // don't repeat dirfrags
+       if (dirfrags_added.count(cur->dirfrag()) ||
+           cur == dir) {
+         start = 'd';  // start with dentry
+         break;
+       }
+       dirfrags_added.insert(cur->dirfrag());
+
+       // prepend dir
+       cache->replicate_dir(cur, it->second.peer, bl);
+       dout(7) << " added " << *cur << dendl;
+       bl.claim_append(tracebl);
+       tracebl.claim(bl);
+
+       start = 'f';  // start with dirfrag
+     }
+     // header of the trace: the dirfrag it hangs off and the start marker
+     bufferlist final_bl;
+     dirfrag_t df = cur->dirfrag();
+     encode(df, final_bl);
+     encode(start, final_bl);
+     final_bl.claim_append(tracebl);
+     prep->add_trace(final_bl);
+   }
+
+   // send.
+   it->second.state = EXPORT_PREPPING;
+   mds->send_message_mds(prep, it->second.peer);
+   // kill point for fault injection; ceph_assert, like the other kill
+   // points, so that it is not compiled out when NDEBUG is defined
+   ceph_assert(g_conf()->mds_kill_export_at != 4);
+
+   // make sure any new instantiations of caps are flushed out
+   ceph_assert(it->second.warning_ack_waiting.empty());
+
+   set<client_t> export_client_set;
+   get_export_client_set(dir, export_client_set);
+
+   MDSGatherBuilder gather(g_ceph_context);
+   mds->server->flush_client_sessions(export_client_set, gather);
+   if (gather.has_subs()) {
+     // placeholder entry, removed again by export_sessions_flushed()
+     it->second.warning_ack_waiting.insert(MDS_RANK_NONE);
+     gather.set_finisher(new C_M_ExportSessionsFlushed(this, dir, it->second.tid));
+     gather.activate();
+   }
+ }
+
+ /*
+  * Collect into `client_set` every client holding caps on an inode
+  * that is primary-linked inside the export rooted at `dir`.
+  *
+  * Nested dirfrags are visited breadth-first; dirfrags flagged
+  * STATE_EXPORTBOUND are not descended into.
+  */
+ void Migrator::get_export_client_set(CDir *dir, set<client_t>& client_set)
+ {
+   deque<CDir*> pending;
+   pending.push_back(dir);
+
+   while (!pending.empty()) {
+     CDir *cur = pending.front();
+     pending.pop_front();
+
+     for (auto& p : *cur) {
+       CDentry *dn = p.second;
+       if (!dn->get_linkage()->is_primary())
+         continue;
+
+       CInode *in = dn->get_linkage()->get_inode();
+       if (in->is_dir()) {
+         // directory?
+         vector<CDir*> ls;
+         in->get_dirfrags(ls);
+         for (auto& q : ls) {
+           if (q->state_test(CDir::STATE_EXPORTBOUND))
+             continue;
+           // include nested dirfrag
+           ceph_assert(q->get_dir_auth().first == CDIR_AUTH_PARENT);
+           pending.push_back(q);  // it's ours, recurse (later)
+         }
+       }
+
+       for (auto& cap : in->get_client_caps())
+         client_set.insert(cap.first);
+     }
+   }
+ }
+
+ /*
+  * Add every client that holds caps on inode `in` to `client_set`.
+  */
+ void Migrator::get_export_client_set(CInode *in, set<client_t>& client_set)
+ {
+   const auto& caps = in->get_client_caps();
+   for (auto cap = caps.begin(); cap != caps.end(); ++cap)
+     client_set.insert(cap->first);
+ }
+
+ /**
+  * Handle the importer's MExportDirPrepAck.
+  *
+  * The message is dropped when no matching export (same dirfrag, tid and
+  * peer) is in progress.  A failed ack cancels the export.  Otherwise a
+  * warning MExportDirNotify is sent to each replica of the dirfrag other
+  * than the importer (skipping ranks that are not
+  * clientreplay/active/stopping while the cluster is degraded), the state
+  * becomes EXPORT_WARNING, and export_go() is called immediately if there
+  * is nobody to wait for.
+  *
+  * The kill point uses ceph_assert like the other kill points in this
+  * file; plain assert() may be compiled out when NDEBUG is defined.
+  */
+ void Migrator::handle_export_prep_ack(const MExportDirPrepAck::const_ref &m)
+ {
+   CDir *dir = cache->get_dirfrag(m->get_dirfrag());
+   mds_rank_t dest(m->get_source().num());
+   ceph_assert(dir);
+
+   dout(7) << "export_prep_ack " << *dir << dendl;
+
+   mds->hit_export_target(dest, -1);
+
+   map<CDir*,export_state_t>::iterator it = export_state.find(dir);
+   if (it == export_state.end() ||
+       it->second.tid != m->get_tid() ||
+       it->second.peer != dest) {
+     // export must have aborted.
+     dout(7) << "export must have aborted" << dendl;
+     return;
+   }
+   ceph_assert(it->second.state == EXPORT_PREPPING);
+
+   if (!m->is_success()) {
+     dout(7) << "peer couldn't acquire all needed locks or wasn't active, canceling" << dendl;
+     export_try_cancel(dir, false);
+     return;
+   }
+
+   ceph_assert(g_conf()->mds_kill_export_at != 5);
+   // send warnings
+   set<CDir*> bounds;
+   cache->get_subtree_bounds(dir, bounds);
+
+   // the only entry that may already be pending is the MDS_RANK_NONE placeholder
+   ceph_assert(it->second.warning_ack_waiting.empty() ||
+               (it->second.warning_ack_waiting.size() == 1 &&
+                it->second.warning_ack_waiting.count(MDS_RANK_NONE) > 0));
+   ceph_assert(it->second.notify_ack_waiting.empty());
+
+   for (const auto &p : dir->get_replicas()) {
+     if (p.first == it->second.peer) continue;
+     if (mds->is_cluster_degraded() &&
+         !mds->mdsmap->is_clientreplay_or_active_or_stopping(p.first))
+       continue;  // only if active
+     it->second.warning_ack_waiting.insert(p.first);
+     it->second.notify_ack_waiting.insert(p.first);  // we'll eventually get a notifyack, too!
+
+     auto notify = MExportDirNotify::create(dir->dirfrag(), it->second.tid, true,
+                                            mds_authority_t(mds->get_nodeid(),CDIR_AUTH_UNKNOWN),
+                                            mds_authority_t(mds->get_nodeid(),it->second.peer));
+     for (auto &cdir : bounds) {
+       notify->get_bounds().push_back(cdir->dirfrag());
+     }
+     mds->send_message_mds(notify, p.first);
+   }
+
+   it->second.state = EXPORT_WARNING;
+
+   ceph_assert(g_conf()->mds_kill_export_at != 6);
+   // nobody to warn?
+   if (it->second.warning_ack_waiting.empty())
+     export_go(dir);  // start export.
+ }
+
+
+ /**
+  * Completion context that calls Migrator::export_go_synced() for a dirfrag.
+  * The dirfrag is pinned with PIN_PTRWAITER from construction until
+  * finish() has run.
+  */
+ class C_M_ExportGo : public MigratorContext {
+   CDir *dir;
+   uint64_t tid;
+
+ public:
+   C_M_ExportGo(Migrator *migrator, CDir *export_dir, uint64_t export_tid)
+     : MigratorContext(migrator),
+       dir(export_dir),
+       tid(export_tid)
+   {
+     dir->get(CDir::PIN_PTRWAITER);
+   }
+
+   void finish(int r) override
+   {
+     mig->export_go_synced(dir, tid);
+     dir->put(CDir::PIN_PTRWAITER);
+   }
+ };
+
+ /**
+  * Start the export of dir: queue a C_M_ExportGo on the journal's
+  * wait_for_safe list and flush the journal.  An export_state entry for dir
+  * must exist.
+  */
+ void Migrator::export_go(CDir *dir)
+ {
+   auto entry = export_state.find(dir);
+   ceph_assert(entry != export_state.end());
+   const export_state_t& stat = entry->second;
+   dout(7) << "export_go " << *dir << " to " << stat.peer << dendl;
+
+   // first sync log to flush out e.g. any cap imports
+   MDLog *log = mds->mdlog;
+   log->wait_for_safe(new C_M_ExportGo(this, dir, stat.tid));
+   log->flush();
+ }
+
+ /**
+  * Continuation of export_go(), invoked through C_M_ExportGo.
+  *
+  * Does nothing if the export identified by (dir, tid) no longer exists or
+  * is being cancelled.  Otherwise moves the export to EXPORT_EXPORTING,
+  * marks the subtree auth as (us, dest), encodes the subtree plus the client
+  * maps into an MExportDir, appends the subtree bounds and sends the message
+  * to the importer.
+  */
+ void Migrator::export_go_synced(CDir *dir, uint64_t tid)
+ {
+   auto it = export_state.find(dir);
+   const bool aborted = (it == export_state.end() ||
+                         it->second.state == EXPORT_CANCELLING ||
+                         it->second.tid != tid);
+   if (aborted) {
+     // export must have aborted.
+     dout(7) << "export must have aborted on " << dir << dendl;
+     return;
+   }
+
+   export_state_t& stat = it->second;
+   ceph_assert(stat.state == EXPORT_WARNING);
+   mds_rank_t dest = stat.peer;
+
+   dout(7) << "export_go_synced " << *dir << " to " << dest << dendl;
+
+   cache->show_subtrees();
+
+   stat.state = EXPORT_EXPORTING;
+   ceph_assert(g_conf()->mds_kill_export_at != 7);
+
+   ceph_assert(dir->is_frozen_tree_root());
+
+   // auth becomes ambiguous: (us, dest)
+   cache->adjust_subtree_auth(dir, mds->get_nodeid(), dest);
+
+   // the balancer no longer counts the popularity we are sending away
+   mds->balancer->subtract_export(dir);
+
+   // build the export message from the cache contents
+   auto req = MExportDir::create(dir->dirfrag(), stat.tid);
+   map<client_t,entity_inst_t> client_insts;
+   map<client_t,client_metadata_t> client_metadata;
+   uint64_t inode_count = encode_export_dir(req->export_data,
+                                            dir,  // recursion starts here
+                                            client_insts,
+                                            client_metadata);
+   encode(client_insts, req->client_map, mds->mdsmap->get_up_features());
+   encode(client_metadata, req->client_map);
+
+   // list the subtree bounds in the message
+   set<CDir*> bounds;
+   cache->get_subtree_bounds(dir, bounds);
+   for (CDir *bound : bounds)
+     req->add_export(bound->dirfrag());
+
+   // send
+   mds->send_message_mds(req, dest);
+   ceph_assert(g_conf()->mds_kill_export_at != 8);
+
+   mds->hit_export_target(dest, inode_count+1);
+
+   // stats
+   if (mds->logger) {
+     mds->logger->inc(l_mds_exported);
+     mds->logger->inc(l_mds_exported_inodes, inode_count);
+   }
+
+   cache->show_subtrees();
+ }
+
+
+ /** encode_export_inode
+  * Encode one inode for export into enc_state: its ino, its `last`, the
+  * inode's own export state, then its client caps (as auth caps).
+  * This inode must not be a replica of this MDS.
+  * used by: encode_export_dir, file_rename (if foreign)
+  *
+  * FIXME: the separation between CInode.encode_export and these methods
+  * is pretty arbitrary and dumb.
+  */
+ void Migrator::encode_export_inode(CInode *in, bufferlist& enc_state,
+                                    map<client_t,entity_inst_t>& exported_client_map,
+                                    map<client_t,client_metadata_t>& exported_client_metadata_map)
+ {
+   dout(7) << "encode_export_inode " << *in << dendl;
+   ceph_assert(!in->is_replica(mds->get_nodeid()));
+
+   // identity first, then the inode's exported state
+   encode(in->inode.ino, enc_state);
+   encode(in->last, enc_state);
+   in->encode_export(enc_state);
+
+   // client capabilities, exported as auth caps
+   const bool auth_cap = true;
+   encode_export_inode_caps(in, auth_cap, enc_state,
+                            exported_client_map, exported_client_metadata_map);
+ }
+
+ /**
+  * Encode the client capabilities of an inode into bl.
+  *
+  * The cap map produced by CInode::export_client_caps() is always encoded.
+  * When auth_cap is set, the inode's mds_caps_wanted is encoded as well and
+  * the inode is flagged STATE_EXPORTINGCAPS and pinned with
+  * PIN_EXPORTINGCAPS.
+  *
+  * For every client holding a cap that is not yet in exported_client_map,
+  * the client's session inst and metadata are recorded in the two output
+  * maps.  A missing session is treated as a fatal invariant violation
+  * rather than dereferencing a null pointer.
+  */
+ void Migrator::encode_export_inode_caps(CInode *in, bool auth_cap, bufferlist& bl,
+                                         map<client_t,entity_inst_t>& exported_client_map,
+                                         map<client_t,client_metadata_t>& exported_client_metadata_map)
+ {
+   dout(20) << "encode_export_inode_caps " << *in << dendl;
+
+   // encode caps
+   map<client_t,Capability::Export> cap_map;
+   in->export_client_caps(cap_map);
+   encode(cap_map, bl);
+   if (auth_cap) {
+     encode(in->get_mds_caps_wanted(), bl);
+
+     in->state_set(CInode::STATE_EXPORTINGCAPS);
+     in->get(CInode::PIN_EXPORTINGCAPS);
+   }
+
+   // make note of clients named by exported capabilities
+   for (const auto &p : in->get_client_caps()) {
+     if (exported_client_map.count(p.first))
+       continue;
+     Session *session = mds->sessionmap.get_session(entity_name_t::CLIENT(p.first.v));
+     ceph_assert(session);  // fail loudly instead of dereferencing a null session
+     exported_client_map[p.first] = session->info.inst;
+     exported_client_metadata_map[p.first] = session->info.client_metadata;
+   }
+ }
+
+ /**
+  * Finish exporting the client caps of an inode.
+  *
+  * Clears STATE_EXPORTINGCAPS and drops the matching pin, sends each client
+  * holding a cap a CEPH_CAP_OP_EXPORT message that names the cap imported by
+  * the peer (peer_imported must have an entry for every such client), then
+  * clears the local caps and re-evaluates the cap locks.
+  */
+ void Migrator::finish_export_inode_caps(CInode *in, mds_rank_t peer,
+                                         map<client_t,Capability::Import>& peer_imported)
+ {
+   dout(20) << "finish_export_inode_caps " << *in << dendl;
+
+   in->state_clear(CInode::STATE_EXPORTINGCAPS);
+   in->put(CInode::PIN_EXPORTINGCAPS);
+
+   // tell (all) clients about migrating caps..
+   for (const auto &entry : in->get_client_caps()) {
+     const client_t& client = entry.first;
+     const Capability& cap = entry.second;
+     dout(7) << "finish_export_inode_caps telling client." << client
+             << " exported caps on " << *in << dendl;
+     auto msg = MClientCaps::create(CEPH_CAP_OP_EXPORT, in->ino(), 0,
+                                    cap.get_cap_id(), cap.get_mseq(),
+                                    mds->get_osd_epoch_barrier());
+
+     auto imported = peer_imported.find(client);
+     ceph_assert(imported != peer_imported.end());
+     const Capability::Import& im = imported->second;
+     // a non-positive cap_id is reported with peer -1
+     msg->set_cap_peer(im.cap_id, im.issue_seq, im.mseq,
+                       (im.cap_id > 0 ? peer : -1), 0);
+     mds->send_message_client_counted(msg, client);
+   }
+
+   in->clear_client_caps_after_export();
+   mds->locker->eval(in, CEPH_CAP_LOCKS);
+ }
+
+ /**
+  * Turn an exported inode from auth into a replica in the local cache.
+  *
+  * Marks it clean, drops its replica map, twiddles every lock for export,
+  * clears STATE_AUTH and the dirty rstat / dirty parent / file lock state,
+  * moves all of its waiters into `finished`, and finally hands the client
+  * caps over via finish_export_inode_caps().
+  */
+ void Migrator::finish_export_inode(CInode *in, mds_rank_t peer,
+                                    map<client_t,Capability::Import>& peer_imported,
+                                    MDSContext::vec& finished)
+ {
+   dout(12) << "finish_export_inode " << *in << dendl;
+
+   // nothing left for us to write back
+   if (in->is_dirty()) {
+     in->mark_clean();
+   }
+
+   // we're no longer the authority: forget who replicates it
+   in->clear_replica_map();
+
+   // auth -> replica transition for every lock
+   in->authlock.export_twiddle();
+   in->linklock.export_twiddle();
+   in->dirfragtreelock.export_twiddle();
+   in->filelock.export_twiddle();
+   in->nestlock.export_twiddle();
+   in->xattrlock.export_twiddle();
+   in->snaplock.export_twiddle();
+   in->flocklock.export_twiddle();
+   in->policylock.export_twiddle();
+
+   // drop the auth flag
+   ceph_assert(in->is_auth());
+   in->state_clear(CInode::STATE_AUTH);
+   in->replica_nonce = CInode::EXPORT_NONCE;
+
+   in->clear_dirty_rstat();
+
+   // no more auth subtree? clear scatter dirty
+   if (!in->has_subtree_root_dirfrag(mds->get_nodeid())) {
+     in->clear_scatter_dirty();
+   }
+
+   in->clear_dirty_parent();
+
+   in->clear_file_locks();
+
+   // hand every waiter back to the caller
+   in->take_waiting(CInode::WAIT_ANY_MASK, finished);
+
+   in->finish_export();
+
+   finish_export_inode_caps(in, peer, peer_imported);
+ }
+
+ /**
+  * Recursively encode a dirfrag and everything below it into exportbl.
+  *
+  * Layout per dirfrag: dirfrag_t, the dir's export state, the item count,
+  * then for every dentry its name, `last`, export state and a one-byte tag:
+  * "N" (null), "L" (remote link, followed by ino and d_type) or "I"
+  * (primary, followed by the encoded inode).  Nested dirfrags that are not
+  * export bounds are encoded afterwards.
+  *
+  * @return number of dentries encoded, including those of nested dirfrags
+  */
+ uint64_t Migrator::encode_export_dir(bufferlist& exportbl,
+                                      CDir *dir,
+                                      map<client_t,entity_inst_t>& exported_client_map,
+                                      map<client_t,client_metadata_t>& exported_client_metadata_map)
+ {
+   uint64_t count = 0;
+
+   dout(7) << "encode_export_dir " << *dir << " " << dir->get_num_head_items() << " head items" << dendl;
+
+   ceph_assert(dir->get_projected_version() == dir->get_version());
+
+ #ifdef MDS_VERIFY_FRAGSTAT
+   if (dir->is_complete())
+     dir->verify_fragstat();
+ #endif
+
+   // the dirfrag itself
+   dirfrag_t df = dir->dirfrag();
+   encode(df, exportbl);
+   dir->encode_export(exportbl);
+
+   __u32 nden = dir->items.size();
+   encode(nden, exportbl);
+
+   // its dentries
+   list<CDir*> nested;
+   for (auto &item : *dir) {
+     CDentry *dn = item.second;
+     CInode *in = dn->get_linkage()->get_inode();
+
+     ++count;
+
+     dout(7) << "encode_export_dir exporting " << *dn << dendl;
+
+     // name, snapid and state of the dentry
+     encode(dn->get_name(), exportbl);
+     encode(dn->last, exportbl);
+     dn->encode_export(exportbl);
+
+     // what the dentry points to
+     if (dn->get_linkage()->is_null()) {
+       exportbl.append("N", 1);  // null dentry
+       continue;
+     }
+
+     if (dn->get_linkage()->is_remote()) {
+       exportbl.append("L", 1);  // remote link
+
+       inodeno_t ino = dn->get_linkage()->get_remote_ino();
+       unsigned char d_type = dn->get_linkage()->get_remote_d_type();
+       encode(ino, exportbl);
+       encode(d_type, exportbl);
+       continue;
+     }
+
+     // primary link: the inode travels with the dentry
+     exportbl.append("I", 1);  // inode dentry
+
+     // encode, and (update state for) export
+     encode_export_inode(in, exportbl, exported_client_map, exported_client_metadata_map);
+
+     // remember nested dirfrags that are part of this export
+     list<CDir*> frags;
+     in->get_dirfrags(frags);
+     for (CDir *frag : frags) {
+       if (frag->state_test(CDir::STATE_EXPORTBOUND))
+         continue;  // export bound, not ours to send
+       ceph_assert(frag->get_dir_auth().first == CDIR_AUTH_PARENT);
+       nested.push_front(frag);  // it's ours, recurse (later)
+     }
+   }
+
+   // now the nested dirfrags
+   for (CDir *sub : nested)
+     count += encode_export_dir(exportbl, sub, exported_client_map, exported_client_metadata_map);
+
+   return count;
+ }
+
+ /**
+  * Recursively finish the export of a dirfrag in the local cache.
+  *
+  * Drops the replica map, clears STATE_AUTH, removes the bloom filter, marks
+  * the dir clean, moves its waiters into `finished`, finishes every dentry
+  * (and primary inode), moves each dentry to the bottom of the LRU and
+  * counts it in *num_dentries, then recurses into the nested dirfrags.
+  */
+ void Migrator::finish_export_dir(CDir *dir, mds_rank_t peer,
+                                  map<inodeno_t,map<client_t,Capability::Import> >& peer_imported,
+                                  MDSContext::vec& finished, int *num_dentries)
+ {
+   dout(10) << "finish_export_dir " << *dir << dendl;
+
+   // nobody replicates it from us any more
+   dir->clear_replica_map();
+
+   // we are no longer auth
+   ceph_assert(dir->is_auth());
+   dir->state_clear(CDir::STATE_AUTH);
+   dir->remove_bloom();
+   dir->replica_nonce = CDir::EXPORT_NONCE;
+
+   if (dir->is_dirty()) {
+     dir->mark_clean();
+   }
+
+   // collect every waiter on the dir
+   dir->take_waiting(CDir::WAIT_ANY_MASK, finished);
+
+   // pop
+   dir->finish_export();
+
+   // walk the dentries
+   list<CDir*> nested;
+   for (auto &item : *dir) {
+     CDentry *dn = item.second;
+     CInode *in = dn->get_linkage()->get_inode();
+
+     dn->finish_export();
+
+     if (dn->get_linkage()->is_primary()) {
+       finish_export_inode(in, peer, peer_imported[in->ino()], finished);
+
+       // pick up nested dirfrags for the recursion below
+       in->get_nested_dirfrags(nested);
+     }
+
+     cache->touch_dentry_bottom(dn);  // move dentry to tail of LRU
+     ++(*num_dentries);
+   }
+
+   // recurse
+   for (CDir *sub : nested)
+     finish_export_dir(sub, peer, peer_imported, finished, num_dentries);
+ }
+
+ /**
+  * Log completion context that calls Migrator::export_logged_finish() for
+  * the given dirfrag.
+  */
+ class C_MDS_ExportFinishLogged : public MigratorLogContext {
+   CDir *dir;
+
+ public:
+   C_MDS_ExportFinishLogged(Migrator *migrator, CDir *export_dir)
+     : MigratorLogContext(migrator),
+       dir(export_dir)
+   {}
+
+   void finish(int r) override
+   {
+     mig->export_logged_finish(dir);
+   }
+ };
+
+
+/*
+ * i should get an export_ack from the export target.
+ */
+void Migrator::handle_export_ack(const MExportDirAck::const_ref &m)
+{
+ CDir *dir = cache->get_dirfrag(m->get_dirfrag());
+ mds_rank_t dest(m->get_source().num());
+ ceph_assert(dir);
+ ceph_assert(dir->is_frozen_tree_root()); // i'm exporting!
+
+ // yay!
+ dout(7) << "handle_export_ack " << *dir << dendl;
+
+ mds->hit_export_target(dest, -1);
+
+ map<CDir*,export_state_t>::iterator it = export_state.find(dir);
+ ceph_assert(it != export_state.end());
+ ceph_assert(it->second.state == EXPORT_EXPORTING);
+ ceph_assert(it->second.tid == m->get_tid());
+
+ auto bp = m->imported_caps.cbegin();
+ decode(it->second.peer_imported, bp);
+
+ it->second.state = EXPORT_LOGGINGFINISH;
+ assert (g_conf()->mds_kill_export_at != 9);
+ set<CDir*> bounds;
+ cache->get_subtree_bounds(dir, bounds);
+
+ // log completion.
+ // include export bounds, to ensure they're in the journal.
+ EExport *le = new EExport(mds->mdlog, dir, it->second.peer);;
+ mds->mdlog->start_entry(le);
+
+ le->metablob.add_dir_context(dir, EMetaBlob::TO_ROOT);
+ le->metablob.add_dir(dir, false);
+ for (set<CDir*>::iterator p = bounds.begin();
+ p != bounds.end();
+ ++p) {
+ CDir *bound = *p;
+ le->get_bounds().insert(bound->dirfrag());
+ le->metablob.add_dir_context(bound);
+ le->metablob.add_dir(bound, false);
+ }
+
+ // list us second, them first.
+ // this keeps authority().first in sync with subtree auth state in the journal.
+ cache->adjust_subtree_auth(dir, it->second.peer, mds->get_nodeid());
+
+ // log export completion, then finish (unfreeze, trigger finish context, etc.)
+ mds->mdlog->submit_entry(le, new C_MDS_ExportFinishLogged(this, dir));
+ mds->mdlog->flush();
+ assert (g_conf()->mds_kill_export_at != 10);
+}
+
+ /**
+  * Tell the bystanders that an export in EXPORT_CANCELLING was aborted.
+  *
+  * With no bystander left in notify_ack_waiting the state goes straight to
+  * EXPORT_CANCELLED.  Otherwise the dir is auth-pinned and each waiting
+  * bystander gets an MExportDirNotify carrying the bounds and moving auth
+  * from (us, peer) back to (us, CDIR_AUTH_UNKNOWN).
+  */
+ void Migrator::export_notify_abort(CDir *dir, export_state_t& stat, set<CDir*>& bounds)
+ {
+   dout(7) << "export_notify_abort " << *dir << dendl;
+
+   ceph_assert(stat.state == EXPORT_CANCELLING);
+
+   if (stat.notify_ack_waiting.empty()) {
+     stat.state = EXPORT_CANCELLED;
+     return;
+   }
+
+   dir->auth_pin(this);
+
+   for (mds_rank_t bystander : stat.notify_ack_waiting) {
+     auto notify = MExportDirNotify::create(dir->dirfrag(), stat.tid, true,
+                                            pair<int,int>(mds->get_nodeid(), stat.peer),
+                                            pair<int,int>(mds->get_nodeid(), CDIR_AUTH_UNKNOWN));
+     for (CDir *bound : bounds)
+       notify->get_bounds().push_back(bound->dirfrag());
+     mds->send_message_mds(notify, bystander);
+   }
+ }
+
+/*
+ * this happens if hte dest failes after i send teh export data but before it is acked
+ * that is, we don't know they safely received and logged it, so we reverse our changes
+ * and go on.
+ */
+void Migrator::export_reverse(CDir *dir, export_state_t& stat)
+{
+ dout(7) << "export_reverse " << *dir << dendl;
+
+ set<CInode*> to_eval;
+
+ set<CDir*> bounds;
+ cache->get_subtree_bounds(dir, bounds);
+
+ // remove exporting pins
+ list<CDir*> rq;
+ rq.push_back(dir);
+ while (!rq.empty()) {
+ CDir *t = rq.front();
+ rq.pop_front();
+ t->abort_export();
+ for (auto &p : *t) {
+ CDentry *dn = p.second;
+ dn->abort_export();
+ if (!dn->get_linkage()->is_primary())
+ continue;
+ CInode *in = dn->get_linkage()->get_inode();
+ in->abort_export();
+ if (in->state_test(CInode::STATE_EVALSTALECAPS)) {
+ in->state_clear(CInode::STATE_EVALSTALECAPS);
+ to_eval.insert(in);
+ }
+ if (in->is_dir())
+ in->get_nested_dirfrags(rq);
+ }
+ }
+
+ // unpin bounds
+ for (auto bd : bounds) {
+ bd->put(CDir::PIN_EXPORTBOUND);
+ bd->state_clear(CDir::STATE_EXPORTBOUND);
+ }
+
+ // notify bystanders
+ export_notify_abort(dir, stat, bounds);
+
+ // unfreeze tree, with possible subtree merge.
+ cache->adjust_subtree_auth(dir, mds->get_nodeid(), mds->get_nodeid());
+
+ // process delayed expires
+ cache->process_delayed_expire(dir);
+
+ dir->unfreeze_tree();
+ cache->try_subtree_merge(dir);
+
+ // revoke/resume stale caps
+ for (auto in : to_eval) {
+ bool need_issue = false;
+ for (auto &p : in->client_caps) {
+ Capability *cap = &p.second;
+ if (!cap->is_stale()) {
+ need_issue = true;
+ break;
+ }
+ }
+ if (need_issue &&
+ (!in->is_auth() || !mds->locker->eval(in, CEPH_CAP_LOCKS)))
+ mds->locker->issue_caps(in);
+ }
+
+ cache->show_cache();
+}
+
+
+/*
+ * once i get the ack, and logged the EExportFinish(true),
+ * send notifies (if any), otherwise go straight to finish.
+ *
+ */
+void Migrator::export_logged_finish(CDir *dir)
+{
+ dout(7) << "export_logged_finish " << *dir << dendl;
+
+ export_state_t& stat = export_state[dir];
+
+ // send notifies
+ set<CDir*> bounds;
+ cache->get_subtree_bounds(dir, bounds);
+
+ for (set<mds_rank_t>::iterator p = stat.notify_ack_waiting.begin();
+ p != stat.notify_ack_waiting.end();
+ ++p) {
+ auto notify = MExportDirNotify::create(dir->dirfrag(), stat.tid, true,
+ pair<int,int>(mds->get_nodeid(), stat.peer),
+ pair<int,int>(stat.peer, CDIR_AUTH_UNKNOWN));
+
+ for (set<CDir*>::iterator i = bounds.begin(); i != bounds.end(); ++i)
+ notify->get_bounds().push_back((*i)->dirfrag());
+
+ mds->send_message_mds(notify, *p);
+ }
+
+ // wait for notifyacks
+ stat.state = EXPORT_NOTIFYING;
+ assert (g_conf()->mds_kill_export_at != 11);
+
+ // no notifies to wait for?
+ if (stat.notify_ack_waiting.empty()) {
+ export_finish(dir); // skip notify/notify_ack stage.
+ } else {
+ // notify peer to send cap import messages to clients
+ if (!mds->is_cluster_degraded() ||
+ mds->mdsmap->is_clientreplay_or_active_or_stopping(stat.peer)) {
+ mds->send_message_mds(MExportDirFinish::create(dir->dirfrag(), false, stat.tid), stat.peer);
+ } else {
+ dout(7) << "not sending MExportDirFinish, dest has failed" << dendl;
+ }
+ }
+}
+
+/*
+ * warning:
+ * i'll get an ack from each bystander.
+ * when i get them all, do the export.
+ * notify:
+ * i'll get an ack from each bystander.
+ * when i get them all, unfreeze and send the finish.
+ */
+void Migrator::handle_export_notify_ack(const MExportDirNotifyAck::const_ref &m)
+{
+ CDir *dir = cache->get_dirfrag(m->get_dirfrag());
+ mds_rank_t dest(m->get_source().num());
+ ceph_assert(dir);
+ mds_rank_t from = mds_rank_t(m->get_source().num());
+
+ mds->hit_export_target(dest, -1);
+
+ auto export_state_entry = export_state.find(dir);
+ if (export_state_entry != export_state.end()) {
+ export_state_t& stat = export_state_entry->second;
+ if (stat.state == EXPORT_WARNING &&
+ stat.warning_ack_waiting.erase(from)) {
+ // exporting. process warning.
+ dout(7) << "handle_export_notify_ack from " << m->get_source()
+ << ": exporting, processing warning on " << *dir << dendl;
+ if (stat.warning_ack_waiting.empty())
+ export_go(dir); // start export.
+ } else if (stat.state == EXPORT_NOTIFYING &&
+ stat.notify_ack_waiting.erase(from)) {
+ // exporting. process notify.
+ dout(7) << "handle_export_notify_ack from " << m->get_source()
+ << ": exporting, processing notify on " << *dir << dendl;
+ if (stat.notify_ack_waiting.empty())
+ export_finish(dir);
+ } else if (stat.state == EXPORT_CANCELLING &&
+ m->get_new_auth().second == CDIR_AUTH_UNKNOWN && // not warning ack
+ stat.notify_ack_waiting.erase(from)) {
+ dout(7) << "handle_export_notify_ack from " << m->get_source()
+ << ": cancelling export, processing notify on " << *dir << dendl;
+ if (stat.notify_ack_waiting.empty()) {
+ export_cancel_finish(export_state_entry);
+ }
+ }
+ }
+ else {
+ auto import_state_entry = import_state.find(dir->dirfrag());
+ if (import_state_entry != import_state.end()) {
+ import_state_t& stat = import_state_entry->second;
+ if (stat.state == IMPORT_ABORTING) {
+ // reversing import
+ dout(7) << "handle_export_notify_ack from " << m->get_source()
+ << ": aborting import on " << *dir << dendl;
+ ceph_assert(stat.bystanders.count(from));
+ stat.bystanders.erase(from);
+ if (stat.bystanders.empty())
+ import_reverse_unfreeze(dir);
+ }
+ }
+ }
+}
+
+ /**
+  * Final step of an export.
+  *
+  * Returns early if the export state for dir is already gone.  Otherwise
+  * sends the final MExportDirFinish to the peer (unless the cluster is
+  * degraded and the peer is not clientreplay/active/stopping), converts the
+  * exported subtree to non-auth in the local cache, hands subtree auth to
+  * the peer, unpins the export bounds, unfreezes the tree, queues the
+  * collected waiters, erases the export state, trims the cache and starts
+  * any queued export.
+  *
+  * Kill points use ceph_assert like the rest of this file; plain assert()
+  * may be compiled out when NDEBUG is defined.
+  */
+ void Migrator::export_finish(CDir *dir)
+ {
+   dout(5) << "export_finish " << *dir << dendl;
+
+   ceph_assert(g_conf()->mds_kill_export_at != 12);
+   map<CDir*,export_state_t>::iterator it = export_state.find(dir);
+   if (it == export_state.end()) {
+     dout(7) << "target must have failed, not sending final commit message. export succeeded anyway." << dendl;
+     return;
+   }
+
+   // send finish/commit to new auth
+   if (!mds->is_cluster_degraded() ||
+       mds->mdsmap->is_clientreplay_or_active_or_stopping(it->second.peer)) {
+     mds->send_message_mds(MExportDirFinish::create(dir->dirfrag(), true, it->second.tid), it->second.peer);
+   } else {
+     dout(7) << "not sending MExportDirFinish last, dest has failed" << dendl;
+   }
+   ceph_assert(g_conf()->mds_kill_export_at != 13);
+
+   // finish export (adjust local cache state)
+   int num_dentries = 0;
+   MDSContext::vec finished;
+   finish_export_dir(dir, it->second.peer,
+                     it->second.peer_imported, finished, &num_dentries);
+
+   ceph_assert(!dir->is_auth());
+   cache->adjust_subtree_auth(dir, it->second.peer);
+
+   // unpin bounds
+   set<CDir*> bounds;
+   cache->get_subtree_bounds(dir, bounds);
+   for (CDir *bd : bounds) {
+     bd->put(CDir::PIN_EXPORTBOUND);
+     bd->state_clear(CDir::STATE_EXPORTBOUND);
+   }
+
+   if (dir->state_test(CDir::STATE_AUXSUBTREE))
+     dir->state_clear(CDir::STATE_AUXSUBTREE);
+
+   // discard delayed expires
+   cache->discard_delayed_expire(dir);
+
+   dout(7) << "export_finish unfreezing" << dendl;
+
+   // unfreeze tree, with possible subtree merge.
+   //  (we do this _after_ removing EXPORTBOUND pins, to allow merges)
+   dir->unfreeze_tree();
+   cache->try_subtree_merge(dir);
+
+   // no more auth subtree? clear scatter dirty
+   if (!dir->get_inode()->is_auth() &&
+       !dir->get_inode()->has_subtree_root_dirfrag(mds->get_nodeid())) {
+     dir->get_inode()->clear_scatter_dirty();
+     // wake up scatter_nudge waiters
+     dir->get_inode()->take_waiting(CInode::WAIT_ANY_MASK, finished);
+   }
+
+   if (!finished.empty())
+     mds->queue_waiters(finished);
+
+   // take what is still needed out of the state before erasing it
+   MutationRef mut = std::move(it->second.mut);
+   auto parent = std::move(it->second.parent);
+   // remove from exporting list, clean up state
+   total_exporting_size -= it->second.approx_size;
+   export_state.erase(it);
+
+   ceph_assert(dir->state_test(CDir::STATE_EXPORTING));
+   dir->clear_exporting();
+
+   cache->show_subtrees();
+   audit();
+
+   cache->trim(num_dentries);  // try trimming exported dentries
+
+   // send pending import_maps?
+   mds->mdcache->maybe_send_pending_resolves();
+
+   // drop locks, unpin path
+   if (mut) {
+     mds->locker->drop_locks(mut.get());
+     mut->cleanup();
+   }
+
+   if (parent)
+     child_export_finish(parent, true);
+
+   maybe_do_queued_export();
+ }
+
+
+
+ /**
+  * Context that re-enters Migrator::handle_export_discover() for the stored
+  * message with started == true.
+  */
+ class C_MDS_ExportDiscover : public MigratorContext {
+ public:
+   C_MDS_ExportDiscover(Migrator *migrator, const MExportDirDiscover::const_ref& msg)
+     : MigratorContext(migrator),
+       m(msg)
+   {}
+
+   void finish(int r) override
+   {
+     mig->handle_export_discover(m, true);
+   }
+
+ private:
+   MExportDirDiscover::const_ref m;  // message to replay
+ };
+
+ /**
+  * Factory producing a new C_MDS_ExportDiscover for the stored migrator and
+  * message on each build() call.
+  */
+ class C_MDS_ExportDiscoverFactory : public MDSContextFactory {
+ public:
+   // the message reference is taken by const reference to avoid a needless copy
+   C_MDS_ExportDiscoverFactory(Migrator *mig, const MExportDirDiscover::const_ref& m) : mig(mig), m(m) {}
+   // marked override so a signature drift in the base class is a compile error
+   MDSContext *build() override {
+     return new C_MDS_ExportDiscover(mig, m);
+   }
+ private:
+   Migrator *mig;
+   MExportDirDiscover::const_ref m;
+ };
+
+// ==========================================================
+// IMPORT
+
+ /**
+  * Importer side: handle an MExportDirDiscover from the exporting MDS.
+  *
+  * @param m        the discover message
+  * @param started  false on first delivery; true when re-entered through
+  *                 C_MDS_ExportDiscover after waiting
+  *
+  * Replies with a negative MExportDirDiscoverAck when this MDS is not
+  * active.  On first delivery an import_state entry is created in
+  * IMPORT_DISCOVERING; on re-entry the message is dropped if that entry is
+  * gone or belongs to another peer/tid.  Once the base inode is in the
+  * cache it is pinned with PIN_IMPORTING, the state becomes
+  * IMPORT_DISCOVERED and a positive ack is sent.
+  *
+  * Kill points use ceph_assert like the rest of this file; plain assert()
+  * may be compiled out when NDEBUG is defined.
+  */
+ void Migrator::handle_export_discover(const MExportDirDiscover::const_ref &m, bool started)
+ {
+   mds_rank_t from = m->get_source_mds();
+   ceph_assert(from != mds->get_nodeid());
+
+   dout(7) << "handle_export_discover on " << m->get_path() << dendl;
+
+   // note import state
+   dirfrag_t df = m->get_dirfrag();
+
+   if (!mds->is_active()) {
+     dout(7) << " not active, send NACK " << dendl;
+     mds->send_message_mds(MExportDirDiscoverAck::create(df, m->get_tid(), false), from);
+     return;
+   }
+
+   // only start discovering on this message once.
+   import_state_t *p_state;
+   map<dirfrag_t,import_state_t>::iterator it = import_state.find(df);
+   if (!started) {
+     ceph_assert(it == import_state.end());
+     p_state = &import_state[df];
+     p_state->state = IMPORT_DISCOVERING;
+     p_state->peer = from;
+     p_state->tid = m->get_tid();
+   } else {
+     // am i retrying after ancient path_traverse results?
+     if (it == import_state.end() ||
+         it->second.peer != from ||
+         it->second.tid != m->get_tid()) {
+       dout(7) << " dropping obsolete message" << dendl;
+       return;
+     }
+     ceph_assert(it->second.state == IMPORT_DISCOVERING);
+     p_state = &it->second;
+   }
+
+   C_MDS_ExportDiscoverFactory cf(this, m);
+   if (!mds->mdcache->is_open()) {
+     dout(5) << " waiting for root" << dendl;
+     mds->mdcache->wait_for_open(cf.build());
+     return;
+   }
+
+   ceph_assert(g_conf()->mds_kill_import_at != 1);
+
+   // do we have it?
+   CInode *in = cache->get_inode(m->get_dirfrag().ino);
+   if (!in) {
+     // must discover it!
+     filepath fpath(m->get_path());
+     vector<CDentry*> trace;
+     MDRequestRef null_ref;
+     int r = cache->path_traverse(null_ref, cf, fpath, &trace, NULL, MDS_TRAVERSE_DISCOVER);
+     if (r > 0) return;  // a positive result returns without replying
+     if (r < 0) {
+       dout(7) << "handle_export_discover failed to discover or not dir " << m->get_path() << ", NAK" << dendl;
+       ceph_abort();    // this shouldn't happen if the auth pins its path properly!!!!
+     }
+
+     ceph_abort(); // this shouldn't happen; the get_inode above would have succeeded.
+   }
+
+   // yay
+   dout(7) << "handle_export_discover have " << df << " inode " << *in << dendl;
+
+   p_state->state = IMPORT_DISCOVERED;
+
+   // pin inode in the cache (for now)
+   ceph_assert(in->is_dir());
+   in->get(CInode::PIN_IMPORTING);
+
+   // reply
+   dout(7) << " sending export_discover_ack on " << *in << dendl;
+   mds->send_message_mds(MExportDirDiscoverAck::create(df, m->get_tid()), p_state->peer);
+   ceph_assert(g_conf()->mds_kill_import_at != 2);
+ }
+
+ /**
+  * Undo an import that only reached IMPORT_DISCOVERING: forget its state.
+  */
+ void Migrator::import_reverse_discovering(dirfrag_t df)
+ {
+   auto entry = import_state.find(df);
+   if (entry != import_state.end())
+     import_state.erase(entry);
+ }
+
+ /**
+  * Undo an import that reached IMPORT_DISCOVERED: drop the PIN_IMPORTING pin
+  * on the base inode and forget the import state.
+  */
+ void Migrator::import_reverse_discovered(dirfrag_t df, CInode *diri)
+ {
+   // release the pin on the base inode
+   diri->put(CInode::PIN_IMPORTING);
+
+   auto entry = import_state.find(df);
+   if (entry != import_state.end())
+     import_state.erase(entry);
+ }
+
+ /**
+  * Undo an import in IMPORT_PREPPING: map the recorded bound list to cached
+  * dirfrags, remove the import pins, then run the final reverse step.
+  */
+ void Migrator::import_reverse_prepping(CDir *dir, import_state_t& stat)
+ {
+   set<CDir*> import_bounds;
+   cache->map_dirfrag_set(stat.bound_ls, import_bounds);
+
+   import_remove_pins(dir, import_bounds);
+   import_reverse_final(dir);
+ }
+
+ /**
+  * Importer side: the exporter cancelled the export.  Reverse the import
+  * according to the state it reached; aborts when no import state exists or
+  * the state is not one that can be cancelled here.
+  */
+ void Migrator::handle_export_cancel(const MExportDirCancel::const_ref &m)
+ {
+   dout(7) << "handle_export_cancel on " << m->get_dirfrag() << dendl;
+   dirfrag_t df = m->get_dirfrag();
+
+   auto it = import_state.find(df);
+   if (it == import_state.end()) {
+     ceph_abort_msg("got export_cancel in weird state");
+   }
+
+   switch (it->second.state) {
+   case IMPORT_DISCOVERING:
+     import_reverse_discovering(df);
+     break;
+
+   case IMPORT_DISCOVERED:
+     {
+       CInode *in = cache->get_inode(df.ino);
+       ceph_assert(in);
+       import_reverse_discovered(df, in);
+     }
+     break;
+
+   case IMPORT_PREPPING:
+     {
+       CDir *dir = mds->mdcache->get_dirfrag(df);
+       ceph_assert(dir);
+       import_reverse_prepping(dir, it->second);
+     }
+     break;
+
+   case IMPORT_PREPPED:
+     {
+       CDir *dir = mds->mdcache->get_dirfrag(df);
+       ceph_assert(dir);
+       set<CDir*> bounds;
+       cache->get_subtree_bounds(dir, bounds);
+       import_remove_pins(dir, bounds);
+       // adjust auth back to the exportor
+       cache->adjust_subtree_auth(dir, it->second.peer);
+       import_reverse_unfreeze(dir);
+     }
+     break;
+
+   default:
+     ceph_abort_msg("got export_cancel in weird state");
+   }
+ }
+
+ /**
+  * Context that re-enters Migrator::handle_export_prep() for the stored
+  * message with did_assim == true.
+  */
+ class C_MDS_ExportPrep : public MigratorContext {
+ public:
+   C_MDS_ExportPrep(Migrator *migrator, const MExportDirPrep::const_ref& msg)
+     : MigratorContext(migrator),
+       m(msg)
+   {}
+
+   void finish(int r) override
+   {
+     mig->handle_export_prep(m, true);
+   }
+
+ private:
+   MExportDirPrep::const_ref m;  // message to replay
+ };
+
+ /**
+  * Factory producing a new C_MDS_ExportPrep for the stored migrator and
+  * message on each build() call.
+  */
+ class C_MDS_ExportPrepFactory : public MDSContextFactory {
+ public:
+   // the message reference is taken by const reference to avoid a needless copy
+   C_MDS_ExportPrepFactory(Migrator *mig, const MExportDirPrep::const_ref& m) : mig(mig), m(m) {}
+   // marked override so a signature drift in the base class is a compile error
+   MDSContext *build() override {
+     return new C_MDS_ExportPrep(mig, m);
+   }
+ private:
+   Migrator *mig;
+   MExportDirPrep::const_ref m;
+ };
+
+void Migrator::handle_export_prep(const MExportDirPrep::const_ref &m, bool did_assim)
+{
+ mds_rank_t oldauth = mds_rank_t(m->get_source().num());
+ ceph_assert(oldauth != mds->get_nodeid());
+
+ CDir *dir;
+ CInode *diri;
+ MDSContext::vec finished;
+
+ // assimilate root dir.
+ map<dirfrag_t,import_state_t>::iterator it = import_state.find(m->get_dirfrag());
+ if (!did_assim) {
+ ceph_assert(it != import_state.end());
+ ceph_assert(it->second.state == IMPORT_DISCOVERED);
+ ceph_assert(it->second.peer == oldauth);
+ diri = cache->get_inode(m->get_dirfrag().ino);
+ ceph_assert(diri);
+ auto p = m->basedir.cbegin();
+ dir = cache->add_replica_dir(p, diri, oldauth, finished);
+ dout(7) << "handle_export_prep on " << *dir << " (first pass)" << dendl;
+ } else {
+ if (it == import_state.end() ||
+ it->second.peer != oldauth ||
+ it->second.tid != m->get_tid()) {
+ dout(7) << "handle_export_prep obsolete message, dropping" << dendl;
+ return;
+ }
+ ceph_assert(it->second.state == IMPORT_PREPPING);
+ ceph_assert(it->second.peer == oldauth);
+
+ dir = cache->get_dirfrag(m->get_dirfrag());
+ ceph_assert(dir);
+ dout(7) << "handle_export_prep on " << *dir << " (subsequent pass)" << dendl;
+ diri = dir->get_inode();
+ }
+ ceph_assert(dir->is_auth() == false);
+
+ cache->show_subtrees();
+
+ // build import bound map
+ map<inodeno_t, fragset_t> import_bound_fragset;
+ for (const auto &bound : m->get_bounds()) {
+ dout(10) << " bound " << bound << dendl;
+ import_bound_fragset[bound.ino].insert(bound.frag);
+ }
+
+ // assimilate contents?
+ if (!did_assim) {
+ dout(7) << "doing assim on " << *dir << dendl;
+
+ // change import state
+ it->second.state = IMPORT_PREPPING;
+ it->second.bound_ls = m->get_bounds();
+ it->second.bystanders = m->get_bystanders();
+ ceph_assert(g_conf()->mds_kill_import_at != 3);
+
+ // bystander list
+ dout(7) << "bystanders are " << it->second.bystanders << dendl;
+
+ // move pin to dir
+ diri->put(CInode::PIN_IMPORTING);
+ dir->get(CDir::PIN_IMPORTING);
+ dir->state_set(CDir::STATE_IMPORTING);
+
+ // assimilate traces to exports
+ // each trace is: df ('-' | ('f' dir | 'd') dentry inode (dir dentry inode)*)
+ for (const auto &bl : m->traces) {
+ auto q = bl.cbegin();
+ dirfrag_t df;
+ decode(df, q);
+ char start;
+ decode(start, q);
+ dout(10) << " trace from " << df << " start " << start << " len " << bl.length() << dendl;
+
+ CDir *cur = 0;
+ if (start == 'd') {
+ cur = cache->get_dirfrag(df);
+ ceph_assert(cur);
+ dout(10) << " had " << *cur << dendl;
+ } else if (start == 'f') {
+ CInode *in = cache->get_inode(df.ino);
+ ceph_assert(in);
+ dout(10) << " had " << *in << dendl;
+ cur = cache->add_replica_dir(q, in, oldauth, finished);
+ dout(10) << " added " << *cur << dendl;
+ } else if (start == '-') {
+ // nothing
+ } else
+ ceph_abort_msg("unrecognized start char");
+
+ while (!q.end()) {
+ CDentry *dn = cache->add_replica_dentry(q, cur, finished);
+ dout(10) << " added " << *dn << dendl;
+ CInode *in = cache->add_replica_inode(q, dn, finished);
+ dout(10) << " added " << *in << dendl;
+ if (q.end())
+ break;
+ cur = cache->add_replica_dir(q, in, oldauth, finished);
+ dout(10) << " added " << *cur << dendl;
+ }
+ }
+
+ // make bound sticky
+ for (map<inodeno_t,fragset_t>::iterator p = import_bound_fragset.begin();
+ p != import_bound_fragset.end();
+ ++p) {
+ CInode *in = cache->get_inode(p->first);
+ ceph_assert(in);
+ in->get_stickydirs();
+ dout(7) << " set stickydirs on bound inode " << *in << dendl;
+ }
+
+ } else {
+ dout(7) << " not doing assim on " << *dir << dendl;
+ }
+
+ MDSGatherBuilder gather(g_ceph_context);
+
+ if (!finished.empty())
+ mds->queue_waiters(finished);
+
+
+ bool success = true;
+ if (mds->is_active()) {
+ // open all bounds
+ set<CDir*> import_bounds;
+ for (map<inodeno_t,fragset_t>::iterator p = import_bound_fragset.begin();
+ p != import_bound_fragset.end();
+ ++p) {
+ CInode *in = cache->get_inode(p->first);
+ ceph_assert(in);
+
+ // map fragset into a frag_t list, based on the inode fragtree
+ frag_vec_t leaves;
+ for (const auto& frag : p->second) {
+ in->dirfragtree.get_leaves_under(frag, leaves);
+ }
+ dout(10) << " bound inode " << p->first << " fragset " << p->second << " maps to " << leaves << dendl;
+
+ for (const auto& leaf : leaves) {
+ CDir *bound = cache->get_dirfrag(dirfrag_t(p->first, leaf));
+ if (!bound) {
+ dout(7) << " opening bounding dirfrag " << leaf << " on " << *in << dendl;
+ cache->open_remote_dirfrag(in, leaf, gather.new_sub());
+ continue;
+ }
+
+ if (!bound->state_test(CDir::STATE_IMPORTBOUND)) {
+ dout(7) << " pinning import bound " << *bound << dendl;
+ bound->get(CDir::PIN_IMPORTBOUND);
+ bound->state_set(CDir::STATE_IMPORTBOUND);
+ } else {
+ dout(7) << " already pinned import bound " << *bound << dendl;
+ }
+ import_bounds.insert(bound);
+ }
+ }
+
+ if (gather.has_subs()) {
+ C_MDS_ExportPrepFactory cf(this, m);
+ gather.set_finisher(cf.build());
+ gather.activate();
+ return;
+ }
+
+ dout(7) << " all ready, noting auth and freezing import region" << dendl;
+
+ if (!mds->mdcache->is_readonly() &&
+ diri->filelock.can_wrlock(-1) &&
+ diri->nestlock.can_wrlock(-1)) {
+ it->second.mut = new MutationImpl();
+ // force some locks. hacky.
+ mds->locker->wrlock_force(&dir->inode->filelock, it->second.mut);
+ mds->locker->wrlock_force(&dir->inode->nestlock, it->second.mut);
+
+ // note that i am an ambiguous auth for this subtree.
+ // specify bounds, since the exporter explicitly defines the region.
+ cache->adjust_bounded_subtree_auth(dir, import_bounds,
+ pair<int,int>(oldauth, mds->get_nodeid()));
+ cache->verify_subtree_bounds(dir, import_bounds);
+ // freeze.
+ dir->_freeze_tree();
+ // note new state
+ it->second.state = IMPORT_PREPPED;
+ } else {
+ dout(7) << " couldn't acquire all needed locks, failing. " << *dir << dendl;
+ success = false;
+ }
+ } else {
+ dout(7) << " not active, failing. " << *dir << dendl;
+ success = false;
+ }
+
+ if (!success)
+ import_reverse_prepping(dir, it->second);
+
+ // ok!
+ dout(7) << " sending export_prep_ack on " << *dir << dendl;
+ mds->send_message(MExportDirPrepAck::create(dir->dirfrag(), success, m->get_tid()), m->get_connection());
+
+ ceph_assert(g_conf()->mds_kill_import_at != 4);
+}
+
+
+
+
+ /**
+  * Log-completion context used for the EImportStart journal entry.
+  *
+  * Takes a PIN_PTRWAITER reference on the directory when constructed and
+  * drops it again at the end of finish(), after forwarding to
+  * Migrator::import_logged_start().
+  */
+ class C_MDS_ImportDirLoggedStart : public MigratorLogContext {
+ private:
+   dirfrag_t df;     // dirfrag id captured at construction time
+   CDir *dir;        // directory pinned until finish() has run
+   mds_rank_t from;  // rank handed through to import_logged_start()
+
+ public:
+   // filled in by the creator; handed to import_logged_start() by finish()
+   map<client_t,pair<Session*,uint64_t> > imported_session_map;
+
+   C_MDS_ImportDirLoggedStart(Migrator *m, CDir *d, mds_rank_t f)
+     : MigratorLogContext(m),
+       df(d->dirfrag()),
+       dir(d),
+       from(f)
+   {
+     dir->get(CDir::PIN_PTRWAITER);
+   }
+
+   void finish(int r) override
+   {
+     mig->import_logged_start(df, dir, from, imported_session_map);
+     dir->put(CDir::PIN_PTRWAITER);
+   }
+ };
+
+ /**
+  * Handle an MExportDir message: the exporter is sending the subtree contents.
+  *
+  * Requires an import_state entry for the dirfrag in IMPORT_PREPPED state whose
+  * tid and peer match the message.  Starts an EImportStart journal entry,
+  * adjusts subtree auth (this rank listed first), prepares the client sessions
+  * carried by the message, decodes the exported dirfrags into the cache,
+  * records the bounds in the journal entry and submits it.  The import
+  * continues in import_logged_start() via C_MDS_ImportDirLoggedStart.
+  *
+  * @param m  export message (dirfrag, tid, bounds, client_map, export_data)
+  */
+ void Migrator::handle_export_dir(const MExportDir::const_ref &m)
+ {
+   // kill point for failure injection; ceph_assert matches the other
+   // mds_kill_import_at checks in this file
+   ceph_assert(g_conf()->mds_kill_import_at != 5);
+   CDir *dir = cache->get_dirfrag(m->dirfrag);
+   ceph_assert(dir);
+
+   mds_rank_t oldauth = mds_rank_t(m->get_source().num());
+   dout(7) << "handle_export_dir importing " << *dir << " from " << oldauth << dendl;
+
+   ceph_assert(!dir->is_auth());
+   ceph_assert(dir->freeze_tree_state);
+
+   // the message must match the import we prepared earlier
+   map<dirfrag_t,import_state_t>::iterator it = import_state.find(m->dirfrag);
+   ceph_assert(it != import_state.end());
+   ceph_assert(it->second.state == IMPORT_PREPPED);
+   ceph_assert(it->second.tid == m->get_tid());
+   ceph_assert(it->second.peer == oldauth);
+
+   if (!dir->get_inode()->dirfragtree.is_leaf(dir->get_frag()))
+     dir->get_inode()->dirfragtree.force_to_leaf(g_ceph_context, dir->get_frag());
+
+   cache->show_subtrees();
+
+   C_MDS_ImportDirLoggedStart *onlogged = new C_MDS_ImportDirLoggedStart(this, dir, oldauth);
+
+   // start the journal entry
+   EImportStart *le = new EImportStart(mds->mdlog, dir->dirfrag(), m->bounds, oldauth);
+   mds->mdlog->start_entry(le);
+
+   le->metablob.add_dir_context(dir);
+
+   // adjust auth (list us _first_)
+   cache->adjust_subtree_auth(dir, mds->get_nodeid(), oldauth);
+
+   // new client sessions, open these after we journal
+   // include imported sessions in EImportStart
+   auto cmp = m->client_map.cbegin();
+   map<client_t,entity_inst_t> client_map;
+   map<client_t,client_metadata_t> client_metadata_map;
+   decode(client_map, cmp);
+   decode(client_metadata_map, cmp);
+   ceph_assert(cmp.end());
+   le->cmapv = mds->server->prepare_force_open_sessions(client_map, client_metadata_map,
+                                                        onlogged->imported_session_map);
+   encode(client_map, le->client_map, mds->mdsmap->get_up_features());
+   encode(client_metadata_map, le->client_map);
+
+   // decode every exported dirfrag; decode_import_dir() returns the number of
+   // dentries it processed for that dirfrag
+   auto blp = m->export_data.cbegin();
+   int num_imported_inodes = 0;
+   while (!blp.end()) {
+     num_imported_inodes +=
+       decode_import_dir(blp,
+                         oldauth,
+                         dir,                 // import root
+                         le,
+                         mds->mdlog->get_current_segment(),
+                         it->second.peer_exports,
+                         it->second.updated_scatterlocks);
+   }
+   dout(10) << " " << m->bounds.size() << " imported bounds" << dendl;
+
+   // include bounds in EImportStart
+   set<CDir*> import_bounds;
+   for (const auto &bound : m->bounds) {
+     CDir *bd = cache->get_dirfrag(bound);
+     ceph_assert(bd);
+     le->metablob.add_dir(bd, false);  // note that parent metadata is already in the event
+     import_bounds.insert(bd);
+   }
+   cache->verify_subtree_bounds(dir, import_bounds);
+
+   // adjust popularity
+   mds->balancer->add_import(dir);
+
+   dout(7) << "handle_export_dir did " << *dir << dendl;
+
+   // note state
+   it->second.state = IMPORT_LOGGINGSTART;
+   ceph_assert(g_conf()->mds_kill_import_at != 6);
+
+   // log it
+   mds->mdlog->submit_entry(le, onlogged);
+   mds->mdlog->flush();
+
+   // some stats
+   if (mds->logger) {
+     mds->logger->inc(l_mds_imported);
+     mds->logger->inc(l_mds_imported_inodes, num_imported_inodes);
+   }
+ }
+
+
+/*
+ * this is an import helper
+ * called by import_finish, and import_reverse and friends.
+ */
+void Migrator::import_remove_pins(CDir *dir, set<CDir*>& bounds)
+{
+ import_state_t& stat = import_state[dir->dirfrag()];
+ // root
+ dir->put(CDir::PIN_IMPORTING);
+ dir->state_clear(CDir::STATE_IMPORTING);
+
+ // bounding inodes
+ set<inodeno_t> did;
+ for (list<dirfrag_t>::iterator p = stat.bound_ls.begin();
+ p != stat.bound_ls.end();
+ ++p) {
+ if (did.count(p->ino))
+ continue;
+ did.insert(p->ino);
+ CInode *in = cache->get_inode(p->ino);
+ ceph_assert(in);
+ in->put_stickydirs();
+ }
+
+ if (stat.state == IMPORT_PREPPING) {
+ for (auto bd : bounds) {
+ if (bd->state_test(CDir::STATE_IMPORTBOUND)) {
+ bd->put(CDir::PIN_IMPORTBOUND);
+ bd->state_clear(CDir::STATE_IMPORTBOUND);
+ }
+ }
+ } else if (stat.state >= IMPORT_PREPPED) {
+ // bounding dirfrags
+ for (auto bd : bounds) {
+ ceph_assert(bd->state_test(CDir::STATE_IMPORTBOUND));
+ bd->put(CDir::PIN_IMPORTBOUND);
+ bd->state_clear(CDir::STATE_IMPORTBOUND);
+ }
+ }
+}
+
+ /**
+  * Context that carries a list of other contexts and, when finished, pushes
+  * them to the front of the MDS waiter queue.
+  */
+ class C_MDC_QueueContexts : public MigratorContext {
+ public:
+   // contexts to queue when this context finishes
+   MDSContext::vec contexts;
+
+   C_MDC_QueueContexts(Migrator *m)
+     : MigratorContext(m)
+   {}
+
+   void finish(int r) override
+   {
+     // execute contexts immediately after 'this' context
+     get_mds()->queue_waiters_front(contexts);
+   }
+ };
+
+/*
+ * note: this does teh full work of reversing and import and cleaning up
+ * state.
+ * called by both handle_mds_failure and by handle_resolve (if we are
+ * a survivor coping with an exporter failure+recovery).
+ */
+void Migrator::import_reverse(CDir *dir)
+{
+ dout(7) << "import_reverse " << *dir << dendl;
+
+ import_state_t& stat = import_state[dir->dirfrag()];
+ stat.state = IMPORT_ABORTING;
+
+ set<CDir*> bounds;
+ cache->get_subtree_bounds(dir, bounds);
+
+ // remove pins
+ import_remove_pins(dir, bounds);
+
+ // update auth, with possible subtree merge.
+ ceph_assert(dir->is_subtree_root());
+ if (mds->is_resolve())
+ cache->trim_non_auth_subtree(dir);
+
+ cache->adjust_subtree_auth(dir, stat.peer);
+
+ auto fin = new C_MDC_QueueContexts(this);
+ if (!dir->get_inode()->is_auth() &&
+ !dir->get_inode()->has_subtree_root_dirfrag(mds->get_nodeid())) {
+ dir->get_inode()->clear_scatter_dirty();
+ // wake up scatter_nudge waiters
+ dir->get_inode()->take_waiting(CInode::WAIT_ANY_MASK, fin->contexts);
+ }
+
+ int num_dentries = 0;
+ // adjust auth bits.
+ list<CDir*> q;
+ q.push_back(dir);
+ while (!q.empty()) {
+ CDir *cur = q.front();
+ q.pop_front();
+
+ // dir
+ cur->abort_import();
+
+ for (auto &p : *cur) {
+ CDentry *dn = p.second;
+
+ // dentry
+ dn->state_clear(CDentry::STATE_AUTH);
+ dn->clear_replica_map();
+ dn->set_replica_nonce(CDentry::EXPORT_NONCE);
+ if (dn->is_dirty())
+ dn->mark_clean();
+
+ // inode?
+ if (dn->get_linkage()->is_primary()) {
+ CInode *in = dn->get_linkage()->get_inode();
+ in->state_clear(CDentry::STATE_AUTH);
+ in->clear_replica_map();
+ in->set_replica_nonce(CInode::EXPORT_NONCE);
+ if (in->is_dirty())
+ in->mark_clean();
+ in->clear_dirty_rstat();
+ if (!in->has_subtree_root_dirfrag(mds->get_nodeid())) {
+ in->clear_scatter_dirty();
+ in->take_waiting(CInode::WAIT_ANY_MASK, fin->contexts);
+ }
+
+ in->clear_dirty_parent();
+
+ in->authlock.clear_gather();
+ in->linklock.clear_gather();
+ in->dirfragtreelock.clear_gather();
+ in->filelock.clear_gather();
+
+ in->clear_file_locks();
+
+ // non-bounding dir?
+ list<CDir*> dfs;
+ in->get_dirfrags(dfs);
+ for (list<CDir*>::iterator p = dfs.begin(); p != dfs.end(); ++p)
+ if (bounds.count(*p) == 0)
+ q.push_back(*p);
+ }
+
+ cache->touch_dentry_bottom(dn); // move dentry to tail of LRU
+ ++num_dentries;
+ }
+ }
+
+ dir->add_waiter(CDir::WAIT_UNFREEZE, fin);
+
+ if (stat.state == IMPORT_ACKING) {
+ // remove imported caps
+ for (map<CInode*,map<client_t,Capability::Export> >::iterator p = stat.peer_exports.begin();
+ p != stat.peer_exports.end();
+ ++p) {
+ CInode *in = p->first;
+ for (map<client_t,Capability::Export>::iterator q = p->second.begin();
+ q != p->second.end();
+ ++q) {
+ Capability *cap = in->get_client_cap(q->first);
+ if (!cap) {
+ ceph_assert(!stat.session_map.count(q->first));
+ continue;
+ }
+ if (cap->is_importing())
+ in->remove_client_cap(q->first);
+ }
+ in->put(CInode::PIN_IMPORTINGCAPS);
+ }
+ for (auto& p : stat.session_map) {
+ Session *session = p.second.first;
+ session->dec_importing();
+ }
+ }
+
+ // log our failure
+ mds->mdlog->start_submit_entry(new EImportFinish(dir, false)); // log failure
+
+ cache->trim(num_dentries); // try trimming dentries
+
+ // notify bystanders; wait in aborting state
+ import_notify_abort(dir, bounds);
+}
+
+ /**
+  * Send an MExportDirNotify for a finished import to every bystander recorded
+  * in the import state.  Each notify lists the dirfrags in 'bounds' and is
+  * created with its third argument set to false.
+  *
+  * @param dir     import root
+  * @param bounds  bounding dirfrags of the imported subtree
+  */
+ void Migrator::import_notify_finish(CDir *dir, set<CDir*>& bounds)
+ {
+   dout(7) << "import_notify_finish " << *dir << dendl;
+
+   import_state_t& istate = import_state[dir->dirfrag()];
+   for (const mds_rank_t bystander : istate.bystanders) {
+     auto notify = MExportDirNotify::create(dir->dirfrag(), istate.tid, false,
+                                            pair<int,int>(istate.peer, mds->get_nodeid()),
+                                            pair<int,int>(mds->get_nodeid(), CDIR_AUTH_UNKNOWN));
+     for (CDir *bound : bounds)
+       notify->get_bounds().push_back(bound->dirfrag());
+     mds->send_message_mds(notify, bystander);
+   }
+ }
+
+ /**
+  * Tell the bystanders that an import is being aborted.
+  *
+  * When the cluster is degraded, bystanders that are not in clientreplay,
+  * active or stopping state are dropped from the bystander set.  Each remaining
+  * bystander gets an MExportDirNotify listing 'bounds', created with its third
+  * argument set to true.  If no bystanders remain, the reverse is finished
+  * right away through import_reverse_unfreeze().
+  *
+  * @param dir     import root being reversed
+  * @param bounds  bounding dirfrags of the subtree
+  */
+ void Migrator::import_notify_abort(CDir *dir, set<CDir*>& bounds)
+ {
+   dout(7) << "import_notify_abort " << *dir << dendl;
+
+   import_state_t& stat = import_state[dir->dirfrag()];
+   for (set<mds_rank_t>::iterator p = stat.bystanders.begin();
+        p != stat.bystanders.end(); ) {
+     if (mds->is_cluster_degraded() &&
+         !mds->mdsmap->is_clientreplay_or_active_or_stopping(*p)) {
+       // this can happen if both exporter and bystander fail in the same mdsmap epoch
+       stat.bystanders.erase(p++);
+       continue;
+     }
+     auto notify = MExportDirNotify::create(dir->dirfrag(), stat.tid, true,
+                                            mds_authority_t(stat.peer, mds->get_nodeid()),
+                                            mds_authority_t(stat.peer, CDIR_AUTH_UNKNOWN));
+     for (set<CDir*>::iterator i = bounds.begin(); i != bounds.end(); ++i)
+       notify->get_bounds().push_back((*i)->dirfrag());
+     mds->send_message_mds(notify, *p);
+     ++p;
+   }
+   if (stat.bystanders.empty()) {
+     dout(7) << "no bystanders, finishing reverse now" << dendl;
+     import_reverse_unfreeze(dir);
+   } else {
+     // kill point for failure injection; ceph_assert matches the other
+     // mds_kill_import_at checks in this file
+     ceph_assert(g_conf()->mds_kill_import_at != 10);
+   }
+ }
+
+ /**
+  * Unfreeze the subtree of a reversed import: discard delayed expires for
+  * the directory, unfreeze it, try a subtree merge if it is still a subtree
+  * root, then finish with import_reverse_final().
+  */
+ void Migrator::import_reverse_unfreeze(CDir *dir)
+ {
+   dout(7) << "import_reverse_unfreeze " << *dir << dendl;
+
+   ceph_assert(!dir->is_auth());
+
+   cache->discard_delayed_expire(dir);
+   dir->unfreeze_tree();
+
+   if (dir->is_subtree_root()) {
+     cache->try_subtree_merge(dir);
+   }
+
+   import_reverse_final(dir);
+ }
+
+ /**
+  * Last step of reversing an import: erase the import_state entry for the
+  * dirfrag (which must exist), let the cache send pending resolves, and
+  * release the locks of the mutation that was attached to the import, if any.
+  */
+ void Migrator::import_reverse_final(CDir *dir)
+ {
+   dout(7) << "import_reverse_final " << *dir << dendl;
+
+   // clean up
+   auto entry = import_state.find(dir->dirfrag());
+   ceph_assert(entry != import_state.end());
+
+   // keep the mutation reference beyond the erase below
+   MutationRef mut = entry->second.mut;
+   import_state.erase(entry);
+
+   // send pending import_maps?
+   mds->mdcache->maybe_send_pending_resolves();
+
+   if (mut) {
+     mds->locker->drop_locks(mut.get());
+     mut->cleanup();
+   }
+
+   cache->show_subtrees();
+   //audit();  // this fails, bc we munge up the subtree map during handle_import_map (resolve phase)
+ }
+
+
+
+
+ /**
+  * Continuation of handle_export_dir() after EImportStart has been journaled.
+  *
+  * If the import no longer exists or is not in IMPORT_LOGGINGSTART state, the
+  * prepared sessions are still finished and the function returns.  Otherwise
+  * the import moves to IMPORT_ACKING, client sessions are force-opened, the
+  * imported caps are set up with peer MDS_RANK_NONE (which delays the cap
+  * import messages to clients), and an MExportDirAck carrying the imported caps is sent to the
+  * old auth.
+  *
+  * @param df                    dirfrag id captured when the context was built
+  * @param dir                   import root
+  * @param from                  old auth rank that receives the ack
+  * @param imported_session_map  sessions prepared in handle_export_dir(); its
+  *                              contents are swapped into the import state
+  */
+ void Migrator::import_logged_start(dirfrag_t df, CDir *dir, mds_rank_t from,
+                                    map<client_t,pair<Session*,uint64_t> >& imported_session_map)
+ {
+   dout(7) << "import_logged " << *dir << dendl;
+
+   map<dirfrag_t, import_state_t>::iterator it = import_state.find(dir->dirfrag());
+   if (it == import_state.end() ||
+       it->second.state != IMPORT_LOGGINGSTART) {
+     dout(7) << "import " << df << " must have aborted" << dendl;
+     mds->server->finish_force_open_sessions(imported_session_map);
+     return;
+   }
+
+   // note state
+   it->second.state = IMPORT_ACKING;
+
+   // kill point for failure injection; ceph_assert matches the other
+   // mds_kill_import_at checks in this file
+   ceph_assert(g_conf()->mds_kill_import_at != 7);
+
+   // force open client sessions and finish cap import
+   mds->server->finish_force_open_sessions(imported_session_map, false);
+
+   map<inodeno_t,map<client_t,Capability::Import> > imported_caps;
+   for (map<CInode*, map<client_t,Capability::Export> >::iterator p = it->second.peer_exports.begin();
+        p != it->second.peer_exports.end();
+        ++p) {
+     // parameter 'peer' is NONE, delay sending cap import messages to client
+     finish_import_inode_caps(p->first, MDS_RANK_NONE, true, imported_session_map,
+                              p->second, imported_caps[p->first->ino()]);
+   }
+
+   it->second.session_map.swap(imported_session_map);
+
+   // send notify's etc.
+   dout(7) << "sending ack for " << *dir << " to old auth mds." << from << dendl;
+
+   // test surviving observer of a failed migration that did not complete
+   //assert(dir->replica_map.size() < 2 || mds->get_nodeid() != 0);
+
+   auto ack = MExportDirAck::create(dir->dirfrag(), it->second.tid);
+   encode(imported_caps, ack->imported_caps);
+
+   mds->send_message_mds(ack, from);
+   ceph_assert(g_conf()->mds_kill_import_at != 8);
+
+   cache->show_subtrees();
+ }
+
+ /**
+  * Handle MExportDirFinish from the exporter.  The dirfrag must be in cache
+  * and have an import_state entry with a matching tid; the work is done by
+  * import_finish() with notify=false.
+  */
+ void Migrator::handle_export_finish(const MExportDirFinish::const_ref &m)
+ {
+   const auto df = m->get_dirfrag();
+
+   CDir *dir = cache->get_dirfrag(df);
+   ceph_assert(dir);
+
+   const bool is_last = m->is_last();
+   dout(7) << "handle_export_finish on " << *dir << (is_last ? " last" : "") << dendl;
+
+   auto entry = import_state.find(df);
+   ceph_assert(entry != import_state.end());
+   ceph_assert(entry->second.tid == m->get_tid());
+
+   import_finish(dir, false, is_last);
+ }
+
+ /*
+ * Complete an import.
+ *
+ * The import must be in IMPORT_ACKING or IMPORT_FINISHING state.  On the
+ * first call (IMPORT_ACKING) this rank becomes the sole subtree auth and the
+ * imported caps are merged and handed to clients.  If 'last' is false the
+ * import is parked in IMPORT_FINISHING and the function returns; the final
+ * teardown below only runs when 'last' is true.
+ *
+ * dir:    import root
+ * notify: if true, notify bystanders via import_notify_finish()
+ * last:   whether the final teardown should run in this call
+ */
+ void Migrator::import_finish(CDir *dir, bool notify, bool last)
+ {
+ dout(7) << "import_finish on " << *dir << dendl;
+
+ map<dirfrag_t,import_state_t>::iterator it = import_state.find(dir->dirfrag());
+ ceph_assert(it != import_state.end());
+ ceph_assert(it->second.state == IMPORT_ACKING || it->second.state == IMPORT_FINISHING);
+
+ if (it->second.state == IMPORT_ACKING) {
+ ceph_assert(dir->is_auth());
+ cache->adjust_subtree_auth(dir, mds->get_nodeid(), mds->get_nodeid());
+ }
+
+ // log finish
+ ceph_assert(g_conf()->mds_kill_import_at != 9);
+
+ if (it->second.state == IMPORT_ACKING) {
+ // merge the exported cap state into the caps created earlier and send
+ // the cap import to each client that has a session in session_map
+ for (map<CInode*, map<client_t,Capability::Export> >::iterator p = it->second.peer_exports.begin();
+ p != it->second.peer_exports.end();
+ ++p) {
+ CInode *in = p->first;
+ ceph_assert(in->is_auth());
+ for (map<client_t,Capability::Export>::iterator q = p->second.begin();
+ q != p->second.end();
+ ++q) {
+ auto r = it->second.session_map.find(q->first);
+ if (r == it->second.session_map.end())
+ continue;
+
+ Session *session = r->second.first;
+ Capability *cap = in->get_client_cap(q->first);
+ ceph_assert(cap);
+ cap->merge(q->second, true);
+ cap->clear_importing();
+ mds->mdcache->do_cap_import(session, in, cap, q->second.cap_id, q->second.seq,
+ q->second.mseq - 1, it->second.peer, CEPH_CAP_FLAG_AUTH);
+ }
+ // the per-client export records are consumed; the map key (inode) is
+ // kept so the PIN_IMPORTINGCAPS put at the end still visits it
+ p->second.clear();
+ in->replica_caps_wanted = 0;
+ }
+ for (auto& p : it->second.session_map) {
+ Session *session = p.second.first;
+ session->dec_importing();
+ }
+ }
+
+ if (!last) {
+ ceph_assert(it->second.state == IMPORT_ACKING);
+ it->second.state = IMPORT_FINISHING;
+ return;
+ }
+
+ // remove pins
+ set<CDir*> bounds;
+ cache->get_subtree_bounds(dir, bounds);
+
+ if (notify)
+ import_notify_finish(dir, bounds);
+
+ import_remove_pins(dir, bounds);
+
+ // take peer_exports out of the import state before the entry is erased
+ map<CInode*, map<client_t,Capability::Export> > peer_exports;
+ it->second.peer_exports.swap(peer_exports);
+
+ // clear import state (we're done!)
+ MutationRef mut = it->second.mut;
+ import_state.erase(it);
+
+ mds->mdlog->start_submit_entry(new EImportFinish(dir, true));
+
+ // process delayed expires
+ cache->process_delayed_expire(dir);
+
+ // unfreeze tree, with possible subtree merge.
+ dir->unfreeze_tree();
+ cache->try_subtree_merge(dir);
+
+ cache->show_subtrees();
+ //audit(); // this fails, bc we munge up the subtree map during handle_import_map (resolve phase)
+
+ if (mut) {
+ mds->locker->drop_locks(mut.get());
+ mut->cleanup();
+ }
+
+ // re-eval imported caps
+ for (map<CInode*, map<client_t,Capability::Export> >::iterator p = peer_exports.begin();
+ p != peer_exports.end();
+ ++p) {
+ if (p->first->is_auth())
+ mds->locker->eval(p->first, CEPH_CAP_LOCKS, true);
+ p->first->put(CInode::PIN_IMPORTINGCAPS);
+ }
+
+ // send pending import_maps?
+ mds->mdcache->maybe_send_pending_resolves();
+
+ // did i just import mydir?
+ if (dir->ino() == MDS_INO_MDSDIR(mds->get_nodeid()))
+ cache->populate_mydir();
+
+ // is it empty?
+ if (dir->get_num_head_items() == 0 &&
+ !dir->inode->is_auth()) {
+ // reexport!
+ export_empty_import(dir);
+ }
+ }
+
+
+ /*
+ * Decode one exported inode from 'blp' and attach it to dentry 'dn'.
+ *
+ * Reads the ino and last snapid, reuses the cached inode if there is one or
+ * creates a new CInode, decodes the inode state and its caps, links the inode
+ * as dn's primary if it is not already linked, and adds oldauth as a replica.
+ * Dirty filelock/dirfragtreelock scatterlocks are appended to
+ * 'updated_scatterlocks'.
+ *
+ * dn:                   dentry the inode belongs to
+ * blp:                  decode position; advanced past the inode
+ * oldauth:              exporting rank, added as a replica of the inode
+ * ls:                   log segment passed to CInode::decode_import()
+ * peer_exports:         receives the exported caps (see decode_import_inode_caps)
+ * updated_scatterlocks: receives scatterlocks found dirty after decoding
+ */
+ void Migrator::decode_import_inode(CDentry *dn, bufferlist::const_iterator& blp,
+ mds_rank_t oldauth, LogSegment *ls,
+ map<CInode*, map<client_t,Capability::Export> >& peer_exports,
+ list<ScatterLock*>& updated_scatterlocks)
+ {
+ dout(15) << "decode_import_inode on " << *dn << dendl;
+
+ inodeno_t ino;
+ snapid_t last;
+ decode(ino, blp);
+ decode(last, blp);
+
+ // 'added' remembers whether the inode was created here, so it can be
+ // registered with the cache after it has been decoded and linked
+ bool added = false;
+ CInode *in = cache->get_inode(ino, last);
+ if (!in) {
+ in = new CInode(mds->mdcache, true, 1, last);
+ added = true;
+ }
+
+ // state after link -- or not! -sage
+ in->decode_import(blp, ls); // cap imports are noted for later action
+
+ // caps
+ decode_import_inode_caps(in, true, blp, peer_exports);
+
+ // link before state -- or not! -sage
+ if (dn->get_linkage()->get_inode() != in) {
+ ceph_assert(!dn->get_linkage()->get_inode());
+ dn->dir->link_primary_inode(dn, in);
+ }
+
+ if (in->is_dir())
+ dn->dir->pop_lru_subdirs.push_back(&in->item_pop_lru);
+
+ // add inode?
+ if (added) {
+ cache->add_inode(in);
+ dout(10) << "added " << *in << dendl;
+ } else {
+ dout(10) << " had " << *in << dendl;
+ }
+
+ if (in->inode.is_dirty_rstat())
+ in->mark_dirty_rstat();
+
+ // clear if dirtyscattered, since we're going to journal this
+ // but not until we _actually_ finish the import...
+ if (in->filelock.is_dirty()) {
+ updated_scatterlocks.push_back(&in->filelock);
+ mds->locker->mark_updated_scatterlock(&in->filelock);
+ }
+
+ if (in->dirfragtreelock.is_dirty()) {
+ updated_scatterlocks.push_back(&in->dirfragtreelock);
+ mds->locker->mark_updated_scatterlock(&in->dirfragtreelock);
+ }
+
+ // adjust replica list
+ //assert(!in->is_replica(oldauth)); // not true on failed export
+ in->add_replica(oldauth, CInode::EXPORT_NONCE);
+ if (in->is_replica(mds->get_nodeid()))
+ in->remove_replica(mds->get_nodeid());
+
+ // a stable snaplock that is not in LOCK_SYNC gets re-evaluated right away
+ if (in->snaplock.is_stable() &&
+ in->snaplock.get_state() != LOCK_SYNC)
+ mds->locker->try_eval(&in->snaplock, NULL);
+ }
+
+ /**
+  * Decode the exported client caps of an inode.
+  *
+  * For auth caps the per-mds wanted map is decoded as well, the entry for this
+  * rank is removed, and the result is stored on the inode.  If there are any
+  * exported caps, or (for auth caps) the inode wants more than CEPH_CAP_PIN,
+  * the caps are recorded in peer_exports[in] and the inode takes a
+  * PIN_IMPORTINGCAPS reference.
+  *
+  * @param in            inode the caps belong to
+  * @param auth_cap      true when the auth caps are being imported
+  * @param blp           decode position, advanced past the cap data
+  * @param peer_exports  receives the decoded cap map, keyed by inode
+  */
+ void Migrator::decode_import_inode_caps(CInode *in, bool auth_cap,
+                                         bufferlist::const_iterator &blp,
+                                         map<CInode*, map<client_t,Capability::Export> >& peer_exports)
+ {
+   map<client_t,Capability::Export> exported;
+   decode(exported, blp);
+
+   if (auth_cap) {
+     mempool::mds_co::compact_map<int32_t,int32_t> wanted_by_mds;
+     decode(wanted_by_mds, blp);
+     wanted_by_mds.erase(mds->get_nodeid());
+     in->set_mds_caps_wanted(wanted_by_mds);
+   }
+
+   // decide whether this inode has anything worth tracking
+   bool track = !exported.empty();
+   if (!track && auth_cap)
+     track = (in->get_caps_wanted() & ~CEPH_CAP_PIN) != 0;
+   if (!track)
+     return;
+
+   peer_exports[in].swap(exported);
+   in->get(CInode::PIN_IMPORTINGCAPS);
+ }
+
+ /**
+  * Create or update the client caps of an inode from an exported cap map.
+  *
+  * Clients without an entry in session_map only get an empty slot in
+  * import_map.  For the others a cap is looked up or added; a newly added cap
+  * is marked importing when 'peer' is negative.  With a non-negative 'peer'
+  * the exported state is merged and the cap import is issued immediately,
+  * and afterwards the inode's replica_caps_wanted is reset and its
+  * PIN_IMPORTINGCAPS reference is dropped.
+  *
+  * @param in           inode receiving the caps
+  * @param peer         exporting rank, or a negative value to defer the import
+  * @param auth_cap     true when the auth caps are being imported
+  * @param session_map  sessions of the importing clients
+  * @param export_map   exported cap state per client
+  * @param import_map   filled with the cap ids/seqs reported to the exporter
+  */
+ void Migrator::finish_import_inode_caps(CInode *in, mds_rank_t peer, bool auth_cap,
+                                         const map<client_t,pair<Session*,uint64_t> >& session_map,
+                                         const map<client_t,Capability::Export> &export_map,
+                                         map<client_t,Capability::Import> &import_map)
+ {
+   const bool deferred = (peer < 0);
+
+   for (auto& exported : export_map) {
+     const client_t client = exported.first;
+     const Capability::Export& ex = exported.second;
+     dout(10) << "finish_import_inode_caps for client." << client << " on " << *in << dendl;
+
+     auto sess_it = session_map.find(client);
+     if (sess_it == session_map.end()) {
+       dout(10) << " no session for client." << client << dendl;
+       (void)import_map[client];
+       continue;
+     }
+     Session *session = sess_it->second.first;
+
+     Capability *cap = in->get_client_cap(client);
+     if (!cap) {
+       cap = in->add_client_cap(client, session);
+       if (deferred)
+         cap->mark_importing();
+     }
+
+     // Always ask exporter mds to send cap export messages for auth caps.
+     // For non-auth caps, ask exporter mds to send cap export messages to
+     // clients who haven't opened sessions. The cap export messages will
+     // make clients open sessions.
+     if (auth_cap || !session->get_connection()) {
+       Capability::Import& im = import_map[client];
+       im.cap_id = cap->get_cap_id();
+       im.mseq = auth_cap ? ex.mseq : cap->get_mseq();
+       im.issue_seq = cap->get_last_seq() + 1;
+     }
+
+     if (!deferred) {
+       cap->merge(ex, auth_cap);
+       mds->mdcache->do_cap_import(session, in, cap, ex.cap_id,
+                                   ex.seq, ex.mseq - 1, peer,
+                                   auth_cap ? CEPH_CAP_FLAG_AUTH : CEPH_CAP_FLAG_RELEASE);
+     }
+   }
+
+   if (!deferred) {
+     in->replica_caps_wanted = 0;
+     in->put(CInode::PIN_IMPORTINGCAPS);
+   }
+ }
+
+ /*
+ * Decode one exported dirfrag, including its dentries and inodes, from 'blp'.
+ *
+ * blp:                  decode position; advanced past this dirfrag
+ * oldauth:              exporting rank, added as replica of everything decoded
+ * import_root:          root of the import; its freeze_tree_state is shared
+ *                       with dirfrags that do not have one yet
+ * le:                   journal entry to record into; the code checks it for
+ *                       null, but asserts it is set when an inode ('I') is decoded
+ * ls:                   log segment passed to the decode_import() calls
+ * peer_exports:         receives exported caps of decoded inodes
+ * updated_scatterlocks: receives dirty scatterlocks of decoded inodes
+ *
+ * Returns the number of dentries decoded for this dirfrag.
+ */
+ int Migrator::decode_import_dir(bufferlist::const_iterator& blp,
+ mds_rank_t oldauth,
+ CDir *import_root,
+ EImportStart *le,
+ LogSegment *ls,
+ map<CInode*,map<client_t,Capability::Export> >& peer_exports,
+ list<ScatterLock*>& updated_scatterlocks)
+ {
+ // set up dir
+ dirfrag_t df;
+ decode(df, blp);
+
+ CInode *diri = cache->get_inode(df.ino);
+ ceph_assert(diri);
+ CDir *dir = diri->get_or_open_dirfrag(mds->mdcache, df.frag);
+ ceph_assert(dir);
+
+ dout(7) << "decode_import_dir " << *dir << dendl;
+
+ // a dirfrag without freeze_tree_state must be at version 0; it adopts the
+ // import root's freeze_tree_state
+ if (!dir->freeze_tree_state) {
+ ceph_assert(dir->get_version() == 0);
+ dir->freeze_tree_state = import_root->freeze_tree_state;
+ }
+
+ // assimilate state
+ dir->decode_import(blp, ls);
+
+ // adjust replica list
+ //assert(!dir->is_replica(oldauth)); // not true on failed export
+ dir->add_replica(oldauth, CDir::EXPORT_NONCE);
+ if (dir->is_replica(mds->get_nodeid()))
+ dir->remove_replica(mds->get_nodeid());
+
+ // add to journal entry
+ if (le)
+ le->metablob.add_import_dir(dir);
+
+ int num_imported = 0;
+
+ // take all waiters on this dir
+ // NOTE: a pass of imported data is guaranteed to get all of my waiters because
+ // a replica's presence in my cache implies/forces its presence in authority's.
+ MDSContext::vec waiters;
+ dir->take_waiting(CDir::WAIT_ANY_MASK, waiters);
+ for (auto c : waiters)
+ dir->add_waiter(CDir::WAIT_UNFREEZE, c); // UNFREEZE will get kicked both on success or failure
+
+ dout(15) << "doing contents" << dendl;
+
+ // contents
+ // wire layout per dentry: name, last snapid, dentry state, then a one-char
+ // code ('N' null, 'L' remote link, 'I' primary inode) and its payload
+ __u32 nden;
+ decode(nden, blp);
+
+ for (; nden>0; nden--) {
+ num_imported++;
+
+ // dentry
+ string dname;
+ snapid_t last;
+ decode(dname, blp);
+ decode(last, blp);
+
+ CDentry *dn = dir->lookup_exact_snap(dname, last);
+ if (!dn)
+ dn = dir->add_null_dentry(dname, 1, last);
+
+ dn->decode_import(blp, ls);
+
+ dn->add_replica(oldauth, CDentry::EXPORT_NONCE);
+ if (dn->is_replica(mds->get_nodeid()))
+ dn->remove_replica(mds->get_nodeid());
+
+ // dentry lock in unreadable state can block path traverse
+ if (dn->lock.get_state() != LOCK_SYNC)
+ mds->locker->try_eval(&dn->lock, NULL);
+
+ dout(15) << "decode_import_dir got " << *dn << dendl;
+
+ // points to...
+ char icode;
+ decode(icode, blp);
+
+ // NOTE(review): any icode other than 'N', 'L' or 'I' falls through all
+ // three branches without consuming a payload -- confirm that is intended.
+ if (icode == 'N') {
+ // null dentry
+ ceph_assert(dn->get_linkage()->is_null());
+
+ // fall thru
+ }
+ else if (icode == 'L') {
+ // remote link
+ inodeno_t ino;
+ unsigned char d_type;
+ decode(ino, blp);
+ decode(d_type, blp);
+ if (dn->get_linkage()->is_remote()) {
+ ceph_assert(dn->get_linkage()->get_remote_ino() == ino);
+ } else {
+ dir->link_remote_inode(dn, ino, d_type);
+ }
+ }
+ else if (icode == 'I') {
+ // inode
+ ceph_assert(le);
+ decode_import_inode(dn, blp, oldauth, ls,
+ peer_exports, updated_scatterlocks);
+ }
+
+ // add dentry to journal entry
+ if (le)
+ le->metablob.add_import_dentry(dn);
+ }
+
+ #ifdef MDS_VERIFY_FRAGSTAT
+ if (dir->is_complete())
+ dir->verify_fragstat();
+ #endif
+
+ dir->inode->maybe_export_pin();
+
+ dout(7) << "decode_import_dir done " << *dir << dendl;
+ return num_imported;
+ }
+
+
+
+
+
+// authority bystander
+
+ /**
+  * Bystander side of a migration: handle MExportDirNotify.
+  *
+  * Ignored unless this MDS is in clientreplay, active or stopping state.  If
+  * the dirfrag is cached and its current authority equals the old auth in the
+  * message, the bounded subtree auth is switched to the new auth and a
+  * subtree merge is attempted.  An MExportDirNotifyAck is sent back when the
+  * message asks for one.
+  */
+ void Migrator::handle_export_notify(const MExportDirNotify::const_ref &m)
+ {
+   const bool can_handle =
+     mds->is_clientreplay() || mds->is_active() || mds->is_stopping();
+   if (!can_handle)
+     return;
+
+   CDir *dir = cache->get_dirfrag(m->get_dirfrag());
+
+   mds_rank_t from = mds_rank_t(m->get_source().num());
+   mds_authority_t old_auth = m->get_old_auth();
+   mds_authority_t new_auth = m->get_new_auth();
+
+   if (!dir) {
+     dout(7) << "handle_export_notify " << old_auth << " -> " << new_auth
+             << " on missing dir " << m->get_dirfrag() << dendl;
+   } else if (dir->authority() != old_auth) {
+     dout(7) << "handle_export_notify old_auth was " << dir->authority()
+             << " != " << old_auth << " -> " << new_auth
+             << " on " << *dir << dendl;
+   } else {
+     dout(7) << "handle_export_notify " << old_auth << " -> " << new_auth
+             << " on " << *dir << dendl;
+
+     // adjust auth, using only the bounds we actually have in cache
+     set<CDir*> cached_bounds;
+     cache->map_dirfrag_set(m->get_bounds(), cached_bounds);
+     cache->adjust_bounded_subtree_auth(dir, cached_bounds, new_auth);
+
+     // induce a merge?
+     cache->try_subtree_merge(dir);
+   }
+
+   // send ack
+   if (!m->wants_ack()) {
+     // aborted.  no ack.
+     dout(7) << "handle_export_notify no ack requested" << dendl;
+     return;
+   }
+   mds->send_message_mds(MExportDirNotifyAck::create(m->get_dirfrag(), m->get_tid(), m->get_new_auth()), from);
+ }
+
+/** cap exports **/
+ /**
+  * Send the client caps held on a non-auth inode to the inode's authority in
+  * an MExportCaps message.  The inode must have caps, must not be auth or
+  * ambiguous auth, and must not have STATE_EXPORTINGCAPS set.
+  */
+ void Migrator::export_caps(CInode *in)
+ {
+   const mds_rank_t dest = in->authority().first;
+   dout(7) << "export_caps to mds." << dest << " " << *in << dendl;
+
+   // preconditions
+   ceph_assert(in->is_any_caps());
+   ceph_assert(!in->is_auth());
+   ceph_assert(!in->is_ambiguous_auth());
+   ceph_assert(!in->state_test(CInode::STATE_EXPORTINGCAPS));
+
+   auto msg = MExportCaps::create();
+   msg->ino = in->ino();
+   encode_export_inode_caps(in, false, msg->cap_bl, msg->client_map, msg->client_metadata_map);
+
+   mds->send_message_mds(msg, dest);
+ }
+
+ /**
+  * Handle MExportCapsAck from the rank that imported our caps.
+  *
+  * Does nothing if the inode is no longer cached.  For every imported cap
+  * whose local cap still exists with the cap id recorded in the ack, the
+  * client is sent a CEPH_CAP_OP_EXPORT message pointing at the importer and
+  * the local cap is removed.  Afterwards file caps are re-requested and the
+  * inode's locks are re-evaluated.
+  */
+ void Migrator::handle_export_caps_ack(const MExportCapsAck::const_ref &ack)
+ {
+   mds_rank_t from = ack->get_source().num();
+   CInode *in = cache->get_inode(ack->ino);
+   if (!in)
+     return;
+
+   ceph_assert(!in->is_auth());
+
+   dout(10) << "handle_export_caps_ack " << *ack << " from "
+            << ack->get_source() << " on " << *in << dendl;
+
+   map<client_t,Capability::Import> imported_caps;
+   map<client_t,uint64_t> caps_ids;
+   auto blp = ack->cap_bl.cbegin();
+   decode(imported_caps, blp);
+   decode(caps_ids, blp);
+
+   for (auto& entry : imported_caps) {
+     const client_t client = entry.first;
+     const Capability::Import& imported = entry.second;
+
+     // skip caps that are gone or no longer the cap the ack refers to
+     Capability *cap = in->get_client_cap(client);
+     if (!cap || cap->get_cap_id() != caps_ids.at(client))
+       continue;
+
+     dout(7) << __func__ << " telling client." << client
+             << " exported caps on " << *in << dendl;
+     auto m = MClientCaps::create(CEPH_CAP_OP_EXPORT, in->ino(), 0,
+                                  cap->get_cap_id(), cap->get_mseq(),
+                                  mds->get_osd_epoch_barrier());
+     m->set_cap_peer(imported.cap_id, imported.issue_seq, imported.mseq, from, 0);
+     mds->send_message_client_counted(m, client);
+
+     in->remove_client_cap(client);
+   }
+
+   mds->locker->request_inode_file_caps(in);
+   mds->locker->try_eval(in, CEPH_CAP_LOCKS);
+ }
+
+ /**
+  * Handle MGatherCaps: export our caps on the named inode to its authority,
+  * provided the inode is cached, has caps, is neither auth nor ambiguous
+  * auth, and does not have STATE_EXPORTINGCAPS set.
+  */
+ void Migrator::handle_gather_caps(const MGatherCaps::const_ref &m)
+ {
+   CInode *in = cache->get_inode(m->ino);
+   if (!in) {
+     return;
+   }
+
+   dout(10) << "handle_gather_caps " << *m << " from " << m->get_source()
+            << " on " << *in << dendl;
+
+   const bool exportable =
+     in->is_any_caps() &&
+     !in->is_auth() &&
+     !in->is_ambiguous_auth() &&
+     !in->state_test(CInode::STATE_EXPORTINGCAPS);
+   if (exportable) {
+     export_caps(in);
+   }
+ }
+
+ /**
+  * Log-completion context for a cap import: once the journal entry is
+  * logged, forwards the inode, the source rank and the two maps below to
+  * Migrator::logged_import_caps().
+  */
+ class C_M_LoggedImportCaps : public MigratorLogContext {
+ private:
+   CInode *in;       // inode whose caps are being imported
+   mds_rank_t from;  // rank handed through to logged_import_caps()
+
+ public:
+   // both maps are filled in by the creator before the entry is submitted
+   map<client_t,pair<Session*,uint64_t> > imported_session_map;
+   map<CInode*, map<client_t,Capability::Export> > peer_exports;
+
+   C_M_LoggedImportCaps(Migrator *m, CInode *i, mds_rank_t f)
+     : MigratorLogContext(m),
+       in(i),
+       from(f)
+   {}
+
+   void finish(int r) override
+   {
+     mig->logged_import_caps(in, from, imported_session_map, peer_exports);
+   }
+ };
+
+ /**
+  * Handle MExportCaps: a replica is handing the caps it holds on one of our
+  * auth inodes over to us.
+  *
+  * The message is dropped if the inode cannot be auth-pinned (see FIXME).
+  * Otherwise the inode is auth-pinned, the client sessions are prepared, the
+  * caps are decoded, and an ESessions entry is journaled; the import
+  * continues in logged_import_caps().
+  */
+ void Migrator::handle_export_caps(const MExportCaps::const_ref &ex)
+ {
+   dout(10) << "handle_export_caps " << *ex << " from " << ex->get_source() << dendl;
+
+   CInode *in = cache->get_inode(ex->ino);
+   ceph_assert(in);
+   ceph_assert(in->is_auth());
+
+   // FIXME
+   if (!in->can_auth_pin())
+     return;
+
+   in->auth_pin(this);
+
+   // local copies; they are moved into the journal entry below
+   map<client_t,entity_inst_t> client_map{ex->client_map};
+   map<client_t,client_metadata_t> client_metadata_map{ex->client_metadata_map};
+
+   const mds_rank_t exporter = mds_rank_t(ex->get_source().num());
+   C_M_LoggedImportCaps *onlogged = new C_M_LoggedImportCaps(this, in, exporter);
+
+   version_t pv = mds->server->prepare_force_open_sessions(client_map, client_metadata_map,
+                                                           onlogged->imported_session_map);
+
+   // decode new caps
+   auto blp = ex->cap_bl.cbegin();
+   decode_import_inode_caps(in, false, blp, onlogged->peer_exports);
+   ceph_assert(!onlogged->peer_exports.empty());  // thus, inode is pinned.
+
+   // journal open client sessions
+   ESessions *le = new ESessions(pv, std::move(client_map),
+                                 std::move(client_metadata_map));
+   mds->mdlog->start_submit_entry(le, onlogged);
+   mds->mdlog->flush();
+ }
+
+
+ /**
+  * Continuation of handle_export_caps() after the ESessions entry is logged.
+  *
+  * Finishes opening the client sessions, imports the caps recorded for 'in'
+  * in peer_exports, re-evaluates the inode's locks, and, if any caps were
+  * reported back, sends an MExportCapsAck to 'from'.  Finally drops the auth
+  * pin taken in handle_export_caps().
+  *
+  * @param in                    inode whose caps were exported to us
+  * @param from                  rank that sent the caps
+  * @param imported_session_map  sessions prepared before journaling
+  * @param peer_exports          decoded caps; must contain an entry for 'in'
+  */
+ void Migrator::logged_import_caps(CInode *in,
+                                   mds_rank_t from,
+                                   map<client_t,pair<Session*,uint64_t> >& imported_session_map,
+                                   map<CInode*, map<client_t,Capability::Export> >& peer_exports)
+ {
+   dout(10) << "logged_import_caps on " << *in << dendl;
+   // see export_go() vs export_go_synced()
+   ceph_assert(in->is_auth());
+
+   // force open client sessions and finish cap import
+   mds->server->finish_force_open_sessions(imported_session_map);
+
+   auto exported = peer_exports.find(in);
+   ceph_assert(exported != peer_exports.end());
+   const map<client_t,Capability::Export>& export_map = exported->second;
+
+   // clients will release caps from the exporter when they receive the cap import message.
+   map<client_t,Capability::Import> imported_caps;
+   finish_import_inode_caps(in, from, false, imported_session_map, export_map, imported_caps);
+   mds->locker->eval(in, CEPH_CAP_LOCKS, true);
+
+   if (!imported_caps.empty()) {
+     auto ack = MExportCapsAck::create(in->ino());
+
+     // tell the exporter which of its cap ids each imported cap refers to
+     map<client_t,uint64_t> peer_caps_ids;
+     for (auto &imported : imported_caps) {
+       const client_t client = imported.first;
+       peer_caps_ids[client] = export_map.at(client).cap_id;
+     }
+
+     encode(imported_caps, ack->cap_bl);
+     encode(peer_caps_ids, ack->cap_bl);
+     mds->send_message_mds(ack, from);
+   }
+
+   in->auth_unpin(this);
+ }
+
+ /**
+  * Construct the migrator for an MDS rank and read the initial values of the
+  * "mds_max_export_size" and "mds_inject_migrator_session_race" options.
+  */
+ Migrator::Migrator(MDSRank *m, MDCache *c)
+   : mds(m),
+     cache(c)
+ {
+   max_export_size =
+     g_conf().get_val<Option::size_t>("mds_max_export_size");
+   inject_session_race =
+     g_conf().get_val<bool>("mds_inject_migrator_session_race");
+ }
+
+ /**
+  * Re-read the migrator's cached configuration values for the options named
+  * in 'changed'.  A change of "mds_inject_migrator_session_race" is logged
+  * at level 0.
+  *
+  * @param changed  names of the options that changed
+  * @param mds_map  not used by this function
+  */
+ void Migrator::handle_conf_change(const std::set<std::string>& changed, const MDSMap& mds_map)
+ {
+   const auto unchanged = changed.end();
+
+   if (changed.find("mds_max_export_size") != unchanged) {
+     max_export_size = g_conf().get_val<Option::size_t>("mds_max_export_size");
+   }
+
+   if (changed.find("mds_inject_migrator_session_race") != unchanged) {
+     inject_session_race = g_conf().get_val<bool>("mds_inject_migrator_session_race");
+     dout(0) << "mds_inject_migrator_session_race is " << inject_session_race << dendl;
+   }
+ }
diff --git a/src/mds/Migrator.h b/src/mds/Migrator.h
new file mode 100644
index 00000000..de35b427
--- /dev/null
+++ b/src/mds/Migrator.h
@@ -0,0 +1,376 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ * Handles the import and export of mds authorities and actual cache data.
+ * See src/doc/exports.txt for a description.
+ */
+
+#ifndef CEPH_MDS_MIGRATOR_H
+#define CEPH_MDS_MIGRATOR_H
+
+#include "include/types.h"
+
+#include "MDSContext.h"
+
+#include <map>
+#include <list>
+#include <set>
+#include <string_view>
+
+class MDSRank;
+class CDir;
+class CInode;
+class CDentry;
+class Session;
+
+#include "messages/MExportCaps.h"
+#include "messages/MExportCapsAck.h"
+#include "messages/MExportDir.h"
+#include "messages/MExportDirAck.h"
+#include "messages/MExportDirCancel.h"
+#include "messages/MExportDirDiscover.h"
+#include "messages/MExportDirDiscoverAck.h"
+#include "messages/MExportDirFinish.h"
+#include "messages/MExportDirNotify.h"
+#include "messages/MExportDirNotifyAck.h"
+#include "messages/MExportDirPrep.h"
+#include "messages/MExportDirPrepAck.h"
+#include "messages/MGatherCaps.h"
+
+class EImportStart;
+
+class Migrator { // handles the import and export of mds authorities and cache data (see file header)
+public:
+ // export stages. used to clean up intelligently if there's a failure.
+ const static int EXPORT_CANCELLED = 0; // cancelled
+ const static int EXPORT_CANCELLING = 1; // waiting for cancel notifyacks
+ const static int EXPORT_LOCKING = 2; // acquiring locks
+ const static int EXPORT_DISCOVERING = 3; // dest is discovering export dir
+ const static int EXPORT_FREEZING = 4; // we're freezing the dir tree
+ const static int EXPORT_PREPPING = 5; // sending dest spanning tree to export bounds
+ const static int EXPORT_WARNING = 6; // warning bystanders of dir_auth_pending
+ const static int EXPORT_EXPORTING = 7; // sent actual export, waiting for ack
+ const static int EXPORT_LOGGINGFINISH = 8; // logging EExportFinish
+ const static int EXPORT_NOTIFYING = 9; // waiting for notifyacks
+ static std::string_view get_export_statename(int s) { // name of an EXPORT_* stage, for display
+ switch (s) {
+ case EXPORT_CANCELLING: return "cancelling";
+ case EXPORT_LOCKING: return "locking";
+ case EXPORT_DISCOVERING: return "discovering";
+ case EXPORT_FREEZING: return "freezing";
+ case EXPORT_PREPPING: return "prepping";
+ case EXPORT_WARNING: return "warning";
+ case EXPORT_EXPORTING: return "exporting";
+ case EXPORT_LOGGINGFINISH: return "loggingfinish";
+ case EXPORT_NOTIFYING: return "notifying";
+ default: ceph_abort(); return std::string_view(); // note: EXPORT_CANCELLED (0) has no case and lands here
+ }
+ }
+
+ // -- imports --
+ const static int IMPORT_DISCOVERING = 1; // waiting for prep
+ const static int IMPORT_DISCOVERED = 2; // waiting for prep
+ const static int IMPORT_PREPPING = 3; // opening dirs on bounds
+ const static int IMPORT_PREPPED = 4; // opened bounds, waiting for import
+ const static int IMPORT_LOGGINGSTART = 5; // got import, logging EImportStart
+ const static int IMPORT_ACKING = 6; // logged EImportStart, sent ack, waiting for finish
+ const static int IMPORT_FINISHING = 7; // sent cap imports, waiting for finish
+ const static int IMPORT_ABORTING = 8; // notifying bystanders of an abort before unfreezing
+ static std::string_view get_import_statename(int s) { // name of an IMPORT_* stage; aborts on any other value
+ switch (s) {
+ case IMPORT_DISCOVERING: return "discovering";
+ case IMPORT_DISCOVERED: return "discovered";
+ case IMPORT_PREPPING: return "prepping";
+ case IMPORT_PREPPED: return "prepped";
+ case IMPORT_LOGGINGSTART: return "loggingstart";
+ case IMPORT_ACKING: return "acking";
+ case IMPORT_FINISHING: return "finishing";
+ case IMPORT_ABORTING: return "aborting";
+ default: ceph_abort(); return std::string_view();
+ }
+ }
+
+ // -- cons --
+ Migrator(MDSRank *m, MDCache *c);
+
+ void handle_conf_change(const std::set<std::string>& changed, const MDSMap& mds_map);
+
+protected:
+ struct export_base_t { // shared record referenced by export_state_t::parent
+ dirfrag_t dirfrag;
+ mds_rank_t dest;
+ unsigned pending_children;
+ uint64_t export_queue_gen;
+ bool restart = false;
+ export_base_t(dirfrag_t df, mds_rank_t d, unsigned c, uint64_t g) :
+ dirfrag(df), dest(d), pending_children(c), export_queue_gen(g) {}
+ };
+
+ // export fun
+ struct export_state_t { // per-directory state of one in-flight export, keyed by CDir* in export_state
+ int state = 0; // one of the EXPORT_* constants
+ mds_rank_t peer = MDS_RANK_NONE;
+ uint64_t tid = 0;
+ std::set<mds_rank_t> warning_ack_waiting; // queried by export_has_warned()
+ std::set<mds_rank_t> notify_ack_waiting; // queried by export_has_notified()
+ std::map<inodeno_t,std::map<client_t,Capability::Import> > peer_imported;
+ MutationRef mut;
+ size_t approx_size = 0;
+ // for freeze tree deadlock detection
+ utime_t last_cum_auth_pins_change;
+ int last_cum_auth_pins = 0;
+ int num_remote_waiters = 0; // number of remote authpin waiters
+ export_state_t() {}
+
+ std::shared_ptr<export_base_t> parent;
+ };
+ std::map<CDir*, export_state_t> export_state;
+ typedef std::map<CDir*, export_state_t>::iterator export_state_iterator;
+
+ uint64_t total_exporting_size = 0;
+ unsigned num_locking_exports = 0; // exports in locking state (approx_size == 0)
+
+ std::list<std::pair<dirfrag_t,mds_rank_t> > export_queue;
+ uint64_t export_queue_gen = 1; // bumped by clear_export_queue()
+
+ // import fun
+ struct import_state_t { // per-dirfrag state of one in-flight import, keyed by dirfrag_t in import_state
+ int state; // one of the IMPORT_* constants
+ mds_rank_t peer;
+ uint64_t tid;
+ std::set<mds_rank_t> bystanders;
+ std::list<dirfrag_t> bound_ls;
+ std::list<ScatterLock*> updated_scatterlocks;
+ std::map<client_t,std::pair<Session*,uint64_t> > session_map;
+ std::map<CInode*, std::map<client_t,Capability::Export> > peer_exports;
+ MutationRef mut;
+ import_state_t() : state(0), peer(0), tid(0), mut() {}
+ };
+
+ std::map<dirfrag_t, import_state_t> import_state;
+
+ void handle_export_discover_ack(const MExportDirDiscoverAck::const_ref &m);
+ void export_frozen(CDir *dir, uint64_t tid);
+ void handle_export_prep_ack(const MExportDirPrepAck::const_ref &m);
+ void export_sessions_flushed(CDir *dir, uint64_t tid);
+ void export_go(CDir *dir);
+ void export_go_synced(CDir *dir, uint64_t tid);
+ void export_try_cancel(CDir *dir, bool notify_peer=true);
+ void export_cancel_finish(export_state_iterator& it);
+ void export_reverse(CDir *dir, export_state_t& stat);
+ void export_notify_abort(CDir *dir, export_state_t& stat, std::set<CDir*>& bounds);
+ void handle_export_ack(const MExportDirAck::const_ref &m);
+ void export_logged_finish(CDir *dir);
+ void handle_export_notify_ack(const MExportDirNotifyAck::const_ref &m);
+ void export_finish(CDir *dir);
+
+ void handle_gather_caps(const MGatherCaps::const_ref &m);
+
+ friend class C_MDC_ExportFreeze;
+ friend class C_MDS_ExportFinishLogged;
+ friend class C_M_ExportGo;
+ friend class C_M_ExportSessionsFlushed;
+ friend class C_MDS_ExportDiscover;
+ friend class C_MDS_ExportPrep;
+ friend class MigratorContext;
+ friend class MigratorLogContext;
+
+ // importer
+ void handle_export_discover(const MExportDirDiscover::const_ref &m, bool started=false);
+ void handle_export_cancel(const MExportDirCancel::const_ref &m);
+ void handle_export_prep(const MExportDirPrep::const_ref &m, bool did_assim=false);
+ void handle_export_dir(const MExportDir::const_ref &m);
+
+ void import_reverse_discovering(dirfrag_t df);
+ void import_reverse_discovered(dirfrag_t df, CInode *diri);
+ void import_reverse_prepping(CDir *dir, import_state_t& stat);
+ void import_remove_pins(CDir *dir, std::set<CDir*>& bounds);
+ void import_reverse_unfreeze(CDir *dir);
+ void import_reverse_final(CDir *dir);
+ void import_notify_abort(CDir *dir, std::set<CDir*>& bounds);
+ void import_notify_finish(CDir *dir, std::set<CDir*>& bounds);
+ void import_logged_start(dirfrag_t df, CDir *dir, mds_rank_t from,
+ std::map<client_t,std::pair<Session*,uint64_t> >& imported_session_map);
+ void handle_export_finish(const MExportDirFinish::const_ref &m);
+
+ void handle_export_caps(const MExportCaps::const_ref &m);
+ void handle_export_caps_ack(const MExportCapsAck::const_ref &m);
+ void logged_import_caps(CInode *in,
+ mds_rank_t from,
+ std::map<client_t,std::pair<Session*,uint64_t> >& imported_session_map,
+ std::map<CInode*, std::map<client_t,Capability::Export> >& cap_imports);
+
+
+ friend class C_MDS_ImportDirLoggedStart;
+ friend class C_MDS_ImportDirLoggedFinish;
+ friend class C_M_LoggedImportCaps;
+
+ // bystander
+ void handle_export_notify(const MExportDirNotify::const_ref &m);
+
+
+public:
+
+ void dispatch(const Message::const_ref &);
+
+ void show_importing();
+ void show_exporting();
+
+ int get_num_exporting() const { return export_state.size(); }
+ int get_export_queue_size() const { return export_queue.size(); }
+
+ // -- status --
+ int is_exporting(CDir *dir) const { // EXPORT_* state of dir, or 0 when dir has no export_state entry
+ auto it = export_state.find(dir);
+ if (it != export_state.end()) return it->second.state;
+ return 0;
+ }
+ bool is_exporting() const { return !export_state.empty(); }
+ int is_importing(dirfrag_t df) const { // IMPORT_* state of df, or 0 when df has no import_state entry
+ auto it = import_state.find(df);
+ if (it != import_state.end()) return it->second.state;
+ return 0;
+ }
+ bool is_importing() const { return !import_state.empty(); }
+
+ bool is_ambiguous_import(dirfrag_t df) const { // true for states in [IMPORT_LOGGINGSTART, IMPORT_ABORTING)
+ auto it = import_state.find(df);
+ if (it == import_state.end())
+ return false;
+ if (it->second.state >= IMPORT_LOGGINGSTART &&
+ it->second.state < IMPORT_ABORTING)
+ return true;
+ return false;
+ }
+
+ int get_import_state(dirfrag_t df) const { // asserts that df is being imported
+ auto it = import_state.find(df);
+ ceph_assert(it != import_state.end());
+ return it->second.state;
+ }
+ int get_import_peer(dirfrag_t df) const { // asserts that df is being imported
+ auto it = import_state.find(df);
+ ceph_assert(it != import_state.end());
+ return it->second.peer;
+ }
+
+ int get_export_state(CDir *dir) const { // asserts that dir is being exported
+ auto it = export_state.find(dir);
+ ceph_assert(it != export_state.end());
+ return it->second.state;
+ }
+ // this returns true if we are exporting @dir
+ // and are no longer waiting for @who to
+ // be warned of ambiguous auth.
+ // must only be called during EXPORT_WARNING state (asserted below).
+ bool export_has_warned(CDir *dir, mds_rank_t who) const {
+ auto it = export_state.find(dir);
+ ceph_assert(it != export_state.end());
+ ceph_assert(it->second.state == EXPORT_WARNING);
+ return (it->second.warning_ack_waiting.count(who) == 0);
+ }
+
+ bool export_has_notified(CDir *dir, mds_rank_t who) const { // asserts EXPORT_NOTIFYING; true once @who is no longer in notify_ack_waiting
+ auto it = export_state.find(dir);
+ ceph_assert(it != export_state.end());
+ ceph_assert(it->second.state == EXPORT_NOTIFYING);
+ return (it->second.notify_ack_waiting.count(who) == 0);
+ }
+
+ void export_freeze_inc_num_waiters(CDir *dir) { // asserts that dir is being exported
+ auto it = export_state.find(dir);
+ ceph_assert(it != export_state.end());
+ it->second.num_remote_waiters++;
+ }
+ void find_stale_export_freeze();
+
+ // -- misc --
+ void handle_mds_failure_or_stop(mds_rank_t who);
+
+ void audit();
+
+ // -- import/export --
+ // exporter
+ void dispatch_export_dir(MDRequestRef& mdr, int count);
+ void export_dir(CDir *dir, mds_rank_t dest);
+ void export_empty_import(CDir *dir);
+
+ void export_dir_nicely(CDir *dir, mds_rank_t dest);
+ void maybe_do_queued_export();
+ void clear_export_queue() { // empties the queue and bumps the generation counter
+ export_queue.clear();
+ export_queue_gen++;
+ }
+
+ void maybe_split_export(CDir* dir, uint64_t max_size, bool null_okay,
+ std::vector<std::pair<CDir*, size_t> >& results);
+ void child_export_finish(std::shared_ptr<export_base_t>& parent, bool success);
+
+ void get_export_lock_set(CDir *dir, MutationImpl::LockOpVec& lov);
+ void get_export_client_set(CDir *dir, std::set<client_t> &client_set);
+ void get_export_client_set(CInode *in, std::set<client_t> &client_set);
+
+ void encode_export_inode(CInode *in, bufferlist& bl,
+ std::map<client_t,entity_inst_t>& exported_client_map,
+ std::map<client_t,client_metadata_t>& exported_client_metadata_map);
+ void encode_export_inode_caps(CInode *in, bool auth_cap, bufferlist& bl,
+ std::map<client_t,entity_inst_t>& exported_client_map,
+ std::map<client_t,client_metadata_t>& exported_client_metadata_map);
+ void finish_export_inode(CInode *in, mds_rank_t target,
+ std::map<client_t,Capability::Import>& peer_imported,
+ MDSContext::vec& finished);
+ void finish_export_inode_caps(CInode *in, mds_rank_t target,
+ std::map<client_t,Capability::Import>& peer_imported);
+
+
+ uint64_t encode_export_dir(bufferlist& exportbl,
+ CDir *dir,
+ std::map<client_t,entity_inst_t>& exported_client_map,
+ std::map<client_t,client_metadata_t>& exported_client_metadata_map);
+ void finish_export_dir(CDir *dir, mds_rank_t target,
+ std::map<inodeno_t,std::map<client_t,Capability::Import> >& peer_imported,
+ MDSContext::vec& finished, int *num_dentries);
+
+ void clear_export_proxy_pins(CDir *dir);
+
+ void export_caps(CInode *in);
+
+ void decode_import_inode(CDentry *dn, bufferlist::const_iterator& blp,
+ mds_rank_t oldauth, LogSegment *ls,
+ std::map<CInode*, std::map<client_t,Capability::Export> >& cap_imports,
+ std::list<ScatterLock*>& updated_scatterlocks);
+ void decode_import_inode_caps(CInode *in, bool auth_cap, bufferlist::const_iterator &blp,
+ std::map<CInode*, std::map<client_t,Capability::Export> >& cap_imports);
+ void finish_import_inode_caps(CInode *in, mds_rank_t from, bool auth_cap,
+ const std::map<client_t,std::pair<Session*,uint64_t> >& smap,
+ const std::map<client_t,Capability::Export> &export_map,
+ std::map<client_t,Capability::Import> &import_map);
+ int decode_import_dir(bufferlist::const_iterator& blp,
+ mds_rank_t oldauth,
+ CDir *import_root,
+ EImportStart *le,
+ LogSegment *ls,
+ std::map<CInode*, std::map<client_t,Capability::Export> >& cap_imports,
+ std::list<ScatterLock*>& updated_scatterlocks);
+
+ void import_reverse(CDir *dir);
+
+ void import_finish(CDir *dir, bool notify, bool last=true);
+
+private:
+ MDSRank *mds;
+ MDCache *cache;
+ uint64_t max_export_size = 0; // cached "mds_max_export_size" option
+ bool inject_session_race = false; // cached "mds_inject_migrator_session_race" option
+};
+
+#endif
diff --git a/src/mds/Mutation.cc b/src/mds/Mutation.cc
new file mode 100644
index 00000000..ee1978e5
--- /dev/null
+++ b/src/mds/Mutation.cc
@@ -0,0 +1,473 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#include "Mutation.h"
+#include "ScatterLock.h"
+#include "CDir.h"
+
+// MutationImpl
+
+void MutationImpl::pin(MDSCacheObject *o)
+{
+ if (pins.count(o) != 0) // already pinned by this mutation: take no second PIN_REQUEST reference
+ return;
+ o->get(MDSCacheObject::PIN_REQUEST);
+ pins.insert(o);
+}
+
+void MutationImpl::unpin(MDSCacheObject *o)
+{ // drops the PIN_REQUEST reference taken by pin(); o must currently be in 'pins'
+ ceph_assert(pins.count(o));
+ o->put(MDSCacheObject::PIN_REQUEST);
+ pins.erase(o);
+}
+
+void MutationImpl::set_stickydirs(CInode *in)
+{
+ if (stickydiri && stickydiri == in)
+ return; // already holding the sticky-dirs reference on this inode
+ in->get_stickydirs(); // take the new reference before releasing the old one
+ if (stickydiri)
+ stickydiri->put_stickydirs();
+ stickydiri = in;
+}
+
+void MutationImpl::put_stickydirs()
+{
+ // nothing held: nothing to release
+ if (!stickydiri)
+ return;
+ stickydiri->put_stickydirs();
+ stickydiri = nullptr;
+}
+
+void MutationImpl::drop_pins()
+{ // releases the PIN_REQUEST reference on every pinned object, then forgets them all
+ for (auto& o : pins)
+ o->put(MDSCacheObject::PIN_REQUEST);
+ pins.clear();
+}
+
+void MutationImpl::start_locking(SimpleLock *lock, int target)
+{ // records the lock (and target mds) being acquired; only one lock may be in progress
+ ceph_assert(locking == NULL);
+ pin(lock->get_parent()); // cache-pin the object that owns the lock
+ locking = lock;
+ locking_target_mds = target;
+}
+
+void MutationImpl::finish_locking(SimpleLock *lock)
+{ // clears the in-progress lock set by start_locking(); 'lock' must be that same lock
+ ceph_assert(locking == lock);
+ locking = NULL;
+ locking_target_mds = -1; // same "none" value the header uses as the member's default
+}
+
+void MutationImpl::LockOpVec::erase_rdlock(SimpleLock* lock)
+{
+ // scan from the back and remove only the first rdlock op found for 'lock'
+ for (auto rit = rbegin(); rit != rend(); ++rit) {
+ if (rit->lock != lock || !rit->is_rdlock())
+ continue;
+ erase(rit.base() - 1); // base() points one past the element rit denotes
+ return;
+ }
+}
+
+void MutationImpl::LockOpVec::sort_and_merge()
+{
+ // Sort so that ops on the same lock are adjacent, then collapse each run
+ // of same-lock ops into its first element. Indices are used instead of
+ // iterators so that an empty vector, or a run starting at index 0, never
+ // forms an iterator before begin() (undefined behaviour in the old loop).
+ std::sort(begin(), end(), SimpleLock::ptr_lt());
+ size_t hi = size(); // one past the last element not yet processed
+ while (hi > 1) {
+ const size_t last = hi - 1;
+ size_t first = last;
+ while (first > 0 && (*this)[first - 1].lock == (*this)[last].lock)
+ --first;
+ hi = first; // the next pass starts just below this run
+ if (first == last)
+ continue; // run of one op: nothing to merge
+ auto& head = (*this)[first];
+ for (size_t k = last; k > first; --k) {
+ auto& op = (*this)[k];
+ if (op.is_remote_wrlock()) {
+ ceph_assert(!head.is_remote_wrlock()); // at most one remote wrlock per lock
+ head.wrlock_target = op.wrlock_target;
+ }
+ head.flags |= op.flags;
+ }
+ if (head.is_xlock()) {
+ // xlock overwrites other types
+ ceph_assert(!head.is_remote_wrlock());
+ head.flags = MutationImpl::LockOp::XLOCK;
+ }
+ erase(begin() + first + 1, begin() + last + 1); // drop the ops merged into head
+ }
+}
+
+// auth pins
+bool MutationImpl::is_auth_pinned(MDSCacheObject *object) const
+{ // true if this mutation holds either a local or a remote auth pin on object
+ return auth_pins.count(object) || remote_auth_pins.count(object);
+}
+
+void MutationImpl::auth_pin(MDSCacheObject *object)
+{
+ if (is_auth_pinned(object))
+ return; // a local or remote auth pin is already held by this mutation
+ object->auth_pin(this);
+ auth_pins.insert(object);
+}
+
+void MutationImpl::auth_unpin(MDSCacheObject *object)
+{ // releases a local auth pin; object must be in auth_pins (remote pins are not handled here)
+ ceph_assert(auth_pins.count(object));
+ object->auth_unpin(this);
+ auth_pins.erase(object);
+}
+
+void MutationImpl::drop_local_auth_pins()
+{ // releases every local auth pin; remote_auth_pins is left untouched
+ for (const auto& p : auth_pins) {
+ ceph_assert(p->is_auth()); // each locally pinned object must still be auth here
+ p->auth_unpin(this);
+ }
+ auth_pins.clear();
+}
+
+void MutationImpl::add_projected_inode(CInode *in)
+{ // queues in for pop_and_dirty_projected_inodes(); duplicates are not filtered
+ projected_inodes.push_back(in);
+}
+
+void MutationImpl::pop_and_dirty_projected_inodes()
+{ // drains projected_inodes front-to-back, applying each against log segment 'ls'
+ while (!projected_inodes.empty()) {
+ CInode *in = projected_inodes.front();
+ projected_inodes.pop_front(); // removed from the list before the call below
+ in->pop_and_dirty_projected_inode(ls);
+ }
+}
+
+void MutationImpl::add_projected_fnode(CDir *dir)
+{ // queues dir for pop_and_dirty_projected_fnodes(); duplicates are not filtered
+ projected_fnodes.push_back(dir);
+}
+
+void MutationImpl::pop_and_dirty_projected_fnodes()
+{ // drains projected_fnodes front-to-back, applying each against log segment 'ls'
+ while (!projected_fnodes.empty()) {
+ CDir *dir = projected_fnodes.front();
+ projected_fnodes.pop_front(); // removed from the list before the call below
+ dir->pop_and_dirty_projected_fnode(ls);
+ }
+}
+
+void MutationImpl::add_updated_lock(ScatterLock *lock)
+{ // remembers lock so that apply() calls mark_dirty() on it
+ updated_locks.push_back(lock);
+}
+
+void MutationImpl::add_cow_inode(CInode *in)
+{ // cache-pins in and remembers it so that apply() marks it dirty
+ pin(in);
+ dirty_cow_inodes.push_back(in);
+}
+
+void MutationImpl::add_cow_dentry(CDentry *dn)
+{ // cache-pins dn and remembers it with its projected version at the time of this call
+ pin(dn);
+ dirty_cow_dentries.push_back(pair<CDentry*,version_t>(dn, dn->get_projected_version())); // apply() passes this version to mark_dirty()
+}
+
+void MutationImpl::apply()
+{
+ // Apply everything this mutation has queued, in a fixed order:
+ // projected inodes, projected fnodes, cow inodes, cow dentries, scatterlocks.
+ pop_and_dirty_projected_inodes();
+ pop_and_dirty_projected_fnodes();
+
+ // copy-on-write inodes queued through add_cow_inode()
+ for (auto& cow_in : dirty_cow_inodes)
+ cow_in->_mark_dirty(ls);
+
+ // copy-on-write dentries, each with the version saved by add_cow_dentry()
+ for (auto& cow_dn : dirty_cow_dentries)
+ cow_dn.first->mark_dirty(cow_dn.second, ls);
+
+ // scatterlocks queued through add_updated_lock()
+ for (auto& slock : updated_locks)
+ slock->mark_dirty();
+}
+
+void MutationImpl::cleanup()
+{ // releases local auth pins first, then cache pins; the destructor asserts both sets are empty
+ drop_local_auth_pins();
+ drop_pins();
+}
+
+void MutationImpl::_dump_op_descriptor_unlocked(ostream& stream) const
+{ // TrackedOp override: a plain mutation is described by a fixed label only
+ stream << "Mutation";
+}
+
+// MDRequestImpl
+
+MDRequestImpl::~MDRequestImpl()
+{ // frees the lazily allocated More block; deleting a null _more is a no-op
+ delete _more;
+}
+
+MDRequestImpl::More* MDRequestImpl::more()
+{ // returns the More block, allocating it on first use; never returns null
+ if (!_more)
+ _more = new More(); // owned by this request and deleted in the destructor
+ return _more;
+}
+
+bool MDRequestImpl::has_more() const
+{ // true once more() has allocated the More block; does not allocate itself
+ return _more != nullptr;
+}
+
+bool MDRequestImpl::has_witnesses()
+{ // true if a More block exists and its 'witnessed' set is non-empty; does not allocate
+ return (_more != nullptr) && (!_more->witnessed.empty());
+}
+
+bool MDRequestImpl::slave_did_prepare()
+{ // true if a More block exists and its slave_commit member is set; has_more() guards against allocating
+ return has_more() && more()->slave_commit;
+}
+
+bool MDRequestImpl::slave_rolling_back()
+{ // true if a More block exists and its slave_rolling_back flag is set; has_more() guards against allocating
+ return has_more() && more()->slave_rolling_back;
+}
+
+bool MDRequestImpl::did_ino_allocation() const
+{ // true if any of alloc_ino, used_prealloc_ino or prealloc_inos is non-zero / non-empty
+ return alloc_ino || used_prealloc_ino || prealloc_inos.size();
+}
+
+bool MDRequestImpl::freeze_auth_pin(CInode *inode)
+{ // auth-pins inode and tries to freeze its auth pin; returns false if freeze_inode(1) did not succeed yet
+ ceph_assert(!more()->rename_inode || more()->rename_inode == inode); // only one rename inode per request
+ more()->rename_inode = inode;
+ more()->is_freeze_authpin = true; // set even on the false return path; unfreeze_auth_pin() handles both cases
+ auth_pin(inode);
+ if (!inode->freeze_inode(1)) {
+ return false;
+ }
+ inode->freeze_auth_pin();
+ inode->unfreeze_inode(); // the inode freeze is dropped once the auth-pin freeze is in place
+ return true;
+}
+
+void MDRequestImpl::unfreeze_auth_pin(bool clear_inode)
+{ // undoes freeze_auth_pin(); requires is_freeze_authpin to be set
+ ceph_assert(more()->is_freeze_authpin);
+ CInode *inode = more()->rename_inode;
+ if (inode->is_frozen_auth_pin())
+ inode->unfreeze_auth_pin();
+ else
+ inode->unfreeze_inode(); // freeze_auth_pin() returned false before reaching freeze_auth_pin() on the inode
+ more()->is_freeze_authpin = false;
+ if (clear_inode)
+ more()->rename_inode = NULL; // optionally forget the rename inode as well
+}
+
+void MDRequestImpl::set_remote_frozen_auth_pin(CInode *inode)
+{ // bookkeeping only: records inode as the rename inode and sets is_remote_frozen_authpin
+ more()->rename_inode = inode;
+ more()->is_remote_frozen_authpin = true;
+}
+
+void MDRequestImpl::set_ambiguous_auth(CInode *inode)
+{ // marks inode as ambiguous-auth and records that this request did so
+ ceph_assert(!more()->rename_inode || more()->rename_inode == inode); // only one rename inode per request
+ ceph_assert(!more()->is_ambiguous_auth); // must not already be set by this request
+
+ inode->set_ambiguous_auth();
+ more()->rename_inode = inode;
+ more()->is_ambiguous_auth = true;
+}
+
+void MDRequestImpl::clear_ambiguous_auth()
+{ // undoes set_ambiguous_auth(); rename_inode itself is left set
+ CInode *inode = more()->rename_inode;
+ ceph_assert(inode && more()->is_ambiguous_auth);
+ inode->clear_ambiguous_auth();
+ more()->is_ambiguous_auth = false;
+}
+
+bool MDRequestImpl::can_auth_pin(MDSCacheObject *object)
+{ // true if object accepts auth pins, or if object is the rename inode this request has already pinned and frozen
+ return object->can_auth_pin() ||
+ (is_auth_pinned(object) && has_more() &&
+ more()->is_freeze_authpin &&
+ more()->rename_inode == object);
+}
+
+void MDRequestImpl::drop_local_auth_pins()
+{ // unfreezes the rename inode first (if this request froze it), then drops the pins via the base class
+ if (has_more() && more()->is_freeze_authpin)
+ unfreeze_auth_pin(true); // true: also clears rename_inode
+ MutationImpl::drop_local_auth_pins();
+}
+
+const filepath& MDRequestImpl::get_filepath()
+{ // first path: taken from the client request when there is one, otherwise from More::filepath1
+ if (client_request)
+ return client_request->get_filepath();
+ return more()->filepath1; // note: more() allocates the More block if it does not exist yet
+}
+
+const filepath& MDRequestImpl::get_filepath2()
+{ // second path: taken from the client request when there is one, otherwise from More::filepath2
+ if (client_request)
+ return client_request->get_filepath2();
+ return more()->filepath2; // note: more() allocates the More block if it does not exist yet
+}
+
+void MDRequestImpl::set_filepath(const filepath& fp)
+{ // stores the first path in More; not allowed when a client request is attached
+ ceph_assert(!client_request);
+ more()->filepath1 = fp;
+}
+
+void MDRequestImpl::set_filepath2(const filepath& fp)
+{ // stores the second path in More; not allowed when a client request is attached
+ ceph_assert(!client_request);
+ more()->filepath2 = fp;
+}
+
+bool MDRequestImpl::is_queued_for_replay() const
+{ // forwards to the client request; false when there is no client request
+ return client_request ? client_request->is_queued_for_replay() : false;
+}
+
+MClientRequest::const_ref MDRequestImpl::release_client_request()
+{ // returns a reference to the client request, taken under msg_lock
+ msg_lock.lock();
+ MClientRequest::const_ref req;
+ req.swap(client_request); // req now holds the request, client_request is empty
+ client_request = req; // NOTE(review): assigns it straight back, so the member is NOT cleared despite the name -- confirm intended
+ msg_lock.unlock();
+ return req;
+}
+
+void MDRequestImpl::reset_slave_request(const MMDSSlaveRequest::const_ref& req)
+{ // replaces slave_request with req under msg_lock
+ msg_lock.lock();
+ MMDSSlaveRequest::const_ref old;
+ old.swap(slave_request); // keep the previous message referenced until the lock is dropped
+ slave_request = req;
+ msg_lock.unlock();
+ old.reset(); // the previous reference is released outside msg_lock
+}
+
+void MDRequestImpl::print(ostream &out) const
+{ // one-line summary: "request(<reqid>[ slave_to mds.N][ cr=...][ sr=...])"
+ out << "request(" << reqid;
+ //if (request) out << " " << *request;
+ if (is_slave()) out << " slave_to mds." << slave_to_mds;
+ if (client_request) out << " cr=" << client_request; // note: read here without taking msg_lock
+ if (slave_request) out << " sr=" << slave_request; // note: read here without taking msg_lock
+ out << ")";
+}
+
+void MDRequestImpl::dump(Formatter *f) const
+{ // public entry point; all of the work is done in _dump()
+ _dump(f);
+}
+
+void MDRequestImpl::_dump(Formatter *f) const
+{ // writes state, reqid, a description of the originating op, and the event list to f
+ f->dump_string("flag_point", state_string());
+ f->dump_stream("reqid") << reqid;
+ {
+ msg_lock.lock(); // copy both message references under msg_lock, then work on the copies
+ auto _client_request = client_request;
+ auto _slave_request =slave_request;
+ msg_lock.unlock();
+
+ if (_client_request) { // a client request takes precedence over the other op kinds
+ f->dump_string("op_type", "client_request");
+ f->open_object_section("client_info");
+ f->dump_stream("client") << _client_request->get_orig_source();
+ f->dump_int("tid", _client_request->get_tid());
+ f->close_section(); // client_info
+ } else if (is_slave() && _slave_request) { // replies go to an existing mdr
+ f->dump_string("op_type", "slave_request");
+ f->open_object_section("master_info");
+ f->dump_stream("master") << _slave_request->get_orig_source();
+ f->close_section(); // master_info
+
+ f->open_object_section("request_info");
+ f->dump_int("attempt", _slave_request->get_attempt());
+ f->dump_string("op_type",
+ MMDSSlaveRequest::get_opname(_slave_request->get_op())); // this "op_type" key sits inside request_info
+ f->dump_int("lock_type", _slave_request->get_lock_type());
+ f->dump_stream("object_info") << _slave_request->get_object_info();
+ f->dump_stream("srcdnpath") << _slave_request->srcdnpath;
+ f->dump_stream("destdnpath") << _slave_request->destdnpath;
+ f->dump_stream("witnesses") << _slave_request->witnesses;
+ f->dump_bool("has_inode_export",
+ _slave_request->inode_export_v != 0);
+ f->dump_int("inode_export_v", _slave_request->inode_export_v);
+ f->dump_stream("op_stamp") << _slave_request->op_stamp;
+ f->close_section(); // request_info
+ }
+ else if (internal_op != -1) { // internal request
+ f->dump_string("op_type", "internal_op");
+ f->dump_int("internal_op", internal_op);
+ f->dump_string("op_name", ceph_mds_op_name(internal_op));
+ }
+ else { // none of the above: emit a placeholder op_type
+ f->dump_string("op_type", "no_available_op_found");
+ }
+ }
+ {
+ f->open_array_section("events");
+ std::lock_guard l(lock); // 'lock' guards the events list while it is iterated
+ for (auto& i : events) {
+ f->dump_object("event", i);
+ }
+ f->close_section(); // events
+ }
+}
+
+void MDRequestImpl::_dump_op_descriptor_unlocked(ostream& stream) const
+{ // describes the op by its client request, else its slave request, else its internal op
+ msg_lock.lock(); // copy both message references under msg_lock, then print from the copies
+ auto _client_request = client_request;
+ auto _slave_request = slave_request;
+ msg_lock.unlock();
+
+ if (_client_request) {
+ _client_request->print(stream);
+ } else if (_slave_request) {
+ _slave_request->print(stream);
+ } else if (internal_op >= 0) { // NOTE(review): _dump() tests 'internal_op != -1' instead -- confirm the two are meant to agree
+ stream << "internal op " << ceph_mds_op_name(internal_op) << ":" << reqid;
+ } else {
+ // drat, it's triggered by a slave request, but we don't have a message
+ // FIXME
+ stream << "rejoin:" << reqid;
+ }
+}
diff --git a/src/mds/Mutation.h b/src/mds/Mutation.h
new file mode 100644
index 00000000..3177b1d4
--- /dev/null
+++ b/src/mds/Mutation.h
@@ -0,0 +1,432 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#ifndef CEPH_MDS_MUTATION_H
+#define CEPH_MDS_MUTATION_H
+
+#include "include/interval_set.h"
+#include "include/elist.h"
+#include "include/filepath.h"
+
+#include "MDSCacheObject.h"
+#include "MDSContext.h"
+
+#include "SimpleLock.h"
+#include "Capability.h"
+
+#include "common/TrackedOp.h"
+#include "messages/MClientRequest.h"
+#include "messages/MMDSSlaveRequest.h"
+
+class LogSegment;
+class Capability;
+class CInode;
+class CDir;
+class CDentry;
+class Session;
+class ScatterLock;
+struct sr_t;
+
+struct MutationImpl : public TrackedOp { // tracks the pins, auth pins, locks and projected changes held by one in-flight mutation
+ metareqid_t reqid;
+ __u32 attempt = 0; // which attempt for this request
+ LogSegment *ls = nullptr; // the log segment i'm committing to
+
+private:
+ utime_t mds_stamp; ///< mds-local timestamp (real time)
+ utime_t op_stamp; ///< op timestamp (client provided)
+
+public:
+ // flag mutation as slave
+ mds_rank_t slave_to_mds = MDS_RANK_NONE; // this is a slave request if != MDS_RANK_NONE (see is_slave()).
+
+ // -- my pins and locks --
+ // cache pins (so things don't expire)
+ set< MDSCacheObject* > pins;
+ CInode* stickydiri = nullptr; // inode whose sticky-dirs reference we hold, if any (see set_stickydirs())
+
+ // auth pins
+ map<MDSCacheObject*, mds_rank_t> remote_auth_pins;
+ set<MDSCacheObject*> auth_pins;
+
+ // held locks
+ struct LockOp { // one lock plus a bitmask of the lock types wanted/held on it
+ enum {
+ RDLOCK = 1,
+ WRLOCK = 2,
+ XLOCK = 4,
+ REMOTE_WRLOCK = 8,
+ };
+ SimpleLock* lock;
+ mutable unsigned flags; // mutable so that elements of the 'locks' set can be updated in place
+ mutable mds_rank_t wrlock_target; // only meaningful together with REMOTE_WRLOCK
+ operator SimpleLock*() const { // lets a LockOp be used wherever a SimpleLock* is expected
+ return lock;
+ }
+ LockOp(SimpleLock *l, unsigned f=0, mds_rank_t t=MDS_RANK_NONE) :
+ lock(l), flags(f), wrlock_target(t) {}
+ bool is_rdlock() const { return !!(flags & RDLOCK); }
+ bool is_xlock() const { return !!(flags & XLOCK); }
+ bool is_wrlock() const { return !!(flags & WRLOCK); }
+ void clear_wrlock() const { flags &= ~WRLOCK; }
+ bool is_remote_wrlock() const { return !!(flags & REMOTE_WRLOCK); }
+ void clear_remote_wrlock() const { // clears the flag and resets the target rank
+ flags &= ~REMOTE_WRLOCK;
+ wrlock_target = MDS_RANK_NONE;
+ }
+ };
+
+ struct LockOpVec : public vector<LockOp> { // list of lock ops being assembled for a request
+ void add_rdlock(SimpleLock *lock) {
+ emplace_back(lock, LockOp::RDLOCK);
+ }
+ void erase_rdlock(SimpleLock *lock); // removes the last-added rdlock op for lock, if any
+ void add_xlock(SimpleLock *lock) {
+ emplace_back(lock, LockOp::XLOCK);
+ }
+ void add_wrlock(SimpleLock *lock) {
+ emplace_back(lock, LockOp::WRLOCK);
+ }
+ void add_remote_wrlock(SimpleLock *lock, mds_rank_t rank) {
+ ceph_assert(rank != MDS_RANK_NONE);
+ emplace_back(lock, LockOp::REMOTE_WRLOCK, rank);
+ }
+ void sort_and_merge(); // sorts by SimpleLock::ptr_lt and merges ops on the same lock
+
+ LockOpVec() {
+ reserve(32); // pre-size to avoid early reallocations
+ }
+ };
+ typedef set<LockOp, SimpleLock::ptr_lt> lock_set;
+ typedef lock_set::iterator lock_iterator;
+ lock_set locks; // full ordering
+
+ bool is_rdlocked(SimpleLock *lock) const {
+ auto it = locks.find(lock); // SimpleLock* converts to a LockOp with flags == 0 for the lookup
+ return it != locks.end() && it->is_rdlock();
+ }
+ bool is_xlocked(SimpleLock *lock) const {
+ auto it = locks.find(lock);
+ return it != locks.end() && it->is_xlock();
+ }
+ bool is_wrlocked(SimpleLock *lock) const {
+ auto it = locks.find(lock);
+ return it != locks.end() && it->is_wrlock();
+ }
+ bool is_remote_wrlocked(SimpleLock *lock) const {
+ auto it = locks.find(lock);
+ return it != locks.end() && it->is_remote_wrlock();
+ }
+
+ // lock we are currently trying to acquire. if we give up for some reason,
+ // be sure to eval() this.
+ SimpleLock *locking = nullptr;
+ mds_rank_t locking_target_mds = -1; // set by start_locking(), reset to -1 by finish_locking()
+
+ // if this flag is set, do not attempt to acquire further locks.
+ // (useful for wrlock, which may be a moving auth target)
+ bool done_locking = false;
+ bool committing = false;
+ bool aborted = false;
+ bool killed = false;
+
+ // for applying projected inode changes
+ list<CInode*> projected_inodes;
+ list<CDir*> projected_fnodes;
+ list<ScatterLock*> updated_locks;
+
+ list<CInode*> dirty_cow_inodes;
+ list<pair<CDentry*,version_t> > dirty_cow_dentries; // dentry plus its projected version captured in add_cow_dentry()
+
+ // keep our default values synced with MDRequestParam's
+ MutationImpl() : TrackedOp(nullptr, utime_t()) {}
+ MutationImpl(OpTracker *tracker, utime_t initiated,
+ const metareqid_t &ri, __u32 att=0, mds_rank_t slave_to=MDS_RANK_NONE)
+ : TrackedOp(tracker, initiated),
+ reqid(ri), attempt(att),
+ slave_to_mds(slave_to) { }
+ ~MutationImpl() override { // everything must have been released (e.g. via cleanup()) before destruction
+ ceph_assert(locking == NULL);
+ ceph_assert(pins.empty());
+ ceph_assert(auth_pins.empty());
+ }
+
+ bool is_master() const { return slave_to_mds == MDS_RANK_NONE; }
+ bool is_slave() const { return slave_to_mds != MDS_RANK_NONE; }
+
+ client_t get_client() const { // client id from reqid, or -1 when the request did not come from a client
+ if (reqid.name.is_client())
+ return client_t(reqid.name.num());
+ return -1;
+ }
+
+ void set_mds_stamp(utime_t t) {
+ mds_stamp = t;
+ }
+ utime_t get_mds_stamp() const {
+ return mds_stamp;
+ }
+ void set_op_stamp(utime_t t) {
+ op_stamp = t;
+ }
+ utime_t get_op_stamp() const { // falls back to the mds stamp while op_stamp is still default-constructed
+ if (op_stamp != utime_t())
+ return op_stamp;
+ return get_mds_stamp();
+ }
+
+ // pin items in cache
+ void pin(MDSCacheObject *o);
+ void unpin(MDSCacheObject *o);
+ void set_stickydirs(CInode *in);
+ void put_stickydirs();
+ void drop_pins();
+
+ void start_locking(SimpleLock *lock, int target=-1);
+ void finish_locking(SimpleLock *lock);
+
+ // auth pins
+ bool is_auth_pinned(MDSCacheObject *object) const;
+ void auth_pin(MDSCacheObject *object);
+ void auth_unpin(MDSCacheObject *object);
+ void drop_local_auth_pins();
+ void add_projected_inode(CInode *in);
+ void pop_and_dirty_projected_inodes();
+ void add_projected_fnode(CDir *dir);
+ void pop_and_dirty_projected_fnodes();
+ void add_updated_lock(ScatterLock *lock);
+ void add_cow_inode(CInode *in);
+ void add_cow_dentry(CDentry *dn);
+ void apply(); // applies the projected inodes/fnodes and marks cow items and updated locks dirty
+ void cleanup(); // drops local auth pins, then cache pins
+
+ virtual void print(ostream &out) const {
+ out << "mutation(" << this << ")"; // prints the object's address, not the reqid
+ }
+
+ virtual void dump(Formatter *f) const {} // intentionally empty here; MDRequestImpl provides a real dump()
+ void _dump_op_descriptor_unlocked(ostream& stream) const override;
+};
+
+inline ostream& operator<<(ostream &out, const MutationImpl &mut)
+{ // streams a mutation through its virtual print(), so derived types print their own form
+ mut.print(out);
+ return out;
+}
+
+typedef boost::intrusive_ptr<MutationImpl> MutationRef;
+
+
+
+/**
+ * MDRequestImpl: state we track for requests we are currently processing.
+ * mostly information about locks held, so that we can drop them all when
+ * the request is finished or forwarded. see request_*().
+ */
+struct MDRequestImpl : public MutationImpl { // per-request state layered on top of MutationImpl
+ Session *session;
+ elist<MDRequestImpl*>::item item_session_request; // if not on list, op is aborted.
+
+ // -- i am a client (master) request
+ MClientRequest::const_ref client_request; // client request (if any)
+
+ // store up to two sets of dn vectors, inode pointers, for request path1 and path2.
+ vector<CDentry*> dn[2];
+ CDentry *straydn;
+ CInode *in[2];
+ snapid_t snapid;
+
+ CInode *tracei;
+ CDentry *tracedn;
+
+ inodeno_t alloc_ino, used_prealloc_ino;
+ interval_set<inodeno_t> prealloc_inos;
+
+ int snap_caps = 0;
+ int getattr_caps = 0; ///< caps requested by getattr
+ bool no_early_reply = false;
+ bool did_early_reply = false;
+ bool o_trunc = false; ///< request is an O_TRUNC mutation
+ bool has_completed = false; ///< request has already completed
+
+ bufferlist reply_extra_bl;
+
+ // inos we did an embedded cap release on, and may need to eval if we haven't since reissued
+ map<vinodeno_t, ceph_seq_t> cap_releases;
+
+ // -- i am a slave request
+ MMDSSlaveRequest::const_ref slave_request; // slave request (if one is pending; implies slave == true)
+
+ // -- i am an internal op
+ int internal_op;
+ Context *internal_op_finish;
+ void *internal_op_private;
+
+ // indicates how many retries of the request have been made
+ int retry;
+
+ // indicator for vxattr osdmap update
+ bool waited_for_osdmap;
+
+ // break rarely-used fields into a separately allocated structure
+ // to save memory for most ops
+ struct More {
+ int slave_error = 0;
+ set<mds_rank_t> slaves; // mds nodes that have slave requests to me (implies client_request)
+ set<mds_rank_t> waiting_on_slave; // peers i'm waiting for slavereq replies from.
+
+ // for rename/link/unlink
+ set<mds_rank_t> witnessed; // nodes who have journaled a RenamePrepare
+ map<MDSCacheObject*,version_t> pvmap;
+
+ bool has_journaled_slaves = false;
+ bool slave_update_journaled = false;
+ bool slave_rolling_back = false;
+
+ // for rename
+ set<mds_rank_t> extra_witnesses; // replica list from srcdn auth (rename)
+ mds_rank_t srcdn_auth_mds = MDS_RANK_NONE;
+ bufferlist inode_import;
+ version_t inode_import_v = 0;
+ CInode* rename_inode = nullptr;
+ bool is_freeze_authpin = false;
+ bool is_ambiguous_auth = false;
+ bool is_remote_frozen_authpin = false;
+ bool is_inode_exporter = false;
+
+ map<client_t, pair<Session*, uint64_t> > imported_session_map;
+ map<CInode*, map<client_t,Capability::Export> > cap_imports;
+
+ // for lock/flock
+ bool flock_was_waiting = false;
+
+ // for snaps
+ version_t stid = 0;
+ bufferlist snapidbl;
+
+ sr_t *srci_srnode = nullptr;
+ sr_t *desti_srnode = nullptr;
+
+ // called when slave commits or aborts
+ Context *slave_commit = nullptr;
+ bufferlist rollback_bl;
+
+ MDSContext::vec waiting_for_finish;
+
+ // export & fragment
+ CDir* export_dir = nullptr;
+ dirfrag_t fragment_base;
+
+ // for internal ops doing lookup
+ filepath filepath1;
+ filepath filepath2;
+
+ More() {}
+ } *_more; // initialized to NULL by the constructor below
+
+
+ // ---------------------------------------------------
+ struct Params { // construction arguments consumed by the MDRequestImpl constructor
+ metareqid_t reqid;
+ __u32 attempt;
+ MClientRequest::const_ref client_req;
+ Message::const_ref triggering_slave_req;
+ mds_rank_t slave_to;
+ utime_t initiated;
+ utime_t throttled, all_read, dispatched;
+ int internal_op;
+ // keep these default values synced to MutationImpl's
+ Params() : attempt(0), slave_to(MDS_RANK_NONE), internal_op(-1) {}
+ const utime_t& get_recv_stamp() const { // returns `initiated`
+ return initiated;
+ }
+ const utime_t& get_throttle_stamp() const { // returns `throttled`
+ return throttled;
+ }
+ const utime_t& get_recv_complete_stamp() const { // returns `all_read`
+ return all_read;
+ }
+ const utime_t& get_dispatch_stamp() const { // returns `dispatched`
+ return dispatched;
+ }
+ };
+ MDRequestImpl(const Params* params, OpTracker *tracker) : // params must be non-null: it is dereferenced unconditionally
+ MutationImpl(tracker, params->initiated,
+ params->reqid, params->attempt, params->slave_to),
+ session(NULL), item_session_request(this),
+ client_request(params->client_req), straydn(NULL), snapid(CEPH_NOSNAP),
+ tracei(NULL), tracedn(NULL), alloc_ino(0), used_prealloc_ino(0),
+ internal_op(params->internal_op), internal_op_finish(NULL),
+ internal_op_private(NULL),
+ retry(0),
+ waited_for_osdmap(false), _more(NULL) {
+ in[0] = in[1] = NULL; // arrays cannot be set in the initializer list here, so clear both slots in the body
+ }
+ ~MDRequestImpl() override;
+
+ More* more(); // accessor for _more; presumably allocates it on first use -- confirm in Mutation.cc
+ bool has_more() const;
+ bool has_witnesses();
+ bool slave_did_prepare();
+ bool slave_rolling_back();
+ bool did_ino_allocation() const;
+ bool freeze_auth_pin(CInode *inode);
+ void unfreeze_auth_pin(bool clear_inode=false);
+ void set_remote_frozen_auth_pin(CInode *inode);
+ bool can_auth_pin(MDSCacheObject *object);
+ void drop_local_auth_pins();
+ void set_ambiguous_auth(CInode *inode);
+ void clear_ambiguous_auth();
+ const filepath& get_filepath();
+ const filepath& get_filepath2();
+ void set_filepath(const filepath& fp);
+ void set_filepath2(const filepath& fp);
+ bool is_queued_for_replay() const;
+
+ void print(ostream &out) const override;
+ void dump(Formatter *f) const override;
+
+ MClientRequest::const_ref release_client_request();
+ void reset_slave_request(const MMDSSlaveRequest::const_ref& req=nullptr);
+
+ // TrackedOp stuff
+ typedef boost::intrusive_ptr<MDRequestImpl> Ref;
+protected:
+ void _dump(Formatter *f) const override;
+ void _dump_op_descriptor_unlocked(ostream& stream) const override;
+private:
+ mutable ceph::spinlock msg_lock; // NOTE(review): presumably guards client_request/slave_request swaps -- confirm in Mutation.cc
+};
+
+typedef boost::intrusive_ptr<MDRequestImpl> MDRequestRef; // intrusive smart-pointer handle to an MDRequestImpl
+
+
+struct MDSlaveUpdate { // bookkeeping for one slave-side update: original op code, rollback payload, optional waiter
+ int origop;
+ bufferlist rollback;
+ Context *waiter = nullptr; // completed with 0 from the destructor when set
+ set<CInode*> olddirs;
+ set<CInode*> unlinked;
+ MDSlaveUpdate(int oo, bufferlist &rbl) : // takes the contents of rbl via claim() rather than copying
+ origop(oo) {
+ rollback.claim(rbl);
+ }
+ ~MDSlaveUpdate() { // NOTE(review): struct is implicitly copyable; a copy with a waiter set would complete it twice
+ if (waiter)
+ waiter->complete(0);
+ }
+};
+
+
+#endif
diff --git a/src/mds/OpenFileTable.cc b/src/mds/OpenFileTable.cc
new file mode 100644
index 00000000..5e0d2ba5
--- /dev/null
+++ b/src/mds/OpenFileTable.cc
@@ -0,0 +1,1189 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2018 Red Hat
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#include "acconfig.h"
+#include "mds/CInode.h"
+#include "mds/CDir.h"
+#include "mds/MDSRank.h"
+#include "mds/MDCache.h"
+#include "osdc/Objecter.h"
+#include "OpenFileTable.h"
+
+#include "common/config.h"
+#include "common/errno.h"
+
+#define dout_context g_ceph_context
+#define dout_subsys ceph_subsys_mds
+#undef dout_prefix
+#define dout_prefix _prefix(_dout, mds)
+static ostream& _prefix(std::ostream *_dout, MDSRank *mds) { // log prefix used by dout_prefix: "mds.<nodeid>.openfiles "
+ return *_dout << "mds." << mds->get_nodeid() << ".openfiles ";
+}
+
+void OpenFileTable::get_ref(CInode *in) // take a reference on `in`; when it is newly anchored, also on each ancestor up the parent-dentry chain
+{
+ do {
+ auto p = anchor_map.find(in->ino());
+ if (p != anchor_map.end()) { // already anchored: bump nref and stop climbing
+ ceph_assert(in->state_test(CInode::STATE_TRACKEDBYOFT));
+ ceph_assert(p->second.nref > 0);
+ p->second.nref++;
+ break;
+ }
+
+ CDentry *dn = in->get_parent_dn();
+ CInode *pin = dn ? dn->get_dir()->get_inode() : nullptr; // parent directory inode, or nullptr when there is no parent dentry
+
+ auto ret = anchor_map.emplace(std::piecewise_construct, std::forward_as_tuple(in->ino()), // new anchor: ino, parent ino (0 if none), dentry name, d_type
+ std::forward_as_tuple(in->ino(), (pin ? pin->ino() : inodeno_t(0)),
+ (dn ? dn->get_name() : string()), in->d_type(), 1)); // trailing 1 is presumably the initial nref -- confirm against the Anchor constructor
+ ceph_assert(ret.second == true);
+ in->state_set(CInode::STATE_TRACKEDBYOFT);
+
+ auto ret1 = dirty_items.emplace(in->ino(), (int)DIRTY_NEW); // mark as new unless a dirty entry already exists
+ if (!ret1.second) { // existing dirty entry holds an omap index: reuse it for the new anchor
+ int omap_idx = ret1.first->second;
+ ceph_assert(omap_idx >= 0);
+ ret.first->second.omap_idx = omap_idx;
+ }
+
+ in = pin; // continue with the parent; nullptr ends the loop
+ } while (in);
+}
+
+void OpenFileTable::put_ref(CInode *in) // drop a reference on `in`; when the last one goes, erase its anchor and continue with the parent
+{
+ do {
+ ceph_assert(in->state_test(CInode::STATE_TRACKEDBYOFT));
+ auto p = anchor_map.find(in->ino());
+ ceph_assert(p != anchor_map.end());
+ ceph_assert(p->second.nref > 0);
+
+ if (p->second.nref > 1) { // other references remain: just decrement and stop
+ p->second.nref--;
+ break;
+ }
+
+ CDentry *dn = in->get_parent_dn();
+ CInode *pin = dn ? dn->get_dir()->get_inode() : nullptr;
+ if (dn) { // the stored anchor must match the current parent dentry...
+ ceph_assert(p->second.dirino == pin->ino());
+ ceph_assert(p->second.d_name == dn->get_name());
+ } else { // ...or record "no parent" when there is none
+ ceph_assert(p->second.dirino == inodeno_t(0));
+ ceph_assert(p->second.d_name == "");
+ }
+
+ int omap_idx = p->second.omap_idx; // save before erase; commit() reads it from dirty_items to remove the stored key
+ anchor_map.erase(p);
+ in->state_clear(CInode::STATE_TRACKEDBYOFT);
+
+ auto ret = dirty_items.emplace(in->ino(), omap_idx);
+ if (!ret.second) {
+ if (ret.first->second == DIRTY_NEW) { // added and dropped without an intervening commit: nothing to persist
+ ceph_assert(omap_idx < 0);
+ dirty_items.erase(ret.first);
+ } else { // overwrite the existing dirty state with the omap index
+ ceph_assert(omap_idx >= 0);
+ ret.first->second = omap_idx;
+ }
+ }
+
+ in = pin; // continue with the parent; nullptr ends the loop
+ } while (in);
+}
+
+void OpenFileTable::add_inode(CInode *in) // start tracking `in` by taking a reference on it
+{
+ dout(10) << __func__ << " " << *in << dendl;
+ if (!in->is_dir()) { // a non-directory inode must not already be anchored
+ auto p = anchor_map.find(in->ino());
+ ceph_assert(p == anchor_map.end());
+ }
+ get_ref(in);
+}
+
+void OpenFileTable::remove_inode(CInode *in) // stop tracking `in` by dropping a reference on it
+{
+ dout(10) << __func__ << " " << *in << dendl;
+ if (!in->is_dir()) { // a non-directory inode must be anchored with exactly one reference
+ auto p = anchor_map.find(in->ino());
+ ceph_assert(p != anchor_map.end());
+ ceph_assert(p->second.nref == 1);
+ }
+ put_ref(in);
+}
+
+void OpenFileTable::add_dirfrag(CDir *dir) // track `dir`: flag it, record its dirfrag, and reference its inode
+{
+ dout(10) << __func__ << " " << *dir << dendl;
+ ceph_assert(!dir->state_test(CDir::STATE_TRACKEDBYOFT));
+ dir->state_set(CDir::STATE_TRACKEDBYOFT);
+ auto ret = dirfrags.insert(dir->dirfrag());
+ ceph_assert(ret.second); // the dirfrag must not have been recorded before
+ get_ref(dir->get_inode());
+ dirty_items.emplace(dir->ino(), (int)DIRTY_UNDEF); // mark the inode dirty; emplace leaves any existing dirty state untouched
+}
+
+void OpenFileTable::remove_dirfrag(CDir *dir) // untrack `dir`: clear its flag, forget its dirfrag, and drop the inode reference
+{
+ dout(10) << __func__ << " " << *dir << dendl;
+ ceph_assert(dir->state_test(CDir::STATE_TRACKEDBYOFT));
+ dir->state_clear(CDir::STATE_TRACKEDBYOFT);
+ auto p = dirfrags.find(dir->dirfrag());
+ ceph_assert(p != dirfrags.end());
+ dirfrags.erase(p);
+ dirty_items.emplace(dir->ino(), (int)DIRTY_UNDEF); // mark dirty before put_ref, which may replace or erase this entry
+ put_ref(dir->get_inode());
+}
+
+void OpenFileTable::notify_link(CInode *in) // an anchored inode with no recorded parent got linked: record the parent and reference it
+{
+ dout(10) << __func__ << " " << *in << dendl;
+ auto p = anchor_map.find(in->ino());
+ ceph_assert(p != anchor_map.end());
+ ceph_assert(p->second.nref > 0);
+ ceph_assert(p->second.dirino == inodeno_t(0)); // anchor must currently record "no parent"
+ ceph_assert(p->second.d_name == "");
+
+ CDentry *dn = in->get_parent_dn(); // dereferenced unconditionally: caller must have linked `in` already
+ CInode *pin = dn->get_dir()->get_inode();
+
+ p->second.dirino = pin->ino();
+ p->second.d_name = dn->get_name();
+ dirty_items.emplace(in->ino(), (int)DIRTY_UNDEF);
+
+ get_ref(pin); // the new parent chain is now referenced by this anchor
+}
+
+void OpenFileTable::notify_unlink(CInode *in) // an anchored inode is being unlinked: clear the recorded parent and drop that reference
+{
+ dout(10) << __func__ << " " << *in << dendl;
+ auto p = anchor_map.find(in->ino());
+ ceph_assert(p != anchor_map.end());
+ ceph_assert(p->second.nref > 0);
+
+ CDentry *dn = in->get_parent_dn(); // dereferenced unconditionally: must be called while the parent dentry is still set
+ CInode *pin = dn->get_dir()->get_inode();
+ ceph_assert(p->second.dirino == pin->ino());
+ ceph_assert(p->second.d_name == dn->get_name());
+
+ p->second.dirino = inodeno_t(0);
+ p->second.d_name = "";
+ dirty_items.emplace(in->ino(), (int)DIRTY_UNDEF);
+
+ put_ref(pin); // release the reference this anchor held on the old parent chain
+}
+
+object_t OpenFileTable::get_object_name(unsigned idx) const // object name for index idx: "mds<nodeid>_openfiles.<idx in hex>"
+{
+ char s[30]; // snprintf truncates if the formatted name would not fit
+ snprintf(s, sizeof(s), "mds%d_openfiles.%x", int(mds->get_nodeid()), idx);
+ return object_t(s);
+}
+
+void OpenFileTable::_encode_header(bufferlist &bl, int j_state) // append the omap header to bl: magic string, then a versioned section
+{
+ std::string_view magic = CEPH_FS_ONDISK_MAGIC;
+ encode(magic, bl);
+ ENCODE_START(1, 1, bl);
+ encode(omap_version, bl);
+ encode(omap_num_objs, bl);
+ encode((__u8)j_state, bl); // journal state is stored as a single byte
+ ENCODE_FINISH(bl);
+}
+
+class C_IO_OFT_Save : public MDSIOContextBase { // I/O completion that forwards to OpenFileTable::_commit_finish
+protected:
+ OpenFileTable *oft;
+ uint64_t log_seq; // log sequence passed to commit(); handed back to _commit_finish
+ MDSContext *fin; // caller's completion, passed through to _commit_finish (may be null)
+ MDSRank *get_mds() override { return oft->mds; }
+public:
+ C_IO_OFT_Save(OpenFileTable *t, uint64_t s, MDSContext *c) :
+ oft(t), log_seq(s), fin(c) {}
+ void finish(int r) override { // marked override like the other contexts in this file so a signature drift fails to compile
+ oft->_commit_finish(r, log_seq, fin);
+ }
+ void print(ostream& out) const override {
+ out << "openfiles_save";
+ }
+};
+
+void OpenFileTable::_commit_finish(int r, uint64_t log_seq, MDSContext *fin) // completion of the final omap writes issued by commit()
+{
+ dout(10) << __func__ << " log_seq " << log_seq << dendl;
+ if (r < 0) { // write failed: hand off to the MDS error handler; `fin` is not completed on this path
+ mds->handle_write_error(r);
+ return;
+ }
+
+ ceph_assert(log_seq <= committing_log_seq);
+ ceph_assert(log_seq >= committed_log_seq);
+ committed_log_seq = log_seq;
+ num_pending_commit--; // balances the increment done at the start of commit()
+
+ if (fin)
+ fin->complete(r);
+}
+
+class C_IO_OFT_Journal : public MDSIOContextBase { // I/O completion that forwards to OpenFileTable::_journal_finish
+protected:
+ OpenFileTable *oft;
+ uint64_t log_seq;
+ MDSContext *fin;
+ std::map<unsigned, std::vector<ObjectOperation> > ops_map; // deferred ops keyed by omap object index
+ MDSRank *get_mds() override { return oft->mds; }
+public:
+ C_IO_OFT_Journal(OpenFileTable *t, uint64_t s, MDSContext *c,
+ std::map<unsigned, std::vector<ObjectOperation> >& ops) :
+ oft(t), log_seq(s), fin(c) {
+ ops_map.swap(ops); // steals the caller's map; `ops` is left empty
+ }
+ void finish(int r) override { // marked override like the other contexts in this file so a signature drift fails to compile
+ oft->_journal_finish(r, log_seq, fin, ops_map);
+ }
+ void print(ostream& out) const override {
+ out << "openfiles_journal";
+ }
+};
+
+void OpenFileTable::_journal_finish(int r, uint64_t log_seq, MDSContext *c, // journal writes are done: submit the deferred ops in ops_map
+ std::map<unsigned, std::vector<ObjectOperation> >& ops_map)
+{
+ dout(10) << __func__ << " log_seq " << log_seq << dendl;
+ if (r < 0) { // journal write failed: hand off to the MDS error handler; `c` is not completed on this path
+ mds->handle_write_error(r);
+ return;
+ }
+
+ C_GatherBuilder gather(g_ceph_context, // C_IO_OFT_Save runs on mds->finisher once every sub-op completes
+ new C_OnFinisher(new C_IO_OFT_Save(this, log_seq, c),
+ mds->finisher));
+ SnapContext snapc;
+ object_locator_t oloc(mds->mdsmap->get_metadata_pool());
+ for (auto& it : ops_map) { // one mutate per queued op, addressed to the object named by the map key
+ object_t oid = get_object_name(it.first);
+ for (auto& op : it.second) {
+ mds->objecter->mutate(oid, oloc, op, snapc, ceph::real_clock::now(),
+ 0, gather.new_sub());
+ }
+ }
+ gather.activate();
+
+ journal_state = JOURNAL_NONE; // reset in-memory journal state once the ops are issued
+ return;
+}
+
+void OpenFileTable::commit(MDSContext *c, uint64_t log_seq, int op_prio) // write dirty anchors to the omap objects; `c` is completed via C_IO_OFT_Save
+{
+ dout(10) << __func__ << " log_seq " << log_seq << dendl;
+
+ ceph_assert(num_pending_commit == 0); // only one commit may be in flight
+ num_pending_commit++;
+ ceph_assert(log_seq >= committing_log_seq);
+ committing_log_seq = log_seq;
+
+ omap_version++;
+
+ C_GatherBuilder gather(g_ceph_context);
+
+ SnapContext snapc;
+ object_locator_t oloc(mds->mdsmap->get_metadata_pool());
+
+ const unsigned max_write_size = mds->mdcache->max_dir_commit_size; // threshold at which pending changes are flushed
+
+ struct omap_update_ctl { // per-object accumulator of pending and already-journaled changes
+ unsigned write_size = 0; // size estimate of changes accumulated since the last flush
+ unsigned journal_idx = 0; // number of "_journal.<n>" keys written so far for this object
+ bool clear = false; // set for newly added objects: emit omap_clear before the first write
+ std::map<string, bufferlist> to_update, journaled_update;
+ std::set<string> to_remove, journaled_remove;
+ };
+ std::vector<omap_update_ctl> omap_updates(omap_num_objs);
+
+ using ceph::encode;
+ auto journal_func = [&](unsigned idx) { // flush idx's pending changes as one "_journal.<n>" key, sent immediately
+ auto& ctl = omap_updates.at(idx);
+
+ ObjectOperation op;
+ op.priority = op_prio;
+
+ if (ctl.clear) {
+ ctl.clear = false;
+ op.omap_clear();
+ op.set_last_op_flags(CEPH_OSD_OP_FLAG_FAILOK);
+ }
+
+ if (ctl.journal_idx == 0) { // first journal write for this object also stamps the header with JOURNAL_START
+ if (journal_state == JOURNAL_NONE)
+ journal_state = JOURNAL_START;
+ else
+ ceph_assert(journal_state == JOURNAL_START);
+
+ bufferlist header;
+ _encode_header(header, journal_state);
+ op.omap_set_header(header);
+ }
+
+ bufferlist bl;
+ encode(omap_version, bl);
+ encode(ctl.to_update, bl);
+ encode(ctl.to_remove, bl);
+
+ char key[32];
+ snprintf(key, sizeof(key), "_journal.%x", ctl.journal_idx++);
+ std::map<string, bufferlist> tmp_map;
+ tmp_map[key].swap(bl);
+ op.omap_set(tmp_map);
+
+ object_t oid = get_object_name(idx);
+ mds->objecter->mutate(oid, oloc, op, snapc, ceph::real_clock::now(), 0,
+ gather.new_sub());
+
+#ifdef HAVE_STDLIB_MAP_SPLICING
+ ctl.journaled_update.merge(ctl.to_update);
+ ctl.journaled_remove.merge(ctl.to_remove);
+#else
+ ctl.journaled_update.insert(make_move_iterator(begin(ctl.to_update)), // fallback: move entries into the journaled_* containers
+ make_move_iterator(end(ctl.to_update)));
+ ctl.journaled_remove.insert(make_move_iterator(begin(ctl.to_remove)),
+ make_move_iterator(end(ctl.to_remove)));
+#endif
+ ctl.to_update.clear();
+ ctl.to_remove.clear();
+ };
+
+ std::map<unsigned, std::vector<ObjectOperation> > ops_map; // final ops per object index, submitted later
+
+ auto create_op_func = [&](unsigned idx, bool update_header) { // queue one op in ops_map carrying idx's pending update/remove sets
+ auto& ctl = omap_updates.at(idx);
+
+ auto& op_vec = ops_map[idx];
+ op_vec.resize(op_vec.size() + 1);
+ ObjectOperation& op = op_vec.back();
+ op.priority = op_prio;
+
+ if (ctl.clear) {
+ ctl.clear = false;
+ op.omap_clear();
+ op.set_last_op_flags(CEPH_OSD_OP_FLAG_FAILOK);
+ }
+
+ if (update_header) {
+ bufferlist header;
+ _encode_header(header, journal_state);
+ op.omap_set_header(header);
+ }
+
+ if (!ctl.to_update.empty()) {
+ op.omap_set(ctl.to_update);
+ ctl.to_update.clear();
+ }
+ if (!ctl.to_remove.empty()) {
+ op.omap_rm_keys(ctl.to_remove);
+ ctl.to_remove.clear();
+ }
+ };
+
+ auto submit_ops_func = [&]() { // send every queued op; C_IO_OFT_Save fires once all of them complete
+ gather.set_finisher(new C_OnFinisher(new C_IO_OFT_Save(this, log_seq, c),
+ mds->finisher));
+ for (auto& it : ops_map) {
+ object_t oid = get_object_name(it.first);
+ for (auto& op : it.second) {
+ mds->objecter->mutate(oid, oloc, op, snapc, ceph::real_clock::now(),
+ 0, gather.new_sub());
+ }
+ }
+ gather.activate();
+ };
+
+ bool first_commit = !loaded_anchor_map.empty(); // loaded anchors still need reconciling with the in-memory table
+
+ unsigned first_free_idx = 0;
+ unsigned old_num_objs = omap_num_objs;
+ if (omap_num_objs == 0) { // no objects yet: start with a single one, cleared on first write
+ omap_num_objs = 1;
+ omap_num_items.resize(omap_num_objs);
+ omap_updates.resize(omap_num_objs);
+ omap_updates.back().clear = true;
+ }
+
+ for (auto& it : dirty_items) { // turn each dirty ino into an omap set (still anchored) or an omap remove (anchor gone)
+ frag_vec_t frags;
+ auto p = anchor_map.find(it.first);
+ if (p != anchor_map.end()) { // collect the tracked dirfrags belonging to this inode
+ for (auto q = dirfrags.lower_bound(dirfrag_t(it.first, 0));
+ q != dirfrags.end() && q->ino == it.first;
+ ++q)
+ frags.push_back(q->frag);
+ }
+
+ if (first_commit) { // skip the write when the entry equals what was loaded from disk
+ auto q = loaded_anchor_map.find(it.first);
+ if (q != loaded_anchor_map.end()) {
+ ceph_assert(p != anchor_map.end());
+ p->second.omap_idx = q->second.omap_idx;
+ bool same = p->second == q->second;
+ if (same) { // anchors match: also require the dirfrag lists to match exactly
+ auto r = loaded_dirfrags.lower_bound(dirfrag_t(it.first, 0));
+ for (const auto& fg : frags) {
+ if (r == loaded_dirfrags.end() || !(*r == dirfrag_t(it.first, fg))) {
+ same = false;
+ break;
+ }
+ ++r;
+ }
+ if (same && r != loaded_dirfrags.end() && r->ino == it.first) // loaded set has extra frags for this ino
+ same = false;
+ }
+ loaded_anchor_map.erase(q);
+ if (same)
+ continue;
+ }
+ }
+
+ char key[32];
+ int len = snprintf(key, sizeof(key), "%llx", (unsigned long long)it.first.val); // omap key is the ino in hex
+
+ int omap_idx;
+ if (p != anchor_map.end()) {
+ omap_idx = p->second.omap_idx;
+ if (omap_idx < 0) { // anchor has no object assigned yet
+ ceph_assert(it.second == DIRTY_NEW);
+ // find omap object to store the key
+ for (unsigned i = first_free_idx; i < omap_num_objs; i++) {
+ if (omap_num_items[i] < MAX_ITEMS_PER_OBJ) {
+ omap_idx = i;
+ break;
+ }
+ }
+ if (omap_idx < 0) { // every existing object is full: append a new one
+ ++omap_num_objs;
+ ceph_assert(omap_num_objs <= MAX_OBJECTS);
+ omap_num_items.resize(omap_num_objs);
+ omap_updates.resize(omap_num_objs);
+ omap_updates.back().clear = true;
+ omap_idx = omap_num_objs - 1;
+ }
+ first_free_idx = omap_idx;
+
+ p->second.omap_idx = omap_idx;
+ ++omap_num_items[omap_idx];
+ }
+ } else { // anchor is gone: it.second is the omap index recorded by put_ref
+ omap_idx = it.second;
+ unsigned& count = omap_num_items.at(omap_idx);
+ ceph_assert(count > 0);
+ --count;
+ if ((unsigned)omap_idx < first_free_idx && count < MAX_ITEMS_PER_OBJ)
+ first_free_idx = omap_idx;
+ }
+ auto& ctl = omap_updates.at(omap_idx);
+
+ if (p != anchor_map.end()) { // value is the encoded anchor followed by its dirfrag list
+ bufferlist bl;
+ encode(p->second, bl);
+ encode(frags, bl);
+
+ ctl.write_size += bl.length() + len + 2 * sizeof(__u32);
+ ctl.to_update[key].swap(bl);
+ } else {
+ ctl.write_size += len + sizeof(__u32);
+ ctl.to_remove.emplace(key);
+ }
+
+ if (ctl.write_size >= max_write_size) { // reached the size threshold: spill pending changes to the journal
+ journal_func(omap_idx);
+ ctl.write_size = 0;
+ }
+ }
+
+ dirty_items.clear();
+
+ if (first_commit) { // loaded anchors that no dirty item matched: remove their keys
+ for (auto& it : loaded_anchor_map) {
+ char key[32];
+ int len = snprintf(key, sizeof(key), "%llx", (unsigned long long)it.first.val);
+
+ int omap_idx = it.second.omap_idx;
+ unsigned& count = omap_num_items.at(omap_idx);
+ ceph_assert(count > 0);
+ --count;
+
+ auto& ctl = omap_updates.at(omap_idx);
+ ctl.write_size += len + sizeof(__u32);
+ ctl.to_remove.emplace(key);
+
+ if (ctl.write_size >= max_write_size) {
+ journal_func(omap_idx);
+ ctl.write_size = 0;
+ }
+ }
+ loaded_anchor_map.clear();
+ loaded_dirfrags.clear();
+ }
+
+ { // tally items, trim trailing empty objects, and take the no-journal fast path when possible
+ size_t total_items = 0;
+ unsigned used_objs = 1;
+ std::list<unsigned> objs_to_write;
+ bool journaled = false;
+ for (unsigned i = 0; i < omap_num_objs; i++) {
+ total_items += omap_num_items[i];
+ if (omap_updates[i].journal_idx)
+ journaled = true;
+ else if (omap_updates[i].write_size)
+ objs_to_write.push_back(i);
+
+ if (omap_num_items[i] > 0)
+ used_objs = i + 1;
+ }
+ ceph_assert(total_items == anchor_map.size());
+ // adjust omap object count
+ if (used_objs < omap_num_objs) {
+ omap_num_objs = used_objs;
+ omap_num_items.resize(omap_num_objs);
+ }
+ // skip journal if only one osd request is required and object count
+ // does not change.
+ if (!journaled && old_num_objs == omap_num_objs &&
+ objs_to_write.size() <= 1) {
+ ceph_assert(journal_state == JOURNAL_NONE);
+ ceph_assert(!gather.has_subs());
+
+ unsigned omap_idx = objs_to_write.empty() ? 0 : objs_to_write.front();
+ create_op_func(omap_idx, true);
+ submit_ops_func();
+ return;
+ }
+ }
+
+ for (unsigned omap_idx = 0; omap_idx < omap_updates.size(); omap_idx++) { // journal whatever is still pending on each object
+ auto& ctl = omap_updates[omap_idx];
+ if (ctl.write_size > 0) {
+ journal_func(omap_idx);
+ ctl.write_size = 0;
+ }
+ }
+
+ if (journal_state == JOURNAL_START) { // journal writes were issued: headers written from here on carry JOURNAL_FINISH
+ ceph_assert(gather.has_subs());
+ journal_state = JOURNAL_FINISH;
+ } else {
+ // only object count changes
+ ceph_assert(journal_state == JOURNAL_NONE);
+ ceph_assert(!gather.has_subs());
+ }
+
+ for (unsigned omap_idx = 0; omap_idx < omap_updates.size(); omap_idx++) { // build the final ops from the journaled changes, in chunks bounded by max_write_size
+ auto& ctl = omap_updates[omap_idx];
+ ceph_assert(ctl.to_update.empty() && ctl.to_remove.empty());
+ if (ctl.journal_idx == 0)
+ ceph_assert(ctl.journaled_update.empty() && ctl.journaled_remove.empty());
+
+ bool first = true; // only the first op queued for an object rewrites its header
+ for (auto& it : ctl.journaled_update) {
+ ctl.write_size += it.first.length() + it.second.length() + 2 * sizeof(__u32);
+ ctl.to_update[it.first].swap(it.second);
+ if (ctl.write_size >= max_write_size) {
+ create_op_func(omap_idx, first);
+ ctl.write_size = 0;
+ first = false;
+ }
+ }
+
+ for (auto& key : ctl.journaled_remove) {
+ ctl.write_size += key.length() + sizeof(__u32);
+ ctl.to_remove.emplace(key);
+ if (ctl.write_size >= max_write_size) {
+ create_op_func(omap_idx, first);
+ ctl.write_size = 0;
+ first = false;
+ }
+ }
+
+ for (unsigned i = 0; i < ctl.journal_idx; ++i) { // the final op also deletes the "_journal.<n>" keys written above
+ char key[32];
+ snprintf(key, sizeof(key), "_journal.%x", i);
+ ctl.to_remove.emplace(key);
+ }
+
+ // update first object's omap header if object count changes
+ if (ctl.clear ||
+ ctl.journal_idx > 0 ||
+ (omap_idx == 0 && old_num_objs != omap_num_objs))
+ create_op_func(omap_idx, first);
+ }
+
+ ceph_assert(!ops_map.empty());
+ if (journal_state == JOURNAL_FINISH) { // wait for the journal writes; C_IO_OFT_Journal then submits ops_map
+ gather.set_finisher(new C_OnFinisher(new C_IO_OFT_Journal(this, log_seq, c, ops_map),
+ mds->finisher));
+ gather.activate();
+ } else {
+ submit_ops_func();
+ }
+}
+
+class C_IO_OFT_Load : public MDSIOContextBase { // read completion: owns the result buffers and forwards them to _load_finish
+protected:
+ OpenFileTable *oft;
+ MDSRank *get_mds() override { return oft->mds; }
+
+public:
+ int header_r = 0; //< Return value from OMAP header read
+ int values_r = 0; //< Return value from OMAP value read
+ bufferlist header_bl;
+ std::map<std::string, bufferlist> values;
+ unsigned index; // omap object index this read targets
+ bool first; // passed through to _load_finish, which decodes the header only when this is set
+ bool more = false; // output of omap_get_vals: further values remain to be read
+
+ C_IO_OFT_Load(OpenFileTable *t, unsigned i, bool f) :
+ oft(t), index(i), first(f) {}
+ void finish(int r) override {
+ oft->_load_finish(r, header_r, values_r, index, first, more, header_bl, values);
+ }
+ void print(ostream& out) const override {
+ out << "openfiles_load";
+ }
+};
+
+class C_IO_OFT_Recover : public MDSIOContextBase { // I/O completion that forwards to OpenFileTable::_recover_finish
+protected:
+ OpenFileTable *oft;
+ MDSRank *get_mds() override { return oft->mds; }
+public:
+ C_IO_OFT_Recover(OpenFileTable *t) : oft(t) {}
+ void finish(int r) override {
+ oft->_recover_finish(r);
+ }
+ void print(ostream& out) const override {
+ out << "openfiles_recover";
+ }
+};
+
+void OpenFileTable::_recover_finish(int r) // journal replay writes are done: finish the load and wake waiters
+{
+ if (r < 0) { // replay failed: log it and reset state via _reset_states(); loading is still marked done below
+ derr << __func__ << " got " << cpp_strerror(r) << dendl;
+ _reset_states();
+ } else {
+ dout(10) << __func__ << ": load complete" << dendl;
+ }
+
+ journal_state = JOURNAL_NONE;
+ load_done = true;
+ finish_contexts(g_ceph_context, waiting_for_load);
+ waiting_for_load.clear();
+}
+
+void OpenFileTable::_load_finish(int op_r, int header_r, int values_r, // handle one omap read: decode it, then read more, replay the journal, or finish
+ unsigned idx, bool first, bool more,
+ bufferlist &header_bl,
+ std::map<std::string, bufferlist> &values)
+{
+ using ceph::decode;
+ int err = -EINVAL; // default result for the decode-failure paths that jump to `out`
+
+ auto decode_func = [this](unsigned idx, inodeno_t ino, bufferlist &bl) { // decode one anchor and its dirfrags into the loaded_* containers
+ auto p = bl.cbegin();
+
+ size_t count = loaded_anchor_map.size();
+ auto it = loaded_anchor_map.emplace_hint(loaded_anchor_map.end(),
+ std::piecewise_construct,
+ std::make_tuple(ino),
+ std::make_tuple());
+ RecoveredAnchor& anchor = it->second;
+ decode(anchor, p);
+ ceph_assert(ino == anchor.ino);
+ anchor.omap_idx = idx;
+ anchor.auth = MDS_RANK_NONE;
+
+ frag_vec_t frags;
+ decode(frags, p);
+ for (const auto& fg : frags)
+ loaded_dirfrags.insert(loaded_dirfrags.end(), dirfrag_t(anchor.ino, fg));
+
+ if (loaded_anchor_map.size() > count) // count the item only if the ino was not already loaded
+ ++omap_num_items[idx];
+ };
+
+ if (op_r < 0) {
+ derr << __func__ << " got " << cpp_strerror(op_r) << dendl;
+ err = op_r;
+ goto out;
+ }
+
+ try {
+ if (first) { // header is decoded only when `first` is set
+ auto p = header_bl.cbegin();
+
+ string magic;
+ version_t version;
+ unsigned num_objs;
+ __u8 jstate;
+
+ if (header_bl.length() == 13) {
+ // obsolete format.
+ decode(version, p);
+ decode(num_objs, p);
+ decode(jstate, p);
+ } else {
+ decode(magic, p);
+ if (magic != CEPH_FS_ONDISK_MAGIC) {
+ std::ostringstream oss;
+ oss << "invalid magic '" << magic << "'";
+ throw buffer::malformed_input(oss.str());
+ }
+
+ DECODE_START(1, p);
+ decode(version, p);
+ decode(num_objs, p);
+ decode(jstate, p);
+ DECODE_FINISH(p);
+ }
+
+ if (num_objs > MAX_OBJECTS) {
+ std::ostringstream oss;
+ oss << "invalid object count '" << num_objs << "'";
+ throw buffer::malformed_input(oss.str());
+ }
+ if (jstate > JOURNAL_FINISH) {
+ std::ostringstream oss;
+ oss << "invalid journal state '" << int(jstate) << "'"; // __u8 streams as a raw character; print the numeric value instead
+ throw buffer::malformed_input(oss.str());
+ }
+
+ if (version > omap_version) { // newer header than any seen so far: adopt its values
+ omap_version = version;
+ omap_num_objs = num_objs;
+ omap_num_items.resize(omap_num_objs);
+ journal_state = jstate;
+ } else if (version == omap_version) { // same version: keep the most advanced journal state
+ ceph_assert(omap_num_objs == num_objs);
+ if (jstate > journal_state)
+ journal_state = jstate;
+ }
+ }
+
+ for (auto& it : values) {
+ if (it.first.compare(0, 9, "_journal.") == 0) { // journal keys are stashed for replay instead of being decoded as anchors
+ if (idx >= loaded_journals.size())
+ loaded_journals.resize(idx + 1);
+
+ if (journal_state == JOURNAL_FINISH) {
+ loaded_journals[idx][it.first].swap(it.second);
+ } else { // incomplete journal
+ loaded_journals[idx][it.first].length(); // operator[] creates an empty entry so the key is still removed later; length() result is unused
+ }
+ continue;
+ }
+
+ inodeno_t ino;
+ sscanf(it.first.c_str(), "%llx", (unsigned long long*)&ino.val); // key is the ino in hex; the sscanf result is not checked
+ decode_func(idx, ino, it.second);
+ }
+ } catch (buffer::error &e) {
+ derr << __func__ << ": corrupted header/values: " << e.what() << dendl;
+ goto out;
+ }
+
+ if (more || idx + 1 < omap_num_objs) {
+ // Issue another read if we're not at the end of the omap
+ std::string last_key;
+ if (more)
+ last_key = values.rbegin()->first; // resume the same object after the last key received
+ else
+ idx++; // current object is exhausted: move to the next one
+ dout(10) << __func__ << ": continue to load from '" << last_key << "'" << dendl;
+ object_t oid = get_object_name(idx);
+ object_locator_t oloc(mds->mdsmap->get_metadata_pool());
+ C_IO_OFT_Load *c = new C_IO_OFT_Load(this, idx, !more);
+ ObjectOperation op;
+ if (!more) // header is only needed on the first read of each object
+ op.omap_get_header(&c->header_bl, &c->header_r);
+ op.omap_get_vals(last_key, "", uint64_t(-1),
+ &c->values, &c->more, &c->values_r);
+ mds->objecter->read(oid, oloc, op, CEPH_NOSNAP, nullptr, 0,
+ new C_OnFinisher(c, mds->finisher));
+ return;
+ }
+
+ // replay journal
+ if (loaded_journals.size() > 0) {
+ dout(10) << __func__ << ": recover journal" << dendl;
+
+ C_GatherBuilder gather(g_ceph_context, // C_IO_OFT_Recover runs once all replay writes complete
+ new C_OnFinisher(new C_IO_OFT_Recover(this),
+ mds->finisher));
+ object_locator_t oloc(mds->mdsmap->get_metadata_pool());
+ SnapContext snapc;
+
+ for (unsigned omap_idx = 0; omap_idx < loaded_journals.size(); omap_idx++) {
+ auto& loaded_journal = loaded_journals[omap_idx];
+
+ std::vector<ObjectOperation> op_vec;
+ try {
+ for (auto& it : loaded_journal) {
+ if (journal_state != JOURNAL_FINISH) // incomplete journal: nothing to apply, keys are only removed below
+ continue;
+ auto p = it.second.cbegin();
+ version_t version;
+ std::map<string, bufferlist> to_update;
+ std::set<string> to_remove;
+ decode(version, p);
+ if (version != omap_version) // entry belongs to a different commit: ignore its contents
+ continue;
+ decode(to_update, p);
+ decode(to_remove, p);
+ it.second.clear();
+
+ for (auto& q : to_update) { // apply journaled updates to the in-memory loaded state
+ inodeno_t ino;
+ sscanf(q.first.c_str(), "%llx", (unsigned long long*)&ino.val);
+ decode_func(omap_idx, ino, q.second);
+ }
+ for (auto& q : to_remove) { // apply journaled removals to the in-memory loaded state
+ inodeno_t ino;
+ sscanf(q.c_str(), "%llx",(unsigned long long*)&ino.val);
+ ceph_assert(ino.val > 0);
+ if (loaded_anchor_map.erase(ino)) {
+ unsigned& count = omap_num_items[omap_idx];
+ ceph_assert(count > 0);
+ --count;
+ }
+ auto r = loaded_dirfrags.lower_bound(dirfrag_t(ino, 0));
+ while (r != loaded_dirfrags.end() && r->ino == ino)
+ loaded_dirfrags.erase(r++);
+ }
+
+ op_vec.resize(op_vec.size() + 1); // and queue the same changes to be written to the object
+ ObjectOperation& op = op_vec.back();
+ op.priority = CEPH_MSG_PRIO_HIGH;
+ if (!to_update.empty())
+ op.omap_set(to_update);
+ if (!to_remove.empty())
+ op.omap_rm_keys(to_remove);
+ }
+ } catch (buffer::error &e) {
+ derr << __func__ << ": corrupted journal: " << e.what() << dendl;
+ goto out;
+ }
+
+ op_vec.resize(op_vec.size() + 1); // last op for this object: rewrite the header and drop the journal keys
+ ObjectOperation& op = op_vec.back();
+ {
+ bufferlist header;
+ if (journal_state == JOURNAL_FINISH)
+ _encode_header(header, JOURNAL_FINISH);
+ else
+ _encode_header(header, JOURNAL_NONE);
+ op.omap_set_header(header);
+ }
+ {
+ // remove journal
+ std::set<string> to_remove;
+ for (auto &it : loaded_journal)
+ to_remove.emplace(it.first);
+ op.omap_rm_keys(to_remove);
+ }
+ loaded_journal.clear();
+
+ object_t oid = get_object_name(omap_idx);
+ for (auto& op : op_vec) {
+ mds->objecter->mutate(oid, oloc, op, snapc, ceph::real_clock::now(),
+ 0, gather.new_sub());
+ }
+ }
+ gather.activate();
+ return;
+ }
+
+ journal_state = JOURNAL_NONE;
+ err = 0;
+ dout(10) << __func__ << ": load complete" << dendl;
+out:
+
+ if (err < 0) // any failure resets state via _reset_states(); loading is still reported as done
+ _reset_states();
+
+ load_done = true;
+ finish_contexts(g_ceph_context, waiting_for_load);
+ waiting_for_load.clear();
+}
+
+void OpenFileTable::load(MDSContext *onload) // start loading: read header and omap values of object 0; onload is queued for completion
+{
+ dout(10) << __func__ << dendl;
+ ceph_assert(!load_done);
+ if (onload)
+ waiting_for_load.push_back(onload);
+
+ C_IO_OFT_Load *c = new C_IO_OFT_Load(this, 0, true); // index 0, first=true so the header gets decoded
+ object_t oid = get_object_name(0);
+ object_locator_t oloc(mds->mdsmap->get_metadata_pool());
+
+ ObjectOperation op;
+ op.omap_get_header(&c->header_bl, &c->header_r);
+ op.omap_get_vals("", "", uint64_t(-1),
+ &c->values, &c->more, &c->values_r);
+
+ mds->objecter->read(oid, oloc, op, CEPH_NOSNAP, nullptr, 0,
+ new C_OnFinisher(c, mds->finisher));
+}
+
+bool OpenFileTable::get_ancestors(inodeno_t ino, vector<inode_backpointer_t>& ancestors, // fill `ancestors` from loaded anchors, nearest parent first
+ mds_rank_t& auth_hint)
+{
+ auto p = loaded_anchor_map.find(ino);
+ if (p == loaded_anchor_map.end()) // unknown ino: outputs are left untouched
+ return false;
+
+ inodeno_t dirino = p->second.dirino;
+ if (dirino == inodeno_t(0)) // anchor records no parent
+ return false;
+
+ bool first = true;
+ ancestors.clear();
+ while (true) {
+ ancestors.push_back(inode_backpointer_t(dirino, p->second.d_name, 0));
+
+ p = loaded_anchor_map.find(dirino);
+ if (p == loaded_anchor_map.end()) // chain ends at the first ancestor with no loaded anchor
+ break;
+
+ if (first) // auth_hint is taken only from the immediate parent's anchor
+ auth_hint = p->second.auth;
+
+ dirino = p->second.dirino;
+ if (dirino == inodeno_t(0))
+ break;
+
+ first = false;
+ }
+ return true;
+}
+
+class C_OFT_OpenInoFinish: public MDSContext { // completion that forwards (ino, r) to OpenFileTable::_open_ino_finish
+ OpenFileTable *oft;
+ inodeno_t ino;
+ MDSRank *get_mds() override { return oft->mds; }
+public:
+ C_OFT_OpenInoFinish(OpenFileTable *t, inodeno_t i) : oft(t), ino(i) {}
+ void finish(int r) override {
+ oft->_open_ino_finish(ino, r);
+ }
+};
+
+void OpenFileTable::_open_ino_finish(inodeno_t ino, int r) // one prefetch open finished; advance the prefetch state when none remain
+{
+ if (prefetch_state == DIR_INODES && r >= 0 && ino != inodeno_t(0)) { // a non-negative r is stored as the anchor's auth rank
+ auto p = loaded_anchor_map.find(ino);
+ ceph_assert(p != loaded_anchor_map.end());
+ p->second.auth = mds_rank_t(r);
+ }
+
+ if (r != mds->get_nodeid()) // any result other than this MDS's own id is reported to the cache
+ mds->mdcache->rejoin_prefetch_ino_finish(ino, r);
+
+ num_opening_inodes--;
+ if (num_opening_inodes == 0) {
+ if (prefetch_state == DIR_INODES) { // directory inodes done: move on to dirfrags
+ prefetch_state = DIRFRAGS;
+ _prefetch_dirfrags();
+ } else if (prefetch_state == FILE_INODES) { // file inodes done: prefetch is complete, wake waiters
+ prefetch_state = DONE;
+ logseg_destroyed_inos.clear();
+ destroyed_inos_set.clear();
+ finish_contexts(g_ceph_context, waiting_for_prefetch);
+ waiting_for_prefetch.clear();
+ } else {
+ ceph_abort();
+ }
+ }
+}
+
+void OpenFileTable::_prefetch_dirfrags()
+{ /*
+   * Prefetch stage DIRFRAGS: queue a fetch for every auth, incomplete
+   * dirfrag derived from loaded_dirfrags, then switch to FILE_INODES and
+   * call _prefetch_inodes() once all fetches have finished (or right away
+   * when there is nothing to fetch).
+   */
+ dout(10) << __func__ << dendl;
+ ceph_assert(prefetch_state == DIRFRAGS);
+
+ MDCache *mdcache = mds->mdcache;
+ list<CDir*> fetch_queue;
+
+ CInode *last_in = nullptr; // inode resolved for the previous entry; reused while df.ino repeats
+ for (auto df : loaded_dirfrags) {
+ CInode *diri;
+ if (last_in && last_in->ino() == df.ino) {
+ diri = last_in;
+ } else {
+ diri = mdcache->get_inode(df.ino);
+ if (!diri)
+ continue; // inode is not in cache: nothing to fetch for this entry
+ last_in = diri;
+ }
+ if (diri->state_test(CInode::STATE_REJOINUNDEF))
+ continue;
+
+ CDir *dir = diri->get_dirfrag(df.frag);
+ if (dir) {
+ if (dir->is_auth() && !dir->is_complete())
+ fetch_queue.push_back(dir);
+ } else { // no dirfrag object for the recorded frag: use the fragtree leaves under it instead
+ frag_vec_t leaves;
+ diri->dirfragtree.get_leaves_under(df.frag, leaves);
+ for (const auto& leaf : leaves) {
+ if (diri->is_auth()) {
+ dir = diri->get_or_open_dirfrag(mdcache, leaf);
+ } else {
+ dir = diri->get_dirfrag(leaf);
+ }
+ if (dir && dir->is_auth() && !dir->is_complete())
+ fetch_queue.push_back(dir);
+ }
+ }
+ }
+
+ MDSGatherBuilder gather(g_ceph_context);
+ int num_opening_dirfrags = 0;
+ for (auto dir : fetch_queue) {
+ if (dir->state_test(CDir::STATE_REJOINUNDEF))
+ ceph_assert(dir->get_inode()->dirfragtree.is_leaf(dir->get_frag()));
+ dir->fetch(gather.new_sub());
+
+ if (!(++num_opening_dirfrags % 1000)) // every 1000 fetches issued
+ mds->heartbeat_reset();
+ }
+
+ auto finish_func = [this](int r) { // next stage; r is ignored
+ prefetch_state = FILE_INODES;
+ _prefetch_inodes();
+ };
+ if (gather.has_subs()) {
+ gather.set_finisher(
+ new MDSInternalContextWrapper(mds,
+ new FunctionContext(finish_func)));
+ gather.activate();
+ } else {
+ finish_func(0); // nothing was queued: advance synchronously
+ }
+}
+
+void OpenFileTable::_prefetch_inodes()
+{ /*
+   * Issue MDCache::open_ino() for every entry of loaded_anchor_map that
+   * belongs to the current stage (directories during DIR_INODES, all other
+   * d_types during FILE_INODES), is not listed in destroyed_inos_set and is
+   * not already in cache. Completion is tracked via num_opening_inodes and
+   * _open_ino_finish().
+   */
+ dout(10) << __func__ << " state " << prefetch_state << dendl;
+ ceph_assert(!num_opening_inodes);
+ num_opening_inodes = 1; // count held by this function; released by the _open_ino_finish() call at the end
+
+ int64_t pool;
+ if (prefetch_state == DIR_INODES)
+ pool = mds->mdsmap->get_metadata_pool();
+ else if (prefetch_state == FILE_INODES)
+ pool = mds->mdsmap->get_first_data_pool();
+ else
+ ceph_abort();
+
+ MDCache *mdcache = mds->mdcache;
+
+ if (destroyed_inos_set.empty()) { // flatten the per-log-segment lists into one lookup set
+ for (auto& it : logseg_destroyed_inos)
+ destroyed_inos_set.insert(it.second.begin(), it.second.end());
+ }
+
+ for (auto& it : loaded_anchor_map) {
+ if (destroyed_inos_set.count(it.first))
+ continue;
+ if (it.second.d_type == DT_DIR) {
+ if (prefetch_state != DIR_INODES)
+ continue;
+ if (MDS_INO_IS_MDSDIR(it.first)) { // auth is taken from the ino itself; no open is issued
+ it.second.auth = MDS_INO_MDSDIR_OWNER(it.first);
+ continue;
+ }
+ if (MDS_INO_IS_STRAY(it.first)) { // same for stray directories
+ it.second.auth = MDS_INO_STRAY_OWNER(it.first);
+ continue;
+ }
+ } else {
+ if (prefetch_state != FILE_INODES)
+ continue;
+ // load all file inodes for MDCache::identify_files_to_recover()
+ }
+ CInode *in = mdcache->get_inode(it.first);
+ if (in)
+ continue; // already in cache
+
+ num_opening_inodes++;
+ mdcache->open_ino(it.first, pool, new C_OFT_OpenInoFinish(this, it.first), false);
+
+ if (!(num_opening_inodes % 1000)) // whenever the outstanding count is a multiple of 1000
+ mds->heartbeat_reset();
+ }
+
+ _open_ino_finish(inodeno_t(0), 0); // drop the initial count taken above
+}
+
+/**
+ * Enter the DIR_INODES prefetch stage and start opening inodes.
+ *
+ * If the table has not finished loading yet, the actual work is deferred
+ * until load completes.
+ *
+ * @return true while prefetching is still pending or in progress, false
+ *         if it already reached DONE before returning
+ */
+bool OpenFileTable::prefetch_inodes()
+{
+  dout(10) << __func__ << dendl;
+  ceph_assert(!prefetch_state);
+  prefetch_state = DIR_INODES;
+
+  if (load_done) {
+    _prefetch_inodes();
+    return !is_prefetched();
+  }
+
+  // Not loaded yet: run the first stage from the load waiter list.
+  auto start_prefetch = [this](int r) {
+    _prefetch_inodes();
+  };
+  wait_for_load(
+    new MDSInternalContextWrapper(mds,
+      new FunctionContext(start_prefetch)));
+  return true;
+}
+
+/**
+ * Decide whether an open of `in` needs to be logged.
+ *
+ * @return true for inodes not flagged STATE_TRACKEDBYOFT. For tracked
+ *         inodes, false when the inode was journaled at or after
+ *         committing_log_seq or has no entry in dirty_items, else true.
+ */
+bool OpenFileTable::should_log_open(CInode *in)
+{
+  if (!in->state_test(CInode::STATE_TRACKEDBYOFT))
+    return true;
+
+  // inode just journaled
+  if (in->last_journaled >= committing_log_seq)
+    return false;
+
+  // a tracked item that is not dirty has already been saved
+  return dirty_items.count(in->ino()) > 0;
+}
+
+/**
+ * Append `inos` to the list of destroyed inodes recorded for log
+ * sequence `seq`, creating the list if it does not exist yet.
+ */
+void OpenFileTable::note_destroyed_inos(uint64_t seq, const vector<inodeno_t>& inos)
+{
+  vector<inodeno_t>& recorded = logseg_destroyed_inos[seq];
+  recorded.insert(recorded.end(), inos.cbegin(), inos.cend());
+}
+
+/**
+ * Drop every destroyed-inode list whose log sequence is strictly below
+ * `seq`; entries at `seq` and above are kept.
+ */
+void OpenFileTable::trim_destroyed_inos(uint64_t seq)
+{
+  // The map is ordered by sequence, so the entries to drop form the
+  // prefix that ends at the first key >= seq.
+  logseg_destroyed_inos.erase(logseg_destroyed_inos.begin(),
+                              logseg_destroyed_inos.lower_bound(seq));
+}
diff --git a/src/mds/OpenFileTable.h b/src/mds/OpenFileTable.h
new file mode 100644
index 00000000..70d4c09b
--- /dev/null
+++ b/src/mds/OpenFileTable.h
@@ -0,0 +1,151 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2018 Red Hat
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#ifndef OPEN_FILE_TABLE_H
+#define OPEN_FILE_TABLE_H
+
+#include "mdstypes.h"
+#include "Anchor.h"
+
+#include "MDSContext.h"
+
+class CDir;
+class CInode;
+class MDSRank;
+
+class OpenFileTable
+{ /*
+   * Table of anchors and dirfrags owned by one MDSRank, with entry points
+   * to commit it, load it back, and prefetch the inodes/dirfrags it names.
+   * Only the small accessors are defined here; the rest live in the .cc.
+   */
+public:
+ explicit OpenFileTable(MDSRank *m) : mds(m) {}
+
+ void add_inode(CInode *in);
+ void remove_inode(CInode *in);
+ void add_dirfrag(CDir *dir);
+ void remove_dirfrag(CDir *dir);
+ void notify_link(CInode *in);
+ void notify_unlink(CInode *in);
+ bool is_any_dirty() const { return !dirty_items.empty(); }
+
+ void commit(MDSContext *c, uint64_t log_seq, int op_prio);
+ uint64_t get_committed_log_seq() const { return committed_log_seq; }
+ uint64_t get_committing_log_seq() const { return committing_log_seq; }
+ bool is_any_committing() const { return num_pending_commit > 0; }
+
+ void load(MDSContext *c); // c may be null; asserts that loading has not completed
+ bool is_loaded() const { return load_done; }
+ void wait_for_load(MDSContext *c) { // only valid before load_done is set
+ ceph_assert(!load_done);
+ waiting_for_load.push_back(c);
+ }
+
+ bool get_ancestors(inodeno_t ino, vector<inode_backpointer_t>& ancestors,
+ mds_rank_t& auth_hint); // false if ino is not in loaded_anchor_map or has dirino 0
+
+ bool prefetch_inodes(); // true while prefetch is still pending/in progress
+ bool is_prefetched() const { return prefetch_state == DONE; }
+ void wait_for_prefetch(MDSContext *c) { // only valid before prefetch reaches DONE
+ ceph_assert(!is_prefetched());
+ waiting_for_prefetch.push_back(c);
+ }
+
+ bool should_log_open(CInode *in);
+
+ void note_destroyed_inos(uint64_t seq, const vector<inodeno_t>& inos); // appends to logseg_destroyed_inos[seq]
+ void trim_destroyed_inos(uint64_t seq); // drops entries with sequence < seq
+
+protected:
+ friend class C_IO_OFT_Recover;
+ friend class C_IO_OFT_Load;
+ friend class C_IO_OFT_Save;
+ friend class C_IO_OFT_Journal;
+ friend class C_OFT_OpenInoFinish;
+
+ uint64_t MAX_ITEMS_PER_OBJ = g_conf().get_val<uint64_t>("osd_deep_scrub_large_omap_object_key_threshold");
+ static const unsigned MAX_OBJECTS = 1024; // (1024 * osd_deep_scrub_large_omap_object_key_threshold) items at most
+
+ static const int DIRTY_NEW = -1; // presumably special values stored in dirty_items -- confirm in the .cc
+ static const int DIRTY_UNDEF = -2;
+
+ unsigned num_pending_commit = 0;
+ void _encode_header(bufferlist& bl, int j_state);
+ void _commit_finish(int r, uint64_t log_seq, MDSContext *fin);
+ void _journal_finish(int r, uint64_t log_seq, MDSContext *fin,
+ std::map<unsigned, std::vector<ObjectOperation> >& ops);
+
+ void get_ref(CInode *in);
+ void put_ref(CInode *in);
+
+ object_t get_object_name(unsigned idx) const;
+
+ void _reset_states() { // clears the omap bookkeeping and everything read by load
+ omap_num_objs = 0;
+ omap_num_items.resize(0);
+ journal_state = JOURNAL_NONE;
+ loaded_journals.clear();
+ loaded_anchor_map.clear();
+ loaded_dirfrags.clear();
+ }
+ void _load_finish(int op_r, int header_r, int values_r,
+ unsigned idx, bool first, bool more,
+ bufferlist &header_bl,
+ std::map<std::string, bufferlist> &values);
+ void _recover_finish(int r);
+
+ void _open_ino_finish(inodeno_t ino, int r);
+ void _prefetch_inodes();
+ void _prefetch_dirfrags();
+
+ MDSRank *mds;
+
+ version_t omap_version = 0;
+
+ unsigned omap_num_objs = 0;
+ std::vector<unsigned> omap_num_items;
+
+ map<inodeno_t, OpenedAnchor> anchor_map;
+ set<dirfrag_t> dirfrags;
+
+ std::map<inodeno_t, int> dirty_items; // ino -> dirty state
+
+ uint64_t committed_log_seq = 0;
+ uint64_t committing_log_seq = 0;
+
+ enum {
+ JOURNAL_NONE = 0,
+ JOURNAL_START = 1,
+ JOURNAL_FINISH = 2,
+ };
+ int journal_state = 0;
+
+ std::vector<std::map<std::string, bufferlist> > loaded_journals;
+ map<inodeno_t, RecoveredAnchor> loaded_anchor_map; // consulted by get_ancestors() and the prefetch stages
+ set<dirfrag_t> loaded_dirfrags; // iterated by _prefetch_dirfrags()
+ MDSContext::vec waiting_for_load;
+ bool load_done = false;
+
+ enum { // prefetch stages, entered in this order
+ DIR_INODES = 1,
+ DIRFRAGS = 2,
+ FILE_INODES = 3,
+ DONE = 4,
+ };
+ unsigned prefetch_state = 0; // 0 until prefetch_inodes() is called
+ unsigned num_opening_inodes = 0; // outstanding open_ino requests, plus one while _prefetch_inodes() is issuing them
+ MDSContext::vec waiting_for_prefetch;
+
+ std::map<uint64_t, vector<inodeno_t> > logseg_destroyed_inos; // log sequence -> inos passed to note_destroyed_inos()
+ std::set<inodeno_t> destroyed_inos_set; // union of the lists above, built in _prefetch_inodes()
+};
+
+#endif
diff --git a/src/mds/PurgeQueue.cc b/src/mds/PurgeQueue.cc
new file mode 100644
index 00000000..cc30d9d2
--- /dev/null
+++ b/src/mds/PurgeQueue.cc
@@ -0,0 +1,776 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2015 Red Hat
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#include "common/debug.h"
+#include "mds/mdstypes.h"
+#include "mds/CInode.h"
+#include "mds/MDCache.h"
+
+#include "PurgeQueue.h"
+
+#include <string.h>
+
+#define dout_context cct
+#define dout_subsys ceph_subsys_mds
+#undef dout_prefix
+#define dout_prefix _prefix(_dout, rank) << __func__ << ": "
+// Writes the "mds.<rank>.purge_queue " log prefix to *_dout and returns
+// that stream so the dout_prefix macro can keep appending to it.
+static ostream& _prefix(std::ostream *_dout, mds_rank_t rank) {
+  std::ostream& out = *_dout;
+  out << "mds." << rank << ".purge_queue ";
+  return out;
+}
+
+const std::map<std::string, PurgeItem::Action> PurgeItem::actions = { // action name -> enum value; looked up by PurgeItem::str_to_type()
+ {"NONE", PurgeItem::NONE},
+ {"PURGE_FILE", PurgeItem::PURGE_FILE},
+ {"TRUNCATE_FILE", PurgeItem::TRUNCATE_FILE},
+ {"PURGE_DIR", PurgeItem::PURGE_DIR}
+};
+
+void PurgeItem::encode(bufferlist &bl) const
+{ /*
+   * Serialises the item as struct version 2 (compat 1). Field order:
+   * action (as uint8_t), ino, size, layout, old_pools, snapc, fragtree,
+   * stamp, then pad_size filler bytes. pad_size itself is not written.
+   */
+ ENCODE_START(2, 1, bl);
+ encode((uint8_t)action, bl);
+ encode(ino, bl);
+ encode(size, bl);
+ encode(layout, bl, CEPH_FEATURE_FS_FILE_LAYOUT_V2);
+ encode(old_pools, bl);
+ encode(snapc, bl);
+ encode(fragtree, bl);
+ encode(stamp, bl);
+ uint8_t static const pad = 0xff;
+ for (unsigned int i = 0; i<pad_size; i++) { // append pad_size bytes of 0xff after the fields
+ encode(pad, bl);
+ }
+ ENCODE_FINISH(bl);
+}
+
+void PurgeItem::decode(bufferlist::const_iterator &p)
+{ /*
+   * Deserialises an item written in either known layout. For struct_v == 1
+   * the stamp/pad_size-first layout is tried first; if that attempt throws
+   * buffer::error or runs past struct_end, the iterator is rewound and the
+   * regular layout (action first, stamp last and only from v2) is decoded.
+   */
+ DECODE_START(2, p);
+ bool done = false; // set once the alternate v1 layout decoded cleanly
+ if (struct_v == 1) {
+ auto p_start = p; // saved so a failed attempt can be rewound
+ try {
+ // bad encoding introduced by v13.2.2
+ decode(stamp, p);
+ decode(pad_size, p);
+ p.advance(pad_size);
+ uint8_t raw_action;
+ decode(raw_action, p);
+ action = (Action)raw_action;
+ decode(ino, p);
+ decode(size, p);
+ decode(layout, p);
+ decode(old_pools, p);
+ decode(snapc, p);
+ decode(fragtree, p);
+ if (p.get_off() > struct_end) // consumed more than the struct holds: treat as a failed attempt
+ throw buffer::end_of_buffer();
+ done = true;
+ } catch (const buffer::error &e) {
+ p = p_start;
+ }
+ }
+ if (!done) {
+ uint8_t raw_action;
+ decode(raw_action, p);
+ action = (Action)raw_action;
+ decode(ino, p);
+ decode(size, p);
+ decode(layout, p);
+ decode(old_pools, p);
+ decode(snapc, p);
+ decode(fragtree, p);
+ if (struct_v >= 2) { // stamp is only read for version 2 and later
+ decode(stamp, p);
+ }
+ }
+ DECODE_FINISH(p);
+}
+
+// TODO: if Objecter has any slow requests, take that as a hint and
+// slow down our rate of purging (keep accepting pushes though)
+PurgeQueue::PurgeQueue(
+ CephContext *cct_,
+ mds_rank_t rank_,
+ const int64_t metadata_pool_,
+ Objecter *objecter_,
+ Context *on_error_)
+ :
+ cct(cct_),
+ rank(rank_),
+ lock("PurgeQueue"),
+ metadata_pool(metadata_pool_),
+ finisher(cct, "PurgeQueue", "PQ_Finisher"),
+ timer(cct, lock),
+ filer(objecter_, &finisher),
+ objecter(objecter_),
+ journaler("pq", MDS_INO_PURGE_QUEUE + rank, metadata_pool, // journal inode number is derived from the rank
+ CEPH_FS_ONDISK_MAGIC, objecter_, nullptr, 0,
+ &finisher),
+ on_error(on_error_),
+ ops_in_flight(0),
+ max_purge_ops(0),
+ drain_initial(0),
+ draining(false),
+ delayed_flush(nullptr),
+ recovered(false)
+{ /*
+   * cct_, objecter_ and on_error_ must all be non-null. on_error_ is also
+   * registered as the journaler's write error handler, and the destructor
+   * deletes whatever on_error still points to.
+   */
+ ceph_assert(cct != nullptr);
+ ceph_assert(on_error != nullptr);
+ ceph_assert(objecter != nullptr);
+ journaler.set_write_error_handler(on_error);
+}
+
+/**
+ * Unregister the perf counters (if create_logger() was called) and
+ * delete the error context if it has not been handed off already.
+ */
+PurgeQueue::~PurgeQueue()
+{
+  if (auto *counters = logger.get()) {
+    g_ceph_context->get_perfcounters_collection()->remove(counters);
+  }
+  delete on_error;
+}
+
+/**
+ * Build the "purge_queue" perf counter set and register it with the
+ * global perf counter collection. init() asserts this has been called.
+ */
+void PurgeQueue::create_logger()
+{
+  PerfCountersBuilder builder(g_ceph_context, "purge_queue",
+                              l_pq_first, l_pq_last);
+
+  // The executed-task counter is registered before the default priority
+  // is lowered, with its own explicit priority.
+  builder.add_u64_counter(l_pq_executed, "pq_executed",
+                          "Purge queue tasks executed",
+                          "purg", PerfCountersBuilder::PRIO_INTERESTING);
+
+  builder.set_prio_default(PerfCountersBuilder::PRIO_USEFUL);
+  builder.add_u64(l_pq_executing_ops, "pq_executing_ops",
+                  "Purge queue ops in flight");
+  builder.add_u64(l_pq_executing_ops_high_water,
+                  "pq_executing_ops_high_water",
+                  "Maximum number of executing file purge ops");
+  builder.add_u64(l_pq_executing, "pq_executing",
+                  "Purge queue tasks in flight");
+  builder.add_u64(l_pq_executing_high_water, "pq_executing_high_water",
+                  "Maximum number of executing file purges");
+
+  logger.reset(builder.create_perf_counters());
+  g_ceph_context->get_perfcounters_collection()->add(logger.get());
+}
+
+/**
+ * Start the finisher and initialise the timer. create_logger() must
+ * have been called first.
+ */
+void PurgeQueue::init()
+{
+  std::lock_guard guard(lock);
+
+  ceph_assert(logger != nullptr);
+
+  finisher.start();
+  timer.init();
+}
+
+/**
+ * Kick off consumption of the journal if there is unread data and
+ * nothing is currently in flight. Does nothing when readonly.
+ */
+void PurgeQueue::activate()
+{
+  std::lock_guard guard(lock);
+
+  if (readonly) {
+    dout(10) << "skipping activate: PurgeQueue is readonly" << dendl;
+    return;
+  }
+
+  const bool nothing_to_read =
+    journaler.get_read_pos() == journaler.get_write_pos();
+  if (nothing_to_read || !in_flight.empty())
+    return;
+
+  dout(4) << "start work (by drain)" << dendl;
+  // Consume from the finisher rather than from the caller's stack.
+  finisher.queue(new FunctionContext([this](int r) {
+    std::lock_guard guard(lock);
+    _consume();
+  }));
+}
+
+/**
+ * Shut down, in this order, the journaler, the timer and the finisher,
+ * all while holding the queue lock.
+ */
+void PurgeQueue::shutdown()
+{
+  std::lock_guard guard(lock);
+
+  journaler.shutdown();
+  timer.shutdown();
+  finisher.stop();
+}
+
+/**
+ * Recover the purge queue journal.
+ *
+ * Outcomes of the recovery callback:
+ *  - -ENOENT: the journal does not exist; create() is called to make it.
+ *  - 0: if the committed write_pos is behind the probed write_pos,
+ *    _recover() is started to find the real end; otherwise the journaler
+ *    becomes writeable and waiting_for_recovery is released.
+ *  - any other error: the queue goes readonly.
+ *
+ * @param completion optional context appended to waiting_for_recovery
+ */
+void PurgeQueue::open(Context *completion)
+{
+  dout(4) << "opening" << dendl;
+
+  std::lock_guard l(lock);
+
+  if (completion)
+    waiting_for_recovery.push_back(completion);
+
+  journaler.recover(new FunctionContext([this](int r){
+    if (r == -ENOENT) {
+      dout(1) << "Purge Queue not found, assuming this is an upgrade and "
+                 "creating it." << dendl;
+      // create() takes the lock itself, so it must be called unlocked.
+      create(NULL);
+      return;
+    }
+
+    // Both remaining branches touch state guarded by `lock`
+    // (readonly, recovered, waiting_for_recovery, journaler).
+    // _go_readonly() in particular is called with the lock held
+    // everywhere else, so the error branch must hold it too.
+    std::lock_guard l(lock);
+
+    if (r != 0) {
+      derr << "Error " << r << " loading Journaler" << dendl;
+      _go_readonly(r);
+      return;
+    }
+
+    dout(4) << "open complete" << dendl;
+
+    // Journaler only guarantees entries before head write_pos have been
+    // fully flushed. Before appending new entries, we need to find and
+    // drop any partial written entry.
+    if (journaler.last_committed.write_pos < journaler.get_write_pos()) {
+      dout(4) << "recovering write_pos" << dendl;
+      journaler.set_read_pos(journaler.last_committed.write_pos);
+      _recover();
+      return;
+    }
+
+    journaler.set_writeable();
+    recovered = true;
+    finish_contexts(g_ceph_context, waiting_for_recovery);
+  }));
+}
+
+/**
+ * Complete `c` once the queue has been recovered.
+ *
+ * `c` is completed immediately with 0 if recovery already finished, or
+ * with -EROFS if the queue is readonly; otherwise it is queued on
+ * waiting_for_recovery.
+ */
+void PurgeQueue::wait_for_recovery(Context* c)
+{
+  std::lock_guard guard(lock);
+
+  if (recovered) {
+    c->complete(0);
+    return;
+  }
+
+  if (!readonly) {
+    waiting_for_recovery.push_back(c);
+    return;
+  }
+
+  dout(10) << "cannot wait for recovery: PurgeQueue is readonly" << dendl;
+  c->complete(-EROFS);
+}
+
+void PurgeQueue::_recover()
+{ /*
+   * Reads entries forward until read_pos meets write_pos (see the
+   * is_readable() note below), then restores read_pos to the last committed
+   * expire_pos, marks the journaler writeable and releases
+   * waiting_for_recovery. When more data has to arrive first it re-enters
+   * itself from a wait_for_readable() callback; a journaler error switches
+   * the queue to readonly. The caller must hold lock.
+   */
+ ceph_assert(lock.is_locked_by_me());
+
+ // Journaler::is_readable() adjusts write_pos if partial entry is encountered
+ while (1) {
+ if (!journaler.is_readable() &&
+ !journaler.get_error() &&
+ journaler.get_read_pos() < journaler.get_write_pos()) { // not readable yet, no error, data still outstanding
+ journaler.wait_for_readable(new FunctionContext([this](int r) {
+ std::lock_guard l(lock);
+ _recover();
+ }));
+ return;
+ }
+
+ if (journaler.get_error()) {
+ int r = journaler.get_error();
+ derr << "Error " << r << " recovering write_pos" << dendl;
+ _go_readonly(r);
+ return;
+ }
+
+ if (journaler.get_read_pos() == journaler.get_write_pos()) {
+ dout(4) << "write_pos recovered" << dendl;
+ // restore original read_pos
+ journaler.set_read_pos(journaler.last_committed.expire_pos);
+ journaler.set_writeable();
+ recovered = true;
+ finish_contexts(g_ceph_context, waiting_for_recovery);
+ return;
+ }
+
+ bufferlist bl; // entry contents are discarded; reading only advances the position
+ bool readable = journaler.try_read_entry(bl);
+ ceph_assert(readable); // we checked earlier
+ }
+}
+
+/**
+ * Create a fresh purge queue journal in the metadata pool and write its
+ * head.
+ *
+ * When the head write succeeds the queue is marked recovered and
+ * waiting_for_recovery is released; on failure the queue goes readonly.
+ *
+ * @param fin optional context appended to waiting_for_recovery
+ */
+void PurgeQueue::create(Context *fin)
+{
+  dout(4) << "creating" << dendl;
+  std::lock_guard guard(lock);
+
+  if (fin) {
+    waiting_for_recovery.push_back(fin);
+  }
+
+  file_layout_t layout = file_layout_t::get_default();
+  layout.pool_id = metadata_pool;
+  journaler.set_writeable();
+  journaler.create(&layout, JOURNAL_FORMAT_RESILIENT);
+
+  auto on_head_written = [this](int r) {
+    std::lock_guard guard(lock);
+    if (r != 0) {
+      _go_readonly(r);
+      return;
+    }
+    recovered = true;
+    finish_contexts(g_ceph_context, waiting_for_recovery);
+  };
+  journaler.write_head(new FunctionContext(on_head_written));
+}
+
+/**
+ * Append a purge item to the journal and try to start working on it.
+ *
+ * The `completion` context will always be called back via a Finisher,
+ * never inline from this call: on the normal path it is handed to the
+ * journaler's flush waiters, and when the queue is readonly it is queued
+ * on our finisher with -EROFS. Completing it inline would run caller code
+ * while `lock` (and whatever the caller holds) is still held.
+ *
+ * @param pi          item to append
+ * @param completion  context to complete once the entry is flushed, or
+ *                    with -EROFS if the queue is readonly
+ */
+void PurgeQueue::push(const PurgeItem &pi, Context *completion)
+{
+  dout(4) << "pushing inode " << pi.ino << dendl;
+  std::lock_guard l(lock);
+
+  if (readonly) {
+    dout(10) << "cannot push inode: PurgeQueue is readonly" << dendl;
+    if (completion) {
+      finisher.queue(completion, -EROFS);
+    }
+    return;
+  }
+
+  // Callers should have waited for open() before using us
+  ceph_assert(!journaler.is_readonly());
+
+  bufferlist bl;
+
+  encode(pi, bl);
+  journaler.append_entry(bl);
+  journaler.wait_for_flush(completion);
+
+  // Maybe go ahead and do something with it right away
+  bool could_consume = _consume();
+  if (!could_consume) {
+    // Usually, it is not necessary to explicitly flush here, because the reader
+    // will get flushes generated inside Journaler::is_readable.  However,
+    // if we remain in a _can_consume()==false state for a long period then
+    // we should flush in order to allow MDCache to drop its strays rather
+    // than having them wait for purgequeue to progress.
+    if (!delayed_flush) {
+      delayed_flush = new FunctionContext([this](int r){
+        delayed_flush = nullptr;
+        journaler.flush();
+      });
+
+      timer.add_event_after(
+        g_conf()->mds_purge_queue_busy_flush_period,
+        delayed_flush);
+    }
+  }
+}
+
+/**
+ * Estimate how many concurrent operations executing `item` accounts for
+ * against the op throttle.
+ *
+ * PURGE_DIR: one op plus one per fragtree leaf (none when the root frag
+ * is itself a leaf). Everything else: the object count for the item's
+ * size (1 when size is 0), capped at filer_max_purge_ops, plus one per
+ * old pool unless the action is TRUNCATE_FILE.
+ */
+uint32_t PurgeQueue::_calculate_ops(const PurgeItem &item) const
+{
+  if (item.action == PurgeItem::PURGE_DIR) {
+    // Directory, count dirfrags to be deleted
+    frag_vec_t leaves;
+    if (!item.fragtree.is_leaf(frag_t())) {
+      item.fragtree.get_leaves(leaves);
+    }
+    // One for the root, plus any leaves
+    return 1 + leaves.size();
+  }
+
+  // File, work out concurrent Filer::purge deletes
+  // Account for removing (or zeroing) backtrace
+  const uint64_t num = (item.size > 0) ?
+    Striper::get_num_objects(item.layout, item.size) : 1;
+
+  uint32_t ops_required = std::min(num, g_conf()->filer_max_purge_ops);
+
+  // Account for deletions for old pools
+  const bool removes_old_pools = item.action != PurgeItem::TRUNCATE_FILE;
+  if (removes_old_pools) {
+    ops_required += item.old_pools.size();
+  }
+
+  return ops_required;
+}
+
+/**
+ * Tell whether another journal entry may be started right now.
+ *
+ * @return false when readonly, when ops_in_flight has reached
+ *         max_purge_ops, or when the number of in-flight items has
+ *         reached mds_max_purge_files; true otherwise. With nothing in
+ *         flight and a non-zero mds_max_purge_files the answer is always
+ *         true, regardless of the op limit.
+ */
+bool PurgeQueue::_can_consume()
+{
+  if (readonly) {
+    dout(10) << "can't consume: PurgeQueue is readonly" << dendl;
+    return false;
+  }
+
+  dout(20) << ops_in_flight << "/" << max_purge_ops << " ops, "
+	   << in_flight.size() << "/" << g_conf()->mds_max_purge_files
+	   << " files" << dendl;
+
+  // Always permit consumption if nothing is in flight, so that the ops
+  // limit can never be so low as to forbid all progress (unless
+  // administrator has deliberately paused purging by setting max
+  // purge files to zero).
+  const bool idle = in_flight.size() == 0;
+  if (idle && cct->_conf->mds_max_purge_files > 0)
+    return true;
+
+  if (ops_in_flight >= max_purge_ops) {
+    dout(20) << "Throttling on op limit " << ops_in_flight << "/"
+             << max_purge_ops << dendl;
+    return false;
+  }
+
+  if (in_flight.size() < cct->_conf->mds_max_purge_files)
+    return true;
+
+  dout(20) << "Throttling on item limit " << in_flight.size()
+           << "/" << cct->_conf->mds_max_purge_files << dendl;
+  return false;
+}
+
+/**
+ * Switch the queue to readonly after an internal I/O failure.
+ *
+ * Idempotent: a second call is a no-op. Notifies the error handler,
+ * marks the journaler readonly and completes everything in
+ * waiting_for_recovery with `r`.
+ *
+ * @param r negative errno describing the failure (it is negated for
+ *          strerror())
+ */
+void PurgeQueue::_go_readonly(int r)
+{
+  if (readonly) return;
+  dout(1) << "going readonly because internal IO failed: " << strerror(-r) << dendl;
+  readonly = true;
+  // on_error is single-use and may already have been consumed by the
+  // -EBLACKLISTED path in _execute_item(), which clears it without
+  // setting readonly.
+  if (on_error) {
+    on_error->complete(r);
+    on_error = nullptr;
+  }
+  journaler.set_readonly();
+  finish_contexts(g_ceph_context, waiting_for_recovery, r);
+}
+
+/**
+ * Read and execute journal entries for as long as the throttles allow.
+ *
+ * Must be called with `lock` held. If the journal is not readable yet, a
+ * waiter is registered that re-enters this function later.
+ *
+ * @return true if at least one entry was read from the journal during
+ *         this call, false otherwise
+ */
+bool PurgeQueue::_consume()
+{
+  ceph_assert(lock.is_locked_by_me());
+
+  bool could_consume = false;
+  while (_can_consume()) {
+
+    if (delayed_flush) {
+      // We are now going to read from the journal, so any proactive
+      // flush is no longer necessary.  This is not functionally necessary
+      // but it can avoid generating extra fragmented flush IOs.
+      timer.cancel_event(delayed_flush);
+      delayed_flush = nullptr;
+    }
+
+    if (int r = journaler.get_error()) {
+      derr << "Error " << r << " recovering write_pos" << dendl;
+      _go_readonly(r);
+      return could_consume;
+    }
+
+    if (!journaler.is_readable()) {
+      dout(10) << " not readable right now" << dendl;
+      // Because we are the writer and the reader of the journal
+      // via the same Journaler instance, we never need to reread_head
+      if (!journaler.have_waiter()) {
+        journaler.wait_for_readable(new FunctionContext([this](int r) {
+          std::lock_guard l(lock);
+          if (r == 0) {
+            _consume();
+          } else if (r != -EAGAIN) {
+            _go_readonly(r);
+          }
+        }));
+      }
+
+      return could_consume;
+    }
+
+    could_consume = true;
+    // The journaler is readable: consume an entry
+    bufferlist bl;
+    bool readable = journaler.try_read_entry(bl);
+    ceph_assert(readable);  // we checked earlier
+
+    dout(20) << " decoding entry" << dendl;
+    PurgeItem item;
+    auto q = bl.cbegin();
+    try {
+      decode(item, q);
+    } catch (const buffer::error &err) {
+      derr << "Decode error at read_pos=0x" << std::hex
+           << journaler.get_read_pos() << dendl;
+      // _go_readonly() expects a negative errno. The item is only
+      // partially decoded at this point, so it must not be executed:
+      // stop here instead of falling through.
+      _go_readonly(-EIO);
+      return could_consume;
+    }
+    dout(20) << " executing item (" << item.ino << ")" << dendl;
+    _execute_item(item, journaler.get_read_pos());
+  }
+
+  dout(10) << " cannot consume right now" << dendl;
+
+  return could_consume;
+}
+
+/**
+ * Start the object operations for one purge queue entry.
+ *
+ * The item is recorded in in_flight under `expire_to`, its op estimate is
+ * added to ops_in_flight, and the operations for its action are gathered:
+ *  - PURGE_FILE: purge the data objects, then remove the backtrace object
+ *    from the layout pool (when not already covered) and from each old pool;
+ *  - PURGE_DIR: remove one object per fragtree leaf plus the root frag;
+ *  - TRUNCATE_FILE: purge objects 1..num-1 and zero the first object;
+ *  - any other action: log, undo the accounting and return.
+ * When the gather completes, the item is retired through
+ * _execute_item_complete() and consumption continues.
+ *
+ * Must be called with `lock` held.
+ *
+ * @param item       decoded purge item
+ * @param expire_to  journal position just past this entry; used as the
+ *                   in_flight key
+ */
+void PurgeQueue::_execute_item(
+    const PurgeItem &item,
+    uint64_t expire_to)
+{
+  ceph_assert(lock.is_locked_by_me());
+
+  in_flight[expire_to] = item;
+  logger->set(l_pq_executing, in_flight.size());
+  files_high_water = std::max(files_high_water, in_flight.size());
+  logger->set(l_pq_executing_high_water, files_high_water);
+  auto ops = _calculate_ops(item);
+  ops_in_flight += ops;
+  logger->set(l_pq_executing_ops, ops_in_flight);
+  ops_high_water = std::max(ops_high_water, ops_in_flight);
+  logger->set(l_pq_executing_ops_high_water, ops_high_water);
+
+  SnapContext nullsnapc;
+
+  C_GatherBuilder gather(cct);
+  if (item.action == PurgeItem::PURGE_FILE) {
+    if (item.size > 0) {
+      uint64_t num = Striper::get_num_objects(item.layout, item.size);
+      dout(10) << " 0~" << item.size << " objects 0~" << num
+               << " snapc " << item.snapc << " on " << item.ino << dendl;
+      filer.purge_range(item.ino, &item.layout, item.snapc,
+                        0, num, ceph::real_clock::now(), 0,
+                        gather.new_sub());
+    }
+
+    // remove the backtrace object if it was not purged
+    object_t oid = CInode::get_object_name(item.ino, frag_t(), "");
+    if (!gather.has_subs() || !item.layout.pool_ns.empty()) {
+      object_locator_t oloc(item.layout.pool_id);
+      dout(10) << " remove backtrace object " << oid
+               << " pool " << oloc.pool << " snapc " << item.snapc << dendl;
+      objecter->remove(oid, oloc, item.snapc,
+                       ceph::real_clock::now(), 0,
+                       gather.new_sub());
+    }
+
+    // remove old backtrace objects
+    for (const auto &p : item.old_pools) {
+      object_locator_t oloc(p);
+      dout(10) << " remove backtrace object " << oid
+               << " old pool " << p << " snapc " << item.snapc << dendl;
+      objecter->remove(oid, oloc, item.snapc,
+                       ceph::real_clock::now(), 0,
+                       gather.new_sub());
+    }
+  } else if (item.action == PurgeItem::PURGE_DIR) {
+    object_locator_t oloc(metadata_pool);
+    frag_vec_t leaves;
+    if (!item.fragtree.is_leaf(frag_t()))
+      item.fragtree.get_leaves(leaves);
+    leaves.push_back(frag_t());
+    for (const auto &leaf : leaves) {
+      object_t oid = CInode::get_object_name(item.ino, leaf, "");
+      dout(10) << " remove dirfrag " << oid << dendl;
+      objecter->remove(oid, oloc, nullsnapc,
+                       ceph::real_clock::now(),
+                       0, gather.new_sub());
+    }
+  } else if (item.action == PurgeItem::TRUNCATE_FILE) {
+    const uint64_t num = Striper::get_num_objects(item.layout, item.size);
+    dout(10) << " 0~" << item.size << " objects 0~" << num
+             << " snapc " << item.snapc << " on " << item.ino << dendl;
+
+    // keep backtrace object
+    if (num > 1) {
+      filer.purge_range(item.ino, &item.layout, item.snapc,
+                        1, num - 1, ceph::real_clock::now(),
+                        0, gather.new_sub());
+    }
+    filer.zero(item.ino, &item.layout, item.snapc,
+               0, item.layout.object_size,
+               ceph::real_clock::now(),
+               0, true, gather.new_sub());
+  } else {
+    derr << "Invalid item (action=" << item.action << ") in purge queue, "
+            "dropping it" << dendl;
+    // Undo the accounting done at the top of this function.
+    ops_in_flight -= ops;
+    logger->set(l_pq_executing_ops, ops_in_flight);
+    ops_high_water = std::max(ops_high_water, ops_in_flight);
+    logger->set(l_pq_executing_ops_high_water, ops_high_water);
+    in_flight.erase(expire_to);
+    logger->set(l_pq_executing, in_flight.size());
+    files_high_water = std::max(files_high_water, in_flight.size());
+    logger->set(l_pq_executing_high_water, files_high_water);
+    return;
+  }
+  ceph_assert(gather.has_subs());
+
+  gather.set_finisher(new C_OnFinisher(
+                      new FunctionContext([this, expire_to](int r){
+    std::lock_guard l(lock);
+
+    if (r == -EBLACKLISTED) {
+      // on_error is single-use: it is cleared here and in
+      // _go_readonly(). Several in-flight items can fail this way, so
+      // make sure a null context is never queued on the finisher.
+      if (on_error) {
+        finisher.queue(on_error, r);
+        on_error = nullptr;
+      }
+      return;
+    }
+
+    _execute_item_complete(expire_to);
+    _consume();
+
+    // Have we gone idle?  If so, do an extra write_head now instead of
+    // waiting for next flush after journaler_write_head_interval.
+    // Also do this periodically even if not idle, so that the persisted
+    // expire_pos doesn't fall too far behind our progress when consuming
+    // a very long queue.
+    if (in_flight.empty() || journaler.write_head_needed()) {
+      journaler.write_head(nullptr);
+    }
+  }), &finisher));
+
+  gather.activate();
+}
+
+void PurgeQueue::_execute_item_complete(
+ uint64_t expire_to)
+{ /*
+   * Retires the in-flight item keyed by expire_to: advances the journal's
+   * expire position when the oldest in-flight item has finished, otherwise
+   * records the position in pending_expire for later. It then subtracts the
+   * item's op estimate, removes it from in_flight and refreshes the perf
+   * counters. The caller must hold lock.
+   */
+ ceph_assert(lock.is_locked_by_me());
+ dout(10) << "complete at 0x" << std::hex << expire_to << std::dec << dendl;
+ ceph_assert(in_flight.count(expire_to) == 1);
+
+ auto iter = in_flight.find(expire_to);
+ ceph_assert(iter != in_flight.end());
+ if (iter == in_flight.begin()) { // the oldest in-flight item finished: the expire position can move
+ uint64_t pos = expire_to;
+ if (!pending_expire.empty()) {
+ auto n = iter;
+ ++n;
+ if (n == in_flight.end()) { // nothing else in flight: take the highest pending position
+ pos = *pending_expire.rbegin();
+ pending_expire.clear();
+ } else { // take pending positions that lie below the next in-flight item
+ auto p = pending_expire.begin();
+ do {
+ if (*p >= n->first)
+ break;
+ pos = *p;
+ pending_expire.erase(p++);
+ } while (p != pending_expire.end());
+ }
+ }
+ dout(10) << "expiring to 0x" << std::hex << pos << std::dec << dendl;
+ journaler.set_expire_pos(pos);
+ } else {
+ // This is completely fine, we're not supposed to purge files in
+ // order when doing them in parallel.
+ dout(10) << "non-sequential completion, not expiring anything" << dendl;
+ pending_expire.insert(expire_to); // kept until every older item has completed
+ }
+
+ ops_in_flight -= _calculate_ops(iter->second);
+ logger->set(l_pq_executing_ops, ops_in_flight);
+ ops_high_water = std::max(ops_high_water, ops_in_flight);
+ logger->set(l_pq_executing_ops_high_water, ops_high_water);
+
+ dout(10) << "completed item for ino " << iter->second.ino << dendl;
+
+ in_flight.erase(iter);
+ logger->set(l_pq_executing, in_flight.size());
+ files_high_water = std::max(files_high_water, in_flight.size());
+ logger->set(l_pq_executing_high_water, files_high_water);
+ dout(10) << "in_flight.size() now " << in_flight.size() << dendl;
+
+ logger->inc(l_pq_executed);
+}
+
+/**
+ * Recompute max_purge_ops from the current maps and configuration.
+ *
+ * The limit is (PGs across the MDSMap's data pools / max_mds) multiplied
+ * by mds_max_purge_ops_per_pg, then capped by mds_max_purge_ops when that
+ * option is non-zero. Does nothing when the queue is readonly.
+ */
+void PurgeQueue::update_op_limit(const MDSMap &mds_map)
+{
+  std::lock_guard guard(lock);
+
+  if (readonly) {
+    dout(10) << "skipping; PurgeQueue is readonly" << dendl;
+    return;
+  }
+
+  uint64_t pg_count = 0;
+  objecter->with_osdmap([&](const OSDMap& o) {
+    // Number of PGs across all data pools
+    for (const auto dp : mds_map.get_data_pools()) {
+      if (o.get_pg_pool(dp) != NULL) {
+        pg_count += o.get_pg_num(dp);
+      } else {
+        // It is possible that we have an older OSDMap than MDSMap,
+        // because we don't start watching every OSDMap until after
+        // MDSRank is initialized
+        dout(4) << " data pool " << dp << " not found in OSDMap" << dendl;
+      }
+    }
+  });
+
+  // Work out a limit based on n_pgs / n_mdss, multiplied by the user's
+  // preference for how many ops per PG
+  const double pgs_per_mds = (double)pg_count / (double)mds_map.get_max_mds();
+  max_purge_ops = uint64_t(pgs_per_mds *
+                           cct->_conf->mds_max_purge_ops_per_pg);
+
+  // User may also specify a hard limit, apply this if so.
+  if (cct->_conf->mds_max_purge_ops) {
+    max_purge_ops = std::min(max_purge_ops, cct->_conf->mds_max_purge_ops);
+  }
+}
+
+/**
+ * React to configuration changes relevant to the purge queue.
+ *
+ * A change to either op-limit option recomputes the op limit. Otherwise,
+ * a change to mds_max_purge_files schedules a consume pass if nothing is
+ * in flight.
+ */
+void PurgeQueue::handle_conf_change(const std::set<std::string>& changed, const MDSMap& mds_map)
+{
+  const bool op_limit_changed = changed.count("mds_max_purge_ops")
+    || changed.count("mds_max_purge_ops_per_pg");
+  if (op_limit_changed) {
+    update_op_limit(mds_map);
+    return;
+  }
+
+  if (!changed.count("mds_max_purge_files"))
+    return;
+
+  std::lock_guard guard(lock);
+  if (!in_flight.empty())
+    return;
+
+  // We might have gone from zero to a finite limit, so
+  // might need to kick off consume.
+  dout(4) << "maybe start work again (max_purge_files="
+          << g_conf()->mds_max_purge_files << dendl;
+  finisher.queue(new FunctionContext([this](int r){
+    std::lock_guard guard(lock);
+    _consume();
+  }));
+}
+
+/**
+ * Report drain progress and, on the first call with work outstanding,
+ * raise the op throttle so the queue empties as fast as possible.
+ *
+ * @param progress         out: bytes consumed since draining started
+ * @param progress_total   out: largest backlog seen while draining
+ * @param in_flight_count  out: number of items currently in flight
+ * @return true if the queue is readonly or fully drained (the out
+ *         parameters are then left untouched), false otherwise
+ */
+bool PurgeQueue::drain(
+    uint64_t *progress,
+    uint64_t *progress_total,
+    size_t *in_flight_count
+    )
+{
+  std::lock_guard guard(lock);
+
+  if (readonly) {
+    dout(10) << "skipping drain; PurgeQueue is readonly" << dendl;
+    return true;
+  }
+
+  ceph_assert(progress != nullptr);
+  ceph_assert(progress_total != nullptr);
+  ceph_assert(in_flight_count != nullptr);
+
+  if (in_flight.empty() &&
+      journaler.get_read_pos() == journaler.get_write_pos()) {
+    return true;
+  }
+
+  const uint64_t bytes_remaining =
+    journaler.get_write_pos() - journaler.get_read_pos();
+
+  if (!draining) {
+    // Start of draining: from here on the backlog is tracked so that a
+    // progress figure can be reported.
+    draining = true;
+
+    // Lift the op throttle: this daemon now has nothing to do but drain
+    // the purge queue, so do it as fast as we can.
+    max_purge_ops = 0xffff;
+  }
+
+  drain_initial = std::max(bytes_remaining, drain_initial);
+
+  *progress = drain_initial - bytes_remaining;
+  *progress_total = drain_initial;
+  *in_flight_count = in_flight.size();
+
+  return false;
+}
+
+/**
+ * Return the symbolic name of this item's action, or "UNKNOWN" for a
+ * value outside the Action enum.
+ */
+std::string_view PurgeItem::get_type_str() const
+{
+  std::string_view name = "UNKNOWN";
+  switch (action) {
+  case PurgeItem::NONE:
+    name = "NONE";
+    break;
+  case PurgeItem::PURGE_FILE:
+    name = "PURGE_FILE";
+    break;
+  case PurgeItem::PURGE_DIR:
+    name = "PURGE_DIR";
+    break;
+  case PurgeItem::TRUNCATE_FILE:
+    name = "TRUNCATE_FILE";
+    break;
+  default:
+    break;
+  }
+  return name;
+}
+
diff --git a/src/mds/PurgeQueue.h b/src/mds/PurgeQueue.h
new file mode 100644
index 00000000..9a603a26
--- /dev/null
+++ b/src/mds/PurgeQueue.h
@@ -0,0 +1,228 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2015 Red Hat
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#ifndef PURGE_QUEUE_H_
+#define PURGE_QUEUE_H_
+
+#include "include/compact_set.h"
+#include "mds/MDSMap.h"
+#include "osdc/Journaler.h"
+
+
+/**
+ * Descriptor of the work associated with purging a file. We record
+ * the minimal amount of information from the inode such as the size
+ * and layout: all other un-needed inode metadata (times, permissions, etc)
+ * has been discarded.
+ */
+class PurgeItem
+{
+public:
+ enum Action : uint8_t {
+ NONE = 0,
+ PURGE_FILE = 1,
+ TRUNCATE_FILE,
+ PURGE_DIR
+ };
+
+ utime_t stamp;
+ // A NONE PurgeItem serves as a no-op for splicing out journal entries,
+ // so there has to be a "pad_size" to specify the size of the journal
+ // space to be spliced.
+ uint32_t pad_size;
+ Action action;
+ inodeno_t ino;
+ uint64_t size;
+ file_layout_t layout;
+ compact_set<int64_t> old_pools;
+ SnapContext snapc;
+ fragtree_t fragtree;
+
+ PurgeItem()
+ : pad_size(0), action(NONE), ino(0), size(0)
+ {}
+
+ void encode(bufferlist &bl) const;
+ void decode(bufferlist::const_iterator &p);
+
+ static Action str_to_type(std::string_view str) {
+ return PurgeItem::actions.at(std::string(str));
+ }
+
+ void dump(Formatter *f) const
+ {
+ f->dump_int("action", action);
+ f->dump_int("ino", ino);
+ f->dump_int("size", size);
+ f->open_object_section("layout");
+ layout.dump(f);
+ f->close_section();
+ f->open_object_section("SnapContext");
+ snapc.dump(f);
+ f->close_section();
+ f->open_object_section("fragtree");
+ fragtree.dump(f);
+ f->close_section();
+ }
+
+ std::string_view get_type_str() const;
+private:
+ static const std::map<std::string, PurgeItem::Action> actions;
+};
+WRITE_CLASS_ENCODER(PurgeItem)
+
+enum {
+ l_pq_first = 3500,
+
+ // How many items have been finished by PurgeQueue
+ l_pq_executing_ops,
+ l_pq_executing_ops_high_water,
+ l_pq_executing,
+ l_pq_executing_high_water,
+ l_pq_executed,
+ l_pq_last
+};
+
+/**
+ * A persistent queue of PurgeItems. This class both writes and reads
+ * to the queue. There is one of these per MDS rank.
+ *
+ * Note that this class does not take a reference to MDSRank: we are
+ * independent of all the metadata structures and do not need to
+ * take mds_lock for anything.
+ */
+class PurgeQueue
+{
+private:
+ CephContext *cct;
+ const mds_rank_t rank;
+ Mutex lock;
+ bool readonly = false;
+
+ int64_t metadata_pool;
+
+ // Don't use the MDSDaemon's Finisher and Timer, because this class
+ // operates outside of MDSDaemon::mds_lock
+ Finisher finisher;
+ SafeTimer timer;
+ Filer filer;
+ Objecter *objecter;
+ std::unique_ptr<PerfCounters> logger;
+
+ Journaler journaler;
+
+ Context *on_error;
+
+ // Map of Journaler offset to PurgeItem
+ std::map<uint64_t, PurgeItem> in_flight;
+
+ std::set<uint64_t> pending_expire;
+
+ // Throttled allowances
+ uint64_t ops_in_flight;
+
+ // Dynamic op limit per MDS based on PG count
+ uint64_t max_purge_ops;
+
+ uint32_t _calculate_ops(const PurgeItem &item) const;
+
+ bool _can_consume();
+
+ // How many bytes were remaining when drain() was first called,
+ // used for indicating progress.
+ uint64_t drain_initial;
+
+ // Has drain() ever been called on this instance?
+ bool draining;
+
+ // recover the journal write_pos (drop any partial written entry)
+ void _recover();
+
+ /**
+ * @return true if we were in a position to try and consume something:
+ * does not mean we necessarily did.
+ */
+ bool _consume();
+
+ // Do we currently have a flush timer event waiting?
+ Context *delayed_flush;
+
+ void _execute_item(
+ const PurgeItem &item,
+ uint64_t expire_to);
+ void _execute_item_complete(
+ uint64_t expire_to);
+
+ bool recovered;
+ std::list<Context*> waiting_for_recovery;
+
+ void _go_readonly(int r);
+
+ uint64_t ops_high_water = 0;
+ uint64_t files_high_water = 0;
+
+public:
+ void init();
+ void activate();
+ void shutdown();
+
+ void create_logger();
+
+ // Write an empty queue, use this during MDS rank creation
+ void create(Context *completion);
+
+ // Read the Journaler header for an existing queue and start consuming
+ void open(Context *completion);
+
+ void wait_for_recovery(Context *c);
+
+ // Submit one entry to the work queue. Call back when it is persisted
+ // to the queue (there is no callback for when it is executed)
+ void push(const PurgeItem &pi, Context *completion);
+
+ // If the on-disk queue is empty and we are not currently processing
+ // anything.
+ bool is_idle() const;
+
+ /**
+ * Signal to the PurgeQueue that you would like it to hurry up and
+ * finish consuming everything in the queue. Provides progress
+ * feedback.
+ *
+ * @param progress: bytes consumed since we started draining
+ * @param progress_total: max bytes that were outstanding during purge
+ * @param in_flight_count: number of file purges currently in flight
+ *
+ * @returns true if drain is complete
+ */
+ bool drain(
+ uint64_t *progress,
+ uint64_t *progress_total,
+ size_t *in_flight_count);
+
+ void update_op_limit(const MDSMap &mds_map);
+
+ void handle_conf_change(const std::set<std::string>& changed, const MDSMap& mds_map);
+
+ PurgeQueue(
+ CephContext *cct_,
+ mds_rank_t rank_,
+ const int64_t metadata_pool_,
+ Objecter *objecter_,
+ Context *on_error);
+ ~PurgeQueue();
+};
+
+#endif
+
diff --git a/src/mds/RecoveryQueue.cc b/src/mds/RecoveryQueue.cc
new file mode 100644
index 00000000..e02de367
--- /dev/null
+++ b/src/mds/RecoveryQueue.cc
@@ -0,0 +1,237 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#include "CInode.h"
+#include "MDCache.h"
+#include "MDSRank.h"
+#include "Locker.h"
+#include "osdc/Filer.h"
+
+#include "RecoveryQueue.h"
+
+#define dout_context g_ceph_context
+#define dout_subsys ceph_subsys_mds
+#undef dout_prefix
+#define dout_prefix *_dout << "mds." << mds->get_nodeid() << " RecoveryQueue::" << __func__ << " "
+
+class C_MDC_Recover : public MDSIOContextBase {
+protected:
+ RecoveryQueue *rq;
+ CInode *in;
+ void finish(int r) override {
+ rq->_recovered(in, r, size, mtime);
+ }
+
+ MDSRank *get_mds() override {
+ return rq->mds;
+ }
+
+public:
+ uint64_t size;
+ utime_t mtime;
+
+ C_MDC_Recover(RecoveryQueue *rq_, CInode *i) :
+ MDSIOContextBase(false), rq(rq_), in(i), size(0) {
+ ceph_assert(rq != NULL);
+ }
+ void print(ostream& out) const override {
+ out << "file_recover(" << in->ino() << ")";
+ }
+};
+
+
+RecoveryQueue::RecoveryQueue(MDSRank *mds_) :
+ file_recover_queue(member_offset(CInode, item_dirty_dirfrag_dir)),
+ file_recover_queue_front(member_offset(CInode, item_dirty_dirfrag_nest)),
+ mds(mds_), logger(NULL), filer(mds_->objecter, mds_->finisher)
+{ }
+
+
+/**
+ * Progress the queue. Call this after enqueuing something or on
+ * completion of something.
+ */
+void RecoveryQueue::advance()
+{
+ dout(10) << file_recover_queue_size << " queued, "
+ << file_recover_queue_front_size << " prioritized, "
+ << file_recovering.size() << " recovering" << dendl;
+
+ while (file_recovering.size() < g_conf()->mds_max_file_recover) {
+ if (!file_recover_queue_front.empty()) {
+ CInode *in = file_recover_queue_front.front();
+ in->item_recover_queue_front.remove_myself();
+ file_recover_queue_front_size--;
+ _start(in);
+ } else if (!file_recover_queue.empty()) {
+ CInode *in = file_recover_queue.front();
+ in->item_recover_queue.remove_myself();
+ file_recover_queue_size--;
+ _start(in);
+ } else {
+ break;
+ }
+ }
+
+ logger->set(l_mdc_num_recovering_processing, file_recovering.size());
+ logger->set(l_mdc_num_recovering_enqueued, file_recover_queue_size + file_recover_queue_front_size);
+ logger->set(l_mdc_num_recovering_prioritized, file_recover_queue_front_size);
+}
+
+void RecoveryQueue::_start(CInode *in)
+{
+ auto pi = in->get_projected_inode();
+
+ // blech
+ if (pi->client_ranges.size() && !pi->get_max_size()) {
+ mds->clog->warn() << "bad client_range " << pi->client_ranges
+ << " on ino " << pi->ino;
+ }
+
+ auto p = file_recovering.find(in);
+ if (pi->client_ranges.size() && pi->get_max_size()) {
+ dout(10) << "starting " << in->inode.size << " " << pi->client_ranges
+ << " " << *in << dendl;
+ if (p == file_recovering.end()) {
+ file_recovering.insert(make_pair(in, false));
+
+ C_MDC_Recover *fin = new C_MDC_Recover(this, in);
+ filer.probe(in->inode.ino, &in->inode.layout, in->last,
+ pi->get_max_size(), &fin->size, &fin->mtime, false,
+ 0, fin);
+ } else {
+ p->second = true;
+ dout(10) << "already working on " << *in << ", set need_restart flag" << dendl;
+ }
+ } else {
+ dout(10) << "skipping " << in->inode.size << " " << *in << dendl;
+ if (p == file_recovering.end()) {
+ in->state_clear(CInode::STATE_RECOVERING);
+ mds->locker->eval(in, CEPH_LOCK_IFILE);
+ in->auth_unpin(this);
+ }
+ }
+}
+
+void RecoveryQueue::prioritize(CInode *in)
+{
+ if (file_recovering.count(in)) {
+ dout(10) << "already working on " << *in << dendl;
+ return;
+ }
+
+ if (!in->item_recover_queue_front.is_on_list()) {
+ dout(20) << *in << dendl;
+
+ ceph_assert(in->item_recover_queue.is_on_list());
+ in->item_recover_queue.remove_myself();
+ file_recover_queue_size--;
+
+ file_recover_queue_front.push_back(&in->item_recover_queue_front);
+
+ file_recover_queue_front_size++;
+ logger->set(l_mdc_num_recovering_prioritized, file_recover_queue_front_size);
+ return;
+ }
+
+ dout(10) << "not queued " << *in << dendl;
+}
+
+static bool _is_in_any_recover_queue(CInode *in)
+{
+ return in->item_recover_queue.is_on_list() ||
+ in->item_recover_queue_front.is_on_list();
+}
+
+/**
+ * Given an authoritative inode which is in the cache,
+ * enqueue it for recovery.
+ */
+void RecoveryQueue::enqueue(CInode *in)
+{
+ dout(15) << "RecoveryQueue::enqueue " << *in << dendl;
+ ceph_assert(logger); // Caller should have done set_logger before using me
+ ceph_assert(in->is_auth());
+
+ in->state_clear(CInode::STATE_NEEDSRECOVER);
+ if (!in->state_test(CInode::STATE_RECOVERING)) {
+ in->state_set(CInode::STATE_RECOVERING);
+ in->auth_pin(this);
+ logger->inc(l_mdc_recovery_started);
+ }
+
+ if (!_is_in_any_recover_queue(in)) {
+ file_recover_queue.push_back(&in->item_recover_queue);
+ file_recover_queue_size++;
+ logger->set(l_mdc_num_recovering_enqueued, file_recover_queue_size + file_recover_queue_front_size);
+ }
+}
+
+
+/**
+ * Call back on completion of Filer probe on an inode.
+ */
+void RecoveryQueue::_recovered(CInode *in, int r, uint64_t size, utime_t mtime)
+{
+ dout(10) << "_recovered r=" << r << " size=" << size << " mtime=" << mtime
+ << " for " << *in << dendl;
+
+ if (r != 0) {
+ dout(0) << "recovery error! " << r << dendl;
+ if (r == -EBLACKLISTED) {
+ mds->respawn();
+ return;
+ } else {
+ // Something wrong on the OSD side trying to recover the size
+ // of this inode. In principle we could record this as a piece
+ // of per-inode damage, but it's actually more likely that
+ // this indicates something wrong with the MDS (like maybe
+ // it has the wrong auth caps?)
+ mds->clog->error() << " OSD read error while recovering size"
+ " for inode " << in->ino();
+ mds->damaged();
+ }
+ }
+
+ auto p = file_recovering.find(in);
+ ceph_assert(p != file_recovering.end());
+ bool restart = p->second;
+ file_recovering.erase(p);
+
+ logger->set(l_mdc_num_recovering_processing, file_recovering.size());
+ logger->inc(l_mdc_recovery_completed);
+ in->state_clear(CInode::STATE_RECOVERING);
+
+ if (restart) {
+ if (in->item_recover_queue.is_on_list()) {
+ in->item_recover_queue.remove_myself();
+ file_recover_queue_size--;
+ }
+ if (in->item_recover_queue_front.is_on_list()) {
+ in->item_recover_queue_front.remove_myself();
+ file_recover_queue_front_size--;
+ }
+ logger->set(l_mdc_num_recovering_enqueued, file_recover_queue_size + file_recover_queue_front_size);
+ logger->set(l_mdc_num_recovering_prioritized, file_recover_queue_front_size);
+ _start(in);
+ } else if (!_is_in_any_recover_queue(in)) {
+ // journal
+ mds->locker->check_inode_max_size(in, true, 0, size, mtime);
+ mds->locker->eval(in, CEPH_LOCK_IFILE);
+ in->auth_unpin(this);
+ }
+
+ advance();
+}
+
diff --git a/src/mds/RecoveryQueue.h b/src/mds/RecoveryQueue.h
new file mode 100644
index 00000000..a1e6ac48
--- /dev/null
+++ b/src/mds/RecoveryQueue.h
@@ -0,0 +1,55 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+//class C_MDC_Recover;
+//
+#ifndef RECOVERY_QUEUE_H
+#define RECOVERY_QUEUE_H
+
+#include <set>
+
+#include "osdc/Filer.h"
+
+class CInode;
+class MDSRank;
+class PerfCounters;
+
+class RecoveryQueue {
+public:
+ void enqueue(CInode *in);
+ void advance();
+ void prioritize(CInode *in); ///< do this inode now/soon
+ explicit RecoveryQueue(MDSRank *mds_);
+
+ void set_logger(PerfCounters *p) {logger=p;}
+
+private:
+ void _start(CInode *in); ///< start recovering this file
+ void _recovered(CInode *in, int r, uint64_t size, utime_t mtime);
+
+ size_t file_recover_queue_size = 0;
+ size_t file_recover_queue_front_size = 0;
+
+ elist<CInode*> file_recover_queue; ///< the queue
+ elist<CInode*> file_recover_queue_front; ///< elevated priority items
+ std::map<CInode*, bool> file_recovering; // inode -> need_restart
+
+ MDSRank *mds;
+ PerfCounters *logger;
+ Filer filer;
+
+ friend class C_MDC_Recover;
+};
+
+#endif // RECOVERY_QUEUE_H
diff --git a/src/mds/ScatterLock.h b/src/mds/ScatterLock.h
new file mode 100644
index 00000000..f2fe7938
--- /dev/null
+++ b/src/mds/ScatterLock.h
@@ -0,0 +1,255 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+
+#ifndef CEPH_SCATTERLOCK_H
+#define CEPH_SCATTERLOCK_H
+
+#include "SimpleLock.h"
+
+#include "MDSContext.h"
+
+class ScatterLock : public SimpleLock {
+
+ struct more_bits_t {
+ xlist<ScatterLock*>::item item_updated;
+ utime_t update_stamp;
+
+ explicit more_bits_t(ScatterLock *lock) :
+ item_updated(lock)
+ {}
+ };
+
+ mutable std::unique_ptr<more_bits_t> _more;
+
+ more_bits_t *more() {
+ if (!_more)
+ _more.reset(new more_bits_t(this));
+ return _more.get();
+ }
+
+ enum {
+ SCATTER_WANTED = 1 << 8,
+ UNSCATTER_WANTED = 1 << 9,
+ DIRTY = 1 << 10,
+ FLUSHING = 1 << 11,
+ FLUSHED = 1 << 12,
+ };
+
+public:
+ ScatterLock(MDSCacheObject *o, LockType *lt) :
+ SimpleLock(o, lt) {}
+ ~ScatterLock() override {
+ ceph_assert(!_more);
+ }
+
+ bool is_scatterlock() const override {
+ return true;
+ }
+
+ bool is_sync_and_unlocked() const {
+ return
+ SimpleLock::is_sync_and_unlocked() &&
+ !is_dirty() &&
+ !is_flushing();
+ }
+
+ bool can_scatter_pin(client_t loner) {
+ /*
+ LOCK : NOT okay because it can MIX and force replicas to journal something
+ TSYN : also not okay for same reason
+ EXCL : also not okay
+
+ MIX : okay, replica can stall before sending AC_SYNCACK
+ SYNC : okay, replica can stall before sending AC_MIXACK or AC_LOCKACK
+ */
+ return
+ get_state() == LOCK_SYNC ||
+ get_state() == LOCK_MIX;
+ }
+
+ void set_xlock_snap_sync(MDSContext *c)
+ {
+ ceph_assert(get_type() == CEPH_LOCK_IFILE);
+ ceph_assert(state == LOCK_XLOCK || state == LOCK_XLOCKDONE);
+ state = LOCK_XLOCKSNAP;
+ add_waiter(WAIT_STABLE, c);
+ }
+
+ xlist<ScatterLock*>::item *get_updated_item() { return &more()->item_updated; }
+
+ utime_t get_update_stamp() {
+ return _more ? _more->update_stamp : utime_t();
+ }
+
+ void set_update_stamp(utime_t t) { more()->update_stamp = t; }
+
+ void set_scatter_wanted() {
+ state_flags |= SCATTER_WANTED;
+ }
+ void set_unscatter_wanted() {
+ state_flags |= UNSCATTER_WANTED;
+ }
+ void clear_scatter_wanted() {
+ state_flags &= ~SCATTER_WANTED;
+ }
+ void clear_unscatter_wanted() {
+ state_flags &= ~UNSCATTER_WANTED;
+ }
+ bool get_scatter_wanted() const {
+ return state_flags & SCATTER_WANTED;
+ }
+ bool get_unscatter_wanted() const {
+ return state_flags & UNSCATTER_WANTED;
+ }
+
+ bool is_dirty() const override {
+ return state_flags & DIRTY;
+ }
+ bool is_flushing() const override {
+ return state_flags & FLUSHING;
+ }
+ bool is_flushed() const override {
+ return state_flags & FLUSHED;
+ }
+ bool is_dirty_or_flushing() const {
+ return is_dirty() || is_flushing();
+ }
+
+ void mark_dirty() {
+ if (!is_dirty()) {
+ if (!is_flushing())
+ parent->get(MDSCacheObject::PIN_DIRTYSCATTERED);
+ set_dirty();
+ }
+ }
+ void start_flush() {
+ if (is_dirty()) {
+ set_flushing();
+ clear_dirty();
+ }
+ }
+ void finish_flush() {
+ if (is_flushing()) {
+ clear_flushing();
+ set_flushed();
+ if (!is_dirty()) {
+ parent->put(MDSCacheObject::PIN_DIRTYSCATTERED);
+ parent->clear_dirty_scattered(get_type());
+ }
+ }
+ }
+ void clear_flushed() override {
+ state_flags &= ~FLUSHED;
+ }
+ void remove_dirty() {
+ start_flush();
+ finish_flush();
+ clear_flushed();
+ }
+
+ void infer_state_from_strong_rejoin(int rstate, bool locktoo) {
+ if (rstate == LOCK_MIX ||
+ rstate == LOCK_MIX_LOCK || // replica still has wrlocks?
+ rstate == LOCK_MIX_SYNC)
+ state = LOCK_MIX;
+ else if (locktoo && rstate == LOCK_LOCK)
+ state = LOCK_LOCK;
+ }
+
+ void encode_state_for_rejoin(bufferlist& bl, int rep) {
+ __s16 s = get_replica_state();
+ if (is_gathering(rep)) {
+ // the recovering mds may hold rejoined wrlocks
+ if (state == LOCK_MIX_SYNC)
+ s = LOCK_MIX_SYNC;
+ else
+ s = LOCK_MIX_LOCK;
+ }
+
+ // If there is a recovering mds that replicated an object when it failed
+ // and the scatterlock in the object was in MIX state, it's possible that
+ // the recovering mds needs to take a wrlock on the scatterlock when it
+ // replays unsafe requests. So this mds should delay taking an rdlock on
+ // the scatterlock until the recovering mds finishes replaying unsafe requests.
+ // Otherwise unsafe requests may get replayed after the current request.
+ //
+ // For example:
+ // The recovering mds is the auth mds of a dirfrag, and this mds is the auth
+ // mds of the corresponding inode. On 'rm -rf' of the directory, this mds
+ // should delay the rmdir request until the recovering mds has replayed the
+ // unlink requests.
+ if (s == LOCK_MIX || s == LOCK_MIX_LOCK || s == LOCK_MIX_SYNC)
+ mark_need_recover();
+
+ using ceph::encode;
+ encode(s, bl);
+ }
+
+ void decode_state_rejoin(bufferlist::const_iterator& p, MDSContext::vec& waiters, bool survivor) {
+ SimpleLock::decode_state_rejoin(p, waiters, survivor);
+ if (is_flushing()) {
+ set_dirty();
+ clear_flushing();
+ }
+ }
+
+ bool remove_replica(int from, bool rejoin) {
+ if (rejoin &&
+ (state == LOCK_MIX ||
+ state == LOCK_MIX_SYNC ||
+ state == LOCK_MIX_LOCK2 ||
+ state == LOCK_MIX_TSYN ||
+ state == LOCK_MIX_EXCL))
+ return false;
+ return SimpleLock::remove_replica(from);
+ }
+
+ void print(ostream& out) const override {
+ out << "(";
+ _print(out);
+ if (is_dirty())
+ out << " dirty";
+ if (is_flushing())
+ out << " flushing";
+ if (is_flushed())
+ out << " flushed";
+ if (get_scatter_wanted())
+ out << " scatter_wanted";
+ out << ")";
+ }
+
+private:
+ void set_flushing() {
+ state_flags |= FLUSHING;
+ }
+ void clear_flushing() {
+ state_flags &= ~FLUSHING;
+ }
+ void set_flushed() {
+ state_flags |= FLUSHED;
+ }
+ void set_dirty() {
+ state_flags |= DIRTY;
+ }
+ void clear_dirty() {
+ state_flags &= ~DIRTY;
+ if (_more) {
+ _more->item_updated.remove_myself();
+ _more.reset();
+ }
+ }
+};
+
+#endif
diff --git a/src/mds/ScrubHeader.h b/src/mds/ScrubHeader.h
new file mode 100644
index 00000000..f49598d8
--- /dev/null
+++ b/src/mds/ScrubHeader.h
@@ -0,0 +1,68 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2016 Red Hat Inc
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+
+#ifndef SCRUB_HEADER_H_
+#define SCRUB_HEADER_H_
+
+#include <string_view>
+
+class CInode;
+
+/**
+ * Externally input parameters for a scrub, associated with the root
+ * of where we are doing a recursive scrub
+ */
+class ScrubHeader {
+public:
+ ScrubHeader(std::string_view tag_, bool is_tag_internal_, bool force_,
+ bool recursive_, bool repair_, Formatter *f_)
+ : tag(tag_), is_tag_internal(is_tag_internal_), force(force_),
+ recursive(recursive_), repair(repair_), formatter(f_), origin(nullptr)
+ {
+ ceph_assert(formatter != nullptr);
+ }
+
+ // Set after construction because it won't be known until we've
+ // started resolving path and locking
+ void set_origin(CInode *origin_) { origin = origin_; }
+
+ bool get_recursive() const { return recursive; }
+ bool get_repair() const { return repair; }
+ bool get_force() const { return force; }
+ bool is_internal_tag() const { return is_tag_internal; }
+ CInode *get_origin() const { return origin; }
+ std::string_view get_tag() const { return tag; }
+ Formatter &get_formatter() const { return *formatter; }
+
+ bool get_repaired() const { return repaired; }
+ void set_repaired() { repaired = true; }
+
+protected:
+ const std::string tag;
+ bool is_tag_internal;
+ const bool force;
+ const bool recursive;
+ const bool repair;
+ Formatter * const formatter;
+ CInode *origin;
+
+ bool repaired = false; // May be set during scrub if repairs happened
+};
+
+typedef std::shared_ptr<ScrubHeader> ScrubHeaderRef;
+typedef std::shared_ptr<const ScrubHeader> ScrubHeaderRefConst;
+
+#endif // SCRUB_HEADER_H_
+
diff --git a/src/mds/ScrubStack.cc b/src/mds/ScrubStack.cc
new file mode 100644
index 00000000..2743347e
--- /dev/null
+++ b/src/mds/ScrubStack.cc
@@ -0,0 +1,755 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2014 Red Hat
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#include <iostream>
+
+#include "ScrubStack.h"
+#include "common/Finisher.h"
+#include "mds/MDSRank.h"
+#include "mds/MDCache.h"
+#include "mds/MDSContinuation.h"
+
+#define dout_context g_ceph_context
+#define dout_subsys ceph_subsys_mds
+#undef dout_prefix
+#define dout_prefix _prefix(_dout, scrubstack->mdcache->mds)
+static ostream& _prefix(std::ostream *_dout, MDSRank *mds) {
+ return *_dout << "mds." << mds->get_nodeid() << ".scrubstack ";
+}
+
+std::ostream &operator<<(std::ostream &os, const ScrubStack::State &state) {
+ switch(state) {
+ case ScrubStack::STATE_RUNNING:
+ os << "RUNNING";
+ break;
+ case ScrubStack::STATE_IDLE:
+ os << "IDLE";
+ break;
+ case ScrubStack::STATE_PAUSING:
+ os << "PAUSING";
+ break;
+ case ScrubStack::STATE_PAUSED:
+ os << "PAUSED";
+ break;
+ default:
+ ceph_abort();
+ }
+
+ return os;
+}
+
+void ScrubStack::push_inode(CInode *in)
+{
+ dout(20) << "pushing " << *in << " on top of ScrubStack" << dendl;
+ if (!in->item_scrub.is_on_list()) {
+ in->get(CInode::PIN_SCRUBQUEUE);
+ stack_size++;
+ }
+ inode_stack.push_front(&in->item_scrub);
+}
+
+void ScrubStack::push_inode_bottom(CInode *in)
+{
+ dout(20) << "pushing " << *in << " on bottom of ScrubStack" << dendl;
+ if (!in->item_scrub.is_on_list()) {
+ in->get(CInode::PIN_SCRUBQUEUE);
+ stack_size++;
+ }
+ inode_stack.push_back(&in->item_scrub);
+}
+
+void ScrubStack::pop_inode(CInode *in)
+{
+ dout(20) << "popping " << *in
+ << " off of ScrubStack" << dendl;
+ ceph_assert(in->item_scrub.is_on_list());
+ in->put(CInode::PIN_SCRUBQUEUE);
+ in->item_scrub.remove_myself();
+ stack_size--;
+}
+
+void ScrubStack::_enqueue_inode(CInode *in, CDentry *parent,
+ ScrubHeaderRef& header,
+ MDSContext *on_finish, bool top)
+{
+ dout(10) << __func__ << " with {" << *in << "}"
+ << ", on_finish=" << on_finish << ", top=" << top << dendl;
+ ceph_assert(mdcache->mds->mds_lock.is_locked_by_me());
+ in->scrub_initialize(parent, header, on_finish);
+ if (top)
+ push_inode(in);
+ else
+ push_inode_bottom(in);
+}
+
+void ScrubStack::enqueue_inode(CInode *in, ScrubHeaderRef& header,
+ MDSContext *on_finish, bool top)
+{
+ // abort in progress
+ if (clear_inode_stack) {
+ on_finish->complete(-EAGAIN);
+ return;
+ }
+
+ _enqueue_inode(in, NULL, header, on_finish, top);
+ kick_off_scrubs();
+}
+
+void ScrubStack::kick_off_scrubs()
+{
+ ceph_assert(mdcache->mds->mds_lock.is_locked());
+ dout(20) << __func__ << ": state=" << state << dendl;
+
+ if (clear_inode_stack || state == STATE_PAUSING || state == STATE_PAUSED) {
+ if (scrubs_in_progress == 0) {
+ dout(10) << __func__ << ": in progress scrub operations finished, "
+ << stack_size << " in the stack" << dendl;
+
+ State final_state = state;
+ if (clear_inode_stack) {
+ abort_pending_scrubs();
+ final_state = STATE_IDLE;
+ }
+ if (state == STATE_PAUSING) {
+ final_state = STATE_PAUSED;
+ }
+
+ set_state(final_state);
+ complete_control_contexts(0);
+ }
+
+ return;
+ }
+
+ dout(20) << __func__ << " entering with " << scrubs_in_progress << " in "
+ "progress and " << stack_size << " in the stack" << dendl;
+ bool can_continue = true;
+ elist<CInode*>::iterator i = inode_stack.begin();
+ while (g_conf()->mds_max_scrub_ops_in_progress > scrubs_in_progress &&
+ can_continue) {
+ if (i.end()) {
+ if (scrubs_in_progress == 0) {
+ set_state(STATE_IDLE);
+ }
+
+ return;
+ }
+
+ assert(state == STATE_RUNNING || state == STATE_IDLE);
+ set_state(STATE_RUNNING);
+
+ CInode *curi = *i;
+ ++i; // we have our reference, push iterator forward
+
+ dout(20) << __func__ << " examining " << *curi << dendl;
+
+ if (!curi->is_dir()) {
+ // it's a regular file, symlink, or hard link
+ pop_inode(curi); // we only touch it this once, so remove from stack
+
+ if (!curi->scrub_info()->on_finish) {
+ scrubs_in_progress++;
+ curi->scrub_set_finisher(&scrub_kick);
+ }
+ scrub_file_inode(curi);
+ can_continue = true;
+ } else {
+ bool completed; // it's done, so pop it off the stack
+ bool terminal; // not done, but we can start ops on other directories
+ bool progress; // it added new dentries to the top of the stack
+ scrub_dir_inode(curi, &progress, &terminal, &completed);
+ if (completed) {
+ dout(20) << __func__ << " dir completed" << dendl;
+ pop_inode(curi);
+ } else if (progress) {
+ dout(20) << __func__ << " dir progressed" << dendl;
+ // we added new stuff to top of stack, so reset ourselves there
+ i = inode_stack.begin();
+ } else {
+ dout(20) << __func__ << " dir no-op" << dendl;
+ }
+
+ can_continue = progress || terminal || completed;
+ }
+ }
+}
+
+void ScrubStack::scrub_dir_inode(CInode *in,
+ bool *added_children,
+ bool *terminal,
+ bool *done)
+{
+ dout(10) << __func__ << " " << *in << dendl;
+
+ *added_children = false;
+ bool all_frags_terminal = true;
+ bool all_frags_done = true;
+
+ ScrubHeaderRef header = in->get_scrub_header();
+ ceph_assert(header != nullptr);
+
+ if (header->get_recursive()) {
+ frag_vec_t scrubbing_frags;
+ list<CDir*> scrubbing_cdirs;
+ in->scrub_dirfrags_scrubbing(&scrubbing_frags);
+ dout(20) << __func__ << " iterating over " << scrubbing_frags.size()
+ << " scrubbing frags" << dendl;
+ for (const auto& fg : scrubbing_frags) {
+ // turn frags into CDir *
+ CDir *dir = in->get_dirfrag(fg);
+ if (dir) {
+ scrubbing_cdirs.push_back(dir);
+ dout(25) << __func__ << " got CDir " << *dir << " presently scrubbing" << dendl;
+ } else {
+ in->scrub_dirfrag_finished(fg);
+ dout(25) << __func__ << " missing dirfrag " << fg << " skip scrubbing" << dendl;
+ }
+ }
+
+ dout(20) << __func__ << " consuming from " << scrubbing_cdirs.size()
+ << " scrubbing cdirs" << dendl;
+
+ list<CDir*>::iterator i = scrubbing_cdirs.begin();
+ while (g_conf()->mds_max_scrub_ops_in_progress > scrubs_in_progress) {
+ // select next CDir
+ CDir *cur_dir = NULL;
+ if (i != scrubbing_cdirs.end()) {
+ cur_dir = *i;
+ ++i;
+ dout(20) << __func__ << " got cur_dir = " << *cur_dir << dendl;
+ } else {
+ bool ready = get_next_cdir(in, &cur_dir);
+ dout(20) << __func__ << " get_next_cdir ready=" << ready << dendl;
+
+ if (ready && cur_dir) {
+ scrubbing_cdirs.push_back(cur_dir);
+ } else if (!ready) {
+ // We are waiting for load of a frag
+ all_frags_done = false;
+ all_frags_terminal = false;
+ break;
+ } else {
+ // Finished with all frags
+ break;
+ }
+ }
+ // scrub that CDir
+ bool frag_added_children = false;
+ bool frag_terminal = true;
+ bool frag_done = false;
+ scrub_dirfrag(cur_dir, header,
+ &frag_added_children, &frag_terminal, &frag_done);
+ if (frag_done) {
+ cur_dir->inode->scrub_dirfrag_finished(cur_dir->frag);
+ }
+ *added_children |= frag_added_children;
+ all_frags_terminal = all_frags_terminal && frag_terminal;
+ all_frags_done = all_frags_done && frag_done;
+ }
+
+ dout(20) << "finished looping; all_frags_terminal=" << all_frags_terminal
+ << ", all_frags_done=" << all_frags_done << dendl;
+ } else {
+ dout(20) << "!scrub_recursive" << dendl;
+ }
+
+ if (all_frags_done) {
+ assert (!*added_children); // can't do this if children are still pending
+
+ // OK, so now I can... fire off a validate on the dir inode, and
+ // when it completes, come through here again, noticing that we've
+ // set a flag to indicate the validate happened, and
+ scrub_dir_inode_final(in);
+ }
+
+ *terminal = all_frags_terminal;
+ *done = all_frags_done;
+ dout(10) << __func__ << " is exiting " << *terminal << " " << *done << dendl;
+ return;
+}
+
+bool ScrubStack::get_next_cdir(CInode *in, CDir **new_dir)
+{
+ dout(20) << __func__ << " on " << *in << dendl;
+ frag_t next_frag;
+ int r = in->scrub_dirfrag_next(&next_frag);
+ assert (r >= 0);
+
+ if (r == 0) {
+ // we got a frag to scrub, otherwise it would be ENOENT
+ dout(25) << "looking up new frag " << next_frag << dendl;
+ CDir *next_dir = in->get_or_open_dirfrag(mdcache, next_frag);
+ if (!next_dir->is_complete()) {
+ scrubs_in_progress++;
+ next_dir->fetch(&scrub_kick);
+ dout(25) << "fetching frag from RADOS" << dendl;
+ return false;
+ }
+ *new_dir = next_dir;
+ dout(25) << "returning dir " << *new_dir << dendl;
+ return true;
+ }
+ ceph_assert(r == ENOENT);
+ // there are no dirfrags left
+ *new_dir = NULL;
+ return true;
+}
+
+class C_InodeValidated : public MDSInternalContext
+{
+ public:
+ ScrubStack *stack;
+ CInode::validated_data result;
+ CInode *target;
+
+ C_InodeValidated(MDSRank *mds, ScrubStack *stack_, CInode *target_)
+ : MDSInternalContext(mds), stack(stack_), target(target_)
+ {}
+
+ void finish(int r) override
+ {
+ stack->_validate_inode_done(target, r, result);
+ }
+};
+
+
+/**
+ * Start the final validation of a directory inode once its dirfrags are done.
+ * Does nothing if the inode's scrub info already has children_scrubbed set.
+ */
+void ScrubStack::scrub_dir_inode_final(CInode *in)
+{
+  dout(20) << __func__ << " " << *in << dendl;
+
+  // Two passes through this function. First one triggers inode validation,
+  // second one sets finally_done
+  // FIXME: kind of overloading scrub_in_progress here, using it while
+  // dentry is still on stack to indicate that we have finished
+  // doing our validate_disk_state on the inode
+  // FIXME: the magic-constructing scrub_info() is going to leave
+  // an unneeded scrub_infop lying around here
+  if (in->scrub_info()->children_scrubbed) {
+    // validation was already kicked off on an earlier pass
+    return;
+  }
+
+  if (!in->scrub_info()->on_finish) {
+    // nobody is waiting on this inode yet: install our kicker and count it
+    scrubs_in_progress++;
+    in->scrub_set_finisher(&scrub_kick);
+  }
+
+  in->scrub_children_finished();
+  auto *on_validated = new C_InodeValidated(mdcache->mds, this, in);
+  in->validate_disk_state(&on_validated->result, on_validated);
+}
+
+/**
+ * Make progress on scrubbing one dirfrag.
+ *
+ * @param dir             dirfrag to scrub (must not be NULL)
+ * @param header          scrub header passed to scrub_initialize() and to
+ *                        the children that get enqueued
+ * @param added_children  out: true if at least one child was enqueued
+ * @param is_terminal     out: true once the dirfrag is finished
+ * @param done            out: true once the dirfrag is finished
+ */
+void ScrubStack::scrub_dirfrag(CDir *dir,
+                               ScrubHeaderRef& header,
+                               bool *added_children, bool *is_terminal,
+                               bool *done)
+{
+  ceph_assert(dir != NULL);
+
+  dout(20) << __func__ << " on " << *dir << dendl;
+  *added_children = false;
+  *is_terminal = false;
+  *done = false;
+
+  if (!dir->scrub_info()->directory_scrubbing) {
+    // Get the frag complete before calling
+    // scrub initialize, so that it can populate its lists
+    // of dentries.
+    if (!dir->is_complete()) {
+      scrubs_in_progress++;
+      dir->fetch(&scrub_kick);
+      return;
+    }
+
+    dir->scrub_initialize(header);
+  }
+
+  // Every exit from this loop is a return; an iteration only falls through
+  // to the next one after a child has been enqueued (r == 0).
+  for (;;) {
+    CDentry *dn = NULL;
+    scrubs_in_progress++;
+    const int r = dir->scrub_dentry_next(&scrub_kick, &dn);
+
+    if (r == EAGAIN) {
+      // Drop out, CDir fetcher will call back our kicker context
+      // (the increment above stays in place for that callback)
+      dout(20) << __func__ << " waiting for fetch on " << *dir << dendl;
+      return;
+    }
+
+    // no callback is pending, so undo the speculative increment
+    scrubs_in_progress--;
+
+    if (r == ENOENT) {
+      // Nothing left to scrub, are we done?
+      std::list<CDentry*> still_scrubbing;
+      dir->scrub_dentries_scrubbing(&still_scrubbing);
+      if (!still_scrubbing.empty()) {
+        dout(20) << __func__ << " " << still_scrubbing.size() << " dentries still "
+          "scrubbing in " << *dir << dendl;
+      } else {
+        dout(20) << __func__ << " dirfrag done: " << *dir << dendl;
+        // FIXME: greg: What's the diff meant to be between done and terminal
+        dir->scrub_finished();
+        *done = true;
+        *is_terminal = true;
+      }
+      return;
+    }
+
+    // scrub_dentry_next defined to only give EAGAIN, ENOENT, 0 -- we should
+    // never get random IO errors here.
+    ceph_assert(r == 0);
+
+    _enqueue_inode(dn->get_projected_inode(), dn, header, NULL, true);
+
+    *added_children = true;
+  }
+}
+
+/**
+ * Kick off disk-state validation for a file inode; the result is delivered
+ * to _validate_inode_done() through a C_InodeValidated completion.
+ */
+void ScrubStack::scrub_file_inode(CInode *in)
+{
+  // At this stage the DN is already past scrub_initialize, so
+  // it's in the cache, it has PIN_SCRUBQUEUE and it is authpinned
+  auto *on_validated = new C_InodeValidated(mdcache->mds, this, in);
+  in->validate_disk_state(&on_validated->result, on_validated);
+}
+
+/**
+ * Handle the outcome of CInode::validate_disk_state() for one inode:
+ * record unrepaired damage in the damage table, report to the cluster log
+ * and MDS log, finish the inode's scrub, and queue its completion context.
+ *
+ * @param in      the inode that was validated
+ * @param r       return code delivered to the C_InodeValidated completion
+ * @param result  validation results
+ */
+void ScrubStack::_validate_inode_done(CInode *in, int r,
+                                      const CInode::validated_data &result)
+{
+  LogChannelRef clog = mdcache->mds->clog;
+  // Grab the header before in->scrub_finished() is called below; it is
+  // still consulted afterwards.
+  const ScrubHeaderRefConst header = in->scrub_info()->header;
+
+  const bool failed = !result.passed_validation;
+
+  std::string path;
+  if (failed) {
+    // Build path string for use in messages
+    in->make_path_string(path, true);
+  }
+
+  const bool backtrace_damaged = result.backtrace.checked &&
+    !result.backtrace.passed && !result.backtrace.repaired;
+  const bool inode_damaged = result.inode.checked &&
+    !result.inode.passed && !result.inode.repaired;
+
+  if (backtrace_damaged) {
+    // Record backtrace fails as remote linkage damage, as
+    // we may not be able to resolve hard links to this inode
+    mdcache->mds->damage_table.notify_remote_damaged(in->inode.ino, path);
+  } else if (inode_damaged) {
+    // Record damaged inode structures as damaged dentries as
+    // that is where they are stored
+    auto parent_dn = in->get_projected_parent_dn();
+    if (parent_dn) {
+      auto parent_dir = parent_dn->get_dir();
+      mdcache->mds->damage_table.notify_dentry(
+        parent_dir->inode->ino(), parent_dir->frag, parent_dn->last,
+        parent_dn->get_name(), path);
+    }
+  }
+
+  if (failed) {
+    // Inform the cluster log if we found an error
+    if (result.all_damage_repaired()) {
+      clog->info() << "Scrub repaired inode " << in->ino()
+                   << " (" << path << ")";
+    } else {
+      clog->warn() << "Scrub error on inode " << in->ino()
+                   << " (" << path << ") see " << g_conf()->name
+                   << " log and `damage ls` output for details";
+    }
+
+    // Put the verbose JSON output into the MDS log for later inspection
+    JSONFormatter json;
+    result.dump(&json);
+    std::ostringstream json_text;
+    json.flush(json_text);
+    derr << __func__ << " scrub error on inode " << *in << ": " << json_text.str()
+         << dendl;
+  } else {
+    dout(10) << __func__ << " scrub passed on inode " << *in << dendl;
+  }
+
+  MDSContext *on_finish = NULL;
+  in->scrub_finished(&on_finish);
+
+  if (in == header->get_origin()) {
+    scrub_origins.erase(in);
+    clog_scrub_summary(in);
+    if (!header->get_recursive()) {
+      auto &fmt = header->get_formatter();
+      if (r >= 0) { // we got into the scrubbing dump it
+        result.dump(&fmt);
+      } else { // we failed the lookup or something; dump ourselves
+        fmt.open_object_section("results");
+        fmt.dump_int("return_code", r);
+        fmt.close_section(); // results
+      }
+    }
+  }
+
+  if (on_finish) {
+    finisher->queue(new MDSIOContextWrapper(mdcache->mds, on_finish), 0);
+  }
+}
+
+// Bind the kicker context to the MDS rank of the cache and to its ScrubStack.
+ScrubStack::C_KickOffScrubs::C_KickOffScrubs(MDCache *mdcache, ScrubStack *s)
+  : MDSInternalContext(mdcache->mds),
+    stack(s)
+{
+}
+
+// Complete every queued scrub-control context with r, then empty the queue.
+// Must be called with mds_lock held.
+void ScrubStack::complete_control_contexts(int r) {
+  ceph_assert(mdcache->mds->mds_lock.is_locked_by_me());
+
+  for (auto it = control_ctxs.begin(); it != control_ctxs.end(); ++it) {
+    (*it)->complete(r);
+  }
+  control_ctxs.clear();
+}
+
+// Move to next_state; on an actual change, log it and emit a scrub summary
+// to the cluster log. A no-op when the state is unchanged.
+void ScrubStack::set_state(State next_state) {
+  if (state == next_state) {
+    return;
+  }
+
+  dout(20) << __func__ << ", from state=" << state << ", to state="
+           << next_state << dendl;
+  state = next_state;
+  clog_scrub_summary();
+}
+
+// Returns true while the state is RUNNING or PAUSING. Must be called with
+// mds_lock held.
+bool ScrubStack::scrub_in_transition_state() {
+  ceph_assert(mdcache->mds->mds_lock.is_locked_by_me());
+  dout(20) << __func__ << ": state=" << state << dendl;
+
+  // STATE_RUNNING is considered as a transition state so as to
+  // "delay" the scrub control operation.
+  return state == STATE_RUNNING || state == STATE_PAUSING;
+}
+
+/**
+ * Build a one-line summary of the scrub state: "idle", or a state word
+ * ("active", "pausing", "paused", optionally combined with "aborting")
+ * followed by the list of scrub origin paths, if any.
+ *
+ * Must be called with mds_lock held.
+ *
+ * NOTE(review): except for the "idle" literal, the returned view refers to
+ * the buffer of the local CachedStackStringStream below. Confirm that this
+ * buffer stays valid after the function returns; callers should consume the
+ * view immediately and never store it.
+ */
+std::string_view ScrubStack::scrub_summary() {
+  // same lock assertion form as the other ScrubStack functions in this file
+  ceph_assert(mdcache->mds->mds_lock.is_locked_by_me());
+
+  bool have_more = false;
+  CachedStackStringStream cs;
+
+  if (state == STATE_IDLE) {
+    return "idle";
+  }
+
+  if (state == STATE_RUNNING) {
+    if (clear_inode_stack) {
+      *cs << "aborting";
+    } else {
+      *cs << "active";
+    }
+  } else {
+    if (state == STATE_PAUSING) {
+      have_more = true;
+      *cs << "pausing";
+    } else if (state == STATE_PAUSED) {
+      have_more = true;
+      *cs << "paused";
+    }
+
+    if (clear_inode_stack) {
+      // an abort requested while pausing/paused is shown as "<state>+aborting"
+      if (have_more) {
+        *cs << "+";
+      }
+      *cs << "aborting";
+    }
+  }
+
+  if (!scrub_origins.empty()) {
+    *cs << " [paths:";
+    for (auto inode = scrub_origins.begin(); inode != scrub_origins.end(); ++inode) {
+      if (inode != scrub_origins.begin()) {
+        *cs << ",";
+      }
+
+      *cs << scrub_inode_path(*inode);
+    }
+    *cs << "]";
+  }
+
+  return cs->strv();
+}
+
+/**
+ * Dump the scrub status into f: a "status" string plus a "scrubs" section
+ * with one object per scrub origin (keyed by the scrub tag) giving its path
+ * and options. Must be called with mds_lock held.
+ */
+void ScrubStack::scrub_status(Formatter *f) {
+  ceph_assert(mdcache->mds->mds_lock.is_locked_by_me());
+
+  f->open_object_section("result");
+
+  std::stringstream status;
+  if (state == STATE_IDLE) {
+    status << "no active scrubs running";
+  } else {
+    if (state == STATE_RUNNING) {
+      status << (clear_inode_stack ? "ABORTING" : "scrub active");
+    } else {
+      const bool pause_related = (state == STATE_PAUSING || state == STATE_PAUSED);
+      if (pause_related) {
+        status << state;
+      }
+      if (clear_inode_stack) {
+        if (pause_related) {
+          status << "+";
+        }
+        status << "ABORTING";
+      }
+    }
+    status << " (" << stack_size << " inodes in the stack)";
+  }
+  f->dump_string("status", status.str());
+
+  f->open_object_section("scrubs");
+  for (auto &origin : scrub_origins) {
+    ScrubHeaderRefConst header = origin->get_scrub_header();
+
+    std::string tag(header->get_tag());
+    f->open_object_section(tag.c_str()); // scrub id
+
+    f->dump_string("path", scrub_inode_path(origin));
+
+    // comma-separated list of the options that are set on this scrub
+    std::stringstream optss;
+    bool first_opt = true;
+    auto append_opt = [&optss, &first_opt](const char *opt) {
+      if (!first_opt) {
+        optss << ",";
+      }
+      optss << opt;
+      first_opt = false;
+    };
+    if (header->get_recursive()) {
+      append_opt("recursive");
+    }
+    if (header->get_repair()) {
+      append_opt("repair");
+    }
+    if (header->get_force()) {
+      append_opt("force");
+    }
+
+    f->dump_string("options", optss.str());
+    f->close_section(); // scrub id
+  }
+  f->close_section(); // scrubs
+  f->close_section(); // result
+}
+
+/**
+ * Abort every inode still waiting on the inode stack. Each inode's pending
+ * context, if any, is completed with -ECANCELED; afterwards the stack is
+ * emptied and the abort flag is cleared.
+ * Requires mds_lock to be held and clear_inode_stack to be set.
+ */
+void ScrubStack::abort_pending_scrubs() {
+  ceph_assert(mdcache->mds->mds_lock.is_locked_by_me());
+  ceph_assert(clear_inode_stack);
+
+  for (auto it = inode_stack.begin(); !it.end(); ++it) {
+    CInode *in = *it;
+    const bool is_origin = (in == in->scrub_info()->header->get_origin());
+    if (is_origin) {
+      scrub_origins.erase(in);
+      clog_scrub_summary(in);
+    }
+
+    MDSContext *pending = nullptr;
+    in->scrub_aborted(&pending);
+    if (pending != nullptr) {
+      pending->complete(-ECANCELED);
+    }
+  }
+
+  stack_size = 0;
+  inode_stack.clear();
+  clear_inode_stack = false;
+}
+
+/**
+ * Request an abort of the scrub. If the scrub is in a transition state the
+ * context is queued on control_ctxs; otherwise pending scrubs are aborted
+ * right away and on_finish is completed with 0.
+ *
+ * @param on_finish  context to complete once the abort is done (non-null)
+ */
+void ScrubStack::scrub_abort(Context *on_finish) {
+  ceph_assert(mdcache->mds->mds_lock.is_locked_by_me());
+  ceph_assert(on_finish != nullptr);
+
+  dout(10) << __func__ << ": aborting with " << scrubs_in_progress
+           << " scrubs in progress and " << stack_size << " in the"
+           << " stack" << dendl;
+
+  clear_inode_stack = true;
+
+  const bool must_wait = scrub_in_transition_state();
+  if (!must_wait) {
+    abort_pending_scrubs();
+    // a paused scrub stays paused; anything else becomes idle
+    if (state != STATE_PAUSED) {
+      set_state(STATE_IDLE);
+    }
+    on_finish->complete(0);
+  } else {
+    control_ctxs.push_back(on_finish);
+  }
+}
+
+/**
+ * Request that scrubbing be paused. Fails with -EINVAL while an abort is in
+ * progress. If the scrub is in a transition state, the state becomes
+ * PAUSING and the context is queued; otherwise the state becomes PAUSED and
+ * on_finish is completed with 0 immediately.
+ *
+ * @param on_finish  context to complete once paused (non-null)
+ */
+void ScrubStack::scrub_pause(Context *on_finish) {
+  ceph_assert(mdcache->mds->mds_lock.is_locked_by_me());
+  ceph_assert(on_finish != nullptr);
+
+  dout(10) << __func__ << ": pausing with " << scrubs_in_progress
+           << " scrubs in progress and " << stack_size << " in the"
+           << " stack" << dendl;
+
+  // abort is in progress
+  if (clear_inode_stack) {
+    on_finish->complete(-EINVAL);
+    return;
+  }
+
+  const bool must_wait = scrub_in_transition_state();
+  if (!must_wait) {
+    set_state(STATE_PAUSED);
+    on_finish->complete(0);
+    return;
+  }
+
+  set_state(STATE_PAUSING);
+  control_ctxs.push_back(on_finish);
+}
+
+/**
+ * Resume a pausing or paused scrub. A pending pause is cancelled (its queued
+ * contexts are completed with -ECANCELED); a paused scrub is restarted.
+ *
+ * NOTE(review): the status is computed as an int (0 or -EINVAL) but the
+ * function is declared to return bool, so callers see false when nothing
+ * went wrong and true when an abort is in progress. The int-to-bool
+ * conversion is kept as-is; confirm what callers expect before changing it.
+ */
+bool ScrubStack::scrub_resume() {
+  ceph_assert(mdcache->mds->mds_lock.is_locked_by_me());
+  dout(20) << __func__ << ": state=" << state << dendl;
+
+  int r = 0;
+
+  if (clear_inode_stack) {
+    r = -EINVAL;
+  } else {
+    switch (state) {
+    case STATE_PAUSING:
+      set_state(STATE_RUNNING);
+      complete_control_contexts(-ECANCELED);
+      break;
+    case STATE_PAUSED:
+      set_state(STATE_RUNNING);
+      kick_off_scrubs();
+      break;
+    default:
+      // idle or running: nothing to resume
+      break;
+    }
+  }
+
+  return r;
+}
+
+// send current scrub summary to cluster log
+/**
+ * Write the scrub summary to the cluster log. When an inode is given, first
+ * log whether the scrub for its path was aborted, queued or completed.
+ */
+void ScrubStack::clog_scrub_summary(CInode *in) {
+  if (in) {
+    // "aborted" wins while an abort is pending; otherwise an inode still
+    // present in scrub_origins is "queued", anything else is "completed"
+    const std::string what =
+      clear_inode_stack ? "aborted"
+                        : (scrub_origins.count(in) ? "queued" : "completed");
+    clog->info() << "scrub " << what << " for path: " << scrub_inode_path(in);
+  }
+
+  clog->info() << "scrub summary: " << scrub_summary();
+}
diff --git a/src/mds/ScrubStack.h b/src/mds/ScrubStack.h
new file mode 100644
index 00000000..3586daf2
--- /dev/null
+++ b/src/mds/ScrubStack.h
@@ -0,0 +1,306 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2014 Red Hat
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#ifndef SCRUBSTACK_H_
+#define SCRUBSTACK_H_
+
+#include "CDir.h"
+#include "CDentry.h"
+#include "CInode.h"
+#include "MDSContext.h"
+#include "ScrubHeader.h"
+
+#include "common/LogClient.h"
+#include "include/elist.h"
+
+class MDCache;
+class Finisher;
+
+class ScrubStack {
+protected:
+ // reference to global cluster log client
+ LogChannelRef &clog;
+
+ /// A finisher needed so that we don't re-enter kick_off_scrubs
+ Finisher *finisher;
+
+ /// The stack of inodes we want to scrub
+ elist<CInode*> inode_stack;
+ /// current number of dentries we're actually scrubbing
+ int scrubs_in_progress;
+ ScrubStack *scrubstack; // hack for dout
+ int stack_size;
+
+ class C_KickOffScrubs : public MDSInternalContext {
+ ScrubStack *stack;
+ public:
+ C_KickOffScrubs(MDCache *mdcache, ScrubStack *s);
+ void finish(int r) override { }
+ void complete(int r) override {
+ if (r == -ECANCELED) { // cancelled: leave the counter alone and do not kick
+ return;
+ }
+
+ stack->scrubs_in_progress--;
+ stack->kick_off_scrubs();
+ // don't delete self: this context is the scrub_kick member and is reused
+ }
+ };
+ C_KickOffScrubs scrub_kick;
+
+public:
+ MDCache *mdcache;
+ ScrubStack(MDCache *mdc, LogChannelRef &clog, Finisher *finisher_) :
+ clog(clog),
+ finisher(finisher_),
+ inode_stack(member_offset(CInode, item_scrub)),
+ scrubs_in_progress(0),
+ scrubstack(this),
+ stack_size(0),
+ scrub_kick(mdc, this),
+ mdcache(mdc) {}
+ ~ScrubStack() {
+ ceph_assert(inode_stack.empty());
+ ceph_assert(!scrubs_in_progress);
+ }
+ /**
+ * Put an inode on the top of the scrub stack, so it is the highest priority.
+ * If there are other scrubs in progress, they will not continue scrubbing new
+ * entries until this one is completed.
+ * The inode is also recorded in scrub_origins and reported to the cluster log.
+ * @param in The inode to scrub
+ * @param header The ScrubHeader propagated from wherever this scrub
+ * was initiated
+ * @param on_finish Context handed on to enqueue_inode() for this scrub
+ */
+ void enqueue_inode_top(CInode *in, ScrubHeaderRef& header,
+ MDSContext *on_finish) {
+ enqueue_inode(in, header, on_finish, true);
+ scrub_origins.emplace(in);
+ clog_scrub_summary(in);
+ }
+ /** Like enqueue_inode_top, but we wait for all pending scrubs before
+ * starting this one.
+ */
+ void enqueue_inode_bottom(CInode *in, ScrubHeaderRef& header,
+ MDSContext *on_finish) {
+ enqueue_inode(in, header, on_finish, false);
+ scrub_origins.emplace(in);
+ clog_scrub_summary(in);
+ }
+
+ /**
+ * Abort an ongoing scrub operation. The abort operation could be
+ * delayed if there are in-progress scrub operations on going. The
+ * caller should provide a context which is completed after all
+ * in-progress scrub operations are completed and pending inodes
+ * are removed from the scrub stack (with the context callbacks for
+ * inodes completed with -ECANCELED).
+ * @param on_finish Context callback to invoke after abort
+ */
+ void scrub_abort(Context *on_finish);
+
+ /**
+ * Pause scrub operations. Similar to abort, pause is delayed if
+ * there are in-progress scrub operations on going. The caller
+ * should provide a context which is completed after all in-progress
+ * scrub operations are completed. Subsequent scrub operations are
+ * queued until scrub is resumed.
+ * @param on_finish Context callback to invoke after pause
+ */
+ void scrub_pause(Context *on_finish);
+
+ /**
+ * Resume a paused scrub. Unlike abort or pause, this is instantaneous.
+ * Pending pause operations are cancelled (context callbacks are
+ * invoked with -ECANCELED).
+ * @returns false if no error occurred, true if an abort is in-progress.
+ * (The definition computes 0 / -EINVAL as an int and returns it through
+ * this bool return type, so the errno value itself is not visible to
+ * callers.)
+ */
+ bool scrub_resume();
+
+ /**
+ * Get the current scrub status as human readable string. Some basic
+ * information is returned such as number of inodes pending abort/pause.
+ */
+ void scrub_status(Formatter *f);
+
+ bool is_scrubbing() const { return !inode_stack.empty(); }
+
+ /**
+ * Get a high level scrub status summary such as current scrub state
+ * and scrub paths.
+ */
+ std::string_view scrub_summary();
+
+private:
+ // scrub abort is _not_ a state, rather it's an operation that's
+ // performed after in-progress scrubs are finished.
+ enum State {
+ STATE_RUNNING = 0,
+ STATE_IDLE,
+ STATE_PAUSING,
+ STATE_PAUSED,
+ };
+ friend std::ostream &operator<<(std::ostream &os, const State &state);
+
+ State state = STATE_IDLE;
+ bool clear_inode_stack = false; // set while an abort has been requested
+
+ // list of pending context completions for asynchronous scrub
+ // control operations.
+ std::list<Context *> control_ctxs;
+
+ // list of inodes for which scrub operations are running -- used
+ // to display out in `scrub status`.
+ std::set<CInode *> scrub_origins;
+
+ /**
+ * Put the inode at either the top or bottom of the stack, with
+ * the given scrub params, and then try and kick off more scrubbing.
+ */
+ void enqueue_inode(CInode *in, ScrubHeaderRef& header,
+ MDSContext *on_finish, bool top);
+ void _enqueue_inode(CInode *in, CDentry *parent, ScrubHeaderRef& header,
+ MDSContext *on_finish, bool top);
+ /**
+ * Kick off as many scrubs as are appropriate, based on the current
+ * state of the stack.
+ */
+ void kick_off_scrubs();
+ /**
+ * Push an inode on top of the stack.
+ */
+ inline void push_inode(CInode *in);
+ /**
+ * Push an inode to the bottom of the stack.
+ */
+ inline void push_inode_bottom(CInode *in);
+ /**
+ * Pop the given inode off the stack.
+ */
+ inline void pop_inode(CInode *in);
+
+ /**
+ * Scrub a file inode.
+ * @param in The inode to scrub
+ */
+ void scrub_file_inode(CInode *in);
+
+ /**
+ * Callback from completion of CInode::validate_disk_state
+ * @param in The inode we were validating
+ * @param r The return status from validate_disk_state
+ * @param result Populated results from validate_disk_state
+ */
+ void _validate_inode_done(CInode *in, int r,
+ const CInode::validated_data &result);
+ friend class C_InodeValidated;
+
+ /**
+ * Make progress on scrubbing a directory-representing dirfrag and
+ * its children.
+ *
+ * 1) Select the next dirfrag which hasn't been scrubbed, and make progress
+ * on it if possible.
+ *
+ * 2) If not, move on to the next dirfrag and start it up, if any.
+ *
+ * 3) If waiting for results from dirfrag scrubs, do nothing.
+ *
+ * 4) If all dirfrags have been scrubbed, scrub my inode.
+ *
+ * @param in The CInode to scrub as a directory
+ * @param added_children set to true if we pushed some of our children
+ * onto the ScrubStack
+ * @param is_terminal set to true if there are no descendant dentries
+ * remaining to start scrubbing.
+ * @param done set to true if we and all our children have finished scrubbing
+ */
+ void scrub_dir_inode(CInode *in, bool *added_children, bool *is_terminal,
+ bool *done);
+ /**
+ * Make progress on scrubbing a dirfrag. It may return after each of the
+ * following steps, but will report making progress on each one.
+ *
+ * 1) enqueues the next unscrubbed child directory dentry at the
+ * top of the stack.
+ *
+ * 2) Initiates a scrub on the next unscrubbed file dentry
+ *
+ * If there are scrubs currently in progress on child dentries, no more child
+ * dentries to scrub, and this function is invoked, it will report no
+ * progress. Try again later.
+ *
+ */
+ void scrub_dirfrag(CDir *dir, ScrubHeaderRef& header,
+ bool *added_children, bool *is_terminal, bool *done);
+ /**
+ * Scrub a directory-representing dentry.
+ *
+ * @param in The directory inode we're doing final scrub on.
+ */
+ void scrub_dir_inode_final(CInode *in);
+
+ /**
+ * Get a CDir into memory, and return it if it's already complete.
+ * Otherwise, fetch it and kick off scrubbing when done.
+ *
+ * @param in The Inode to get the next directory from
+ * @param new_dir The CDir we're returning to you. NULL if
+ * not ready yet or there aren't any.
+ * @returns false if you have to wait, true if there's no work
+ * left to do (we returned it, or there are none left in this inode).
+ */
+ bool get_next_cdir(CInode *in, CDir **new_dir);
+
+ /**
+ * Set scrub state
+ * @param next_state State to move the scrub to.
+ */
+ void set_state(State next_state);
+
+ /**
+ * Is scrub in one of the transition states (running, pausing)
+ */
+ bool scrub_in_transition_state();
+
+ /**
+ * complete queued up contexts
+ * @param r return value to complete contexts.
+ */
+ void complete_control_contexts(int r);
+
+ /**
+ * Abort pending scrubs for inodes waiting in the inode stack.
+ * Completion context is completed with -ECANCELED.
+ */
+ void abort_pending_scrubs();
+
+ /**
+ * Return path for a given inode ("/" if the path string comes back empty).
+ * @param in inode to make path entry.
+ */
+ std::string scrub_inode_path(CInode *in) {
+ std::string path;
+ in->make_path_string(path, true);
+ return (path.empty() ? "/" : path.c_str());
+ }
+
+ /**
+ * Send scrub information (queued/finished scrub path and summary)
+ * to cluster log.
+ * @param in inode for which scrub has been queued or finished.
+ */
+ void clog_scrub_summary(CInode *in=nullptr);
+};
+
+#endif /* SCRUBSTACK_H_ */
diff --git a/src/mds/Server.cc b/src/mds/Server.cc
new file mode 100644
index 00000000..5d0be194
--- /dev/null
+++ b/src/mds/Server.cc
@@ -0,0 +1,10206 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#include <boost/lexical_cast.hpp>
+#include "include/ceph_assert.h" // lexical_cast includes system assert.h
+
+#include <boost/config/warning_disable.hpp>
+#include <boost/fusion/include/std_pair.hpp>
+#include <boost/range/adaptor/reversed.hpp>
+
+#include "MDSRank.h"
+#include "Server.h"
+#include "Locker.h"
+#include "MDCache.h"
+#include "MDLog.h"
+#include "Migrator.h"
+#include "MDBalancer.h"
+#include "InoTable.h"
+#include "SnapClient.h"
+#include "Mutation.h"
+#include "cephfs_features.h"
+
+#include "msg/Messenger.h"
+
+#include "osdc/Objecter.h"
+
+#include "events/EUpdate.h"
+#include "events/ESlaveUpdate.h"
+#include "events/ESession.h"
+#include "events/EOpen.h"
+#include "events/ECommitted.h"
+
+#include "include/stringify.h"
+#include "include/filepath.h"
+#include "common/errno.h"
+#include "common/Timer.h"
+#include "common/perf_counters.h"
+#include "include/compat.h"
+#include "osd/OSDMap.h"
+
+#include <errno.h>
+#include <math.h>
+
+#include <list>
+#include <iostream>
+#include <string_view>
+
+#include "common/config.h"
+
+#define dout_context g_ceph_context
+#define dout_subsys ceph_subsys_mds
+#undef dout_prefix
+#define dout_prefix *_dout << "mds." << mds->get_nodeid() << ".server "
+
+// MDSContext that resolves its MDSRank through the owning Server.
+class ServerContext : public MDSContext {
+protected:
+  Server *server;
+
+  MDSRank *get_mds() override { return server->mds; }
+
+public:
+  explicit ServerContext(Server *s)
+    : server(s)
+  {
+    ceph_assert(server != nullptr);
+  }
+};
+
+// Journal completion context tied to a Server and, optionally, to the
+// request whose event was journaled; the request (if any) gets a
+// "journal_committed" event marked before finish() runs.
+class ServerLogContext : public MDSLogContextBase {
+protected:
+  Server *server;
+
+  MDSRank *get_mds() override { return server->mds; }
+
+  MDRequestRef mdr;
+
+  void pre_finish(int r) override {
+    if (!mdr)
+      return;
+    mdr->mark_event("journal_committed: ");
+  }
+
+public:
+  explicit ServerLogContext(Server *s)
+    : server(s)
+  {
+    ceph_assert(server != nullptr);
+  }
+
+  explicit ServerLogContext(Server *s, MDRequestRef& r)
+    : server(s), mdr(r)
+  {
+    ceph_assert(server != nullptr);
+  }
+};
+
+/**
+ * Build the "mds_server" perf counter set and register it with the global
+ * perf counter collection. Counters are added in the order of the tables
+ * below.
+ */
+void Server::create_logger()
+{
+  PerfCountersBuilder plb(g_ceph_context, "mds_server", l_mdss_first, l_mdss_last);
+
+  // counters added with an explicit nick and PRIO_INTERESTING
+  struct nicked_counter_t {
+    int idx;
+    const char *name;
+    const char *desc;
+    const char *nick;
+  };
+  const nicked_counter_t interesting[] = {
+    {l_mdss_handle_client_request, "handle_client_request", "Client requests", "hcr"},
+    {l_mdss_handle_slave_request, "handle_slave_request", "Slave requests", "hsr"},
+    {l_mdss_handle_client_session, "handle_client_session", "Client session messages", "hcs"},
+    {l_mdss_cap_revoke_eviction, "cap_revoke_eviction", "Cap Revoke Client Eviction", "cre"},
+    {l_mdss_cap_acquisition_throttle, "cap_acquisition_throttle", "Cap acquisition throttle counter", "cat"},
+  };
+  for (const auto &c : interesting) {
+    plb.add_u64_counter(c.idx, c.name, c.desc, c.nick,
+                        PerfCountersBuilder::PRIO_INTERESTING);
+  }
+
+  // fop latencies are useful
+  plb.set_prio_default(PerfCountersBuilder::PRIO_USEFUL);
+  struct plain_counter_t {
+    int idx;
+    const char *name;
+    const char *desc;
+  };
+  const plain_counter_t latencies[] = {
+    {l_mdss_req_lookuphash_latency, "req_lookuphash_latency", "Request type lookup hash of inode latency"},
+    {l_mdss_req_lookupino_latency, "req_lookupino_latency", "Request type lookup inode latency"},
+    {l_mdss_req_lookupparent_latency, "req_lookupparent_latency", "Request type lookup parent latency"},
+    {l_mdss_req_lookupname_latency, "req_lookupname_latency", "Request type lookup name latency"},
+    {l_mdss_req_lookup_latency, "req_lookup_latency", "Request type lookup latency"},
+    {l_mdss_req_lookupsnap_latency, "req_lookupsnap_latency", "Request type lookup snapshot latency"},
+    {l_mdss_req_getattr_latency, "req_getattr_latency", "Request type get attribute latency"},
+    {l_mdss_req_setattr_latency, "req_setattr_latency", "Request type set attribute latency"},
+    {l_mdss_req_setlayout_latency, "req_setlayout_latency", "Request type set file layout latency"},
+    {l_mdss_req_setdirlayout_latency, "req_setdirlayout_latency", "Request type set directory layout latency"},
+    {l_mdss_req_setxattr_latency, "req_setxattr_latency", "Request type set extended attribute latency"},
+    {l_mdss_req_rmxattr_latency, "req_rmxattr_latency", "Request type remove extended attribute latency"},
+    {l_mdss_req_readdir_latency, "req_readdir_latency", "Request type read directory latency"},
+    {l_mdss_req_setfilelock_latency, "req_setfilelock_latency", "Request type set file lock latency"},
+    {l_mdss_req_getfilelock_latency, "req_getfilelock_latency", "Request type get file lock latency"},
+    {l_mdss_req_create_latency, "req_create_latency", "Request type create latency"},
+    {l_mdss_req_open_latency, "req_open_latency", "Request type open latency"},
+    {l_mdss_req_mknod_latency, "req_mknod_latency", "Request type make node latency"},
+    {l_mdss_req_link_latency, "req_link_latency", "Request type link latency"},
+    {l_mdss_req_unlink_latency, "req_unlink_latency", "Request type unlink latency"},
+    {l_mdss_req_rmdir_latency, "req_rmdir_latency", "Request type remove directory latency"},
+    {l_mdss_req_rename_latency, "req_rename_latency", "Request type rename latency"},
+    {l_mdss_req_mkdir_latency, "req_mkdir_latency", "Request type make directory latency"},
+    {l_mdss_req_symlink_latency, "req_symlink_latency", "Request type symbolic link latency"},
+    {l_mdss_req_lssnap_latency, "req_lssnap_latency", "Request type list snapshot latency"},
+    {l_mdss_req_mksnap_latency, "req_mksnap_latency", "Request type make snapshot latency"},
+    {l_mdss_req_rmsnap_latency, "req_rmsnap_latency", "Request type remove snapshot latency"},
+    {l_mdss_req_renamesnap_latency, "req_renamesnap_latency", "Request type rename snapshot latency"},
+  };
+  for (const auto &c : latencies) {
+    plb.add_time_avg(c.idx, c.name, c.desc);
+  }
+
+  plb.set_prio_default(PerfCountersBuilder::PRIO_DEBUGONLY);
+  const plain_counter_t dispatched[] = {
+    {l_mdss_dispatch_client_request, "dispatch_client_request", "Client requests dispatched"},
+    {l_mdss_dispatch_slave_request, "dispatch_server_request", "Server requests dispatched"},
+  };
+  for (const auto &c : dispatched) {
+    plb.add_u64_counter(c.idx, c.name, c.desc);
+  }
+
+  logger = plb.create_perf_counters();
+  g_ceph_context->get_perfcounters_collection()->add(logger);
+}
+
+/**
+ * Construct the Server for an MDS rank and cache the configuration values
+ * it uses (snapshot limit, cap limits and throttles, eviction timeout).
+ */
+Server::Server(MDSRank *m) :
+  mds(m),
+  mdcache(mds->mdcache),
+  mdlog(mds->mdlog),
+  logger(nullptr),
+  is_full(false),
+  reconnect_done(nullptr),
+  failed_reconnects(0),
+  reconnect_evicting(false),
+  terminating_sessions(false),
+  recall_throttle(g_conf().get_val<double>("mds_recall_max_decay_rate"))
+{
+  auto &cfg = g_conf();
+
+  max_snaps_per_dir = cfg.get_val<uint64_t>("mds_max_snaps_per_dir");
+  replay_unsafe_with_closed_session =
+    cfg.get_val<bool>("mds_replay_unsafe_with_closed_session");
+  cap_revoke_eviction_timeout =
+    cfg.get_val<double>("mds_cap_revoke_eviction_timeout");
+  max_caps_per_client = cfg.get_val<uint64_t>("mds_max_caps_per_client");
+  cap_acquisition_throttle =
+    cfg.get_val<uint64_t>("mds_session_cap_acquisition_throttle");
+  max_caps_throttle_ratio =
+    cfg.get_val<double>("mds_session_max_caps_throttle_ratio");
+  caps_throttle_retry_request_timeout =
+    cfg.get_val<double>("mds_cap_acquisition_throttle_retry_request_timeout");
+  supported_features = feature_bitset_t(CEPHFS_FEATURES_MDS_SUPPORTED);
+}
+
+void Server::dispatch(const Message::const_ref &m)
+{
+ switch (m->get_type()) {
+ case CEPH_MSG_CLIENT_RECONNECT: // reconnects are handled before any state checks below
+ handle_client_reconnect(MClientReconnect::msgref_cast(m));
+ return;
+ }
+
+/*
+ * In the reconnect phase a client may send unsafe requests to the MDS before
+ * its reconnect message. Setting sessionclosed_isok handles this scenario:
+ *
+ * 1. In the reconnect phase, the client sends unsafe requests to the MDS.
+ * 2. The reconnect timeout is reached. All sessions that did not send a
+ *    reconnect message in time (some of which may have sent unsafe requests)
+ *    are marked as closed. (Another situation is #31668, which denies all
+ *    client reconnect messages to speed up reboot.)
+ * 3. The unsafe requests from sessions that did not reconnect in time, or
+ *    that were denied, can then still be handled in the clientreplay phase.
+ */
+ bool sessionclosed_isok = replay_unsafe_with_closed_session;
+ // Is the MDS active? Client requests arriving while it is not are
+ // dropped, queued for replay, or deferred below.
+ // handle_slave_request()/handle_client_session() will wait if necessary
+ if (m->get_type() == CEPH_MSG_CLIENT_REQUEST && !mds->is_active()) {
+ const auto &req = MClientRequest::msgref_cast(m);
+ if (mds->is_reconnect() || mds->get_want_state() == CEPH_MDS_STATE_RECONNECT) {
+ Session *session = mds->get_session(req);
+ if (!session || (!session->is_open() && !sessionclosed_isok)) {
+ dout(5) << "session is closed, dropping " << req->get_reqid() << dendl;
+ return;
+ }
+ bool queue_replay = false;
+ if (req->is_replay()) {
+ dout(3) << "queuing replayed op" << dendl;
+ queue_replay = true;
+ if (req->head.ino &&
+ !session->have_completed_request(req->get_reqid().tid, nullptr)) {
+ mdcache->add_replay_ino_alloc(inodeno_t(req->head.ino));
+ }
+ } else if (req->get_retry_attempt()) {
+ // process completed request in clientreplay stage. The completed request
+ // might have created a new file/directory. This guarantees the MDS sends a
+ // reply to the client before another request modifies the new file/directory.
+ if (session->have_completed_request(req->get_reqid().tid, NULL)) {
+ dout(3) << "queuing completed op" << dendl;
+ queue_replay = true;
+ }
+ // this request was created before the cap reconnect message, drop any embedded
+ // cap releases.
+ req->releases.clear();
+ }
+ if (queue_replay) {
+ req->mark_queued_for_replay();
+ mds->enqueue_replay(new C_MDS_RetryMessage(mds, m));
+ return;
+ }
+ }
+
+ bool wait_for_active = true;
+ if (mds->is_stopping()) {
+ wait_for_active = false;
+ } else if (mds->is_clientreplay()) {
+ if (req->is_queued_for_replay()) { // replay-queued requests proceed during clientreplay
+ wait_for_active = false;
+ }
+ }
+ if (wait_for_active) {
+ dout(3) << "not active yet, waiting" << dendl;
+ mds->wait_for_active(new C_MDS_RetryMessage(mds, m));
+ return;
+ }
+ }
+
+ switch (m->get_type()) { // route the message to its handler by type
+ case CEPH_MSG_CLIENT_SESSION:
+ handle_client_session(MClientSession::msgref_cast(m));
+ return;
+ case CEPH_MSG_CLIENT_REQUEST:
+ handle_client_request(MClientRequest::msgref_cast(m));
+ return;
+ case CEPH_MSG_CLIENT_RECLAIM:
+ handle_client_reclaim(MClientReclaim::msgref_cast(m));
+ return;
+ case MSG_MDS_SLAVE_REQUEST:
+ handle_slave_request(MMDSSlaveRequest::msgref_cast(m));
+ return;
+ default: // any other message type is treated as fatal
+ derr << "server unknown message " << m->get_type() << dendl;
+ ceph_abort_msg("server unknown message");
+ }
+}
+
+
+
+// ----------------------------------------------------------
+// SESSION management
+
+/**
+ * Journal completion for a session state change: calls
+ * Server::_session_logged() with the captured arguments and then completes
+ * the optional follow-up context.
+ */
+class C_MDS_session_finish : public ServerLogContext {
+  Session *session;
+  uint64_t state_seq;
+  bool open;
+  version_t cmapv;
+  interval_set<inodeno_t> inos;
+  version_t inotablev;
+  Context *fin;
+
+public:
+  // variant without an inode set: inos stays empty and inotablev is 0
+  C_MDS_session_finish(Server *srv, Session *se, uint64_t sseq, bool s,
+                       version_t mv, Context *fin_ = nullptr)
+    : ServerLogContext(srv),
+      session(se), state_seq(sseq), open(s), cmapv(mv),
+      inotablev(0), fin(fin_)
+  {
+  }
+
+  // variant carrying an inode set and the inotable version
+  C_MDS_session_finish(Server *srv, Session *se, uint64_t sseq, bool s,
+                       version_t mv, interval_set<inodeno_t>& i,
+                       version_t iv, Context *fin_ = nullptr)
+    : ServerLogContext(srv),
+      session(se), state_seq(sseq), open(s), cmapv(mv),
+      inos(i), inotablev(iv), fin(fin_)
+  {
+  }
+
+  void finish(int r) override {
+    ceph_assert(r == 0);
+    server->_session_logged(session, state_seq, open, cmapv, inos, inotablev);
+    if (fin != nullptr) {
+      fin->complete(r);
+    }
+  }
+};
+
+/**
+ * Look up the session whose client metadata carries the given "uuid".
+ *
+ * Two sessions may match while a reclaim is in progress: the session being
+ * reclaimed and the session reclaiming it.  In that case the reclaiming
+ * session (the one whose reclaiming_from is set) is returned.
+ *
+ * @param uuid  value compared against each session's "uuid" metadata entry
+ * @return the matching session, or nullptr when no session matches
+ */
+Session* Server::find_session_by_uuid(std::string_view uuid)
+{
+  Session *found = nullptr;
+  for (auto& entry : mds->sessionmap.get_sessions()) {
+    Session *candidate = entry.second;
+    auto& md = candidate->info.client_metadata;
+
+    auto uuid_it = md.find("uuid");
+    const bool matches = uuid_it != md.end() && uuid_it->second == uuid;
+    if (!matches)
+      continue;
+
+    if (!found) {
+      found = candidate;
+      continue;
+    }
+
+    if (found->reclaiming_from) {
+      // we already hold the reclaimer; the other match must be its target
+      assert(found->reclaiming_from == candidate);
+    } else {
+      // the earlier match is the target; prefer the session reclaiming it
+      assert(candidate->reclaiming_from == found);
+      found = candidate;
+    }
+  }
+  return found;
+}
+
+/**
+ * Handle a client's request to reclaim the session advertising the uuid
+ * carried in @p m.
+ *
+ * The request is dropped silently unless the requesting session is open or
+ * stale.  Otherwise a MClientReclaimReply is always sent:
+ *  - -EINVAL      when the message carries no uuid,
+ *  - -EOPNOTSUPP  when the flags are anything other than CEPH_RECLAIM_RESET,
+ *  - -EPERM       when a session with that uuid exists but belongs to a
+ *                 different auth_name,
+ *  - otherwise the reply is produced by finish_reclaim_session().
+ *
+ * @param session  session that sent the reclaim request
+ * @param m        the reclaim message
+ */
+void Server::reclaim_session(Session *session, const MClientReclaim::const_ref &m)
+{
+  if (!session->is_open() && !session->is_stale()) {
+    dout(10) << "session not open, dropping this req" << dendl;
+    return;
+  }
+
+  auto reply = MClientReclaimReply::create(0);
+  if (m->get_uuid().empty()) {
+    dout(10) << __func__ << " invalid message (no uuid)" << dendl;
+    reply->set_result(-EINVAL);
+    mds->send_message_client(reply, session);
+    return;
+  }
+
+  unsigned flags = m->get_flags();
+  if (flags != CEPH_RECLAIM_RESET) { // currently only support reset
+    dout(10) << __func__ << " unsupported flags" << dendl;
+    reply->set_result(-EOPNOTSUPP);
+    mds->send_message_client(reply, session);
+    return;
+  }
+
+  Session* target = find_session_by_uuid(m->get_uuid());
+  if (target) {
+    if (session->info.auth_name != target->info.auth_name) {
+      dout(10) << __func__ << " session auth_name " << session->info.auth_name
+               << " != target auth_name " << target->info.auth_name << dendl;
+      reply->set_result(-EPERM);
+      mds->send_message_client(reply, session);
+      // Stop here: without this return the denied client would still be
+      // linked to the target below and the target session would be evicted
+      // on its behalf (and the same reply would be sent a second time).
+      return;
+    }
+
+    assert(!target->reclaiming_from);
+    assert(!session->reclaiming_from);
+    session->reclaiming_from = target;
+    reply->set_addrs(entity_addrvec_t(target->info.inst.addr));
+  }
+
+  if (flags & CEPH_RECLAIM_RESET) {
+    finish_reclaim_session(session, reply);
+    return;
+  }
+
+  // unreachable: flags == CEPH_RECLAIM_RESET was enforced above
+  ceph_abort();
+}
+
+/**
+ * Complete a reclaim started by reclaim_session().
+ *
+ * If @p session has a pending reclaim target, the link is cleared and the
+ * target is either killed (already blacklisted, or blacklist-on-evict is
+ * disabled) or evicted through MDSRank::evict_client().  When @p reply is
+ * set it is sent to the reclaiming client once that operation completes,
+ * stamped with the current OSD map epoch.  Without a target the reply, if
+ * any, is sent immediately.
+ *
+ * @param session  the reclaiming session
+ * @param reply    reply to deliver to the client; may be null
+ */
+void Server::finish_reclaim_session(Session *session, const MClientReclaimReply::ref &reply)
+{
+  Session *target = session->reclaiming_from;
+  if (!target) {
+    if (reply)
+      mds->send_message_client(reply, session);
+    return;
+  }
+
+  session->reclaiming_from = nullptr;
+
+  Context *send_reply = nullptr;
+  if (reply) {
+    // Remember the id rather than the pointer: the callback looks the
+    // session up again and gives up if it no longer exists.
+    int64_t session_id = session->get_client().v;
+    send_reply = new FunctionContext([this, session_id, reply](int r) {
+      assert(mds->mds_lock.is_locked_by_me());
+      Session *s = mds->sessionmap.get_session(entity_name_t::CLIENT(session_id));
+      if (!s) {
+        return;
+      }
+      auto epoch = mds->objecter->with_osdmap([](const OSDMap &map){ return map.get_epoch(); });
+      reply->set_epoch(epoch);
+      mds->send_message_client(reply, s);
+    });
+  }
+
+  bool blacklisted = mds->objecter->with_osdmap([target](const OSDMap &map) {
+    return map.is_blacklisted(target->info.inst.addr);
+  });
+
+  if (!blacklisted && g_conf()->mds_session_blacklist_on_evict) {
+    std::stringstream ss;
+    mds->evict_client(target->get_client().v, false, true, ss, send_reply);
+  } else {
+    kill_session(target, send_reply);
+  }
+}
+
+/**
+ * Entry point for MClientReclaim messages.
+ *
+ * Messages without a session are ignored.  While the MDS state is below
+ * CLIENTREPLAY the message is queued for retry via wait_for_replay().
+ * Otherwise FLAG_FINISH selects finish_reclaim_session(); anything else
+ * goes to reclaim_session().
+ */
+void Server::handle_client_reclaim(const MClientReclaim::const_ref &m)
+{
+  Session *session = mds->get_session(m);
+  dout(3) << __func__ << " " << *m << " from " << m->get_source() << dendl;
+  assert(m->get_source().is_client()); // should _not_ come from an mds!
+
+  if (!session) {
+    dout(0) << " ignoring sessionless msg " << *m << dendl;
+    return;
+  }
+
+  if (mds->get_state() < MDSMap::STATE_CLIENTREPLAY) {
+    mds->wait_for_replay(new C_MDS_RetryMessage(mds, m));
+    return;
+  }
+
+  const bool finishing = m->get_flags() & MClientReclaim::FLAG_FINISH;
+  if (!finishing) {
+    reclaim_session(session, m);
+    return;
+  }
+  finish_reclaim_session(session);
+}
+
+/**
+ * Entry point for MClientSession messages coming from clients.
+ *
+ * Messages without a session are answered with CEPH_SESSION_REJECT.
+ * RENEWCAPS is always processed; CLOSE is deferred until the MDS is
+ * active; every other op is deferred until the MDS has reached
+ * clientreplay.  The op is then dispatched:
+ *  - REQUEST_OPEN:        validate the client (blacklist, required
+ *                         features, claimed "root", duplicate "uuid") and
+ *                         journal an ESession to open the session
+ *  - REQUEST_RENEWCAPS:   touch the session, reviving it if it was stale
+ *  - REQUEST_CLOSE:       journal the close once the push seq checks out
+ *  - FLUSHMSG_ACK:        release the waiters of flush_session()
+ *  - REQUEST_FLUSH_MDLOG: flush the journal if the MDS is active
+ * Any other op aborts.
+ */
+void Server::handle_client_session(const MClientSession::const_ref &m)
+{
+  Session *session = mds->get_session(m);
+
+  dout(3) << "handle_client_session " << *m << " from " << m->get_source() << dendl;
+  ceph_assert(m->get_source().is_client()); // should _not_ come from an mds!
+
+  if (!session) {
+    dout(0) << " ignoring sessionless msg " << *m << dendl;
+    auto reply = MClientSession::create(CEPH_SESSION_REJECT);
+    reply->metadata["error_string"] = "sessionless";
+    mds->send_message(reply, m->get_connection());
+    return;
+  }
+
+  if (m->get_op() == CEPH_SESSION_REQUEST_RENEWCAPS) {
+    // always handle renewcaps (state >= MDSMap::STATE_RECONNECT)
+  } else if (m->get_op() == CEPH_SESSION_REQUEST_CLOSE) {
+    // close requests need to be handled when mds is active
+    if (mds->get_state() < MDSMap::STATE_ACTIVE) {
+      mds->wait_for_active(new C_MDS_RetryMessage(mds, m));
+      return;
+    }
+  } else {
+    if (mds->get_state() < MDSMap::STATE_CLIENTREPLAY) {
+      mds->wait_for_replay(new C_MDS_RetryMessage(mds, m));
+      return;
+    }
+  }
+
+  if (logger)
+    logger->inc(l_mdss_handle_client_session);
+
+  switch (m->get_op()) {
+  case CEPH_SESSION_REQUEST_OPEN:
+    if (session->is_opening() ||
+        session->is_open() ||
+        session->is_stale() ||
+        session->is_killing() ||
+        terminating_sessions) {
+      dout(10) << "currently open|opening|stale|killing, dropping this req" << dendl;
+      return;
+    }
+    ceph_assert(session->is_closed() || session->is_closing());
+
+    if (mds->is_stopping()) {
+      dout(10) << "mds is stopping, dropping open req" << dendl;
+      return;
+    }
+
+    {
+      auto& addr = session->info.inst.addr;
+      session->set_client_metadata(client_metadata_t(m->metadata, m->supported_features));
+      auto& client_metadata = session->info.client_metadata;
+
+      // Emits one summary line per open attempt (accepted or rejected).
+      auto log_session_status = [this, m, session](std::string_view status, std::string_view err) {
+        auto now = ceph_clock_now();
+        auto throttle_elapsed = m->get_recv_complete_stamp() - m->get_throttle_stamp();
+        auto elapsed = now - m->get_recv_stamp();
+        CachedStackStringStream css;
+        *css << "New client session:"
+             << " addr=\"" << session->info.inst.addr << "\""
+             << ",elapsed=" << elapsed
+             << ",throttled=" << throttle_elapsed
+             << ",status=\"" << status << "\"";
+        if (!err.empty()) {
+          *css << ",error=\"" << err << "\"";
+        }
+        const auto& metadata = session->info.client_metadata;
+        if (auto it = metadata.find("root"); it != metadata.end()) {
+          *css << ",root=\"" << it->second << "\"";
+        }
+        dout(2) << css->strv() << dendl;
+      };
+
+      // Sends CEPH_SESSION_REJECT; the reason string is only attached for
+      // clients that advertise CEPHFS_FEATURE_MIMIC.
+      auto send_reject_message = [this, &session, &log_session_status](std::string_view err_str) {
+        auto reject = MClientSession::create(CEPH_SESSION_REJECT);
+        if (session->info.has_feature(CEPHFS_FEATURE_MIMIC))
+          reject->metadata["error_string"] = err_str;
+        mds->send_message_client(reject, session);
+        log_session_status("REJECTED", err_str);
+      };
+
+      bool blacklisted = mds->objecter->with_osdmap(
+          [&addr](const OSDMap &osd_map) -> bool {
+            return osd_map.is_blacklisted(addr);
+          });
+
+      if (blacklisted) {
+        dout(10) << "rejecting blacklisted client " << addr << dendl;
+        send_reject_message("blacklisted");
+        session->clear();
+        break;
+      }
+
+      if (client_metadata.features.empty())
+        infer_supported_features(session, client_metadata);
+
+      dout(20) << __func__ << " CEPH_SESSION_REQUEST_OPEN metadata entries:" << dendl;
+      // closing quote added; the original line left the quote unbalanced
+      dout(20) << " features: '" << client_metadata.features << "'" << dendl;
+      for (const auto& p : client_metadata) {
+        dout(20) << " " << p.first << ": " << p.second << dendl;
+      }
+
+      feature_bitset_t missing_features = required_client_features;
+      missing_features -= client_metadata.features;
+      if (!missing_features.empty()) {
+        stringstream ss;
+        ss << "missing required features '" << missing_features << "'";
+        send_reject_message(ss.str());
+        mds->clog->warn() << "client session (" << session->info.inst
+                          << ") lacks required features " << missing_features
+                          << "; client supports " << client_metadata.features;
+        session->clear();
+        break;
+      }
+
+      // Special case for the 'root' metadata path; validate that the claimed
+      // root is actually within the caps of the session
+      if (auto it = client_metadata.find("root"); it != client_metadata.end()) {
+        auto claimed_root = it->second;
+        stringstream ss;
+        bool denied = false;
+        // claimed_root has a leading "/" which we strip before passing
+        // into caps check
+        if (claimed_root.empty() || claimed_root[0] != '/') {
+          denied = true;
+          // message typo fixed ("invalue" -> "invalid")
+          ss << "invalid root '" << claimed_root << "'";
+        } else if (!session->auth_caps.path_capable(claimed_root.substr(1))) {
+          denied = true;
+          ss << "non-allowable root '" << claimed_root << "'";
+        }
+
+        if (denied) {
+          // Tell the client we're rejecting their open
+          send_reject_message(ss.str());
+          mds->clog->warn() << "client session with " << ss.str()
+                            << " denied (" << session->info.inst << ")";
+          session->clear();
+          break;
+        }
+      }
+
+      if (auto it = client_metadata.find("uuid"); it != client_metadata.end()) {
+        if (find_session_by_uuid(it->second)) {
+          send_reject_message("duplicated session uuid");
+          mds->clog->warn() << "client session with duplicated session uuid '"
+                            << it->second << "' denied (" << session->info.inst << ")";
+          session->clear();
+          break;
+        }
+      }
+
+      if (session->is_closed())
+        mds->sessionmap.add_session(session);
+
+      // pv/sseq are only meaningful for the open path, so they live here
+      // rather than (uninitialized) at function scope.
+      version_t pv = mds->sessionmap.mark_projected(session);
+      uint64_t sseq = mds->sessionmap.set_state(session, Session::STATE_OPENING);
+      mds->sessionmap.touch_session(session);
+      auto fin = new FunctionContext([log_session_status = std::move(log_session_status)](int r){
+        ceph_assert(r == 0);
+        log_session_status("ACCEPTED", "");
+      });
+      mdlog->start_submit_entry(new ESession(m->get_source_inst(), true, pv, client_metadata),
+                                new C_MDS_session_finish(this, session, sseq, true, pv, fin));
+      mdlog->flush();
+    }
+    break;
+
+  case CEPH_SESSION_REQUEST_RENEWCAPS:
+    if (session->is_open() || session->is_stale()) {
+      mds->sessionmap.touch_session(session);
+      if (session->is_stale()) {
+        mds->sessionmap.set_state(session, Session::STATE_OPEN);
+        mds->locker->resume_stale_caps(session);
+        mds->sessionmap.touch_session(session);
+      }
+      auto reply = MClientSession::create(CEPH_SESSION_RENEWCAPS, m->get_seq());
+      mds->send_message_client(reply, session);
+    } else {
+      dout(10) << "ignoring renewcaps on non open|stale session (" << session->get_state_name() << ")" << dendl;
+    }
+    break;
+
+  case CEPH_SESSION_REQUEST_CLOSE:
+    {
+      if (session->is_closed() ||
+          session->is_closing() ||
+          session->is_killing()) {
+        dout(10) << "already closed|closing|killing, dropping this req" << dendl;
+        return;
+      }
+      if (session->is_importing()) {
+        dout(10) << "ignoring close req on importing session" << dendl;
+        return;
+      }
+      ceph_assert(session->is_open() ||
+                  session->is_stale() ||
+                  session->is_opening());
+      if (m->get_seq() < session->get_push_seq()) {
+        dout(10) << "old push seq " << m->get_seq() << " < " << session->get_push_seq()
+                 << ", dropping" << dendl;
+        return;
+      }
+      // We are getting a seq that is higher than expected.
+      // Handle the same as any other seqn error.
+      //
+      if (m->get_seq() != session->get_push_seq()) {
+        // this branch is the "too new" case, so do not call the seq "old";
+        // wording now matches the cluster log line below
+        dout(0) << "incorrect push seq " << m->get_seq() << " != " << session->get_push_seq()
+                << ", BUGGY!" << dendl;
+        mds->clog->warn() << "incorrect push seq " << m->get_seq() << " != "
+                          << session->get_push_seq() << ", dropping" << " from client : " << session->get_human_name();
+        return;
+      }
+      journal_close_session(session, Session::STATE_CLOSING, NULL);
+    }
+    break;
+
+  case CEPH_SESSION_FLUSHMSG_ACK:
+    finish_flush_session(session, m->get_seq());
+    break;
+
+  case CEPH_SESSION_REQUEST_FLUSH_MDLOG:
+    if (mds->is_active())
+      mdlog->flush();
+    break;
+
+  default:
+    ceph_abort();
+  }
+}
+
+
+/**
+ * Ask a client to acknowledge a flush marker.
+ *
+ * Nothing happens unless the session is open and has a connection that
+ * advertises CEPH_FEATURE_EXPORT_PEER.  Otherwise a new sub-context of
+ * @p gather is registered as a flush waiter on the session and a
+ * CEPH_SESSION_FLUSHMSG carrying the returned seq is sent to the client.
+ */
+void Server::flush_session(Session *session, MDSGatherBuilder& gather) {
+  if (!session->is_open())
+    return;
+  if (!session->get_connection())
+    return;
+  if (!session->get_connection()->has_feature(CEPH_FEATURE_EXPORT_PEER))
+    return;
+
+  version_t seq = session->wait_for_flush(gather.new_sub());
+  auto flush_msg = MClientSession::create(CEPH_SESSION_FLUSHMSG, seq);
+  mds->send_message_client(flush_msg, session);
+}
+
+/**
+ * Call flush_session() for every client in @p client_set.
+ * Each client must have a session in the session map (asserted).
+ */
+void Server::flush_client_sessions(set<client_t>& client_set, MDSGatherBuilder& gather)
+{
+  for (auto it = client_set.begin(); it != client_set.end(); ++it) {
+    const auto name = entity_name_t::CLIENT(it->v);
+    Session *session = mds->sessionmap.get_session(name);
+    ceph_assert(session);
+    flush_session(session, gather);
+  }
+}
+
+/**
+ * Collect the session's flush waiters for @p seq via Session::finish_flush()
+ * and hand them to the MDS waiter queue.
+ */
+void Server::finish_flush_session(Session *session, version_t seq)
+{
+  MDSContext::vec ready;
+  session->finish_flush(seq, ready);
+  mds->queue_waiters(ready);
+}
+
+/**
+ * Apply a session open/close after its ESession entry has been journaled.
+ *
+ * @param session    session the entry refers to
+ * @param state_seq  session state seq captured when the entry was submitted;
+ *                   if the session has moved on since, only the bookkeeping
+ *                   at the top is performed
+ * @param open       true for an open entry, false for a close/kill entry
+ * @param pv         projected sessionmap version (only logged here)
+ * @param inos       inos released with the session (used when @p piv != 0)
+ * @param piv        projected inotable version, or 0 when no inos are released
+ */
+void Server::_session_logged(Session *session, uint64_t state_seq, bool open, version_t pv,
+                             interval_set<inodeno_t>& inos, version_t piv)
+{
+  dout(10) << "_session_logged " << session->info.inst << " state_seq " << state_seq << " " << (open ? "open":"close")
+           << " " << pv << dendl;
+
+  if (piv) {
+    ceph_assert(session->is_closing() || session->is_killing() ||
+                session->is_opening()); // re-open closing session
+    session->info.prealloc_inos.subtract(inos);
+    mds->inotable->apply_release_ids(inos);
+    ceph_assert(mds->inotable->get_version() == piv);
+  }
+
+  mds->sessionmap.mark_dirty(session);
+
+  // apply
+  if (session->get_state_seq() != state_seq) {
+    dout(10) << " journaled state_seq " << state_seq << " != current " << session->get_state_seq()
+             << ", noop" << dendl;
+    // close must have been canceled (by an import?), or any number of other things..
+    return;
+  }
+
+  if (open) {
+    ceph_assert(session->is_opening());
+    mds->sessionmap.set_state(session, Session::STATE_OPEN);
+    mds->sessionmap.touch_session(session);
+    ceph_assert(session->get_connection());
+    auto reply = MClientSession::create(CEPH_SESSION_OPEN);
+    if (session->info.has_feature(CEPHFS_FEATURE_MIMIC))
+      reply->supported_features = supported_features;
+    mds->send_message_client(reply, session);
+    if (mdcache->is_readonly()) {
+      auto ro_msg = MClientSession::create(CEPH_SESSION_FORCE_RO);
+      mds->send_message_client(ro_msg, session);
+    }
+    return;
+  }
+
+  if (!session->is_closing() && !session->is_killing())
+    ceph_abort();
+
+  // kill any lingering capabilities, leases, requests
+  bool killing = session->is_killing();
+  while (!session->caps.empty()) {
+    Capability *cap = session->caps.front();
+    CInode *in = cap->get_inode();
+    dout(20) << " killing capability " << ccap_string(cap->issued()) << " on " << *in << dendl;
+    mds->locker->remove_client_cap(in, cap, killing);
+  }
+  while (!session->leases.empty()) {
+    ClientLease *lease = session->leases.front();
+    CDentry *dn = static_cast<CDentry*>(lease->parent);
+    dout(20) << " killing client lease of " << *dn << dendl;
+    dn->remove_client_lease(lease, mds->locker);
+  }
+
+  // the client can no longer take part in reconnect or reclaim
+  if (client_reconnect_gather.erase(session->info.get_client())) {
+    dout(20) << " removing client from reconnect set" << dendl;
+    if (client_reconnect_gather.empty()) {
+      dout(7) << " client " << session->info.inst << " was last reconnect, finishing" << dendl;
+      reconnect_gather_finish();
+    }
+  }
+  if (client_reclaim_gather.erase(session->info.get_client())) {
+    dout(20) << " removing client from reclaim set" << dendl;
+    if (client_reclaim_gather.empty()) {
+      dout(7) << " client " << session->info.inst << " was last reclaimed, finishing" << dendl;
+      mds->maybe_clientreplay_done();
+    }
+  }
+
+  if (session->is_closing()) {
+    // mark con disposable.  if there is a fault, we will get a
+    // reset and clean it up.  if the client hasn't received the
+    // CLOSE message yet, they will reconnect and get an
+    // ms_handle_remote_reset() and realize they had in fact closed.
+    // do this *before* sending the message to avoid a possible
+    // race.
+    if (session->get_connection()) {
+      // Conditional because terminate_sessions will indiscriminately
+      // put sessions in CLOSING whether they ever had a conn or not.
+      session->get_connection()->mark_disposable();
+    }
+
+    // reset session
+    mds->send_message_client(MClientSession::create(CEPH_SESSION_CLOSE), session);
+    mds->sessionmap.set_state(session, Session::STATE_CLOSED);
+    session->clear();
+    mds->sessionmap.remove_session(session);
+  } else if (session->is_killing()) {
+    // destroy session, close connection
+    if (session->get_connection()) {
+      session->get_connection()->mark_down();
+      mds->sessionmap.set_state(session, Session::STATE_CLOSED);
+      session->set_connection(nullptr);
+    }
+    mds->sessionmap.remove_session(session);
+  } else {
+    ceph_abort();
+  }
+}
+
+/**
+ * Project sessions that originate from something other than a live
+ * client connection, for example:
+ *  - sessions inferred from journal replay
+ *  - sessions learned from other MDSs during rejoin
+ *  - sessions learned from other MDSs during dir/caps migration
+ *  - sessions learned from other MDSs during a cross-MDS rename
+ *
+ * Blacklisted clients are removed from both @p cm and @p cmm first.  For
+ * each remaining client the session is looked up (or added), projected and
+ * marked as importing.  Sessions that were closed/closing/killing move to
+ * OPENING and get their metadata merged from @p cmm; their new state seq is
+ * stored in @p smap.  Sessions that are already open/opening/stale get a
+ * seq of 0 in @p smap.
+ *
+ * @return the projected sessionmap version after the last session
+ */
+version_t Server::prepare_force_open_sessions(map<client_t,entity_inst_t>& cm,
+                                              map<client_t,client_metadata_t>& cmm,
+                                              map<client_t, pair<Session*,uint64_t> >& smap)
+{
+  version_t pv = mds->sessionmap.get_projected();
+
+  dout(10) << "prepare_force_open_sessions " << pv
+           << " on " << cm.size() << " clients"
+           << dendl;
+
+  // drop blacklisted clients up front
+  mds->objecter->with_osdmap(
+    [this, &cm, &cmm](const OSDMap &osd_map) {
+      auto p = cm.begin();
+      while (p != cm.end()) {
+        if (!osd_map.is_blacklisted(p->second.addr)) {
+          ++p;
+          continue;
+        }
+        dout(10) << " ignoring blacklisted client." << p->first
+                 << " (" << p->second.addr << ")" << dendl;
+        cmm.erase(p->first);
+        p = cm.erase(p);
+      }
+    });
+
+  for (auto& entry : cm) {
+    const client_t& client = entry.first;
+    Session *session = mds->sessionmap.get_or_add_session(entry.second);
+    pv = mds->sessionmap.mark_projected(session);
+
+    uint64_t sseq = 0;
+    const bool needs_open = session->is_closed() ||
+                            session->is_closing() ||
+                            session->is_killing();
+    if (needs_open) {
+      sseq = mds->sessionmap.set_state(session, Session::STATE_OPENING);
+      auto md = cmm.find(client);
+      if (md != cmm.end())
+        session->info.client_metadata.merge(md->second);
+    } else {
+      ceph_assert(session->is_open() ||
+                  session->is_opening() ||
+                  session->is_stale());
+    }
+    smap[client] = make_pair(session, sseq);
+    session->inc_importing();
+  }
+  return pv;
+}
+
+/**
+ * Second half of prepare_force_open_sessions().
+ *
+ * For every entry in @p smap: a non-zero seq that still matches the
+ * session's current state seq moves the session to OPEN and notifies the
+ * client; a non-zero seq that no longer matches is skipped; a zero seq
+ * means the session was already open or stale.  Each session is marked
+ * dirty, and its importing count is dropped when @p dec_import is true.
+ */
+void Server::finish_force_open_sessions(const map<client_t,pair<Session*,uint64_t> >& smap,
+                                        bool dec_import)
+{
+  /*
+   * FIXME: need to carefully consider the race conditions between a
+   * client trying to close a session and an MDS doing an import
+   * trying to force open a session...
+   */
+  dout(10) << "finish_force_open_sessions on " << smap.size() << " clients,"
+           << " initial v " << mds->sessionmap.get_version() << dendl;
+
+  for (const auto& kv : smap) {
+    Session *session = kv.second.first;
+    const uint64_t expected_seq = kv.second.second;
+
+    if (expected_seq == 0) {
+      dout(10) << "force_open_sessions skipping already-open " << session->info.inst << dendl;
+      ceph_assert(session->is_open() || session->is_stale());
+    } else if (session->get_state_seq() != expected_seq) {
+      dout(10) << "force_open_sessions skipping changed " << session->info.inst << dendl;
+    } else {
+      dout(10) << "force_open_sessions opened " << session->info.inst << dendl;
+      mds->sessionmap.set_state(session, Session::STATE_OPEN);
+      mds->sessionmap.touch_session(session);
+
+      auto reply = MClientSession::create(CEPH_SESSION_OPEN);
+      if (session->info.has_feature(CEPHFS_FEATURE_MIMIC))
+        reply->supported_features = supported_features;
+      mds->send_message_client(reply, session);
+
+      if (mdcache->is_readonly())
+        mds->send_message_client(MClientSession::create(CEPH_SESSION_FORCE_RO), session);
+    }
+
+    if (dec_import) {
+      session->dec_importing();
+    }
+
+    mds->sessionmap.mark_dirty(session);
+  }
+
+  dout(10) << __func__ << ": final v " << mds->sessionmap.get_version() << dendl;
+}
+
+/**
+ * Context that clears Server::terminating_sessions when it finishes.
+ * terminate_sessions() queues it with mdlog->wait_for_safe().
+ */
+class C_MDS_TerminatedSessions : public ServerContext {
+public:
+  explicit C_MDS_TerminatedSessions(Server *s) : ServerContext(s) {}
+
+private:
+  void finish(int r) override {
+    server->terminating_sessions = false;
+  }
+};
+
+/**
+ * Journal a close for every client session that is not already
+ * closing, killing or closed.  terminating_sessions stays set until the
+ * journal is safe (cleared by C_MDS_TerminatedSessions).
+ */
+void Server::terminate_sessions()
+{
+  dout(5) << "terminating all sessions..." << dendl;
+
+  terminating_sessions = true;
+
+  // kill them off.  clients will retry etc.
+  set<Session*> sessions;
+  mds->sessionmap.get_client_session_set(sessions);
+  for (Session *session : sessions) {
+    const bool already_going = session->is_closing() ||
+                               session->is_killing() ||
+                               session->is_closed();
+    if (already_going)
+      continue;
+    journal_close_session(session, Session::STATE_CLOSING, NULL);
+  }
+
+  mdlog->wait_for_safe(new C_MDS_TerminatedSessions(this));
+}
+
+
+/**
+ * Periodic scan for unresponsive client sessions.
+ *
+ * Pass 1 walks the OPEN sessions and sorts those that have not renewed
+ * caps within the session timeout (plus the dispatch queue's max age) into
+ * "mark stale" or "evict".  Pass 2 walks the STALE sessions and collects
+ * those past the autoclose threshold.  Finally every collected session that
+ * is not being imported is evicted, either by blacklisting or by
+ * kill_session(), depending on mds_session_blacklist_on_timeout.
+ *
+ * Nothing is done if the MDS itself cleared its laggy state more recently
+ * than the stale cutoff.
+ */
+void Server::find_idle_sessions()
+{
+  auto now = clock::now();
+  auto last_cleared_laggy = mds->last_cleared_laggy();
+
+  // seconds elapsed between a past time point and 'now'
+  auto seconds_since = [&now](const auto& since) {
+    return std::chrono::duration<double>(now - since).count();
+  };
+
+  dout(10) << "find_idle_sessions. last cleared laggy state " << last_cleared_laggy << "s ago" << dendl;
+
+  // timeout/stale
+  //  (caps go stale, lease die)
+  double queue_max_age = mds->get_dispatch_queue_max_age(ceph_clock_now());
+  double stale_cutoff = queue_max_age + mds->mdsmap->get_session_timeout();
+
+  // don't kick clients if we've been laggy
+  if (last_cleared_laggy < stale_cutoff) {
+    dout(10) << " last cleared laggy " << last_cleared_laggy << "s ago (< cutoff " << stale_cutoff
+             << "), not marking any client stale" << dendl;
+    return;
+  }
+
+  std::vector<Session*> to_evict;
+
+  bool defer_session_stale = g_conf().get_val<bool>("mds_defer_session_stale");
+  const auto open_list = mds->sessionmap.by_state.find(Session::STATE_OPEN);
+  if (open_list != mds->sessionmap.by_state.end() && !open_list->second->empty()) {
+    std::vector<Session*> new_stale;
+
+    for (auto session : *(open_list->second)) {
+      auto idle = seconds_since(session->last_cap_renew);
+      if (idle < stale_cutoff) {
+        dout(20) << "laggiest active session is " << session->info.inst
+                 << " and renewed caps recently (" << idle << "s ago)" << dendl;
+        break;
+      }
+
+      // the client may have been heard from more recently than its last renew
+      if (session->last_seen > session->last_cap_renew) {
+        idle = seconds_since(session->last_seen);
+        if (idle < stale_cutoff) {
+          dout(20) << "laggiest active session is " << session->info.inst
+                   << " and renewed caps recently (" << idle << "s ago)" << dendl;
+          continue;
+        }
+      }
+
+      if (idle >= mds->mdsmap->get_session_autoclose()) {
+        dout(20) << "evicting session " << session->info.inst << " since autoclose "
+                    "has arrived" << dendl;
+        // evict session without marking it stale
+        to_evict.push_back(session);
+        continue;
+      }
+
+      if (defer_session_stale &&
+          !session->is_any_flush_waiter() &&
+          !mds->locker->is_revoking_any_caps_from(session->get_client())) {
+        dout(20) << "deferring marking session " << session->info.inst << " stale "
+                    "since it holds no caps" << dendl;
+        continue;
+      }
+
+      auto timeout_md = session->info.client_metadata.find("timeout");
+      if (timeout_md == session->info.client_metadata.end()) {
+        dout(10) << "new stale session " << session->info.inst
+                 << " last renewed caps " << idle << "s ago" << dendl;
+        new_stale.push_back(session);
+        continue;
+      }
+
+      // the client supplied its own timeout in the session metadata
+      unsigned timeout = strtoul(timeout_md->second.c_str(), nullptr, 0);
+      if (timeout == 0) {
+        dout(10) << "skipping session " << session->info.inst
+                 << ", infinite timeout specified" << dendl;
+        continue;
+      }
+      double client_cutoff = queue_max_age + timeout;
+      if (idle < client_cutoff) {
+        dout(10) << "skipping session " << session->info.inst
+                 << ", timeout (" << timeout << ") specified"
+                 << " and renewed caps recently (" << idle << "s ago)" << dendl;
+        continue;
+      }
+
+      // do not go through stale, evict it directly.
+      to_evict.push_back(session);
+    }
+
+    for (auto session : new_stale) {
+      mds->sessionmap.set_state(session, Session::STATE_STALE);
+      if (!mds->locker->revoke_stale_caps(session)) {
+        to_evict.push_back(session);
+        continue;
+      }
+      mds->locker->remove_stale_leases(session);
+      finish_flush_session(session, session->get_push_seq());
+      auto stale_msg = MClientSession::create(CEPH_SESSION_STALE, session->get_push_seq());
+      mds->send_message_client(stale_msg, session);
+    }
+  }
+
+  // autoclose
+  double autoclose_cutoff = queue_max_age + mds->mdsmap->get_session_autoclose();
+
+  // Collect a list of sessions exceeding the autoclose threshold
+  const auto stale_list = mds->sessionmap.by_state.find(Session::STATE_STALE);
+  if (stale_list != mds->sessionmap.by_state.end() && !stale_list->second->empty()) {
+    for (auto session : *(stale_list->second)) {
+      assert(session->is_stale());
+      auto idle = seconds_since(session->last_cap_renew);
+      if (idle < autoclose_cutoff) {
+        dout(20) << "oldest stale session is " << session->info.inst
+                 << " and recently renewed caps " << idle << "s ago" << dendl;
+        break;
+      }
+      to_evict.push_back(session);
+    }
+  }
+
+  for (auto session : to_evict) {
+    if (session->is_importing()) {
+      dout(10) << "skipping session " << session->info.inst << ", it's being imported" << dendl;
+      continue;
+    }
+
+    auto idle = seconds_since(session->last_cap_renew);
+    mds->clog->warn() << "evicting unresponsive client " << *session
+                      << ", after " << idle << " seconds";
+    dout(10) << "autoclosing stale session " << session->info.inst
+             << " last renewed caps " << idle << "s ago" << dendl;
+
+    if (!g_conf()->mds_session_blacklist_on_timeout) {
+      kill_session(session, NULL);
+      continue;
+    }
+    std::stringstream ss;
+    mds->evict_client(session->get_client().v, false, true, ss, nullptr);
+  }
+}
+
+/**
+ * Evict every client the locker reports as late in answering a cap
+ * revoke, using cap_revoke_eviction_timeout as the threshold.  A timeout
+ * of zero disables the check.  Successful evictions bump the
+ * l_mdss_cap_revoke_eviction counter when a logger is present.
+ */
+void Server::evict_cap_revoke_non_responders() {
+  if (!cap_revoke_eviction_timeout) {
+    return;
+  }
+
+  std::list<client_t> late_clients;
+  mds->locker->get_late_revoking_clients(&late_clients, cap_revoke_eviction_timeout);
+
+  for (auto const &client : late_clients) {
+    mds->clog->warn() << "client id " << client << " has not responded to"
+                      << " cap revoke by MDS for over " << cap_revoke_eviction_timeout
+                      << " seconds, evicting";
+    dout(1) << __func__ << ": evicting cap revoke non-responder client id "
+            << client << dendl;
+
+    std::stringstream ss;
+    bool evicted = mds->evict_client(client.v, false,
+                                     g_conf()->mds_session_blacklist_on_evict,
+                                     ss, nullptr);
+    if (!evicted || !logger)
+      continue;
+    logger->inc(l_mdss_cap_revoke_eviction);
+  }
+}
+
+/**
+ * Refresh the Server's cached copies of configuration options.
+ *
+ * @param changed  names of the options whose values changed; only the
+ *                 options listed there are re-read from g_conf()
+ */
+void Server::handle_conf_change(const std::set<std::string>& changed) {
+  auto was_changed = [&changed](const char *key) {
+    return changed.count(key) != 0;
+  };
+
+  if (was_changed("mds_replay_unsafe_with_closed_session")) {
+    replay_unsafe_with_closed_session = g_conf().get_val<bool>("mds_replay_unsafe_with_closed_session");
+  }
+  if (was_changed("mds_cap_revoke_eviction_timeout")) {
+    cap_revoke_eviction_timeout = g_conf().get_val<double>("mds_cap_revoke_eviction_timeout");
+    dout(20) << __func__ << " cap revoke eviction timeout changed to "
+             << cap_revoke_eviction_timeout << dendl;
+  }
+  if (was_changed("mds_recall_max_decay_rate")) {
+    // the throttle is rebuilt with the new decay rate
+    recall_throttle = DecayCounter(g_conf().get_val<double>("mds_recall_max_decay_rate"));
+  }
+  if (was_changed("mds_max_snaps_per_dir")) {
+    max_snaps_per_dir = g_conf().get_val<uint64_t>("mds_max_snaps_per_dir");
+    dout(20) << __func__ << " max snapshots per directory changed to "
+             << max_snaps_per_dir << dendl;
+  }
+  if (was_changed("mds_max_caps_per_client")) {
+    max_caps_per_client = g_conf().get_val<uint64_t>("mds_max_caps_per_client");
+  }
+  if (was_changed("mds_session_cap_acquisition_throttle")) {
+    cap_acquisition_throttle = g_conf().get_val<uint64_t>("mds_session_cap_acquisition_throttle");
+  }
+  if (was_changed("mds_session_max_caps_throttle_ratio")) {
+    max_caps_throttle_ratio = g_conf().get_val<double>("mds_session_max_caps_throttle_ratio");
+  }
+  if (was_changed("mds_cap_acquisition_throttle_retry_request_timeout")) {
+    caps_throttle_retry_request_timeout = g_conf().get_val<double>("mds_cap_acquisition_throttle_retry_request_timeout");
+  }
+}
+
+/*
+ * Kill a client session.
+ *
+ * A session that is opening/open/stale and not being imported is moved to
+ * KILLING through journal_close_session().  Otherwise nothing is journaled
+ * here: if a close/kill is already in flight, on_safe is deferred until the
+ * journal is safe; if the session is closed or importing, on_safe is
+ * completed immediately with 0.  Must be called with mds_lock held.
+ *
+ * XXX bump in the interface here: on_safe is a plain Context rather than
+ * an MDSContext because all the callers right now happen to use a
+ * SaferCond.
+ */
+void Server::kill_session(Session *session, Context *on_safe)
+{
+  ceph_assert(mds->mds_lock.is_locked_by_me());
+
+  const bool live = session->is_opening() ||
+                    session->is_open() ||
+                    session->is_stale();
+  if (live && !session->is_importing()) {
+    dout(10) << "kill_session " << session << dendl;
+    journal_close_session(session, Session::STATE_KILLING, on_safe);
+    return;
+  }
+
+  dout(10) << "kill_session importing or already closing/killing " << session << dendl;
+  if (session->is_closing() || session->is_killing()) {
+    if (on_safe)
+      mdlog->wait_for_safe(new MDSInternalContextWrapper(mds, on_safe));
+    return;
+  }
+
+  ceph_assert(session->is_closed() ||
+              session->is_importing());
+  if (on_safe)
+    on_safe->complete(0);
+}
+
+/**
+ * Kill every client session whose address appears in @p blacklist.
+ *
+ * Addresses are compared as TYPE_ANY and, when the OSD map's
+ * require_osd_release is older than nautilus, also as TYPE_LEGACY.
+ * Non-client (MDS) sessions are never touched.
+ *
+ * @return the number of sessions passed to kill_session()
+ */
+size_t Server::apply_blacklist(const std::set<entity_addr_t> &blacklist)
+{
+  bool prenautilus = mds->objecter->with_osdmap(
+      [&](const OSDMap& o) {
+        return o.require_osd_release < CEPH_RELEASE_NAUTILUS;
+      });
+
+  std::vector<Session*> victims;
+  for (const auto& entry : mds->sessionmap.get_sessions()) {
+    if (!entry.first.is_client()) {
+      // Do not apply OSDMap blacklist to MDS daemons, we find out
+      // about their death via MDSMap.
+      continue;
+    }
+
+    Session *s = entry.second;
+    auto addr = s->info.inst.addr;
+
+    // blacklist entries are always TYPE_ANY for nautilus+
+    addr.set_type(entity_addr_t::TYPE_ANY);
+    bool listed = blacklist.count(addr) != 0;
+
+    if (!listed && prenautilus) {
+      // ...except pre-nautilus, they were TYPE_LEGACY
+      addr.set_type(entity_addr_t::TYPE_LEGACY);
+      listed = blacklist.count(addr) != 0;
+    }
+
+    if (listed)
+      victims.push_back(s);
+  }
+
+  for (const auto victim : victims) {
+    kill_session(victim, nullptr);
+  }
+
+  dout(10) << "apply_blacklist: killed " << victims.size() << dendl;
+
+  return victims.size();
+}
+
+/**
+ * Move a session to @p state (CLOSING or KILLING at the call sites in this
+ * file) and journal the corresponding ESession.
+ *
+ * The session's preallocated and pending-preallocated inos are projected
+ * for release in the same journal entry.  All requests still attached to
+ * the session are killed and its flush waiters are released.
+ *
+ * @param on_safe  optional context completed after _session_logged() runs
+ */
+void Server::journal_close_session(Session *session, int state, Context *on_safe)
+{
+  uint64_t sseq = mds->sessionmap.set_state(session, state);
+  version_t pv = mds->sessionmap.mark_projected(session);
+
+  // release alloc and pending-alloc inos for this session
+  // and wipe out session state, in case the session close aborts for some reason
+  interval_set<inodeno_t> to_release;
+  to_release.insert(session->info.prealloc_inos);
+  to_release.insert(session->pending_prealloc_inos);
+
+  version_t piv = 0;
+  if (to_release.size()) {
+    mds->inotable->project_release_ids(to_release);
+    piv = mds->inotable->get_projected_version();
+  }
+
+  mdlog->start_submit_entry(new ESession(session->info.inst, false, pv, to_release, piv),
+                            new C_MDS_session_finish(this, session, sseq, false, pv, to_release, piv, on_safe));
+  mdlog->flush();
+
+  // clean up requests, too
+  while (!session->requests.empty()) {
+    auto mdr = MDRequestRef(*session->requests.begin());
+    mdcache->request_kill(mdr);
+  }
+
+  finish_flush_session(session, session->get_push_seq());
+}
+
+/**
+ * Start the reconnect phase.
+ *
+ * Every open client session is added to client_reconnect_gather, flagged
+ * as reconnecting and has its last_cap_renew reset to now.  If there are no
+ * such sessions the gather is finished immediately.
+ *
+ * @param reconnect_done_  stored in reconnect_done for later use
+ */
+void Server::reconnect_clients(MDSContext *reconnect_done_)
+{
+  reconnect_done = reconnect_done_;
+
+  auto now = clock::now();
+  set<Session*> sessions;
+  mds->sessionmap.get_client_session_set(sessions);
+  for (Session *s : sessions) {
+    if (!s->is_open())
+      continue;
+    client_reconnect_gather.insert(s->get_client());
+    s->set_reconnecting(true);
+    s->last_cap_renew = now;
+  }
+
+  if (client_reconnect_gather.empty()) {
+    dout(7) << "reconnect_clients -- no sessions, doing nothing." << dendl;
+    reconnect_gather_finish();
+    return;
+  }
+
+  // clients will get the mdsmap and discover we're reconnecting via the monitor.
+
+  reconnect_start = now;
+  dout(1) << "reconnect_clients -- " << client_reconnect_gather.size() << " sessions" << dendl;
+  mds->sessionmap.dump();
+}
+/**
+ * Handle an MClientReconnect message from a client.
+ *
+ * Rejected outright: messages with no session (REJECT reply) and messages
+ * from a session that is not open (CLOSE reply). If the MDS wants to be in
+ * reconnect state but is not there yet, the message is queued for retry.
+ * If the MDS is not reconnecting (or is already evicting), the MDS cache is
+ * read-only, or the client lacks required features, the session is sent a
+ * CLOSE and killed.
+ *
+ * Otherwise the snaprealms and caps carried by the message are recorded
+ * with the MDCache. When the message is the last one of the sequence
+ * (!has_more()) the client gets an OPEN reply and is removed from
+ * client_reconnect_gather; emptying that set finishes the gather.
+ */
+void Server::handle_client_reconnect(const MClientReconnect::const_ref &m)
+{
+ dout(7) << "handle_client_reconnect " << m->get_source()
+ << (m->has_more() ? " (more)" : "") << dendl;
+ client_t from = m->get_source().num();
+ Session *session = mds->get_session(m);
+ if (!session) { // unknown sender: answer on the raw connection with a REJECT
+ dout(0) << " ignoring sessionless msg " << *m << dendl;
+ auto reply = MClientSession::create(CEPH_SESSION_REJECT);
+ reply->metadata["error_string"] = "sessionless";
+ mds->send_message(reply, m->get_connection());
+ return;
+ }
+
+ if (!session->is_open()) { // session exists but is not open: tell the client to close
+ dout(0) << " ignoring msg from not-open session" << *m << dendl;
+ auto reply = MClientSession::create(CEPH_SESSION_CLOSE);
+ mds->send_message(reply, m->get_connection());
+ return;
+ }
+
+ if (!mds->is_reconnect() && mds->get_want_state() == CEPH_MDS_STATE_RECONNECT) {
+ dout(10) << " we're almost in reconnect state (mdsmap delivery race?); waiting" << dendl;
+ mds->wait_for_reconnect(new C_MDS_RetryMessage(mds, m)); // re-deliver m once we are in reconnect
+ return;
+ }
+
+ auto delay = std::chrono::duration<double>(clock::now() - reconnect_start).count(); // seconds since reconnect_start
+ dout(10) << " reconnect_start " << reconnect_start << " delay " << delay << dendl;
+
+ bool deny = false; // set when the reconnect attempt must be refused
+ if (!mds->is_reconnect() || mds->get_want_state() != CEPH_MDS_STATE_RECONNECT || reconnect_evicting) {
+ // XXX maybe in the future we can do better than this?
+ dout(1) << " no longer in reconnect state, ignoring reconnect, sending close" << dendl;
+ mds->clog->info() << "denied reconnect attempt (mds is "
+ << ceph_mds_state_name(mds->get_state())
+ << ") from " << m->get_source_inst()
+ << " after " << delay << " (allowed interval " << g_conf()->mds_reconnect_timeout << ")";
+ deny = true;
+ } else {
+ std::string error_str; // non-empty => reason for denying this client
+ if (!session->is_open()) { // NOTE(review): non-open sessions already returned earlier in this function
+ error_str = "session is closed";
+ } else if (mdcache->is_readonly()) {
+ error_str = "mds is readonly";
+ } else {
+ if (session->info.client_metadata.features.empty()) // client advertised no feature bits: guess them
+ infer_supported_features(session, session->info.client_metadata);
+
+ feature_bitset_t missing_features = required_client_features;
+ missing_features -= session->info.client_metadata.features; // what is required but not offered
+ if (!missing_features.empty()) {
+ stringstream ss;
+ ss << "missing required features '" << missing_features << "'";
+ error_str = ss.str();
+ }
+ }
+
+ if (!error_str.empty()) {
+ deny = true;
+ dout(1) << " " << error_str << ", ignoring reconnect, sending close" << dendl;
+ mds->clog->info() << "denied reconnect attempt from "
+ << m->get_source_inst() << " (" << error_str << ")";
+ }
+ }
+
+ if (deny) { // refuse: send CLOSE and tear the session down if it is still open
+ auto r = MClientSession::create(CEPH_SESSION_CLOSE);
+ mds->send_message_client(r, session);
+ if (session->is_open())
+ kill_session(session, nullptr);
+ return;
+ }
+
+ if (!m->has_more()) { // only the final message of a sequence is acknowledged
+ // notify client of success with an OPEN
+ auto reply = MClientSession::create(CEPH_SESSION_OPEN);
+ if (session->info.has_feature(CEPHFS_FEATURE_MIMIC))
+ reply->supported_features = supported_features;
+ mds->send_message_client(reply, session);
+ mds->clog->debug() << "reconnect by " << session->info.inst << " after " << delay;
+ }
+
+ session->last_cap_renew = clock::now();
+
+ // snaprealms: every realm is recorded unless its inode is being purged;
+ // the branches below differ only in what they log
+ for (const auto &r : m->realms) {
+ CInode *in = mdcache->get_inode(inodeno_t(r.realm.ino));
+ if (in && in->state_test(CInode::STATE_PURGING))
+ continue;
+ if (in) {
+ if (in->snaprealm) {
+ dout(15) << "open snaprealm (w inode) on " << *in << dendl;
+ } else {
+ // this can happen if we are non-auth or we rollback snaprealm
+ dout(15) << "open snaprealm (null snaprealm) on " << *in << dendl;
+ }
+ mdcache->add_reconnected_snaprealm(from, inodeno_t(r.realm.ino), snapid_t(r.realm.seq));
+ } else {
+ dout(15) << "open snaprealm (w/o inode) on " << inodeno_t(r.realm.ino)
+ << " seq " << r.realm.seq << dendl;
+ mdcache->add_reconnected_snaprealm(from, inodeno_t(r.realm.ino), snapid_t(r.realm.seq));
+ }
+ }
+
+ // caps: p.first is the inode number, p.second the client's reconnect record
+ for (const auto &p : m->caps) {
+ // make sure our last_cap_id is MAX over all issued caps
+ if (p.second.capinfo.cap_id > mdcache->last_cap_id)
+ mdcache->last_cap_id = p.second.capinfo.cap_id;
+
+ CInode *in = mdcache->get_inode(p.first);
+ if (in && in->state_test(CInode::STATE_PURGING))
+ continue;
+ if (in && in->is_auth()) {
+ // we recovered it, and it's ours. take note.
+ dout(15) << "open cap realm " << inodeno_t(p.second.capinfo.snaprealm)
+ << " on " << *in << dendl;
+ in->reconnect_cap(from, p.second, session);
+ mdcache->add_reconnected_cap(from, p.first, p.second);
+ recover_filelocks(in, p.second.flockbl, m->get_orig_source().num()); // re-install the file locks the client reported
+ continue;
+ }
+
+ if (in && !in->is_auth()) {
+ // not mine.
+ dout(10) << "non-auth " << *in << ", will pass off to authority" << dendl;
+ // add to cap export list.
+ mdcache->rejoin_export_caps(p.first, from, p.second,
+ in->authority().first, true);
+ } else {
+ // don't know if the inode is mine
+ dout(10) << "missing ino " << p.first << ", will load later" << dendl;
+ mdcache->rejoin_recovered_caps(p.first, from, p.second, MDS_RANK_NONE);
+ }
+ }
+
+ reconnect_last_seen = clock::now(); // read by reconnect_tick() to extend the reconnect window
+
+ if (!m->has_more()) { // final message: this client is done reconnecting
+ mdcache->rejoin_recovered_client(session->get_client(), session->info.inst);
+
+ // remove from gather set
+ client_reconnect_gather.erase(from);
+ session->set_reconnecting(false);
+ if (client_reconnect_gather.empty())
+ reconnect_gather_finish();
+ }
+}
+
+/**
+ * Guess the cephfs feature set of a client from its metadata and from the
+ * feature bits of its connection.
+ *
+ * The "ceph_version" entry (user space client) is consulted first, then
+ * "kernel_version" (kernel client); as a last resort a connection feature
+ * bit alone may identify a jewel client. When a feature level can be
+ * inferred, client_metadata.features is set to a mask with bits 0..level
+ * set; otherwise client_metadata is left unchanged.
+ */
+void Server::infer_supported_features(Session *session, client_metadata_t& client_metadata)
+{
+  int level = -1;
+
+  if (auto ver = client_metadata.find("ceph_version"); ver != client_metadata.end()) {
+    // user space client
+    const bool is_v12 = ver->second.compare(0, 16, "ceph version 12.") == 0;
+    if (is_v12) {
+      level = CEPHFS_FEATURE_LUMINOUS;
+    } else if (session->get_connection()->has_feature(CEPH_FEATURE_FS_CHANGE_ATTR)) {
+      level = CEPHFS_FEATURE_KRAKEN;
+    }
+  } else if (auto kver = client_metadata.find("kernel_version"); kver != client_metadata.end()) {
+    // kernel client
+    if (session->get_connection()->has_feature(CEPH_FEATURE_NEW_OSDOP_ENCODING)) {
+      level = CEPHFS_FEATURE_LUMINOUS;
+    }
+  }
+
+  // nothing matched so far: fall back to the connection feature bit alone
+  if (level == -1 &&
+      session->get_connection()->has_feature(CEPH_FEATURE_FS_FILE_LAYOUT_V2)) {
+    level = CEPHFS_FEATURE_JEWEL;
+  }
+
+  if (level < 0)
+    return;
+
+  // all feature bits up to and including `level`
+  const unsigned long mask = (1UL << (level + 1)) - 1;
+  client_metadata.features = feature_bitset_t(mask);
+  dout(10) << __func__ << " got '" << client_metadata.features << "'" << dendl;
+}
+
+/**
+ * Recompute required_client_features from the mdsmap's min_compat_client
+ * and, once the MDS state is at or beyond reconnect, evict every client
+ * session that lacks one of the required features (sessions whose address
+ * is already blacklisted in the osdmap are skipped).
+ */
+void Server::update_required_client_features()
+{
+  vector<size_t> feature_bits = CEPHFS_FEATURES_MDS_REQUIRED;
+
+  // add the feature bit matching the newest release that is required
+  const int min_compat = mds->mdsmap->get_min_compat_client();
+  if (min_compat >= CEPH_RELEASE_NAUTILUS) {
+    feature_bits.push_back(CEPHFS_FEATURE_NAUTILUS);
+  } else if (min_compat >= CEPH_RELEASE_MIMIC) {
+    feature_bits.push_back(CEPHFS_FEATURE_MIMIC);
+  } else if (min_compat >= CEPH_RELEASE_LUMINOUS) {
+    feature_bits.push_back(CEPHFS_FEATURE_LUMINOUS);
+  } else if (min_compat >= CEPH_RELEASE_KRAKEN) {
+    feature_bits.push_back(CEPHFS_FEATURE_KRAKEN);
+  } else if (min_compat >= CEPH_RELEASE_JEWEL) {
+    feature_bits.push_back(CEPHFS_FEATURE_JEWEL);
+  }
+
+  std::sort(feature_bits.begin(), feature_bits.end());
+  required_client_features = feature_bitset_t(feature_bits);
+  dout(7) << "required_client_features: " << required_client_features << dendl;
+
+  if (mds->get_state() < MDSMap::STATE_RECONNECT)
+    return;
+
+  set<Session*> client_sessions;
+  mds->sessionmap.get_client_session_set(client_sessions);
+  for (Session *s : client_sessions) {
+    feature_bitset_t lacking = required_client_features;
+    lacking -= s->info.client_metadata.features;
+    if (lacking.empty())
+      continue;
+
+    const bool already_blacklisted = mds->objecter->with_osdmap(
+      [s](const OSDMap &osd_map) -> bool {
+        return osd_map.is_blacklisted(s->info.inst.addr);
+      });
+    if (already_blacklisted)
+      continue;
+
+    mds->clog->warn() << "evicting session " << *s << ", missing required features '"
+                      << lacking << "'";
+    std::stringstream evict_out;
+    mds->evict_client(s->get_client().v, false,
+                      g_conf()->mds_session_blacklist_on_evict, evict_out);
+  }
+}
+
+/**
+ * Finish the reconnect gather: hand reconnect_done either to the snap
+ * client (to be run once the snaptable cache is synced) or complete it
+ * right away, then forget it.
+ */
+void Server::reconnect_gather_finish()
+{
+  dout(7) << "reconnect_gather_finish. failed on " << failed_reconnects << " clients" << dendl;
+  ceph_assert(reconnect_done);
+
+  if (mds->snapclient->is_synced()) {
+    reconnect_done->complete(0);
+  } else {
+    // make sure snaptable cache is populated. snaprealms will be
+    // extensively used in rejoin stage.
+    dout(7) << " snaptable cache isn't synced, delaying state transition" << dendl;
+    mds->snapclient->wait_for_sync(reconnect_done);
+  }
+
+  // either way the context has been passed on and is no longer ours
+  reconnect_done = nullptr;
+}
+/**
+ * Timeout check for the reconnect gather.
+ *
+ * Does nothing while evictions are in flight, while no client is left in
+ * client_reconnect_gather, or before mds_reconnect_timeout seconds have
+ * passed since reconnect_start. The wait is also extended as long as some
+ * remaining client was heard from within the last half timeout. After
+ * that every remaining session is given up on: sessions carrying a
+ * "timeout" metadata entry are moved to client_reclaim_gather, the others
+ * are evicted (blacklisting) or killed. The gather then finishes, either
+ * immediately or once all evictions have completed.
+ */
+void Server::reconnect_tick()
+{
+ if (reconnect_evicting) {
+ dout(7) << "reconnect_tick: waiting for evictions" << dendl;
+ return;
+ }
+
+ if (client_reconnect_gather.empty())
+ return;
+
+ auto now = clock::now();
+ auto elapse1 = std::chrono::duration<double>(now - reconnect_start).count(); // seconds since reconnect began
+ if (elapse1 < g_conf()->mds_reconnect_timeout)
+ return;
+
+ vector<Session*> remaining_sessions; // sessions that have not finished reconnecting
+ remaining_sessions.reserve(client_reconnect_gather.size());
+ for (auto c : client_reconnect_gather) {
+ Session *session = mds->sessionmap.get_session(entity_name_t::CLIENT(c.v));
+ ceph_assert(session);
+ remaining_sessions.push_back(session);
+ // client re-sends cap flush messages before the reconnect message
+ if (session->last_seen > reconnect_last_seen)
+ reconnect_last_seen = session->last_seen;
+ }
+
+ auto elapse2 = std::chrono::duration<double>(now - reconnect_last_seen).count(); // seconds since any remaining client was heard from
+ if (elapse2 < g_conf()->mds_reconnect_timeout / 2) {
+ dout(7) << "reconnect_tick: last seen " << elapse2
+ << " seconds ago, extending reconnect interval" << dendl;
+ return;
+ }
+
+ dout(7) << "reconnect timed out, " << remaining_sessions.size()
+ << " clients have not reconnected in time" << dendl;
+
+ // If we're doing blacklist evictions, use this to wait for them before
+ // proceeding to reconnect_gather_finish
+ MDSGatherBuilder gather(g_ceph_context);
+
+ for (auto session : remaining_sessions) {
+ // Keep sessions that have specified timeout. These sessions will prevent
+ // mds from going to active. MDS goes to active after they all have been
+ // killed or reclaimed.
+ if (session->info.client_metadata.find("timeout") !=
+ session->info.client_metadata.end()) {
+ dout(1) << "reconnect keeps " << session->info.inst
+ << ", need to be reclaimed" << dendl;
+ client_reclaim_gather.insert(session->get_client());
+ continue;
+ }
+
+ dout(1) << "reconnect gives up on " << session->info.inst << dendl;
+
+ mds->clog->warn() << "evicting unresponsive client " << *session
+ << ", after waiting " << elapse1
+ << " seconds during MDS startup";
+
+ if (g_conf()->mds_session_blacklist_on_timeout) {
+ std::stringstream ss;
+ mds->evict_client(session->get_client().v, false, true, ss,
+ gather.new_sub()); // eviction completion feeds the gather
+ } else {
+ kill_session(session, NULL);
+ }
+
+ failed_reconnects++;
+ }
+ client_reconnect_gather.clear(); // every remaining client has been dealt with above
+
+ if (gather.has_subs()) {
+ dout(1) << "reconnect will complete once clients are evicted" << dendl;
+ gather.set_finisher(new MDSInternalContextWrapper(mds, new FunctionContext(
+ [this](int r){reconnect_gather_finish();})));
+ gather.activate();
+ reconnect_evicting = true; // makes later ticks (and reconnect attempts) back off
+ } else {
+ reconnect_gather_finish();
+ }
+}
+
+/**
+ * Re-install file locks reported by a reconnecting client on an inode.
+ *
+ * @param in     inode the locks belong to
+ * @param locks  encoded as two consecutive groups, each a count followed by
+ *               that many ceph_filelock records: first the fcntl locks,
+ *               then the flock locks. An empty buffer is a no-op.
+ * @param client owner recorded in every restored lock
+ */
+void Server::recover_filelocks(CInode *in, bufferlist locks, int64_t client)
+{
+  if (locks.length() == 0)
+    return;
+
+  auto cursor = locks.cbegin();
+  ceph_filelock entry;
+
+  // Decode one "<count> <lock>..." group. The lock state is fetched per
+  // lock, so nothing is fetched at all for an empty group.
+  auto restore_group = [&](auto fetch_state) {
+    int count;
+    decode(count, cursor);
+    for (int idx = 0; idx < count; ++idx) {
+      decode(entry, cursor);
+      entry.client = client;
+      fetch_state()->held_locks.insert(pair<uint64_t, ceph_filelock>(entry.start, entry));
+      ++fetch_state()->client_held_lock_counts[client];
+    }
+  };
+
+  restore_group([in] { return in->get_fcntl_lock_state(); });
+  restore_group([in] { return in->get_flock_lock_state(); });
+}
+
+/**
+ * Call this when the MDCache is oversized, to send requests to the clients
+ * to trim some caps, and consequently unpin some inodes in the MDCache so
+ * that it can trim too.
+ *
+ * Sessions are selected according to @p flags and visited from the one
+ * holding the most caps to the one holding the fewest. Each selected
+ * session is asked (CEPH_SESSION_RECALL_STATE) to shrink to a new cap
+ * limit, subject to the per-session and global recall throttles.
+ *
+ * @param gather optional; when non-null every session that is sent a recall
+ *               is also flushed into this gather via flush_session().
+ * @param flags  RecallFlags bitmask:
+ *               - TRIM: consider every session;
+ *               - ENFORCE_MAX: consider sessions above mds_max_caps_per_client;
+ *               - ENFORCE_LIVENESS: consider sessions whose cache liveness is
+ *                 below num_caps >> mds_session_cache_liveness_magnitude;
+ *               - STEADY: additionally skip sessions that look unlikely to
+ *                 release more caps right now.
+ * @return {throttled, caps_recalled}: whether any recall was skipped because
+ *         of a throttle, and the sum of the values returned by
+ *         Session::notify_recall_sent() for the recalls that were sent.
+ */
+std::pair<bool, uint64_t> Server::recall_client_state(MDSGatherBuilder* gather, RecallFlags flags)
+{
+  const auto now = clock::now();
+  const bool steady = !!(flags&RecallFlags::STEADY);
+  const bool enforce_max = !!(flags&RecallFlags::ENFORCE_MAX);
+  const bool enforce_liveness = !!(flags&RecallFlags::ENFORCE_LIVENESS);
+  const bool trim = !!(flags&RecallFlags::TRIM);
+
+  const auto max_caps_per_client = g_conf().get_val<uint64_t>("mds_max_caps_per_client");
+  const auto min_caps_per_client = g_conf().get_val<uint64_t>("mds_min_caps_per_client");
+  const auto recall_global_max_decay_threshold = g_conf().get_val<Option::size_t>("mds_recall_global_max_decay_threshold");
+  const auto recall_max_caps = g_conf().get_val<Option::size_t>("mds_recall_max_caps");
+  const auto recall_max_decay_threshold = g_conf().get_val<Option::size_t>("mds_recall_max_decay_threshold");
+  const auto cache_liveness_magnitude = g_conf().get_val<Option::size_t>("mds_session_cache_liveness_magnitude");
+
+  dout(7) << __func__ << ":"
+          << " min=" << min_caps_per_client
+          << " max=" << max_caps_per_client
+          << " total=" << Capability::count()
+          << " flags=" << flags
+          << dendl;
+
+  /* trim caps of sessions with the most caps first */
+  std::multimap<uint64_t, Session*> caps_session;
+  auto f = [&caps_session, enforce_max, enforce_liveness, trim, max_caps_per_client, cache_liveness_magnitude](auto& s) {
+    auto num_caps = s->caps.size();
+    auto cache_liveness = s->get_session_cache_liveness();
+    if (trim || (enforce_max && num_caps > max_caps_per_client) || (enforce_liveness && cache_liveness < (num_caps>>cache_liveness_magnitude))) {
+      caps_session.emplace(std::piecewise_construct, std::forward_as_tuple(num_caps), std::forward_as_tuple(s));
+    }
+  };
+  mds->sessionmap.get_client_sessions(std::move(f));
+
+  std::pair<bool, uint64_t> result = {false, 0};
+  auto& [throttled, caps_recalled] = result;
+  last_recall_state = now;
+  // the multimap is keyed by cap count, so reverse order means most caps first
+  for (const auto& [num_caps, session] : boost::adaptors::reverse(caps_session)) {
+    if (!session->is_open() ||
+        !session->get_connection() ||
+        !session->info.inst.name.is_client())
+      continue;
+
+    dout(10) << __func__ << ":"
+             << " session " << session->info.inst
+             << " caps " << num_caps
+             << ", leases " << session->leases.size()
+             << dendl;
+
+    // target cap count: recall at most recall_max_caps, never below the
+    // per-client minimum
+    uint64_t newlim;
+    if (num_caps < recall_max_caps || (num_caps-recall_max_caps) < min_caps_per_client) {
+      newlim = min_caps_per_client;
+    } else {
+      newlim = num_caps-recall_max_caps;
+    }
+    if (num_caps > newlim) {
+      /* now limit the number of caps we recall at a time to prevent overloading ourselves */
+      uint64_t recall = std::min<uint64_t>(recall_max_caps, num_caps-newlim);
+      newlim = num_caps-recall;
+      const uint64_t session_recall_throttle = session->get_recall_caps_throttle();
+      const uint64_t session_recall_throttle2o = session->get_recall_caps_throttle2o();
+      const uint64_t global_recall_throttle = recall_throttle.get();
+      if (session_recall_throttle+recall > recall_max_decay_threshold) {
+        dout(15) << " session recall threshold (" << recall_max_decay_threshold << ") hit at " << session_recall_throttle << "; skipping!" << dendl;
+        throttled = true;
+        continue;
+      } else if (session_recall_throttle2o+recall > recall_max_caps*2) {
+        dout(15) << " session recall 2nd-order threshold (" << 2*recall_max_caps << ") hit at " << session_recall_throttle2o << "; skipping!" << dendl;
+        throttled = true;
+        continue;
+      } else if (global_recall_throttle+recall > recall_global_max_decay_threshold) {
+        // the global throttle affects every remaining session, so stop here
+        dout(15) << " global recall threshold (" << recall_global_max_decay_threshold << ") hit at " << global_recall_throttle << "; skipping!" << dendl;
+        throttled = true;
+        break;
+      }
+
+      // now check if we've recalled caps recently and the client is unlikely to satisfy a new recall
+      if (steady) {
+        const auto session_recall = session->get_recall_caps();
+        const auto session_release = session->get_release_caps();
+        if (2*session_release < session_recall && 2*session_recall > recall_max_decay_threshold) {
+          /* The session has been unable to keep up with the number of caps
+           * recalled (by half); additionally, to prevent marking sessions
+           * we've just begun to recall from, the session_recall counter
+           * (decayed count of caps recently recalled) is **greater** than the
+           * session threshold for the session's cap recall throttle.
+           */
+          // The message states the same comparison the condition tests ('>').
+          dout(15) << " 2*session_release < session_recall"
+                      " (2*" << session_release << " < " << session_recall << ") &&"
+                      " 2*session_recall > recall_max_decay_threshold"
+                      " (2*" << session_recall << " > " << recall_max_decay_threshold << ")"
+                      " Skipping because we are unlikely to get more released." << dendl;
+          continue;
+        } else if (recall < recall_max_caps && 2*recall < session_recall) {
+          /* The number of caps recalled is less than the number we *could*
+           * recall (so there isn't much left to recall?) and the number of
+           * caps is less than the current recall_caps counter (decayed count
+           * of caps recently recalled).
+           */
+          dout(15) << " 2*recall < session_recall "
+                      " (2*" << recall << " < " << session_recall << ") &&"
+                      " recall < recall_max_caps (" << recall << " < " << recall_max_caps << ");"
+                      " Skipping because we are unlikely to get more released." << dendl;
+          continue;
+        }
+      }
+
+      dout(7) << " recalling " << recall << " caps; session_recall_throttle = " << session_recall_throttle << "; global_recall_throttle = " << global_recall_throttle << dendl;
+
+      auto m = MClientSession::create(CEPH_SESSION_RECALL_STATE);
+      m->head.max_caps = newlim;
+      mds->send_message_client(m, session);
+      if (gather) {
+        flush_session(session, *gather);
+      }
+      caps_recalled += session->notify_recall_sent(newlim);
+      recall_throttle.hit(recall);
+    }
+  }
+
+  dout(7) << "recalled" << (throttled ? " (throttled)" : "") << " " << caps_recalled << " client caps." << dendl;
+
+  return result;
+}
+
+/**
+ * Send CEPH_SESSION_FORCE_RO to every client session that is currently
+ * open or stale.
+ */
+void Server::force_clients_readonly()
+{
+  dout(10) << "force_clients_readonly" << dendl;
+
+  set<Session*> client_sessions;
+  mds->sessionmap.get_client_session_set(client_sessions);
+  for (Session *s : client_sessions) {
+    if (!s->info.inst.name.is_client())
+      continue;
+    if (!s->is_open() && !s->is_stale())
+      continue;
+    mds->send_message_client(MClientSession::create(CEPH_SESSION_FORCE_RO), s);
+  }
+}
+
+/*******
+ * some generic stuff for finishing off requests
+ */
+
+/**
+ * Record the trace inode/dentry on the request, attempt an early reply and
+ * submit the journal entry.
+ *
+ * Afterwards: for a request queued for replay the next replay op is queued;
+ * otherwise rdlocks are dropped if an early reply went out, or the log is
+ * flushed if it did not.
+ *
+ * @param in  trace inode for the eventual reply (may be null)
+ * @param dn  trace dentry for the eventual reply (may be null)
+ * @param le  log event to journal
+ * @param fin context passed along with the log event
+ */
+void Server::journal_and_reply(MDRequestRef& mdr, CInode *in, CDentry *dn, LogEvent *le, MDSLogContextBase *fin)
+{
+  dout(10) << "journal_and_reply tracei " << in << " tracedn " << dn << dendl;
+  ceph_assert(!mdr->has_completed);
+
+  // note trace items for eventual reply.
+  mdr->tracei = in;
+  if (in) {
+    mdr->pin(in);
+  }
+  mdr->tracedn = dn;
+  if (dn) {
+    mdr->pin(dn);
+  }
+
+  early_reply(mdr, in, dn);
+
+  mdr->committing = true;
+  submit_mdlog_entry(le, fin, mdr, __func__);
+
+  const bool replaying =
+    mdr->client_request && mdr->client_request->is_queued_for_replay();
+  if (!replaying) {
+    if (mdr->did_early_reply) {
+      mds->locker->drop_rdlocks_for_early_reply(mdr.get());
+    } else {
+      mdlog->flush();
+    }
+    return;
+  }
+
+  const bool queued_more = mds->queue_one_replay();
+  if (queued_more) {
+    dout(10) << " queued next replay op" << dendl;
+  } else {
+    dout(10) << " journaled last replay op" << dendl;
+  }
+}
+
+/**
+ * Submit a log event to the MDLog, first marking "submit entry: <event>"
+ * on the request when one is given.
+ */
+void Server::submit_mdlog_entry(LogEvent *le, MDSLogContextBase *fin, MDRequestRef& mdr,
+                                std::string_view event)
+{
+  if (mdr) {
+    string label = "submit entry: ";
+    label.append(event.data(), event.size());
+    mdr->mark_event(label);
+  }
+
+  mdlog->submit_entry(le, fin);
+}
+
+/*
+ * Finish a request with result code r.
+ *
+ * A client request is answered with a reply built from the request and r
+ * (reply_client_request() also cleans up the mdr). An internal op has its
+ * finisher completed with r and is then finished in the cache; an internal
+ * op without a finisher is a fatal error. Anything else is left untouched.
+ */
+void Server::respond_to_request(MDRequestRef& mdr, int r)
+{
+  if (mdr->client_request) {
+    reply_client_request(mdr, MClientReply::create(*mdr->client_request, r));
+    return;
+  }
+
+  if (mdr->internal_op < 0)
+    return;
+
+  dout(10) << "respond_to_request on internal request " << mdr << dendl;
+  if (!mdr->internal_op_finish) {
+    ceph_abort_msg("trying to respond to internal op without finisher");
+  }
+  mdr->internal_op_finish->complete(r);
+  mdcache->request_finish(mdr);
+}
+
+/**
+ * Account the latency of a client request in the per-operation latency
+ * counter matching the request's op.
+ *
+ * An op that has no dedicated counter is logged and skipped. This runs on
+ * the reply path for whatever op the request carries, so an op missing
+ * from the table below must not abort the daemon.
+ *
+ * @param req the client request that was handled
+ * @param lat time taken to handle it
+ */
+void Server::perf_gather_op_latency(const MClientRequest::const_ref &req, utime_t lat)
+{
+  int code = l_mdss_first;
+  switch(req->get_op()) {
+  case CEPH_MDS_OP_LOOKUPHASH:
+    code = l_mdss_req_lookuphash_latency;
+    break;
+  case CEPH_MDS_OP_LOOKUPINO:
+    code = l_mdss_req_lookupino_latency;
+    break;
+  case CEPH_MDS_OP_LOOKUPPARENT:
+    code = l_mdss_req_lookupparent_latency;
+    break;
+  case CEPH_MDS_OP_LOOKUPNAME:
+    code = l_mdss_req_lookupname_latency;
+    break;
+  case CEPH_MDS_OP_LOOKUP:
+    code = l_mdss_req_lookup_latency;
+    break;
+  case CEPH_MDS_OP_LOOKUPSNAP:
+    code = l_mdss_req_lookupsnap_latency;
+    break;
+  case CEPH_MDS_OP_GETATTR:
+    code = l_mdss_req_getattr_latency;
+    break;
+  case CEPH_MDS_OP_SETATTR:
+    code = l_mdss_req_setattr_latency;
+    break;
+  case CEPH_MDS_OP_SETLAYOUT:
+    code = l_mdss_req_setlayout_latency;
+    break;
+  case CEPH_MDS_OP_SETDIRLAYOUT:
+    code = l_mdss_req_setdirlayout_latency;
+    break;
+  case CEPH_MDS_OP_SETXATTR:
+    code = l_mdss_req_setxattr_latency;
+    break;
+  case CEPH_MDS_OP_RMXATTR:
+    code = l_mdss_req_rmxattr_latency;
+    break;
+  case CEPH_MDS_OP_READDIR:
+    code = l_mdss_req_readdir_latency;
+    break;
+  case CEPH_MDS_OP_SETFILELOCK:
+    code = l_mdss_req_setfilelock_latency;
+    break;
+  case CEPH_MDS_OP_GETFILELOCK:
+    code = l_mdss_req_getfilelock_latency;
+    break;
+  case CEPH_MDS_OP_CREATE:
+    code = l_mdss_req_create_latency;
+    break;
+  case CEPH_MDS_OP_OPEN:
+    code = l_mdss_req_open_latency;
+    break;
+  case CEPH_MDS_OP_MKNOD:
+    code = l_mdss_req_mknod_latency;
+    break;
+  case CEPH_MDS_OP_LINK:
+    code = l_mdss_req_link_latency;
+    break;
+  case CEPH_MDS_OP_UNLINK:
+    code = l_mdss_req_unlink_latency;
+    break;
+  case CEPH_MDS_OP_RMDIR:
+    code = l_mdss_req_rmdir_latency;
+    break;
+  case CEPH_MDS_OP_RENAME:
+    code = l_mdss_req_rename_latency;
+    break;
+  case CEPH_MDS_OP_MKDIR:
+    code = l_mdss_req_mkdir_latency;
+    break;
+  case CEPH_MDS_OP_SYMLINK:
+    code = l_mdss_req_symlink_latency;
+    break;
+  case CEPH_MDS_OP_LSSNAP:
+    code = l_mdss_req_lssnap_latency;
+    break;
+  case CEPH_MDS_OP_MKSNAP:
+    code = l_mdss_req_mksnap_latency;
+    break;
+  case CEPH_MDS_OP_RMSNAP:
+    code = l_mdss_req_rmsnap_latency;
+    break;
+  case CEPH_MDS_OP_RENAMESNAP:
+    code = l_mdss_req_renamesnap_latency;
+    break;
+  default:
+    // No latency counter exists for this op: note it and skip the
+    // statistic instead of aborting the whole daemon.
+    dout(1) << __func__ << ": unknown client op " << req->get_op() << dendl;
+    return;
+  }
+  logger->tinc(code, lat);
+}
+/**
+ * Send an "unsafe" reply to the client for a request, if allowed.
+ *
+ * No early reply is sent when mds_early_reply is off, when the request is
+ * flagged no_early_reply, has journaled slaves, allocated an ino, comes
+ * from an MDS, or is a replayed op. Otherwise xlocks are marked done, the
+ * trace (if any) is attached, the reply is sent, did_early_reply is set on
+ * the request and the reply/latency counters are updated.
+ *
+ * @param tracei  inode to include in the reply trace (may be null)
+ * @param tracedn dentry to include in the reply trace (may be null)
+ */
+void Server::early_reply(MDRequestRef& mdr, CInode *tracei, CDentry *tracedn)
+{
+ if (!g_conf()->mds_early_reply)
+ return;
+
+ if (mdr->no_early_reply) {
+ dout(10) << "early_reply - flag no_early_reply is set, not allowed." << dendl;
+ return;
+ }
+
+ if (mdr->has_more() && mdr->more()->has_journaled_slaves) {
+ dout(10) << "early_reply - there are journaled slaves, not allowed." << dendl;
+ return;
+ }
+
+ if (mdr->alloc_ino) {
+ dout(10) << "early_reply - allocated ino, not allowed" << dendl;
+ return;
+ }
+
+ const MClientRequest::const_ref &req = mdr->client_request; // NOTE(review): dereferenced below without a null check
+ entity_inst_t client_inst = req->get_source_inst();
+ if (client_inst.name.is_mds())
+ return;
+
+ if (req->is_replay()) {
+ dout(10) << " no early reply on replay op" << dendl;
+ return;
+ }
+
+
+ auto reply = MClientReply::create(*req, 0); // result code 0
+ reply->set_unsafe();
+
+ // mark xlocks "done", indicating that we are exposing uncommitted changes.
+ //
+ //_rename_finish() does not send dentry link/unlink message to replicas.
+ // so do not set xlocks on dentries "done", the xlocks prevent dentries
+ // that have projected linkages from getting new replica.
+ mds->locker->set_xlocks_done(mdr.get(), req->get_op() == CEPH_MDS_OP_RENAME);
+
+ dout(10) << "early_reply " << reply->get_result()
+ << " (" << cpp_strerror(reply->get_result())
+ << ") " << *req << dendl;
+
+ if (tracei || tracedn) {
+ if (tracei)
+ mdr->cap_releases.erase(tracei->vino()); // drop the pending cap release recorded for the trace inode
+ if (tracedn)
+ mdr->cap_releases.erase(tracedn->get_dir()->get_inode()->vino()); // ... and for the dentry's directory inode
+
+ set_trace_dist(mdr->session, reply, tracei, tracedn, mdr->snapid,
+ req->get_dentry_wanted(), mdr);
+ }
+
+ reply->set_extra_bl(mdr->reply_extra_bl);
+ mds->send_message_client(reply, mdr->session);
+
+ mdr->did_early_reply = true; // reply_client_request() checks this to skip the trace and stats
+
+ mds->logger->inc(l_mds_reply);
+ utime_t lat = ceph_clock_now() - req->get_recv_stamp(); // time since the request was received
+ mds->logger->tinc(l_mds_reply_latency, lat);
+ if (client_inst.name.is_client()) {
+ mds->sessionmap.hit_session(mdr->session);
+ }
+ perf_gather_op_latency(req, lat);
+ dout(20) << "lat " << lat << dendl;
+
+ mdr->mark_event("early_replied");
+}
+
+/*
+ * send given reply
+ * include a trace to tracei
+ * Clean up mdr
+ *
+ * In order, this:
+ *  - records a successful write request (other than setfilelock) as
+ *    completed in the session;
+ *  - applies allocated inos to the session;
+ *  - updates reply/latency counters, unless an early reply already did so
+ *    or the request is a replay;
+ *  - drops non-rdlocks, attaches the trace where applicable and sends the
+ *    reply (only when there is a session and the sender is not an MDS);
+ *  - queues the next replay op for a queued-for-replay request that had
+ *    already completed or that failed;
+ *  - finishes the request in the cache.
+ */
+void Server::reply_client_request(MDRequestRef& mdr, const MClientReply::ref &reply)
+{
+ ceph_assert(mdr.get());
+ const MClientRequest::const_ref &req = mdr->client_request;
+
+ dout(7) << "reply_client_request " << reply->get_result()
+ << " (" << cpp_strerror(reply->get_result())
+ << ") " << *req << dendl;
+
+ mdr->mark_event("replying");
+
+ Session *session = mdr->session; // may be null; checked before each use below
+
+ // note successful request in session map?
+ //
+ // setfilelock requests are special, they only modify states in MDS memory.
+ // The states get lost when MDS fails. If Client re-send a completed
+ // setfilelock request, it means that client did not receive corresponding
+ // setfilelock reply. So MDS should re-execute the setfilelock request.
+ if (req->may_write() && req->get_op() != CEPH_MDS_OP_SETFILELOCK &&
+ reply->get_result() == 0 && session) {
+ inodeno_t created = mdr->alloc_ino ? mdr->alloc_ino : mdr->used_prealloc_ino; // ino this request created, if any
+ session->add_completed_request(mdr->reqid.tid, created);
+ if (mdr->ls) {
+ mdr->ls->touched_sessions.insert(session->info.inst.name);
+ }
+ }
+
+ // give any preallocated inos to the session
+ apply_allocated_inos(mdr, session);
+
+ // get tracei/tracedn from mdr?
+ snapid_t snapid = mdr->snapid;
+ CInode *tracei = mdr->tracei;
+ CDentry *tracedn = mdr->tracedn;
+
+ bool is_replay = mdr->client_request->is_replay();
+ bool did_early_reply = mdr->did_early_reply;
+ entity_inst_t client_inst = req->get_source_inst();
+ int dentry_wanted = req->get_dentry_wanted();
+
+ if (!did_early_reply && !is_replay) { // stats were already taken by early_reply() otherwise
+
+ mds->logger->inc(l_mds_reply);
+ utime_t lat = ceph_clock_now() - mdr->client_request->get_recv_stamp();
+ mds->logger->tinc(l_mds_reply_latency, lat);
+ if (session && client_inst.name.is_client()) {
+ mds->sessionmap.hit_session(session);
+ }
+ perf_gather_op_latency(req, lat);
+ dout(20) << "lat " << lat << dendl;
+
+ if (tracei)
+ mdr->cap_releases.erase(tracei->vino());
+ if (tracedn)
+ mdr->cap_releases.erase(tracedn->get_dir()->get_inode()->vino());
+ }
+
+ // drop non-rdlocks before replying, so that we can issue leases
+ mdcache->request_drop_non_rdlocks(mdr);
+
+ // reply at all?
+ if (session && !client_inst.name.is_mds()) {
+ // send reply.
+ if (!did_early_reply && // don't issue leases if we sent an earlier reply already
+ (tracei || tracedn)) {
+ if (is_replay) {
+ if (tracei)
+ mdcache->try_reconnect_cap(tracei, session);
+ } else {
+ // include metadata in reply
+ set_trace_dist(session, reply, tracei, tracedn,
+ snapid, dentry_wanted,
+ mdr);
+ }
+ }
+
+ // We can set the extra bl unconditionally: if it's already been sent in the
+ // early_reply, set_extra_bl will have claimed it and reply_extra_bl is empty
+ reply->set_extra_bl(mdr->reply_extra_bl);
+
+ reply->set_mdsmap_epoch(mds->mdsmap->get_epoch());
+ mds->send_message_client(reply, session);
+ }
+
+ if (req->is_queued_for_replay() &&
+ (mdr->has_completed || reply->get_result() < 0)) {
+ if (reply->get_result() < 0) { // a failed replay is reported, then replay moves on
+ int r = reply->get_result();
+ derr << "reply_client_request: failed to replay " << *req
+ << " error " << r << " (" << cpp_strerror(r) << ")" << dendl;
+ mds->clog->warn() << "failed to replay " << req->get_reqid() << " error " << r;
+ }
+ mds->queue_one_replay();
+ }
+
+ // clean up request
+ mdcache->request_finish(mdr);
+
+ // take a closer look at tracei, if it happens to be a remote link
+ if (tracei &&
+ tracedn &&
+ tracedn->get_projected_linkage()->is_remote()) {
+ mdcache->eval_remote(tracedn);
+ }
+}
+
+/*
+ * pass inode OR dentry (not both, or we may get confused)
+ *
+ * trace is in reverse order (i.e. root inode comes last)
+ *
+ * Builds the trace of a reply and stores it with reply->set_trace():
+ *  - for a non-snapshot reply (snapid == CEPH_NOSNAP) the snap trace of the
+ *    realm of `in` (or, when `in` is null, of dn's directory inode) is put
+ *    in reply->snapbl;
+ *  - when `dn` is given: the directory inode stat, the dirstat, the dentry
+ *    name and a lease (a null lease for snapshot replies);
+ *  - when `in` is given: the inode stat.
+ * reply->head.is_dentry / is_target are set accordingly.
+ *
+ * NOTE(review): at least one of in/dn must be non-null when snapid is
+ * CEPH_NOSNAP, since dn is dereferenced when in is null.
+ * NOTE(review): dentry_wanted is not read in this function.
+ */
+void Server::set_trace_dist(Session *session, const MClientReply::ref &reply,
+ CInode *in, CDentry *dn,
+ snapid_t snapid,
+ int dentry_wanted,
+ MDRequestRef& mdr)
+{
+ // skip doing this for debugging purposes?
+ if (g_conf()->mds_inject_traceless_reply_probability &&
+ mdr->ls && !mdr->o_trunc &&
+ (rand() % 10000 < g_conf()->mds_inject_traceless_reply_probability * 10000.0)) {
+ dout(5) << "deliberately skipping trace for " << *reply << dendl;
+ return;
+ }
+
+ // inode, dentry, dir, ..., inode
+ bufferlist bl; // the encoded trace
+ mds_rank_t whoami = mds->get_nodeid();
+ client_t client = session->get_client();
+ utime_t now = ceph_clock_now();
+
+ dout(20) << "set_trace_dist snapid " << snapid << dendl;
+
+ //assert((bool)dn == (bool)dentry_wanted); // not true for snapshot lookups
+
+ // realm
+ if (snapid == CEPH_NOSNAP) {
+ SnapRealm *realm;
+ if (in)
+ realm = in->find_snaprealm();
+ else
+ realm = dn->get_dir()->get_inode()->find_snaprealm();
+ reply->snapbl = realm->get_snap_trace();
+ dout(10) << "set_trace_dist snaprealm " << *realm << " len=" << reply->snapbl.length() << dendl;
+ }
+
+ // dir + dentry?
+ if (dn) {
+ reply->head.is_dentry = 1;
+ CDir *dir = dn->get_dir();
+ CInode *diri = dir->get_inode();
+
+ diri->encode_inodestat(bl, session, NULL, snapid);
+ dout(20) << "set_trace_dist added diri " << *diri << dendl;
+
+#ifdef MDS_VERIFY_FRAGSTAT
+ if (dir->is_complete())
+ dir->verify_fragstat();
+#endif
+ DirStat ds;
+ ds.frag = dir->get_frag();
+ ds.auth = dir->get_dir_auth().first;
+ if (dir->is_auth() && !mdcache->forward_all_reqs_to_auth()) // dist is filled in only in this case
+ dir->get_dist_spec(ds.dist, whoami);
+
+ dir->encode_dirstat(bl, session->info, ds);
+ dout(20) << "set_trace_dist added dir " << *dir << dendl;
+
+ encode(dn->get_name(), bl);
+ if (snapid == CEPH_NOSNAP)
+ mds->locker->issue_client_lease(dn, client, bl, now, session);
+ else {
+ //null lease
+ LeaseStat e;
+ mds->locker->encode_lease(bl, session->info, e);
+ }
+ dout(20) << "set_trace_dist added dn " << snapid << " " << *dn << dendl;
+ } else
+ reply->head.is_dentry = 0;
+
+ // inode
+ if (in) {
+ in->encode_inodestat(bl, session, NULL, snapid, 0, mdr->getattr_caps);
+ dout(20) << "set_trace_dist added in " << *in << dendl;
+ reply->head.is_target = 1;
+ } else
+ reply->head.is_target = 0;
+
+ reply->set_trace(bl);
+}
+/**
+ * Entry point for an MClientRequest.
+ *
+ * Steps, in order:
+ *  - defer the message until the MDCache is open;
+ *  - for a client sender, look up its session and drop the request when
+ *    there is none or it is closed/closing/killing (a closed session is
+ *    tolerated only during clientreplay when
+ *    replay_unsafe_with_closed_session is set);
+ *  - for a replayed or retried request that the session already recorded
+ *    as completed, either answer immediately or, when it created an inode,
+ *    turn it into a lookup/getattr;
+ *  - trim the session's completed-request list up to the client's
+ *    oldest_client_tid, warning when the client does not advance it;
+ *  - register the request with the MDCache, process embedded cap releases
+ *    (not for replays) and dispatch it.
+ */
+void Server::handle_client_request(const MClientRequest::const_ref &req)
+{
+ dout(4) << "handle_client_request " << *req << dendl;
+
+ if (mds->logger)
+ mds->logger->inc(l_mds_request);
+ if (logger)
+ logger->inc(l_mdss_handle_client_request);
+
+ if (!mdcache->is_open()) {
+ dout(5) << "waiting for root" << dendl;
+ mdcache->wait_for_open(new C_MDS_RetryMessage(mds, req)); // re-deliver req once the cache is open
+ return;
+ }
+
+ bool sessionclosed_isok = replay_unsafe_with_closed_session;
+ // active session?
+ Session *session = 0;
+ if (req->get_source().is_client()) {
+ session = mds->get_session(req);
+ if (!session) {
+ dout(5) << "no session for " << req->get_source() << ", dropping" << dendl;
+ } else if ((session->is_closed() && (!mds->is_clientreplay() || !sessionclosed_isok)) ||
+ session->is_closing() ||
+ session->is_killing()) {
+ dout(5) << "session closed|closing|killing, dropping" << dendl;
+ session = NULL;
+ }
+ if (!session) {
+ if (req->is_queued_for_replay()) // a dropped replay op must not stall the replay queue
+ mds->queue_one_replay();
+ return;
+ }
+ }
+
+ // old mdsmap?
+ if (req->get_mdsmap_epoch() < mds->mdsmap->get_epoch()) {
+ // send it? hrm, this isn't ideal; they may get a lot of copies if
+ // they have a high request rate.
+ // (nothing is done here)
+ }
+
+ // completed request?
+ bool has_completed = false;
+ if (req->is_replay() || req->get_retry_attempt()) {
+ ceph_assert(session);
+ inodeno_t created; // filled by have_completed_request() with the ino the request created, if any
+ if (session->have_completed_request(req->get_reqid().tid, &created)) {
+ has_completed = true;
+ if (!session->is_open())
+ return;
+ // Don't send traceless reply if the completed request has created
+ // new inode. Treat the request as lookup request instead.
+ if (req->is_replay() ||
+ ((created == inodeno_t() || !mds->is_clientreplay()) &&
+ req->get_op() != CEPH_MDS_OP_OPEN &&
+ req->get_op() != CEPH_MDS_OP_CREATE)) {
+ dout(5) << "already completed " << req->get_reqid() << dendl;
+ auto reply = MClientReply::create(*req, 0);
+ if (created != inodeno_t()) { // pass the created ino back in the reply's extra payload
+ bufferlist extra;
+ encode(created, extra);
+ reply->set_extra_bl(extra);
+ }
+ mds->send_message_client(reply, session);
+
+ if (req->is_queued_for_replay())
+ mds->queue_one_replay();
+
+ return;
+ }
+ if (req->get_op() != CEPH_MDS_OP_OPEN &&
+ req->get_op() != CEPH_MDS_OP_CREATE) {
+ dout(10) << " completed request which created new inode " << created
+ << ", convert it to lookup request" << dendl;
+ req->head.op = req->get_dentry_wanted() ? CEPH_MDS_OP_LOOKUP : CEPH_MDS_OP_GETATTR; // rewrites the request's op in place
+ req->head.args.getattr.mask = CEPH_STAT_CAP_INODE_ALL;
+ }
+ }
+ }
+
+ // trim completed_request list
+ if (req->get_oldest_client_tid() > 0) {
+ dout(15) << " oldest_client_tid=" << req->get_oldest_client_tid() << dendl;
+ ceph_assert(session);
+ if (session->trim_completed_requests(req->get_oldest_client_tid())) {
+ // Sessions 'completed_requests' was dirtied, mark it to be
+ // potentially flushed at segment expiry.
+ mdlog->get_current_segment()->touched_sessions.insert(session->info.inst.name);
+
+ if (session->get_num_trim_requests_warnings() > 0 &&
+ session->get_num_completed_requests() * 2 < g_conf()->mds_max_completed_requests)
+ session->reset_num_trim_requests_warnings();
+ } else {
+ if (session->get_num_completed_requests() >=
+ (g_conf()->mds_max_completed_requests << session->get_num_trim_requests_warnings())) { // threshold doubles with every warning already issued
+ session->inc_num_trim_requests_warnings();
+ stringstream ss;
+ ss << "client." << session->get_client() << " does not advance its oldest_client_tid ("
+ << req->get_oldest_client_tid() << "), "
+ << session->get_num_completed_requests()
+ << " completed requests recorded in session\n";
+ mds->clog->warn() << ss.str();
+ dout(20) << __func__ << " " << ss.str() << dendl;
+ }
+ }
+ }
+
+ // register + dispatch
+ MDRequestRef mdr = mdcache->request_start(req);
+ if (!mdr.get())
+ return;
+
+ if (session) {
+ mdr->session = session;
+ session->requests.push_back(&mdr->item_session_request); // track the request on its session
+ }
+
+ if (has_completed)
+ mdr->has_completed = true;
+
+ // process embedded cap releases?
+ // (only if NOT replay!)
+ if (!req->releases.empty() && req->get_source().is_client() && !req->is_replay()) {
+ client_t client = req->get_source().num();
+ for (const auto &r : req->releases) {
+ mds->locker->process_request_cap_release(mdr, client, r.item, r.dname);
+ }
+ req->releases.clear(); // the embedded releases have been consumed
+ }
+
+ dispatch_client_request(mdr);
+ return;
+}
+
+/**
+ * Refresh the cached is_full flag from the current OSD map.
+ *
+ * is_full becomes true only when the metadata pool is present in the map
+ * and carries pg_pool_t::FLAG_FULL.
+ */
+void Server::handle_osd_map()
+{
+  /* The flag is read directly instead of going through osdmap_full_flag():
+   * the question here is "is the flag set", not "does the flag apply
+   * to us?" */
+  mds->objecter->with_osdmap([this](const OSDMap& osdmap) {
+    const auto metadata_pool = osdmap.get_pg_pool(mds->mdsmap->get_metadata_pool());
+    if (metadata_pool)
+      is_full = metadata_pool->has_flag(pg_pool_t::FLAG_FULL);
+    else
+      is_full = false;
+    dout(7) << __func__ << ": full = " << is_full << " epoch = "
+            << osdmap.get_epoch() << dendl;
+  });
+}
+
+/**
+ * Route a registered client request to the handler for its operation.
+ *
+ * Before dispatching:
+ *  - a killed request is dropped; an aborted one is killed through the cache;
+ *  - a request for which may_write() is true is answered with -EROFS when the
+ *    cache is read-only, or with the recorded slave error if there is one;
+ *  - while is_full is set, a fixed list of ops is answered with -ENOSPC
+ *    (LINK and RENAME only if no slave has witnessed the request yet); every
+ *    other op is still dispatched.
+ *
+ * Unknown ops are answered with -EOPNOTSUPP.
+ *
+ * @param mdr  the request to dispatch; mdr->client_request must be set
+ */
+void Server::dispatch_client_request(MDRequestRef& mdr)
+{
+  // we shouldn't be waiting on anyone.
+  ceph_assert(!mdr->has_more() || mdr->more()->waiting_on_slave.empty());
+
+  if (mdr->killed) {
+    dout(10) << "request " << *mdr << " was killed" << dendl;
+    return;
+  } else if (mdr->aborted) {
+    mdr->aborted = false;
+    mdcache->request_kill(mdr);
+    return;
+  }
+
+  const MClientRequest::const_ref &req = mdr->client_request;
+
+  if (logger) logger->inc(l_mdss_dispatch_client_request);
+
+  dout(7) << "dispatch_client_request " << *req << dendl;
+
+  if (req->may_write()) {
+    if (mdcache->is_readonly()) {
+      dout(10) << " read-only FS" << dendl;
+      respond_to_request(mdr, -EROFS);
+      return;
+    }
+    if (mdr->has_more() && mdr->more()->slave_error) {
+      dout(10) << " got error from slaves" << dendl;
+      respond_to_request(mdr, mdr->more()->slave_error);
+      return;
+    }
+  }
+
+  if (is_full) {
+    // Each op is listed exactly once (the original tested SETLAYOUT twice).
+    if (req->get_op() == CEPH_MDS_OP_SETLAYOUT ||
+        req->get_op() == CEPH_MDS_OP_SETDIRLAYOUT ||
+        req->get_op() == CEPH_MDS_OP_RMXATTR ||
+        req->get_op() == CEPH_MDS_OP_SETXATTR ||
+        req->get_op() == CEPH_MDS_OP_CREATE ||
+        req->get_op() == CEPH_MDS_OP_SYMLINK ||
+        req->get_op() == CEPH_MDS_OP_MKSNAP ||
+        ((req->get_op() == CEPH_MDS_OP_LINK ||
+          req->get_op() == CEPH_MDS_OP_RENAME) &&
+         (!mdr->has_more() || mdr->more()->witnessed.empty())) // haven't started slave request
+        ) {
+
+      dout(20) << __func__ << ": full, responding ENOSPC to op " << ceph_mds_op_name(req->get_op()) << dendl;
+      respond_to_request(mdr, -ENOSPC);
+      return;
+    } else {
+      dout(20) << __func__ << ": full, permitting op " << ceph_mds_op_name(req->get_op()) << dendl;
+    }
+  }
+
+  switch (req->get_op()) {
+  case CEPH_MDS_OP_LOOKUPHASH:
+  case CEPH_MDS_OP_LOOKUPINO:
+    handle_client_lookup_ino(mdr, false, false);
+    break;
+  case CEPH_MDS_OP_LOOKUPPARENT:
+    handle_client_lookup_ino(mdr, true, false);
+    break;
+  case CEPH_MDS_OP_LOOKUPNAME:
+    handle_client_lookup_ino(mdr, false, true);
+    break;
+
+    // inodes ops.
+  case CEPH_MDS_OP_LOOKUP:
+    handle_client_getattr(mdr, true);
+    break;
+
+  case CEPH_MDS_OP_LOOKUPSNAP:
+    // lookupsnap does not reference a CDentry; treat it as a getattr
+  case CEPH_MDS_OP_GETATTR:
+    handle_client_getattr(mdr, false);
+    break;
+
+  case CEPH_MDS_OP_SETATTR:
+    handle_client_setattr(mdr);
+    break;
+  case CEPH_MDS_OP_SETLAYOUT:
+    handle_client_setlayout(mdr);
+    break;
+  case CEPH_MDS_OP_SETDIRLAYOUT:
+    handle_client_setdirlayout(mdr);
+    break;
+  case CEPH_MDS_OP_SETXATTR:
+    handle_client_setxattr(mdr);
+    break;
+  case CEPH_MDS_OP_RMXATTR:
+    handle_client_removexattr(mdr);
+    break;
+
+  case CEPH_MDS_OP_READDIR:
+    handle_client_readdir(mdr);
+    break;
+
+  case CEPH_MDS_OP_SETFILELOCK:
+    handle_client_file_setlock(mdr);
+    break;
+
+  case CEPH_MDS_OP_GETFILELOCK:
+    handle_client_file_readlock(mdr);
+    break;
+
+    // funky.
+  case CEPH_MDS_OP_CREATE:
+    if (mdr->has_completed)
+      handle_client_open(mdr);  // already created.. just open
+    else
+      handle_client_openc(mdr);
+    break;
+
+  case CEPH_MDS_OP_OPEN:
+    handle_client_open(mdr);
+    break;
+
+    // namespace.
+    // no prior locks.
+  case CEPH_MDS_OP_MKNOD:
+    handle_client_mknod(mdr);
+    break;
+  case CEPH_MDS_OP_LINK:
+    handle_client_link(mdr);
+    break;
+  case CEPH_MDS_OP_UNLINK:
+  case CEPH_MDS_OP_RMDIR:
+    handle_client_unlink(mdr);
+    break;
+  case CEPH_MDS_OP_RENAME:
+    handle_client_rename(mdr);
+    break;
+  case CEPH_MDS_OP_MKDIR:
+    handle_client_mkdir(mdr);
+    break;
+  case CEPH_MDS_OP_SYMLINK:
+    handle_client_symlink(mdr);
+    break;
+
+
+    // snaps
+  case CEPH_MDS_OP_LSSNAP:
+    handle_client_lssnap(mdr);
+    break;
+  case CEPH_MDS_OP_MKSNAP:
+    handle_client_mksnap(mdr);
+    break;
+  case CEPH_MDS_OP_RMSNAP:
+    handle_client_rmsnap(mdr);
+    break;
+  case CEPH_MDS_OP_RENAMESNAP:
+    handle_client_renamesnap(mdr);
+    break;
+
+  default:
+    dout(1) << " unknown client op " << req->get_op() << dendl;
+    respond_to_request(mdr, -EOPNOTSUPP);
+  }
+}
+
+
+// ---------------------------------------
+// SLAVE REQUESTS
+
+/**
+ * Handle a slave request message sent by another MDS rank.
+ *
+ * Replies are passed to handle_slave_request_reply().  OP_RENAMENOTIFY is
+ * acknowledged immediately.  For everything else the matching local request
+ * is looked up (or started), any replicated stray dentry carried in the
+ * message is pinned on it, and the request is handed to
+ * dispatch_slave_request() once this MDS is in a state that may serve it.
+ *
+ * @param m  the incoming slave request
+ */
+void Server::handle_slave_request(const MMDSSlaveRequest::const_ref &m)
+{
+  dout(4) << "handle_slave_request " << m->get_reqid() << " from " << m->get_source() << dendl;
+  mds_rank_t from = mds_rank_t(m->get_source().num());
+
+  if (logger) logger->inc(l_mdss_handle_slave_request);
+
+  // reply?
+  if (m->is_reply())
+    return handle_slave_request_reply(m);
+
+  // the purpose of rename notify is enforcing causal message ordering. making sure
+  // bystanders have received all messages from rename srcdn's auth MDS.
+  if (m->get_op() == MMDSSlaveRequest::OP_RENAMENOTIFY) {
+    auto reply = MMDSSlaveRequest::create(m->get_reqid(), m->get_attempt(), MMDSSlaveRequest::OP_RENAMENOTIFYACK);
+    mds->send_message(reply, m->get_connection());
+    return;
+  }
+
+  CDentry *straydn = NULL;
+  if (m->straybl.length() > 0) {
+    straydn = mdcache->add_replica_stray(m->straybl, from);
+    ceph_assert(straydn);
+    m->straybl.clear();
+  }
+
+  // am i a new slave?
+  MDRequestRef mdr;
+  if (mdcache->have_request(m->get_reqid())) {
+    // existing?
+    mdr = mdcache->request_get(m->get_reqid());
+
+    // is my request newer?
+    if (mdr->attempt > m->get_attempt()) {
+      dout(10) << "local request " << *mdr << " attempt " << mdr->attempt << " > " << m->get_attempt()
+               << ", dropping " << *m << dendl;
+      return;
+    }
+
+
+    if (mdr->attempt < m->get_attempt()) {
+      // mine is old, close it out
+      dout(10) << "local request " << *mdr << " attempt " << mdr->attempt << " < " << m->get_attempt()
+               << ", closing out" << dendl;
+      mdcache->request_finish(mdr);
+      mdr.reset();
+    } else if (mdr->slave_to_mds != from) {
+      dout(10) << "local request " << *mdr << " not slave to mds." << from << dendl;
+      return;
+    }
+
+    // mdr is null here if the stale local attempt was just closed out above;
+    // in that case there is nothing left to abort, and an OP_FINISH falls
+    // through to the "missing slave request" path below.
+    if (mdr.get() &&
+        m->get_op() == MMDSSlaveRequest::OP_FINISH && m->is_abort()) {
+      mdr->aborted = true;
+      if (mdr->slave_request) {
+        // only abort on-going xlock, wrlock and auth pin
+        ceph_assert(!mdr->slave_did_prepare());
+      } else {
+        mdcache->request_finish(mdr);
+      }
+      return;
+    }
+  }
+  if (!mdr.get()) {
+    // new?
+    if (m->get_op() == MMDSSlaveRequest::OP_FINISH) {
+      dout(10) << "missing slave request for " << m->get_reqid()
+               << " OP_FINISH, must have lost race with a forward" << dendl;
+      return;
+    }
+    mdr = mdcache->request_start_slave(m->get_reqid(), m->get_attempt(), m);
+    mdr->set_op_stamp(m->op_stamp);
+  }
+  ceph_assert(mdr->slave_request == 0);     // only one at a time, please!
+
+  if (straydn) {
+    mdr->pin(straydn);
+    mdr->straydn = straydn;
+  }
+
+  if (!mds->is_clientreplay() && !mds->is_active() && !mds->is_stopping()) {
+    dout(3) << "not clientreplay|active yet, waiting" << dendl;
+    mds->wait_for_replay(new C_MDS_RetryMessage(mds, m));
+    return;
+  } else if (mds->is_clientreplay() && !mds->mdsmap->is_clientreplay(from) &&
+             mdr->locks.empty()) {
+    dout(3) << "not active yet, waiting" << dendl;
+    mds->wait_for_active(new C_MDS_RetryMessage(mds, m));
+    return;
+  }
+
+  mdr->reset_slave_request(m);
+
+  dispatch_slave_request(mdr);
+}
+
+/**
+ * Process a reply to a slave request that this MDS issued as master.
+ *
+ * OP_COMMITTED is recorded through mdcache->committed_master_slave().
+ * For all other ops the request is looked up by reqid, replies belonging to
+ * a different attempt are ignored, and the reply is handled according to
+ * its op.  Unknown ops abort.
+ *
+ * @param m  the reply message; the sending rank is taken from its source
+ */
+void Server::handle_slave_request_reply(const MMDSSlaveRequest::const_ref &m)
+{
+ mds_rank_t from = mds_rank_t(m->get_source().num());
+
+ // Not yet clientreplay/active/stopping: keep the reply for later only if
+ // the cache still tracks an uncommitted master for this reqid and sender.
+ if (!mds->is_clientreplay() && !mds->is_active() && !mds->is_stopping()) {
+ metareqid_t r = m->get_reqid();
+ if (!mdcache->have_uncommitted_master(r, from)) {
+ dout(10) << "handle_slave_request_reply ignoring slave reply from mds."
+ << from << " reqid " << r << dendl;
+ return;
+ }
+ dout(3) << "not clientreplay|active yet, waiting" << dendl;
+ mds->wait_for_replay(new C_MDS_RetryMessage(mds, m));
+ return;
+ }
+
+ if (m->get_op() == MMDSSlaveRequest::OP_COMMITTED) {
+ metareqid_t r = m->get_reqid();
+ mdcache->committed_master_slave(r, from);
+ return;
+ }
+
+ // NOTE(review): the result of request_get() is dereferenced unchecked;
+ // presumably the request is guaranteed to exist here -- confirm.
+ MDRequestRef mdr = mdcache->request_get(m->get_reqid());
+ if (m->get_attempt() != mdr->attempt) {
+ dout(10) << "handle_slave_request_reply " << *mdr << " ignoring reply from other attempt "
+ << m->get_attempt() << dendl;
+ return;
+ }
+
+ switch (m->get_op()) {
+ case MMDSSlaveRequest::OP_XLOCKACK:
+ {
+ // identify lock, master request
+ SimpleLock *lock = mds->locker->get_lock(m->get_lock_type(),
+ m->get_object_info());
+ mdr->more()->slaves.insert(from);
+ // take over the lock state the slave encoded into the ack
+ lock->decode_locked_state(m->get_lock_data());
+ dout(10) << "got remote xlock on " << *lock << " on " << *lock->get_parent() << dendl;
+ mdr->locks.emplace_hint(mdr->locks.end(), lock, MutationImpl::LockOp::XLOCK);
+ mdr->finish_locking(lock);
+ lock->get_xlock(mdr, mdr->get_client());
+
+ // this sender must be the only slave still being waited on
+ ceph_assert(mdr->more()->waiting_on_slave.count(from));
+ mdr->more()->waiting_on_slave.erase(from);
+ ceph_assert(mdr->more()->waiting_on_slave.empty());
+ mdcache->dispatch_request(mdr);
+ }
+ break;
+
+ case MMDSSlaveRequest::OP_WRLOCKACK:
+ {
+ // identify lock, master request
+ SimpleLock *lock = mds->locker->get_lock(m->get_lock_type(),
+ m->get_object_info());
+ mdr->more()->slaves.insert(from);
+ dout(10) << "got remote wrlock on " << *lock << " on " << *lock->get_parent() << dendl;
+ // record the lock as a remote wrlock held on the sending rank
+ auto it = mdr->locks.emplace_hint(mdr->locks.end(),
+ lock, MutationImpl::LockOp::REMOTE_WRLOCK, from);
+ ceph_assert(it->is_remote_wrlock());
+ ceph_assert(it->wrlock_target == from);
+
+ mdr->finish_locking(lock);
+
+ // this sender must be the only slave still being waited on
+ ceph_assert(mdr->more()->waiting_on_slave.count(from));
+ mdr->more()->waiting_on_slave.erase(from);
+ ceph_assert(mdr->more()->waiting_on_slave.empty());
+ mdcache->dispatch_request(mdr);
+ }
+ break;
+
+ case MMDSSlaveRequest::OP_AUTHPINACK:
+ handle_slave_auth_pin_ack(mdr, m);
+ break;
+
+ case MMDSSlaveRequest::OP_LINKPREPACK:
+ handle_slave_link_prep_ack(mdr, m);
+ break;
+
+ case MMDSSlaveRequest::OP_RMDIRPREPACK:
+ handle_slave_rmdir_prep_ack(mdr, m);
+ break;
+
+ case MMDSSlaveRequest::OP_RENAMEPREPACK:
+ handle_slave_rename_prep_ack(mdr, m);
+ break;
+
+ case MMDSSlaveRequest::OP_RENAMENOTIFYACK:
+ handle_slave_rename_notify_ack(mdr, m);
+ break;
+
+ default:
+ ceph_abort();
+ }
+}
+
+/**
+ * Execute the slave request currently attached to mdr (mdr->slave_request).
+ *
+ * An aborted request is finished immediately.  Otherwise the op selects the
+ * action: taking or releasing a remote xlock/wrlock, dropping all locks,
+ * auth pinning, the link/rmdir/rename prepare handlers, or finishing the
+ * request.  Unknown ops abort.
+ *
+ * @param mdr  the slave request state; mdr->slave_request must be set
+ */
+void Server::dispatch_slave_request(MDRequestRef& mdr)
+{
+ dout(7) << "dispatch_slave_request " << *mdr << " " << *mdr->slave_request << dendl;
+
+ if (mdr->aborted) {
+ dout(7) << " abort flag set, finishing" << dendl;
+ mdcache->request_finish(mdr);
+ return;
+ }
+
+ if (logger) logger->inc(l_mdss_dispatch_slave_request);
+
+ int op = mdr->slave_request->get_op();
+ switch (op) {
+ case MMDSSlaveRequest::OP_XLOCK:
+ case MMDSSlaveRequest::OP_WRLOCK:
+ {
+ // identify object
+ SimpleLock *lock = mds->locker->get_lock(mdr->slave_request->get_lock_type(),
+ mdr->slave_request->get_object_info());
+
+ if (!lock) {
+ dout(10) << "don't have object, dropping" << dendl;
+ ceph_abort(); // can this happen, if we auth pinned properly.
+ }
+ // An xlock request for an object we are not auth for is only logged;
+ // no ack is sent in that case.
+ if (op == MMDSSlaveRequest::OP_XLOCK && !lock->get_parent()->is_auth()) {
+ dout(10) << "not auth for remote xlock attempt, dropping on "
+ << *lock << " on " << *lock->get_parent() << dendl;
+ } else {
+ // use acquire_locks so that we get auth_pinning.
+ // Re-list the xlocks/wrlocks already held, then add the new one.
+ MutationImpl::LockOpVec lov;
+ for (const auto& p : mdr->locks) {
+ if (p.is_xlock())
+ lov.add_xlock(p.lock);
+ else if (p.is_wrlock())
+ lov.add_wrlock(p.lock);
+ }
+
+ int replycode = 0;
+ switch (op) {
+ case MMDSSlaveRequest::OP_XLOCK:
+ lov.add_xlock(lock);
+ replycode = MMDSSlaveRequest::OP_XLOCKACK;
+ break;
+ case MMDSSlaveRequest::OP_WRLOCK:
+ lov.add_wrlock(lock);
+ replycode = MMDSSlaveRequest::OP_WRLOCKACK;
+ break;
+ }
+
+ // Returns without resetting slave_request when the locks are not
+ // acquired yet.
+ if (!mds->locker->acquire_locks(mdr, lov))
+ return;
+
+ // ack
+ auto r = MMDSSlaveRequest::create(mdr->reqid, mdr->attempt, replycode);
+ r->set_lock_type(lock->get_type());
+ lock->get_parent()->set_object_info(r->get_object_info());
+ // only xlock acks carry the encoded lock state
+ if (replycode == MMDSSlaveRequest::OP_XLOCKACK)
+ lock->encode_locked_state(r->get_lock_data());
+ mds->send_message(r, mdr->slave_request->get_connection());
+ }
+
+ // done.
+ mdr->reset_slave_request();
+ }
+ break;
+
+ case MMDSSlaveRequest::OP_UNXLOCK:
+ case MMDSSlaveRequest::OP_UNWRLOCK:
+ {
+ SimpleLock *lock = mds->locker->get_lock(mdr->slave_request->get_lock_type(),
+ mdr->slave_request->get_object_info());
+ ceph_assert(lock);
+ // the lock being released must be one this request holds
+ auto it = mdr->locks.find(lock);
+ ceph_assert(it != mdr->locks.end());
+ bool need_issue = false;
+ switch (op) {
+ case MMDSSlaveRequest::OP_UNXLOCK:
+ mds->locker->xlock_finish(it, mdr.get(), &need_issue);
+ break;
+ case MMDSSlaveRequest::OP_UNWRLOCK:
+ mds->locker->wrlock_finish(it, mdr.get(), &need_issue);
+ break;
+ }
+ // NOTE(review): the cast assumes the lock's parent is a CInode whenever
+ // need_issue is set -- confirm against Locker.
+ if (need_issue)
+ mds->locker->issue_caps(static_cast<CInode*>(lock->get_parent()));
+
+ // done. no ack necessary.
+ mdr->reset_slave_request();
+ }
+ break;
+
+ case MMDSSlaveRequest::OP_DROPLOCKS:
+ mds->locker->drop_locks(mdr.get());
+ mdr->reset_slave_request();
+ break;
+
+ case MMDSSlaveRequest::OP_AUTHPIN:
+ handle_slave_auth_pin(mdr);
+ break;
+
+ case MMDSSlaveRequest::OP_LINKPREP:
+ case MMDSSlaveRequest::OP_UNLINKPREP:
+ handle_slave_link_prep(mdr);
+ break;
+
+ case MMDSSlaveRequest::OP_RMDIRPREP:
+ handle_slave_rmdir_prep(mdr);
+ break;
+
+ case MMDSSlaveRequest::OP_RENAMEPREP:
+ handle_slave_rename_prep(mdr);
+ break;
+
+ case MMDSSlaveRequest::OP_FINISH:
+ // information about rename imported caps
+ if (mdr->slave_request->inode_export.length() > 0)
+ mdr->more()->inode_import = mdr->slave_request->inode_export;
+ // finish off request.
+ mdcache->request_finish(mdr);
+ break;
+
+ default:
+ ceph_abort();
+ }
+}
+
+/**
+ * Serve an OP_AUTHPIN slave request: auth pin the objects named by the
+ * master and reply with OP_AUTHPINACK.
+ *
+ * The attempt fails (no new pins, local pins dropped) when the cache is
+ * read-only, when an object is not in the cache, when this MDS is not auth
+ * for an object, or when an object cannot be pinned and the request is
+ * marked nonblock.  If an object cannot be pinned and the request may
+ * block, the request waits for unfreeze and is retried; no ack is sent yet.
+ * The ack always lists the auth pins held by mdr at that point.
+ *
+ * @param mdr  slave request state; mdr->slave_request carries the object list
+ */
+void Server::handle_slave_auth_pin(MDRequestRef& mdr)
+{
+ dout(10) << "handle_slave_auth_pin " << *mdr << dendl;
+
+ // build list of objects
+ list<MDSCacheObject*> objects;
+ CInode *auth_pin_freeze = NULL;
+ bool fail = false, wouldblock = false, readonly = false;
+
+ if (mdcache->is_readonly()) {
+ dout(10) << " read-only FS" << dendl;
+ readonly = true;
+ fail = true;
+ }
+
+ if (!fail) {
+ for (const auto &oi : mdr->slave_request->get_authpins()) {
+ MDSCacheObject *object = mdcache->get_object(oi);
+ if (!object) {
+ dout(10) << " don't have " << oi << dendl;
+ fail = true;
+ break;
+ }
+
+ objects.push_back(object);
+ // remember the object the master also asked us to freeze
+ if (oi == mdr->slave_request->get_authpin_freeze())
+ auth_pin_freeze = static_cast<CInode*>(object);
+ }
+ }
+
+ // can we auth pin them?
+ if (!fail) {
+ for (list<MDSCacheObject*>::iterator p = objects.begin();
+ p != objects.end();
+ ++p) {
+ if (!(*p)->is_auth()) {
+ dout(10) << " not auth for " << **p << dendl;
+ fail = true;
+ break;
+ }
+ if (mdr->is_auth_pinned(*p))
+ continue;
+ if (!mdr->can_auth_pin(*p)) {
+ if (mdr->slave_request->is_nonblock()) {
+ dout(10) << " can't auth_pin (freezing?) " << **p << " nonblocking" << dendl;
+ fail = true;
+ wouldblock = true;
+ break;
+ }
+ // wait
+ dout(10) << " waiting for authpinnable on " << **p << dendl;
+ (*p)->add_waiter(CDir::WAIT_UNFREEZE, new C_MDS_RetryRequest(mdcache, mdr));
+ mdr->drop_local_auth_pins();
+
+ mds->locker->notify_freeze_waiter(*p);
+ return;
+ }
+ }
+ }
+
+ // auth pin!
+ if (fail) {
+ mdr->drop_local_auth_pins(); // just in case
+ } else {
+ /* freeze authpin wrong inode */
+ if (mdr->has_more() && mdr->more()->is_freeze_authpin &&
+ mdr->more()->rename_inode != auth_pin_freeze)
+ mdr->unfreeze_auth_pin(true);
+
+ /* handle_slave_rename_prep() call freeze_inode() to wait for all other operations
+ * on the source inode to complete. This happens after all locks for the rename
+ * operation are acquired. But to acquire locks, we need auth pin locks' parent
+ * objects first. So there is an ABBA deadlock if someone auth pins the source inode
+ * after locks are acquired and before Server::handle_slave_rename_prep() is called.
+ * The solution is freeze the inode and prevent other MDRequests from getting new
+ * auth pins.
+ */
+ if (auth_pin_freeze) {
+ dout(10) << " freezing auth pin on " << *auth_pin_freeze << dendl;
+ // not frozen yet: retry this request once the inode reports WAIT_FROZEN
+ if (!mdr->freeze_auth_pin(auth_pin_freeze)) {
+ auth_pin_freeze->add_waiter(CInode::WAIT_FROZEN, new C_MDS_RetryRequest(mdcache, mdr));
+ mds->mdlog->flush();
+ return;
+ }
+ }
+ for (list<MDSCacheObject*>::iterator p = objects.begin();
+ p != objects.end();
+ ++p) {
+ dout(10) << "auth_pinning " << **p << dendl;
+ mdr->auth_pin(*p);
+ }
+ }
+
+ // ack!
+ auto reply = MMDSSlaveRequest::create(mdr->reqid, mdr->attempt, MMDSSlaveRequest::OP_AUTHPINACK);
+
+ // return list of my auth_pins (if any)
+ for (const auto &p : mdr->auth_pins) {
+ MDSCacheObjectInfo info;
+ p->set_object_info(info);
+ reply->get_authpins().push_back(info);
+ // echo the frozen inode back only if it is among the pins we hold
+ if (p == (MDSCacheObject*)auth_pin_freeze)
+ auth_pin_freeze->set_object_info(reply->get_authpin_freeze());
+ }
+
+ if (wouldblock)
+ reply->mark_error_wouldblock();
+ if (readonly)
+ reply->mark_error_rofs();
+
+ mds->send_message_mds(reply, mdr->slave_to_mds);
+
+ // clean up this request
+ mdr->reset_slave_request();
+ return;
+}
+
+/**
+ * Master side: absorb an OP_AUTHPINACK from a slave.
+ *
+ * Synchronises mdr->remote_auth_pins with the list of pins reported in the
+ * ack, records an rofs/wouldblock error (and marks the request aborted),
+ * notes the sender as a slave, and re-dispatches the request once no more
+ * slaves are being waited on.
+ *
+ * @param mdr  the master request
+ * @param ack  the ack message; the sending rank is taken from its source
+ */
+void Server::handle_slave_auth_pin_ack(MDRequestRef& mdr, const MMDSSlaveRequest::const_ref &ack)
+{
+ dout(10) << "handle_slave_auth_pin_ack on " << *mdr << " " << *ack << dendl;
+ mds_rank_t from = mds_rank_t(ack->get_source().num());
+
+ // added auth pins?
+ set<MDSCacheObject*> pinned;
+ for (const auto &oi : ack->get_authpins()) {
+ MDSCacheObject *object = mdcache->get_object(oi);
+ ceph_assert(object); // we pinned it
+ dout(10) << " remote has pinned " << *object << dendl;
+ // objects pinned locally are not tracked as remote pins
+ if (!mdr->is_auth_pinned(object))
+ mdr->remote_auth_pins[object] = from;
+ if (oi == ack->get_authpin_freeze())
+ mdr->set_remote_frozen_auth_pin(static_cast<CInode *>(object));
+ pinned.insert(object);
+ }
+
+ // removed frozen auth pin ?
+ // (the ack carries a default-constructed freeze target in that case)
+ if (mdr->more()->is_remote_frozen_authpin &&
+ ack->get_authpin_freeze() == MDSCacheObjectInfo()) {
+ auto p = mdr->remote_auth_pins.find(mdr->more()->rename_inode);
+ ceph_assert(p != mdr->remote_auth_pins.end());
+ if (p->second == from) {
+ mdr->more()->is_remote_frozen_authpin = false;
+ }
+ }
+
+ // removed auth pins?
+ // Drop every pin attributed to this sender that the ack no longer lists.
+ auto p = mdr->remote_auth_pins.begin();
+ while (p != mdr->remote_auth_pins.end()) {
+ MDSCacheObject* object = p->first;
+ if (p->second == from && pinned.count(object) == 0) {
+ dout(10) << " remote has unpinned " << *object << dendl;
+ // post-increment: advance before the erased element is invalidated
+ mdr->remote_auth_pins.erase(p++);
+ } else {
+ ++p;
+ }
+ }
+
+ if (ack->is_error_rofs()) {
+ mdr->more()->slave_error = -EROFS;
+ mdr->aborted = true;
+ } else if (ack->is_error_wouldblock()) {
+ mdr->more()->slave_error = -EWOULDBLOCK;
+ mdr->aborted = true;
+ }
+
+ // note slave
+ mdr->more()->slaves.insert(from);
+
+ // clear from waiting list
+ ceph_assert(mdr->more()->waiting_on_slave.count(from));
+ mdr->more()->waiting_on_slave.erase(from);
+
+ // go again?
+ if (mdr->more()->waiting_on_slave.empty())
+ mdcache->dispatch_request(mdr);
+ else
+ dout(10) << "still waiting on slaves " << mdr->more()->waiting_on_slave << dendl;
+}
+
+
+// ---------------------------------------
+// HELPERS
+
+
+/**
+ * Decide whether the request's session may perform @mask on @in.
+ *
+ * A request without a session is always allowed.  Otherwise the session's
+ * check_access() is consulted with the caller uid/gid, the caller gid list
+ * and the setattr uid/gid taken from the client request.  A negative result
+ * is sent back to the client as the reply to the request.
+ *
+ * @return true if the caller may proceed, false if the request was answered
+ */
+bool Server::check_access(MDRequestRef& mdr, CInode *in, unsigned mask)
+{
+  if (!mdr->session)
+    return true;
+
+  const auto& creq = mdr->client_request;
+  int result = mdr->session->check_access(
+    in, mask,
+    creq->get_caller_uid(),
+    creq->get_caller_gid(),
+    &creq->get_caller_gid_list(),
+    creq->head.args.setattr.uid,
+    creq->head.args.setattr.gid);
+  if (result >= 0)
+    return true;
+
+  respond_to_request(mdr, result);
+  return false;
+}
+
+/**
+ * Check whether a directory fragment still has room for another entry.
+ *
+ * When the fragment size has reached mds_bal_fragment_size_max the request
+ * is answered with -ENOSPC.
+ *
+ * @return true if there is room, false if the request was answered
+ */
+bool Server::check_fragment_space(MDRequestRef &mdr, CDir *in)
+{
+  const auto frag_size = in->get_frag_size();
+  if (frag_size < g_conf()->mds_bal_fragment_size_max)
+    return true;
+
+  dout(10) << "fragment " << *in << " size exceeds " << g_conf()->mds_bal_fragment_size_max << " (ENOSPC)" << dendl;
+  respond_to_request(mdr, -ENOSPC);
+  return false;
+}
+
+
+/** validate_dentry_dir
+ *
+ * Find the dirfrag of @diri that would hold @dname.  Whether the dentry
+ * itself exists is not checked.
+ *
+ * Returns NULL when @diri is not a directory (the request is answered with
+ * -ENOTDIR), when try_open_auth_dirfrag() yields no dirfrag, or when the
+ * dirfrag is frozen (the request is queued for retry on unfreeze).
+ */
+CDir *Server::validate_dentry_dir(MDRequestRef& mdr, CInode *diri, std::string_view dname)
+{
+  // the parent has to be a directory
+  if (!diri->is_dir()) {
+    dout(7) << "validate_dentry_dir: not a dir" << dendl;
+    respond_to_request(mdr, -ENOTDIR);
+    return nullptr;
+  }
+
+  // pick the fragment by name and open it
+  frag_t frag = diri->pick_dirfrag(dname);
+  CDir *dirfrag = try_open_auth_dirfrag(diri, frag, mdr);
+  if (!dirfrag)
+    return nullptr;
+
+  if (!dirfrag->is_frozen())
+    return dirfrag;
+
+  // frozen: retry once it thaws
+  dout(7) << "dir is frozen " << *dirfrag << dendl;
+  dirfrag->add_waiter(CDir::WAIT_UNFREEZE, new C_MDS_RetryRequest(mdcache, mdr));
+  return nullptr;
+}
+
+
+/** prepare_null_dentry
+ * Return the dentry for @dname in @dir, creating a null one if none exists.
+ *
+ * An existing dentry with a non-null linkage is returned only when @okexist
+ * is set; otherwise the request is answered with -EEXIST.  If the dentry is
+ * not cached and the directory may still contain it on disk, the directory
+ * is fetched and the request retried.  Returns NULL whenever the request
+ * was answered or deferred.
+ */
+CDentry* Server::prepare_null_dentry(MDRequestRef& mdr, CDir *dir, std::string_view dname, bool okexist)
+{
+  dout(10) << "prepare_null_dentry " << dname << " in " << *dir << dendl;
+  ceph_assert(dir->is_auth());
+
+  client_t client = mdr->get_client();
+
+  CDentry *dentry = dir->lookup(dname);
+  if (!dentry) {
+    // not cached: the name may still exist on disk unless the dir is
+    // complete or its bloom filter rules the name out
+    if (!dir->is_complete() && (!dir->has_bloom() || dir->is_in_bloom(dname))) {
+      dout(7) << " incomplete dir contents for " << *dir << ", fetching" << dendl;
+      dir->fetch(new C_MDS_RetryRequest(mdcache, mdr));
+      return nullptr;
+    }
+
+    // create a fresh null dentry
+    dentry = dir->add_null_dentry(dname, mdcache->get_global_snaprealm()->get_newest_seq() + 1);
+    dentry->mark_new();
+    dout(10) << "prepare_null_dentry added " << *dentry << dendl;
+    return dentry;
+  }
+
+  /*
+  if (dn->lock.is_xlocked_by_other(mdr)) {
+    dout(10) << "waiting on xlocked dentry " << *dn << dendl;
+    dn->lock.add_waiter(SimpleLock::WAIT_RD, new C_MDS_RetryRequest(mdcache, mdr));
+    return 0;
+  }
+  */
+  if (dentry->get_linkage(client, mdr)->is_null()) {
+    // existing null dentry: move its first snapid forward if needed
+    snapid_t next_snap = mdcache->get_global_snaprealm()->get_newest_seq() + 1;
+    dentry->first = std::max(dentry->first, next_snap);
+    return dentry;
+  }
+
+  // the name is already linked
+  dout(10) << "dentry " << dname << " exists in " << *dir << dendl;
+  if (!okexist) {
+    respond_to_request(mdr, -EEXIST);
+    return nullptr;
+  }
+  return dentry;
+}
+
+/**
+ * Return a pinned stray dentry for @in, reusing mdr->straydn when its name
+ * matches the stray name of @in.
+ *
+ * A mismatching cached stray is unpinned and replaced.  Unless the client
+ * request is a replay, the stray directory must pass
+ * check_fragment_space(); if it does not, NULL is returned.
+ */
+CDentry* Server::prepare_stray_dentry(MDRequestRef& mdr, CInode *in)
+{
+  if (CDentry *cached = mdr->straydn) {
+    string expected_name;
+    in->name_stray_dentry(expected_name);
+    if (cached->get_name() == expected_name)
+      return cached;
+
+    // wrong stray for this inode; let go of it
+    ceph_assert(!mdr->done_locking);
+    mdr->unpin(cached);
+  }
+
+  CDir *straydir = mdcache->get_stray_dir(in);
+
+  const bool replaying = mdr->client_request->is_replay();
+  if (!replaying && !check_fragment_space(mdr, straydir))
+    return nullptr;
+
+  CDentry *fresh = mdcache->get_or_create_stray_dentry(in);
+  mdr->straydn = fresh;
+  mdr->pin(fresh);
+  return fresh;
+}
+
+/** prepare_new_inode
+ *
+ * create a new inode. set c/m/atime. hit dir pop.
+ *
+ * Allocates a CInode, assigns it an inode number (from the session's
+ * preallocated set when allowed, otherwise from the ino table), fills in
+ * mode, layout, ownership and timestamps, decodes any xattrs carried in the
+ * request data, and adds the inode to the cache.
+ *
+ * @param mdr     the client request creating the inode
+ * @param dir     parent dirfrag; its inode supplies the setgid inheritance
+ * @param useino  inode number requested by the client (0 if none)
+ * @param mode    mode bits for the new inode
+ * @param layout  optional file layout; used only for non-directories
+ * @return the new inode, already added to mdcache
+ *
+ * NOTE(review): mdr->session is dereferenced without a null check --
+ * presumably callers guarantee a session; confirm.
+ */
+CInode* Server::prepare_new_inode(MDRequestRef& mdr, CDir *dir, inodeno_t useino, unsigned mode,
+ file_layout_t *layout)
+{
+ CInode *in = new CInode(mdcache);
+
+ // Server::prepare_force_open_sessions() can re-open session in closing
+ // state. In that corner case, session's prealloc_inos are being freed.
+ // To simplify the code, we disallow using/refilling session's prealloc_ino
+ // while session is opening.
+ bool allow_prealloc_inos = mdr->session->is_open();
+
+ // assign ino
+ if (allow_prealloc_inos &&
+ mdr->session->info.prealloc_inos.size()) {
+ mdr->used_prealloc_ino =
+ in->inode.ino = mdr->session->take_ino(useino); // prealloc -> used
+ mds->sessionmap.mark_projected(mdr->session);
+
+ dout(10) << "prepare_new_inode used_prealloc " << mdr->used_prealloc_ino
+ << " (" << mdr->session->info.prealloc_inos
+ << ", " << mdr->session->info.prealloc_inos.size() << " left)"
+ << dendl;
+ } else {
+ mdr->alloc_ino =
+ in->inode.ino = mds->inotable->project_alloc_id(useino);
+ dout(10) << "prepare_new_inode alloc " << mdr->alloc_ino << dendl;
+ }
+
+ // a mismatch with the client-requested ino is only reported, not fatal
+ if (useino && useino != in->inode.ino) {
+ dout(0) << "WARNING: client specified " << useino << " and i allocated " << in->inode.ino << dendl;
+ mds->clog->error() << mdr->client_request->get_source()
+ << " specified ino " << useino
+ << " but mds." << mds->get_nodeid() << " allocated " << in->inode.ino;
+ //ceph_abort(); // just for now.
+ }
+
+ // top the session's projected preallocation back up to
+ // mds_client_prealloc_inos once it falls below half of that
+ if (allow_prealloc_inos &&
+ mdr->session->get_num_projected_prealloc_inos() < g_conf()->mds_client_prealloc_inos / 2) {
+ int need = g_conf()->mds_client_prealloc_inos - mdr->session->get_num_projected_prealloc_inos();
+ mds->inotable->project_alloc_ids(mdr->prealloc_inos, need);
+ ceph_assert(mdr->prealloc_inos.size()); // or else fix projected increment semantics
+ mdr->session->pending_prealloc_inos.insert(mdr->prealloc_inos);
+ mds->sessionmap.mark_projected(mdr->session);
+ dout(10) << "prepare_new_inode prealloc " << mdr->prealloc_inos << dendl;
+ }
+
+ in->inode.version = 1;
+ in->inode.xattr_version = 1;
+ in->inode.nlink = 1; // FIXME
+
+ in->inode.mode = mode;
+
+ // FIPS zeroization audit 20191117: this memset is not security related.
+ memset(&in->inode.dir_layout, 0, sizeof(in->inode.dir_layout));
+ // directories get the default dir hash; files get the supplied layout or
+ // the cache-wide default file layout
+ if (in->inode.is_dir()) {
+ in->inode.dir_layout.dl_dir_hash = g_conf()->mds_default_dir_hash;
+ } else if (layout) {
+ in->inode.layout = *layout;
+ } else {
+ in->inode.layout = mdcache->default_file_layout;
+ }
+
+ in->inode.truncate_size = -1ull; // not truncated, yet!
+ in->inode.truncate_seq = 1; /* starting with 1, 0 is kept for no-truncation logic */
+
+ CInode *diri = dir->get_inode();
+
+ dout(10) << oct << " dir mode 0" << diri->inode.mode << " new mode 0" << mode << dec << dendl;
+
+ // S_ISGID on the parent: inherit its gid, and propagate the bit to new
+ // directories
+ if (diri->inode.mode & S_ISGID) {
+ dout(10) << " dir is sticky" << dendl;
+ in->inode.gid = diri->inode.gid;
+ if (S_ISDIR(mode)) {
+ dout(10) << " new dir also sticky" << dendl;
+ in->inode.mode |= S_ISGID;
+ }
+ } else
+ in->inode.gid = mdr->client_request->get_caller_gid();
+
+ in->inode.uid = mdr->client_request->get_caller_uid();
+
+ in->inode.btime = in->inode.ctime = in->inode.mtime = in->inode.atime =
+ mdr->get_op_stamp();
+
+ in->inode.change_attr = 0;
+
+ const MClientRequest::const_ref &req = mdr->client_request;
+ if (req->get_data().length()) {
+ auto p = req->get_data().cbegin();
+
+ // xattrs on new inode?
+ CInode::mempool_xattr_map xattrs;
+ decode_noshare(xattrs, p);
+ for (const auto &p : xattrs) {
+ dout(10) << "prepare_new_inode setting xattr " << p.first << dendl;
+ // insert, or overwrite the value if the key is already present
+ auto em = in->xattrs.emplace(std::piecewise_construct, std::forward_as_tuple(p.first), std::forward_as_tuple(p.second));
+ if (!em.second)
+ em.first->second = p.second;
+ }
+ }
+
+ // no inline data unless both the mdsmap and the client connection allow it
+ if (!mds->mdsmap->get_inline_data_enabled() ||
+ !mdr->session->get_connection()->has_feature(CEPH_FEATURE_MDS_INLINE_DATA))
+ in->inode.inline_data.version = CEPH_INLINE_NONE;
+
+ mdcache->add_inode(in); // add
+ dout(10) << "prepare_new_inode " << *in << dendl;
+ return in;
+}
+
+/**
+ * Record the inode numbers allocated for @mdr in @blob.
+ *
+ * Passes the allocated ino, the used preallocated ino and the newly
+ * preallocated set, together with the request source and the projected
+ * session map and ino table versions, to EMetaBlob::set_ino_alloc().
+ */
+void Server::journal_allocated_inos(MDRequestRef& mdr, EMetaBlob *blob)
+{
+  dout(20) << "journal_allocated_inos sessionmapv " << mds->sessionmap.get_projected()
+           << " inotablev " << mds->inotable->get_projected_version()
+           << dendl;
+
+  const auto sessionmap_version = mds->sessionmap.get_projected();
+  const auto inotable_version = mds->inotable->get_projected_version();
+  blob->set_ino_alloc(mdr->alloc_ino,
+                      mdr->used_prealloc_ino,
+                      mdr->prealloc_inos,
+                      mdr->client_request->get_source(),
+                      sessionmap_version,
+                      inotable_version);
+}
+
+/**
+ * Apply the inode allocations that were projected for @mdr.
+ *
+ * A directly allocated ino is applied to the ino table.  Newly preallocated
+ * inos move from the session's pending set to its prealloc set and are
+ * applied to the ino table.  A consumed preallocated ino is removed from
+ * the session's used set.  The session is marked dirty in the last two
+ * cases, so @session must be non-null for them.
+ */
+void Server::apply_allocated_inos(MDRequestRef& mdr, Session *session)
+{
+  dout(10) << "apply_allocated_inos " << mdr->alloc_ino
+           << " / " << mdr->prealloc_inos
+           << " / " << mdr->used_prealloc_ino << dendl;
+
+  // ino taken straight from the table
+  if (mdr->alloc_ino)
+    mds->inotable->apply_alloc_id(mdr->alloc_ino);
+
+  // freshly preallocated range: pending -> prealloc
+  auto& fresh_prealloc = mdr->prealloc_inos;
+  if (fresh_prealloc.size()) {
+    ceph_assert(session);
+    session->pending_prealloc_inos.subtract(fresh_prealloc);
+    session->info.prealloc_inos.insert(fresh_prealloc);
+    mds->sessionmap.mark_dirty(session, !mdr->used_prealloc_ino);
+    mds->inotable->apply_alloc_ids(fresh_prealloc);
+  }
+
+  // ino consumed from the session's preallocation
+  if (mdr->used_prealloc_ino) {
+    ceph_assert(session);
+    session->info.used_inos.erase(mdr->used_prealloc_ino);
+    mds->sessionmap.mark_dirty(session);
+  }
+}
+
+/**
+ * Completion for a find_ino_peers() lookup started on behalf of a client
+ * request: -ESTALE is returned to the client, any other result re-dispatches
+ * the request.
+ */
+class C_MDS_TryFindInode : public ServerContext {
+  MDRequestRef mdr;
+public:
+  C_MDS_TryFindInode(Server *srv, MDRequestRef& request)
+    : ServerContext(srv), mdr(request) {}
+
+  void finish(int r) override {
+    if (r != -ESTALE) {
+      server->dispatch_client_request(mdr);
+      return;
+    }
+    // :( find_ino_peers failed
+    server->respond_to_request(mdr, r);
+  }
+};
+
+/**
+ * Context factory bound to one request: every build() call returns a new
+ * C_MDS_RetryRequest for that request.
+ */
+class CF_MDS_MDRContextFactory : public MDSContextFactory {
+public:
+  CF_MDS_MDRContextFactory(MDCache *mdc, MDRequestRef &request)
+    : cache(mdc), mdr(request) {}
+
+  MDSContext *build() {
+    MDSContext *retry = new C_MDS_RetryRequest(cache, mdr);
+    return retry;
+  }
+
+private:
+  MDCache *cache;
+  MDRequestRef mdr;
+};
+
+/**
+ * Traverse to the parent directory of @refpath and return the dirfrag that
+ * would hold its last component.
+ *
+ * The last dentry is split off @refpath (taken by value) and the remaining
+ * path is traversed with MDS_TRAVERSE_FORWARD, filling @trace.  Returns 0
+ * when the path is empty (-EINVAL is sent), when the traversal is delayed,
+ * when it fails (the error is sent, except -ESTALE which starts a
+ * find_ino_peers() lookup), or when validate_dentry_dir() returns no dir.
+ */
+CDir *Server::traverse_to_auth_dir(MDRequestRef& mdr, vector<CDentry*> &trace, filepath refpath)
+{
+  // an empty path has no parent dir / dname to split
+  if (refpath.depth() == 0) {
+    dout(7) << "can't do that to root" << dendl;
+    respond_to_request(mdr, -EINVAL);
+    return nullptr;
+  }
+
+  string dname = refpath.last_dentry();
+  refpath.pop_dentry();
+
+  dout(10) << "traverse_to_auth_dir dirpath " << refpath << " dname " << dname << dendl;
+
+  // walk to the parent directory inode
+  CInode *diri;
+  CF_MDS_MDRContextFactory cf(mdcache, mdr);
+  int rc = mdcache->path_traverse(mdr, cf, refpath, &trace, &diri, MDS_TRAVERSE_FORWARD);
+  if (rc > 0)
+    return nullptr; // delayed
+  if (rc == -ESTALE) {
+    dout(10) << "FAIL on ESTALE but attempting recovery" << dendl;
+    mdcache->find_ino_peers(refpath.get_ino(), new C_MDS_TryFindInode(this, mdr));
+    return nullptr;
+  }
+  if (rc < 0) {
+    respond_to_request(mdr, rc);
+    return nullptr;
+  }
+
+  // is it an auth dir?
+  CDir *authdir = validate_dentry_dir(mdr, diri, dname);
+  if (!authdir)
+    return nullptr; // forwarded or waiting for freeze
+
+  dout(10) << "traverse_to_auth_dir " << *authdir << dendl;
+  return authdir;
+}
+
+ /*
+  * Traverse filepath #n of the request (n == 0: filepath, otherwise
+  * filepath2), queue rdlocks for every traversed dentry plus the snap
+  * rdlocks into lov, pin the target inode on the request and return it.
+  *
+  * If this returns null, the request has been handled as appropriate:
+  * forwarded on, queued for retry, or the client's been replied to.
+  */
+ CInode* Server::rdlock_path_pin_ref(MDRequestRef& mdr, int n,
+                                     MutationImpl::LockOpVec& lov,
+                                     bool want_auth,
+                                     bool no_want_auth, /* for readdir, who doesn't want auth _even_if_ it's
+                                                           a snapped dir */
+                                     file_layout_t **layout,
+                                     bool no_lookup) // true if we cannot return a null dentry lease
+ {
+   const filepath& refpath = n ? mdr->get_filepath2() : mdr->get_filepath();
+   dout(10) << "rdlock_path_pin_ref " << *mdr << " " << refpath << dendl;
+
+   // done_locking already set on the request: hand back the stored inode
+   if (mdr->done_locking)
+     return mdr->in[n];
+
+   // traverse
+   CF_MDS_MDRContextFactory retry_factory(mdcache, mdr);
+   const int rc = mdcache->path_traverse(mdr, retry_factory, refpath,
+                                         &mdr->dn[n], &mdr->in[n], MDS_TRAVERSE_FORWARD);
+   if (rc > 0)
+     return nullptr;  // delayed
+
+   if (rc < 0) {  // error
+     if (rc == -ESTALE) {
+       dout(10) << "FAIL on ESTALE but attempting recovery" << dendl;
+       MDSContext *fin = new C_MDS_TryFindInode(this, mdr);
+       mdcache->find_ino_peers(refpath.get_ino(), fin);
+     } else if (rc == -ENOENT && n == 0 && !mdr->dn[n].empty()) {
+       // ENOENT on the primary path with a partial trace: attach the last
+       // traversed dentry to the reply unless the caller forbade it
+       if (!no_lookup)
+         mdr->tracedn = mdr->dn[n].back();
+       respond_to_request(mdr, rc);
+     } else {
+       dout(10) << "FAIL on error " << rc << dendl;
+       respond_to_request(mdr, rc);
+     }
+     return nullptr;
+   }
+
+   CInode *ref = mdr->in[n];
+   dout(10) << "ref is " << *ref << dendl;
+
+   // fw to inode auth?  snapped requests want auth unless the caller opted out
+   const bool need_auth =
+     want_auth || (mdr->snapid != CEPH_NOSNAP && !no_want_auth);
+
+   if (need_auth) {
+     if (ref->is_ambiguous_auth()) {
+       dout(10) << "waiting for single auth on " << *ref << dendl;
+       ref->add_waiter(CInode::WAIT_SINGLEAUTH, new C_MDS_RetryRequest(mdcache, mdr));
+       return nullptr;
+     }
+     if (!ref->is_auth()) {
+       dout(10) << "fw to auth for " << *ref << dendl;
+       mdcache->request_forward(mdr, ref->authority().first);
+       return nullptr;
+     }
+
+     // auth_pin?
+     // do NOT proceed if freezing, as cap release may defer in that case, and
+     // we could deadlock when we try to lock @ref.
+     // if we're already auth_pinned, continue; the release has already been processed.
+     const bool must_wait =
+       ref->is_frozen() || ref->is_frozen_auth_pin() ||
+       (ref->is_freezing() && !mdr->is_auth_pinned(ref));
+     if (must_wait) {
+       dout(7) << "waiting for !frozen/authpinnable on " << *ref << dendl;
+       ref->add_waiter(CInode::WAIT_UNFREEZE, new C_MDS_RetryRequest(mdcache, mdr));
+       /* If we have any auth pins, this will deadlock.
+        * But the only way to get here if we've already got auth pins
+        * is because we're on an inode with snapshots that got updated
+        * between dispatches of this request. So we're going to drop
+        * our locks and our auth pins and reacquire them later.
+        *
+        * This is safe since we're only in this function when working on
+        * a single MDS request; otherwise we'd be in
+        * rdlock_path_xlock_dentry.
+        */
+       mds->locker->drop_locks(mdr.get(), nullptr);
+       mdr->drop_local_auth_pins();
+       if (!mdr->remote_auth_pins.empty())
+         mds->locker->notify_freeze_waiter(ref);
+       return nullptr;
+     }
+
+     mdr->auth_pin(ref);
+   }
+
+   // rdlock every dentry on the traversed path
+   for (CDentry *traced : mdr->dn[n])
+     lov.add_rdlock(&traced->lock);
+
+   if (layout)
+     mds->locker->include_snap_rdlocks_wlayout(ref, lov, layout);
+   else
+     mds->locker->include_snap_rdlocks(ref, lov);
+
+   // set and pin ref
+   mdr->pin(ref);
+   return ref;
+ }
+
+
+ /** rdlock_path_xlock_dentry
+  * Traverse filepath #n to the directory that could/would contain the final
+  * dentry, making sure we are auth for it (forwarding as necessary).
+  * With mustexist the dentry has to be present and non-null; otherwise a
+  * null dentry is prepared in place (an existing one is accepted if okexist).
+  * Queues into lov: rdlocks on the traversed dentries, an xlock (new dentry
+  * or alwaysxlock) or rdlock (existing dentry) on the target, wrlocks on the
+  * containing dir inode's filelock and nestlock, and the snap rdlocks.
+  *
+  * Returns the dentry, or null when the request was already handled
+  * (replied to, forwarded, or queued for retry).
+  */
+ CDentry* Server::rdlock_path_xlock_dentry(MDRequestRef& mdr, int n,
+                                           MutationImpl::LockOpVec& lov,
+                                           bool okexist, bool mustexist, bool alwaysxlock,
+                                           file_layout_t **layout)
+ {
+   const filepath& refpath = n ? mdr->get_filepath2() : mdr->get_filepath();
+
+   dout(10) << "rdlock_path_xlock_dentry " << *mdr << " " << refpath << dendl;
+
+   client_t client = mdr->get_client();
+
+   // done_locking already set on the request: the dentry is the tail of the trace
+   if (mdr->done_locking)
+     return mdr->dn[n].back();
+
+   CDir *dir = traverse_to_auth_dir(mdr, mdr->dn[n], refpath);
+   if (!dir)
+     return nullptr;
+
+   CInode *diri = dir->get_inode();
+   // requests not originating from an MDS may not touch system dirs (root excepted)
+   if (!mdr->reqid.name.is_mds() && diri->is_system() && !diri->is_root()) {
+     respond_to_request(mdr, -EROFS);
+     return nullptr;
+   }
+   // parent directory itself sits under a stray dir
+   if (!diri->is_base() && diri->get_projected_parent_dir()->inode->is_stray()) {
+     respond_to_request(mdr, -ENOENT);
+     return nullptr;
+   }
+
+   // make a null dentry?
+   std::string_view dname = refpath.last_dentry();
+   CDentry *dn;
+   if (!mustexist) {
+     dn = prepare_null_dentry(mdr, dir, dname, okexist);
+     if (!dn)
+       return nullptr;
+   } else {
+     dn = dir->lookup(dname);
+
+     // make sure dir is complete: not in cache, dir not fully loaded, and the
+     // bloom filter (if any) does not rule the name out
+     const bool need_fetch =
+       !dn && !dir->is_complete() &&
+       (!dir->has_bloom() || dir->is_in_bloom(dname));
+     if (need_fetch) {
+       dout(7) << " incomplete dir contents for " << *dir << ", fetching" << dendl;
+       dir->fetch(new C_MDS_RetryRequest(mdcache, mdr));
+       return nullptr;
+     }
+
+     // readable?
+     if (dn && !dn->lock.can_read(client) && dn->lock.get_xlock_by() != mdr) {
+       dout(10) << "waiting on xlocked dentry " << *dn << dendl;
+       dn->lock.add_waiter(SimpleLock::WAIT_RD, new C_MDS_RetryRequest(mdcache, mdr));
+       return nullptr;
+     }
+
+     // exists?
+     if (!dn || dn->get_linkage(client, mdr)->is_null()) {
+       dout(7) << "dentry " << dname << " dne in " << *dir << dendl;
+       respond_to_request(mdr, -ENOENT);
+       return nullptr;
+     }
+   }
+
+   mdr->dn[n].push_back(dn);
+   CDentry::linkage_t *dnl = dn->get_linkage(client, mdr);
+   mdr->in[n] = dnl->get_inode();
+
+   // -- lock --
+   // NOTE: rename takes the same set of locks for srcdn
+   for (CDentry *traced : mdr->dn[n])
+     lov.add_rdlock(&traced->lock);
+
+   if (alwaysxlock || dnl->is_null())
+     lov.add_xlock(&dn->lock);   // new dn, xlock
+   else
+     lov.add_rdlock(&dn->lock);  // existing dn, rdlock
+
+   CInode *dir_in = dn->get_dir()->inode;
+   lov.add_wrlock(&dir_in->filelock);  // also, wrlock on dir mtime
+   lov.add_wrlock(&dir_in->nestlock);  // and on the dir inode's nestlock
+   if (layout)
+     mds->locker->include_snap_rdlocks_wlayout(dir_in, lov, layout);
+   else
+     mds->locker->include_snap_rdlocks(dir_in, lov);
+
+   return dn;
+ }
+
+
+
+
+
+/**
+ * try_open_auth_dirfrag -- open dirfrag, or forward to dirfrag auth
+ *
+ * @param diri base inode
+ * @param fg the exact frag we want
+ * @param mdr request
+ * @returns the pointer, or NULL if it had to be delayed (but mdr is taken care of)
+ */
+CDir* Server::try_open_auth_dirfrag(CInode *diri, frag_t fg, MDRequestRef& mdr)
+{
+ CDir *dir = diri->get_dirfrag(fg);
+
+ // not open and inode not mine?
+ if (!dir && !diri->is_auth()) {
+ mds_rank_t inauth = diri->authority().first;
+ dout(7) << "try_open_auth_dirfrag: not open, not inode auth, fw to mds." << inauth << dendl;
+ mdcache->request_forward(mdr, inauth);
+ return 0;
+ }
+
+ // not open and inode frozen?
+ if (!dir && diri->is_frozen()) {
+ dout(10) << "try_open_auth_dirfrag: dir inode is frozen, waiting " << *diri << dendl;
+ ceph_assert(diri->get_parent_dir());
+ diri->add_waiter(CInode::WAIT_UNFREEZE, new C_MDS_RetryRequest(mdcache, mdr));
+ return 0;
+ }
+
+ // invent?
+ if (!dir)
+ dir = diri->get_or_open_dirfrag(mdcache, fg);
+
+ // am i auth for the dirfrag?
+ if (!dir->is_auth()) {
+ mds_rank_t auth = dir->authority().first;
+ dout(7) << "try_open_auth_dirfrag: not auth for " << *dir
+ << ", fw to mds." << auth << dendl;
+ mdcache->request_forward(mdr, auth);
+ return 0;
+ }
+
+ return dir;
+}
+
+
+// ===============================================================================
+// STAT
+
+ /*
+  * Serve getattr and lookup (is_lookup == true) requests: resolve and pin
+  * the target inode, rdlock whichever of link/auth/xattr/file the request
+  * mask asks for (skipping fields the client holds EXCL caps on), check
+  * read access and reply with the inode (plus the last traversed dentry for
+  * lookup) in the trace.
+  */
+ void Server::handle_client_getattr(MDRequestRef& mdr, bool is_lookup)
+ {
+   const MClientRequest::const_ref &req = mdr->client_request;
+
+   // refpath can't be empty for lookup but it can for
+   // getattr (we do getattr with empty refpath for mount of '/')
+   if (is_lookup && req->get_filepath().depth() == 0) {
+     respond_to_request(mdr, -EINVAL);
+     return;
+   }
+
+   const int mask = req->head.args.getattr.mask;
+   // set want_auth for CEPH_STAT_RSTAT mask
+   const bool want_auth = (mask & CEPH_STAT_RSTAT) != 0;
+
+   MutationImpl::LockOpVec lov;
+   CInode *ref = rdlock_path_pin_ref(mdr, 0, lov, want_auth, false, nullptr,
+                                     !is_lookup);
+   if (!ref)
+     return;
+
+   /*
+    * if client currently holds the EXCL cap on a field, do not rdlock
+    * it; client's stat() will result in valid info if _either_ EXCL
+    * cap is held or MDS rdlocks and reads the value here.
+    *
+    * handling this case here is easier than weakening rdlock
+    * semantics... that would cause problems elsewhere.
+    */
+   client_t client = mdr->get_client();
+   int issued = 0;
+   if (Capability *cap = ref->get_client_cap(client)) {
+     if (mdr->snapid == CEPH_NOSNAP || mdr->snapid <= cap->client_follows)
+       issued = cap->issued();
+   }
+
+   if ((mask & CEPH_CAP_LINK_SHARED) && !(issued & CEPH_CAP_LINK_EXCL))
+     lov.add_rdlock(&ref->linklock);
+   if ((mask & CEPH_CAP_AUTH_SHARED) && !(issued & CEPH_CAP_AUTH_EXCL))
+     lov.add_rdlock(&ref->authlock);
+   if ((mask & CEPH_CAP_XATTR_SHARED) && !(issued & CEPH_CAP_XATTR_EXCL))
+     lov.add_rdlock(&ref->xattrlock);
+   if ((mask & CEPH_CAP_FILE_SHARED) && !(issued & CEPH_CAP_FILE_EXCL)) {
+     // Don't wait on unstable filelock if client is allowed to read file size.
+     // This can reduce the response time of getattr in the case that multiple
+     // clients do stat(2) and there are writers.
+     // The downside of this optimization is that mds may not issue Fs caps along
+     // with getattr reply. Client may need to send more getattr requests.
+     if (mdr->is_rdlocked(&ref->filelock)) {
+       // we already hold it: just list it again
+       lov.add_rdlock(&ref->filelock);
+     } else {
+       const bool take_filelock =
+         ref->filelock.is_stable() ||
+         ref->filelock.get_num_wrlocks() > 0 ||
+         !ref->filelock.can_read(mdr->get_client());
+       if (take_filelock) {
+         lov.add_rdlock(&ref->filelock);
+         mdr->done_locking = false;
+       }
+     }
+   }
+
+   if (!mds->locker->acquire_locks(mdr, lov))
+     return;
+
+   if (!check_access(mdr, ref, MAY_READ))
+     return;
+
+   utime_t now = ceph_clock_now();
+   mdr->set_mds_stamp(now);
+
+   // note which caps are requested, so we return at least a snapshot
+   // value for them. (currently this matters for xattrs and inline data)
+   mdr->getattr_caps = mask;
+
+   mds->balancer->hit_inode(ref, META_POP_IRD, req->get_source().num());
+
+   // reply
+   dout(10) << "reply to stat on " << *req << dendl;
+   mdr->tracei = ref;
+   if (is_lookup)
+     mdr->tracedn = mdr->dn[0].back();
+   respond_to_request(mdr, 0);
+ }
+
+ /*
+  * Completion used for the open_ino() calls issued by the lookup-by-ino
+  * handlers; forwards the result to Server::_lookup_ino_2().
+  */
+ struct C_MDS_LookupIno2 : public ServerContext {
+   MDRequestRef mdr;
+
+   C_MDS_LookupIno2(Server *srv, MDRequestRef& req)
+     : ServerContext(srv), mdr(req) {}
+
+   void finish(int result) override {
+     server->_lookup_ino_2(mdr, result);
+   }
+ };
+
+/*
+ * filepath: ino
+ */
+void Server::handle_client_lookup_ino(MDRequestRef& mdr,
+ bool want_parent, bool want_dentry)
+{
+ const MClientRequest::const_ref &req = mdr->client_request;
+
+ if ((uint64_t)req->head.args.lookupino.snapid > 0)
+ return _lookup_snap_ino(mdr);
+
+ inodeno_t ino = req->get_filepath().get_ino();
+ CInode *in = mdcache->get_inode(ino);
+ if (in && in->state_test(CInode::STATE_PURGING)) {
+ respond_to_request(mdr, -ESTALE);
+ return;
+ }
+ if (!in) {
+ mdcache->open_ino(ino, (int64_t)-1, new C_MDS_LookupIno2(this, mdr), false);
+ return;
+ }
+
+ if (mdr && in->snaprealm && !in->snaprealm->have_past_parents_open() &&
+ !in->snaprealm->open_parents(new C_MDS_RetryRequest(mdcache, mdr))) {
+ return;
+ }
+
+ // check for nothing (not read or write); this still applies the
+ // path check.
+ if (!check_access(mdr, in, 0))
+ return;
+
+ CDentry *dn = in->get_projected_parent_dn();
+ CInode *diri = dn ? dn->get_dir()->inode : NULL;
+
+ MutationImpl::LockOpVec lov;
+ if (dn && (want_parent || want_dentry)) {
+ mdr->pin(dn);
+ lov.add_rdlock(&dn->lock);
+ }
+
+ unsigned mask = req->head.args.lookupino.mask;
+ if (mask) {
+ Capability *cap = in->get_client_cap(mdr->get_client());
+ int issued = 0;
+ if (cap && (mdr->snapid == CEPH_NOSNAP || mdr->snapid <= cap->client_follows))
+ issued = cap->issued();
+ // permission bits, ACL/security xattrs
+ if ((mask & CEPH_CAP_AUTH_SHARED) && (issued & CEPH_CAP_AUTH_EXCL) == 0)
+ lov.add_rdlock(&in->authlock);
+ if ((mask & CEPH_CAP_XATTR_SHARED) && (issued & CEPH_CAP_XATTR_EXCL) == 0)
+ lov.add_rdlock(&in->xattrlock);
+
+ mdr->getattr_caps = mask;
+ }
+
+ if (!lov.empty()) {
+ if (!mds->locker->acquire_locks(mdr, lov))
+ return;
+
+ if (diri != NULL) {
+ // need read access to directory inode
+ if (!check_access(mdr, diri, MAY_READ))
+ return;
+ }
+ }
+
+ if (want_parent) {
+ if (in->is_base()) {
+ respond_to_request(mdr, -EINVAL);
+ return;
+ }
+ if (!diri || diri->is_stray()) {
+ respond_to_request(mdr, -ESTALE);
+ return;
+ }
+ dout(10) << "reply to lookup_parent " << *in << dendl;
+ mdr->tracei = diri;
+ respond_to_request(mdr, 0);
+ } else {
+ if (want_dentry) {
+ inodeno_t dirino = req->get_filepath2().get_ino();
+ if (!diri || (dirino != inodeno_t() && diri->ino() != dirino)) {
+ respond_to_request(mdr, -ENOENT);
+ return;
+ }
+ dout(10) << "reply to lookup_name " << *in << dendl;
+ } else
+ dout(10) << "reply to lookup_ino " << *in << dendl;
+
+ mdr->tracei = in;
+ if (want_dentry)
+ mdr->tracedn = dn;
+ respond_to_request(mdr, 0);
+ }
+}
+
+ /*
+  * Lookup of a snapped inode (ino + snapid), optionally helped by a parent
+  * directory ino and dentry hash supplied by the client.
+  *
+  * If the inode is usable from cache it is returned right away.  Otherwise
+  * the parent's dirfrag (picked via the hash) is opened/fetched and the
+  * request retried, or -- without a parent hint -- the inode itself is
+  * opened via open_ino().  When everything is loaded and the inode still
+  * was not found, the reply is -ESTALE.
+  */
+ void Server::_lookup_snap_ino(MDRequestRef& mdr)
+ {
+   const MClientRequest::const_ref &req = mdr->client_request;
+
+   vinodeno_t vino;
+   vino.ino = req->get_filepath().get_ino();
+   vino.snapid = (__u64)req->head.args.lookupino.snapid;
+   inodeno_t parent_ino = (__u64)req->head.args.lookupino.parent;
+   __u32 hash = req->head.args.lookupino.hash;
+
+   dout(7) << "lookup_snap_ino " << vino << " parent " << parent_ino << " hash " << hash << dendl;
+
+   CInode *in = mdcache->lookup_snap_inode(vino);
+   if (!in) {
+     // fall back to the inode by number and see whether it covers the snapid
+     in = mdcache->get_inode(vino.ino);
+     if (in && (in->state_test(CInode::STATE_PURGING) ||
+                !in->has_snap_data(vino.snapid))) {
+       if (in->is_dir() || !parent_ino) {
+         respond_to_request(mdr, -ESTALE);
+         return;
+       }
+       in = nullptr;
+     }
+   }
+
+   if (in) {
+     dout(10) << "reply to lookup_snap_ino " << *in << dendl;
+     mdr->snapid = vino.snapid;
+     mdr->tracei = in;
+     respond_to_request(mdr, 0);
+     return;
+   }
+
+   if (!parent_ino) {
+     // no parent hint: open the inode itself and continue in _lookup_ino_2()
+     mdcache->open_ino(vino.ino, mds->mdsmap->get_metadata_pool(), new C_MDS_LookupIno2(this, mdr), false);
+     return;
+   }
+
+   CInode *diri = mdcache->get_inode(parent_ino);
+   if (!diri) {
+     mdcache->open_ino(parent_ino, mds->mdsmap->get_metadata_pool(), new C_MDS_LookupIno2(this, mdr));
+     return;
+   }
+
+   if (!diri->is_dir()) {
+     respond_to_request(mdr, -EINVAL);
+     return;
+   }
+
+   MutationImpl::LockOpVec lov;
+   lov.add_rdlock(&diri->dirfragtreelock);
+   if (!mds->locker->acquire_locks(mdr, lov))
+     return;
+
+   // pick the dirfrag that the dentry hash falls into
+   frag_t frag = diri->dirfragtree[hash];
+   CDir *dir = try_open_auth_dirfrag(diri, frag, mdr);
+   if (!dir)
+     return;
+
+   if (dir->is_complete()) {
+     // dirfrag fully loaded and the snapped inode still was not found
+     respond_to_request(mdr, -ESTALE);
+     return;
+   }
+
+   if (dir->is_frozen()) {
+     mds->locker->drop_locks(mdr.get());
+     mdr->drop_local_auth_pins();
+     dir->add_waiter(CDir::WAIT_UNFREEZE, new C_MDS_RetryRequest(mdcache, mdr));
+     return;
+   }
+   dir->fetch(new C_MDS_RetryRequest(mdcache, mdr), true);
+ }
+
+ /*
+  * Continuation of a lookup-by-ino after open_ino() finished.
+  * `r` is a rank if >=0, else an error code.
+  */
+ void Server::_lookup_ino_2(MDRequestRef& mdr, int r)
+ {
+   inodeno_t ino = mdr->client_request->get_filepath().get_ino();
+   dout(10) << "_lookup_ino_2 " << mdr.get() << " ino " << ino << " r=" << r << dendl;
+
+   if (r < 0) {
+     // give up; "not found" style errors are reported to the client as ESTALE
+     const int err = (r == -ENOENT || r == -ENODATA) ? -ESTALE : r;
+     respond_to_request(mdr, err);
+     return;
+   }
+
+   mds_rank_t dest_rank(r);
+   if (dest_rank != mds->get_nodeid()) {
+     mdcache->request_forward(mdr, dest_rank);
+     return;
+   }
+   // it is us: run the request again
+   dispatch_client_request(mdr);
+ }
+
+
+ /*
+  * Handle a client open request; handle_client_openc() also lands here when
+  * the target already exists.  This function takes responsibility for the
+  * passed mdr.
+  *
+  * Outline: validate the open flags, resolve and pin the target inode,
+  * reject invalid flag/inode-type combinations, gather locks, then either
+  * hand off to do_open_truncate() for O_TRUNC or issue caps (snap caps for a
+  * snapped open), optionally journal an EOpen, and reply.
+  */
+ void Server::handle_client_open(MDRequestRef& mdr)
+ {
+   const MClientRequest::const_ref &req = mdr->client_request;
+   dout(7) << "open on " << req->get_filepath() << dendl;
+
+   int flags = req->head.args.open.flags;
+   int cmode = ceph_flags_to_mode(flags);
+   if (cmode < 0) {
+     respond_to_request(mdr, -EINVAL);
+     return;
+   }
+
+   // anything but a plain read-only open must be served by the inode's auth
+   bool need_auth = !file_mode_is_readonly(cmode) ||
+                    (flags & (CEPH_O_TRUNC | CEPH_O_DIRECTORY));
+
+   if ((cmode & CEPH_FILE_MODE_WR) && mdcache->is_readonly()) {
+     dout(7) << "read-only FS" << dendl;
+     respond_to_request(mdr, -EROFS);
+     return;
+   }
+
+   MutationImpl::LockOpVec lov;
+   CInode *cur = rdlock_path_pin_ref(mdr, 0, lov, need_auth);
+   if (!cur)
+     return;
+
+   if (cur->is_frozen() || cur->state_test(CInode::STATE_EXPORTINGCAPS)) {
+     ceph_assert(!need_auth);
+     mdr->done_locking = false;
+     // Resolve again, this time insisting on auth.  Assign to the
+     // function-level 'cur': declaring a second 'cur' here would shadow it
+     // and the re-resolved inode would only ever be used for the null check.
+     cur = rdlock_path_pin_ref(mdr, 0, lov, true);
+     if (!cur)
+       return;
+   }
+
+   if (!cur->inode.is_file()) {
+     // can only open non-regular inode with mode FILE_MODE_PIN, at least for now.
+     cmode = CEPH_FILE_MODE_PIN;
+     // the inode is symlink and client wants to follow it, ignore the O_TRUNC flag.
+     if (cur->inode.is_symlink() && !(flags & CEPH_O_NOFOLLOW))
+       flags &= ~CEPH_O_TRUNC;
+   }
+
+   dout(10) << "open flags = " << flags
+            << ", filemode = " << cmode
+            << ", need_auth = " << need_auth
+            << dendl;
+
+   // regular file?
+   /*if (!cur->inode.is_file() && !cur->inode.is_dir()) {
+     dout(7) << "not a file or dir " << *cur << dendl;
+     respond_to_request(mdr, -ENXIO); // FIXME what error do we want?
+     return;
+   }*/
+   if ((flags & CEPH_O_DIRECTORY) && !cur->inode.is_dir() && !cur->inode.is_symlink()) {
+     dout(7) << "specified O_DIRECTORY on non-directory " << *cur << dendl;
+     respond_to_request(mdr, -EINVAL);
+     return;
+   }
+
+   if ((flags & CEPH_O_TRUNC) && !cur->inode.is_file()) {
+     dout(7) << "specified O_TRUNC on !(file|symlink) " << *cur << dendl;
+     // we should return -EISDIR for directory, return -EINVAL for other non-regular
+     respond_to_request(mdr, cur->inode.is_dir() ? -EISDIR : -EINVAL);
+     return;
+   }
+
+   if (cur->inode.inline_data.version != CEPH_INLINE_NONE &&
+       !mdr->session->get_connection()->has_feature(CEPH_FEATURE_MDS_INLINE_DATA)) {
+     dout(7) << "old client cannot open inline data file " << *cur << dendl;
+     respond_to_request(mdr, -EPERM);
+     return;
+   }
+
+   // snapped data is read only
+   if (mdr->snapid != CEPH_NOSNAP &&
+       ((cmode & CEPH_FILE_MODE_WR) || req->may_write())) {
+     dout(7) << "snap " << mdr->snapid << " is read-only " << *cur << dendl;
+     respond_to_request(mdr, -EROFS);
+     return;
+   }
+
+   // rdlock the attributes the client asked for in the reply, unless it
+   // already holds the matching EXCL cap
+   unsigned mask = req->head.args.open.mask;
+   if (mask) {
+     Capability *cap = cur->get_client_cap(mdr->get_client());
+     int issued = 0;
+     if (cap && (mdr->snapid == CEPH_NOSNAP || mdr->snapid <= cap->client_follows))
+       issued = cap->issued();
+     // permission bits, ACL/security xattrs
+     if ((mask & CEPH_CAP_AUTH_SHARED) && (issued & CEPH_CAP_AUTH_EXCL) == 0)
+       lov.add_rdlock(&cur->authlock);
+     if ((mask & CEPH_CAP_XATTR_SHARED) && (issued & CEPH_CAP_XATTR_EXCL) == 0)
+       lov.add_rdlock(&cur->xattrlock);
+
+     mdr->getattr_caps = mask;
+   }
+
+   // O_TRUNC
+   if ((flags & CEPH_O_TRUNC) && !mdr->has_completed) {
+     ceph_assert(cur->is_auth());
+
+     lov.add_xlock(&cur->filelock);
+     if (!mds->locker->acquire_locks(mdr, lov))
+       return;
+
+     if (!check_access(mdr, cur, MAY_WRITE))
+       return;
+
+     // wait for pending truncate?
+     const auto pi = cur->get_projected_inode();
+     if (pi->is_truncating()) {
+       dout(10) << " waiting for pending truncate from " << pi->truncate_from
+                << " to " << pi->truncate_size << " to complete on " << *cur << dendl;
+       mds->locker->drop_locks(mdr.get());
+       mdr->drop_local_auth_pins();
+       cur->add_waiter(CInode::WAIT_TRUNC, new C_MDS_RetryRequest(mdcache, mdr));
+       return;
+     }
+
+     do_open_truncate(mdr, cmode);
+     return;
+   }
+
+   // sync filelock if snapped.
+   //  this makes us wait for writers to flushsnaps, ensuring we get accurate metadata,
+   //  and that data itself is flushed so that we can read the snapped data off disk.
+   if (mdr->snapid != CEPH_NOSNAP && !cur->is_dir()) {
+     lov.add_rdlock(&cur->filelock);
+   }
+
+   if (!mds->locker->acquire_locks(mdr, lov))
+     return;
+
+   // from here on 'mask' is reused as the access mask for check_access()
+   mask = MAY_READ;
+   if (cmode & CEPH_FILE_MODE_WR)
+     mask |= MAY_WRITE;
+   if (!check_access(mdr, cur, mask))
+     return;
+
+   utime_t now = ceph_clock_now();
+   mdr->set_mds_stamp(now);
+
+   if (cur->is_file() || cur->is_dir()) {
+     if (mdr->snapid == CEPH_NOSNAP) {
+       // register new cap
+       Capability *cap = mds->locker->issue_new_caps(cur, cmode, mdr->session, 0, req->is_replay());
+       if (cap)
+         dout(12) << "open issued caps " << ccap_string(cap->pending())
+                  << " for " << req->get_source()
+                  << " on " << *cur << dendl;
+     } else {
+       // snapped open: no Capability is registered, the caps for the mode
+       // are recorded on the request as snap_caps
+       int caps = ceph_caps_for_mode(cmode);
+       dout(12) << "open issued IMMUTABLE SNAP caps " << ccap_string(caps)
+                << " for " << req->get_source()
+                << " snapid " << mdr->snapid
+                << " on " << *cur << dendl;
+       mdr->snap_caps = caps;
+     }
+   }
+
+   // increase max_size?
+   if (cmode & CEPH_FILE_MODE_WR)
+     mds->locker->check_inode_max_size(cur);
+
+   // make sure this inode gets into the journal
+   if (cur->is_auth() && cur->last == CEPH_NOSNAP &&
+       mdcache->open_file_table.should_log_open(cur)) {
+     EOpen *le = new EOpen(mds->mdlog);
+     mdlog->start_entry(le);
+     le->add_clean_inode(cur);
+     mdlog->submit_entry(le);
+   }
+
+   // hit pop
+   if (cmode & CEPH_FILE_MODE_WR)
+     mds->balancer->hit_inode(cur, META_POP_IWR);
+   else
+     mds->balancer->hit_inode(cur, META_POP_IRD,
+                              mdr->client_request->get_source().num());
+
+   // include the dentry in the reply trace only if the client asked for it
+   CDentry *dn = 0;
+   if (req->get_dentry_wanted()) {
+     ceph_assert(mdr->dn[0].size());
+     dn = mdr->dn[0].back();
+   }
+
+   mdr->tracei = cur;
+   mdr->tracedn = dn;
+   respond_to_request(mdr, 0);
+ }
+
+ class C_MDS_openc_finish : public ServerLogContext { // completion that handle_client_openc() passes to journal_and_reply()
+ CDentry *dn; // dentry created by the openc
+ CInode *newi; // inode created by the openc
+ public:
+ C_MDS_openc_finish(Server *s, MDRequestRef& r, CDentry *d, CInode *ni) :
+ ServerLogContext(s, r), dn(d), newi(ni) {}
+ void finish(int r) override {
+ ceph_assert(r == 0); // any non-zero result is treated as fatal here
+
+ dn->pop_projected_linkage(); // counterpart of the push_projected_linkage() done in handle_client_openc()
+
+ // dirty inode, dn, dir
+ newi->inode.version--; // a bit hacky, see C_MDS_mknod_finish
+ newi->mark_dirty(newi->inode.version+1, mdr->ls); // passes the pre-decrement version; must stay right after the line above
+ newi->mark_dirty_parent(mdr->ls, true);
+
+ mdr->apply();
+
+ get_mds()->locker->share_inode_max_size(newi);
+
+ MDRequestRef null_ref; // empty request ref handed to send_dentry_link()
+ get_mds()->mdcache->send_dentry_link(dn, null_ref);
+
+ get_mds()->balancer->hit_inode(newi, META_POP_IWR);
+
+ server->respond_to_request(mdr, 0);
+
+ ceph_assert(g_conf()->mds_kill_openc_at != 1); // NOTE(review): looks like a fault-injection hook (abort when mds_kill_openc_at == 1) -- confirm
+ }
+ };
+
+ /* This function takes responsibility for the passed mdr.
+  *
+  * Handles an open with O_CREAT.  Without O_EXCL the path is traversed
+  * first and, if it resolves, the request is handed to handle_client_open().
+  * Otherwise a dentry is obtained via rdlock_path_xlock_dentry(), the file
+  * layout is worked out (directory/default layout plus client overrides),
+  * a new inode is prepared and linked into the dentry, caps are issued, and
+  * an "openc" EUpdate is journaled; C_MDS_openc_finish runs as the
+  * completion given to journal_and_reply().
+  */
+ void Server::handle_client_openc(MDRequestRef& mdr)
+ {
+ const MClientRequest::const_ref &req = mdr->client_request;
+ client_t client = mdr->get_client();
+
+ dout(7) << "open w/ O_CREAT on " << req->get_filepath() << dendl;
+
+ int cmode = ceph_flags_to_mode(req->head.args.open.flags);
+ if (cmode < 0) {
+ respond_to_request(mdr, -EINVAL);
+ return;
+ }
+
+ bool excl = req->head.args.open.flags & CEPH_O_EXCL;
+
+ if (!excl) { // without O_EXCL an existing target is simply opened
+ CF_MDS_MDRContextFactory cf(mdcache, mdr);
+ int r = mdcache->path_traverse(mdr, cf, req->get_filepath(),
+ &mdr->dn[0], NULL, MDS_TRAVERSE_FORWARD);
+ if (r > 0) return; // delayed
+ if (r == 0) {
+ // it existed.
+ handle_client_open(mdr);
+ return;
+ }
+ if (r < 0 && r != -ENOENT) { // -ENOENT falls through to the create path below
+ if (r == -ESTALE) {
+ dout(10) << "FAIL on ESTALE but attempting recovery" << dendl;
+ MDSContext *c = new C_MDS_TryFindInode(this, mdr);
+ mdcache->find_ino_peers(req->get_filepath().get_ino(), c);
+ } else {
+ dout(10) << "FAIL on error " << r << dendl;
+ respond_to_request(mdr, r);
+ }
+ return;
+ }
+ }
+
+ MutationImpl::LockOpVec lov;
+ file_layout_t *dir_layout = nullptr;
+ CDentry *dn = rdlock_path_xlock_dentry(mdr, 0, lov,
+ !excl, false, false, &dir_layout); // okexist = !excl, mustexist = false, alwaysxlock = false
+ if (!dn) return; // request already handled by rdlock_path_xlock_dentry()
+ if (mdr->snapid != CEPH_NOSNAP) { // request carries a snapid: creating there is refused
+ respond_to_request(mdr, -EROFS);
+ return;
+ }
+ // set layout
+ file_layout_t layout;
+ if (dir_layout)
+ layout = *dir_layout;
+ else
+ layout = mdcache->default_file_layout;
+
+ // What kind of client caps are required to complete this operation
+ uint64_t access = MAY_WRITE;
+
+ const auto default_layout = layout; // snapshot before client overrides, compared against below
+
+ // fill in any special params from client
+ if (req->head.args.open.stripe_unit)
+ layout.stripe_unit = req->head.args.open.stripe_unit;
+ if (req->head.args.open.stripe_count)
+ layout.stripe_count = req->head.args.open.stripe_count;
+ if (req->head.args.open.object_size)
+ layout.object_size = req->head.args.open.object_size;
+ if (req->get_connection()->has_feature(CEPH_FEATURE_CREATEPOOLID) &&
+ (__s32)req->head.args.open.pool >= 0) {
+ layout.pool_id = req->head.args.open.pool;
+
+ // make sure we have as new a map as the client
+ if (req->get_mdsmap_epoch() > mds->mdsmap->get_epoch()) {
+ mds->wait_for_mdsmap(req->get_mdsmap_epoch(), new C_MDS_RetryRequest(mdcache, mdr));
+ return;
+ }
+ }
+
+ // If client doesn't have capability to modify layout pools, then
+ // only permit this request if the requested pool matches what the
+ // file would have inherited anyway from its parent.
+ if (default_layout != layout) {
+ access |= MAY_SET_VXATTR; // any layout override additionally requires MAY_SET_VXATTR in check_access() below
+ }
+
+ if (!layout.is_valid()) {
+ dout(10) << " invalid initial file layout" << dendl;
+ respond_to_request(mdr, -EINVAL);
+ return;
+ }
+ if (!mds->mdsmap->is_data_pool(layout.pool_id)) {
+ dout(10) << " invalid data pool " << layout.pool_id << dendl;
+ respond_to_request(mdr, -EINVAL);
+ return;
+ }
+
+ // created null dn.
+ CDir *dir = dn->get_dir();
+ CInode *diri = dir->get_inode();
+ lov.add_rdlock(&diri->authlock); // held for the check_access() on diri below
+ if (!mds->locker->acquire_locks(mdr, lov))
+ return;
+
+ if (!check_access(mdr, diri, access))
+ return;
+
+ if (!check_fragment_space(mdr, dir))
+ return;
+
+ CDentry::linkage_t *dnl = dn->get_projected_linkage();
+
+ if (!dnl->is_null()) {
+ // it existed.
+ ceph_assert(req->head.args.open.flags & CEPH_O_EXCL);
+ dout(10) << "O_EXCL, target exists, failing with -EEXIST" << dendl;
+ mdr->tracei = dnl->get_inode();
+ mdr->tracedn = dn;
+ respond_to_request(mdr, -EEXIST);
+ return;
+ }
+
+ // create inode.
+ CInode *in = prepare_new_inode(mdr, dn->get_dir(), inodeno_t(req->head.ino),
+ req->head.args.open.mode | S_IFREG, &layout);
+ ceph_assert(in);
+
+ // it's a file.
+ dn->push_projected_linkage(in); // popped again in C_MDS_openc_finish::finish()
+
+ in->inode.version = dn->pre_dirty();
+ if (layout.pool_id != mdcache->default_file_layout.pool_id)
+ in->inode.add_old_pool(mdcache->default_file_layout.pool_id); // NOTE(review): records the default pool as an "old pool" when the file starts elsewhere -- purpose not visible here, confirm
+ in->inode.update_backtrace();
+ in->inode.rstat.rfiles = 1;
+
+ SnapRealm *realm = diri->find_snaprealm();
+ snapid_t follows = mdcache->get_global_snaprealm()->get_newest_seq();
+ ceph_assert(follows >= realm->get_newest_seq());
+
+ ceph_assert(dn->first == follows+1);
+ in->first = dn->first;
+
+ // do the open
+ Capability *cap = mds->locker->issue_new_caps(in, cmode, mdr->session, realm, req->is_replay());
+ in->authlock.set_state(LOCK_EXCL); // new inode starts with authlock and xattrlock in LOCK_EXCL
+ in->xattrlock.set_state(LOCK_EXCL);
+
+ if (cap && (cmode & CEPH_FILE_MODE_WR)) { // opened for write: set up an initial client range
+ in->inode.client_ranges[client].range.first = 0;
+ in->inode.client_ranges[client].range.last = in->inode.get_layout_size_increment();
+ in->inode.client_ranges[client].follows = follows;
+ cap->mark_clientwriteable();
+ }
+
+ // prepare finisher
+ mdr->ls = mdlog->get_current_segment();
+ EUpdate *le = new EUpdate(mdlog, "openc");
+ mdlog->start_entry(le);
+ le->metablob.add_client_req(req->get_reqid(), req->get_oldest_client_tid());
+ journal_allocated_inos(mdr, &le->metablob);
+ mdcache->predirty_journal_parents(mdr, &le->metablob, in, dn->get_dir(), PREDIRTY_PRIMARY|PREDIRTY_DIR, 1);
+ le->metablob.add_primary_dentry(dn, in, true, true, true);
+
+ // make sure this inode gets into the journal
+ le->metablob.add_opened_ino(in->ino());
+
+ C_MDS_openc_finish *fin = new C_MDS_openc_finish(this, mdr, dn, in);
+
+ if (mdr->client_request->get_connection()->has_feature(CEPH_FEATURE_REPLY_CREATE_INODE)) {
+ dout(10) << "adding ino to reply to indicate inode was created" << dendl;
+ // add the file created flag onto the reply if create_flags features is supported
+ encode(in->inode.ino, mdr->reply_extra_bl);
+ }
+
+ journal_and_reply(mdr, in, dn, le, fin);
+
+ // We hit_dir (via hit_inode) in our finish callback, but by then we might
+ // have overshot the split size (multiple opencs in flight), so here is
+ // an early chance to split the dir if this openc makes it oversized.
+ mds->balancer->maybe_fragment(dir, false);
+ }
+
+
+
+void Server::handle_client_readdir(MDRequestRef& mdr)
+{
+ const MClientRequest::const_ref &req = mdr->client_request;
+ Session *session = mds->get_session(req);
+ client_t client = req->get_source().num();
+ MutationImpl::LockOpVec lov;
+ CInode *diri = rdlock_path_pin_ref(mdr, 0, lov, false, true);
+ if (!diri) return;
+
+ // it's a directory, right?
+ if (!diri->is_dir()) {
+ // not a dir
+ dout(10) << "reply to " << *req << " readdir -ENOTDIR" << dendl;
+ respond_to_request(mdr, -ENOTDIR);
+ return;
+ }
+
+ auto num_caps = session->get_num_caps();
+ auto session_cap_acquisition = session->get_cap_acquisition();
+
+ if (num_caps > static_cast<uint64_t>(max_caps_per_client * max_caps_throttle_ratio) && session_cap_acquisition >= cap_acquisition_throttle) {
+ dout(20) << "readdir throttled. max_caps_per_client: " << max_caps_per_client << " num_caps: " << num_caps
+ << " session_cap_acquistion: " << session_cap_acquisition << " cap_acquisition_throttle: " << cap_acquisition_throttle << dendl;
+ if (logger)
+ logger->inc(l_mdss_cap_acquisition_throttle);
+
+ mds->timer.add_event_after(caps_throttle_retry_request_timeout, new C_MDS_RetryRequest(mdcache, mdr));
+ return;
+ }
+
+ lov.add_rdlock(&diri->filelock);
+ lov.add_rdlock(&diri->dirfragtreelock);
+
+ if (!mds->locker->acquire_locks(mdr, lov))
+ return;
+
+ if (!check_access(mdr, diri, MAY_READ))
+ return;
+
+ // which frag?
+ frag_t fg = (__u32)req->head.args.readdir.frag;
+ unsigned req_flags = (__u32)req->head.args.readdir.flags;
+ string offset_str = req->get_path2();
+
+ __u32 offset_hash = 0;
+ if (!offset_str.empty())
+ offset_hash = ceph_frag_value(diri->hash_dentry_name(offset_str));
+ else
+ offset_hash = (__u32)req->head.args.readdir.offset_hash;
+
+ dout(10) << " frag " << fg << " offset '" << offset_str << "'"
+ << " offset_hash " << offset_hash << " flags " << req_flags << dendl;
+
+ // does the frag exist?
+ if (diri->dirfragtree[fg.value()] != fg) {
+ frag_t newfg;
+ if (req_flags & CEPH_READDIR_REPLY_BITFLAGS) {
+ if (fg.contains((unsigned)offset_hash)) {
+ newfg = diri->dirfragtree[offset_hash];
+ } else {
+ // client actually wants next frag
+ newfg = diri->dirfragtree[fg.value()];
+ }
+ } else {
+ offset_str.clear();
+ newfg = diri->dirfragtree[fg.value()];
+ }
+ dout(10) << " adjust frag " << fg << " -> " << newfg << " " << diri->dirfragtree << dendl;
+ fg = newfg;
+ }
+
+ CDir *dir = try_open_auth_dirfrag(diri, fg, mdr);
+ if (!dir) return;
+
+ // ok!
+ dout(10) << "handle_client_readdir on " << *dir << dendl;
+ ceph_assert(dir->is_auth());
+
+ if (!dir->is_complete()) {
+ if (dir->is_frozen()) {
+ dout(7) << "dir is frozen " << *dir << dendl;
+ mds->locker->drop_locks(mdr.get());
+ mdr->drop_local_auth_pins();
+ dir->add_waiter(CDir::WAIT_UNFREEZE, new C_MDS_RetryRequest(mdcache, mdr));
+ return;
+ }
+ // fetch
+ dout(10) << " incomplete dir contents for readdir on " << *dir << ", fetching" << dendl;
+ dir->fetch(new C_MDS_RetryRequest(mdcache, mdr), true);
+ return;
+ }
+
+#ifdef MDS_VERIFY_FRAGSTAT
+ dir->verify_fragstat();
+#endif
+
+ utime_t now = ceph_clock_now();
+ mdr->set_mds_stamp(now);
+
+ snapid_t snapid = mdr->snapid;
+ dout(10) << "snapid " << snapid << dendl;
+
+ SnapRealm *realm = diri->find_snaprealm();
+
+ unsigned max = req->head.args.readdir.max_entries;
+ if (!max)
+ max = dir->get_num_any(); // whatever, something big.
+ unsigned max_bytes = req->head.args.readdir.max_bytes;
+ if (!max_bytes)
+ // make sure at least one item can be encoded
+ max_bytes = (512 << 10) + g_conf()->mds_max_xattr_pairs_size;
+
+ // start final blob
+ bufferlist dirbl;
+ DirStat ds;
+ ds.frag = dir->get_frag();
+ ds.auth = dir->get_dir_auth().first;
+ if (dir->is_auth() && !mdcache->forward_all_reqs_to_auth())
+ dir->get_dist_spec(ds.dist, mds->get_nodeid());
+
+ dir->encode_dirstat(dirbl, mdr->session->info, ds);
+
+ // count bytes available.
+ // this isn't perfect, but we should capture the main variable/unbounded size items!
+ int front_bytes = dirbl.length() + sizeof(__u32) + sizeof(__u8)*2;
+ int bytes_left = max_bytes - front_bytes;
+ bytes_left -= realm->get_snap_trace().length();
+
+ // build dir contents
+ bufferlist dnbl;
+ __u32 numfiles = 0;
+ bool start = !offset_hash && offset_str.empty();
+ // skip all dns < dentry_key_t(snapid, offset_str, offset_hash)
+ dentry_key_t skip_key(snapid, offset_str.c_str(), offset_hash);
+ auto it = start ? dir->begin() : dir->lower_bound(skip_key);
+ bool end = (it == dir->end());
+ for (; !end && numfiles < max; end = (it == dir->end())) {
+ CDentry *dn = it->second;
+ ++it;
+
+ if (dn->state_test(CDentry::STATE_PURGING))
+ continue;
+
+ bool dnp = dn->use_projected(client, mdr);
+ CDentry::linkage_t *dnl = dnp ? dn->get_projected_linkage() : dn->get_linkage();
+
+ if (dnl->is_null())
+ continue;
+
+ if (dn->last < snapid || dn->first > snapid) {
+ dout(20) << "skipping non-overlapping snap " << *dn << dendl;
+ continue;
+ }
+
+ if (!start) {
+ dentry_key_t offset_key(dn->last, offset_str.c_str(), offset_hash);
+ if (!(offset_key < dn->key()))
+ continue;
+ }
+
+ CInode *in = dnl->get_inode();
+
+ if (in && in->ino() == CEPH_INO_CEPH)
+ continue;
+
+ // remote link?
+ // better for the MDS to do the work, if we think the client will stat any of these files.
+ if (dnl->is_remote() && !in) {
+ in = mdcache->get_inode(dnl->get_remote_ino());
+ if (in) {
+ dn->link_remote(dnl, in);
+ } else if (dn->state_test(CDentry::STATE_BADREMOTEINO)) {
+ dout(10) << "skipping bad remote ino on " << *dn << dendl;
+ continue;
+ } else {
+ // touch everything i _do_ have
+ for (auto &p : *dir) {
+ if (!p.second->get_linkage()->is_null())
+ mdcache->lru.lru_touch(p.second);
+ }
+
+ // already issued caps and leases, reply immediately.
+ if (dnbl.length() > 0) {
+ mdcache->open_remote_dentry(dn, dnp, new C_MDSInternalNoop);
+ dout(10) << " open remote dentry after caps were issued, stopping at "
+ << dnbl.length() << " < " << bytes_left << dendl;
+ break;
+ }
+
+ mds->locker->drop_locks(mdr.get());
+ mdr->drop_local_auth_pins();
+ mdcache->open_remote_dentry(dn, dnp, new C_MDS_RetryRequest(mdcache, mdr));
+ return;
+ }
+ }
+ ceph_assert(in);
+
+ if ((int)(dnbl.length() + dn->get_name().length() + sizeof(__u32) + sizeof(LeaseStat)) > bytes_left) {
+ dout(10) << " ran out of room, stopping at " << dnbl.length() << " < " << bytes_left << dendl;
+ break;
+ }
+
+ unsigned start_len = dnbl.length();
+
+ // dentry
+ dout(12) << "including dn " << *dn << dendl;
+ encode(dn->get_name(), dnbl);
+ mds->locker->issue_client_lease(dn, client, dnbl, now, mdr->session);
+
+ // inode
+ dout(12) << "including inode " << *in << dendl;
+ int r = in->encode_inodestat(dnbl, mdr->session, realm, snapid, bytes_left - (int)dnbl.length());
+ if (r < 0) {
+ // chop off dn->name, lease
+ dout(10) << " ran out of room, stopping at " << start_len << " < " << bytes_left << dendl;
+ bufferlist keep;
+ keep.substr_of(dnbl, 0, start_len);
+ dnbl.swap(keep);
+ break;
+ }
+ ceph_assert(r >= 0);
+ numfiles++;
+
+ // touch dn
+ mdcache->lru.lru_touch(dn);
+ }
+
+ session->touch_readdir_cap(numfiles);
+
+ __u16 flags = 0;
+ if (end) {
+ flags = CEPH_READDIR_FRAG_END;
+ if (start)
+ flags |= CEPH_READDIR_FRAG_COMPLETE; // FIXME: what purpose does this serve
+ }
+ // client only understand END and COMPLETE flags ?
+ if (req_flags & CEPH_READDIR_REPLY_BITFLAGS) {
+ flags |= CEPH_READDIR_HASH_ORDER | CEPH_READDIR_OFFSET_HASH;
+ }
+
+ // finish final blob
+ encode(numfiles, dirbl);
+ encode(flags, dirbl);
+ dirbl.claim_append(dnbl);
+
+ // yay, reply
+ dout(10) << "reply to " << *req << " readdir num=" << numfiles
+ << " bytes=" << dirbl.length()
+ << " start=" << (int)start
+ << " end=" << (int)end
+ << dendl;
+ mdr->reply_extra_bl = dirbl;
+
+ // bump popularity. NOTE: this doesn't quite capture it.
+ mds->balancer->hit_dir(dir, META_POP_IRD, -1, numfiles);
+
+ // reply
+ mdr->tracei = diri;
+ respond_to_request(mdr, 0);
+}
+
+
+
+// ===============================================================================
+// INODE UPDATES
+
+
+/*
+ * Completion context shared by the simple inode mutations in this file
+ * (setattr, open-truncate, setlayout, set vxattr).
+ *
+ * When it fires it applies the projected inode, runs whatever follow-up
+ * the constructor flags ask for, and answers the client with 0:
+ *   truncating_smaller - issue/start the truncate if the inode is truncating
+ *   changed_ranges     - share the inode's max_size after the reply is sent
+ *   adjust_realm       - send a snap update and invalidate/notify the realm
+ */
+class C_MDS_inode_update_finish : public ServerLogContext {
+  CInode *in;
+  bool truncating_smaller;
+  bool changed_ranges;
+  bool adjust_realm;
+
+public:
+  C_MDS_inode_update_finish(Server *srv, MDRequestRef& req, CInode *inode,
+                            bool shrink = false, bool ranges = false,
+                            bool realm = false)
+    : ServerLogContext(srv, req),
+      in(inode),
+      truncating_smaller(shrink),
+      changed_ranges(ranges),
+      adjust_realm(realm)
+  {}
+
+  void finish(int r) override {
+    ceph_assert(r == 0);
+
+    // Sampled before the projected inode is applied; presumably applying
+    // can attach a snaprealm, which would change the answer.
+    int snap_op;
+    if (in->snaprealm)
+      snap_op = CEPH_SNAP_OP_UPDATE;
+    else
+      snap_op = CEPH_SNAP_OP_SPLIT;
+
+    // make the journaled change live
+    in->pop_and_dirty_projected_inode(mdr->ls);
+    mdr->apply();
+
+    MDSRank *rank = get_mds();
+
+    // tell clients about the truncate, then start it
+    const bool start_truncate = truncating_smaller && in->inode.is_truncating();
+    if (start_truncate) {
+      rank->locker->issue_truncate(in);
+      rank->mdcache->truncate_inode(in, mdr->ls);
+    }
+
+    if (adjust_realm) {
+      rank->mdcache->send_snap_update(in, 0, snap_op);
+      rank->mdcache->do_realm_invalidate_and_update_notify(in, snap_op);
+    }
+
+    rank->balancer->hit_inode(in, META_POP_IWR);
+
+    server->respond_to_request(mdr, 0);
+
+    // max_size is shared only once the reply has gone out
+    if (changed_ranges)
+      rank->locker->share_inode_max_size(in);
+  }
+};
+
+/*
+ * Set, release or interrupt an advisory file lock for a client.
+ *
+ * filelock_change.rule selects the flock or fcntl lock table (the *_INTR
+ * variants mark the request as an interrupt); filelock_change.type says
+ * what to do.  The inode's flocklock is xlocked for the duration.
+ *
+ * Replies:
+ *   0            lock set / unlock processed
+ *   -EINTR       the request had been waiting and is no longer in the
+ *                waiting list
+ *   -EDEADLK     add_lock() reported a deadlock
+ *   -EWOULDBLOCK lock unavailable and the client did not ask to wait
+ *   -EOPNOTSUPP  unrecognized rule
+ * A blocking request that cannot be granted is parked on WAIT_FLOCK and
+ * retried later; no reply is sent on that path.
+ */
+void Server::handle_client_file_setlock(MDRequestRef& mdr)
+{
+  const MClientRequest::const_ref &req = mdr->client_request;
+  MutationImpl::LockOpVec lov;
+
+  // get the inode to operate on, and set up any locks needed for that
+  CInode *cur = rdlock_path_pin_ref(mdr, 0, lov, true);
+  if (!cur)
+    return;
+
+  lov.add_xlock(&cur->flocklock);
+  /* acquire_locks will return true if it gets the locks. If it fails,
+     it will redeliver this request at a later date, so drop the request.
+   */
+  if (!mds->locker->acquire_locks(mdr, lov)) {
+    dout(10) << "handle_client_file_setlock could not get locks!" << dendl;
+    return;
+  }
+
+  // copy the lock change into a ceph_filelock so we can store/apply it
+  ceph_filelock set_lock;
+  set_lock.start = req->head.args.filelock_change.start;
+  set_lock.length = req->head.args.filelock_change.length;
+  set_lock.client = req->get_orig_source().num();
+  set_lock.owner = req->head.args.filelock_change.owner;
+  set_lock.pid = req->head.args.filelock_change.pid;
+  set_lock.type = req->head.args.filelock_change.type;
+  bool will_wait = req->head.args.filelock_change.wait;
+
+  dout(10) << "handle_client_file_setlock: " << set_lock << dendl;
+
+  ceph_lock_state_t *lock_state = nullptr;
+  bool interrupt = false;
+
+  // get the appropriate lock state
+  switch (req->head.args.filelock_change.rule) {
+  case CEPH_LOCK_FLOCK_INTR:
+    interrupt = true;
+    [[fallthrough]];
+  case CEPH_LOCK_FLOCK:
+    lock_state = cur->get_flock_lock_state();
+    break;
+
+  case CEPH_LOCK_FCNTL_INTR:
+    interrupt = true;
+    [[fallthrough]];
+  case CEPH_LOCK_FCNTL:
+    lock_state = cur->get_fcntl_lock_state();
+    break;
+
+  default:
+    // Log the rule, since that is the value this switch rejected.  rule and
+    // type are declared __u8 in ceph_fs.h, so widen them: streamed as-is
+    // they would be written as a raw character instead of a number.
+    dout(10) << "got unknown lock rule "
+             << static_cast<int>(req->head.args.filelock_change.rule)
+             << " (type " << static_cast<int>(set_lock.type) << ")"
+             << ", dropping request!" << dendl;
+    respond_to_request(mdr, -EOPNOTSUPP);
+    return;
+  }
+
+  dout(10) << " state prior to lock change: " << *lock_state << dendl;
+  if (CEPH_LOCK_UNLOCK == set_lock.type) {
+    list<ceph_filelock> activated_locks;
+    MDSContext::vec waiters;
+    if (lock_state->is_waiting(set_lock)) {
+      dout(10) << " unlock removing waiting lock " << set_lock << dendl;
+      lock_state->remove_waiting(set_lock);
+      cur->take_waiting(CInode::WAIT_FLOCK, waiters);
+    } else if (!interrupt) {
+      dout(10) << " unlock attempt on " << set_lock << dendl;
+      lock_state->remove_lock(set_lock, activated_locks);
+      cur->take_waiting(CInode::WAIT_FLOCK, waiters);
+    }
+    // an interrupt for a lock that is not waiting falls through with an
+    // empty waiter list and is simply acknowledged
+    mds->queue_waiters(waiters);
+
+    respond_to_request(mdr, 0);
+  } else {
+    dout(10) << " lock attempt on " << set_lock << dendl;
+    bool deadlock = false;
+    if (mdr->more()->flock_was_waiting &&
+        !lock_state->is_waiting(set_lock)) {
+      dout(10) << " was waiting for lock but not anymore, must have been canceled " << set_lock << dendl;
+      respond_to_request(mdr, -EINTR);
+    } else if (!lock_state->add_lock(set_lock, will_wait, mdr->more()->flock_was_waiting, &deadlock)) {
+      dout(10) << " it failed on this attempt" << dendl;
+      // couldn't set lock right now
+      if (deadlock) {
+        respond_to_request(mdr, -EDEADLK);
+      } else if (!will_wait) {
+        respond_to_request(mdr, -EWOULDBLOCK);
+      } else {
+        // park the request; it is retried when WAIT_FLOCK waiters are woken
+        dout(10) << " added to waiting list" << dendl;
+        ceph_assert(lock_state->is_waiting(set_lock));
+        mdr->more()->flock_was_waiting = true;
+        mds->locker->drop_locks(mdr.get());
+        mdr->drop_local_auth_pins();
+        mdr->mark_event("failed to add lock, waiting");
+        mdr->mark_nowarn();
+        cur->add_waiter(CInode::WAIT_FLOCK, new C_MDS_RetryRequest(mdcache, mdr));
+      }
+    } else
+      respond_to_request(mdr, 0);
+  }
+  dout(10) << " state after lock change: " << *lock_state << dendl;
+}
+
+/*
+ * Query an advisory file lock for a client.
+ *
+ * Builds a ceph_filelock from the request, hands it to look_for_lock() on
+ * the flock or fcntl table selected by filelock_change.rule, and returns
+ * the (possibly updated) ceph_filelock to the client in the reply's extra
+ * payload.  The inode's flocklock is rdlocked while doing so.
+ *
+ * Replies 0 on success, -EINVAL for an unrecognized rule.
+ */
+void Server::handle_client_file_readlock(MDRequestRef& mdr)
+{
+  const MClientRequest::const_ref &req = mdr->client_request;
+  MutationImpl::LockOpVec lov;
+
+  // get the inode to operate on, and set up any locks needed for that
+  CInode *cur = rdlock_path_pin_ref(mdr, 0, lov, true);
+  if (!cur)
+    return;
+
+  /* acquire_locks will return true if it gets the locks. If it fails,
+     it will redeliver this request at a later date, so drop the request.
+  */
+  lov.add_rdlock(&cur->flocklock);
+  if (!mds->locker->acquire_locks(mdr, lov)) {
+    dout(10) << "handle_client_file_readlock could not get locks!" << dendl;
+    return;
+  }
+
+  // copy the lock change into a ceph_filelock so we can store/apply it
+  ceph_filelock checking_lock;
+  checking_lock.start = req->head.args.filelock_change.start;
+  checking_lock.length = req->head.args.filelock_change.length;
+  checking_lock.client = req->get_orig_source().num();
+  checking_lock.owner = req->head.args.filelock_change.owner;
+  checking_lock.pid = req->head.args.filelock_change.pid;
+  checking_lock.type = req->head.args.filelock_change.type;
+
+  // get the appropriate lock state
+  ceph_lock_state_t *lock_state = nullptr;
+  switch (req->head.args.filelock_change.rule) {
+  case CEPH_LOCK_FLOCK:
+    lock_state = cur->get_flock_lock_state();
+    break;
+
+  case CEPH_LOCK_FCNTL:
+    lock_state = cur->get_fcntl_lock_state();
+    break;
+
+  default:
+    // Log the rule, since that is the value this switch rejected.  rule and
+    // type are declared __u8 in ceph_fs.h, so widen them: streamed as-is
+    // they would be written as a raw character instead of a number.
+    dout(10) << "got unknown lock rule "
+             << static_cast<int>(req->head.args.filelock_change.rule)
+             << " (type " << static_cast<int>(checking_lock.type) << ")"
+             << dendl;
+    respond_to_request(mdr, -EINVAL);
+    return;
+  }
+  lock_state->look_for_lock(checking_lock);
+
+  // ship the result back to the client
+  bufferlist lock_bl;
+  encode(checking_lock, lock_bl);
+
+  mdr->reply_extra_bl = lock_bl;
+  respond_to_request(mdr, 0);
+}
+
+/*
+ * Handle a client SETATTR: update whichever of uid, gid, mode, mtime,
+ * atime, btime and size the request's mask selects.
+ *
+ * Early replies:
+ *   -EROFS   the request targets a snapshot
+ *   -EPERM   the inode is below MDS_INO_SYSTEM_BASE and is not a base inode
+ *   -ENOSPC  is_full is set and the request grows the file
+ * check_access() is asked for MAY_WRITE, plus MAY_CHOWN / MAY_CHGRP when
+ * the uid / gid would actually change.
+ *
+ * A shrinking truncate that arrives while the projected inode is still
+ * truncating is parked on WAIT_TRUNC and retried.  Otherwise the change is
+ * projected, journaled as an EUpdate, and answered through
+ * C_MDS_inode_update_finish.
+ */
+void Server::handle_client_setattr(MDRequestRef& mdr)
+{
+  const MClientRequest::const_ref &req = mdr->client_request;
+  MutationImpl::LockOpVec lov;
+  CInode *cur = rdlock_path_pin_ref(mdr, 0, lov, true);
+  if (!cur) return;
+
+  if (mdr->snapid != CEPH_NOSNAP) {
+    respond_to_request(mdr, -EROFS);
+    return;
+  }
+  if (cur->ino() < MDS_INO_SYSTEM_BASE && !cur->is_base()) {
+    respond_to_request(mdr, -EPERM);
+    return;
+  }
+
+  __u32 mask = req->head.args.setattr.mask;
+  __u32 access_mask = MAY_WRITE;
+  // args.setattr.size is only meaningful when the client set this bit
+  const bool setting_size = (mask & CEPH_SETATTR_SIZE) != 0;
+
+  // xlock inode
+  if (mask & (CEPH_SETATTR_MODE|CEPH_SETATTR_UID|CEPH_SETATTR_GID|CEPH_SETATTR_BTIME|CEPH_SETATTR_KILL_SGUID))
+    lov.add_xlock(&cur->authlock);
+  if (mask & (CEPH_SETATTR_MTIME|CEPH_SETATTR_ATIME|CEPH_SETATTR_SIZE))
+    lov.add_xlock(&cur->filelock);
+  if (mask & CEPH_SETATTR_CTIME)
+    lov.add_wrlock(&cur->versionlock);
+
+  if (!mds->locker->acquire_locks(mdr, lov))
+    return;
+
+  // only an actual ownership change needs the chown/chgrp permission
+  if ((mask & CEPH_SETATTR_UID) && (cur->inode.uid != req->head.args.setattr.uid))
+    access_mask |= MAY_CHOWN;
+
+  if ((mask & CEPH_SETATTR_GID) && (cur->inode.gid != req->head.args.setattr.gid))
+    access_mask |= MAY_CHGRP;
+
+  if (!check_access(mdr, cur, access_mask))
+    return;
+
+  // trunc from bigger -> smaller?
+  auto pip = cur->get_projected_inode();
+
+  // use the larger of our projected size and the size the client reported
+  uint64_t old_size = std::max<uint64_t>(pip->size, req->head.args.setattr.old_size);
+
+  // ENOSPC on growing file while full, but allow shrinks.  Only consult the
+  // requested size when the request really is a size change; otherwise an
+  // unrelated setattr (chmod, utimes, ...) could be refused because of
+  // whatever happens to be in the unused size field.
+  if (setting_size && is_full && req->head.args.setattr.size > old_size) {
+    dout(20) << __func__ << ": full, responding ENOSPC to setattr with larger size" << dendl;
+    respond_to_request(mdr, -ENOSPC);
+    return;
+  }
+
+  bool truncating_smaller = false;
+  if (setting_size) {
+    truncating_smaller = req->head.args.setattr.size < old_size;
+    if (truncating_smaller && pip->is_truncating()) {
+      dout(10) << " waiting for pending truncate from " << pip->truncate_from
+               << " to " << pip->truncate_size << " to complete on " << *cur << dendl;
+      mds->locker->drop_locks(mdr.get());
+      mdr->drop_local_auth_pins();
+      cur->add_waiter(CInode::WAIT_TRUNC, new C_MDS_RetryRequest(mdcache, mdr));
+      return;
+    }
+  }
+
+  bool changed_ranges = false;
+
+  // project update
+  mdr->ls = mdlog->get_current_segment();
+  EUpdate *le = new EUpdate(mdlog, "setattr");
+  mdlog->start_entry(le);
+
+  auto &pi = cur->project_inode();
+
+  if (mask & CEPH_SETATTR_UID)
+    pi.inode.uid = req->head.args.setattr.uid;
+  if (mask & CEPH_SETATTR_GID)
+    pi.inode.gid = req->head.args.setattr.gid;
+
+  if (mask & CEPH_SETATTR_MODE)
+    pi.inode.mode = (pi.inode.mode & ~07777) | (req->head.args.setattr.mode & 07777);
+  else if ((mask & (CEPH_SETATTR_UID|CEPH_SETATTR_GID|CEPH_SETATTR_KILL_SGUID)) &&
+           S_ISREG(pi.inode.mode) &&
+           (pi.inode.mode & (S_IXUSR|S_IXGRP|S_IXOTH))) {
+    // ownership change (or explicit kill request) on an executable regular
+    // file without an explicit mode: drop the setuid/setgid bits
+    pi.inode.mode &= ~(S_ISUID|S_ISGID);
+  }
+
+  if (mask & CEPH_SETATTR_MTIME)
+    pi.inode.mtime = req->head.args.setattr.mtime;
+  if (mask & CEPH_SETATTR_ATIME)
+    pi.inode.atime = req->head.args.setattr.atime;
+  if (mask & CEPH_SETATTR_BTIME)
+    pi.inode.btime = req->head.args.setattr.btime;
+  if (mask & (CEPH_SETATTR_ATIME | CEPH_SETATTR_MTIME | CEPH_SETATTR_BTIME))
+    pi.inode.time_warp_seq++;   // maybe not a timewarp, but still a serialization point.
+  if (setting_size) {
+    if (truncating_smaller) {
+      pi.inode.truncate(old_size, req->head.args.setattr.size);
+      le->metablob.add_truncate_start(cur->ino());
+    } else {
+      pi.inode.size = req->head.args.setattr.size;
+      pi.inode.rstat.rbytes = pi.inode.size;
+    }
+    pi.inode.mtime = mdr->get_op_stamp();
+
+    // adjust client's max_size?
+    CInode::mempool_inode::client_range_map new_ranges;
+    bool max_increased = false;
+    mds->locker->calc_new_client_ranges(cur, pi.inode.size, true, &new_ranges, &max_increased);
+    if (pi.inode.client_ranges != new_ranges) {
+      dout(10) << " client_ranges " << pi.inode.client_ranges << " -> " << new_ranges << dendl;
+      pi.inode.client_ranges = new_ranges;
+      changed_ranges = true;
+    }
+  }
+
+  pi.inode.version = cur->pre_dirty();
+  pi.inode.ctime = mdr->get_op_stamp();
+  if (mdr->get_op_stamp() > pi.inode.rstat.rctime)
+    pi.inode.rstat.rctime = mdr->get_op_stamp();
+  pi.inode.change_attr++;
+
+  // log + wait
+  le->metablob.add_client_req(req->get_reqid(), req->get_oldest_client_tid());
+  mdcache->predirty_journal_parents(mdr, &le->metablob, cur, 0, PREDIRTY_PRIMARY);
+  mdcache->journal_dirty_inode(mdr.get(), &le->metablob, cur);
+
+  journal_and_reply(mdr, cur, 0, le, new C_MDS_inode_update_finish(this, mdr, cur,
+                                                                   truncating_smaller, changed_ranges));
+
+  // flush immediately if there are readers/writers waiting
+  if (mdr->is_xlocked(&cur->filelock) &&
+      (cur->get_caps_wanted() & (CEPH_CAP_FILE_RD|CEPH_CAP_FILE_WR)))
+    mds->mdlog->flush();
+}
+
+/*
+ * Open-with-truncate on mdr->in[0]: issue caps for the open, project a
+ * truncate to size 0 (if there is anything to truncate), journal it and
+ * reply.
+ *
+ * Takes responsibility for mdr.
+ */
+void Server::do_open_truncate(MDRequestRef& mdr, int cmode)
+{
+  CInode *in = mdr->in[0];
+  client_t client = mdr->get_client();
+  ceph_assert(in);
+
+  dout(10) << "do_open_truncate " << *in << dendl;
+
+  SnapRealm *realm = in->find_snaprealm();
+  const bool replaying = mdr->client_request->is_replay();
+  Capability *cap = mds->locker->issue_new_caps(in, cmode, mdr->session, realm, replaying);
+
+  mdr->ls = mdlog->get_current_segment();
+  EUpdate *le = new EUpdate(mdlog, "open_truncate");
+  mdlog->start_entry(le);
+
+  // project the inode and stamp it
+  auto &pi = in->project_inode();
+  pi.inode.version = in->pre_dirty();
+  const auto stamp = mdr->get_op_stamp();
+  pi.inode.ctime = stamp;
+  pi.inode.mtime = stamp;
+  if (stamp > pi.inode.rstat.rctime)
+    pi.inode.rstat.rctime = stamp;
+  pi.inode.change_attr++;
+
+  // the client may know about a larger size than we have projected
+  const uint64_t old_size =
+    std::max<uint64_t>(pi.inode.size, mdr->client_request->head.args.open.old_size);
+  const bool shrinking = old_size > 0;
+  if (shrinking) {
+    pi.inode.truncate(old_size, 0);
+    le->metablob.add_truncate_start(in->ino());
+  }
+
+  // a writer gets a fresh client range starting at 0
+  bool changed_ranges = false;
+  if (cap && (cmode & CEPH_FILE_MODE_WR)) {
+    auto &cr = pi.inode.client_ranges[client];
+    cr.range.first = 0;
+    cr.range.last = pi.inode.get_layout_size_increment();
+    cr.follows = realm->get_newest_seq();
+    changed_ranges = true;
+    cap->mark_clientwriteable();
+  }
+
+  le->metablob.add_client_req(mdr->reqid, mdr->client_request->get_oldest_client_tid());
+
+  mdcache->predirty_journal_parents(mdr, &le->metablob, in, 0, PREDIRTY_PRIMARY);
+  mdcache->journal_dirty_inode(mdr.get(), &le->metablob, in);
+
+  // make sure ino gets into the journal
+  le->metablob.add_opened_ino(in->ino());
+
+  mdr->o_trunc = true;
+
+  // include the dentry in the reply only when the client asked for it
+  CDentry *dn = nullptr;
+  if (mdr->client_request->get_dentry_wanted()) {
+    ceph_assert(mdr->dn[0].size());
+    dn = mdr->dn[0].back();
+  }
+
+  journal_and_reply(mdr, in, dn, le,
+                    new C_MDS_inode_update_finish(this, mdr, in, shrinking, changed_ranges));
+
+  // The open itself may be answered early, but the truncate only starts
+  // once the EUpdate is persistent.  Flush now so the client is not kept
+  // waiting for it.
+  mdlog->flush();
+}
+
+
+/*
+ * Change the layout of a regular file.
+ *
+ * Only allowed on the head (non-snapshot) inode of a file whose projected
+ * size is 0 and whose truncate_seq is at most 1.  Layout fields that are
+ * zero in the request keep their current value.  An actual layout change
+ * additionally requires MAY_SET_VXATTR.
+ *
+ * Replies -EROFS, -EINVAL or -ENOTEMPTY on rejection.  This function
+ * cleans up the passed mdr.
+ */
+void Server::handle_client_setlayout(MDRequestRef& mdr)
+{
+  const MClientRequest::const_ref &req = mdr->client_request;
+  MutationImpl::LockOpVec lov;
+  CInode *cur = rdlock_path_pin_ref(mdr, 0, lov, true);
+  if (!cur)
+    return;
+
+  if (mdr->snapid != CEPH_NOSNAP) {
+    respond_to_request(mdr, -EROFS);
+    return;
+  }
+  if (!cur->is_file()) {
+    respond_to_request(mdr, -EINVAL);
+    return;
+  }
+  const bool has_size = cur->get_projected_inode()->size != 0;
+  const bool seq_advanced = cur->get_projected_inode()->truncate_seq > 1;
+  if (has_size || seq_advanced) {
+    respond_to_request(mdr, -ENOTEMPTY);
+    return;
+  }
+
+  // start from the current layout and overlay the requested fields
+  const file_layout_t old_layout = cur->get_projected_inode()->layout;
+  file_layout_t layout = old_layout;
+
+  const auto wanted = req->head.args.setlayout.layout;
+  if (wanted.fl_object_size > 0)
+    layout.object_size = wanted.fl_object_size;
+  if (wanted.fl_stripe_unit > 0)
+    layout.stripe_unit = wanted.fl_stripe_unit;
+  if (wanted.fl_stripe_count > 0)
+    layout.stripe_count = wanted.fl_stripe_count;
+  if (wanted.fl_pg_pool > 0) {
+    layout.pool_id = wanted.fl_pg_pool;
+
+    // make sure we have as new a map as the client
+    const auto client_epoch = req->get_mdsmap_epoch();
+    if (client_epoch > mds->mdsmap->get_epoch()) {
+      mds->wait_for_mdsmap(client_epoch, new C_MDS_RetryRequest(mdcache, mdr));
+      return;
+    }
+  }
+
+  // Don't permit layout modifications without 'p' caps
+  int access = MAY_WRITE;
+  if (layout != old_layout)
+    access |= MAY_SET_VXATTR;
+
+  // validate layout
+  if (!layout.is_valid()) {
+    dout(10) << "bad layout" << dendl;
+    respond_to_request(mdr, -EINVAL);
+    return;
+  }
+  if (!mds->mdsmap->is_data_pool(layout.pool_id)) {
+    dout(10) << " invalid data pool " << layout.pool_id << dendl;
+    respond_to_request(mdr, -EINVAL);
+    return;
+  }
+
+  lov.add_xlock(&cur->filelock);
+  if (!mds->locker->acquire_locks(mdr, lov))
+    return;
+
+  if (!check_access(mdr, cur, access))
+    return;
+
+  // project update
+  auto &pi = cur->project_inode();
+  pi.inode.layout = layout;
+  // add the old pool to the inode
+  pi.inode.add_old_pool(old_layout.pool_id);
+  pi.inode.version = cur->pre_dirty();
+  const auto stamp = mdr->get_op_stamp();
+  pi.inode.ctime = stamp;
+  if (stamp > pi.inode.rstat.rctime)
+    pi.inode.rstat.rctime = stamp;
+  pi.inode.change_attr++;
+
+  // log + wait
+  mdr->ls = mdlog->get_current_segment();
+  EUpdate *le = new EUpdate(mdlog, "setlayout");
+  mdlog->start_entry(le);
+  le->metablob.add_client_req(req->get_reqid(), req->get_oldest_client_tid());
+  mdcache->predirty_journal_parents(mdr, &le->metablob, cur, 0, PREDIRTY_PRIMARY);
+  mdcache->journal_dirty_inode(mdr.get(), &le->metablob, cur);
+
+  journal_and_reply(mdr, cur, 0, le, new C_MDS_inode_update_finish(this, mdr, cur));
+}
+
+/*
+ * Change the layout stored on a directory.
+ *
+ * Only allowed on the head (non-snapshot) inode of a directory.  The
+ * starting point is the directory's own layout if it has one, else the
+ * layout handed back by rdlock_path_pin_ref(), else the cache's default
+ * file layout.  Layout fields that are zero in the request keep their
+ * starting value.  An actual change additionally requires MAY_SET_VXATTR.
+ *
+ * Replies -EROFS, -ENOTDIR or -EINVAL on rejection.
+ */
+void Server::handle_client_setdirlayout(MDRequestRef& mdr)
+{
+  const MClientRequest::const_ref &req = mdr->client_request;
+  MutationImpl::LockOpVec lov;
+  file_layout_t *dir_layout = nullptr;
+  CInode *cur = rdlock_path_pin_ref(mdr, 0, lov, true, false, &dir_layout);
+  if (!cur)
+    return;
+
+  if (mdr->snapid != CEPH_NOSNAP) {
+    respond_to_request(mdr, -EROFS);
+    return;
+  }
+
+  if (!cur->is_dir()) {
+    respond_to_request(mdr, -ENOTDIR);
+    return;
+  }
+
+  lov.add_xlock(&cur->policylock);
+  if (!mds->locker->acquire_locks(mdr, lov))
+    return;
+
+  // pick the layout we are modifying
+  const auto projected = cur->get_projected_inode();
+  file_layout_t layout;
+  if (projected->has_layout()) {
+    layout = projected->layout;
+  } else if (dir_layout) {
+    layout = *dir_layout;
+  } else {
+    layout = mdcache->default_file_layout;
+  }
+  const file_layout_t old_layout = layout;
+
+  // overlay the requested fields
+  const auto wanted = req->head.args.setlayout.layout;
+  if (wanted.fl_object_size > 0)
+    layout.object_size = wanted.fl_object_size;
+  if (wanted.fl_stripe_unit > 0)
+    layout.stripe_unit = wanted.fl_stripe_unit;
+  if (wanted.fl_stripe_count > 0)
+    layout.stripe_count = wanted.fl_stripe_count;
+  if (wanted.fl_pg_pool > 0) {
+    layout.pool_id = wanted.fl_pg_pool;
+    // make sure we have as new a map as the client
+    const auto client_epoch = req->get_mdsmap_epoch();
+    if (client_epoch > mds->mdsmap->get_epoch()) {
+      mds->wait_for_mdsmap(client_epoch, new C_MDS_RetryRequest(mdcache, mdr));
+      return;
+    }
+  }
+
+  // Level of access required to complete
+  int access = MAY_WRITE;
+  if (layout != old_layout)
+    access |= MAY_SET_VXATTR;
+
+  // validate layout
+  if (!layout.is_valid()) {
+    dout(10) << "bad layout" << dendl;
+    respond_to_request(mdr, -EINVAL);
+    return;
+  }
+  if (!mds->mdsmap->is_data_pool(layout.pool_id)) {
+    dout(10) << " invalid data pool " << layout.pool_id << dendl;
+    respond_to_request(mdr, -EINVAL);
+    return;
+  }
+
+  if (!check_access(mdr, cur, access))
+    return;
+
+  // project update
+  auto &pi = cur->project_inode();
+  pi.inode.layout = layout;
+  pi.inode.version = cur->pre_dirty();
+
+  // log + wait
+  mdr->ls = mdlog->get_current_segment();
+  EUpdate *le = new EUpdate(mdlog, "setlayout");
+  mdlog->start_entry(le);
+  le->metablob.add_client_req(req->get_reqid(), req->get_oldest_client_tid());
+  mdcache->predirty_journal_parents(mdr, &le->metablob, cur, 0, PREDIRTY_PRIMARY);
+  mdcache->journal_dirty_inode(mdr.get(), &le->metablob, cur);
+
+  // reply only through the journal completion
+  mdr->no_early_reply = true;
+  journal_and_reply(mdr, cur, 0, le, new C_MDS_inode_update_finish(this, mdr, cur));
+}
+
+// XATTRS
+
+/*
+ * Apply a layout vxattr assignment to *layout.
+ *
+ * name is either "layout" (value is then a key/value list whose entries
+ * are applied one by one as "layout.<key>") or one of layout.object_size,
+ * layout.stripe_unit, layout.stripe_count, layout.pool (numeric id or a
+ * pool name looked up in osdmap) and layout.pool_namespace.
+ *
+ * validate controls only the final layout->is_valid() check; the data
+ * pool membership check is always performed.
+ *
+ * Returns 0 on success, -ENOENT for an unknown pool name, -EINVAL for
+ * anything else that is malformed or invalid.
+ */
+int Server::parse_layout_vxattr(string name, string value, const OSDMap& osdmap,
+                                file_layout_t *layout, bool validate)
+{
+  dout(20) << "parse_layout_vxattr name " << name << " value '" << value << "'" << dendl;
+  try {
+    if (name == "layout") {
+      string::iterator cursor = value.begin();
+      string::iterator stop = value.end();
+      keys_and_values<string::iterator> grammar;   // create instance of parser
+      std::map<string, string> fields;             // map to receive results
+      const bool parsed = qi::parse(cursor, stop, grammar, fields);
+      if (!parsed)
+        return -EINVAL;
+
+      string left(cursor, stop);
+      dout(10) << " parsed " << fields << " left '" << left << "'" << dendl;
+      // trailing unparsed input is an error
+      if (cursor != stop)
+        return -EINVAL;
+
+      for (const auto &[key, val] : fields) {
+        // Skip validation on each attr, we do it once at the end (avoid
+        // rejecting intermediate states if the overall result is ok)
+        const int r = parse_layout_vxattr(string("layout.") + key, val,
+                                          osdmap, layout, false);
+        if (r < 0)
+          return r;
+      }
+    } else if (name == "layout.object_size") {
+      layout->object_size = boost::lexical_cast<unsigned>(value);
+    } else if (name == "layout.stripe_unit") {
+      layout->stripe_unit = boost::lexical_cast<unsigned>(value);
+    } else if (name == "layout.stripe_count") {
+      layout->stripe_count = boost::lexical_cast<unsigned>(value);
+    } else if (name == "layout.pool") {
+      // numeric pool id first; fall back to a lookup by pool name
+      try {
+        layout->pool_id = boost::lexical_cast<unsigned>(value);
+      } catch (boost::bad_lexical_cast const&) {
+        const int64_t pool = osdmap.lookup_pg_pool_name(value);
+        if (pool < 0) {
+          dout(10) << " unknown pool " << value << dendl;
+          return -ENOENT;
+        }
+        layout->pool_id = pool;
+      }
+    } else if (name == "layout.pool_namespace") {
+      layout->pool_ns = value;
+    } else {
+      dout(10) << " unknown layout vxattr " << name << dendl;
+      return -EINVAL;
+    }
+  } catch (boost::bad_lexical_cast const&) {
+    dout(10) << "bad vxattr value, unable to parse int for " << name << dendl;
+    return -EINVAL;
+  }
+
+  const bool layout_ok = !validate || layout->is_valid();
+  if (!layout_ok) {
+    dout(10) << "bad layout" << dendl;
+    return -EINVAL;
+  }
+  if (!mds->mdsmap->is_data_pool(layout->pool_id)) {
+    dout(10) << " invalid data pool " << layout->pool_id << dendl;
+    return -EINVAL;
+  }
+  return 0;
+}
+
+/*
+ * Apply a quota vxattr assignment to *quota.
+ *
+ * name is either "quota" (value is then a key/value list whose entries
+ * are applied one by one as "quota.<key>"; an empty value leaves *quota
+ * untouched) or one of quota.max_bytes / quota.max_files, whose value must
+ * parse as a non-negative 64-bit integer.
+ *
+ * Returns 0 on success and -EINVAL on any parse or validation failure.
+ */
+int Server::parse_quota_vxattr(string name, string value, quota_info_t *quota)
+{
+  dout(20) << "parse_quota_vxattr name " << name << " value '" << value << "'" << dendl;
+
+  // throws boost::bad_lexical_cast, handled by the try block below
+  auto read_limit = [&value]() {
+    return boost::lexical_cast<int64_t>(value);
+  };
+
+  try {
+    if (name == "quota") {
+      string::iterator cursor = value.begin();
+      string::iterator stop = value.end();
+      if (cursor == stop) {
+        // keep quota unchanged. (for create_quota_realm())
+        return 0;
+      }
+      keys_and_values<string::iterator> grammar;   // create instance of parser
+      std::map<string, string> fields;             // map to receive results
+      const bool parsed = qi::parse(cursor, stop, grammar, fields);
+      if (!parsed)
+        return -EINVAL;
+
+      string left(cursor, stop);
+      dout(10) << " parsed " << fields << " left '" << left << "'" << dendl;
+      // trailing unparsed input is an error
+      if (cursor != stop)
+        return -EINVAL;
+
+      for (const auto &[key, val] : fields) {
+        const int r = parse_quota_vxattr(string("quota.") + key, val, quota);
+        if (r < 0)
+          return r;
+      }
+    } else if (name == "quota.max_bytes") {
+      const int64_t limit = read_limit();
+      if (limit < 0)
+        return -EINVAL;
+      quota->max_bytes = limit;
+    } else if (name == "quota.max_files") {
+      const int64_t limit = read_limit();
+      if (limit < 0)
+        return -EINVAL;
+      quota->max_files = limit;
+    } else {
+      dout(10) << " unknown quota vxattr " << name << dendl;
+      return -EINVAL;
+    }
+  } catch (boost::bad_lexical_cast const&) {
+    dout(10) << "bad vxattr value, unable to parse int for " << name << dendl;
+    return -EINVAL;
+  }
+
+  if (quota->is_valid())
+    return 0;
+
+  dout(10) << "bad quota" << dendl;
+  return -EINVAL;
+}
+
+/*
+ * Send the auth MDS of `in` a SETXATTR request for "ceph.quota" with an
+ * empty value.  parse_quota_vxattr() treats an empty value as "leave the
+ * quota unchanged".
+ */
+void Server::create_quota_realm(CInode *in)
+{
+  dout(10) << __func__ << " " << *in << dendl;
+
+  auto req = MClientRequest::create(CEPH_MDS_OP_SETXATTR);
+
+  // address the inode by number; the vxattr value is deliberately left empty
+  const filepath target(in->ino());
+  req->set_filepath(target);
+  req->set_string2("ceph.quota");
+  req->set_tid(mds->issue_tid());
+
+  const auto auth_rank = in->authority().first;
+  mds->send_message_mds(req, auth_rank);
+}
+
+/*
+ * Parse a client-supplied layout vxattr into *layout, making sure an
+ * unknown pool name is not just the result of a stale OSD map.
+ *
+ * Returns 0 on success.  On any non-zero return this function has taken
+ * responsibility for mdr: it has either replied with an error (-ENOENT is
+ * reported to the client as -EINVAL) or queued the request to be retried
+ * once a newer OSD map is available.
+ */
+int Server::check_layout_vxattr(MDRequestRef& mdr,
+                                string name,
+                                string value,
+                                file_layout_t *layout)
+{
+  const MClientRequest::const_ref &req = mdr->client_request;
+  epoch_t epoch;
+  int r;
+
+  // parse against whatever OSD map the objecter currently holds
+  auto parse_with_current_map = [&]() {
+    mds->objecter->with_osdmap([&](const OSDMap& osdmap) {
+      r = parse_layout_vxattr(name, value, osdmap, layout);
+      epoch = osdmap.get_epoch();
+    });
+  };
+
+  parse_with_current_map();
+
+  if (r == -ENOENT) {
+    // we don't have the specified pool, make sure our map
+    // is newer than or as new as the client.
+    const epoch_t req_epoch = req->get_osdmap_epoch();
+
+    if (req_epoch > epoch) {
+      // our map is older than the client's: wait for the newer one
+      Context *retry = new C_IO_Wrapper(mds, new C_MDS_RetryRequest(mdcache, mdr));
+
+      const bool have_map = mds->objecter->wait_for_map(req_epoch, retry);
+      if (!have_map)
+        return r; // wait, retry will re-run this request later
+
+      // the map was already there, so the callback is not needed
+      delete retry;
+
+      // now we have at least as new a map as the client, try again.
+      parse_with_current_map();
+
+      ceph_assert(epoch >= req_epoch); // otherwise wait_for_map() told a lie
+
+    } else if (req_epoch == 0 && !mdr->waited_for_osdmap) {
+      // Compatibility with old clients that do not send an osdmap epoch:
+      // fetch the latest map once and retry.
+      mdr->waited_for_osdmap = true;
+      mds->objecter->wait_for_latest_osdmap(new C_IO_Wrapper(
+                                              mds, new C_MDS_RetryRequest(mdcache, mdr)));
+      return r;
+    }
+  }
+
+  // all is well
+  if (r >= 0)
+    return 0;
+
+  // an unknown pool is reported to the client as an invalid argument
+  if (r == -ENOENT)
+    r = -EINVAL;
+
+  respond_to_request(mdr, r);
+  return r;
+}
+/*
+ * Apply a client request that sets one of the ceph.* virtual xattrs on cur.
+ * The vxattr name is read from the request's path2 and the value from the
+ * request data.  The caller must hold nothing in lov that conflicts with the
+ * xlocks added here; each branch adds its own locks to lov and acquires them.
+ *
+ * Requires MAY_SET_VXATTR.  Recognized names:
+ *   ceph.dir.layout[...]   directories only; xlocks policylock
+ *   ceph.file.layout[...]  files with size 0 and truncate_seq <= 1 only
+ *                          (else -ENOTEMPTY); xlocks filelock
+ *   ceph.quota[...]        non-root directories only; xlocks policylock, plus
+ *                          snaplock when a quota is being enabled and the
+ *                          inode has no projected srnode
+ *   ceph.dir.subvolume     directories only; xlocks policylock and snaplock
+ *   ceph.dir.pin           non-root directories only; xlocks policylock
+ * Anything else, or a type mismatch, is answered with -EINVAL.
+ *
+ * dir_layout is used as the starting layout for ceph.dir.layout when cur has
+ * no layout of its own.  On success the change is projected, journaled as an
+ * EUpdate and answered through C_MDS_inode_update_finish.
+ */
+void Server::handle_set_vxattr(MDRequestRef& mdr, CInode *cur,
+ file_layout_t *dir_layout,
+ MutationImpl::LockOpVec& lov)
+{
+ const MClientRequest::const_ref &req = mdr->client_request;
+ string name(req->get_path2());
+ bufferlist bl = req->get_data();
+ string value (bl.c_str(), bl.length());
+ dout(10) << "handle_set_vxattr " << name
+ << " val " << value.length()
+ << " bytes on " << *cur
+ << dendl;
+
+ CInode::mempool_inode *pip = nullptr; // projected inode; set by whichever branch below matches
+ string rest; // name with the leading "ceph.dir." / "ceph.file." / "ceph." removed
+
+ if (!check_access(mdr, cur, MAY_SET_VXATTR)) {
+ return;
+ }
+ // Each branch: validate, add its locks to lov and acquire them, then project the inode.
+ bool adjust_realm = false;
+ if (name.compare(0, 15, "ceph.dir.layout") == 0) { // prefix match, so ceph.dir.layout.<field> lands here too
+ if (!cur->is_dir()) {
+ respond_to_request(mdr, -EINVAL);
+ return;
+ }
+ // start from cur's own layout, else the caller-supplied dir_layout, else the cache default
+ file_layout_t layout;
+ if (cur->get_projected_inode()->has_layout())
+ layout = cur->get_projected_inode()->layout;
+ else if (dir_layout)
+ layout = *dir_layout;
+ else
+ layout = mdcache->default_file_layout;
+
+ rest = name.substr(name.find("layout"));
+ if (check_layout_vxattr(mdr, rest, value, &layout) < 0) // on failure it has already replied or queued a retry
+ return;
+
+ lov.add_xlock(&cur->policylock);
+ if (!mds->locker->acquire_locks(mdr, lov))
+ return;
+
+ auto &pi = cur->project_inode();
+ pi.inode.layout = layout;
+ mdr->no_early_reply = true;
+ pip = &pi.inode;
+ } else if (name.compare(0, 16, "ceph.file.layout") == 0) { // prefix match, as above
+ if (!cur->is_file()) {
+ respond_to_request(mdr, -EINVAL);
+ return;
+ }
+ if (cur->get_projected_inode()->size ||
+ cur->get_projected_inode()->truncate_seq > 1) { // same precondition as handle_client_setlayout
+ respond_to_request(mdr, -ENOTEMPTY);
+ return;
+ }
+ file_layout_t layout = cur->get_projected_inode()->layout;
+ rest = name.substr(name.find("layout"));
+ if (check_layout_vxattr(mdr, rest, value, &layout) < 0) // on failure it has already replied or queued a retry
+ return;
+
+ lov.add_xlock(&cur->filelock);
+ if (!mds->locker->acquire_locks(mdr, lov))
+ return;
+
+ auto &pi = cur->project_inode();
+ int64_t old_pool = pi.inode.layout.pool_id;
+ pi.inode.add_old_pool(old_pool); // record the previous pool id before the layout is overwritten
+ pi.inode.layout = layout;
+ pip = &pi.inode;
+ } else if (name.compare(0, 10, "ceph.quota") == 0) { // prefix match: ceph.quota, ceph.quota.max_bytes, ...
+ if (!cur->is_dir() || cur->is_root()) {
+ respond_to_request(mdr, -EINVAL);
+ return;
+ }
+ // work on a copy of the projected quota; parse_quota_vxattr edits it in place
+ quota_info_t quota = cur->get_projected_inode()->quota;
+
+ rest = name.substr(name.find("quota"));
+ int r = parse_quota_vxattr(rest, value, &quota);
+ if (r < 0) {
+ respond_to_request(mdr, r);
+ return;
+ }
+
+ lov.add_xlock(&cur->policylock);
+ if (quota.is_enable() && !cur->get_projected_srnode()) { // enabling a quota where there is no projected srnode yet
+ lov.add_xlock(&cur->snaplock);
+ adjust_realm = true;
+ }
+
+ if (!mds->locker->acquire_locks(mdr, lov))
+ return;
+ // nothing to journal when the quota would not change
+ if (cur->get_projected_inode()->quota == quota) {
+ respond_to_request(mdr, 0);
+ return;
+ }
+
+ auto &pi = cur->project_inode(false, adjust_realm); // second argument also projects a snapnode when adjust_realm is set
+ pi.inode.quota = quota;
+
+ if (adjust_realm)
+ pi.snapnode->created = pi.snapnode->seq = cur->find_snaprealm()->get_newest_seq();
+
+ mdr->no_early_reply = true;
+ pip = &pi.inode;
+
+ client_t exclude_ct = mdr->get_client(); // the requesting client is left out of the broadcast
+ mdcache->broadcast_quota_to_client(cur, exclude_ct, true);
+ } else if (name == "ceph.dir.subvolume"sv) {
+ if (!cur->is_dir()) {
+ respond_to_request(mdr, -EINVAL);
+ return;
+ }
+
+ bool val;
+ try {
+ val = boost::lexical_cast<bool>(value);
+ } catch (boost::bad_lexical_cast const&) {
+ dout(10) << "bad vxattr value, unable to parse bool for " << name << dendl;
+ respond_to_request(mdr, -EINVAL);
+ return;
+ }
+
+ lov.add_xlock(&cur->policylock);
+ lov.add_xlock(&cur->snaplock);
+ if (!mds->locker->acquire_locks(mdr, lov))
+ return;
+
+ SnapRealm *realm = cur->find_snaprealm();
+ if (val) {
+ inodeno_t subvol_ino = realm->get_subvolume_ino();
+ // can't create subvolume inside another subvolume
+ if (subvol_ino && subvol_ino != cur->ino()) {
+ respond_to_request(mdr, -EINVAL);
+ return;
+ }
+ }
+ // no-op when the subvolume flag already has the requested value
+ const auto srnode = cur->get_projected_srnode();
+ if (val == (srnode && srnode->is_subvolume())) {
+ respond_to_request(mdr, 0);
+ return;
+ }
+
+ auto& pi = cur->project_inode(false, true);
+ if (!srnode) // no projected srnode before this request: initialize the new snapnode's seq
+ pi.snapnode->created = pi.snapnode->seq = realm->get_newest_seq();
+ if (val)
+ pi.snapnode->mark_subvolume();
+ else
+ pi.snapnode->clear_subvolume();
+
+ mdr->no_early_reply = true;
+ pip = &pi.inode;
+ adjust_realm = true;
+ } else if (name == "ceph.dir.pin"sv) {
+ if (!cur->is_dir() || cur->is_root()) {
+ respond_to_request(mdr, -EINVAL);
+ return;
+ }
+
+ mds_rank_t rank;
+ try {
+ rank = boost::lexical_cast<mds_rank_t>(value);
+ if (rank < 0) rank = MDS_RANK_NONE; // every negative value maps to MDS_RANK_NONE
+ } catch (boost::bad_lexical_cast const&) {
+ dout(10) << "bad vxattr value, unable to parse int for " << name << dendl;
+ respond_to_request(mdr, -EINVAL);
+ return;
+ }
+
+ lov.add_xlock(&cur->policylock);
+ if (!mds->locker->acquire_locks(mdr, lov))
+ return;
+
+ auto &pi = cur->project_inode();
+ cur->set_export_pin(rank);
+ pip = &pi.inode;
+ } else {
+ dout(10) << " unknown vxattr " << name << dendl;
+ respond_to_request(mdr, -EINVAL);
+ return;
+ }
+ // common tail: stamp the projected inode, journal the update, reply from the finisher
+ pip->change_attr++;
+ pip->ctime = mdr->get_op_stamp();
+ if (mdr->get_op_stamp() > pip->rstat.rctime)
+ pip->rstat.rctime = mdr->get_op_stamp();
+ pip->version = cur->pre_dirty();
+ if (cur->is_file())
+ pip->update_backtrace();
+
+ // log + wait
+ mdr->ls = mdlog->get_current_segment();
+ EUpdate *le = new EUpdate(mdlog, "set vxattr layout"); // this event name is used for every vxattr kind, not only layouts
+ mdlog->start_entry(le);
+ le->metablob.add_client_req(req->get_reqid(), req->get_oldest_client_tid());
+ mdcache->predirty_journal_parents(mdr, &le->metablob, cur, 0, PREDIRTY_PRIMARY);
+ mdcache->journal_dirty_inode(mdr.get(), &le->metablob, cur);
+
+ journal_and_reply(mdr, cur, 0, le, new C_MDS_inode_update_finish(this, mdr, cur,
+ false, false, adjust_realm));
+ return;
+}
+
+/**
+ * Remove a ceph.* virtual xattr from @p cur.
+ *
+ *  - "ceph.dir.layout": clears the directory's layout and journals the
+ *    update.  Replies -ENODATA for non-directories and for directories
+ *    without a layout, -EINVAL for the root directory.
+ *  - "ceph.{dir,file}.layout.pool_namespace": forwarded to
+ *    handle_set_vxattr() with the request's (empty) payload.
+ *  - any other name: -ENODATA.
+ *
+ * @param mdr        the client request being served
+ * @param cur        inode the vxattr is removed from
+ * @param dir_layout layout pointer handed through to handle_set_vxattr()
+ * @param lov        lock vector built so far; extended and acquired here
+ */
+void Server::handle_remove_vxattr(MDRequestRef& mdr, CInode *cur,
+                                  file_layout_t *dir_layout,
+                                  MutationImpl::LockOpVec& lov)
+{
+  const MClientRequest::const_ref &req = mdr->client_request;
+  string name(req->get_path2());
+
+  dout(10) << __func__ << " " << name << " on " << *cur << dendl;
+
+  const bool is_pool_namespace =
+    name == "ceph.dir.layout.pool_namespace" ||
+    name == "ceph.file.layout.pool_namespace";
+  if (is_pool_namespace) {
+    // The namespace is the only layout field with a meaningful null/none
+    // value (the empty string means "default").  Removing it is therefore
+    // equivalent to a setxattr with an empty string, so the empty payload
+    // of this rmxattr request is simply passed through to the set path.
+    handle_set_vxattr(mdr, cur, dir_layout, lov);
+    return;
+  }
+
+  if (name != "ceph.dir.layout") {
+    respond_to_request(mdr, -ENODATA);
+    return;
+  }
+
+  // --- "ceph.dir.layout" ---
+  if (!cur->is_dir()) {
+    respond_to_request(mdr, -ENODATA);
+    return;
+  }
+  if (cur->is_root()) {
+    dout(10) << "can't remove layout policy on the root directory" << dendl;
+    respond_to_request(mdr, -EINVAL);
+    return;
+  }
+  if (!cur->get_projected_inode()->has_layout()) {
+    respond_to_request(mdr, -ENODATA);
+    return;
+  }
+
+  lov.add_xlock(&cur->policylock);
+  if (!mds->locker->acquire_locks(mdr, lov)) {
+    return;
+  }
+
+  // project the inode without its layout
+  auto &projected = cur->project_inode();
+  projected.inode.clear_layout();
+  projected.inode.version = cur->pre_dirty();
+
+  // log + wait
+  mdr->ls = mdlog->get_current_segment();
+  EUpdate *update = new EUpdate(mdlog, "remove dir layout vxattr");
+  mdlog->start_entry(update);
+  update->metablob.add_client_req(req->get_reqid(), req->get_oldest_client_tid());
+  mdcache->predirty_journal_parents(mdr, &update->metablob, cur, 0, PREDIRTY_PRIMARY);
+  mdcache->journal_dirty_inode(mdr.get(), &update->metablob, cur);
+
+  mdr->no_early_reply = true;
+  journal_and_reply(mdr, cur, 0, update,
+                    new C_MDS_inode_update_finish(this, mdr, cur));
+}
+
+/**
+ * Log-completion context for an inode xattr update: pops and dirties the
+ * projected inode, applies the request, records a write hit on the inode
+ * and replies to the client with 0.
+ */
+class C_MDS_inode_xattr_update_finish : public ServerLogContext {
+  CInode *in;
+
+public:
+  C_MDS_inode_xattr_update_finish(Server *srv, MDRequestRef& req, CInode *inode)
+    : ServerLogContext(srv, req), in(inode)
+  {}
+
+  void finish(int r) override {
+    ceph_assert(r == 0);
+
+    // apply the projected inode and mark it dirty in the request's segment
+    in->pop_and_dirty_projected_inode(mdr->ls);
+    mdr->apply();
+
+    get_mds()->balancer->hit_inode(in, META_POP_IWR);
+
+    server->respond_to_request(mdr, 0);
+  }
+};
+
+/**
+ * Handle a client setxattr request.
+ *
+ * Names in the "ceph." namespace are virtual xattrs and are delegated to
+ * handle_set_vxattr().  For ordinary xattrs the request is rejected with
+ *  - -EROFS   when mdr->snapid is not CEPH_NOSNAP,
+ *  - -ENOSPC  when the total key+value size would exceed
+ *             mds_max_xattr_pairs_size,
+ *  - -EEXIST  for CEPH_XATTR_CREATE on an existing name,
+ *  - -ENODATA for CEPH_XATTR_REPLACE on a missing name,
+ * otherwise the xattr map is projected, updated and journaled.
+ */
+void Server::handle_client_setxattr(MDRequestRef& mdr)
+{
+  const MClientRequest::const_ref &req = mdr->client_request;
+  string name(req->get_path2());
+  MutationImpl::LockOpVec lov;
+
+  // only the ceph.dir.layout* names ask the path lookup for a layout
+  file_layout_t *dir_layout = NULL;
+  const bool want_dir_layout = (name.compare(0, 15, "ceph.dir.layout") == 0);
+  CInode *cur = want_dir_layout
+    ? rdlock_path_pin_ref(mdr, 0, lov, true, false, &dir_layout)
+    : rdlock_path_pin_ref(mdr, 0, lov, true);
+  if (!cur) {
+    return;
+  }
+
+  if (mdr->snapid != CEPH_NOSNAP) {
+    respond_to_request(mdr, -EROFS);
+    return;
+  }
+
+  const int flags = req->head.args.setxattr.flags;
+
+  // magic ceph.* namespace?
+  if (name.compare(0, 5, "ceph.") == 0) {
+    handle_set_vxattr(mdr, cur, dir_layout, lov);
+    return;
+  }
+
+  lov.add_xlock(&cur->xattrlock);
+  if (!mds->locker->acquire_locks(mdr, lov)) {
+    return;
+  }
+
+  if (!check_access(mdr, cur, MAY_WRITE)) {
+    return;
+  }
+
+  auto pxattrs = cur->get_projected_xattrs();
+  const size_t len = req->get_data().length();
+  const size_t inc = len + name.length();
+
+  // Sum up the size of the existing kv pairs; with XATTR_REPLACE the pair
+  // that is about to be replaced does not count.
+  size_t cur_xattrs_size = 0;
+  for (const auto& kv : *pxattrs) {
+    const bool being_replaced =
+      (flags & CEPH_XATTR_REPLACE) && (name.compare(kv.first) == 0);
+    if (!being_replaced) {
+      cur_xattrs_size += kv.first.length() + kv.second.length();
+    }
+  }
+
+  if (((cur_xattrs_size + inc) > g_conf()->mds_max_xattr_pairs_size)) {
+    dout(10) << "xattr kv pairs size too big. cur_xattrs_size "
+             << cur_xattrs_size << ", inc " << inc << dendl;
+    respond_to_request(mdr, -ENOSPC);
+    return;
+  }
+
+  const mempool::mds_co::string key(name);
+  const bool exists = pxattrs->count(key) != 0;
+
+  if ((flags & CEPH_XATTR_CREATE) && exists) {
+    dout(10) << "setxattr '" << name << "' XATTR_CREATE and EEXIST on " << *cur << dendl;
+    respond_to_request(mdr, -EEXIST);
+    return;
+  }
+  if ((flags & CEPH_XATTR_REPLACE) && !exists) {
+    dout(10) << "setxattr '" << name << "' XATTR_REPLACE and ENODATA on " << *cur << dendl;
+    respond_to_request(mdr, -ENODATA);
+    return;
+  }
+
+  dout(10) << "setxattr '" << name << "' len " << len << " on " << *cur << dendl;
+
+  // project update (inode plus its xattr map)
+  auto &projected = cur->project_inode(true);
+  projected.inode.version = cur->pre_dirty();
+  projected.inode.ctime = mdr->get_op_stamp();
+  if (mdr->get_op_stamp() > projected.inode.rstat.rctime) {
+    projected.inode.rstat.rctime = mdr->get_op_stamp();
+  }
+  projected.inode.change_attr++;
+  projected.inode.xattr_version++;
+
+  auto &new_xattrs = *projected.xattrs;
+  if ((flags & CEPH_XATTR_REMOVE)) {
+    new_xattrs.erase(key);
+  } else {
+    bufferptr value = buffer::create(len);
+    if (len) {
+      req->get_data().copy(0, len, value.c_str());
+    }
+    auto inserted = new_xattrs.emplace(std::piecewise_construct,
+                                       std::forward_as_tuple(key),
+                                       std::forward_as_tuple(value));
+    if (!inserted.second) {
+      // name already present: overwrite the stored value
+      inserted.first->second = value;
+    }
+  }
+
+  // log + wait
+  mdr->ls = mdlog->get_current_segment();
+  EUpdate *update = new EUpdate(mdlog, "setxattr");
+  mdlog->start_entry(update);
+  update->metablob.add_client_req(req->get_reqid(), req->get_oldest_client_tid());
+  mdcache->predirty_journal_parents(mdr, &update->metablob, cur, 0, PREDIRTY_PRIMARY);
+  mdcache->journal_dirty_inode(mdr.get(), &update->metablob, cur);
+
+  journal_and_reply(mdr, cur, 0, update,
+                    new C_MDS_inode_update_finish(this, mdr, cur));
+}
+
+/**
+ * Handle a client removexattr request.
+ *
+ * Names in the "ceph." namespace are virtual xattrs and are delegated to
+ * handle_remove_vxattr().  For ordinary xattrs the request is answered
+ * with -EROFS when mdr->snapid is not CEPH_NOSNAP and with -ENODATA when
+ * the name is not present; otherwise the xattr map is projected, the
+ * entry erased and the update journaled.
+ */
+void Server::handle_client_removexattr(MDRequestRef& mdr)
+{
+  const MClientRequest::const_ref &req = mdr->client_request;
+  std::string name(req->get_path2());
+
+  MutationImpl::LockOpVec lov;
+  file_layout_t *dir_layout = nullptr;
+  CInode *cur;
+  // Request the directory layout for the whole ceph.dir.layout* family,
+  // using the same prefix test as handle_client_setxattr().
+  // handle_remove_vxattr() forwards dir_layout to handle_set_vxattr() for
+  // "ceph.dir.layout.pool_namespace"; an exact match on "ceph.dir.layout"
+  // here would always hand that path a null layout.
+  if (name.compare(0, 15, "ceph.dir.layout") == 0)
+    cur = rdlock_path_pin_ref(mdr, 0, lov, true, false, &dir_layout);
+  else
+    cur = rdlock_path_pin_ref(mdr, 0, lov, true);
+  if (!cur)
+    return;
+
+  if (mdr->snapid != CEPH_NOSNAP) {
+    respond_to_request(mdr, -EROFS);
+    return;
+  }
+
+  // magic ceph.* namespace?
+  if (name.compare(0, 5, "ceph.") == 0) {
+    handle_remove_vxattr(mdr, cur, dir_layout, lov);
+    return;
+  }
+
+  lov.add_xlock(&cur->xattrlock);
+  if (!mds->locker->acquire_locks(mdr, lov))
+    return;
+
+  auto pxattrs = cur->get_projected_xattrs();
+  if (pxattrs->count(mempool::mds_co::string(name)) == 0) {
+    dout(10) << "removexattr '" << name << "' and ENODATA on " << *cur << dendl;
+    respond_to_request(mdr, -ENODATA);
+    return;
+  }
+
+  dout(10) << "removexattr '" << name << "' on " << *cur << dendl;
+
+  // project update (inode plus its xattr map)
+  auto &pi = cur->project_inode(true);
+  auto &px = *pi.xattrs;
+  pi.inode.version = cur->pre_dirty();
+  pi.inode.ctime = mdr->get_op_stamp();
+  if (mdr->get_op_stamp() > pi.inode.rstat.rctime)
+    pi.inode.rstat.rctime = mdr->get_op_stamp();
+  pi.inode.change_attr++;
+  pi.inode.xattr_version++;
+  px.erase(mempool::mds_co::string(name));
+
+  // log + wait
+  mdr->ls = mdlog->get_current_segment();
+  EUpdate *le = new EUpdate(mdlog, "removexattr");
+  mdlog->start_entry(le);
+  le->metablob.add_client_req(req->get_reqid(), req->get_oldest_client_tid());
+  mdcache->predirty_journal_parents(mdr, &le->metablob, cur, 0, PREDIRTY_PRIMARY);
+  mdcache->journal_dirty_inode(mdr.get(), &le->metablob, cur);
+
+  journal_and_reply(mdr, cur, 0, le, new C_MDS_inode_update_finish(this, mdr, cur));
+}
+
+
+// =================================================================
+// DIRECTORY and NAMESPACE OPS
+
+
+// ------------------------------------------------
+
+// MKNOD
+
+/**
+ * Log-completion context shared by the inode-creating requests in this
+ * file (mknod, mkdir and symlink all construct it): links the new inode
+ * into its dentry, marks it (and, for a directory, its root dirfrag)
+ * dirty, notifies about the new dentry link and replies to the client.
+ */
+class C_MDS_mknod_finish : public ServerLogContext {
+  CDentry *dn;
+  CInode *newi;
+
+public:
+  C_MDS_mknod_finish(Server *srv, MDRequestRef& req, CDentry *dentry, CInode *inode)
+    : ServerLogContext(srv, req), dn(dentry), newi(inode)
+  {}
+
+  void finish(int r) override {
+    ceph_assert(r == 0);
+
+    // link the inode
+    dn->pop_projected_linkage();
+
+    // A bit hacky: no new inode version was projected because the inode
+    // has only just been created, so step the version back by one and
+    // let mark_dirty() move it forward again -- that keeps mark_dirty()
+    // happy.
+    --newi->inode.version;
+    newi->mark_dirty(newi->inode.version + 1, mdr->ls);
+    newi->mark_dirty_parent(mdr->ls, true);
+
+    // mkdir?  then the new (root) dirfrag gets the same treatment
+    if (newi->inode.is_dir()) {
+      CDir *newdir = newi->get_dirfrag(frag_t());
+      ceph_assert(newdir);
+      --newdir->fnode.version;
+      newdir->mark_dirty(newdir->fnode.version + 1, mdr->ls);
+      newdir->mark_new(mdr->ls);
+    }
+
+    mdr->apply();
+
+    MDRequestRef null_ref;
+    get_mds()->mdcache->send_dentry_link(dn, null_ref);
+
+    if (newi->inode.is_file()) {
+      get_mds()->locker->share_inode_max_size(newi);
+    }
+
+    // hit pop
+    get_mds()->balancer->hit_inode(newi, META_POP_IWR);
+
+    // reply
+    server->respond_to_request(mdr, 0);
+  }
+};
+
+
+/**
+ * Handle a client mknod request: create a new inode (a regular file when
+ * the requested mode carries no file-type bits) under an xlocked dentry
+ * and journal it.
+ *
+ * Replies -EROFS when mdr->snapid is not CEPH_NOSNAP.  Returns without a
+ * reply from here when locks, access or fragment-space checks do not pass.
+ */
+void Server::handle_client_mknod(MDRequestRef& mdr)
+{
+  const MClientRequest::const_ref &req = mdr->client_request;
+  client_t client = mdr->get_client();
+  MutationImpl::LockOpVec lov;
+  file_layout_t *dir_layout = nullptr;
+  CDentry *dn = rdlock_path_xlock_dentry(mdr, 0, lov, false, false, false,
+                                         &dir_layout);
+  if (!dn) {
+    return;
+  }
+  if (mdr->snapid != CEPH_NOSNAP) {
+    respond_to_request(mdr, -EROFS);
+    return;
+  }
+
+  CDir *dir = dn->get_dir();
+  CInode *diri = dir->get_inode();
+  lov.add_rdlock(&diri->authlock);
+  if (!mds->locker->acquire_locks(mdr, lov)) {
+    return;
+  }
+
+  if (!check_access(mdr, diri, MAY_WRITE)) {
+    return;
+  }
+
+  if (!check_fragment_space(mdr, dir)) {
+    return;
+  }
+
+  // no file type given: default to a regular file
+  unsigned mode = req->head.args.mknod.mode;
+  if ((mode & S_IFMT) == 0) {
+    mode |= S_IFREG;
+  }
+
+  // set layout: regular files take the directory layout when one was found
+  file_layout_t layout = (dir_layout && S_ISREG(mode))
+    ? *dir_layout
+    : mdcache->default_file_layout;
+
+  CInode *newi = prepare_new_inode(mdr, dir, inodeno_t(req->head.ino), mode, &layout);
+  ceph_assert(newi);
+
+  dn->push_projected_linkage(newi);
+
+  newi->inode.rdev = req->head.args.mknod.rdev;
+  newi->inode.version = dn->pre_dirty();
+  newi->inode.rstat.rfiles = 1;
+  if (layout.pool_id != mdcache->default_file_layout.pool_id) {
+    newi->inode.add_old_pool(mdcache->default_file_layout.pool_id);
+  }
+  newi->inode.update_backtrace();
+
+  snapid_t follows = mdcache->get_global_snaprealm()->get_newest_seq();
+  SnapRealm *realm = dir->inode->find_snaprealm();
+  ceph_assert(follows >= realm->get_newest_seq());
+
+  // If the client created a _regular_ file via MKNOD, it's highly likely
+  // they'll want to write to it (e.g., if they are reexporting NFS).
+  if (S_ISREG(newi->inode.mode)) {
+    // issue a cap on the file
+    Capability *cap = mds->locker->issue_new_caps(newi, CEPH_FILE_MODE_RDWR,
+                                                  mdr->session, realm,
+                                                  req->is_replay());
+    if (cap) {
+      cap->set_wanted(0);
+
+      // put locks in excl mode
+      newi->filelock.set_state(LOCK_EXCL);
+      newi->authlock.set_state(LOCK_EXCL);
+      newi->xattrlock.set_state(LOCK_EXCL);
+
+      dout(15) << " setting a client_range too, since this is a regular file" << dendl;
+      auto &cr = newi->inode.client_ranges[client];
+      cr.range.first = 0;
+      cr.range.last = newi->inode.get_layout_size_increment();
+      cr.follows = follows;
+      cap->mark_clientwriteable();
+    }
+  }
+
+  ceph_assert(dn->first == follows + 1);
+  newi->first = dn->first;
+
+  dout(10) << "mknod mode " << newi->inode.mode << " rdev " << newi->inode.rdev << dendl;
+
+  // prepare finisher
+  mdr->ls = mdlog->get_current_segment();
+  EUpdate *update = new EUpdate(mdlog, "mknod");
+  mdlog->start_entry(update);
+  update->metablob.add_client_req(req->get_reqid(), req->get_oldest_client_tid());
+  journal_allocated_inos(mdr, &update->metablob);
+
+  mdcache->predirty_journal_parents(mdr, &update->metablob, newi, dir,
+                                    PREDIRTY_PRIMARY|PREDIRTY_DIR, 1);
+  update->metablob.add_primary_dentry(dn, newi, true, true, true);
+
+  journal_and_reply(mdr, newi, dn, update,
+                    new C_MDS_mknod_finish(this, mdr, dn, newi));
+  mds->balancer->maybe_fragment(dir, false);
+}
+
+
+
+// MKDIR
+/**
+ * Handle a client mkdir request.  This function takes responsibility for
+ * the passed mdr.
+ *
+ * Replies -EEXIST when the last path component is "." or "..", and -EROFS
+ * when mdr->snapid is not CEPH_NOSNAP.  Otherwise a directory inode with
+ * an empty, complete root dirfrag is created and journaled.
+ */
+void Server::handle_client_mkdir(MDRequestRef& mdr)
+{
+  const MClientRequest::const_ref &req = mdr->client_request;
+  if (req->get_filepath().is_last_dot_or_dotdot()) {
+    respond_to_request(mdr, -EEXIST);
+    return;
+  }
+
+  MutationImpl::LockOpVec lov;
+  CDentry *dn = rdlock_path_xlock_dentry(mdr, 0, lov, false, false, false);
+  if (!dn) {
+    return;
+  }
+  if (mdr->snapid != CEPH_NOSNAP) {
+    respond_to_request(mdr, -EROFS);
+    return;
+  }
+
+  CDir *dir = dn->get_dir();
+  CInode *diri = dir->get_inode();
+  lov.add_rdlock(&diri->authlock);
+  if (!mds->locker->acquire_locks(mdr, lov)) {
+    return;
+  }
+
+  // mkdir check access
+  if (!check_access(mdr, diri, MAY_WRITE)) {
+    return;
+  }
+
+  if (!check_fragment_space(mdr, dir)) {
+    return;
+  }
+
+  // new inode: force the file type to "directory"
+  unsigned mode = (req->head.args.mkdir.mode & ~S_IFMT) | S_IFDIR;
+  CInode *newi = prepare_new_inode(mdr, dir, inodeno_t(req->head.ino), mode);
+  ceph_assert(newi);
+
+  // it's a directory.
+  dn->push_projected_linkage(newi);
+
+  newi->inode.version = dn->pre_dirty();
+  newi->inode.rstat.rsubdirs = 1;
+  newi->inode.update_backtrace();
+
+  snapid_t follows = mdcache->get_global_snaprealm()->get_newest_seq();
+  SnapRealm *realm = dir->inode->find_snaprealm();
+  ceph_assert(follows >= realm->get_newest_seq());
+
+  dout(12) << " follows " << follows << dendl;
+  ceph_assert(dn->first == follows + 1);
+  newi->first = dn->first;
+
+  // ...and that new dir is empty.
+  CDir *newdir = newi->get_or_open_dirfrag(mdcache, frag_t());
+  newdir->state_set(CDir::STATE_CREATING);
+  newdir->mark_complete();
+  newdir->fnode.version = newdir->pre_dirty();
+
+  // prepare finisher
+  mdr->ls = mdlog->get_current_segment();
+  EUpdate *update = new EUpdate(mdlog, "mkdir");
+  mdlog->start_entry(update);
+  update->metablob.add_client_req(req->get_reqid(), req->get_oldest_client_tid());
+  journal_allocated_inos(mdr, &update->metablob);
+  mdcache->predirty_journal_parents(mdr, &update->metablob, newi, dir,
+                                    PREDIRTY_PRIMARY|PREDIRTY_DIR, 1);
+  update->metablob.add_primary_dentry(dn, newi, true, true);
+  update->metablob.add_new_dir(newdir); // dirty AND complete AND new
+
+  // issue a cap on the directory
+  Capability *cap = mds->locker->issue_new_caps(newi, CEPH_FILE_MODE_RDWR,
+                                                mdr->session, realm,
+                                                req->is_replay());
+  if (cap) {
+    cap->set_wanted(0);
+
+    // put locks in excl mode
+    newi->filelock.set_state(LOCK_EXCL);
+    newi->authlock.set_state(LOCK_EXCL);
+    newi->xattrlock.set_state(LOCK_EXCL);
+  }
+
+  // make sure this inode gets into the journal
+  update->metablob.add_opened_ino(newi->ino());
+
+  journal_and_reply(mdr, newi, dn, update,
+                    new C_MDS_mknod_finish(this, mdr, dn, newi));
+
+  // We hit_dir (via hit_inode) in our finish callback, but by then we might
+  // have overshot the split size (multiple mkdir in flight), so here is
+  // an early chance to split the dir if this mkdir makes it oversized.
+  mds->balancer->maybe_fragment(dir, false);
+}
+
+
+// SYMLINK
+
+/**
+ * Handle a client symlink request: create an S_IFLNK inode whose target
+ * string is the request's second path, and journal it.
+ *
+ * Replies -EROFS when mdr->snapid is not CEPH_NOSNAP.
+ */
+void Server::handle_client_symlink(MDRequestRef& mdr)
+{
+  const MClientRequest::const_ref &req = mdr->client_request;
+  MutationImpl::LockOpVec lov;
+  CDentry *dn = rdlock_path_xlock_dentry(mdr, 0, lov, false, false, false);
+  if (!dn) {
+    return;
+  }
+  if (mdr->snapid != CEPH_NOSNAP) {
+    respond_to_request(mdr, -EROFS);
+    return;
+  }
+
+  CDir *dir = dn->get_dir();
+  CInode *diri = dir->get_inode();
+  lov.add_rdlock(&diri->authlock);
+  if (!mds->locker->acquire_locks(mdr, lov)) {
+    return;
+  }
+
+  if (!check_access(mdr, diri, MAY_WRITE)) {
+    return;
+  }
+
+  if (!check_fragment_space(mdr, dir)) {
+    return;
+  }
+
+  const unsigned mode = S_IFLNK | 0777;
+  CInode *newi = prepare_new_inode(mdr, dir, inodeno_t(req->head.ino), mode);
+  ceph_assert(newi);
+
+  // it's a symlink
+  dn->push_projected_linkage(newi);
+
+  // the link target doubles as the inode's size / rbytes
+  newi->symlink = req->get_path2();
+  newi->inode.size = newi->symlink.length();
+  newi->inode.rstat.rbytes = newi->inode.size;
+  newi->inode.rstat.rfiles = 1;
+  newi->inode.version = dn->pre_dirty();
+  newi->inode.update_backtrace();
+
+  newi->first = dn->first;
+
+  // prepare finisher
+  mdr->ls = mdlog->get_current_segment();
+  EUpdate *update = new EUpdate(mdlog, "symlink");
+  mdlog->start_entry(update);
+  update->metablob.add_client_req(req->get_reqid(), req->get_oldest_client_tid());
+  journal_allocated_inos(mdr, &update->metablob);
+  mdcache->predirty_journal_parents(mdr, &update->metablob, newi, dir,
+                                    PREDIRTY_PRIMARY|PREDIRTY_DIR, 1);
+  update->metablob.add_primary_dentry(dn, newi, true, true);
+
+  journal_and_reply(mdr, newi, dn, update,
+                    new C_MDS_mknod_finish(this, mdr, dn, newi));
+  mds->balancer->maybe_fragment(dir, false);
+}
+
+
+
+
+
+// LINK
+/*
+ * Handle a client request to create a hard link.
+ *
+ * The new dentry is obtained (xlocked) with rdlock_path_xlock_dentry(mdr, 0, ...)
+ * and the existing target inode with rdlock_path_pin_ref(mdr, 1, ...).
+ *
+ * Replies:
+ *   -EROFS  when mdr->snapid is not CEPH_NOSNAP
+ *   -EINVAL when the target is a directory (see the trace check below)
+ *   -EXDEV  when the target's parent and the new dentry's directory report
+ *           different subvolume inos
+ *
+ * The link itself is done by _link_local() when this MDS is auth for the
+ * target inode, otherwise by _link_remote().
+ */
+void Server::handle_client_link(MDRequestRef& mdr)
+{
+ const MClientRequest::const_ref &req = mdr->client_request;
+
+ dout(7) << "handle_client_link " << req->get_filepath()
+ << " to " << req->get_filepath2()
+ << dendl;
+
+ MutationImpl::LockOpVec lov;
+
+ CDentry *dn = rdlock_path_xlock_dentry(mdr, 0, lov, false, false, false);
+ if (!dn) return;
+ CInode *targeti = rdlock_path_pin_ref(mdr, 1, lov, false);
+ if (!targeti) return;
+ if (mdr->snapid != CEPH_NOSNAP) {
+ respond_to_request(mdr, -EROFS);
+ return;
+ }
+
+ CDir *dir = dn->get_dir();
+ dout(7) << "handle_client_link link " << dn->get_name() << " in " << *dir << dendl;
+ dout(7) << "target is " << *targeti << dendl;
+ if (targeti->is_dir()) {
+ // if srcdn is replica, need to make sure its linkage is correct
+ vector<CDentry*>& trace = mdr->dn[1];
+ // fail right away unless the last traced dentry is a non-auth one whose
+ // lock the client cannot currently read
+ if (trace.empty() ||
+ trace.back()->is_auth() ||
+ trace.back()->lock.can_read(mdr->get_client())) {
+ dout(7) << "target is a dir, failing..." << dendl;
+ respond_to_request(mdr, -EINVAL);
+ return;
+ }
+ }
+
+ // replace the target's snaplock rdlock by an xlock, and xlock its linklock too
+ lov.erase_rdlock(&targeti->snaplock);
+ lov.add_xlock(&targeti->snaplock);
+ lov.add_xlock(&targeti->linklock);
+
+ if (!mds->locker->acquire_locks(mdr, lov))
+ return;
+
+ // access / space checks are only run while no peer has witnessed the request
+ if ((!mdr->has_more() || mdr->more()->witnessed.empty())) {
+ if (!check_access(mdr, targeti, MAY_WRITE))
+ return;
+
+ if (!check_access(mdr, dir->get_inode(), MAY_WRITE))
+ return;
+
+ if (!check_fragment_space(mdr, dir))
+ return;
+ }
+
+ // refuse links that would cross a subvolume boundary
+ CInode* target_pin = targeti->get_projected_parent_dir()->inode;
+ SnapRealm *target_realm = target_pin->find_snaprealm();
+ if (target_pin != dir->inode &&
+ target_realm->get_subvolume_ino() !=
+ dir->inode->find_snaprealm()->get_subvolume_ino()) {
+ dout(7) << "target is in different subvolume, failing..." << dendl;
+ respond_to_request(mdr, -EXDEV);
+ return;
+ }
+
+ // go!
+ ceph_assert(g_conf()->mds_kill_link_at != 1); // NOTE(review): presumably a fault-injection point for tests -- confirm
+
+ // local or remote?
+ if (targeti->is_auth())
+ _link_local(mdr, dn, targeti, target_realm);
+ else
+ _link_remote(mdr, true, dn, targeti);
+ mds->balancer->maybe_fragment(dir, false);
+}
+
+
+/**
+ * Log-completion context for _link_local(): carries the dentry, target
+ * inode, both pre-dirtied versions and the adjust_realm flag through to
+ * Server::_link_local_finish().
+ */
+class C_MDS_link_local_finish : public ServerLogContext {
+  CDentry *dn;
+  CInode *targeti;
+  version_t dnpv;   // pre-dirtied version of the new dentry
+  version_t tipv;   // pre-dirtied version of the target inode
+  bool adjust_realm;
+
+public:
+  C_MDS_link_local_finish(Server *srv, MDRequestRef& req,
+                          CDentry *dentry, CInode *target,
+                          version_t dentry_pv, version_t target_pv,
+                          bool realm_adjusted)
+    : ServerLogContext(srv, req),
+      dn(dentry),
+      targeti(target),
+      dnpv(dentry_pv),
+      tipv(target_pv),
+      adjust_realm(realm_adjusted)
+  {}
+
+  void finish(int r) override {
+    ceph_assert(r == 0);
+    server->_link_local_finish(mdr, dn, targeti, dnpv, tipv, adjust_realm);
+  }
+};
+
+
+/**
+ * Create a hard link whose target inode is authoritative on this MDS.
+ *
+ * Projects the target inode with nlink + 1, projects a global snaprealm
+ * on it when the target's realm has no subvolume ino and the inode is not
+ * already projected as global, and journals a "link_local" EUpdate that
+ * adds @p dn as a remote dentry.
+ *
+ * @param mdr          the client request
+ * @param dn           the new (xlocked) dentry
+ * @param targeti      inode being linked to
+ * @param target_realm snaprealm of the target's parent directory inode
+ */
+void Server::_link_local(MDRequestRef& mdr, CDentry *dn, CInode *targeti, SnapRealm *target_realm)
+{
+  dout(10) << "_link_local " << *dn << " to " << *targeti << dendl;
+
+  mdr->ls = mdlog->get_current_segment();
+
+  // predirty NEW dentry, then the target inode
+  const version_t dnpv = dn->pre_dirty();
+  const version_t tipv = targeti->pre_dirty();
+
+  // project inode update
+  auto &projected = targeti->project_inode();
+  projected.inode.nlink++;
+  projected.inode.ctime = mdr->get_op_stamp();
+  if (mdr->get_op_stamp() > projected.inode.rstat.rctime) {
+    projected.inode.rstat.rctime = mdr->get_op_stamp();
+  }
+  projected.inode.change_attr++;
+  projected.inode.version = tipv;
+
+  bool adjust_realm = false;
+  const bool needs_global_realm =
+    !target_realm->get_subvolume_ino() &&
+    !targeti->is_projected_snaprealm_global();
+  if (needs_global_realm) {
+    sr_t *newsnap = targeti->project_snaprealm();
+    targeti->mark_snaprealm_global(newsnap);
+    targeti->record_snaprealm_parent_dentry(newsnap, target_realm,
+                                            targeti->get_projected_parent_dn(),
+                                            true);
+    adjust_realm = true;
+  }
+
+  // log + wait
+  EUpdate *update = new EUpdate(mdlog, "link_local");
+  mdlog->start_entry(update);
+  update->metablob.add_client_req(mdr->reqid, mdr->client_request->get_oldest_client_tid());
+  // new dn
+  mdcache->predirty_journal_parents(mdr, &update->metablob, targeti, dn->get_dir(), PREDIRTY_DIR, 1);
+  // targeti
+  mdcache->predirty_journal_parents(mdr, &update->metablob, targeti, 0, PREDIRTY_PRIMARY);
+  // new remote
+  update->metablob.add_remote_dentry(dn, true, targeti->ino(), targeti->d_type());
+  mdcache->journal_dirty_inode(mdr.get(), &update->metablob, targeti);
+
+  // do this after predirty_*, to avoid funky extra dnl arg
+  dn->push_projected_linkage(targeti->ino(), targeti->d_type());
+
+  journal_and_reply(mdr, targeti, dn, update,
+                    new C_MDS_link_local_finish(this, mdr, dn, targeti,
+                                                dnpv, tipv, adjust_realm));
+}
+
+/**
+ * Completion of _link_local() once the EUpdate is journaled: applies the
+ * projected dentry linkage and target inode, sends the dentry link, sends
+ * the snap-realm split notifications when @p adjust_realm is set, and
+ * replies to the client.
+ *
+ * @param dnpv version the new dentry is marked dirty with
+ * @param tipv pre-dirtied target inode version (not used in the body)
+ */
+void Server::_link_local_finish(MDRequestRef& mdr, CDentry *dn, CInode *targeti,
+                                version_t dnpv, version_t tipv, bool adjust_realm)
+{
+  dout(10) << "_link_local_finish " << *dn << " to " << *targeti << dendl;
+
+  // link and unlock the NEW dentry
+  CDentry::linkage_t *linkage = dn->pop_projected_linkage();
+  if (!linkage->get_inode()) {
+    dn->link_remote(linkage, targeti);
+  }
+  dn->mark_dirty(dnpv, mdr->ls);
+
+  // target inode
+  targeti->pop_and_dirty_projected_inode(mdr->ls);
+
+  mdr->apply();
+
+  MDRequestRef null_ref;
+  mdcache->send_dentry_link(dn, null_ref);
+
+  if (adjust_realm) {
+    const int snap_op = CEPH_SNAP_OP_SPLIT;
+    mds->mdcache->send_snap_update(targeti, 0, snap_op);
+    mds->mdcache->do_realm_invalidate_and_update_notify(targeti, snap_op);
+  }
+
+  // bump target popularity
+  mds->balancer->hit_inode(targeti, META_POP_IWR);
+  mds->balancer->hit_dir(dn->get_dir(), META_POP_IWR);
+
+  // reply
+  respond_to_request(mdr, 0);
+}
+
+
+// link / unlink remote
+
+/**
+ * Log-completion context for _link_remote().  The dentry's projected
+ * version is captured at construction time and handed to
+ * Server::_link_remote_finish() together with the link/unlink flag.
+ */
+class C_MDS_link_remote_finish : public ServerLogContext {
+  bool inc;         // true: link, false: unlink
+  CDentry *dn;
+  CInode *targeti;
+  version_t dpv;    // dn->get_projected_version() at construction
+
+public:
+  C_MDS_link_remote_finish(Server *srv, MDRequestRef& req, bool is_link,
+                           CDentry *dentry, CInode *target)
+    : ServerLogContext(srv, req),
+      inc(is_link),
+      dn(dentry),
+      targeti(target),
+      dpv(dentry->get_projected_version())
+  {}
+
+  void finish(int r) override {
+    ceph_assert(r == 0);
+    server->_link_remote_finish(mdr, inc, dn, targeti, dpv);
+  }
+};
+/*
+ * Master side of a link (inc == true) or unlink (inc == false) of a
+ * dentry whose target inode has its authority on another rank
+ * (targeti->authority().first).
+ *
+ * While that rank is not in mdr->more()->witnessed, this only sends it an
+ * OP_LINKPREP / OP_UNLINKPREP slave request (or waits for the peer to
+ * become active) and returns.  Once the rank has witnessed the request,
+ * the local dentry change is journaled in an EUpdate and completed through
+ * C_MDS_link_remote_finish.
+ */
+void Server::_link_remote(MDRequestRef& mdr, bool inc, CDentry *dn, CInode *targeti)
+{
+ dout(10) << "_link_remote "
+ << (inc ? "link ":"unlink ")
+ << *dn << " to " << *targeti << dendl;
+
+ // 1. send LinkPrepare to dest (journal nlink++ prepare)
+ mds_rank_t linkauth = targeti->authority().first;
+ if (mdr->more()->witnessed.count(linkauth) == 0) {
+ if (mds->is_cluster_degraded() &&
+ !mds->mdsmap->is_clientreplay_or_active_or_stopping(linkauth)) {
+ dout(10) << " targeti auth mds." << linkauth << " is not active" << dendl;
+ if (mdr->more()->waiting_on_slave.empty()) // only queue a retry when no slave reply is outstanding
+ mds->wait_for_active_peer(linkauth, new C_MDS_RetryRequest(mdcache, mdr));
+ return;
+ }
+
+ dout(10) << " targeti auth must prepare nlink++/--" << dendl;
+ int op;
+ if (inc)
+ op = MMDSSlaveRequest::OP_LINKPREP;
+ else
+ op = MMDSSlaveRequest::OP_UNLINKPREP;
+ auto req = MMDSSlaveRequest::create(mdr->reqid, mdr->attempt, op);
+ targeti->set_object_info(req->get_object_info());
+ req->op_stamp = mdr->get_op_stamp();
+ if (auto& desti_srnode = mdr->more()->desti_srnode) // ship the prepared snaprealm node, if any
+ encode(*desti_srnode, req->desti_snapbl);
+ mds->send_message_mds(req, linkauth);
+
+ ceph_assert(mdr->more()->waiting_on_slave.count(linkauth) == 0);
+ mdr->more()->waiting_on_slave.insert(linkauth);
+ return;
+ }
+ dout(10) << " targeti auth has prepared nlink++/--" << dendl;
+
+ ceph_assert(g_conf()->mds_kill_link_at != 2); // NOTE(review): presumably a fault-injection point for tests -- confirm
+
+ // the snaprealm node was only needed for the slave request; free it
+ if (auto& desti_srnode = mdr->more()->desti_srnode) {
+ delete desti_srnode;
+ desti_srnode = NULL;
+ }
+
+ mdr->set_mds_stamp(ceph_clock_now());
+
+ // add to event
+ mdr->ls = mdlog->get_current_segment();
+ EUpdate *le = new EUpdate(mdlog, inc ? "link_remote":"unlink_remote");
+ mdlog->start_entry(le);
+ le->metablob.add_client_req(mdr->reqid, mdr->client_request->get_oldest_client_tid());
+ if (!mdr->more()->witnessed.empty()) {
+ dout(20) << " noting uncommitted_slaves " << mdr->more()->witnessed << dendl;
+ le->reqid = mdr->reqid;
+ le->had_slaves = true;
+ mdcache->add_uncommitted_master(mdr->reqid, mdr->ls, mdr->more()->witnessed);
+ }
+
+ if (inc) {
+ // link: journal dn as a new remote dentry pointing at targeti
+ dn->pre_dirty();
+ mdcache->predirty_journal_parents(mdr, &le->metablob, targeti, dn->get_dir(), PREDIRTY_DIR, 1);
+ le->metablob.add_remote_dentry(dn, true, targeti->ino(), targeti->d_type()); // new remote
+ dn->push_projected_linkage(targeti->ino(), targeti->d_type());
+ } else {
+ // unlink: journal dn as a null dentry
+ dn->pre_dirty();
+ mdcache->predirty_journal_parents(mdr, &le->metablob, targeti, dn->get_dir(), PREDIRTY_DIR, -1);
+ mdcache->journal_cow_dentry(mdr.get(), &le->metablob, dn);
+ le->metablob.add_null_dentry(dn, true);
+ dn->push_projected_linkage();
+ }
+
+ journal_and_reply(mdr, targeti, dn, le, new C_MDS_link_remote_finish(this, mdr, inc, dn, targeti));
+}
+
+/**
+ * Completion of _link_remote() once the EUpdate is journaled.
+ *
+ * For a link (@p inc true) the projected remote linkage is applied and
+ * the dentry marked dirty with @p dpv; for an unlink the dentry is
+ * unlinked from its directory and marked dirty with its projected
+ * version.  Replicas are notified, popularity is bumped and the client
+ * gets its reply.  After an unlink the dentry is finally offered to
+ * try_remove_unlinked_dn().
+ */
+void Server::_link_remote_finish(MDRequestRef& mdr, bool inc,
+                                 CDentry *dn, CInode *targeti,
+                                 version_t dpv)
+{
+  dout(10) << "_link_remote_finish "
+           << (inc ? "link ":"unlink ")
+           << *dn << " to " << *targeti << dendl;
+
+  ceph_assert(g_conf()->mds_kill_link_at != 3);
+
+  if (!mdr->more()->witnessed.empty()) {
+    mdcache->logged_master_update(mdr->reqid);
+  }
+
+  if (!inc) {
+    // unlink main dentry
+    dn->get_dir()->unlink_inode(dn);
+    dn->pop_projected_linkage();
+    dn->mark_dirty(dn->get_projected_version(), mdr->ls);  // dirty old dentry
+  } else {
+    // link the new dentry
+    CDentry::linkage_t *linkage = dn->pop_projected_linkage();
+    if (!linkage->get_inode()) {
+      dn->link_remote(linkage, targeti);
+    }
+    dn->mark_dirty(dpv, mdr->ls);
+  }
+
+  mdr->apply();
+
+  MDRequestRef null_ref;
+  if (!inc) {
+    mdcache->send_dentry_unlink(dn, NULL, null_ref);
+  } else {
+    mdcache->send_dentry_link(dn, null_ref);
+  }
+
+  // bump target popularity
+  mds->balancer->hit_inode(targeti, META_POP_IWR);
+  mds->balancer->hit_dir(dn->get_dir(), META_POP_IWR);
+
+  // reply
+  respond_to_request(mdr, 0);
+
+  if (!inc) {
+    // removing a new dn?
+    dn->get_dir()->try_remove_unlinked_dn(dn);
+  }
+}
+
+
+// remote linking/unlinking
+
+/**
+ * Log-completion context for the slave's link-prepare journal entry;
+ * forwards to Server::_logged_slave_link().
+ */
+class C_MDS_SlaveLinkPrep : public ServerLogContext {
+  CInode *targeti;
+  bool adjust_realm;
+
+public:
+  C_MDS_SlaveLinkPrep(Server *srv, MDRequestRef& req, CInode *target, bool realm_adjusted)
+    : ServerLogContext(srv, req), targeti(target), adjust_realm(realm_adjusted)
+  {}
+
+  void finish(int r) override {
+    ceph_assert(r == 0);
+    server->_logged_slave_link(mdr, targeti, adjust_realm);
+  }
+};
+
+/**
+ * Slave commit callback for a prepared link/unlink; passes the result
+ * code straight on to Server::_commit_slave_link().
+ */
+class C_MDS_SlaveLinkCommit : public ServerContext {
+  MDRequestRef mdr;
+  CInode *targeti;
+
+public:
+  C_MDS_SlaveLinkCommit(Server *srv, MDRequestRef& req, CInode *target)
+    : ServerContext(srv), mdr(req), targeti(target)
+  {}
+
+  void finish(int r) override {
+    server->_commit_slave_link(mdr, r, targeti);
+  }
+};
+/*
+ * Slave side of a remote link/unlink: prepare the nlink change on the
+ * target inode named in the slave request.
+ *
+ * Projects the inode with nlink incremented (OP_LINKPREP) or decremented
+ * (any other op), builds a link_rollback blob describing the old state,
+ * journals an ESlaveUpdate OP_PREPARE entry and registers
+ * C_MDS_SlaveLinkCommit as the slave commit callback.  The journal entry
+ * is submitted with a C_MDS_SlaveLinkPrep context, which calls
+ * _logged_slave_link().
+ */
+void Server::handle_slave_link_prep(MDRequestRef& mdr)
+{
+ dout(10) << "handle_slave_link_prep " << *mdr
+ << " on " << mdr->slave_request->get_object_info()
+ << dendl;
+
+ ceph_assert(g_conf()->mds_kill_link_at != 4);
+
+ CInode *targeti = mdcache->get_inode(mdr->slave_request->get_object_info().ino);
+ ceph_assert(targeti);
+ dout(10) << "targeti " << *targeti << dendl;
+ CDentry *dn = targeti->get_parent_dn();
+ CDentry::linkage_t *dnl = dn->get_linkage();
+ ceph_assert(dnl->is_primary());
+
+ mdr->set_op_stamp(mdr->slave_request->op_stamp); // use the stamp supplied in the slave request
+
+ mdr->auth_pin(targeti);
+
+ //ceph_abort(); // test hack: make sure master can handle a slave that fails to prepare...
+ ceph_assert(g_conf()->mds_kill_link_at != 5);
+
+ // journal it
+ mdr->ls = mdlog->get_current_segment();
+ ESlaveUpdate *le = new ESlaveUpdate(mdlog, "slave_link_prep", mdr->reqid, mdr->slave_to_mds,
+ ESlaveUpdate::OP_PREPARE, ESlaveUpdate::LINK);
+ mdlog->start_entry(le);
+
+ auto &pi = dnl->get_inode()->project_inode();
+
+ // update journaled target inode
+ bool inc;
+ bool adjust_realm = false;
+ bool realm_projected = false;
+ if (mdr->slave_request->get_op() == MMDSSlaveRequest::OP_LINKPREP) {
+ inc = true;
+ pi.inode.nlink++;
+
+ // same global-snaprealm projection as in _link_local()
+ CDentry *target_pdn = targeti->get_projected_parent_dn();
+ SnapRealm *target_realm = target_pdn->get_dir()->inode->find_snaprealm();
+ if (!target_realm->get_subvolume_ino() && !targeti->is_projected_snaprealm_global()) {
+ sr_t *newsnap = targeti->project_snaprealm();
+ targeti->mark_snaprealm_global(newsnap);
+ targeti->record_snaprealm_parent_dentry(newsnap, target_realm, target_pdn, true);
+ adjust_realm = true;
+ realm_projected = true;
+ }
+ } else {
+ inc = false;
+ pi.inode.nlink--;
+ if (targeti->is_projected_snaprealm_global()) {
+ // the request must carry the new snaprealm node; decode it into the projection
+ ceph_assert(mdr->slave_request->desti_snapbl.length());
+ auto p = mdr->slave_request->desti_snapbl.cbegin();
+
+ sr_t *newsnap = targeti->project_snaprealm();
+ decode(*newsnap, p);
+
+ if (pi.inode.nlink == 0)
+ ceph_assert(!newsnap->is_parent_global());
+
+ realm_projected = true;
+ } else {
+ ceph_assert(mdr->slave_request->desti_snapbl.length() == 0);
+ }
+ }
+
+ // capture the pre-update state for a possible rollback
+ link_rollback rollback;
+ rollback.reqid = mdr->reqid;
+ rollback.ino = targeti->ino();
+ rollback.old_ctime = targeti->inode.ctime; // we hold versionlock xlock; no concurrent projections
+ const fnode_t *pf = targeti->get_parent_dn()->get_dir()->get_projected_fnode();
+ rollback.old_dir_mtime = pf->fragstat.mtime;
+ rollback.old_dir_rctime = pf->rstat.rctime;
+ rollback.was_inc = inc;
+ if (realm_projected) {
+ // snapbl: a bool saying whether a snaprealm existed, followed by its blob if so
+ if (targeti->snaprealm) {
+ encode(true, rollback.snapbl);
+ targeti->encode_snap_blob(rollback.snapbl);
+ } else {
+ encode(false, rollback.snapbl);
+ }
+ }
+ encode(rollback, le->rollback);
+ mdr->more()->rollback_bl = le->rollback; // kept on the request; _commit_slave_link() passes it to do_link_rollback()
+
+ pi.inode.ctime = mdr->get_op_stamp();
+ pi.inode.version = targeti->pre_dirty();
+
+ dout(10) << " projected inode " << pi.inode.ino << " v " << pi.inode.version << dendl;
+
+ // commit case
+ mdcache->predirty_journal_parents(mdr, &le->commit, dnl->get_inode(), 0, PREDIRTY_SHALLOW|PREDIRTY_PRIMARY);
+ mdcache->journal_dirty_inode(mdr.get(), &le->commit, targeti);
+ mdcache->add_uncommitted_slave(mdr->reqid, mdr->ls, mdr->slave_to_mds);
+
+ // set up commit waiter
+ mdr->more()->slave_commit = new C_MDS_SlaveLinkCommit(this, mdr, targeti);
+
+ mdr->more()->slave_update_journaled = true;
+ submit_mdlog_entry(le, new C_MDS_SlaveLinkPrep(this, mdr, targeti, adjust_realm),
+ mdr, __func__);
+ mdlog->flush();
+}
+
+ /** _logged_slave_link
+  *
+  * runs once the slave link prepare has been journaled: make the projected
+  * inode live, optionally announce the snaprealm split, then either send
+  * OP_LINKPREPACK to mdr->slave_to_mds or, if the request was aborted,
+  * finish it locally.
+  */
+ void Server::_logged_slave_link(MDRequestRef& mdr, CInode *targeti, bool adjust_realm)
+ {
+   dout(10) << "_logged_slave_link " << *mdr << " " << *targeti << dendl;
+
+   ceph_assert(g_conf()->mds_kill_link_at != 6);
+
+   // apply the projected change to the target
+   targeti->pop_and_dirty_projected_inode(mdr->ls);
+   mdr->apply();
+
+   // record the write with the balancer
+   mds->balancer->hit_inode(targeti, META_POP_IWR);
+
+   // drop the slave request
+   mdr->reset_slave_request();
+
+   if (adjust_realm) {
+     const int snap_op = CEPH_SNAP_OP_SPLIT;
+     mds->mdcache->send_snap_update(targeti, 0, snap_op);
+     mds->mdcache->do_realm_invalidate_and_update_notify(targeti, snap_op);
+   }
+
+   if (mdr->aborted) {
+     dout(10) << " abort flag set, finishing" << dendl;
+     mdcache->request_finish(mdr);
+     return;
+   }
+
+   // ack
+   auto ack = MMDSSlaveRequest::create(mdr->reqid, mdr->attempt, MMDSSlaveRequest::OP_LINKPREPACK);
+   mds->send_message_mds(ack, mdr->slave_to_mds);
+ }
+
+
+ // log context: calls Server::_committed_slave() for the request once the
+ // journal entry it was submitted with completes.
+ struct C_MDS_CommittedSlave : public ServerLogContext {
+   C_MDS_CommittedSlave(Server *srv, MDRequestRef& req)
+     : ServerLogContext(srv, req) {}
+
+   void finish(int r) override { server->_committed_slave(mdr); }
+ };
+
+ /** _commit_slave_link
+  *
+  * commit waiter for a slave link prepare.  r == 0 journals an OP_COMMIT
+  * ESlaveUpdate (completion runs _committed_slave); any other r rolls the
+  * prepared update back from the saved rollback blob.
+  */
+ void Server::_commit_slave_link(MDRequestRef& mdr, int r, CInode *targeti)
+ {
+   dout(10) << "_commit_slave_link " << *mdr
+            << " r=" << r
+            << " " << *targeti << dendl;
+
+   ceph_assert(g_conf()->mds_kill_link_at != 7);
+
+   if (r != 0) {
+     do_link_rollback(mdr->more()->rollback_bl, mdr->slave_to_mds, mdr);
+     return;
+   }
+
+   // drop our pins, etc.
+   mdr->cleanup();
+
+   // write a commit to the journal
+   ESlaveUpdate *commit_ev = new ESlaveUpdate(mdlog, "slave_link_commit", mdr->reqid, mdr->slave_to_mds,
+                                              ESlaveUpdate::OP_COMMIT, ESlaveUpdate::LINK);
+   mdlog->start_entry(commit_ev);
+   submit_mdlog_entry(commit_ev, new C_MDS_CommittedSlave(this, mdr), mdr, __func__);
+   mdlog->flush();
+ }
+
+ /** _committed_slave
+  *
+  * final step of a slave commit: retire the uncommitted-slave record,
+  * send OP_COMMITTED to mdr->slave_to_mds and finish the request.
+  */
+ void Server::_committed_slave(MDRequestRef& mdr)
+ {
+   dout(10) << "_committed_slave " << *mdr << dendl;
+
+   ceph_assert(g_conf()->mds_kill_link_at != 8);
+
+   // slave_update_journaled doubles as the "assert the record exists" flag
+   const bool journaled = mdr->more()->slave_update_journaled;
+   mdcache->finish_uncommitted_slave(mdr->reqid, journaled);
+
+   auto committed = MMDSSlaveRequest::create(mdr->reqid, mdr->attempt, MMDSSlaveRequest::OP_COMMITTED);
+   mds->send_message_mds(committed, mdr->slave_to_mds);
+
+   mdcache->request_finish(mdr);
+ }
+
+ // log context for a journaled link rollback: keeps the rollback mutation
+ // and the pending client snap messages, and hands both to
+ // Server::_link_rollback_finish() on completion.
+ struct C_MDS_LoggedLinkRollback : public ServerLogContext {
+   MutationRef mut;
+   map<client_t,MClientSnap::ref> splits;
+
+   C_MDS_LoggedLinkRollback(Server *srv, MutationRef& mutation, MDRequestRef& req,
+                            map<client_t,MClientSnap::ref>&& pending_splits)
+     : ServerLogContext(srv, req),
+       mut(mutation),
+       splits(std::move(pending_splits)) {}
+
+   void finish(int r) override {
+     server->_link_rollback_finish(mut, mdr, splits);
+   }
+ };
+
+ /** do_link_rollback
+ *
+ * roll back a prepared slave link/unlink update described by the
+ * encoded link_rollback in rbl: restore the target inode's ctime and
+ * nlink, restore the parent dir's mtime/rctime where they still match
+ * the inode's ctime, restore the snaprealm if a snap blob was recorded,
+ * and journal an ESlaveUpdate OP_ROLLBACK.  the projected changes are
+ * applied by _link_rollback_finish() once the entry is logged.
+ *
+ * rbl    - encoded link_rollback
+ * master - rank of the master mds; recorded with the rollback and in
+ *          the journal entry
+ * mdr    - the slave request; may be null only while in resolve
+ *          (asserted below)
+ */
+ void Server::do_link_rollback(bufferlist &rbl, mds_rank_t master, MDRequestRef& mdr)
+ {
+ link_rollback rollback;
+ auto p = rbl.cbegin();
+ decode(rollback, p);
+
+ dout(10) << "do_link_rollback on " << rollback.reqid
+ << (rollback.was_inc ? " inc":" dec")
+ << " ino " << rollback.ino
+ << dendl;
+
+ ceph_assert(g_conf()->mds_kill_link_at != 9);
+
+ mdcache->add_rollback(rollback.reqid, master); // need to finish this update before resolve finishes
+ ceph_assert(mdr || mds->is_resolve());
+
+ // separate mutation that carries the projected inode/fnode of the rollback
+ MutationRef mut(new MutationImpl(nullptr, utime_t(), rollback.reqid));
+ mut->ls = mds->mdlog->get_current_segment();
+
+ CInode *in = mdcache->get_inode(rollback.ino);
+ ceph_assert(in);
+ dout(10) << " target is " << *in << dendl;
+ ceph_assert(!in->is_projected()); // live slave request hold versionlock xlock.
+
+ auto &pi = in->project_inode();
+ pi.inode.version = in->pre_dirty();
+ mut->add_projected_inode(in);
+
+ // parent dir rctime
+ CDir *parent = in->get_projected_parent_dn()->get_dir();
+ fnode_t *pf = parent->project_fnode();
+ mut->add_projected_fnode(parent);
+ pf->version = parent->pre_dirty();
+ // pi.inode.ctime has not been restored yet, so this only rewinds the dir
+ // times when they still equal the inode's current ctime.
+ if (pf->fragstat.mtime == pi.inode.ctime) {
+ pf->fragstat.mtime = rollback.old_dir_mtime;
+ if (pf->rstat.rctime == pi.inode.ctime)
+ pf->rstat.rctime = rollback.old_dir_rctime;
+ mut->add_updated_lock(&parent->get_inode()->filelock);
+ mut->add_updated_lock(&parent->get_inode()->nestlock);
+ }
+
+ // inode
+ pi.inode.ctime = rollback.old_ctime;
+ // undo the nlink change: was_inc means the prepare incremented it
+ if (rollback.was_inc)
+ pi.inode.nlink--;
+ else
+ pi.inode.nlink++;
+
+ map<client_t,MClientSnap::ref> splits;
+ if (rollback.snapbl.length() && in->snaprealm) {
+ // snapbl starts with a bool telling whether the inode had a snaprealm
+ // before the update, followed by the old srnode if it did.
+ bool hadrealm;
+ auto p = rollback.snapbl.cbegin();
+ decode(hadrealm, p);
+ if (hadrealm) {
+ if (!mds->is_resolve()) {
+ sr_t *new_srnode = new sr_t();
+ decode(*new_srnode, p);
+ in->project_snaprealm(new_srnode);
+ } else {
+ // in resolve the old srnode is decoded straight into the live realm
+ decode(in->snaprealm->srnode, p);
+ }
+ } else {
+ // no realm before the update: project its removal, merging into the
+ // parent's realm first (the merge preparation is skipped in resolve)
+ SnapRealm *realm = parent->get_inode()->find_snaprealm();
+ if (!mds->is_resolve())
+ mdcache->prepare_realm_merge(in->snaprealm, realm, splits);
+ in->project_snaprealm(NULL);
+ }
+ }
+
+ // journal it
+ ESlaveUpdate *le = new ESlaveUpdate(mdlog, "slave_link_rollback", rollback.reqid, master,
+ ESlaveUpdate::OP_ROLLBACK, ESlaveUpdate::LINK);
+ mdlog->start_entry(le);
+ le->commit.add_dir_context(parent);
+ le->commit.add_dir(parent, true);
+ le->commit.add_primary_dentry(in->get_projected_parent_dn(), 0, true);
+
+ submit_mdlog_entry(le, new C_MDS_LoggedLinkRollback(this, mut, mdr, std::move(splits)),
+ mdr, __func__);
+ mdlog->flush();
+ }
+
+ /** _link_rollback_finish
+  *
+  * completion of a journaled link rollback: apply the rollback mutation,
+  * send the pending snap messages (not in resolve), finish the slave
+  * request if there is one, and retire the rollback.
+  */
+ void Server::_link_rollback_finish(MutationRef& mut, MDRequestRef& mdr,
+                                    map<client_t,MClientSnap::ref>& splits)
+ {
+   dout(10) << "_link_rollback_finish" << dendl;
+
+   ceph_assert(g_conf()->mds_kill_link_at != 10);
+
+   mut->apply();
+
+   if (!mds->is_resolve()) {
+     mdcache->send_snaps(splits);
+   }
+
+   if (mdr) {
+     mdcache->request_finish(mdr);
+   }
+
+   mdcache->finish_rollback(mut->reqid, mdr);
+
+   mut->cleanup();
+ }
+
+
+ /** handle_slave_link_prep_ack
+  *
+  * master side: a slave acked our link prepare.  record it as a journaled
+  * witness, take it off the waiting list (which must then be empty) and
+  * re-dispatch the client request.
+  */
+ void Server::handle_slave_link_prep_ack(MDRequestRef& mdr, const MMDSSlaveRequest::const_ref &m)
+ {
+   dout(10) << "handle_slave_link_prep_ack " << *mdr
+            << " " << *m << dendl;
+   const mds_rank_t from = mds_rank_t(m->get_source().num());
+
+   ceph_assert(g_conf()->mds_kill_link_at != 11);
+
+   auto more = mdr->more();
+
+   // note slave
+   more->slaves.insert(from);
+
+   // witnessed! (and it must be the first ack from this rank)
+   ceph_assert(more->witnessed.count(from) == 0);
+   more->witnessed.insert(from);
+   ceph_assert(!m->is_not_journaled());
+   more->has_journaled_slaves = true;
+
+   // remove from waiting list
+   ceph_assert(more->waiting_on_slave.count(from));
+   more->waiting_on_slave.erase(from);
+
+   ceph_assert(more->waiting_on_slave.empty());
+
+   dispatch_client_request(mdr);  // go again!
+ }
+
+
+
+
+
+// UNLINK
+
+ /** handle_client_unlink
+ *
+ * handle a client unlink or rmdir (told apart by CEPH_MDS_OP_RMDIR).
+ * traverses to the dentry, validates the request, prepares a stray
+ * dentry when the primary link is being removed, takes the locks,
+ * prepares the snaprealm node for the inode, gathers witnesses for
+ * directories with subtree-root dirfrags, and finally hands off to
+ * _link_remote() (remote link to a non-auth inode) or _unlink_local().
+ *
+ * every early return either has already replied to the client,
+ * forwarded the request, or is waiting to be retried.
+ */
+ void Server::handle_client_unlink(MDRequestRef& mdr)
+ {
+ const MClientRequest::const_ref &req = mdr->client_request;
+ client_t client = mdr->get_client();
+
+ // rmdir or unlink?
+ bool rmdir = false;
+ if (req->get_op() == CEPH_MDS_OP_RMDIR) rmdir = true;
+
+ const filepath& refpath = req->get_filepath();
+ if (refpath.depth() == 0) {
+ respond_to_request(mdr, -EINVAL);
+ return;
+ }
+ if (refpath.is_last_dot_or_dotdot()) {
+ respond_to_request(mdr, -ENOTEMPTY);
+ return;
+ }
+
+ // traverse to path
+ vector<CDentry*> trace;
+ CInode *in;
+ CF_MDS_MDRContextFactory cf(mdcache, mdr);
+ int r = mdcache->path_traverse(mdr, cf, refpath, &trace, &in, MDS_TRAVERSE_FORWARD);
+ if (r > 0) return; // delayed
+ if (r < 0) {
+ if (r == -ESTALE) {
+ dout(10) << "FAIL on ESTALE but attempting recovery" << dendl;
+ mdcache->find_ino_peers(refpath.get_ino(), new C_MDS_TryFindInode(this, mdr));
+ return;
+ }
+ respond_to_request(mdr, r);
+ return;
+ }
+ // only the live (non-snapshot) namespace may be modified
+ if (mdr->snapid != CEPH_NOSNAP) {
+ respond_to_request(mdr, -EROFS);
+ return;
+ }
+
+ CDentry *dn = trace.back();
+ ceph_assert(dn);
+ if (!dn->is_auth()) {
+ mdcache->request_forward(mdr, dn->authority().first);
+ return;
+ }
+
+ CInode *diri = dn->get_dir()->get_inode();
+
+ CDentry::linkage_t *dnl = dn->get_linkage(client, mdr);
+ ceph_assert(!dnl->is_null());
+
+ if (rmdir) {
+ dout(7) << "handle_client_rmdir on " << *dn << dendl;
+ } else {
+ dout(7) << "handle_client_unlink on " << *dn << dendl;
+ }
+ dout(7) << "dn links to " << *in << dendl;
+
+ // rmdir vs is_dir
+ if (in->is_dir()) {
+ if (rmdir) {
+ // do empty directory checks
+ if (_dir_is_nonempty_unlocked(mdr, in)) {
+ respond_to_request(mdr, -ENOTEMPTY);
+ return;
+ }
+ } else {
+ dout(7) << "handle_client_unlink on dir " << *in << ", returning error" << dendl;
+ respond_to_request(mdr, -EISDIR);
+ return;
+ }
+ } else {
+ if (rmdir) {
+ // rmdir on a non-directory
+ dout(7) << "handle_client_rmdir on non-dir " << *in << ", returning error" << dendl;
+ respond_to_request(mdr, -ENOTDIR);
+ return;
+ }
+ }
+
+ // -- create stray dentry? --
+ CDentry *straydn = NULL;
+ if (dnl->is_primary()) {
+ straydn = prepare_stray_dentry(mdr, dnl->get_inode());
+ if (!straydn)
+ return;
+ dout(10) << " straydn is " << *straydn << dendl;
+ } else if (mdr->straydn) {
+ // not a primary link: release a stray dentry still pinned on the request
+ // (presumably left over from an earlier dispatch of this request)
+ mdr->unpin(mdr->straydn);
+ mdr->straydn = NULL;
+ }
+
+ // lock
+ MutationImpl::LockOpVec lov;
+
+ // rdlock every dentry on the path except the last, which is xlocked
+ for (int i=0; i<(int)trace.size()-1; i++)
+ lov.add_rdlock(&trace[i]->lock);
+ lov.add_xlock(&dn->lock);
+ lov.add_wrlock(&diri->filelock);
+ lov.add_wrlock(&diri->nestlock);
+ lov.add_xlock(&in->linklock);
+ if (straydn) {
+ lov.add_wrlock(&straydn->get_dir()->inode->filelock);
+ lov.add_wrlock(&straydn->get_dir()->inode->nestlock);
+ lov.add_xlock(&straydn->lock);
+ }
+
+ mds->locker->include_snap_rdlocks(diri, lov);
+ lov.add_xlock(&in->snaplock);
+ if (in->is_dir())
+ lov.add_rdlock(&in->filelock); // to verify it's empty
+
+ if (!mds->locker->acquire_locks(mdr, lov))
+ return;
+
+ // authoritative emptiness check, now that the filelock is rdlocked
+ if (in->is_dir() &&
+ _dir_is_nonempty(mdr, in)) {
+ respond_to_request(mdr, -ENOTEMPTY);
+ return;
+ }
+
+ // access is only checked while no witness has acked yet
+ if ((!mdr->has_more() || mdr->more()->witnessed.empty())) {
+ if (!check_access(mdr, diri, MAY_WRITE))
+ return;
+ }
+
+ if (straydn)
+ straydn->first = mdcache->get_global_snaprealm()->get_newest_seq() + 1;
+
+ // prepare the new snaprealm node once; it is kept on the request
+ if (!mdr->more()->desti_srnode) {
+ if (in->is_projected_snaprealm_global()) {
+ sr_t *new_srnode = in->prepare_new_srnode(0);
+ in->record_snaprealm_parent_dentry(new_srnode, nullptr, dn, dnl->is_primary());
+ // dropping the last linkage or dropping the last remote linkage,
+ // detach the inode from global snaprealm
+ auto nlink = in->get_projected_inode()->nlink;
+ if (nlink == 1 ||
+ (nlink == 2 && !dnl->is_primary() &&
+ !in->get_projected_parent_dir()->inode->is_stray()))
+ in->clear_snaprealm_global(new_srnode);
+ mdr->more()->desti_srnode = new_srnode;
+ } else if (dnl->is_primary()) {
+ // prepare snaprealm blob for slave request
+ SnapRealm *realm = in->find_snaprealm();
+ snapid_t follows = realm->get_newest_seq();
+ if (in->snaprealm || follows + 1 > in->get_oldest_snap()) {
+ sr_t *new_srnode = in->prepare_new_srnode(follows);
+ in->record_snaprealm_past_parent(new_srnode, straydn->get_dir()->inode->find_snaprealm());
+ mdr->more()->desti_srnode = new_srnode;
+ }
+ }
+ }
+
+ // yay!
+ if (in->is_dir() && in->has_subtree_root_dirfrag()) {
+ // subtree root auths need to be witnesses
+ set<mds_rank_t> witnesses;
+ in->list_replicas(witnesses);
+ dout(10) << " witnesses " << witnesses << ", have " << mdr->more()->witnessed << dendl;
+
+ for (set<mds_rank_t>::iterator p = witnesses.begin();
+ p != witnesses.end();
+ ++p) {
+ if (mdr->more()->witnessed.count(*p)) {
+ dout(10) << " already witnessed by mds." << *p << dendl;
+ } else if (mdr->more()->waiting_on_slave.count(*p)) {
+ dout(10) << " already waiting on witness mds." << *p << dendl;
+ } else {
+ if (!_rmdir_prepare_witness(mdr, *p, trace, straydn))
+ return;
+ }
+ }
+ if (!mdr->more()->waiting_on_slave.empty())
+ return; // we're waiting for a witness.
+ }
+
+ // ok!
+ // a remote link whose inode lives on another mds goes through
+ // _link_remote(); everything else is handled locally.
+ if (dnl->is_remote() && !dnl->get_inode()->is_auth())
+ _link_remote(mdr, false, dn, dnl->get_inode());
+ else
+ _unlink_local(mdr, dn, straydn);
+ }
+
+ // log context for a local unlink: captures the dentry's projected version
+ // at construction and passes it to Server::_unlink_local_finish().
+ class C_MDS_unlink_local_finish : public ServerLogContext {
+   CDentry *dn;
+   CDentry *straydn;
+   version_t dnpv;  // deleted dentry
+ public:
+   C_MDS_unlink_local_finish(Server *srv, MDRequestRef& req,
+                             CDentry *unlinked, CDentry *stray)
+     : ServerLogContext(srv, req),
+       dn(unlinked),
+       straydn(stray),
+       dnpv(unlinked->get_projected_version()) {}
+
+   void finish(int r) override {
+     ceph_assert(r == 0);
+     server->_unlink_local_finish(mdr, dn, straydn, dnpv);
+   }
+ };
+
+ /** _unlink_local
+ *
+ * project and journal the removal of dn.  straydn is non-null only when
+ * dn is the primary link (asserted below); in that case the inode is
+ * re-linked under straydn, otherwise only the inode's link count and
+ * attributes are updated.  the cache is updated in
+ * _unlink_local_finish() once the EUpdate is logged.
+ */
+ void Server::_unlink_local(MDRequestRef& mdr, CDentry *dn, CDentry *straydn)
+ {
+ dout(10) << "_unlink_local " << *dn << dendl;
+
+ CDentry::linkage_t *dnl = dn->get_projected_linkage();
+ CInode *in = dnl->get_inode();
+
+
+ // ok, let's do it.
+ mdr->ls = mdlog->get_current_segment();
+
+ // prepare log entry
+ EUpdate *le = new EUpdate(mdlog, "unlink_local");
+ mdlog->start_entry(le);
+ le->metablob.add_client_req(mdr->reqid, mdr->client_request->get_oldest_client_tid());
+ // witnesses took part: record this as a master update with slaves
+ if (!mdr->more()->witnessed.empty()) {
+ dout(20) << " noting uncommitted_slaves " << mdr->more()->witnessed << dendl;
+ le->reqid = mdr->reqid;
+ le->had_slaves = true;
+ mdcache->add_uncommitted_master(mdr->reqid, mdr->ls, mdr->more()->witnessed);
+ }
+
+ if (straydn) {
+ ceph_assert(dnl->is_primary());
+ straydn->push_projected_linkage(in);
+ }
+
+ // the unlinked dentry
+ dn->pre_dirty();
+
+ auto &pi = in->project_inode();
+ {
+ // remember the path the inode had before being unlinked
+ std::string t;
+ dn->make_path_string(t, true);
+ pi.inode.stray_prior_path = std::move(t);
+ }
+ pi.inode.version = in->pre_dirty();
+ pi.inode.ctime = mdr->get_op_stamp();
+ if (mdr->get_op_stamp() > pi.inode.rstat.rctime)
+ pi.inode.rstat.rctime = mdr->get_op_stamp();
+ pi.inode.change_attr++;
+ pi.inode.nlink--;
+ if (pi.inode.nlink == 0)
+ in->state_set(CInode::STATE_ORPHAN);
+
+ // hand the prepared snaprealm node (if any) over to the projection
+ if (mdr->more()->desti_srnode) {
+ auto& desti_srnode = mdr->more()->desti_srnode;
+ in->project_snaprealm(desti_srnode);
+ desti_srnode = NULL;
+ }
+
+ if (straydn) {
+ // will manually pop projected inode
+
+ // primary link. add stray dentry.
+ mdcache->predirty_journal_parents(mdr, &le->metablob, in, dn->get_dir(), PREDIRTY_PRIMARY|PREDIRTY_DIR, -1);
+ mdcache->predirty_journal_parents(mdr, &le->metablob, in, straydn->get_dir(), PREDIRTY_PRIMARY|PREDIRTY_DIR, 1);
+
+ pi.inode.update_backtrace();
+ le->metablob.add_primary_dentry(straydn, in, true, true);
+ } else {
+ mdr->add_projected_inode(in);
+ // remote link. update remote inode.
+ mdcache->predirty_journal_parents(mdr, &le->metablob, in, dn->get_dir(), PREDIRTY_DIR, -1);
+ mdcache->predirty_journal_parents(mdr, &le->metablob, in, 0, PREDIRTY_PRIMARY);
+ mdcache->journal_dirty_inode(mdr.get(), &le->metablob, in);
+ }
+
+ mdcache->journal_cow_dentry(mdr.get(), &le->metablob, dn);
+ le->metablob.add_null_dentry(dn, true);
+
+ if (in->is_dir()) {
+ dout(10) << " noting renamed (unlinked) dir ino " << in->ino() << " in metablob" << dendl;
+ le->metablob.renamed_dirino = in->ino();
+ }
+
+ // project dn as null
+ dn->push_projected_linkage();
+
+ if (straydn) {
+ ceph_assert(in->first <= straydn->first);
+ in->first = straydn->first;
+ }
+
+ // a directory is always removed through its primary link, so straydn
+ // must exist here
+ if (in->is_dir()) {
+ ceph_assert(straydn);
+ mdcache->project_subtree_rename(in, dn->get_dir(), straydn->get_dir());
+ }
+
+ journal_and_reply(mdr, 0, dn, le, new C_MDS_unlink_local_finish(this, mdr, dn, straydn));
+ }
+
+ /** _unlink_local_finish
+ *
+ * the unlink EUpdate has been logged: apply the projected linkage and
+ * inode changes to the cache, notify replicas, reply to the client and
+ * tell the cache about the new stray.
+ *
+ * dnpv is the projected version of dn captured when the update was
+ * journaled; dn is marked dirty at that version.
+ */
+ void Server::_unlink_local_finish(MDRequestRef& mdr,
+ CDentry *dn, CDentry *straydn,
+ version_t dnpv)
+ {
+ dout(10) << "_unlink_local_finish " << *dn << dendl;
+
+ if (!mdr->more()->witnessed.empty())
+ mdcache->logged_master_update(mdr->reqid);
+
+ CInode *strayin = NULL;
+ bool hadrealm = false;
+ if (straydn) {
+ // if there is newly created snaprealm, need to split old snaprealm's
+ // inodes_with_caps. So pop snaprealm before linkage changes.
+ strayin = dn->get_linkage()->get_inode();
+ hadrealm = strayin->snaprealm ? true : false;
+ strayin->early_pop_projected_snaprealm();
+ }
+
+ // unlink main dentry
+ dn->get_dir()->unlink_inode(dn);
+ dn->pop_projected_linkage();
+
+ // relink as stray? (i.e. was primary link?)
+ if (straydn) {
+ dout(20) << " straydn is " << *straydn << dendl;
+ straydn->pop_projected_linkage();
+
+ strayin->pop_and_dirty_projected_inode(mdr->ls);
+
+ mdcache->touch_dentry_bottom(straydn);
+ }
+
+ dn->mark_dirty(dnpv, mdr->ls);
+ mdr->apply();
+
+ mdcache->send_dentry_unlink(dn, straydn, mdr);
+
+ if (straydn) {
+ // update subtree map?
+ if (strayin->is_dir())
+ mdcache->adjust_subtree_after_rename(strayin, dn->get_dir(), true);
+
+ // a snaprealm was created by this unlink: notify about the split
+ if (strayin->snaprealm && !hadrealm)
+ mdcache->do_realm_invalidate_and_update_notify(strayin, CEPH_SNAP_OP_SPLIT, false);
+ }
+
+ // bump pop
+ mds->balancer->hit_dir(dn->get_dir(), META_POP_IWR);
+
+ // reply
+ respond_to_request(mdr, 0);
+
+ // removing a new dn?
+ dn->get_dir()->try_remove_unlinked_dn(dn);
+
+ // clean up ?
+ // respond_to_request() drops locks. So stray reintegration can race with us.
+ if (straydn && !straydn->get_projected_linkage()->is_null()) {
+ // Tip off the MDCache that this dentry is a stray that
+ // might be eligible for purge.
+ mdcache->notify_stray(straydn);
+ }
+ }
+
+ /** _rmdir_prepare_witness
+  *
+  * send an OP_RMDIRPREP slave request to mds 'who' and add it to the
+  * request's waiting_on_slave set.
+  *
+  * returns false without sending anything if the cluster is degraded and
+  * 'who' is not clientreplay/active/stopping; in that case a retry is
+  * queued for when the peer becomes active, unless we are already waiting
+  * on other slaves.  returns true once the request has been sent.
+  */
+ bool Server::_rmdir_prepare_witness(MDRequestRef& mdr, mds_rank_t who, vector<CDentry*>& trace, CDentry *straydn)
+ {
+   const bool peer_unavailable =
+     mds->is_cluster_degraded() &&
+     !mds->mdsmap->is_clientreplay_or_active_or_stopping(who);
+   if (peer_unavailable) {
+     dout(10) << "_rmdir_prepare_witness mds." << who << " is not active" << dendl;
+     if (mdr->more()->waiting_on_slave.empty()) {
+       mds->wait_for_active_peer(who, new C_MDS_RetryRequest(mdcache, mdr));
+     }
+     return false;
+   }
+
+   dout(10) << "_rmdir_prepare_witness mds." << who << dendl;
+   auto prep = MMDSSlaveRequest::create(mdr->reqid, mdr->attempt, MMDSSlaveRequest::OP_RMDIRPREP);
+
+   // source path: base dir ino followed by every dentry name of the trace
+   prep->srcdnpath = filepath(trace.front()->get_dir()->ino());
+   for (CDentry *step : trace) {
+     prep->srcdnpath.push_dentry(step->get_name());
+   }
+
+   mdcache->replicate_stray(straydn, who, prep->straybl);
+   if (mdr->more()->desti_srnode) {
+     encode(*mdr->more()->desti_srnode, prep->desti_snapbl);
+   }
+
+   prep->op_stamp = mdr->get_op_stamp();
+   mds->send_message_mds(prep, who);
+
+   ceph_assert(mdr->more()->waiting_on_slave.count(who) == 0);
+   mdr->more()->waiting_on_slave.insert(who);
+   return true;
+ }
+
+ // log context for a journaled slave rmdir prepare: calls
+ // Server::_logged_slave_rmdir() with the dentry pair on completion.
+ struct C_MDS_SlaveRmdirPrep : public ServerLogContext {
+   CDentry *dn, *straydn;
+
+   C_MDS_SlaveRmdirPrep(Server *srv, MDRequestRef& req, CDentry *removed, CDentry *stray)
+     : ServerLogContext(srv, req),
+       dn(removed),
+       straydn(stray) {}
+
+   void finish(int r) override {
+     server->_logged_slave_rmdir(mdr, dn, straydn);
+   }
+ };
+
+ // commit waiter for a slave rmdir: forwards the result code and the stray
+ // dentry to Server::_commit_slave_rmdir().
+ struct C_MDS_SlaveRmdirCommit : public ServerContext {
+   MDRequestRef mdr;
+   CDentry *straydn;
+
+   C_MDS_SlaveRmdirCommit(Server *srv, MDRequestRef& req, CDentry *stray)
+     : ServerContext(srv),
+       mdr(req),
+       straydn(stray) {}
+
+   void finish(int r) override {
+     server->_commit_slave_rmdir(mdr, r, straydn);
+   }
+ };
+
+ /** handle_slave_rmdir_prep
+ *
+ * slave side of rmdir: traverse to the dentry named in the slave
+ * request, build the rmdir_rollback blob, project the move of the inode
+ * from dn to the stray dentry and, if this mds has an auth subtree under
+ * the inode, journal an ESlaveUpdate OP_PREPARE.  without such a subtree
+ * the journal is skipped and _logged_slave_rmdir() is called directly.
+ */
+ void Server::handle_slave_rmdir_prep(MDRequestRef& mdr)
+ {
+ dout(10) << "handle_slave_rmdir_prep " << *mdr
+ << " " << mdr->slave_request->srcdnpath
+ << " to " << mdr->slave_request->destdnpath
+ << dendl;
+
+ vector<CDentry*> trace;
+ filepath srcpath(mdr->slave_request->srcdnpath);
+ dout(10) << " src " << srcpath << dendl;
+ CInode *in;
+ CF_MDS_MDRContextFactory cf(mdcache, mdr);
+ int r = mdcache->path_traverse(mdr, cf, srcpath, &trace, &in, MDS_TRAVERSE_DISCOVERXLOCK);
+ if (r > 0) return; // delayed
+ if (r == -ESTALE) {
+ mdcache->find_ino_peers(srcpath.get_ino(), new C_MDS_RetryRequest(mdcache, mdr),
+ mdr->slave_to_mds);
+ return;
+ }
+ // any other traversal failure is unexpected here
+ ceph_assert(r == 0);
+ CDentry *dn = trace.back();
+ dout(10) << " dn " << *dn << dendl;
+ mdr->pin(dn);
+
+ ceph_assert(mdr->straydn);
+ CDentry *straydn = mdr->straydn;
+ dout(10) << " straydn " << *straydn << dendl;
+
+ mdr->set_op_stamp(mdr->slave_request->op_stamp);
+
+ // record where the inode came from and where it goes, so the move can
+ // be undone by do_rmdir_rollback()
+ rmdir_rollback rollback;
+ rollback.reqid = mdr->reqid;
+ rollback.src_dir = dn->get_dir()->dirfrag();
+ rollback.src_dname = dn->get_name();
+ rollback.dest_dir = straydn->get_dir()->dirfrag();
+ rollback.dest_dname = straydn->get_name();
+ // snapbl: a bool telling whether the inode currently has a snaprealm,
+ // followed by the current snap blob if it does
+ if (mdr->slave_request->desti_snapbl.length()) {
+ if (in->snaprealm) {
+ encode(true, rollback.snapbl);
+ in->encode_snap_blob(rollback.snapbl);
+ } else {
+ encode(false, rollback.snapbl);
+ }
+ }
+ encode(rollback, mdr->more()->rollback_bl);
+ // FIXME: rollback snaprealm
+ dout(20) << " rollback is " << mdr->more()->rollback_bl.length() << " bytes" << dendl;
+
+ // set up commit waiter
+ mdr->more()->slave_commit = new C_MDS_SlaveRmdirCommit(this, mdr, straydn);
+
+ // project: inode linked under straydn, dn null
+ straydn->push_projected_linkage(in);
+ dn->push_projected_linkage();
+
+ ceph_assert(straydn->first >= in->first);
+ in->first = straydn->first;
+
+ if (!in->has_subtree_root_dirfrag(mds->get_nodeid())) {
+ dout(10) << " no auth subtree in " << *in << ", skipping journal" << dendl;
+ _logged_slave_rmdir(mdr, dn, straydn);
+ return;
+ }
+
+ mdr->ls = mdlog->get_current_segment();
+ ESlaveUpdate *le = new ESlaveUpdate(mdlog, "slave_rmdir", mdr->reqid, mdr->slave_to_mds,
+ ESlaveUpdate::OP_PREPARE, ESlaveUpdate::RMDIR);
+ mdlog->start_entry(le);
+ le->rollback = mdr->more()->rollback_bl;
+
+ le->commit.add_dir_context(straydn->get_dir());
+ le->commit.add_primary_dentry(straydn, in, true);
+ // slave: no need to journal original dentry
+
+ dout(10) << " noting renamed (unlinked) dir ino " << in->ino() << " in metablob" << dendl;
+ le->commit.renamed_dirino = in->ino();
+
+ mdcache->project_subtree_rename(in, dn->get_dir(), straydn->get_dir());
+ mdcache->add_uncommitted_slave(mdr->reqid, mdr->ls, mdr->slave_to_mds);
+
+ // remembered so later stages know a prepare was journaled
+ mdr->more()->slave_update_journaled = true;
+ submit_mdlog_entry(le, new C_MDS_SlaveRmdirPrep(this, mdr, dn, straydn),
+ mdr, __func__);
+ mdlog->flush();
+ }
+
+ /** _logged_slave_rmdir
+  *
+  * slave side of rmdir, after the prepare was journaled (or the journal
+  * was skipped): install the snaprealm sent by the master, apply the
+  * projected linkages, fix up the subtree map and send OP_RMDIRPREPACK
+  * to mdr->slave_to_mds, marked not-journaled when no prepare was logged.
+  * an aborted request is finished locally instead.
+  */
+ void Server::_logged_slave_rmdir(MDRequestRef& mdr, CDentry *dn, CDentry *straydn)
+ {
+   dout(10) << "_logged_slave_rmdir " << *mdr << " on " << *dn << dendl;
+   CInode *in = dn->get_linkage()->get_inode();
+
+   // did the blob from the master create a snaprealm that was not there before?
+   bool new_realm = false;
+   if (mdr->slave_request->desti_snapbl.length()) {
+     new_realm = !in->snaprealm;
+     in->decode_snap_blob(mdr->slave_request->desti_snapbl);
+     ceph_assert(in->snaprealm);
+     ceph_assert(in->snaprealm->have_past_parents_open());
+   }
+
+   // update our cache now, so we are consistent with what is in the journal
+   // when we journal a subtree map
+   dn->get_dir()->unlink_inode(dn);
+   straydn->pop_projected_linkage();
+   dn->pop_projected_linkage();
+
+   mdcache->adjust_subtree_after_rename(in, dn->get_dir(), mdr->more()->slave_update_journaled);
+
+   if (new_realm) {
+     mdcache->do_realm_invalidate_and_update_notify(in, CEPH_SNAP_OP_SPLIT, false);
+   }
+
+   // done.
+   mdr->reset_slave_request();
+   mdr->straydn = nullptr;
+
+   if (mdr->aborted) {
+     dout(10) << " abort flag set, finishing" << dendl;
+     mdcache->request_finish(mdr);
+     return;
+   }
+
+   auto ack = MMDSSlaveRequest::create(mdr->reqid, mdr->attempt, MMDSSlaveRequest::OP_RMDIRPREPACK);
+   if (!mdr->more()->slave_update_journaled) {
+     ack->mark_not_journaled();
+   }
+   mds->send_message_mds(ack, mdr->slave_to_mds);
+ }
+
+ /** handle_slave_rmdir_prep_ack
+  *
+  * master side: a witness acked our rmdir prepare.  record it, and
+  * re-dispatch the client request once no slave is outstanding.
+  */
+ void Server::handle_slave_rmdir_prep_ack(MDRequestRef& mdr, const MMDSSlaveRequest::const_ref &ack)
+ {
+   dout(10) << "handle_slave_rmdir_prep_ack " << *mdr
+            << " " << *ack << dendl;
+
+   const mds_rank_t from = mds_rank_t(ack->get_source().num());
+
+   auto more = mdr->more();
+   more->slaves.insert(from);
+   more->witnessed.insert(from);
+   if (!ack->is_not_journaled()) {
+     more->has_journaled_slaves = true;
+   }
+
+   // remove from waiting list
+   ceph_assert(more->waiting_on_slave.count(from));
+   more->waiting_on_slave.erase(from);
+
+   if (!more->waiting_on_slave.empty()) {
+     dout(10) << "still waiting on slaves " << more->waiting_on_slave << dendl;
+     return;
+   }
+
+   dispatch_client_request(mdr);  // go again!
+ }
+
+ /** _commit_slave_rmdir
+  *
+  * commit waiter for a slave rmdir.  a non-zero r rolls the prepare back.
+  * on r == 0 the request state is dropped and, if the prepare had been
+  * journaled, an OP_COMMIT ESlaveUpdate is logged before
+  * _committed_slave() runs; otherwise _committed_slave() is called
+  * right away.
+  */
+ void Server::_commit_slave_rmdir(MDRequestRef& mdr, int r, CDentry *straydn)
+ {
+   dout(10) << "_commit_slave_rmdir " << *mdr << " r=" << r << dendl;
+
+   if (r != 0) {
+     // abort
+     do_rmdir_rollback(mdr->more()->rollback_bl, mdr->slave_to_mds, mdr);
+     return;
+   }
+
+   if (mdr->more()->slave_update_journaled) {
+     CInode *strayin = straydn->get_projected_linkage()->get_inode();
+     if (strayin && !strayin->snaprealm) {
+       mdcache->clear_dirty_bits_for_stray(strayin);
+     }
+   }
+
+   mdr->cleanup();
+
+   if (!mdr->more()->slave_update_journaled) {
+     _committed_slave(mdr);
+     return;
+   }
+
+   // write a commit to the journal
+   ESlaveUpdate *commit_ev = new ESlaveUpdate(mdlog, "slave_rmdir_commit", mdr->reqid,
+                                              mdr->slave_to_mds, ESlaveUpdate::OP_COMMIT,
+                                              ESlaveUpdate::RMDIR);
+   mdlog->start_entry(commit_ev);
+   submit_mdlog_entry(commit_ev, new C_MDS_CommittedSlave(this, mdr), mdr, __func__);
+   mdlog->flush();
+ }
+
+ // log context for a journaled rmdir rollback: carries the request id and
+ // the dentry pair through to Server::_rmdir_rollback_finish().
+ struct C_MDS_LoggedRmdirRollback : public ServerLogContext {
+   metareqid_t reqid;
+   CDentry *dn;
+   CDentry *straydn;
+
+   C_MDS_LoggedRmdirRollback(Server *srv, MDRequestRef& req, metareqid_t rid,
+                             CDentry *restored, CDentry *stray)
+     : ServerLogContext(srv, req),
+       reqid(rid),
+       dn(restored),
+       straydn(stray) {}
+
+   void finish(int r) override {
+     server->_rmdir_rollback_finish(mdr, reqid, dn, straydn);
+   }
+ };
+
+ /** do_rmdir_rollback
+ *
+ * roll back a prepared slave rmdir described by the encoded
+ * rmdir_rollback in rbl: project the inode back from the stray dentry to
+ * its original dentry, restore the snaprealm if a snap blob was recorded,
+ * and journal an ESlaveUpdate OP_ROLLBACK unless the prepare was never
+ * journaled.
+ *
+ * rbl    - encoded rmdir_rollback
+ * master - rank of the master mds; recorded with the rollback and in
+ *          the journal entry
+ * mdr    - the slave request; may be null only while in resolve
+ *          (asserted below)
+ */
+ void Server::do_rmdir_rollback(bufferlist &rbl, mds_rank_t master, MDRequestRef& mdr)
+ {
+ // unlike the other rollback methods, the rmdir rollback is only
+ // needed to record the subtree changes in the journal for inode
+ // replicas who are auth for empty dirfrags. no actual changes to
+ // the file system are taking place here, so there is no Mutation.
+
+ rmdir_rollback rollback;
+ auto p = rbl.cbegin();
+ decode(rollback, p);
+
+ dout(10) << "do_rmdir_rollback on " << rollback.reqid << dendl;
+ mdcache->add_rollback(rollback.reqid, master); // need to finish this update before resolve finishes
+ ceph_assert(mdr || mds->is_resolve());
+
+ // look up the source dirfrag; if that exact frag is not in cache, fall
+ // back to the frag of the same directory that holds the dentry name
+ CDir *dir = mdcache->get_dirfrag(rollback.src_dir);
+ if (!dir)
+ dir = mdcache->get_dirfrag(rollback.src_dir.ino, rollback.src_dname);
+ ceph_assert(dir);
+ CDentry *dn = dir->lookup(rollback.src_dname);
+ ceph_assert(dn);
+ dout(10) << " dn " << *dn << dendl;
+ CDir *straydir = mdcache->get_dirfrag(rollback.dest_dir);
+ ceph_assert(straydir);
+ CDentry *straydn = straydir->lookup(rollback.dest_dname);
+ ceph_assert(straydn);
+ dout(10) << " straydn " << *straydn << dendl;
+ CInode *in = straydn->get_linkage()->get_inode();
+
+ // project the reverse move: inode back under dn, straydn null
+ dn->push_projected_linkage(in);
+ straydn->push_projected_linkage();
+
+ if (rollback.snapbl.length() && in->snaprealm) {
+ // snapbl starts with a bool telling whether the inode had a snaprealm
+ // before the prepare, followed by the old srnode if it did.
+ bool hadrealm;
+ auto p = rollback.snapbl.cbegin();
+ decode(hadrealm, p);
+ if (hadrealm) {
+ decode(in->snaprealm->srnode, p);
+ } else {
+ in->snaprealm->merge_to(dir->get_inode()->find_snaprealm());
+ }
+ }
+
+ // nothing was journaled for the prepare, so finish without journaling
+ if (mdr && !mdr->more()->slave_update_journaled) {
+ ceph_assert(!in->has_subtree_root_dirfrag(mds->get_nodeid()));
+
+ _rmdir_rollback_finish(mdr, rollback.reqid, dn, straydn);
+ return;
+ }
+
+
+ ESlaveUpdate *le = new ESlaveUpdate(mdlog, "slave_rmdir_rollback", rollback.reqid, master,
+ ESlaveUpdate::OP_ROLLBACK, ESlaveUpdate::RMDIR);
+ mdlog->start_entry(le);
+
+ le->commit.add_dir_context(dn->get_dir());
+ le->commit.add_primary_dentry(dn, in, true);
+ // slave: no need to journal straydn
+
+ dout(10) << " noting renamed (unlinked) dir ino " << in->ino() << " in metablob" << dendl;
+ le->commit.renamed_dirino = in->ino();
+
+ mdcache->project_subtree_rename(in, straydn->get_dir(), dn->get_dir());
+
+ submit_mdlog_entry(le,
+ new C_MDS_LoggedRmdirRollback(this, mdr,rollback.reqid,
+ dn, straydn),
+ mdr, __func__);
+ mdlog->flush();
+ }
+
+ /** _rmdir_rollback_finish
+  *
+  * apply the rmdir rollback to the cache: unlink the inode from the stray
+  * dentry, pop both projected linkages, fix up the subtree map, and retire
+  * the rollback.  mdr may be null, in which case there is no request to
+  * finish.
+  */
+ void Server::_rmdir_rollback_finish(MDRequestRef& mdr, metareqid_t reqid, CDentry *dn, CDentry *straydn)
+ {
+   dout(10) << "_rmdir_rollback_finish " << reqid << dendl;
+
+   straydn->get_dir()->unlink_inode(straydn);
+   dn->pop_projected_linkage();
+   straydn->pop_projected_linkage();
+
+   CInode *restored = dn->get_linkage()->get_inode();
+   // with no request at all, treat the update as journaled
+   const bool journaled = !mdr || mdr->more()->slave_update_journaled;
+   mdcache->adjust_subtree_after_rename(restored, straydn->get_dir(), journaled);
+
+   if (mds->is_resolve()) {
+     CDir *subtree_root = mdcache->get_subtree_root(straydn->get_dir());
+     mdcache->try_trim_non_auth_subtree(subtree_root);
+   }
+
+   if (mdr) {
+     mdcache->request_finish(mdr);
+   }
+
+   mdcache->finish_rollback(reqid, mdr);
+ }
+
+
+ /** _dir_is_nonempty[_unlocked]
+  *
+  * check whether a directory is non-empty (i.e. whether rmdir must be
+  * refused).
+  *
+  * the unlocked variant is only a fast-path check: it reports non-empty
+  * when the inode's snaprealm holds snapshots or when an auth dirfrag
+  * already shows entries in its projected fragstat.  a 'false' result is
+  * not conclusive; we can't really be sure until we rdlock the filelock.
+  */
+ bool Server::_dir_is_nonempty_unlocked(MDRequestRef& mdr, CInode *in)
+ {
+   dout(10) << "dir_is_nonempty_unlocked " << *in << dendl;
+   ceph_assert(in->is_auth());
+
+   if (in->snaprealm && in->snaprealm->srnode.snaps.size())
+     return true; // in a snapshot!
+
+   list<CDir*> frags;
+   in->get_dirfrags(frags);
+   for (CDir *dir : frags) {
+     // only auth frags are inspected
+     if (!dir->is_auth())
+       continue;
+
+     // is the frag obviously non-empty?
+     if (dir->get_projected_fnode()->fragstat.size()) {
+       dout(10) << "dir_is_nonempty_unlocked dirstat has "
+                << dir->get_projected_fnode()->fragstat.size() << " items " << *dir << dendl;
+       return true;
+     }
+   }
+
+   return false;
+ }
+
+ /** _dir_is_nonempty
+  *
+  * emptiness check for callers that can read the filelock (asserted).
+  * returns true as soon as any cached dirfrag shows entries in its
+  * projected fragstat.  otherwise the per-frag stats are summed and
+  * compared with the inode's projected dirstat: a size mismatch also
+  * counts as non-empty.
+  */
+ bool Server::_dir_is_nonempty(MDRequestRef& mdr, CInode *in)
+ {
+   dout(10) << "dir_is_nonempty " << *in << dendl;
+   ceph_assert(in->is_auth());
+   ceph_assert(in->filelock.can_read(mdr->get_client()));
+
+   frag_info_t summed;
+   version_t dirstat_version = in->get_projected_inode()->dirstat.version;
+
+   list<CDir*> frags;
+   in->get_dirfrags(frags);
+   for (CDir *dir : frags) {
+     const fnode_t *pf = dir->get_projected_fnode();
+     if (pf->fragstat.size()) {
+       dout(10) << "dir_is_nonempty dirstat has "
+                << pf->fragstat.size() << " items " << *dir << dendl;
+       return true;
+     }
+
+     // use the accounted stat when it is at the inode's dirstat version,
+     // the frag's own fragstat otherwise
+     const frag_info_t *contribution =
+       (pf->accounted_fragstat.version == dirstat_version)
+         ? &pf->accounted_fragstat
+         : &pf->fragstat;
+     summed.add(*contribution);
+   }
+
+   return summed.size() != in->get_projected_inode()->dirstat.size();
+ }
+
+
+// ======================================================
+
+
+ // log context for a rename: requires the journal write to succeed and
+ // then calls Server::_rename_finish() with the three dentries.
+ class C_MDS_rename_finish : public ServerLogContext {
+   CDentry *srcdn;
+   CDentry *destdn;
+   CDentry *straydn;
+ public:
+   C_MDS_rename_finish(Server *srv, MDRequestRef& req,
+                       CDentry *src, CDentry *dest, CDentry *stray)
+     : ServerLogContext(srv, req),
+       srcdn(src),
+       destdn(dest),
+       straydn(stray) {}
+
+   void finish(int r) override {
+     ceph_assert(r == 0);
+     server->_rename_finish(mdr, srcdn, destdn, straydn);
+   }
+ };
+
+
+/** handle_client_rename
+ *
+ * rename master is the destdn auth. this is because cached inodes
+ * must remain connected. thus, any replica of srci, must also
+ * replicate destdn, and possibly straydn, so that srci (and
+ * destdn->inode) remain connected during the rename.
+ *
+ * to do this, we freeze srci, then master (destdn auth) verifies that
+ * all other nodes have also replicated destdn and straydn. note that
+ * destdn replicas need not also replicate srci. this only works when
+ * destdn is master.
+ *
+ * This function takes responsibility for the passed mdr.
+ */
+void Server::handle_client_rename(MDRequestRef& mdr)
+{ // validate both paths, gather locks, line up witnesses, then journal the rename
+ const MClientRequest::const_ref &req = mdr->client_request;
+ dout(7) << "handle_client_rename " << *req << dendl;
+
+ filepath destpath = req->get_filepath();
+ filepath srcpath = req->get_filepath2();
+ if (destpath.depth() == 0 || srcpath.depth() == 0) { // a zero-depth src or dest path is rejected with EINVAL
+ respond_to_request(mdr, -EINVAL);
+ return;
+ }
+ if (srcpath.is_last_dot_or_dotdot() || destpath.is_last_dot_or_dotdot()) { // refused with EBUSY for either path
+ respond_to_request(mdr, -EBUSY);
+ return;
+ }
+
+ std::string_view destname = destpath.last_dentry();
+
+ vector<CDentry*>& srctrace = mdr->dn[1]; // source trace is kept in slot 1 of the request
+ vector<CDentry*>& desttrace = mdr->dn[0]; // destination trace is kept in slot 0 of the request
+
+ MutationImpl::LockOpVec lov; // every lock wanted below is collected here and acquired in one acquire_locks() call
+
+ CDentry *destdn = rdlock_path_xlock_dentry(mdr, 0, lov, true, false, true);
+ if (!destdn) return; // no dentry returned: stop here
+ dout(10) << " destdn " << *destdn << dendl;
+ if (mdr->snapid != CEPH_NOSNAP) { // EROFS unless the request's snapid is CEPH_NOSNAP
+ respond_to_request(mdr, -EROFS);
+ return;
+ }
+ CDentry::linkage_t *destdnl = destdn->get_projected_linkage();
+ CDir *destdir = destdn->get_dir();
+ ceph_assert(destdir->is_auth());
+
+ CF_MDS_MDRContextFactory cf(mdcache, mdr);
+ int r = mdcache->path_traverse(mdr, cf, srcpath, &srctrace, NULL, MDS_TRAVERSE_DISCOVER);
+ if (r > 0)
+ return; // delayed
+ if (r < 0) {
+ if (r == -ESTALE) {
+ dout(10) << "FAIL on ESTALE but attempting recovery" << dendl;
+ mdcache->find_ino_peers(srcpath.get_ino(), new C_MDS_TryFindInode(this, mdr));
+ } else {
+ dout(10) << "FAIL on error " << r << dendl;
+ respond_to_request(mdr, r);
+ }
+ return;
+
+ }
+ ceph_assert(!srctrace.empty());
+ CDentry *srcdn = srctrace.back(); // the dentry being renamed is the last element of the source trace
+ dout(10) << " srcdn " << *srcdn << dendl;
+ if (srcdn->last != CEPH_NOSNAP) { // EROFS unless srcdn->last is CEPH_NOSNAP
+ respond_to_request(mdr, -EROFS);
+ return;
+ }
+ CDir *srcdir = srcdn->get_dir();
+ CDentry::linkage_t *srcdnl = srcdn->get_projected_linkage();
+ CInode *srci = srcdnl->get_inode();
+ dout(10) << " srci " << *srci << dendl;
+
+ CInode *oldin = 0; // inode currently linked at destdn; stays null when destdn's projected linkage is null
+ if (!destdnl->is_null()) {
+ //dout(10) << "dest dn exists " << *destdn << dendl;
+ oldin = mdcache->get_dentry_inode(destdn, mdr, true);
+ if (!oldin) return; // no inode returned: stop here
+ dout(10) << " oldin " << *oldin << dendl;
+
+ // non-empty dir? do trivial fast unlocked check, do another check later with read locks
+ if (oldin->is_dir() && _dir_is_nonempty_unlocked(mdr, oldin)) {
+ respond_to_request(mdr, -ENOTEMPTY);
+ return;
+ }
+
+ // if srcdn is replica, need to make sure its linkage is correct
+ if (srcdn->is_auth() ||
+ srcdn->lock.can_read(mdr->get_client()) ||
+ (srcdn->lock.is_xlocked() && srcdn->lock.get_xlock_by() == mdr)) {
+ // mv /some/thing /to/some/existing_other_thing
+ if (oldin->is_dir() && !srci->is_dir()) {
+ respond_to_request(mdr, -EISDIR);
+ return;
+ }
+ if (!oldin->is_dir() && srci->is_dir()) {
+ respond_to_request(mdr, -ENOTDIR);
+ return;
+ }
+ if (srci == oldin && !srcdir->inode->is_stray()) {
+ respond_to_request(mdr, 0); // no-op. POSIX makes no sense.
+ return;
+ }
+ }
+ }
+
+ // -- some sanity checks --
+
+ // src+dest traces _must_ share a common ancestor for locking to prevent orphans
+ if (destpath.get_ino() != srcpath.get_ino() &&
+ !(req->get_source().is_mds() &&
+ MDS_INO_IS_MDSDIR(srcpath.get_ino()))) { // <-- mds 'rename' out of stray dir is ok!
+ CInode *srcbase = srctrace[0]->get_dir()->get_inode();
+ CInode *destbase = desttrace[0]->get_dir()->get_inode();
+ // ok, extend srctrace toward root until it is an ancestor of desttrace.
+ while (srcbase != destbase &&
+ !srcbase->is_projected_ancestor_of(destbase)) {
+ CDentry *pdn = srcbase->get_projected_parent_dn();
+ srctrace.insert(srctrace.begin(), pdn);
+ dout(10) << "rename prepending srctrace with " << *pdn << dendl;
+ srcbase = pdn->get_dir()->get_inode();
+ }
+
+ // then, extend destpath until it shares the same parent inode as srcpath.
+ while (destbase != srcbase) {
+ CDentry *pdn = destbase->get_projected_parent_dn();
+ desttrace.insert(desttrace.begin(), pdn);
+ lov.add_rdlock(&pdn->lock);
+ dout(10) << "rename prepending desttrace with " << *pdn << dendl;
+ destbase = pdn->get_dir()->get_inode();
+ }
+ dout(10) << "rename src and dest traces now share common ancestor " << *destbase << dendl;
+ }
+
+ // src == dest?
+ if (srcdir == destdir && srcdn->get_name() == destname) {
+ dout(7) << "rename src=dest, noop" << dendl;
+ respond_to_request(mdr, 0);
+ return;
+ }
+
+ // dest a child of src?
+ // e.g. mv /usr /usr/foo
+ CDentry *pdn = destdir->inode->get_projected_parent_dn();
+ while (pdn) { // walk parent dentries upward from destdir looking for srcdn
+ if (pdn == srcdn) {
+ dout(7) << "cannot rename item to be a child of itself" << dendl;
+ respond_to_request(mdr, -EINVAL);
+ return;
+ }
+ pdn = pdn->get_dir()->inode->parent;
+ }
+
+ // is this a stray migration, reintegration or merge? (sanity checks!)
+ if (mdr->reqid.name.is_mds() &&
+ !(MDS_INO_IS_MDSDIR(srcpath.get_ino()) &&
+ MDS_INO_IS_MDSDIR(destpath.get_ino())) &&
+ !(destdnl->is_remote() &&
+ destdnl->get_remote_ino() == srci->ino())) {
+ respond_to_request(mdr, -EINVAL); // actually, this won't reply, but whatev.
+ return;
+ }
+
+ bool linkmerge = srcdnl->get_inode() == destdnl->get_inode(); // both projected linkages yield the same inode
+ if (linkmerge)
+ dout(10) << " this is a link merge" << dendl;
+
+ // -- create stray dentry? --
+ CDentry *straydn = NULL;
+ if (destdnl->is_primary() && !linkmerge) {
+ straydn = prepare_stray_dentry(mdr, destdnl->get_inode());
+ if (!straydn)
+ return;
+ dout(10) << " straydn is " << *straydn << dendl;
+ } else if (mdr->straydn) { // no stray dentry needed on this pass: unpin and forget one already recorded on the request
+ mdr->unpin(mdr->straydn);
+ mdr->straydn = NULL;
+ }
+
+ // -- prepare witness list --
+ /*
+ * NOTE: we use _all_ replicas as witnesses.
+ * this probably isn't totally necessary (esp for file renames),
+ * but if/when we change that, we have to make sure rejoin is
+ * sufficiently robust to handle strong rejoins from survivors
+ * with totally wrong dentry->inode linkage.
+ * (currently, it can ignore rename effects, because the resolve
+ * stage will sort them out.)
+ */
+ set<mds_rank_t> witnesses = mdr->more()->extra_witnesses; // start from a copy of the request's extra_witnesses
+ if (srcdn->is_auth())
+ srcdn->list_replicas(witnesses);
+ else
+ witnesses.insert(srcdn->authority().first);
+ if (srcdnl->is_remote() && !srci->is_auth())
+ witnesses.insert(srci->authority().first);
+ destdn->list_replicas(witnesses);
+ if (destdnl->is_remote() && !oldin->is_auth())
+ witnesses.insert(oldin->authority().first);
+ dout(10) << " witnesses " << witnesses << ", have " << mdr->more()->witnessed << dendl;
+
+
+ // -- locks --
+
+ // srctrace items. this mirrors locks taken in rdlock_path_xlock_dentry
+ for (int i=0; i<(int)srctrace.size(); i++)
+ lov.add_rdlock(&srctrace[i]->lock);
+ lov.add_xlock(&srcdn->lock);
+ mds_rank_t srcdirauth = srcdir->authority().first;
+ if (srcdirauth != mds->get_nodeid()) {
+ dout(10) << " will remote_wrlock srcdir scatterlocks on mds." << srcdirauth << dendl;
+ lov.add_remote_wrlock(&srcdir->inode->filelock, srcdirauth);
+ lov.add_remote_wrlock(&srcdir->inode->nestlock, srcdirauth);
+ if (srci->is_dir())
+ lov.add_rdlock(&srci->dirfragtreelock);
+ } else {
+ lov.add_wrlock(&srcdir->inode->filelock);
+ lov.add_wrlock(&srcdir->inode->nestlock);
+ }
+ mds->locker->include_snap_rdlocks(srcdir->inode, lov);
+
+ // straydn?
+ if (straydn) {
+ lov.add_wrlock(&straydn->get_dir()->inode->filelock);
+ lov.add_wrlock(&straydn->get_dir()->inode->nestlock);
+ lov.add_xlock(&straydn->lock);
+ }
+
+ // xlock versionlock on dentries if there are witnesses.
+ // replicas can't see projected dentry linkages, and will get
+ // confused if we try to pipeline things.
+ if (!witnesses.empty()) {
+ // take xlock on all projected ancestor dentries for srcdn and destdn.
+ // this ensures the srcdn and destdn can be traversed to by the witnesses.
+ for (int i= 0; i<(int)srctrace.size(); i++) {
+ if (srctrace[i]->is_auth() && srctrace[i]->is_projected())
+ lov.add_xlock(&srctrace[i]->versionlock);
+ }
+ for (int i=0; i<(int)desttrace.size(); i++) {
+ if (desttrace[i]->is_auth() && desttrace[i]->is_projected())
+ lov.add_xlock(&desttrace[i]->versionlock);
+ }
+ // xlock srci and oldin's primary dentries, so witnesses can call
+ // open_remote_ino() with 'want_locked=true' when the srcdn or destdn
+ // is traversed.
+ if (srcdnl->is_remote())
+ lov.add_xlock(&srci->get_projected_parent_dn()->lock);
+ if (destdnl->is_remote())
+ lov.add_xlock(&oldin->get_projected_parent_dn()->lock);
+ }
+
+ // we need to update srci's ctime. xlock its least contended lock to do that...
+ lov.add_xlock(&srci->linklock);
+ lov.add_xlock(&srci->snaplock);
+
+ if (oldin) {
+ // xlock oldin (for nlink--)
+ lov.add_xlock(&oldin->linklock);
+ lov.add_xlock(&oldin->snaplock);
+ if (oldin->is_dir())
+ lov.add_rdlock(&oldin->filelock); // to verify it's empty
+ }
+
+ CInode *auth_pin_freeze = !srcdn->is_auth() && srcdnl->is_primary() ? srci : NULL; // srci only when srcdn is non-auth and primary
+ if (!mds->locker->acquire_locks(mdr, lov, auth_pin_freeze))
+ return;
+
+ if (linkmerge)
+ ceph_assert(srcdir->inode->is_stray() && srcdnl->is_primary() && destdnl->is_remote());
+
+ if ((!mdr->has_more() || mdr->more()->witnessed.empty())) { // checks run only while the request's witnessed set is empty
+ if (!check_access(mdr, srcdir->get_inode(), MAY_WRITE))
+ return;
+
+ if (!check_access(mdr, destdn->get_dir()->get_inode(), MAY_WRITE))
+ return;
+
+ if (!check_fragment_space(mdr, destdn->get_dir()))
+ return;
+
+ if (!check_access(mdr, srci, MAY_WRITE))
+ return;
+ }
+
+ // with read lock, really verify oldin is empty
+ if (oldin &&
+ oldin->is_dir() &&
+ _dir_is_nonempty(mdr, oldin)) {
+ respond_to_request(mdr, -ENOTEMPTY);
+ return;
+ }
+
+ /* project_snaprealm_past_parent() will do this job
+ *
+ // moving between snaprealms?
+ if (srcdnl->is_primary() && srci->is_multiversion() && !srci->snaprealm) {
+ SnapRealm *srcrealm = srci->find_snaprealm();
+ SnapRealm *destrealm = destdn->get_dir()->inode->find_snaprealm();
+ if (srcrealm != destrealm &&
+ (srcrealm->get_newest_seq() + 1 > srcdn->first ||
+ destrealm->get_newest_seq() + 1 > srcdn->first)) {
+ dout(10) << " renaming between snaprealms, creating snaprealm for " << *srci << dendl;
+ mdcache->snaprealm_create(mdr, srci);
+ return;
+ }
+ }
+ */
+
+ SnapRealm *dest_realm = nullptr;
+ SnapRealm *src_realm = nullptr;
+ if (!linkmerge) { // both realm pointers stay null for a link merge
+ dest_realm = destdir->inode->find_snaprealm();
+ if (srcdir->inode == destdir->inode)
+ src_realm = dest_realm;
+ else
+ src_realm = srcdir->inode->find_snaprealm();
+ if (src_realm != dest_realm &&
+ src_realm->get_subvolume_ino() != dest_realm->get_subvolume_ino()) { // different realms with different subvolume inos
+ respond_to_request(mdr, -EXDEV);
+ return;
+ }
+ }
+
+ ceph_assert(g_conf()->mds_kill_rename_at != 1);
+
+ // -- open all srcdn inode frags, if any --
+ // we need these open so that auth can properly delegate from inode to dirfrags
+ // after the inode is _ours_.
+ if (srcdnl->is_primary() &&
+ !srcdn->is_auth() &&
+ srci->is_dir()) {
+ dout(10) << "srci is remote dir, setting stickydirs and opening all frags" << dendl;
+ mdr->set_stickydirs(srci);
+
+ frag_vec_t leaves;
+ srci->dirfragtree.get_leaves(leaves);
+ for (const auto& leaf : leaves) {
+ CDir *dir = srci->get_dirfrag(leaf);
+ if (!dir) {
+ dout(10) << " opening " << leaf << " under " << *srci << dendl;
+ mdcache->open_remote_dirfrag(srci, leaf, new C_MDS_RetryRequest(mdcache, mdr));
+ return; // a C_MDS_RetryRequest was handed to open_remote_dirfrag as the completion
+ }
+ }
+ }
+
+ // -- prepare snaprealm ---
+
+ if (linkmerge) {
+ if (!mdr->more()->srci_srnode &&
+ srci->get_projected_inode()->nlink == 1 &&
+ srci->is_projected_snaprealm_global()) {
+ sr_t *new_srnode = srci->prepare_new_srnode(0);
+ srci->record_snaprealm_parent_dentry(new_srnode, nullptr, destdn, false);
+
+ srci->clear_snaprealm_global(new_srnode);
+ mdr->more()->srci_srnode = new_srnode;
+ }
+ } else {
+ if (oldin && !mdr->more()->desti_srnode) {
+ if (oldin->is_projected_snaprealm_global()) {
+ sr_t *new_srnode = oldin->prepare_new_srnode(0);
+ oldin->record_snaprealm_parent_dentry(new_srnode, dest_realm, destdn, destdnl->is_primary());
+ // dropping the last linkage or dropping the last remote linkage,
+ // detach the inode from global snaprealm
+ auto nlink = oldin->get_projected_inode()->nlink;
+ if (nlink == 1 ||
+ (nlink == 2 && !destdnl->is_primary() &&
+ !oldin->get_projected_parent_dir()->inode->is_stray()))
+ oldin->clear_snaprealm_global(new_srnode);
+ mdr->more()->desti_srnode = new_srnode;
+ } else if (destdnl->is_primary()) {
+ snapid_t follows = dest_realm->get_newest_seq();
+ if (oldin->snaprealm || follows + 1 > oldin->get_oldest_snap()) {
+ sr_t *new_srnode = oldin->prepare_new_srnode(follows);
+ oldin->record_snaprealm_past_parent(new_srnode, straydn->get_dir()->inode->find_snaprealm());
+ mdr->more()->desti_srnode = new_srnode;
+ }
+ }
+ }
+ if (!mdr->more()->srci_srnode) {
+ if (srci->is_projected_snaprealm_global()) {
+ sr_t *new_srnode = srci->prepare_new_srnode(0);
+ srci->record_snaprealm_parent_dentry(new_srnode, src_realm, srcdn, srcdnl->is_primary());
+ mdr->more()->srci_srnode = new_srnode;
+ } else if (srcdnl->is_primary()) {
+ snapid_t follows = src_realm->get_newest_seq();
+ if (src_realm != dest_realm &&
+ (srci->snaprealm || follows + 1 > srci->get_oldest_snap())) {
+ sr_t *new_srnode = srci->prepare_new_srnode(follows);
+ srci->record_snaprealm_past_parent(new_srnode, dest_realm);
+ mdr->more()->srci_srnode = new_srnode;
+ }
+ }
+ }
+ }
+
+ // -- prepare witnesses --
+
+ // do srcdn auth last
+ mds_rank_t last = MDS_RANK_NONE; // stays MDS_RANK_NONE when we are auth for srcdn
+ if (!srcdn->is_auth()) {
+ last = srcdn->authority().first;
+ mdr->more()->srcdn_auth_mds = last;
+ // ask auth of srci to mark srci as ambiguous auth if more than two MDS
+ // are involved in the rename operation.
+ if (srcdnl->is_primary() && !mdr->more()->is_ambiguous_auth) {
+ dout(10) << " preparing ambiguous auth for srci" << dendl;
+ ceph_assert(mdr->more()->is_remote_frozen_authpin);
+ ceph_assert(mdr->more()->rename_inode == srci);
+ _rename_prepare_witness(mdr, last, witnesses, srctrace, desttrace, straydn);
+ return;
+ }
+ }
+
+ for (set<mds_rank_t>::iterator p = witnesses.begin();
+ p != witnesses.end();
+ ++p) {
+ if (*p == last) continue; // do it last!
+ if (mdr->more()->witnessed.count(*p)) {
+ dout(10) << " already witnessed by mds." << *p << dendl;
+ } else if (mdr->more()->waiting_on_slave.count(*p)) {
+ dout(10) << " already waiting on witness mds." << *p << dendl;
+ } else {
+ if (!_rename_prepare_witness(mdr, *p, witnesses, srctrace, desttrace, straydn))
+ return;
+ }
+ }
+ if (!mdr->more()->waiting_on_slave.empty())
+ return; // we're waiting for a witness.
+
+ if (last != MDS_RANK_NONE && mdr->more()->witnessed.count(last) == 0) {
+ dout(10) << " preparing last witness (srcdn auth)" << dendl;
+ ceph_assert(mdr->more()->waiting_on_slave.count(last) == 0);
+ _rename_prepare_witness(mdr, last, witnesses, srctrace, desttrace, straydn);
+ return;
+ }
+
+ // test hack: bail after slave does prepare, so we can verify it's _live_ rollback.
+ if (!mdr->more()->slaves.empty() && !srci->is_dir())
+ ceph_assert(g_conf()->mds_kill_rename_at != 3);
+ if (!mdr->more()->slaves.empty() && srci->is_dir())
+ ceph_assert(g_conf()->mds_kill_rename_at != 4);
+
+ // -- declare now --
+ mdr->set_mds_stamp(ceph_clock_now());
+
+ // -- prepare journal entry --
+ mdr->ls = mdlog->get_current_segment();
+ EUpdate *le = new EUpdate(mdlog, "rename");
+ mdlog->start_entry(le);
+ le->metablob.add_client_req(mdr->reqid, mdr->client_request->get_oldest_client_tid());
+ if (!mdr->more()->witnessed.empty()) {
+ dout(20) << " noting uncommitted_slaves " << mdr->more()->witnessed << dendl;
+
+ le->reqid = mdr->reqid;
+ le->had_slaves = true;
+
+ mdcache->add_uncommitted_master(mdr->reqid, mdr->ls, mdr->more()->witnessed);
+ // no need to send frozen auth pin to recovering auth MDS of srci
+ mdr->more()->is_remote_frozen_authpin = false;
+ }
+
+ _rename_prepare(mdr, &le->metablob, &le->client_map, srcdn, destdn, straydn);
+ if (le->client_map.length()) // record the projected sessionmap version only when a client map was encoded
+ le->cmapv = mds->sessionmap.get_projected();
+
+ // -- commit locally --
+ C_MDS_rename_finish *fin = new C_MDS_rename_finish(this, mdr, srcdn, destdn, straydn);
+
+ journal_and_reply(mdr, srci, destdn, le, fin);
+ mds->balancer->maybe_fragment(destdn->get_dir(), false);
+}
+
+
+void Server::_rename_finish(MDRequestRef& mdr, CDentry *srcdn, CDentry *destdn, CDentry *straydn)
+{
+  // Final step of a rename on this MDS: apply the projected changes, send
+  // the dentry link for destdn, then reply to the request.
+  dout(10) << "_rename_finish " << *mdr << dendl;
+
+  const bool had_witnesses = !mdr->more()->witnessed.empty();
+  if (had_witnesses) {
+    mdcache->logged_master_update(mdr->reqid);
+  }
+
+  // apply, then announce the new link
+  _rename_apply(mdr, srcdn, destdn, straydn);
+  mdcache->send_dentry_link(destdn, mdr);
+
+  // inspect the destination linkage as it stands after the apply
+  CDentry::linkage_t *applied = destdn->get_linkage();
+  CInode *renamed = applied->get_inode();
+  const bool reeval_caps = mdr->more()->cap_imports.count(renamed) > 0;
+
+  // test hack: test slave commit (5 for non-directories, 6 for directories)
+  if (!mdr->more()->slaves.empty()) {
+    if (renamed->is_dir()) {
+      ceph_assert(g_conf()->mds_kill_rename_at != 6);
+    } else {
+      ceph_assert(g_conf()->mds_kill_rename_at != 5);
+    }
+  }
+
+  // bump popularity
+  mds->balancer->hit_dir(srcdn->get_dir(), META_POP_IWR);
+  if (applied->is_remote() && renamed->is_auth()) {
+    mds->balancer->hit_inode(renamed, META_POP_IWR);
+  }
+
+  // did we import srci? if so, explicitly ack that import, before we unlock and reply.
+
+  ceph_assert(g_conf()->mds_kill_rename_at != 7);
+
+  // reply
+  respond_to_request(mdr, 0);
+
+  if (reeval_caps) {
+    mds->locker->eval(renamed, CEPH_CAP_LOCKS, true);
+  }
+
+  // clean up?
+  // respond_to_request() drops locks, so stray reintegration can race with us;
+  // only notify when the stray dentry still has a non-null projected linkage.
+  if (!straydn) {
+    return;
+  }
+  if (straydn->get_projected_linkage()->is_null()) {
+    return;
+  }
+  mdcache->notify_stray(straydn);
+}
+
+
+
+// helpers
+
+bool Server::_rename_prepare_witness(MDRequestRef& mdr, mds_rank_t who, set<mds_rank_t> &witnesse,
+                                     vector<CDentry*>& srctrace, vector<CDentry*>& dsttrace, CDentry *straydn)
+{
+  // Build an OP_RENAMEPREP slave request and send it to mds.<who>.
+  // Returns false without sending when the cluster is degraded and <who> is
+  // not in a clientreplay/active/stopping state; returns true after the
+  // message is sent and <who> has been added to waiting_on_slave.
+  const bool peer_unavailable =
+    mds->is_cluster_degraded() &&
+    !mds->mdsmap->is_clientreplay_or_active_or_stopping(who);
+  if (peer_unavailable) {
+    dout(10) << "_rename_prepare_witness mds." << who << " is not active" << dendl;
+    // register a retry only when no other witness reply is outstanding
+    if (mdr->more()->waiting_on_slave.empty()) {
+      mds->wait_for_active_peer(who, new C_MDS_RetryRequest(mdcache, mdr));
+    }
+    return false;
+  }
+
+  dout(10) << "_rename_prepare_witness mds." << who << dendl;
+  auto prep = MMDSSlaveRequest::create(mdr->reqid, mdr->attempt, MMDSSlaveRequest::OP_RENAMEPREP);
+
+  // each path starts at the ino of the dir holding the trace's first dentry,
+  // followed by the name of every dentry in the trace
+  prep->srcdnpath = filepath(srctrace.front()->get_dir()->ino());
+  for (auto it = srctrace.begin(); it != srctrace.end(); ++it) {
+    prep->srcdnpath.push_dentry((*it)->get_name());
+  }
+  prep->destdnpath = filepath(dsttrace.front()->get_dir()->ino());
+  for (auto it = dsttrace.begin(); it != dsttrace.end(); ++it) {
+    prep->destdnpath.push_dentry((*it)->get_name());
+  }
+
+  if (straydn) {
+    mdcache->replicate_stray(straydn, who, prep->straybl);
+  }
+
+  // ship any prepared snaprealm nodes along with the request
+  if (auto *src_srnode = mdr->more()->srci_srnode) {
+    encode(*src_srnode, prep->srci_snapbl);
+  }
+  if (auto *dest_srnode = mdr->more()->desti_srnode) {
+    encode(*dest_srnode, prep->desti_snapbl);
+  }
+
+  prep->srcdn_auth = mdr->more()->srcdn_auth_mds;
+
+  // srcdn auth will verify our current witness list is sufficient
+  prep->witnesses = witnesse;
+
+  prep->op_stamp = mdr->get_op_stamp();
+  mds->send_message_mds(prep, who);
+
+  // <who> must not already be pending; mark it as pending now
+  ceph_assert(mdr->more()->waiting_on_slave.count(who) == 0);
+  mdr->more()->waiting_on_slave.insert(who);
+  return true;
+}
+
+version_t Server::_rename_prepare_import(MDRequestRef& mdr, CDentry *srcdn, bufferlist *client_map_bl)
+{
+  // Decode the inode import blob stored on the request, re-encode the client
+  // maps into *client_map_bl, and return the inode_import_v value that was
+  // recorded on the request before the import was decoded.
+  const version_t prior_pv = mdr->more()->inode_import_v;
+
+  CDentry::linkage_t *src_linkage = srcdn->get_linkage();
+
+  /* import node */
+  auto cursor = mdr->more()->inode_import.cbegin();
+
+  // imported caps
+  map<client_t,entity_inst_t> clients;
+  map<client_t, client_metadata_t> clients_metadata;
+  decode(clients, cursor);
+  decode(clients_metadata, cursor);
+  prepare_force_open_sessions(clients, clients_metadata,
+                              mdr->more()->imported_session_map);
+  encode(clients, *client_map_bl, mds->mdsmap->get_up_features());
+  encode(clients_metadata, *client_map_bl);
+
+  list<ScatterLock*> scatterlocks_touched;
+  mdcache->migrator->decode_import_inode(srcdn, cursor, srcdn->authority().first, mdr->ls,
+                                         mdr->more()->cap_imports, scatterlocks_touched);
+
+  // hack: force back to !auth and clean, temporarily
+  CInode *imported_in = src_linkage->get_inode();
+  imported_in->state_clear(CInode::STATE_AUTH);
+  imported_in->mark_clean();
+
+  return prior_pv;
+}
+
+bool Server::_need_force_journal(CInode *diri, bool empty)
+{
+  // Decide whether a rename touching directory inode diri must be journaled.
+  //  - empty == true:  true if one of diri's dirfrags is itself a subtree
+  //    root whose dir_auth first rank is this MDS.
+  //  - empty == false: true if one of diri's dirfrags contains a subtree
+  //    whose dir_auth first rank is this MDS.
+  const mds_rank_t whoami = mds->get_nodeid();
+
+  std::vector<CDir*> frags;
+  diri->get_dirfrags(frags);
+
+  if (empty) {
+    for (CDir *frag : frags) {
+      if (frag->is_subtree_root() && frag->get_dir_auth().first == whoami) {
+        dout(10) << " frag " << frag->get_frag() << " is auth subtree dirfrag, will force journal" << dendl;
+        return true;
+      }
+      dout(20) << " frag " << frag->get_frag() << " is not auth subtree dirfrag" << dendl;
+    }
+    return false;
+  }
+
+  // see if any children of our frags are auth subtrees.
+  std::vector<CDir*> roots;
+  mdcache->get_subtrees(roots);
+  dout(10) << " subtrees " << roots << " frags " << frags << dendl;
+  for (CDir *frag : frags) {
+    for (CDir *root : roots) {
+      if (!frag->contains(root)) {
+        dout(20) << " frag " << frag->get_frag() << " does not contain " << *root << dendl;
+        continue;
+      }
+      if (root->get_dir_auth().first == whoami) {
+        dout(10) << " frag " << frag->get_frag() << " contains (maybe) auth subtree, will force journal "
+                 << *root << dendl;
+        return true;
+      }
+      dout(20) << " frag " << frag->get_frag() << " contains but isn't auth for " << *root << dendl;
+    }
+  }
+  return false;
+}
+
+void Server::_rename_prepare(MDRequestRef& mdr,
+ EMetaBlob *metablob, bufferlist *client_map_bl,
+ CDentry *srcdn, CDentry *destdn, CDentry *straydn)
+{ // project the rename onto srcdn/destdn/straydn and their inodes, and add the matching entries to metablob
+ dout(10) << "_rename_prepare " << *mdr << " " << *srcdn << " " << *destdn << dendl;
+ if (straydn)
+ dout(10) << " straydn " << *straydn << dendl;
+
+ CDentry::linkage_t *srcdnl = srcdn->get_projected_linkage();
+ CDentry::linkage_t *destdnl = destdn->get_projected_linkage();
+ CInode *srci = srcdnl->get_inode();
+ CInode *oldin = destdnl->get_inode();
+
+ // primary+remote link merge?
+ bool linkmerge = (srci == oldin);
+ if (linkmerge)
+ ceph_assert(srcdnl->is_primary() && destdnl->is_remote());
+ bool silent = srcdn->get_dir()->inode->is_stray(); // true when the source directory's inode is a stray dir
+
+ bool force_journal_dest = false;
+ if (srci->is_dir() && !destdn->is_auth()) {
+ if (srci->is_auth()) {
+ // if we are auth for srci and exporting it, force journal because journal replay needs
+ // the source inode to create auth subtrees.
+ dout(10) << " we are exporting srci, will force journal destdn" << dendl;
+ force_journal_dest = true;
+ } else
+ force_journal_dest = _need_force_journal(srci, false);
+ }
+
+ bool force_journal_stray = false;
+ if (oldin && oldin->is_dir() && straydn && !straydn->is_auth())
+ force_journal_stray = _need_force_journal(oldin, true);
+
+ if (linkmerge)
+ dout(10) << " merging remote and primary links to the same inode" << dendl;
+ if (silent)
+ dout(10) << " reintegrating stray; will avoid changing nlink or dir mtime" << dendl;
+ if (force_journal_dest)
+ dout(10) << " forcing journal destdn because we (will) have auth subtrees nested beneath it" << dendl;
+ if (force_journal_stray)
+ dout(10) << " forcing journal straydn because we (will) have auth subtrees nested beneath it" << dendl;
+
+ if (srci->is_dir() && (destdn->is_auth() || force_journal_dest)) {
+ dout(10) << " noting renamed dir ino " << srci->ino() << " in metablob" << dendl;
+ metablob->renamed_dirino = srci->ino();
+ } else if (oldin && oldin->is_dir() && force_journal_stray) {
+ dout(10) << " noting rename target dir " << oldin->ino() << " in metablob" << dendl;
+ metablob->renamed_dirino = oldin->ino();
+ }
+
+ // prepare
+ CInode::mempool_inode *spi = 0; // renamed inode
+ CInode::mempool_inode *tpi = 0; // target/overwritten inode
+
+ // target inode
+ if (!linkmerge) {
+ if (destdnl->is_primary()) {
+ ceph_assert(straydn); // moving to straydn.
+ // link--, and move.
+ if (destdn->is_auth()) {
+ auto &pi= oldin->project_inode(); //project_snaprealm
+ pi.inode.version = straydn->pre_dirty(pi.inode.version);
+ pi.inode.update_backtrace();
+ tpi = &pi.inode;
+ }
+ straydn->push_projected_linkage(oldin);
+ } else if (destdnl->is_remote()) {
+ // nlink-- targeti
+ if (oldin->is_auth()) {
+ auto &pi = oldin->project_inode();
+ pi.inode.version = oldin->pre_dirty();
+ tpi = &pi.inode;
+ }
+ }
+ }
+
+ // dest
+ if (srcdnl->is_remote()) {
+ if (!linkmerge) {
+ // destdn
+ if (destdn->is_auth())
+ mdr->more()->pvmap[destdn] = destdn->pre_dirty();
+ destdn->push_projected_linkage(srcdnl->get_remote_ino(), srcdnl->get_remote_d_type());
+ // srci
+ if (srci->is_auth()) {
+ auto &pi = srci->project_inode();
+ pi.inode.version = srci->pre_dirty();
+ spi = &pi.inode;
+ }
+ } else {
+ dout(10) << " will merge remote onto primary link" << dendl;
+ if (destdn->is_auth()) {
+ auto &pi = oldin->project_inode();
+ pi.inode.version = mdr->more()->pvmap[destdn] = destdn->pre_dirty(oldin->inode.version);
+ spi = &pi.inode;
+ }
+ }
+ } else { // primary
+ if (destdn->is_auth()) {
+ version_t oldpv;
+ if (srcdn->is_auth())
+ oldpv = srci->get_projected_version();
+ else {
+ oldpv = _rename_prepare_import(mdr, srcdn, client_map_bl);
+
+ // note which dirfrags have child subtrees in the journal
+ // event, so that we can open those (as bounds) during replay.
+ if (srci->is_dir()) {
+ list<CDir*> ls;
+ srci->get_dirfrags(ls);
+ for (list<CDir*>::iterator p = ls.begin(); p != ls.end(); ++p) {
+ CDir *dir = *p;
+ if (!dir->is_auth())
+ metablob->renamed_dir_frags.push_back(dir->get_frag());
+ }
+ dout(10) << " noting renamed dir open frags " << metablob->renamed_dir_frags << dendl;
+ }
+ }
+ auto &pi = srci->project_inode(); // project snaprealm if srcdnl->is_primary
+ // & srcdnl->snaprealm
+ pi.inode.version = mdr->more()->pvmap[destdn] = destdn->pre_dirty(oldpv);
+ pi.inode.update_backtrace();
+ spi = &pi.inode;
+ }
+ destdn->push_projected_linkage(srci);
+ }
+
+ // src
+ if (srcdn->is_auth())
+ mdr->more()->pvmap[srcdn] = srcdn->pre_dirty();
+ srcdn->push_projected_linkage(); // push null linkage
+
+ if (!silent) { // ctime/rctime/change_attr/nlink updates below are skipped when silent
+ if (spi) {
+ spi->ctime = mdr->get_op_stamp();
+ if (mdr->get_op_stamp() > spi->rstat.rctime)
+ spi->rstat.rctime = mdr->get_op_stamp();
+ spi->change_attr++;
+ if (linkmerge)
+ spi->nlink--;
+ }
+ if (tpi) {
+ tpi->ctime = mdr->get_op_stamp();
+ if (mdr->get_op_stamp() > tpi->rstat.rctime)
+ tpi->rstat.rctime = mdr->get_op_stamp();
+ tpi->change_attr++;
+ {
+ std::string t;
+ destdn->make_path_string(t, true);
+ tpi->stray_prior_path = std::move(t); // remember the path string built from destdn
+ }
+ tpi->nlink--;
+ if (tpi->nlink == 0)
+ oldin->state_set(CInode::STATE_ORPHAN);
+ }
+ }
+
+ // prepare nesting, mtime updates
+ int predirty_dir = silent ? 0:PREDIRTY_DIR;
+
+ // guarantee stray dir is processed first during journal replay. unlink the old inode,
+ // then link the source inode to destdn
+ if (destdnl->is_primary()) {
+ ceph_assert(straydn);
+ if (straydn->is_auth()) {
+ metablob->add_dir_context(straydn->get_dir());
+ metablob->add_dir(straydn->get_dir(), true);
+ }
+ }
+
+ // sub off target
+ if (destdn->is_auth() && !destdnl->is_null()) {
+ mdcache->predirty_journal_parents(mdr, metablob, oldin, destdn->get_dir(),
+ (destdnl->is_primary() ? PREDIRTY_PRIMARY:0)|predirty_dir, -1);
+ if (destdnl->is_primary()) {
+ ceph_assert(straydn);
+ mdcache->predirty_journal_parents(mdr, metablob, oldin, straydn->get_dir(),
+ PREDIRTY_PRIMARY|PREDIRTY_DIR, 1);
+ }
+ }
+
+ // move srcdn
+ int predirty_primary = (srcdnl->is_primary() && srcdn->get_dir() != destdn->get_dir()) ? PREDIRTY_PRIMARY:0; // only for a primary link moving to a different dir
+ int flags = predirty_dir | predirty_primary;
+ if (srcdn->is_auth())
+ mdcache->predirty_journal_parents(mdr, metablob, srci, srcdn->get_dir(), PREDIRTY_SHALLOW|flags, -1);
+ if (destdn->is_auth())
+ mdcache->predirty_journal_parents(mdr, metablob, srci, destdn->get_dir(), flags, 1);
+
+ // add it all to the metablob
+ // target inode
+ if (!linkmerge) {
+ if (destdnl->is_primary()) {
+ ceph_assert(straydn);
+ if (destdn->is_auth()) {
+ // project snaprealm, too
+ if (auto& desti_srnode = mdr->more()->desti_srnode) {
+ oldin->project_snaprealm(desti_srnode);
+ if (tpi->nlink == 0)
+ ceph_assert(!desti_srnode->is_parent_global());
+ desti_srnode = NULL; // clear the request's pointer once the srnode was passed to project_snaprealm
+ }
+ straydn->first = mdcache->get_global_snaprealm()->get_newest_seq() + 1;
+ metablob->add_primary_dentry(straydn, oldin, true, true);
+ } else if (force_journal_stray) {
+ dout(10) << " forced journaling straydn " << *straydn << dendl;
+ metablob->add_dir_context(straydn->get_dir());
+ metablob->add_primary_dentry(straydn, oldin, true);
+ }
+ } else if (destdnl->is_remote()) {
+ if (oldin->is_auth()) {
+ sr_t *new_srnode = NULL;
+ if (mdr->slave_request) { // with a slave request, the srnode is decoded from its desti_snapbl (if non-empty)
+ if (mdr->slave_request->desti_snapbl.length() > 0) {
+ new_srnode = new sr_t();
+ auto p = mdr->slave_request->desti_snapbl.cbegin();
+ decode(*new_srnode, p);
+ }
+ } else if (auto& desti_srnode = mdr->more()->desti_srnode) {
+ new_srnode = desti_srnode;
+ desti_srnode = NULL;
+ }
+ if (new_srnode) {
+ oldin->project_snaprealm(new_srnode);
+ if (tpi->nlink == 0)
+ ceph_assert(!new_srnode->is_parent_global());
+ }
+ // auth for targeti
+ metablob->add_dir_context(oldin->get_projected_parent_dir());
+ mdcache->journal_cow_dentry(mdr.get(), metablob, oldin->get_projected_parent_dn(),
+ CEPH_NOSNAP, 0, destdnl);
+ metablob->add_primary_dentry(oldin->get_projected_parent_dn(), oldin, true);
+ }
+ }
+ }
+
+ // dest
+ if (srcdnl->is_remote()) {
+ ceph_assert(!linkmerge);
+ if (destdn->is_auth() && !destdnl->is_null())
+ mdcache->journal_cow_dentry(mdr.get(), metablob, destdn, CEPH_NOSNAP, 0, destdnl);
+ else
+ destdn->first = mdcache->get_global_snaprealm()->get_newest_seq() + 1;
+
+ if (destdn->is_auth())
+ metablob->add_remote_dentry(destdn, true, srcdnl->get_remote_ino(), srcdnl->get_remote_d_type());
+
+ if (srci->is_auth() ) { // it's remote
+ if (mdr->slave_request) {
+ if (mdr->slave_request->srci_snapbl.length() > 0) {
+ sr_t *new_srnode = new sr_t();
+ auto p = mdr->slave_request->srci_snapbl.cbegin();
+ decode(*new_srnode, p);
+ srci->project_snaprealm(new_srnode);
+ }
+ } else if (auto& srci_srnode = mdr->more()->srci_srnode) {
+ srci->project_snaprealm(srci_srnode);
+ srci_srnode = NULL;
+ }
+
+ CDentry *srci_pdn = srci->get_projected_parent_dn();
+ metablob->add_dir_context(srci_pdn->get_dir());
+ mdcache->journal_cow_dentry(mdr.get(), metablob, srci_pdn, CEPH_NOSNAP, 0, srcdnl);
+ metablob->add_primary_dentry(srci_pdn, srci, true);
+ }
+ } else if (srcdnl->is_primary()) {
+ // project snap parent update?
+ if (destdn->is_auth()) {
+ if (auto& srci_srnode = mdr->more()->srci_srnode) {
+ srci->project_snaprealm(srci_srnode);
+ srci_srnode = NULL;
+ }
+ }
+
+ if (destdn->is_auth() && !destdnl->is_null())
+ mdcache->journal_cow_dentry(mdr.get(), metablob, destdn, CEPH_NOSNAP, 0, destdnl);
+
+ destdn->first = mdcache->get_global_snaprealm()->get_newest_seq() + 1;
+
+ if (destdn->is_auth())
+ metablob->add_primary_dentry(destdn, srci, true, true);
+ else if (force_journal_dest) {
+ dout(10) << " forced journaling destdn " << *destdn << dendl;
+ metablob->add_dir_context(destdn->get_dir());
+ metablob->add_primary_dentry(destdn, srci, true);
+ if (srcdn->is_auth() && srci->is_dir()) {
+ // journal new subtrees root dirfrags
+ list<CDir*> ls;
+ srci->get_dirfrags(ls);
+ for (list<CDir*>::iterator p = ls.begin(); p != ls.end(); ++p) {
+ CDir *dir = *p;
+ if (dir->is_auth())
+ metablob->add_dir(dir, true);
+ }
+ }
+ }
+ }
+
+ // src
+ if (srcdn->is_auth()) {
+ dout(10) << " journaling srcdn " << *srcdn << dendl;
+ mdcache->journal_cow_dentry(mdr.get(), metablob, srcdn, CEPH_NOSNAP, 0, srcdnl);
+ // also journal the inode in case we need to do slave rename rollback. It is OK to add
+ // both primary and NULL dentries, because during journal replay the null dentry is
+ // processed after the primary dentry.
+ if (srcdnl->is_primary() && !srci->is_dir() && !destdn->is_auth())
+ metablob->add_primary_dentry(srcdn, srci, true);
+ metablob->add_null_dentry(srcdn, true);
+ } else
+ dout(10) << " NOT journaling srcdn " << *srcdn << dendl;
+
+ // make renamed inode first track the dn
+ if (srcdnl->is_primary() && destdn->is_auth()) {
+ ceph_assert(srci->first <= destdn->first);
+ srci->first = destdn->first;
+ }
+ // make stray inode first track the straydn
+ if (straydn && straydn->is_auth()) {
+ ceph_assert(oldin->first <= straydn->first);
+ oldin->first = straydn->first;
+ }
+
+ if (oldin && oldin->is_dir()) {
+ ceph_assert(straydn);
+ mdcache->project_subtree_rename(oldin, destdn->get_dir(), straydn->get_dir());
+ }
+ if (srci->is_dir())
+ mdcache->project_subtree_rename(srci, srcdn->get_dir(), destdn->get_dir());
+
+}
+
+ /**
+  * Apply a rename to the in-memory cache.
+  *
+  * In order: unlinks any overwritten target from destdn and pops straydn's
+  * projected linkage, unlinks the source inode from srcdn, pops destdn's
+  * projected linkage so the source inode is linked there, pops/dirties the
+  * projected inodes and dentries this MDS is auth for, and finally adjusts
+  * the subtree map and sends snaprealm split notifications. When the source
+  * inode is being imported (srcdn not auth, destdn auth) the cap import is
+  * finished here as well.
+  *
+  * @param mdr     request whose more() state (pvmap, inode_import, cap_imports,
+  *                srci/desti srnode) and slave_request are consulted
+  * @param srcdn   dentry the inode is renamed from
+  * @param destdn  dentry the inode is renamed to
+  * @param straydn stray dentry for an overwritten primary target; asserted
+  *                non-null when destdn holds a primary link and this is not
+  *                a link merge
+  */
+ void Server::_rename_apply(MDRequestRef& mdr, CDentry *srcdn, CDentry *destdn, CDentry *straydn)
+ {
+ dout(10) << "_rename_apply " << *mdr << " " << *srcdn << " " << *destdn << dendl;
+ dout(10) << " pvs " << mdr->more()->pvmap << dendl;
+
+ CDentry::linkage_t *srcdnl = srcdn->get_linkage();
+ CDentry::linkage_t *destdnl = destdn->get_linkage();
+
+ CInode *oldin = destdnl->get_inode(); // inode currently linked at destdn (may be null)
+
+ // primary+remote link merge? (srcdn and destdn already reference the same inode)
+ bool linkmerge = (srcdnl->get_inode() == oldin);
+ if (linkmerge)
+ ceph_assert(srcdnl->is_primary() || destdnl->is_remote());
+
+ bool new_in_snaprealm = false; // set when the source inode gains a snaprealm here
+ bool new_oldin_snaprealm = false; // set when the overwritten inode gains a snaprealm here
+
+ // target inode
+ if (!linkmerge) {
+ if (destdnl->is_primary()) {
+ ceph_assert(straydn);
+ dout(10) << "straydn is " << *straydn << dendl;
+
+ // if there is newly created snaprealm, need to split old snaprealm's
+ // inodes_with_caps. So pop snaprealm before linkage changes.
+ if (destdn->is_auth()) {
+ bool hadrealm = (oldin->snaprealm ? true : false);
+ oldin->early_pop_projected_snaprealm();
+ new_oldin_snaprealm = (oldin->snaprealm && !hadrealm);
+ } else {
+ ceph_assert(mdr->slave_request);
+ if (mdr->slave_request->desti_snapbl.length()) {
+ new_oldin_snaprealm = !oldin->snaprealm;
+ oldin->decode_snap_blob(mdr->slave_request->desti_snapbl);
+ ceph_assert(oldin->snaprealm);
+ ceph_assert(oldin->snaprealm->have_past_parents_open());
+ }
+ }
+
+ destdn->get_dir()->unlink_inode(destdn, false);
+
+ straydn->pop_projected_linkage();
+ if (mdr->is_slave() && !mdr->more()->slave_update_journaled)
+ ceph_assert(!straydn->is_projected()); // no other projected
+
+ // nlink-- targeti
+ if (destdn->is_auth())
+ oldin->pop_and_dirty_projected_inode(mdr->ls);
+
+ mdcache->touch_dentry_bottom(straydn); // drop dn as quickly as possible.
+ } else if (destdnl->is_remote()) {
+ destdn->get_dir()->unlink_inode(destdn, false);
+ if (oldin->is_auth()) {
+ oldin->pop_and_dirty_projected_inode(mdr->ls);
+ } else if (mdr->slave_request) {
+ if (mdr->slave_request->desti_snapbl.length() > 0) {
+ ceph_assert(oldin->snaprealm);
+ oldin->decode_snap_blob(mdr->slave_request->desti_snapbl);
+ }
+ } else if (auto& desti_srnode = mdr->more()->desti_srnode) {
+ delete desti_srnode; // not auth and not a slave: the prepared srnode is discarded
+ desti_srnode = NULL;
+ }
+ }
+ }
+
+ // unlink src before we relink it at dest
+ CInode *in = srcdnl->get_inode();
+ ceph_assert(in);
+
+ bool srcdn_was_remote = srcdnl->is_remote(); // captured before srcdn is unlinked below
+ if (!srcdn_was_remote) {
+ // if there is newly created snaprealm, need to split old snaprealm's
+ // inodes_with_caps. So pop snaprealm before linkage changes.
+ if (destdn->is_auth()) {
+ bool hadrealm = (in->snaprealm ? true : false);
+ in->early_pop_projected_snaprealm();
+ new_in_snaprealm = (in->snaprealm && !hadrealm);
+ } else {
+ ceph_assert(mdr->slave_request);
+ if (mdr->slave_request->srci_snapbl.length()) {
+ new_in_snaprealm = !in->snaprealm;
+ in->decode_snap_blob(mdr->slave_request->srci_snapbl);
+ ceph_assert(in->snaprealm);
+ ceph_assert(in->snaprealm->have_past_parents_open());
+ }
+ }
+ }
+
+ srcdn->get_dir()->unlink_inode(srcdn);
+
+ // dest
+ if (srcdn_was_remote) {
+ if (!linkmerge) {
+ // destdn
+ destdnl = destdn->pop_projected_linkage();
+ if (mdr->is_slave() && !mdr->more()->slave_update_journaled)
+ ceph_assert(!destdn->is_projected()); // no other projected
+
+ destdn->link_remote(destdnl, in);
+ if (destdn->is_auth())
+ destdn->mark_dirty(mdr->more()->pvmap[destdn], mdr->ls);
+ // in
+ if (in->is_auth()) {
+ in->pop_and_dirty_projected_inode(mdr->ls);
+ } else if (mdr->slave_request) {
+ if (mdr->slave_request->srci_snapbl.length() > 0) {
+ ceph_assert(in->snaprealm);
+ in->decode_snap_blob(mdr->slave_request->srci_snapbl);
+ }
+ } else if (auto& srci_srnode = mdr->more()->srci_srnode) {
+ delete srci_srnode; // not auth and not a slave: the prepared srnode is discarded
+ srci_srnode = NULL;
+ }
+ } else {
+ dout(10) << "merging remote onto primary link" << dendl;
+ oldin->pop_and_dirty_projected_inode(mdr->ls);
+ }
+ } else { // primary
+ if (linkmerge) {
+ dout(10) << "merging primary onto remote link" << dendl;
+ destdn->get_dir()->unlink_inode(destdn, false);
+ }
+ destdnl = destdn->pop_projected_linkage();
+ if (mdr->is_slave() && !mdr->more()->slave_update_journaled)
+ ceph_assert(!destdn->is_projected()); // no other projected
+
+ // srcdn inode import?
+ if (!srcdn->is_auth() && destdn->is_auth()) {
+ ceph_assert(mdr->more()->inode_import.length() > 0);
+
+ map<client_t,Capability::Import> imported_caps;
+
+ // finish cap imports
+ finish_force_open_sessions(mdr->more()->imported_session_map);
+ if (mdr->more()->cap_imports.count(destdnl->get_inode())) {
+ mdcache->migrator->finish_import_inode_caps(destdnl->get_inode(),
+ mdr->more()->srcdn_auth_mds, true,
+ mdr->more()->imported_session_map,
+ mdr->more()->cap_imports[destdnl->get_inode()],
+ imported_caps);
+ }
+
+ mdr->more()->inode_import.clear(); // buffer is reused: it now carries the imported caps
+ encode(imported_caps, mdr->more()->inode_import);
+
+ /* hack: add an auth pin for each xlock we hold. These were
+ * remote xlocks previously but now they're local and
+ * we're going to try and unpin when we xlock_finish. */
+
+ for (auto i = mdr->locks.lower_bound(&destdnl->get_inode()->versionlock);
+ i != mdr->locks.end();
+ ++i) {
+ SimpleLock *lock = i->lock;
+ if (lock->get_parent() != destdnl->get_inode())
+ break;
+ if (i->is_xlock() && !lock->is_locallock())
+ mds->locker->xlock_import(lock);
+ }
+
+ // hack: fix auth bit
+ in->state_set(CInode::STATE_AUTH);
+
+ mdr->clear_ambiguous_auth();
+ }
+
+ if (destdn->is_auth())
+ in->pop_and_dirty_projected_inode(mdr->ls);
+ }
+
+ // src
+ if (srcdn->is_auth())
+ srcdn->mark_dirty(mdr->more()->pvmap[srcdn], mdr->ls);
+ srcdn->pop_projected_linkage();
+ if (mdr->is_slave() && !mdr->more()->slave_update_journaled)
+ ceph_assert(!srcdn->is_projected()); // no other projected
+
+ // apply remaining projected inodes (nested)
+ mdr->apply();
+
+ // update subtree map?
+ if (destdnl->is_primary() && in->is_dir())
+ mdcache->adjust_subtree_after_rename(in, srcdn->get_dir(), true);
+
+ if (straydn && oldin->is_dir())
+ mdcache->adjust_subtree_after_rename(oldin, destdn->get_dir(), true);
+
+ // snaprealms that first appeared during this apply are announced as splits
+ if (new_oldin_snaprealm)
+ mdcache->do_realm_invalidate_and_update_notify(oldin, CEPH_SNAP_OP_SPLIT, false);
+ if (new_in_snaprealm)
+ mdcache->do_realm_invalidate_and_update_notify(in, CEPH_SNAP_OP_SPLIT, true);
+
+ // removing a new dn?
+ if (srcdn->is_auth())
+ srcdn->get_dir()->try_remove_unlinked_dn(srcdn);
+ }
+
+
+
+// ------------
+// SLAVE
+
+ // Completion context handed to submit_mdlog_entry() by
+ // handle_slave_rename_prep(); finish() forwards the saved dentries to
+ // Server::_logged_slave_rename().
+ class C_MDS_SlaveRenamePrep : public ServerLogContext {
+   CDentry *srcdn;
+   CDentry *destdn;
+   CDentry *straydn;
+ public:
+   C_MDS_SlaveRenamePrep(Server *srv, MDRequestRef& req,
+                         CDentry *src, CDentry *dest, CDentry *stray)
+     : ServerLogContext(srv, req),
+       srcdn(src),
+       destdn(dest),
+       straydn(stray)
+   {}
+
+   // The completion code is not consulted.
+   void finish(int r) override {
+     server->_logged_slave_rename(mdr, srcdn, destdn, straydn);
+   }
+ };
+
+ // Commit/abort waiter installed as mdr->more()->slave_commit by
+ // handle_slave_rename_prep(); finish(r) passes r and the saved dentries to
+ // Server::_commit_slave_rename().
+ class C_MDS_SlaveRenameCommit : public ServerContext {
+   MDRequestRef mdr;
+   CDentry *srcdn;
+   CDentry *destdn;
+   CDentry *straydn;
+ public:
+   C_MDS_SlaveRenameCommit(Server *srv, MDRequestRef& req,
+                           CDentry *src, CDentry *dest, CDentry *stray)
+     : ServerContext(srv),
+       mdr(req),
+       srcdn(src),
+       destdn(dest),
+       straydn(stray)
+   {}
+
+   void finish(int r) override {
+     server->_commit_slave_rename(mdr, r, srcdn, destdn, straydn);
+   }
+ };
+
+ // Finisher for the client-session flush gather started in
+ // handle_slave_rename_prep(); finish() calls
+ // Server::_slave_rename_sessions_flushed() for the saved request.
+ class C_MDS_SlaveRenameSessionsFlushed : public ServerContext {
+   MDRequestRef mdr;
+ public:
+   C_MDS_SlaveRenameSessionsFlushed(Server *srv, MDRequestRef& req)
+     : ServerContext(srv),
+       mdr(req)
+   {}
+
+   // The completion code is not consulted.
+   void finish(int r) override {
+     server->_slave_rename_sessions_flushed(mdr);
+   }
+ };
+ /**
+  * Slave-side handling of a rename prepare request (mdr->slave_request).
+  *
+  * Flow, as implemented below:
+  *  - an interrupted request is answered with an OP_RENAMEPREPACK marked
+  *    interrupted;
+  *  - destdn and srcdn are resolved with path_traverse() and pinned;
+  *  - a C_MDS_SlaveRenameCommit waiter is installed if none exists yet;
+  *  - if this MDS is auth for srcdn and it is an unambiguous primary link, the
+  *    source inode is frozen and marked ambiguous-auth, srcdn replicas are
+  *    sent OP_RENAMENOTIFY and sessions of cap-holding clients are flushed;
+  *  - if the request's witness list does not cover every srcdn replica (or
+  *    more than one witness is involved in the freeze case), the replica list
+  *    is returned in an OP_RENAMEPREPACK and nothing is journaled;
+  *  - otherwise a rename_rollback record is encoded into
+  *    mdr->more()->rollback_bl, _rename_prepare() fills an ESlaveUpdate, and
+  *    _logged_slave_rename() runs once the entry is submitted and logged, or
+  *    immediately when the metablob is empty.
+  *
+  * Returns early without replying while a traverse, inode freeze, notify ack
+  * or session flush is still outstanding.
+  *
+  * @param mdr the slave request being processed
+  */
+ void Server::handle_slave_rename_prep(MDRequestRef& mdr)
+ {
+ dout(10) << "handle_slave_rename_prep " << *mdr
+ << " " << mdr->slave_request->srcdnpath
+ << " to " << mdr->slave_request->destdnpath
+ << dendl;
+
+ if (mdr->slave_request->is_interrupted()) {
+ dout(10) << " slave request interrupted, sending noop reply" << dendl;
+ auto reply = MMDSSlaveRequest::create(mdr->reqid, mdr->attempt, MMDSSlaveRequest::OP_RENAMEPREPACK);
+ reply->mark_interrupted();
+ mds->send_message_mds(reply, mdr->slave_to_mds);
+ mdr->reset_slave_request();
+ return;
+ }
+
+ // discover destdn
+ filepath destpath(mdr->slave_request->destdnpath);
+ dout(10) << " dest " << destpath << dendl;
+ vector<CDentry*> trace;
+ CF_MDS_MDRContextFactory cf(mdcache, mdr);
+ int r = mdcache->path_traverse(mdr, cf, destpath, &trace, NULL, MDS_TRAVERSE_DISCOVERXLOCK);
+ if (r > 0) return; // NOTE(review): r > 0 presumably means the traverse is pending and the request is retried later - confirm
+ if (r == -ESTALE) {
+ mdcache->find_ino_peers(destpath.get_ino(), new C_MDS_RetryRequest(mdcache, mdr),
+ mdr->slave_to_mds);
+ return;
+ }
+ ceph_assert(r == 0); // we shouldn't get an error here!
+
+ CDentry *destdn = trace.back();
+ CDentry::linkage_t *destdnl = destdn->get_projected_linkage();
+ dout(10) << " destdn " << *destdn << dendl;
+ mdr->pin(destdn);
+
+ // discover srcdn
+ filepath srcpath(mdr->slave_request->srcdnpath);
+ dout(10) << " src " << srcpath << dendl;
+ CInode *srci = nullptr;
+ r = mdcache->path_traverse(mdr, cf, srcpath, &trace, &srci, MDS_TRAVERSE_DISCOVERXLOCK);
+ if (r > 0) return;
+ ceph_assert(r == 0);
+
+ // srcpath must not point to a null dentry
+ ceph_assert(srci != nullptr);
+
+ CDentry *srcdn = trace.back();
+ CDentry::linkage_t *srcdnl = srcdn->get_projected_linkage();
+ dout(10) << " srcdn " << *srcdn << dendl;
+ mdr->pin(srcdn);
+ mdr->pin(srci);
+
+ // stray? (a stray dentry is required when a primary target is overwritten)
+ bool linkmerge = srcdnl->get_inode() == destdnl->get_inode();
+ if (linkmerge)
+ ceph_assert(srcdnl->is_primary() && destdnl->is_remote());
+ CDentry *straydn = mdr->straydn;
+ if (destdnl->is_primary() && !linkmerge)
+ ceph_assert(straydn);
+
+ mdr->set_op_stamp(mdr->slave_request->op_stamp);
+ mdr->more()->srcdn_auth_mds = srcdn->authority().first;
+
+ // set up commit waiter (early, to clean up any freezing etc we do)
+ if (!mdr->more()->slave_commit)
+ mdr->more()->slave_commit = new C_MDS_SlaveRenameCommit(this, mdr, srcdn, destdn, straydn);
+
+ // am i srcdn auth?
+ if (srcdn->is_auth()) {
+ set<mds_rank_t> srcdnrep;
+ srcdn->list_replicas(srcdnrep);
+
+ bool reply_witness = false; // true when we answer with the replica list instead of preparing
+ if (srcdnl->is_primary() && !srcdnl->get_inode()->state_test(CInode::STATE_AMBIGUOUSAUTH)) {
+ // freeze?
+ // we need this to
+ // - avoid conflicting lock state changes
+ // - avoid concurrent updates to the inode
+ // (this could also be accomplished with the versionlock)
+ int allowance = 3; // 1 for the mdr auth_pin, 1 for the link lock, 1 for the snap lock
+ dout(10) << " freezing srci " << *srcdnl->get_inode() << " with allowance " << allowance << dendl;
+ bool frozen_inode = srcdnl->get_inode()->freeze_inode(allowance);
+
+ // unfreeze auth pin after freezing the inode to avoid queueing waiters
+ if (srcdnl->get_inode()->is_frozen_auth_pin())
+ mdr->unfreeze_auth_pin();
+
+ if (!frozen_inode) {
+ srcdnl->get_inode()->add_waiter(CInode::WAIT_FROZEN, new C_MDS_RetryRequest(mdcache, mdr));
+ return;
+ }
+
+ /*
+ * set ambiguous auth for srci
+ * NOTE: we don't worry about ambiguous cache expire as we do
+ * with subtree migrations because all slaves will pin
+ * srcdn->get_inode() for duration of this rename.
+ */
+ mdr->set_ambiguous_auth(srcdnl->get_inode());
+
+ // just mark the source inode as ambiguous auth if more than two MDS are involved.
+ // the master will send another OP_RENAMEPREP slave request later.
+ if (mdr->slave_request->witnesses.size() > 1) {
+ dout(10) << " set srci ambiguous auth; providing srcdn replica list" << dendl;
+ reply_witness = true;
+ }
+
+ // make sure bystanders have received all lock related messages
+ for (set<mds_rank_t>::iterator p = srcdnrep.begin(); p != srcdnrep.end(); ++p) {
+ if (*p == mdr->slave_to_mds ||
+ (mds->is_cluster_degraded() &&
+ !mds->mdsmap->is_clientreplay_or_active_or_stopping(*p)))
+ continue;
+ auto notify = MMDSSlaveRequest::create(mdr->reqid, mdr->attempt, MMDSSlaveRequest::OP_RENAMENOTIFY);
+ mds->send_message_mds(notify, *p);
+ mdr->more()->waiting_on_slave.insert(*p);
+ }
+
+ // make sure clients have received all cap related messages
+ set<client_t> export_client_set;
+ mdcache->migrator->get_export_client_set(srcdnl->get_inode(), export_client_set);
+
+ MDSGatherBuilder gather(g_ceph_context);
+ flush_client_sessions(export_client_set, gather);
+ if (gather.has_subs()) {
+ mdr->more()->waiting_on_slave.insert(MDS_RANK_NONE); // placeholder entry for the pending session flush
+ gather.set_finisher(new C_MDS_SlaveRenameSessionsFlushed(this, mdr));
+ gather.activate();
+ }
+ }
+
+ // is witness list sufficient?
+ for (set<mds_rank_t>::iterator p = srcdnrep.begin(); p != srcdnrep.end(); ++p) {
+ if (*p == mdr->slave_to_mds ||
+ mdr->slave_request->witnesses.count(*p)) continue;
+ dout(10) << " witness list insufficient; providing srcdn replica list" << dendl;
+ reply_witness = true;
+ break;
+ }
+
+ if (reply_witness) {
+ ceph_assert(!srcdnrep.empty());
+ auto reply = MMDSSlaveRequest::create(mdr->reqid, mdr->attempt, MMDSSlaveRequest::OP_RENAMEPREPACK);
+ reply->witnesses.swap(srcdnrep);
+ mds->send_message_mds(reply, mdr->slave_to_mds);
+ mdr->reset_slave_request();
+ return;
+ }
+ dout(10) << " witness list sufficient: includes all srcdn replicas" << dendl;
+ if (!mdr->more()->waiting_on_slave.empty()) {
+ dout(10) << " still waiting for rename notify acks from "
+ << mdr->more()->waiting_on_slave << dendl;
+ return;
+ }
+ } else if (srcdnl->is_primary() && srcdn->authority() != destdn->authority()) {
+ // set ambiguous auth for srci on witnesses
+ mdr->set_ambiguous_auth(srcdnl->get_inode());
+ }
+
+ // encode everything we'd need to roll this back... basically, just the original state.
+ rename_rollback rollback;
+
+ rollback.reqid = mdr->reqid;
+
+ rollback.orig_src.dirfrag = srcdn->get_dir()->dirfrag();
+ rollback.orig_src.dirfrag_old_mtime = srcdn->get_dir()->get_projected_fnode()->fragstat.mtime;
+ rollback.orig_src.dirfrag_old_rctime = srcdn->get_dir()->get_projected_fnode()->rstat.rctime;
+ rollback.orig_src.dname = srcdn->get_name();
+ if (srcdnl->is_primary())
+ rollback.orig_src.ino = srcdnl->get_inode()->ino();
+ else {
+ ceph_assert(srcdnl->is_remote());
+ rollback.orig_src.remote_ino = srcdnl->get_remote_ino();
+ rollback.orig_src.remote_d_type = srcdnl->get_remote_d_type();
+ }
+
+ rollback.orig_dest.dirfrag = destdn->get_dir()->dirfrag();
+ rollback.orig_dest.dirfrag_old_mtime = destdn->get_dir()->get_projected_fnode()->fragstat.mtime;
+ rollback.orig_dest.dirfrag_old_rctime = destdn->get_dir()->get_projected_fnode()->rstat.rctime;
+ rollback.orig_dest.dname = destdn->get_name();
+ if (destdnl->is_primary())
+ rollback.orig_dest.ino = destdnl->get_inode()->ino();
+ else if (destdnl->is_remote()) {
+ rollback.orig_dest.remote_ino = destdnl->get_remote_ino();
+ rollback.orig_dest.remote_d_type = destdnl->get_remote_d_type();
+ }
+
+ if (straydn) {
+ rollback.stray.dirfrag = straydn->get_dir()->dirfrag();
+ rollback.stray.dirfrag_old_mtime = straydn->get_dir()->get_projected_fnode()->fragstat.mtime;
+ rollback.stray.dirfrag_old_rctime = straydn->get_dir()->get_projected_fnode()->rstat.rctime;
+ rollback.stray.dname = straydn->get_name();
+ }
+ if (mdr->slave_request->desti_snapbl.length()) { // blob layout: bool "had a realm", then the realm blob if true
+ CInode *oldin = destdnl->get_inode();
+ if (oldin->snaprealm) {
+ encode(true, rollback.desti_snapbl);
+ oldin->encode_snap_blob(rollback.desti_snapbl);
+ } else {
+ encode(false, rollback.desti_snapbl);
+ }
+ }
+ if (mdr->slave_request->srci_snapbl.length()) { // same layout as desti_snapbl above
+ if (srci->snaprealm) {
+ encode(true, rollback.srci_snapbl);
+ srci->encode_snap_blob(rollback.srci_snapbl);
+ } else {
+ encode(false, rollback.srci_snapbl);
+ }
+ }
+ encode(rollback, mdr->more()->rollback_bl);
+ // FIXME: rollback snaprealm
+ dout(20) << " rollback is " << mdr->more()->rollback_bl.length() << " bytes" << dendl;
+
+ // journal.
+ mdr->ls = mdlog->get_current_segment();
+ ESlaveUpdate *le = new ESlaveUpdate(mdlog, "slave_rename_prep", mdr->reqid, mdr->slave_to_mds,
+ ESlaveUpdate::OP_PREPARE, ESlaveUpdate::RENAME);
+ mdlog->start_entry(le);
+ le->rollback = mdr->more()->rollback_bl;
+
+ bufferlist blah; // inode import data... obviously not used if we're the slave
+ _rename_prepare(mdr, &le->commit, &blah, srcdn, destdn, straydn);
+
+ if (le->commit.empty()) {
+ dout(10) << " empty metablob, skipping journal" << dendl;
+ mdlog->cancel_entry(le);
+ mdr->ls = NULL;
+ _logged_slave_rename(mdr, srcdn, destdn, straydn);
+ } else {
+ mdcache->add_uncommitted_slave(mdr->reqid, mdr->ls, mdr->slave_to_mds);
+ mdr->more()->slave_update_journaled = true;
+ submit_mdlog_entry(le, new C_MDS_SlaveRenamePrep(this, mdr, srcdn, destdn, straydn),
+ mdr, __func__);
+ mdlog->flush();
+ }
+ }
+ /**
+  * Second half of the slave rename prepare, run after the prepare has been
+  * journaled or directly when there was nothing to journal.
+  *
+  * Builds the OP_RENAMEPREPACK reply unless the request was aborted. If this
+  * MDS is auth for a primary srcdn, the source inode export is encoded into
+  * the reply, the mdr's auth pin on it is dropped and the inode is marked
+  * clean. Then applies the rename via _rename_apply(), bumps popularity
+  * counters, and either sends the reply or finishes the aborted request.
+  *
+  * @param mdr     the slave request
+  * @param srcdn   source dentry
+  * @param destdn  destination dentry
+  * @param straydn stray dentry, passed through to _rename_apply()
+  */
+ void Server::_logged_slave_rename(MDRequestRef& mdr,
+ CDentry *srcdn, CDentry *destdn, CDentry *straydn)
+ {
+ dout(10) << "_logged_slave_rename " << *mdr << dendl;
+
+ // prepare ack
+ MMDSSlaveRequest::ref reply;
+ if (!mdr->aborted) { // reply stays empty for an aborted request; handled at the end
+ reply = MMDSSlaveRequest::create(mdr->reqid, mdr->attempt, MMDSSlaveRequest::OP_RENAMEPREPACK);
+ if (!mdr->more()->slave_update_journaled)
+ reply->mark_not_journaled();
+ }
+
+ CDentry::linkage_t *srcdnl = srcdn->get_linkage();
+ //CDentry::linkage_t *straydnl = straydn ? straydn->get_linkage() : 0;
+
+ // export srci?
+ if (srcdn->is_auth() && srcdnl->is_primary()) {
+ // set export bounds for CInode::encode_export()
+ if (reply) {
+ list<CDir*> bounds;
+ if (srcdnl->get_inode()->is_dir()) {
+ srcdnl->get_inode()->get_dirfrags(bounds);
+ for (list<CDir*>::iterator p = bounds.begin(); p != bounds.end(); ++p)
+ (*p)->state_set(CDir::STATE_EXPORTBOUND);
+ }
+
+ map<client_t,entity_inst_t> exported_client_map;
+ map<client_t, client_metadata_t> exported_client_metadata_map;
+ bufferlist inodebl;
+ mdcache->migrator->encode_export_inode(srcdnl->get_inode(), inodebl,
+ exported_client_map,
+ exported_client_metadata_map);
+
+ for (list<CDir*>::iterator p = bounds.begin(); p != bounds.end(); ++p)
+ (*p)->state_clear(CDir::STATE_EXPORTBOUND); // bound flag is only needed while encoding
+
+ // inode_export layout: client map, client metadata map, then the inode blob
+ encode(exported_client_map, reply->inode_export, mds->mdsmap->get_up_features());
+ encode(exported_client_metadata_map, reply->inode_export);
+ reply->inode_export.claim_append(inodebl);
+ reply->inode_export_v = srcdnl->get_inode()->inode.version;
+ }
+
+ // remove mdr auth pin
+ mdr->auth_unpin(srcdnl->get_inode());
+ mdr->more()->is_inode_exporter = true;
+
+ if (srcdnl->get_inode()->is_dirty())
+ srcdnl->get_inode()->mark_clean();
+
+ dout(10) << " exported srci " << *srcdnl->get_inode() << dendl;
+ }
+
+ // apply
+ _rename_apply(mdr, srcdn, destdn, straydn);
+
+ CDentry::linkage_t *destdnl = destdn->get_linkage(); // re-read: linkage changed in _rename_apply()
+
+ // bump popularity
+ mds->balancer->hit_dir(srcdn->get_dir(), META_POP_IWR);
+ if (destdnl->get_inode() && destdnl->get_inode()->is_auth())
+ mds->balancer->hit_inode(destdnl->get_inode(), META_POP_IWR);
+
+ // done.
+ mdr->reset_slave_request();
+ mdr->straydn = 0;
+
+ if (reply) {
+ mds->send_message_mds(reply, mdr->slave_to_mds);
+ } else {
+ ceph_assert(mdr->aborted);
+ dout(10) << " abort flag set, finishing" << dendl;
+ mdcache->request_finish(mdr);
+ }
+ }
+ /**
+  * Commit (r == 0) or abort (r != 0) a slave rename.
+  *
+  * Commit: if this MDS exported the source inode, hands over xlocks on it,
+  * finishes the inode export and unfreezes it; clears ambiguous auth; cleans
+  * up the request; and journals an ESlaveUpdate OP_COMMIT when the prepare
+  * was journaled (otherwise calls _committed_slave() directly).
+  *
+  * Abort: if a rollback record exists, reverses any inode export and runs
+  * do_rename_rollback(); otherwise just unfreezes / clears ambiguous auth and
+  * finishes the request.
+  *
+  * @param mdr     the slave request
+  * @param r       0 to commit, anything else to abort
+  * @param srcdn   source dentry
+  * @param destdn  destination dentry
+  * @param straydn stray dentry, may be null
+  */
+ void Server::_commit_slave_rename(MDRequestRef& mdr, int r,
+ CDentry *srcdn, CDentry *destdn, CDentry *straydn)
+ {
+ dout(10) << "_commit_slave_rename " << *mdr << " r=" << r << dendl;
+
+ CInode *in = destdn->get_linkage()->get_inode();
+
+ inodeno_t migrated_stray; // only assigned when srcdn lives in a stray dir we are auth for
+ if (srcdn->is_auth() && srcdn->get_dir()->inode->is_stray())
+ migrated_stray = in->ino();
+
+ MDSContext::vec finished;
+ if (r == 0) {
+ // unfreeze+singleauth inode
+ // hmm, do i really need to delay this?
+ if (mdr->more()->is_inode_exporter) {
+ // drop our pins
+ // we exported, clear out any xlocks that we moved to another MDS
+
+ for (auto i = mdr->locks.lower_bound(&in->versionlock);
+ i != mdr->locks.end(); ) {
+ SimpleLock *lock = i->lock;
+ if (lock->get_parent() != in)
+ break;
+ // we only care about xlocks on the exported inode
+ if (i->is_xlock() && !lock->is_locallock())
+ mds->locker->xlock_export(i++, mdr.get()); // post-increment: advance before the entry is handed to xlock_export()
+ else
+ ++i;
+ }
+
+ map<client_t,Capability::Import> peer_imported;
+ auto bp = mdr->more()->inode_import.cbegin();
+ decode(peer_imported, bp);
+
+ dout(10) << " finishing inode export on " << *in << dendl;
+ mdcache->migrator->finish_export_inode(in, mdr->slave_to_mds, peer_imported, finished);
+ mds->queue_waiters(finished); // this includes SINGLEAUTH waiters.
+
+ // unfreeze
+ ceph_assert(in->is_frozen_inode());
+ in->unfreeze_inode(finished);
+ }
+
+ // singleauth
+ if (mdr->more()->is_ambiguous_auth) {
+ mdr->more()->rename_inode->clear_ambiguous_auth(finished);
+ mdr->more()->is_ambiguous_auth = false;
+ }
+
+ if (straydn && mdr->more()->slave_update_journaled) {
+ CInode *strayin = straydn->get_projected_linkage()->get_inode();
+ if (strayin && !strayin->snaprealm)
+ mdcache->clear_dirty_bits_for_stray(strayin);
+ }
+
+ mds->queue_waiters(finished);
+ mdr->cleanup();
+
+ if (mdr->more()->slave_update_journaled) {
+ // write a commit to the journal
+ ESlaveUpdate *le = new ESlaveUpdate(mdlog, "slave_rename_commit", mdr->reqid,
+ mdr->slave_to_mds, ESlaveUpdate::OP_COMMIT,
+ ESlaveUpdate::RENAME);
+ mdlog->start_entry(le);
+ submit_mdlog_entry(le, new C_MDS_CommittedSlave(this, mdr), mdr, __func__);
+ mdlog->flush();
+ } else {
+ _committed_slave(mdr);
+ }
+ } else {
+
+ // abort
+ // rollback_bl may be empty if we froze the inode but had to provide an expanded
+ // witness list from the master, and they failed before we tried prep again.
+ if (mdr->more()->rollback_bl.length()) {
+ if (mdr->more()->is_inode_exporter) {
+ dout(10) << " reversing inode export of " << *in << dendl;
+ in->abort_export();
+ }
+ if (mdcache->is_ambiguous_slave_update(mdr->reqid, mdr->slave_to_mds)) {
+ mdcache->remove_ambiguous_slave_update(mdr->reqid, mdr->slave_to_mds);
+ // rollback but preserve the slave request
+ do_rename_rollback(mdr->more()->rollback_bl, mdr->slave_to_mds, mdr, false);
+ mdr->more()->rollback_bl.clear();
+ } else
+ do_rename_rollback(mdr->more()->rollback_bl, mdr->slave_to_mds, mdr, true);
+ } else {
+ dout(10) << " rollback_bl empty, not rollback back rename (master failed after getting extra witnesses?)" << dendl;
+ // singleauth
+ if (mdr->more()->is_ambiguous_auth) {
+ if (srcdn->is_auth())
+ mdr->more()->rename_inode->unfreeze_inode(finished);
+
+ mdr->more()->rename_inode->clear_ambiguous_auth(finished);
+ mdr->more()->is_ambiguous_auth = false;
+ }
+ mds->queue_waiters(finished);
+ mdcache->request_finish(mdr);
+ }
+ }
+
+ if (migrated_stray && mds->is_stopping())
+ mdcache->shutdown_export_stray_finish(migrated_stray);
+ }
+
+ // Project a new fnode for `dir` and adjust it for a rename rollback.
+ //
+ //  mut        mutation that records the projected fnode and the touched locks
+ //  dir        dirfrag whose stats are adjusted
+ //  r          saved original state for this dirfrag (ino, old mtime/rctime)
+ //  ctime      ctime of the rename being rolled back
+ //  isdir      whether the dentry's inode is a directory
+ //  linkunlink signed count applied to the entry counters and rstat
+ //  rstat      recursive stats of the inode, applied only when r.ino is set
+ void _rollback_repair_dir(MutationRef& mut, CDir *dir, rename_rollback::drec &r, utime_t ctime,
+                           bool isdir, int linkunlink, nest_info_t &rstat)
+ {
+   fnode_t *const fnode = dir->project_fnode();
+   mut->add_projected_fnode(dir);
+   fnode->version = dir->pre_dirty();
+
+   // entry counter matching the inode type
+   if (!isdir)
+     fnode->fragstat.nfiles += linkunlink;
+   else
+     fnode->fragstat.nsubdirs += linkunlink;
+
+   // recursive stats are adjusted only when the record carries an ino
+   if (r.ino) {
+     fnode->rstat.rbytes += linkunlink * rstat.rbytes;
+     fnode->rstat.rfiles += linkunlink * rstat.rfiles;
+     fnode->rstat.rsubdirs += linkunlink * rstat.rsubdirs;
+     fnode->rstat.rsnaps += linkunlink * rstat.rsnaps;
+   }
+
+   // timestamps are restored only if they still equal the rename's ctime;
+   // rctime is considered only when mtime matched
+   const bool mtime_from_rename = (fnode->fragstat.mtime == ctime);
+   if (mtime_from_rename) {
+     fnode->fragstat.mtime = r.dirfrag_old_mtime;
+   }
+   if (mtime_from_rename && fnode->rstat.rctime == ctime) {
+     fnode->rstat.rctime = r.dirfrag_old_rctime;
+   }
+
+   mut->add_updated_lock(&dir->get_inode()->filelock);
+   mut->add_updated_lock(&dir->get_inode()->nestlock);
+ }
+
+ // Completion context handed to submit_mdlog_entry() by do_rename_rollback();
+ // finish() forwards the saved state to Server::_rename_rollback_finish().
+ struct C_MDS_LoggedRenameRollback : public ServerLogContext {
+   MutationRef mut;
+   CDentry *srcdn;
+   version_t srcdnpv;
+   CDentry *destdn;
+   CDentry *straydn;
+   map<client_t,MClientSnap::ref> splits[2];
+   bool finish_mdr;
+
+   C_MDS_LoggedRenameRollback(Server *srv, MutationRef& mutation, MDRequestRef& req,
+                              CDentry *src, version_t src_pv, CDentry *dest, CDentry *stray,
+                              map<client_t,MClientSnap::ref> pending_splits[2], bool finish)
+     : ServerLogContext(srv, req),
+       mut(mutation),
+       srcdn(src),
+       srcdnpv(src_pv),
+       destdn(dest),
+       straydn(stray),
+       finish_mdr(finish)
+   {
+     // take over the caller's two snap-message maps; the caller's are left
+     // holding our (empty) ones
+     for (int i = 0; i < 2; ++i)
+       splits[i].swap(pending_splits[i]);
+   }
+
+   // The completion code is not consulted.
+   void finish(int r) override {
+     server->_rename_rollback_finish(mut, mdr, srcdn, srcdnpv,
+                                     destdn, straydn, splits, finish_mdr);
+   }
+ };
+ /**
+  * Roll back a prepared slave rename from its encoded rename_rollback record.
+  *
+  * Decodes rbl, looks up whichever of the original src/dest/stray dentries
+  * and inodes are still in cache (each lookup may fail), pushes projected
+  * linkages that restore the original state, reverts ctime / nlink /
+  * snaprealm changes on the source and target inodes, and builds an
+  * ESlaveUpdate OP_ROLLBACK. _rename_rollback_finish() then runs either
+  * directly (mdr present and its prepare was not journaled) or from
+  * C_MDS_LoggedRenameRollback once the entry is submitted and logged.
+  *
+  * @param rbl        encoded rename_rollback
+  * @param master     rank of the master MDS for the request
+  * @param mdr        the slave request; tested for null before use, so it
+  *                   may be empty
+  * @param finish_mdr forwarded unchanged to _rename_rollback_finish()
+  */
+ void Server::do_rename_rollback(bufferlist &rbl, mds_rank_t master, MDRequestRef& mdr,
+ bool finish_mdr)
+ {
+ rename_rollback rollback;
+ auto p = rbl.cbegin();
+ decode(rollback, p);
+
+ dout(10) << "do_rename_rollback on " << rollback.reqid << dendl;
+ // need to finish this update before sending resolve to claim the subtree
+ mdcache->add_rollback(rollback.reqid, master);
+
+ MutationRef mut(new MutationImpl(nullptr, utime_t(), rollback.reqid));
+ mut->ls = mds->mdlog->get_current_segment();
+
+ CDentry *srcdn = NULL;
+ CDir *srcdir = mdcache->get_dirfrag(rollback.orig_src.dirfrag);
+ if (!srcdir) // exact dirfrag not in cache: retry the lookup by directory ino + dentry name
+ srcdir = mdcache->get_dirfrag(rollback.orig_src.dirfrag.ino, rollback.orig_src.dname);
+ if (srcdir) {
+ dout(10) << " srcdir " << *srcdir << dendl;
+ srcdn = srcdir->lookup(rollback.orig_src.dname);
+ if (srcdn) {
+ dout(10) << " srcdn " << *srcdn << dendl;
+ ceph_assert(srcdn->get_linkage()->is_null());
+ } else
+ dout(10) << " srcdn not found" << dendl;
+ } else
+ dout(10) << " srcdir not found" << dendl;
+
+ CDentry *destdn = NULL;
+ CDir *destdir = mdcache->get_dirfrag(rollback.orig_dest.dirfrag);
+ if (!destdir) // same fallback lookup as for srcdir
+ destdir = mdcache->get_dirfrag(rollback.orig_dest.dirfrag.ino, rollback.orig_dest.dname);
+ if (destdir) {
+ dout(10) << " destdir " << *destdir << dendl;
+ destdn = destdir->lookup(rollback.orig_dest.dname);
+ if (destdn)
+ dout(10) << " destdn " << *destdn << dendl;
+ else
+ dout(10) << " destdn not found" << dendl;
+ } else
+ dout(10) << " destdir not found" << dendl;
+
+ CInode *in = NULL; // the renamed (source) inode, if cached
+ if (rollback.orig_src.ino) {
+ in = mdcache->get_inode(rollback.orig_src.ino);
+ if (in && in->is_dir())
+ ceph_assert(srcdn && destdn);
+ } else
+ in = mdcache->get_inode(rollback.orig_src.remote_ino);
+
+ CDir *straydir = NULL;
+ CDentry *straydn = NULL;
+ if (rollback.stray.dirfrag.ino) {
+ straydir = mdcache->get_dirfrag(rollback.stray.dirfrag);
+ if (straydir) {
+ dout(10) << "straydir " << *straydir << dendl;
+ straydn = straydir->lookup(rollback.stray.dname);
+ if (straydn) {
+ dout(10) << " straydn " << *straydn << dendl;
+ ceph_assert(straydn->get_linkage()->is_primary());
+ } else
+ dout(10) << " straydn not found" << dendl;
+ } else
+ dout(10) << "straydir not found" << dendl;
+ }
+
+ CInode *target = NULL; // the inode originally linked at destdn, if cached
+ if (rollback.orig_dest.ino) {
+ target = mdcache->get_inode(rollback.orig_dest.ino);
+ if (target)
+ ceph_assert(destdn && straydn);
+ } else if (rollback.orig_dest.remote_ino)
+ target = mdcache->get_inode(rollback.orig_dest.remote_ino);
+
+ // can't use is_auth() in the resolve stage
+ mds_rank_t whoami = mds->get_nodeid();
+ // slave
+ ceph_assert(!destdn || destdn->authority().first != whoami);
+ ceph_assert(!straydn || straydn->authority().first != whoami);
+
+ bool force_journal_src = false;
+ bool force_journal_dest = false;
+ if (in && in->is_dir() && srcdn->authority().first != whoami)
+ force_journal_src = _need_force_journal(in, false);
+ if (in && target && target->is_dir())
+ force_journal_dest = _need_force_journal(in, true);
+
+ version_t srcdnpv = 0; // stays 0 unless we are srcdn's authority
+ // repair src
+ if (srcdn) {
+ if (srcdn->authority().first == whoami)
+ srcdnpv = srcdn->pre_dirty();
+ if (rollback.orig_src.ino) {
+ ceph_assert(in);
+ srcdn->push_projected_linkage(in);
+ } else
+ srcdn->push_projected_linkage(rollback.orig_src.remote_ino,
+ rollback.orig_src.remote_d_type);
+ }
+
+ map<client_t,MClientSnap::ref> splits[2]; // [0] filled for the source inode, [1] for the target
+
+ CInode::mempool_inode *pip = nullptr;
+ if (in) {
+ bool projected; // true when a new projected inode was created for `in`
+ if (in->get_projected_parent_dn()->authority().first == whoami) {
+ auto &pi = in->project_inode();
+ pip = &pi.inode;
+ mut->add_projected_inode(in);
+ pip->version = in->pre_dirty();
+ projected = true;
+ } else {
+ pip = in->get_projected_inode();
+ projected = false;
+ }
+ if (pip->ctime == rollback.ctime)
+ pip->ctime = rollback.orig_src.old_ctime;
+
+ if (rollback.srci_snapbl.length() && in->snaprealm) {
+ bool hadrealm;
+ auto p = rollback.srci_snapbl.cbegin();
+ decode(hadrealm, p);
+ if (hadrealm) {
+ if (projected && !mds->is_resolve()) {
+ sr_t *new_srnode = new sr_t();
+ decode(*new_srnode, p);
+ in->project_snaprealm(new_srnode);
+ } else
+ decode(in->snaprealm->srnode, p);
+ } else {
+ SnapRealm *realm;
+ if (rollback.orig_src.ino) {
+ ceph_assert(srcdir);
+ realm = srcdir->get_inode()->find_snaprealm();
+ } else {
+ realm = in->snaprealm->parent;
+ }
+ if (!mds->is_resolve())
+ mdcache->prepare_realm_merge(in->snaprealm, realm, splits[0]);
+ if (projected)
+ in->project_snaprealm(NULL);
+ else
+ in->snaprealm->merge_to(realm);
+ }
+ }
+ }
+
+ if (srcdn && srcdn->authority().first == whoami) {
+ nest_info_t blah; // default rstat used when the source inode is not in cache
+ _rollback_repair_dir(mut, srcdir, rollback.orig_src, rollback.ctime,
+ in ? in->is_dir() : false, 1, pip ? pip->accounted_rstat : blah);
+ }
+
+ // repair dest
+ if (destdn) {
+ if (rollback.orig_dest.ino && target) {
+ destdn->push_projected_linkage(target);
+ } else if (rollback.orig_dest.remote_ino) {
+ destdn->push_projected_linkage(rollback.orig_dest.remote_ino,
+ rollback.orig_dest.remote_d_type);
+ } else {
+ // the dentry will be trimmed soon, it's ok to have wrong linkage
+ if (rollback.orig_dest.ino)
+ ceph_assert(mds->is_resolve());
+ destdn->push_projected_linkage();
+ }
+ }
+
+ if (straydn)
+ straydn->push_projected_linkage();
+
+ if (target) {
+ bool projected; // true when a new projected inode was created for `target`
+ CInode::mempool_inode *ti = nullptr;
+ if (target->get_projected_parent_dn()->authority().first == whoami) {
+ auto &pi = target->project_inode();
+ ti = &pi.inode;
+ mut->add_projected_inode(target);
+ ti->version = target->pre_dirty();
+ projected = true;
+ } else {
+ ti = target->get_projected_inode();
+ projected = false;
+ }
+ if (ti->ctime == rollback.ctime)
+ ti->ctime = rollback.orig_dest.old_ctime;
+ if (MDS_INO_IS_STRAY(rollback.orig_src.dirfrag.ino)) {
+ if (MDS_INO_IS_STRAY(rollback.orig_dest.dirfrag.ino))
+ ceph_assert(!rollback.orig_dest.ino && !rollback.orig_dest.remote_ino);
+ else
+ ceph_assert(rollback.orig_dest.remote_ino &&
+ rollback.orig_dest.remote_ino == rollback.orig_src.ino);
+ } else
+ ti->nlink++; // source was not in a stray dir: restore the link count on the target
+
+ if (rollback.desti_snapbl.length() && target->snaprealm) {
+ bool hadrealm;
+ auto p = rollback.desti_snapbl.cbegin();
+ decode(hadrealm, p);
+ if (hadrealm) {
+ if (projected && !mds->is_resolve()) {
+ sr_t *new_srnode = new sr_t();
+ decode(*new_srnode, p);
+ target->project_snaprealm(new_srnode);
+ } else
+ decode(target->snaprealm->srnode, p);
+ } else {
+ SnapRealm *realm;
+ if (rollback.orig_dest.ino) {
+ ceph_assert(destdir);
+ realm = destdir->get_inode()->find_snaprealm();
+ } else {
+ realm = target->snaprealm->parent;
+ }
+ if (!mds->is_resolve())
+ mdcache->prepare_realm_merge(target->snaprealm, realm, splits[1]);
+ if (projected)
+ target->project_snaprealm(NULL);
+ else
+ target->snaprealm->merge_to(realm);
+ }
+ }
+ }
+
+ if (srcdn)
+ dout(0) << " srcdn back to " << *srcdn << dendl;
+ if (in)
+ dout(0) << " srci back to " << *in << dendl;
+ if (destdn)
+ dout(0) << " destdn back to " << *destdn << dendl;
+ if (target)
+ dout(0) << " desti back to " << *target << dendl;
+
+ // journal it
+ ESlaveUpdate *le = new ESlaveUpdate(mdlog, "slave_rename_rollback", rollback.reqid, master,
+ ESlaveUpdate::OP_ROLLBACK, ESlaveUpdate::RENAME);
+ mdlog->start_entry(le);
+
+ if (srcdn && (srcdn->authority().first == whoami || force_journal_src)) {
+ le->commit.add_dir_context(srcdir);
+ if (rollback.orig_src.ino)
+ le->commit.add_primary_dentry(srcdn, 0, true);
+ else
+ le->commit.add_remote_dentry(srcdn, true);
+ }
+
+ if (!rollback.orig_src.ino && // remote linkage
+ in && in->authority().first == whoami) {
+ le->commit.add_dir_context(in->get_projected_parent_dir());
+ le->commit.add_primary_dentry(in->get_projected_parent_dn(), in, true);
+ }
+
+ if (force_journal_dest) {
+ ceph_assert(rollback.orig_dest.ino);
+ le->commit.add_dir_context(destdir);
+ le->commit.add_primary_dentry(destdn, 0, true);
+ }
+
+ // slave: no need to journal straydn
+
+ if (target && target != in && target->authority().first == whoami) {
+ ceph_assert(rollback.orig_dest.remote_ino);
+ le->commit.add_dir_context(target->get_projected_parent_dir());
+ le->commit.add_primary_dentry(target->get_projected_parent_dn(), target, true);
+ }
+
+ if (in && in->is_dir() && (srcdn->authority().first == whoami || force_journal_src)) {
+ dout(10) << " noting renamed dir ino " << in->ino() << " in metablob" << dendl;
+ le->commit.renamed_dirino = in->ino();
+ if (srcdn->authority().first == whoami) {
+ list<CDir*> ls;
+ in->get_dirfrags(ls);
+ for (list<CDir*>::iterator p = ls.begin(); p != ls.end(); ++p) {
+ CDir *dir = *p;
+ if (!dir->is_auth())
+ le->commit.renamed_dir_frags.push_back(dir->get_frag());
+ }
+ dout(10) << " noting renamed dir open frags " << le->commit.renamed_dir_frags << dendl;
+ }
+ } else if (force_journal_dest) {
+ dout(10) << " noting rename target ino " << target->ino() << " in metablob" << dendl;
+ le->commit.renamed_dirino = target->ino();
+ }
+
+ if (target && target->is_dir()) {
+ ceph_assert(destdn);
+ mdcache->project_subtree_rename(target, straydir, destdir);
+ }
+
+ if (in && in->is_dir()) {
+ ceph_assert(srcdn);
+ mdcache->project_subtree_rename(in, destdir, srcdir);
+ }
+
+ if (mdr && !mdr->more()->slave_update_journaled) { // prepare was never journaled, so the rollback is not either
+ ceph_assert(le->commit.empty());
+ mdlog->cancel_entry(le);
+ mut->ls = NULL;
+ _rename_rollback_finish(mut, mdr, srcdn, srcdnpv, destdn, straydn, splits, finish_mdr);
+ } else {
+ ceph_assert(!le->commit.empty());
+ if (mdr)
+ mdr->more()->slave_update_journaled = false;
+ MDSLogContextBase *fin = new C_MDS_LoggedRenameRollback(this, mut, mdr,
+ srcdn, srcdnpv, destdn, straydn,
+ splits, finish_mdr);
+ submit_mdlog_entry(le, fin, mdr, __func__);
+ mdlog->flush();
+ }
+ }
+
+void Server::_rename_rollback_finish(MutationRef& mut, MDRequestRef& mdr, CDentry *srcdn,
+ version_t srcdnpv, CDentry *destdn, CDentry *straydn,
+ map<client_t,MClientSnap::ref> splits[2], bool finish_mdr)
+{ /*
+ * Last step of a rename rollback. In order, it:
+ *  - unlinks the inode at straydn and destdn and pops their projected
+ *    linkages, then pops srcdn's projected linkage,
+ *  - applies the mutation,
+ *  - adjusts the subtree map for directory inodes now linked at srcdn/destdn,
+ *  - while in resolve, tries to trim the non-auth subtree containing
+ *    straydn (or destdn); otherwise sends the snap splits,
+ *  - cleans up request state when an mdr is attached, and finally tells the
+ *    cache the rollback for mut->reqid is finished.
+ *
+ * srcdn, destdn, straydn and mdr are each tested for null before most uses.
+ * finish_mdr: when true (or when mdr->aborted) the request is finished here;
+ * otherwise only its slave_rolling_back flag is cleared.
+ */
+ dout(10) << "_rename_rollback_finish " << mut->reqid << dendl;
+
+ if (straydn) {
+ straydn->get_dir()->unlink_inode(straydn);
+ straydn->pop_projected_linkage();
+ }
+ if (destdn) {
+ destdn->get_dir()->unlink_inode(destdn);
+ destdn->pop_projected_linkage();
+ }
+ if (srcdn) {
+ srcdn->pop_projected_linkage();
+ if (srcdn->authority().first == mds->get_nodeid()) { // only when this rank is srcdn's first authority
+ srcdn->mark_dirty(srcdnpv, mut->ls);
+ if (srcdn->get_linkage()->is_primary())
+ srcdn->get_linkage()->get_inode()->state_set(CInode::STATE_AUTH);
+ }
+ }
+
+ mut->apply();
+
+ if (srcdn && srcdn->get_linkage()->is_primary()) {
+ CInode *in = srcdn->get_linkage()->get_inode();
+ if (in && in->is_dir()) {
+ ceph_assert(destdn);
+ mdcache->adjust_subtree_after_rename(in, destdn->get_dir(), true);
+ }
+ }
+
+ if (destdn) {
+ CInode *oldin = destdn->get_linkage()->get_inode();
+ // a directory now linked at destdn requires a subtree map update
+ if (oldin && oldin->is_dir()) {
+ ceph_assert(straydn);
+ mdcache->adjust_subtree_after_rename(oldin, straydn->get_dir(), true);
+ }
+ }
+
+ if (mds->is_resolve()) {
+ CDir *root = NULL;
+ if (straydn) // straydn's subtree is preferred over destdn's when both exist
+ root = mdcache->get_subtree_root(straydn->get_dir());
+ else if (destdn)
+ root = mdcache->get_subtree_root(destdn->get_dir());
+ if (root)
+ mdcache->try_trim_non_auth_subtree(root);
+ } else {
+ mdcache->send_snaps(splits[1]); // note: splits[1] is sent before splits[0]
+ mdcache->send_snaps(splits[0]);
+ }
+
+ if (mdr) {
+ MDSContext::vec finished;
+ if (mdr->more()->is_ambiguous_auth) {
+ if (srcdn->is_auth()) // NOTE(review): srcdn is dereferenced without a null check here, unlike the guarded uses above - confirm callers never pass null srcdn with is_ambiguous_auth set
+ mdr->more()->rename_inode->unfreeze_inode(finished);
+
+ mdr->more()->rename_inode->clear_ambiguous_auth(finished);
+ mdr->more()->is_ambiguous_auth = false;
+ }
+ mds->queue_waiters(finished); // contexts collected by unfreeze_inode/clear_ambiguous_auth above
+ if (finish_mdr || mdr->aborted)
+ mdcache->request_finish(mdr);
+ else
+ mdr->more()->slave_rolling_back = false;
+ }
+
+ mdcache->finish_rollback(mut->reqid, mdr);
+
+ mut->cleanup();
+}
+
+void Server::handle_slave_rename_prep_ack(MDRequestRef& mdr, const MMDSSlaveRequest::const_ref &ack)
+{ /*
+ * Process the ack a slave MDS sent in response to a rename-prep request.
+ *
+ * Records the sender in more()->slaves; unless the ack is flagged as
+ * interrupted, either marks the sender as having witnessed the rename or
+ * stores the extra witnesses it reported. Any exported source inode carried
+ * by the ack is kept for import. Once waiting_on_slave becomes empty the
+ * client request is dispatched again.
+ *
+ * Asserts that the sender has not already witnessed this request and that an
+ * ack from it was actually being waited for.
+ */
+ dout(10) << "handle_slave_rename_prep_ack " << *mdr
+ << " witnessed by " << ack->get_source()
+ << " " << *ack << dendl;
+ mds_rank_t from = mds_rank_t(ack->get_source().num());
+
+ // remember this rank as a slave of the request
+ mdr->more()->slaves.insert(from);
+ if (mdr->more()->srcdn_auth_mds == from && // sender is the recorded srcdn auth rank
+ mdr->more()->is_remote_frozen_authpin &&
+ !mdr->more()->is_ambiguous_auth) { // ambiguous auth not set yet
+ mdr->set_ambiguous_auth(mdr->more()->rename_inode);
+ }
+
+ // witnessed? or add extra witnesses?
+ ceph_assert(mdr->more()->witnessed.count(from) == 0);
+ if (ack->is_interrupted()) {
+ dout(10) << " slave request interrupted, noop" << dendl;
+ } else if (ack->witnesses.empty()) {
+ mdr->more()->witnessed.insert(from);
+ if (!ack->is_not_journaled())
+ mdr->more()->has_journaled_slaves = true;
+ } else {
+ dout(10) << " extra witnesses (srcdn replicas) are " << ack->witnesses << dendl;
+ mdr->more()->extra_witnesses = ack->witnesses; // replaces any previously stored set
+ mdr->more()->extra_witnesses.erase(mds->get_nodeid()); // not me!
+ }
+
+ // srci import? (a non-empty inode_export payload in the ack)
+ if (ack->inode_export.length()) {
+ dout(10) << " got srci import" << dendl;
+ mdr->more()->inode_import.share(ack->inode_export);
+ mdr->more()->inode_import_v = ack->inode_export_v;
+ }
+
+ // remove from waiting list
+ ceph_assert(mdr->more()->waiting_on_slave.count(from));
+ mdr->more()->waiting_on_slave.erase(from);
+
+ if (mdr->more()->waiting_on_slave.empty())
+ dispatch_client_request(mdr); // go again!
+ else
+ dout(10) << "still waiting on slaves " << mdr->more()->waiting_on_slave << dendl;
+}
+
+/**
+ * Handle a rename-notify ack received while acting as a slave for mdr.
+ *
+ * Removes the sender from more()->waiting_on_slave. When that was the last
+ * outstanding entry and a slave request is attached, the slave request is
+ * dispatched again. Acks from ranks that are not being waited on are ignored.
+ */
+void Server::handle_slave_rename_notify_ack(MDRequestRef& mdr, const MMDSSlaveRequest::const_ref &ack)
+{
+  dout(10) << "handle_slave_rename_notify_ack " << *mdr << " from mds."
+           << ack->get_source() << dendl;
+  ceph_assert(mdr->is_slave());
+  mds_rank_t sender = mds_rank_t(ack->get_source().num());
+
+  // nothing to do for a rank we are not waiting on
+  if (mdr->more()->waiting_on_slave.count(sender) == 0)
+    return;
+
+  mdr->more()->waiting_on_slave.erase(sender);
+
+  if (!mdr->more()->waiting_on_slave.empty()) {
+    dout(10) << " still waiting for rename notify acks from "
+             << mdr->more()->waiting_on_slave << dendl;
+    return;
+  }
+
+  // last ack arrived: resume the slave request, if there is one
+  if (mdr->slave_request)
+    dispatch_slave_request(mdr);
+}
+
+/**
+ * Clear the MDS_RANK_NONE entry from more()->waiting_on_slave.
+ *
+ * If that entry was present and nothing else remains in the set, the attached
+ * slave request (if any) is dispatched again; if other entries remain, they
+ * are only logged. Does nothing when MDS_RANK_NONE is not in the set.
+ */
+void Server::_slave_rename_sessions_flushed(MDRequestRef& mdr)
+{
+  dout(10) << "_slave_rename_sessions_flushed " << *mdr << dendl;
+
+  if (mdr->more()->waiting_on_slave.count(MDS_RANK_NONE) == 0)
+    return;
+
+  mdr->more()->waiting_on_slave.erase(MDS_RANK_NONE);
+
+  if (!mdr->more()->waiting_on_slave.empty()) {
+    dout(10) << " still waiting for rename notify acks from "
+             << mdr->more()->waiting_on_slave << dendl;
+    return;
+  }
+
+  if (mdr->slave_request)
+    dispatch_slave_request(mdr);
+}
+
+// snaps
+/**
+ * Handle a client LSSNAP request: list the snapshots visible on a directory.
+ *
+ * This function takes responsibility for the passed mdr.
+ *
+ * The directory is identified by the inode number of the request filepath;
+ * path2, when non-empty, names the snapshot after which the listing resumes.
+ * Limits come from the readdir argument fields (max_entries, max_bytes).
+ *
+ * The reply payload (mdr->reply_extra_bl) consists of an empty DirStat, a
+ * __u32 entry count, __u16 flags, and then, per snapshot, its name, an
+ * infinite lease and the inode stat of the directory at that snapid.
+ *
+ * Replies -ESTALE when the inode is not in cache or is being purged and
+ * -ENOTDIR when it is not a directory; forwards the request when this MDS is
+ * not auth for the inode. Returns without replying when acquire_locks() or
+ * check_access() return false.
+ */
+void Server::handle_client_lssnap(MDRequestRef& mdr)
+{
+  const MClientRequest::const_ref &req = mdr->client_request;
+
+  // traverse to path
+  CInode *diri = mdcache->get_inode(req->get_filepath().get_ino());
+  if (!diri || diri->state_test(CInode::STATE_PURGING)) {
+    respond_to_request(mdr, -ESTALE);
+    return;
+  }
+  if (!diri->is_auth()) {
+    mdcache->request_forward(mdr, diri->authority().first);
+    return;
+  }
+  if (!diri->is_dir()) {
+    respond_to_request(mdr, -ENOTDIR);
+    return;
+  }
+  dout(10) << "lssnap on " << *diri << dendl;
+
+  // lock snap
+  MutationImpl::LockOpVec lov;
+  mds->locker->include_snap_rdlocks(diri, lov);
+  if (!mds->locker->acquire_locks(mdr, lov))
+    return;
+
+  if (!check_access(mdr, diri, MAY_READ))
+    return;
+
+  SnapRealm *realm = diri->find_snaprealm();
+  map<snapid_t,const SnapInfo*> infomap;
+  realm->get_snap_info(infomap, diri->get_oldest_snap());
+
+  // a zero limit from the client means "no explicit limit"
+  unsigned max_entries = req->head.args.readdir.max_entries;
+  if (!max_entries)
+    max_entries = infomap.size();
+  int max_bytes = req->head.args.readdir.max_bytes;
+  if (!max_bytes)
+    // make sure at least one item can be encoded
+    max_bytes = (512 << 10) + g_conf()->mds_max_xattr_pairs_size;
+
+  // resume point: list only snapshots with an id greater than last_snapid
+  __u64 last_snapid = 0;
+  string offset_str = req->get_path2();
+  if (!offset_str.empty())
+    last_snapid = realm->resolve_snapname(offset_str, diri->ino());
+
+  //Empty DirStat
+  bufferlist dirbl;
+  static DirStat empty;
+  CDir::encode_dirstat(dirbl, mdr->session->info, empty);
+
+  // Reserve room for everything that precedes the entries in the reply: the
+  // DirStat already in dirbl plus the entry count (__u32) and the flags
+  // (__u16) that are appended to dirbl after the loop. The previous
+  // expression subtracted the count's size from the reservation instead of
+  // adding it, leaving the budget 8 bytes too large.
+  max_bytes -= dirbl.length() + sizeof(__u32) + sizeof(__u16);
+
+  __u32 num = 0;
+  bufferlist dnbl;
+  auto p = infomap.upper_bound(last_snapid);
+  for (; p != infomap.end() && num < max_entries; ++p) {
+    dout(10) << p->first << " -> " << *p->second << dendl;
+
+    // actual
+    // snapshots owned by this directory use their short name, inherited
+    // ones their long name
+    string snap_name;
+    if (p->second->ino == diri->ino())
+      snap_name = p->second->name;
+    else
+      snap_name = p->second->get_long_name();
+
+    // stop before encoding if name + lease would not fit in the budget
+    unsigned start_len = dnbl.length();
+    if (int(start_len + snap_name.length() + sizeof(__u32) + sizeof(LeaseStat)) > max_bytes)
+      break;
+
+    encode(snap_name, dnbl);
+    //infinite lease
+    LeaseStat e(-1, -1, 0);
+    mds->locker->encode_lease(dnbl, mdr->session->info, e);
+    dout(20) << "encode_infinite_lease" << dendl;
+
+    int r = diri->encode_inodestat(dnbl, mdr->session, realm, p->first, max_bytes - (int)dnbl.length());
+    if (r < 0) {
+      // inode stat did not get encoded: roll dnbl back to where this entry
+      // started and stop
+      bufferlist keep;
+      keep.substr_of(dnbl, 0, start_len);
+      dnbl.swap(keep);
+      break;
+    }
+    ++num;
+  }
+
+  encode(num, dirbl);
+  __u16 flags = 0;
+  if (p == infomap.end()) {
+    // reached the end of the snapshot map; the listing is only "complete"
+    // when it also started from the beginning
+    flags = CEPH_READDIR_FRAG_END;
+    if (last_snapid == 0)
+      flags |= CEPH_READDIR_FRAG_COMPLETE;
+  }
+  encode(flags, dirbl);
+  dirbl.claim_append(dnbl);
+
+  mdr->reply_extra_bl = dirbl;
+  mdr->tracei = diri;
+  respond_to_request(mdr, 0);
+}
+
+
+// MKSNAP
+
+/**
+ * Completion context used by handle_client_mksnap(): keeps the directory
+ * inode and a copy of the SnapInfo, and forwards them together with the
+ * request to Server::_mksnap_finish() when finished.
+ */
+struct C_MDS_mksnap_finish : public ServerLogContext {
+  CInode *diri;
+  SnapInfo info;
+
+  C_MDS_mksnap_finish(Server *srv, MDRequestRef& req, CInode *dir_inode, SnapInfo &snap_info)
+    : ServerLogContext(srv, req),
+      diri(dir_inode),
+      info(snap_info)
+  {}
+
+  // the completion code is not used
+  void finish(int /*r*/) override
+  {
+    server->_mksnap_finish(mdr, diri, info);
+  }
+};
+
+/* Handle a client MKSNAP request: create a snapshot, named by the last dentry
+ * of the request filepath, on the directory whose inode number the filepath
+ * carries.
+ *
+ * This function takes responsibility for the passed mdr.
+ *
+ * Error replies issued below:
+ *   -EPERM   snapshots not allowed by the mdsmap, system (non-root) directory,
+ *            caller uid outside [mds_snap_min_uid, mds_snap_max_uid], or the
+ *            realm's subvolume inode is set and differs from this directory
+ *   -ESTALE  inode not in cache or being purged
+ *   -ENOTDIR inode is not a directory
+ *   -EMLINK  the directory's realm already holds max_snaps_per_dir snapshots
+ *   -EEXIST  a snapshot with this name already exists in the realm
+ *   -EINVAL  empty name or name starting with '_'
+ *
+ * The function returns early (without replying) while waiting for a newer
+ * mdsmap, when acquire_locks()/check_access() return false, and after asking
+ * the snap client to prepare the create; the first and last hand a
+ * C_MDS_RetryRequest for this mdr to the waiter.
+ */
+void Server::handle_client_mksnap(MDRequestRef& mdr)
+{
+ const MClientRequest::const_ref &req = mdr->client_request;
+ // make sure we have as new a map as the client
+ if (req->get_mdsmap_epoch() > mds->mdsmap->get_epoch()) {
+ mds->wait_for_mdsmap(req->get_mdsmap_epoch(), new C_MDS_RetryRequest(mdcache, mdr));
+ return;
+ }
+ if (!mds->mdsmap->allows_snaps()) {
+ // you can't make snapshots until you set an option right now
+ respond_to_request(mdr, -EPERM);
+ return;
+ }
+
+ CInode *diri = mdcache->get_inode(req->get_filepath().get_ino());
+ if (!diri || diri->state_test(CInode::STATE_PURGING)) {
+ respond_to_request(mdr, -ESTALE);
+ return;
+ }
+
+ if (!diri->is_auth()) { // fw to auth?
+ mdcache->request_forward(mdr, diri->authority().first);
+ return;
+ }
+
+ // dir only
+ if (!diri->is_dir()) {
+ respond_to_request(mdr, -ENOTDIR);
+ return;
+ }
+ if (diri->is_system() && !diri->is_root()) {
+ // no snaps in system dirs (root is ok)
+ respond_to_request(mdr, -EPERM);
+ return;
+ }
+
+ std::string_view snapname = req->get_filepath().last_dentry();
+
+ if (mdr->client_request->get_caller_uid() < g_conf()->mds_snap_min_uid || mdr->client_request->get_caller_uid() > g_conf()->mds_snap_max_uid) {
+ dout(20) << "mksnap " << snapname << " on " << *diri << " denied to uid " << mdr->client_request->get_caller_uid() << dendl;
+ respond_to_request(mdr, -EPERM);
+ return;
+ }
+
+ dout(10) << "mksnap " << snapname << " on " << *diri << dendl;
+
+ // lock snap
+ MutationImpl::LockOpVec lov;
+
+ mds->locker->include_snap_rdlocks(diri, lov);
+ lov.erase_rdlock(&diri->snaplock); // swap the rdlock on diri's own snaplock for an xlock
+ lov.add_xlock(&diri->snaplock);
+
+ if (!mds->locker->acquire_locks(mdr, lov))
+ return;
+
+ if (!check_access(mdr, diri, MAY_WRITE|MAY_SNAPSHOT))
+ return;
+
+ if (inodeno_t subvol_ino = diri->find_snaprealm()->get_subvolume_ino();
+ (subvol_ino && subvol_ino != diri->ino())) { // realm has a subvolume inode and it is not this directory
+ respond_to_request(mdr, -EPERM);
+ return;
+ }
+
+ // check if we can create any more snapshots
+ // we don't allow any more if we are already at or beyond the limit
+ if (diri->snaprealm &&
+ diri->snaprealm->get_snaps().size() >= max_snaps_per_dir) {
+ respond_to_request(mdr, -EMLINK);
+ return;
+ }
+
+ // make sure name is unique
+ if (diri->snaprealm &&
+ diri->snaprealm->exists(snapname)) {
+ respond_to_request(mdr, -EEXIST);
+ return;
+ }
+ if (snapname.length() == 0 ||
+ snapname[0] == '_') {
+ respond_to_request(mdr, -EINVAL);
+ return;
+ }
+
+ // allocate a snapid
+ if (!mdr->more()->stid) {
+ // prepare an stid; prepare_create is given &stid and &snapidbl as outputs
+ mds->snapclient->prepare_create(diri->ino(), snapname,
+ mdr->get_mds_stamp(),
+ &mdr->more()->stid, &mdr->more()->snapidbl,
+ new C_MDS_RetryRequest(mdcache, mdr));
+ return;
+ }
+
+ version_t stid = mdr->more()->stid;
+ snapid_t snapid;
+ auto p = mdr->more()->snapidbl.cbegin();
+ decode(snapid, p); // the new snapid is decoded from snapidbl
+ dout(10) << " stid " << stid << " snapid " << snapid << dendl;
+
+ ceph_assert(mds->snapclient->get_cached_version() >= stid);
+
+ // journal
+ SnapInfo info;
+ info.ino = diri->ino();
+ info.snapid = snapid;
+ info.name = snapname;
+ info.stamp = mdr->get_op_stamp();
+
+ auto &pi = diri->project_inode(false, true);
+ pi.inode.ctime = info.stamp;
+ if (info.stamp > pi.inode.rstat.rctime) // rctime only moves forward
+ pi.inode.rstat.rctime = info.stamp;
+ pi.inode.rstat.rsnaps++;
+ pi.inode.version = diri->pre_dirty();
+
+ // project the snaprealm
+ auto &newsnap = *pi.snapnode;
+ newsnap.created = snapid;
+ auto em = newsnap.snaps.emplace(std::piecewise_construct, std::forward_as_tuple(snapid), std::forward_as_tuple(info));
+ if (!em.second) // an entry for this snapid already existed: overwrite it
+ em.first->second = info;
+ newsnap.seq = snapid;
+ newsnap.last_created = snapid;
+
+ // journal the inode changes
+ mdr->ls = mdlog->get_current_segment();
+ EUpdate *le = new EUpdate(mdlog, "mksnap");
+ mdlog->start_entry(le);
+
+ le->metablob.add_client_req(req->get_reqid(), req->get_oldest_client_tid());
+ le->metablob.add_table_transaction(TABLE_SNAP, stid);
+ mdcache->predirty_journal_parents(mdr, &le->metablob, diri, 0, PREDIRTY_PRIMARY, false);
+ mdcache->journal_dirty_inode(mdr.get(), &le->metablob, diri);
+
+ // journal the snaprealm changes; C_MDS_mksnap_finish calls _mksnap_finish()
+ submit_mdlog_entry(le, new C_MDS_mksnap_finish(this, mdr, diri, info),
+ mdr, __func__);
+ mdlog->flush();
+}
+
+void Server::_mksnap_finish(MDRequestRef& mdr, CInode *diri, SnapInfo &info)
+{ /*
+ * Completion of mksnap, invoked through C_MDS_mksnap_finish::finish().
+ * Pops and dirties the projected inode of diri, applies the request,
+ * commits the snap table transaction (more()->stid), notifies other MDSs
+ * and the realm, then replies 0 to the client with diri as the trace inode
+ * and info.snapid as the request snapid.
+ */
+ dout(10) << "_mksnap_finish " << *mdr << " " << info << dendl;
+
+ int op = (diri->snaprealm? CEPH_SNAP_OP_CREATE : CEPH_SNAP_OP_SPLIT); // chosen before the projected inode is popped: SPLIT when diri has no snaprealm yet
+
+ diri->pop_and_dirty_projected_inode(mdr->ls);
+ mdr->apply();
+
+ mds->snapclient->commit(mdr->more()->stid, mdr->ls);
+
+ // log the resulting realm (diri->snaprealm is dereferenced here)
+ dout(10) << "snaprealm now " << *diri->snaprealm << dendl;
+
+ // notify other mds
+ mdcache->send_snap_update(diri, mdr->more()->stid, op);
+
+ mdcache->do_realm_invalidate_and_update_notify(diri, op);
+
+ // yay
+ mdr->in[0] = diri;
+ mdr->snapid = info.snapid;
+ mdr->tracei = diri;
+ respond_to_request(mdr, 0);
+}
+
+
+// RMSNAP
+
+/**
+ * Completion context used by handle_client_rmsnap(): keeps the directory
+ * inode and the id of the snapshot being removed, and forwards them together
+ * with the request to Server::_rmsnap_finish() when finished.
+ */
+struct C_MDS_rmsnap_finish : public ServerLogContext {
+  CInode *diri;
+  snapid_t snapid;
+
+  C_MDS_rmsnap_finish(Server *srv, MDRequestRef& req, CInode *dir_inode, snapid_t removed)
+    : ServerLogContext(srv, req),
+      diri(dir_inode),
+      snapid(removed)
+  {}
+
+  // the completion code is not used
+  void finish(int /*r*/) override
+  {
+    server->_rmsnap_finish(mdr, diri, snapid);
+  }
+};
+
+/* Handle a client RMSNAP request: remove the snapshot, named by the last
+ * dentry of the request filepath, from the directory whose inode number the
+ * filepath carries.
+ *
+ * This function takes responsibility for the passed mdr.
+ *
+ * Error replies issued below:
+ *   -ESTALE  inode not in cache or being purged
+ *   -ENOTDIR inode is not a directory
+ *   -EPERM   caller uid outside [mds_snap_min_uid, mds_snap_max_uid]
+ *   -EINVAL  empty name or name starting with '_'
+ *   -ENOENT  the directory has no snaprealm or no snapshot of that name
+ *
+ * Returns without replying when acquire_locks()/check_access() return false
+ * and after asking the snap client to prepare the destroy (which is handed a
+ * C_MDS_RetryRequest for this mdr).
+ */
+void Server::handle_client_rmsnap(MDRequestRef& mdr)
+{
+ const MClientRequest::const_ref &req = mdr->client_request;
+
+ CInode *diri = mdcache->get_inode(req->get_filepath().get_ino());
+ if (!diri || diri->state_test(CInode::STATE_PURGING)) {
+ respond_to_request(mdr, -ESTALE);
+ return;
+ }
+ if (!diri->is_auth()) { // fw to auth?
+ mdcache->request_forward(mdr, diri->authority().first);
+ return;
+ }
+ if (!diri->is_dir()) {
+ respond_to_request(mdr, -ENOTDIR);
+ return;
+ }
+
+ std::string_view snapname = req->get_filepath().last_dentry();
+
+ if (mdr->client_request->get_caller_uid() < g_conf()->mds_snap_min_uid || mdr->client_request->get_caller_uid() > g_conf()->mds_snap_max_uid) {
+ dout(20) << "rmsnap " << snapname << " on " << *diri << " denied to uid " << mdr->client_request->get_caller_uid() << dendl;
+ respond_to_request(mdr, -EPERM);
+ return;
+ }
+
+ dout(10) << "rmsnap " << snapname << " on " << *diri << dendl;
+
+ // does snap exist?
+ if (snapname.length() == 0 || snapname[0] == '_') {
+ respond_to_request(mdr, -EINVAL); // can't prune a parent snap, currently.
+ return;
+ }
+ if (!diri->snaprealm || !diri->snaprealm->exists(snapname)) {
+ respond_to_request(mdr, -ENOENT);
+ return;
+ }
+ snapid_t snapid = diri->snaprealm->resolve_snapname(snapname, diri->ino());
+ dout(10) << " snapname " << snapname << " is " << snapid << dendl;
+
+ MutationImpl::LockOpVec lov;
+ mds->locker->include_snap_rdlocks(diri, lov);
+ lov.erase_rdlock(&diri->snaplock); // swap the rdlock on diri's own snaplock for an xlock
+ lov.add_xlock(&diri->snaplock);
+
+ if (!mds->locker->acquire_locks(mdr, lov))
+ return;
+
+ if (!check_access(mdr, diri, MAY_WRITE|MAY_SNAPSHOT))
+ return;
+
+ // prepare; prepare_destroy is given &stid and &snapidbl as outputs
+ if (!mdr->more()->stid) {
+ mds->snapclient->prepare_destroy(diri->ino(), snapid,
+ &mdr->more()->stid, &mdr->more()->snapidbl,
+ new C_MDS_RetryRequest(mdcache, mdr));
+ return;
+ }
+ version_t stid = mdr->more()->stid;
+ auto p = mdr->more()->snapidbl.cbegin();
+ snapid_t seq;
+ decode(seq, p); // the new realm sequence is decoded from snapidbl
+ dout(10) << " stid is " << stid << ", seq is " << seq << dendl;
+
+ ceph_assert(mds->snapclient->get_cached_version() >= stid);
+
+ // journal
+ auto &pi = diri->project_inode(false, true);
+ pi.inode.version = diri->pre_dirty();
+ pi.inode.ctime = mdr->get_op_stamp();
+ if (mdr->get_op_stamp() > pi.inode.rstat.rctime) // rctime only moves forward
+ pi.inode.rstat.rctime = mdr->get_op_stamp();
+ pi.inode.rstat.rsnaps--;
+
+ mdr->ls = mdlog->get_current_segment();
+ EUpdate *le = new EUpdate(mdlog, "rmsnap");
+ mdlog->start_entry(le);
+
+ // project the snaprealm: drop the snapshot and record seq as the realm's seq and last_destroyed
+ auto &newnode = *pi.snapnode;
+ newnode.snaps.erase(snapid);
+ newnode.seq = seq;
+ newnode.last_destroyed = seq;
+
+ le->metablob.add_client_req(req->get_reqid(), req->get_oldest_client_tid());
+ le->metablob.add_table_transaction(TABLE_SNAP, stid);
+ mdcache->predirty_journal_parents(mdr, &le->metablob, diri, 0, PREDIRTY_PRIMARY, false);
+ mdcache->journal_dirty_inode(mdr.get(), &le->metablob, diri);
+
+ submit_mdlog_entry(le, new C_MDS_rmsnap_finish(this, mdr, diri, snapid), // C_MDS_rmsnap_finish calls _rmsnap_finish()
+ mdr, __func__);
+ mdlog->flush();
+}
+
+/**
+ * Completion of rmsnap, invoked through C_MDS_rmsnap_finish::finish().
+ *
+ * Pops and dirties the projected inode of diri, applies the request, commits
+ * the snap table transaction, notifies other MDSs and the realm, replies 0
+ * to the client, and finally purges stale snap data on diri when the realm
+ * reports its past parents as open.
+ *
+ * @param mdr    the rmsnap request; more()->stid holds the table transaction id
+ * @param diri   directory inode the snapshot was removed from
+ * @param snapid id of the removed snapshot (only logged here)
+ */
+void Server::_rmsnap_finish(MDRequestRef& mdr, CInode *diri, snapid_t snapid)
+{
+  dout(10) << "_rmsnap_finish " << *mdr << " " << snapid << dendl;
+
+  // stid is a snap table transaction id, i.e. a version_t as in
+  // handle_client_rmsnap(), not a snapid_t. The sequence number stored in
+  // snapidbl was already decoded and journaled by handle_client_rmsnap(); it
+  // is not needed here, so it is no longer decoded into an unused local.
+  version_t stid = mdr->more()->stid;
+
+  diri->pop_and_dirty_projected_inode(mdr->ls);
+  mdr->apply();
+
+  mds->snapclient->commit(stid, mdr->ls);
+
+  dout(10) << "snaprealm now " << *diri->snaprealm << dendl;
+
+  // notify other mds
+  mdcache->send_snap_update(diri, stid, CEPH_SNAP_OP_DESTROY);
+
+  mdcache->do_realm_invalidate_and_update_notify(diri, CEPH_SNAP_OP_DESTROY);
+
+  // yay
+  mdr->in[0] = diri;
+  respond_to_request(mdr, 0);
+
+  // purge snapshot data
+  if (diri->snaprealm->have_past_parents_open())
+    diri->purge_stale_snap_data(diri->snaprealm->get_snaps());
+}
+
+/**
+ * Completion context used by handle_client_renamesnap(): keeps the directory
+ * inode and the id of the renamed snapshot, and forwards them together with
+ * the request to Server::_renamesnap_finish() when finished.
+ */
+struct C_MDS_renamesnap_finish : public ServerLogContext {
+  CInode *diri;
+  snapid_t snapid;
+
+  C_MDS_renamesnap_finish(Server *srv, MDRequestRef& req, CInode *dir_inode, snapid_t renamed)
+    : ServerLogContext(srv, req),
+      diri(dir_inode),
+      snapid(renamed)
+  {}
+
+  // the completion code is not used
+  void finish(int /*r*/) override
+  {
+    server->_renamesnap_finish(mdr, diri, snapid);
+  }
+};
+
+/* Handle a client RENAMESNAP request: rename a snapshot of a directory.
+ * The new name is the last dentry of the request filepath, the current name
+ * the last dentry of filepath2; both paths must carry the same inode number.
+ *
+ * This function takes responsibility for the passed mdr.
+ *
+ * Error replies issued below:
+ *   -EINVAL  the two paths refer to different inodes, or either name is
+ *            empty or starts with '_'
+ *   -ESTALE  inode not in cache or being purged
+ *   -ENOTDIR inode is not a directory
+ *   -EPERM   caller uid outside [mds_snap_min_uid, mds_snap_max_uid]
+ *   -ENOENT  the directory has no snaprealm or no snapshot with the old name
+ *   -EEXIST  a snapshot with the new name already exists
+ *
+ * Returns without replying when acquire_locks()/check_access() return false
+ * and after asking the snap client to prepare the update (which is handed a
+ * C_MDS_RetryRequest for this mdr).
+ */
+void Server::handle_client_renamesnap(MDRequestRef& mdr)
+{
+ const MClientRequest::const_ref &req = mdr->client_request;
+ if (req->get_filepath().get_ino() != req->get_filepath2().get_ino()) {
+ respond_to_request(mdr, -EINVAL);
+ return;
+ }
+
+ CInode *diri = mdcache->get_inode(req->get_filepath().get_ino());
+ if (!diri || diri->state_test(CInode::STATE_PURGING)) {
+ respond_to_request(mdr, -ESTALE);
+ return;
+ }
+
+ if (!diri->is_auth()) { // fw to auth?
+ mdcache->request_forward(mdr, diri->authority().first);
+ return;
+ }
+
+ if (!diri->is_dir()) { // dir only
+ respond_to_request(mdr, -ENOTDIR);
+ return;
+ }
+
+ if (mdr->client_request->get_caller_uid() < g_conf()->mds_snap_min_uid ||
+ mdr->client_request->get_caller_uid() > g_conf()->mds_snap_max_uid) {
+ respond_to_request(mdr, -EPERM);
+ return;
+ }
+
+ std::string_view dstname = req->get_filepath().last_dentry(); // new name: from the primary filepath
+ std::string_view srcname = req->get_filepath2().last_dentry(); // current name: from filepath2
+ dout(10) << "renamesnap " << srcname << "->" << dstname << " on " << *diri << dendl;
+
+ if (srcname.length() == 0 || srcname[0] == '_') {
+ respond_to_request(mdr, -EINVAL); // can't rename a parent snap.
+ return;
+ }
+ if (!diri->snaprealm || !diri->snaprealm->exists(srcname)) {
+ respond_to_request(mdr, -ENOENT);
+ return;
+ }
+ if (dstname.length() == 0 || dstname[0] == '_') {
+ respond_to_request(mdr, -EINVAL);
+ return;
+ }
+ if (diri->snaprealm->exists(dstname)) {
+ respond_to_request(mdr, -EEXIST);
+ return;
+ }
+
+ snapid_t snapid = diri->snaprealm->resolve_snapname(srcname, diri->ino());
+ dout(10) << " snapname " << srcname << " is " << snapid << dendl;
+
+ // lock snap
+ MutationImpl::LockOpVec lov;
+
+ mds->locker->include_snap_rdlocks(diri, lov);
+ lov.erase_rdlock(&diri->snaplock); // swap the rdlock on diri's own snaplock for an xlock
+ lov.add_xlock(&diri->snaplock);
+
+ if (!mds->locker->acquire_locks(mdr, lov))
+ return;
+
+ if (!check_access(mdr, diri, MAY_WRITE|MAY_SNAPSHOT))
+ return;
+
+ // prepare; a default-constructed utime_t is passed as the stamp and only &stid is given as output
+ if (!mdr->more()->stid) {
+ mds->snapclient->prepare_update(diri->ino(), snapid, dstname, utime_t(),
+ &mdr->more()->stid,
+ new C_MDS_RetryRequest(mdcache, mdr));
+ return;
+ }
+
+ version_t stid = mdr->more()->stid;
+ dout(10) << " stid is " << stid << dendl;
+
+ ceph_assert(mds->snapclient->get_cached_version() >= stid);
+
+ // journal
+ auto &pi = diri->project_inode(false, true);
+ pi.inode.ctime = mdr->get_op_stamp();
+ if (mdr->get_op_stamp() > pi.inode.rstat.rctime) // rctime only moves forward
+ pi.inode.rstat.rctime = mdr->get_op_stamp();
+ pi.inode.version = diri->pre_dirty();
+
+ // project the snaprealm: only the name of the existing entry changes
+ auto &newsnap = *pi.snapnode;
+ auto it = newsnap.snaps.find(snapid);
+ ceph_assert(it != newsnap.snaps.end());
+ it->second.name = dstname;
+
+ // journal the inode changes
+ mdr->ls = mdlog->get_current_segment();
+ EUpdate *le = new EUpdate(mdlog, "renamesnap");
+ mdlog->start_entry(le);
+
+ le->metablob.add_client_req(req->get_reqid(), req->get_oldest_client_tid());
+ le->metablob.add_table_transaction(TABLE_SNAP, stid);
+ mdcache->predirty_journal_parents(mdr, &le->metablob, diri, 0, PREDIRTY_PRIMARY, false);
+ mdcache->journal_dirty_inode(mdr.get(), &le->metablob, diri);
+
+ // journal the snaprealm changes; C_MDS_renamesnap_finish calls _renamesnap_finish()
+ submit_mdlog_entry(le, new C_MDS_renamesnap_finish(this, mdr, diri, snapid),
+ mdr, __func__);
+ mdlog->flush();
+}
+
+void Server::_renamesnap_finish(MDRequestRef& mdr, CInode *diri, snapid_t snapid)
+{ /*
+ * Completion of renamesnap, invoked through
+ * C_MDS_renamesnap_finish::finish(). Pops and dirties the projected inode
+ * of diri, applies the request, commits the snap table transaction
+ * (more()->stid), notifies other MDSs and the realm with
+ * CEPH_SNAP_OP_UPDATE, then replies 0 to the client with diri as the trace
+ * inode and snapid as the request snapid.
+ */
+ dout(10) << "_renamesnap_finish " << *mdr << " " << snapid << dendl;
+
+ diri->pop_and_dirty_projected_inode(mdr->ls);
+ mdr->apply();
+
+ mds->snapclient->commit(mdr->more()->stid, mdr->ls);
+
+ dout(10) << "snaprealm now " << *diri->snaprealm << dendl;
+
+ // notify other mds
+ mdcache->send_snap_update(diri, mdr->more()->stid, CEPH_SNAP_OP_UPDATE);
+
+ mdcache->do_realm_invalidate_and_update_notify(diri, CEPH_SNAP_OP_UPDATE);
+
+ // yay
+ mdr->in[0] = diri;
+ mdr->tracei = diri;
+ mdr->snapid = snapid;
+ respond_to_request(mdr, 0);
+}
+
+/**
+ * Tell whether the server is in state RECONNECT and is still waiting for
+ * client c to reconnect.
+ *
+ * @param c client to look up
+ * @return true if c is present in client_reconnect_gather
+ */
+bool Server::waiting_for_reconnect(client_t c) const
+{
+  return client_reconnect_gather.find(c) != client_reconnect_gather.end();
+}
+
+/**
+ * Write the reconnect state to a formatter: a "reconnect_status" object
+ * section holding the streamed contents of client_reconnect_gather.
+ *
+ * @param f formatter to write to
+ */
+void Server::dump_reconnect_status(Formatter *f) const
+{
+  f->open_object_section("reconnect_status");
+  auto& gather_out = f->dump_stream("client_reconnect_gather");
+  gather_out << client_reconnect_gather;
+  f->close_section();
+}
diff --git a/src/mds/Server.h b/src/mds/Server.h
new file mode 100644
index 00000000..715e8496
--- /dev/null
+++ b/src/mds/Server.h
@@ -0,0 +1,384 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#ifndef CEPH_MDS_SERVER_H
+#define CEPH_MDS_SERVER_H
+
+#include <string_view>
+
+#include <common/DecayCounter.h>
+
+#include "messages/MClientReconnect.h"
+#include "messages/MClientReply.h"
+#include "messages/MClientRequest.h"
+#include "messages/MClientSession.h"
+#include "messages/MClientSnap.h"
+#include "messages/MClientReclaim.h"
+#include "messages/MClientReclaimReply.h"
+#include "messages/MLock.h"
+
+#include "MDSRank.h"
+#include "Mutation.h"
+#include "MDSContext.h"
+
+class OSDMap;
+class PerfCounters;
+class LogEvent;
+class EMetaBlob;
+class EUpdate;
+class MDLog;
+struct SnapInfo;
+
+enum { // NOTE(review): presumably the indices of the Server perf counters (see PerfCounters *logger / create_logger() in class Server) - confirm in Server.cc
+ l_mdss_first = 1000, // start of the range; the enumerators below take consecutive values
+ l_mdss_dispatch_client_request,
+ l_mdss_dispatch_slave_request,
+ l_mdss_handle_client_request,
+ l_mdss_handle_client_session,
+ l_mdss_handle_slave_request,
+ l_mdss_req_create_latency, // l_mdss_req_<op>_latency: one enumerator per request type, listed alphabetically
+ l_mdss_req_getattr_latency,
+ l_mdss_req_getfilelock_latency,
+ l_mdss_req_link_latency,
+ l_mdss_req_lookup_latency,
+ l_mdss_req_lookuphash_latency,
+ l_mdss_req_lookupino_latency,
+ l_mdss_req_lookupname_latency,
+ l_mdss_req_lookupparent_latency,
+ l_mdss_req_lookupsnap_latency,
+ l_mdss_req_lssnap_latency,
+ l_mdss_req_mkdir_latency,
+ l_mdss_req_mknod_latency,
+ l_mdss_req_mksnap_latency,
+ l_mdss_req_open_latency,
+ l_mdss_req_readdir_latency,
+ l_mdss_req_rename_latency,
+ l_mdss_req_renamesnap_latency,
+ l_mdss_req_rmdir_latency,
+ l_mdss_req_rmsnap_latency,
+ l_mdss_req_rmxattr_latency,
+ l_mdss_req_setattr_latency,
+ l_mdss_req_setdirlayout_latency,
+ l_mdss_req_setfilelock_latency,
+ l_mdss_req_setlayout_latency,
+ l_mdss_req_setxattr_latency,
+ l_mdss_req_symlink_latency,
+ l_mdss_req_unlink_latency,
+ l_mdss_cap_revoke_eviction,
+ l_mdss_cap_acquisition_throttle,
+ l_mdss_last, // end of the range; keep as the final enumerator
+};
+
+class Server {
+public:
+ using clock = ceph::coarse_mono_clock;
+ using time = ceph::coarse_mono_time;
+
+private:
+ MDSRank *mds;
+ MDCache *mdcache;
+ MDLog *mdlog;
+ PerfCounters *logger;
+
+ // OSDMap full status, used to generate ENOSPC on some operations
+ bool is_full;
+
+ // State for while in reconnect
+ MDSContext *reconnect_done;
+ int failed_reconnects;
+ bool reconnect_evicting; // true if I am waiting for evictions to complete
+ // before proceeding to reconnect_gather_finish
+ time reconnect_start = clock::zero();
+ time reconnect_last_seen = clock::zero();
+ set<client_t> client_reconnect_gather; // clients i need a reconnect msg from.
+
+ feature_bitset_t supported_features;
+ feature_bitset_t required_client_features;
+
+ bool replay_unsafe_with_closed_session = false;
+ double cap_revoke_eviction_timeout = 0;
+ uint64_t max_snaps_per_dir = 100;
+
+ friend class MDSContinuation;
+ friend class ServerContext;
+ friend class ServerLogContext;
+
+public:
+ bool terminating_sessions;
+
+ explicit Server(MDSRank *m);
+ ~Server() {
+ g_ceph_context->get_perfcounters_collection()->remove(logger);
+ delete logger;
+ delete reconnect_done;
+ }
+
+ void create_logger();
+
+ // message handler
+ void dispatch(const Message::const_ref &m);
+
+ void handle_osd_map();
+
+ // -- sessions and recovery --
+ bool waiting_for_reconnect(client_t c) const;
+ void dump_reconnect_status(Formatter *f) const;
+
+ time last_recalled() const {
+ return last_recall_state;
+ }
+
+ void handle_client_session(const MClientSession::const_ref &m);
+ void _session_logged(Session *session, uint64_t state_seq,
+ bool open, version_t pv, interval_set<inodeno_t>& inos,version_t piv);
+ version_t prepare_force_open_sessions(map<client_t,entity_inst_t> &cm,
+ map<client_t,client_metadata_t>& cmm,
+ map<client_t,pair<Session*,uint64_t> >& smap);
+ void finish_force_open_sessions(const map<client_t,pair<Session*,uint64_t> >& smap,
+ bool dec_import=true);
+ void flush_client_sessions(set<client_t>& client_set, MDSGatherBuilder& gather);
+ void finish_flush_session(Session *session, version_t seq);
+ void terminate_sessions();
+ void find_idle_sessions();
+ void kill_session(Session *session, Context *on_safe);
+ size_t apply_blacklist(const std::set<entity_addr_t> &blacklist);
+ void journal_close_session(Session *session, int state, Context *on_safe);
+
+ set<client_t> client_reclaim_gather;
+ size_t get_num_pending_reclaim() const { return client_reclaim_gather.size(); }
+ Session *find_session_by_uuid(std::string_view uuid);
+ void reclaim_session(Session *session, const MClientReclaim::const_ref &m);
+ void finish_reclaim_session(Session *session, const MClientReclaimReply::ref &reply=nullptr);
+ void handle_client_reclaim(const MClientReclaim::const_ref &m);
+
+ void reconnect_clients(MDSContext *reconnect_done_);
+ void handle_client_reconnect(const MClientReconnect::const_ref &m);
+ void infer_supported_features(Session *session, client_metadata_t& client_metadata);
+ void update_required_client_features();
+
+ //void process_reconnect_cap(CInode *in, int from, ceph_mds_cap_reconnect& capinfo);
+ void reconnect_gather_finish();
+ void reconnect_tick();
+ void recover_filelocks(CInode *in, bufferlist locks, int64_t client);
+
+ enum class RecallFlags : uint64_t {
+ NONE = 0,
+ STEADY = (1<<0),
+ ENFORCE_MAX = (1<<1),
+ TRIM = (1<<2),
+ ENFORCE_LIVENESS = (1<<3),
+ };
+ std::pair<bool, uint64_t> recall_client_state(MDSGatherBuilder* gather, RecallFlags=RecallFlags::NONE);
+ void force_clients_readonly();
+
+ // -- requests --
+ void handle_client_request(const MClientRequest::const_ref &m);
+
+ void journal_and_reply(MDRequestRef& mdr, CInode *tracei, CDentry *tracedn,
+ LogEvent *le, MDSLogContextBase *fin);
+ void submit_mdlog_entry(LogEvent *le, MDSLogContextBase *fin,
+ MDRequestRef& mdr, std::string_view event);
+ void dispatch_client_request(MDRequestRef& mdr);
+ void perf_gather_op_latency(const MClientRequest::const_ref &req, utime_t lat);
+ void early_reply(MDRequestRef& mdr, CInode *tracei, CDentry *tracedn);
+ void respond_to_request(MDRequestRef& mdr, int r = 0);
+ void set_trace_dist(Session *session, const MClientReply::ref &reply, CInode *in, CDentry *dn,
+ snapid_t snapid,
+ int num_dentries_wanted,
+ MDRequestRef& mdr);
+
+
+ void handle_slave_request(const MMDSSlaveRequest::const_ref &m);
+ void handle_slave_request_reply(const MMDSSlaveRequest::const_ref &m);
+ void dispatch_slave_request(MDRequestRef& mdr);
+ void handle_slave_auth_pin(MDRequestRef& mdr);
+ void handle_slave_auth_pin_ack(MDRequestRef& mdr, const MMDSSlaveRequest::const_ref &ack);
+
+ // some helpers
+ bool check_fragment_space(MDRequestRef& mdr, CDir *in);
+ bool check_access(MDRequestRef& mdr, CInode *in, unsigned mask);
+ bool _check_access(Session *session, CInode *in, unsigned mask, int caller_uid, int caller_gid, int setattr_uid, int setattr_gid);
+ CDir *validate_dentry_dir(MDRequestRef& mdr, CInode *diri, std::string_view dname);
+ CDir *traverse_to_auth_dir(MDRequestRef& mdr, vector<CDentry*> &trace, filepath refpath);
+ CDentry *prepare_null_dentry(MDRequestRef& mdr, CDir *dir, std::string_view dname, bool okexist=false);
+ CDentry *prepare_stray_dentry(MDRequestRef& mdr, CInode *in);
+ CInode* prepare_new_inode(MDRequestRef& mdr, CDir *dir, inodeno_t useino, unsigned mode,
+ file_layout_t *layout=NULL);
+ void journal_allocated_inos(MDRequestRef& mdr, EMetaBlob *blob);
+ void apply_allocated_inos(MDRequestRef& mdr, Session *session);
+
+ CInode* rdlock_path_pin_ref(MDRequestRef& mdr, int n, MutationImpl::LockOpVec& lov,
+ bool want_auth, bool no_want_auth=false,
+ file_layout_t **layout=nullptr,
+ bool no_lookup=false);
+ CDentry* rdlock_path_xlock_dentry(MDRequestRef& mdr, int n,
+ MutationImpl::LockOpVec& lov,
+ bool okexist, bool mustexist, bool alwaysxlock,
+ file_layout_t **layout=nullptr);
+
+ CDir* try_open_auth_dirfrag(CInode *diri, frag_t fg, MDRequestRef& mdr);
+
+
+ // requests on existing inodes.
+ void handle_client_getattr(MDRequestRef& mdr, bool is_lookup);
+ void handle_client_lookup_ino(MDRequestRef& mdr,
+ bool want_parent, bool want_dentry);
+ void _lookup_snap_ino(MDRequestRef& mdr);
+ void _lookup_ino_2(MDRequestRef& mdr, int r);
+ void handle_client_readdir(MDRequestRef& mdr);
+ void handle_client_file_setlock(MDRequestRef& mdr);
+ void handle_client_file_readlock(MDRequestRef& mdr);
+
+ void handle_client_setattr(MDRequestRef& mdr);
+ void handle_client_setlayout(MDRequestRef& mdr);
+ void handle_client_setdirlayout(MDRequestRef& mdr);
+
+ int parse_quota_vxattr(string name, string value, quota_info_t *quota);
+ void create_quota_realm(CInode *in);
+ int parse_layout_vxattr(string name, string value, const OSDMap& osdmap,
+ file_layout_t *layout, bool validate=true);
+ int check_layout_vxattr(MDRequestRef& mdr,
+ string name,
+ string value,
+ file_layout_t *layout);
+ void handle_set_vxattr(MDRequestRef& mdr, CInode *cur,
+ file_layout_t *dir_layout,
+ MutationImpl::LockOpVec& lov);
+ void handle_remove_vxattr(MDRequestRef& mdr, CInode *cur,
+ file_layout_t *dir_layout,
+ MutationImpl::LockOpVec& lov);
+ void handle_client_setxattr(MDRequestRef& mdr);
+ void handle_client_removexattr(MDRequestRef& mdr);
+
+ void handle_client_fsync(MDRequestRef& mdr);
+
+ // open
+ void handle_client_open(MDRequestRef& mdr);
+ void handle_client_openc(MDRequestRef& mdr); // O_CREAT variant.
+ void do_open_truncate(MDRequestRef& mdr, int cmode); // O_TRUNC variant.
+
+ // namespace changes
+ void handle_client_mknod(MDRequestRef& mdr);
+ void handle_client_mkdir(MDRequestRef& mdr);
+ void handle_client_symlink(MDRequestRef& mdr);
+
+ // link
+ void handle_client_link(MDRequestRef& mdr);
+ void _link_local(MDRequestRef& mdr, CDentry *dn, CInode *targeti, SnapRealm *target_realm);
+ void _link_local_finish(MDRequestRef& mdr, CDentry *dn, CInode *targeti,
+ version_t, version_t, bool);
+
+ void _link_remote(MDRequestRef& mdr, bool inc, CDentry *dn, CInode *targeti);
+ void _link_remote_finish(MDRequestRef& mdr, bool inc, CDentry *dn, CInode *targeti,
+ version_t);
+
+ void handle_slave_link_prep(MDRequestRef& mdr);
+ void _logged_slave_link(MDRequestRef& mdr, CInode *targeti, bool adjust_realm);
+ void _commit_slave_link(MDRequestRef& mdr, int r, CInode *targeti);
+ void _committed_slave(MDRequestRef& mdr); // use for rename, too
+ void handle_slave_link_prep_ack(MDRequestRef& mdr, const MMDSSlaveRequest::const_ref &m);
+ void do_link_rollback(bufferlist &rbl, mds_rank_t master, MDRequestRef& mdr);
+ void _link_rollback_finish(MutationRef& mut, MDRequestRef& mdr,
+ map<client_t,MClientSnap::ref>& split);
+
+ // unlink
+ void handle_client_unlink(MDRequestRef& mdr);
+ bool _dir_is_nonempty_unlocked(MDRequestRef& mdr, CInode *rmdiri);
+ bool _dir_is_nonempty(MDRequestRef& mdr, CInode *rmdiri);
+ void _unlink_local(MDRequestRef& mdr, CDentry *dn, CDentry *straydn);
+ void _unlink_local_finish(MDRequestRef& mdr,
+ CDentry *dn, CDentry *straydn,
+ version_t);
+ bool _rmdir_prepare_witness(MDRequestRef& mdr, mds_rank_t who, vector<CDentry*>& trace, CDentry *straydn);
+ void handle_slave_rmdir_prep(MDRequestRef& mdr);
+ void _logged_slave_rmdir(MDRequestRef& mdr, CDentry *srcdn, CDentry *straydn);
+ void _commit_slave_rmdir(MDRequestRef& mdr, int r, CDentry *straydn);
+ void handle_slave_rmdir_prep_ack(MDRequestRef& mdr, const MMDSSlaveRequest::const_ref &ack);
+ void do_rmdir_rollback(bufferlist &rbl, mds_rank_t master, MDRequestRef& mdr);
+ void _rmdir_rollback_finish(MDRequestRef& mdr, metareqid_t reqid, CDentry *dn, CDentry *straydn);
+
+ // rename
+ void handle_client_rename(MDRequestRef& mdr);
+ void _rename_finish(MDRequestRef& mdr,
+ CDentry *srcdn, CDentry *destdn, CDentry *straydn);
+
+ void handle_client_lssnap(MDRequestRef& mdr);
+ void handle_client_mksnap(MDRequestRef& mdr);
+ void _mksnap_finish(MDRequestRef& mdr, CInode *diri, SnapInfo &info);
+ void handle_client_rmsnap(MDRequestRef& mdr);
+ void _rmsnap_finish(MDRequestRef& mdr, CInode *diri, snapid_t snapid);
+ void handle_client_renamesnap(MDRequestRef& mdr);
+ void _renamesnap_finish(MDRequestRef& mdr, CInode *diri, snapid_t snapid);
+
+
+ // helpers
+ bool _rename_prepare_witness(MDRequestRef& mdr, mds_rank_t who, set<mds_rank_t> &witnesse,
+ vector<CDentry*>& srctrace, vector<CDentry*>& dsttrace, CDentry *straydn);
+ version_t _rename_prepare_import(MDRequestRef& mdr, CDentry *srcdn, bufferlist *client_map_bl);
+ bool _need_force_journal(CInode *diri, bool empty);
+ void _rename_prepare(MDRequestRef& mdr,
+ EMetaBlob *metablob, bufferlist *client_map_bl,
+ CDentry *srcdn, CDentry *destdn, CDentry *straydn);
+ /* set not_journaling=true if you're going to discard the results --
+ * this bypasses the asserts to make sure we're journaling the right
+ * things on the right nodes */
+ void _rename_apply(MDRequestRef& mdr, CDentry *srcdn, CDentry *destdn, CDentry *straydn);
+
+ // slaving
+ void handle_slave_rename_prep(MDRequestRef& mdr);
+ void handle_slave_rename_prep_ack(MDRequestRef& mdr, const MMDSSlaveRequest::const_ref &m);
+ void handle_slave_rename_notify_ack(MDRequestRef& mdr, const MMDSSlaveRequest::const_ref &m);
+ void _slave_rename_sessions_flushed(MDRequestRef& mdr);
+ void _logged_slave_rename(MDRequestRef& mdr, CDentry *srcdn, CDentry *destdn, CDentry *straydn);
+ void _commit_slave_rename(MDRequestRef& mdr, int r, CDentry *srcdn, CDentry *destdn, CDentry *straydn);
+ void do_rename_rollback(bufferlist &rbl, mds_rank_t master, MDRequestRef& mdr, bool finish_mdr=false);
+ void _rename_rollback_finish(MutationRef& mut, MDRequestRef& mdr, CDentry *srcdn, version_t srcdnpv,
+ CDentry *destdn, CDentry *staydn, map<client_t,MClientSnap::ref> splits[2],
+ bool finish_mdr);
+
+ void evict_cap_revoke_non_responders();
+ void handle_conf_change(const std::set<std::string>& changed);
+
+private:
+ void reply_client_request(MDRequestRef& mdr, const MClientReply::ref &reply);
+ void flush_session(Session *session, MDSGatherBuilder& gather);
+
+ DecayCounter recall_throttle;
+ time last_recall_state;
+
+ // Cache cap acquisition throttle configs
+ uint64_t max_caps_per_client;
+ uint64_t cap_acquisition_throttle;
+ double max_caps_throttle_ratio;
+ double caps_throttle_retry_request_timeout;
+};
+
+// Combine two RecallFlags values with a bitwise OR computed on the
+// enum's underlying integer type.
+static inline constexpr auto operator|(Server::RecallFlags a, Server::RecallFlags b) {
+  using raw_t = std::underlying_type_t<Server::RecallFlags>;
+  const auto merged = static_cast<raw_t>(a) | static_cast<raw_t>(b);
+  return static_cast<Server::RecallFlags>(merged);
+}
+// Intersect two RecallFlags values with a bitwise AND computed on the
+// enum's underlying integer type.
+static inline constexpr auto operator&(Server::RecallFlags a, Server::RecallFlags b) {
+  using raw_t = std::underlying_type_t<Server::RecallFlags>;
+  const auto common = static_cast<raw_t>(a) & static_cast<raw_t>(b);
+  return static_cast<Server::RecallFlags>(common);
+}
+// Print a RecallFlags value as "0x" followed by its underlying integer in
+// hexadecimal, then switch the stream back to decimal.
+static inline std::ostream& operator<<(std::ostream& os, const Server::RecallFlags& f) {
+  using raw_t = std::underlying_type_t<Server::RecallFlags>;
+  os << "0x" << std::hex << static_cast<raw_t>(f);
+  os << std::dec;
+  return os;
+}
+// True when no bit is set in the given RecallFlags value.
+static inline constexpr bool operator!(const Server::RecallFlags& f) {
+  using raw_t = std::underlying_type_t<Server::RecallFlags>;
+  return static_cast<raw_t>(f) == static_cast<raw_t>(0);
+}
+
+#endif
diff --git a/src/mds/SessionMap.cc b/src/mds/SessionMap.cc
new file mode 100644
index 00000000..56c71de8
--- /dev/null
+++ b/src/mds/SessionMap.cc
@@ -0,0 +1,1226 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#include "MDSRank.h"
+#include "MDCache.h"
+#include "Mutation.h"
+#include "SessionMap.h"
+#include "osdc/Filer.h"
+#include "common/Finisher.h"
+
+#include "common/config.h"
+#include "common/errno.h"
+#include "common/DecayCounter.h"
+#include "include/ceph_assert.h"
+#include "include/stringify.h"
+
+#define dout_context g_ceph_context
+#define dout_subsys ceph_subsys_mds
+#undef dout_prefix
+#define dout_prefix *_dout << "mds." << rank << ".sessionmap "
+
+namespace {
+/**
+ * Common base for the SessionMap I/O completions in this file.  It keeps
+ * a pointer to the owning SessionMap and resolves the MDSRank through it.
+ */
+class SessionMapIOContext : public MDSIOContextBase
+{
+public:
+  explicit SessionMapIOContext(SessionMap *owner) : sessionmap(owner) {
+    ceph_assert(sessionmap != NULL);
+  }
+
+protected:
+  SessionMap *sessionmap;
+  MDSRank *get_mds() override { return sessionmap->mds; }
+};
+} // anonymous namespace
+
+/**
+ * Build the "mds_sessions" perf counter set, store it in `logger` and add
+ * it to the global perf counter collection.
+ */
+void SessionMap::register_perfcounters()
+{
+  auto *cct = g_ceph_context;
+  PerfCountersBuilder builder(cct, "mds_sessions",
+                              l_mdssm_first, l_mdssm_last);
+
+  // The session count is registered with an explicit PRIO_INTERESTING.
+  builder.add_u64(l_mdssm_session_count, "session_count",
+                  "Session count", "sess",
+                  PerfCountersBuilder::PRIO_INTERESTING);
+
+  // Set PRIO_USEFUL as the builder default before adding the rest.
+  builder.set_prio_default(PerfCountersBuilder::PRIO_USEFUL);
+
+  builder.add_u64_counter(l_mdssm_session_add, "session_add",
+                          "Sessions added");
+  builder.add_u64_counter(l_mdssm_session_remove, "session_remove",
+                          "Sessions removed");
+
+  builder.add_u64(l_mdssm_session_open, "sessions_open",
+                  "Sessions currently open");
+  builder.add_u64(l_mdssm_session_stale, "sessions_stale",
+                  "Sessions currently stale");
+  builder.add_u64(l_mdssm_total_load, "total_load", "Total Load");
+  builder.add_u64(l_mdssm_avg_load, "average_load", "Average Load");
+  builder.add_u64(l_mdssm_avg_session_uptime, "avg_session_uptime",
+                  "Average session uptime");
+
+  logger = builder.create_perf_counters();
+  cct->get_perfcounters_collection()->add(logger);
+}
+
+/**
+ * Log every session at debug level 10: entity name, Session pointer,
+ * state name, completed requests and preallocated/used inode sets.
+ */
+void SessionMap::dump()
+{
+  dout(10) << "dump" << dendl;
+  for (const auto& entry : session_map) {
+    Session *s = entry.second;
+    dout(10) << entry.first << " " << s
+             << " state " << s->get_state_name()
+             << " completed " << s->info.completed_requests
+             << " prealloc_inos " << s->info.prealloc_inos
+             << " used_inos " << s->info.used_inos
+             << dendl;
+  }
+}
+
+
+// ----------------
+// LOAD
+
+
+/**
+ * Name of this rank's sessionmap object, formatted as
+ * "mds<nodeid>_sessionmap".
+ */
+object_t SessionMap::get_object_name() const
+{
+  char name_buf[30];
+  const int rank_id = int(mds->get_nodeid());
+  snprintf(name_buf, sizeof(name_buf), "mds%d_sessionmap", rank_id);
+  return object_t(name_buf);
+}
+
+namespace {
+/**
+ * Completion for one OMAP read of the sessionmap object.  The read
+ * operation fills the public buffers and result codes below; finish()
+ * forwards all of them to SessionMap::_load_finish().
+ */
+class C_IO_SM_Load : public SessionMapIOContext {
+public:
+  const bool first;   //< Am I the initial (header) load?
+  int header_r = 0;   //< Return value from OMAP header read
+  int values_r = 0;   //< Return value from OMAP value read
+  bufferlist header_bl;
+  std::map<std::string, bufferlist> session_vals;
+  bool more_session_vals = false;
+
+  C_IO_SM_Load(SessionMap *cm, const bool f)
+    : SessionMapIOContext(cm), first(f) {}
+
+  void finish(int r) override {
+    sessionmap->_load_finish(r, header_r, values_r, first,
+                             header_bl, session_vals, more_session_vals);
+  }
+
+  void print(ostream& out) const override {
+    out << "session_load";
+  }
+};
+} // anonymous namespace
+
+
+/**
+ * Decode the OMAP header, which carries the sessionmap `version`.
+ * Meant to be called once per load.
+ *
+ * @param header_bl raw header bytes as read from the object
+ */
+void SessionMapStore::decode_header(bufferlist &header_bl)
+{
+  auto it = header_bl.cbegin();
+  DECODE_START(1, it)
+  decode(version, it);
+  DECODE_FINISH(it);
+}
+
+/**
+ * Encode the OMAP header (currently just `version`) into *header_bl.
+ *
+ * @param header_bl destination buffer; must not be null
+ */
+void SessionMapStore::encode_header(bufferlist *header_bl)
+{
+  bufferlist &out = *header_bl;
+  ENCODE_START(1, 1, out);
+  encode(version, out);
+  ENCODE_FINISH(out);
+}
+
+/**
+ * Decode a batch of serialized OMAP records and apply them to the session
+ * map.  May be called repeatedly as batched reads arrive.
+ *
+ * Each key is parsed as an entity name; the value is decoded into the
+ * session returned by get_or_add_session().  A session found closed is
+ * moved to STATE_OPEN before decoding.
+ *
+ * @param session_vals key -> encoded session value
+ * @throws buffer::malformed_input if a key is not a valid entity name
+ */
+void SessionMapStore::decode_values(std::map<std::string, bufferlist> &session_vals)
+{
+  for (auto& kv : session_vals) {
+    const std::string& key = kv.first;
+
+    entity_inst_t inst;
+    if (!inst.name.parse(key)) {
+      derr << "Corrupt entity name '" << key << "' in sessionmap" << dendl;
+      throw buffer::malformed_input("Corrupt entity name in sessionmap");
+    }
+
+    Session *s = get_or_add_session(inst);
+    if (s->is_closed()) {
+      s->set_state(Session::STATE_OPEN);
+      s->set_load_avg_decay_rate(decay_rate);
+    }
+
+    auto val_it = kv.second.cbegin();
+    s->decode(val_it);
+  }
+}
+
+/**
+ * An OMAP read finished.
+ *
+ * Handles the completion of one sessionmap read: validates the result
+ * codes, decodes the header (first read only) and the batch of session
+ * values, and then either issues the next batched read or finalizes the
+ * load and wakes the contexts in waiting_for_load.
+ *
+ * Any read or decode failure is reported to the cluster log and the rank
+ * is marked damaged.
+ *
+ * @param operation_r result of the whole read operation
+ * @param header_r result of the OMAP header read (checked only if first)
+ * @param values_r result of the OMAP values read
+ * @param first true for the initial read, which also carries the header
+ * @param header_bl raw header bytes (used only if first)
+ * @param session_vals batch of key -> encoded session value
+ * @param more_session_vals true if more keys remain after this batch
+ */
+void SessionMap::_load_finish(
+ int operation_r,
+ int header_r,
+ int values_r,
+ bool first,
+ bufferlist &header_bl,
+ std::map<std::string, bufferlist> &session_vals,
+ bool more_session_vals)
+{
+ // The read operation as a whole failed.
+ if (operation_r < 0) {
+ derr << "_load_finish got " << cpp_strerror(operation_r) << dendl;
+ mds->clog->error() << "error reading sessionmap '" << get_object_name()
+ << "' " << operation_r << " ("
+ << cpp_strerror(operation_r) << ")";
+ mds->damaged();
+ ceph_abort(); // Should be unreachable because damaged() calls respawn()
+ }
+
+ // Decode header
+ if (first) {
+ if (header_r != 0) {
+ derr << __func__ << ": header error: " << cpp_strerror(header_r) << dendl;
+ mds->clog->error() << "error reading sessionmap header "
+ << header_r << " (" << cpp_strerror(header_r) << ")";
+ mds->damaged();
+ ceph_abort(); // Should be unreachable because damaged() calls respawn()
+ }
+
+ // An empty header means the object has no OMAP header: fall back to the
+ // legacy loader, which takes over completing the load.
+ if(header_bl.length() == 0) {
+ dout(4) << __func__ << ": header missing, loading legacy..." << dendl;
+ load_legacy();
+ return;
+ }
+
+ try {
+ decode_header(header_bl);
+ } catch (buffer::error &e) {
+ mds->clog->error() << "corrupt sessionmap header: " << e.what();
+ mds->damaged();
+ ceph_abort(); // Should be unreachable because damaged() calls respawn()
+ }
+ dout(10) << __func__ << " loaded version " << version << dendl;
+ }
+
+ if (values_r != 0) {
+ derr << __func__ << ": error reading values: "
+ << cpp_strerror(values_r) << dendl;
+ mds->clog->error() << "error reading sessionmap values: "
+ << values_r << " (" << cpp_strerror(values_r) << ")";
+ mds->damaged();
+ ceph_abort(); // Should be unreachable because damaged() calls respawn()
+ }
+
+ // Decode session_vals
+ try {
+ decode_values(session_vals);
+ } catch (buffer::error &e) {
+ mds->clog->error() << "corrupt sessionmap values: " << e.what();
+ mds->damaged();
+ ceph_abort(); // Should be unreachable because damaged() calls respawn()
+ }
+
+ if (more_session_vals) {
+ // Issue another read if we're not at the end of the omap
+ // NOTE(review): rbegin() is dereferenced unconditionally, so this assumes
+ // a batch flagged as having more keys is never empty -- confirm.
+ const std::string last_key = session_vals.rbegin()->first;
+ dout(10) << __func__ << ": continue omap load from '"
+ << last_key << "'" << dendl;
+ object_t oid = get_object_name();
+ object_locator_t oloc(mds->mdsmap->get_metadata_pool());
+ // Follow-up reads are created with first=false, so the header is not
+ // processed again.
+ C_IO_SM_Load *c = new C_IO_SM_Load(this, false);
+ ObjectOperation op;
+ op.omap_get_vals(last_key, "", g_conf()->mds_sessionmap_keys_per_op,
+ &c->session_vals, &c->more_session_vals, &c->values_r);
+ mds->objecter->read(oid, oloc, op, CEPH_NOSNAP, NULL, 0,
+ new C_OnFinisher(c, mds->finisher));
+ } else {
+ // I/O is complete. Update `by_state`
+ dout(10) << __func__ << ": omap load complete" << dendl;
+ for (ceph::unordered_map<entity_name_t, Session*>::iterator i = session_map.begin();
+ i != session_map.end(); ++i) {
+ Session *s = i->second;
+ // Find (or lazily create) the per-state list and append the session.
+ auto by_state_entry = by_state.find(s->get_state());
+ if (by_state_entry == by_state.end())
+ by_state_entry = by_state.emplace(s->get_state(),
+ new xlist<Session*>).first;
+ by_state_entry->second->push_back(&s->item_session_list);
+ }
+
+ // Population is complete. Trigger load waiters.
+ dout(10) << __func__ << ": v " << version
+ << ", " << session_map.size() << " sessions" << dendl;
+ // Freshly loaded: nothing is projected or in flight beyond `version`.
+ projected = committing = committed = version;
+ dump();
+ finish_contexts(g_ceph_context, waiting_for_load);
+ }
+}
+
+/**
+ * Start populating session state from the OMAP header and records in
+ * this rank's sessionmap object.
+ *
+ * @param onload optional context; if non-null it is queued on
+ *               waiting_for_load
+ */
+void SessionMap::load(MDSContext *onload)
+{
+  dout(10) << "load" << dendl;
+
+  if (onload) {
+    waiting_for_load.push_back(onload);
+  }
+
+  C_IO_SM_Load *ctx = new C_IO_SM_Load(this, true);
+  object_t oid = get_object_name();
+  object_locator_t oloc(mds->mdsmap->get_metadata_pool());
+
+  // One compound read: the header plus the first batch of keys.
+  ObjectOperation op;
+  op.omap_get_header(&ctx->header_bl, &ctx->header_r);
+  op.omap_get_vals("", "", g_conf()->mds_sessionmap_keys_per_op,
+                   &ctx->session_vals, &ctx->more_session_vals,
+                   &ctx->values_r);
+
+  mds->objecter->read(oid, oloc, op, CEPH_NOSNAP, NULL, 0,
+                      new C_OnFinisher(ctx, mds->finisher));
+}
+
+namespace {
+/**
+ * Completion for the legacy full-object read.  The read fills `bl`;
+ * finish() passes it on to SessionMap::_load_legacy_finish().
+ */
+class C_IO_SM_LoadLegacy : public SessionMapIOContext {
+public:
+  bufferlist bl;
+
+  explicit C_IO_SM_LoadLegacy(SessionMap *cm) : SessionMapIOContext(cm) {}
+
+  void finish(int r) override {
+    sessionmap->_load_legacy_finish(r, bl);
+  }
+
+  void print(ostream& out) const override {
+    out << "session_load_legacy";
+  }
+};
+} // anonymous namespace
+
+
+/**
+ * Load the legacy SessionMap format, in which the whole map is stored as
+ * the object's data blob.  This is the fallback taken when no OMAP header
+ * is found during a normal load; waiting_for_load is assumed to already
+ * hold the relevant completion.
+ */
+void SessionMap::load_legacy()
+{
+  dout(10) << __func__ << dendl;
+
+  C_IO_SM_LoadLegacy *ctx = new C_IO_SM_LoadLegacy(this);
+  object_t oid = get_object_name();
+  object_locator_t oloc(mds->mdsmap->get_metadata_pool());
+
+  // Read the entire object into the completion's buffer.
+  mds->objecter->read_full(oid, oloc, CEPH_NOSNAP, &ctx->bl, 0,
+                           new C_OnFinisher(ctx, mds->finisher));
+}
+
+/**
+ * Completion of the legacy full-object read: decode the blob, mark every
+ * session dirty so the next save() rewrites the map in OMAP form, and
+ * wake the contexts in waiting_for_load.
+ *
+ * @param r  result of the read; a negative value aborts
+ * @param bl object data holding the legacy-encoded sessionmap
+ */
+void SessionMap::_load_legacy_finish(int r, bufferlist &bl)
+{
+  if (r < 0) {
+    derr << "_load_finish got " << cpp_strerror(r) << dendl;
+    ceph_abort_msg("failed to load sessionmap");
+  }
+
+  auto blp = bl.cbegin();
+  dump();
+  decode_legacy(blp);  // note: this sets last_cap_renew = now()
+  dout(10) << "_load_finish v " << version
+           << ", " << session_map.size() << " sessions, "
+           << bl.length() << " bytes"
+           << dendl;
+  projected = committing = committed = version;
+  dump();
+
+  // Mark all sessions dirty, so that on next save() we will write
+  // a complete OMAP version of the data loaded from the legacy format.
+  // Don't use mark_dirty because on this occasion we want to ignore the
+  // keys_per_op limit and do one big write (upgrade must be atomic)
+  for (const auto& entry : session_map) {
+    dirty_sessions.insert(entry.first);
+  }
+  loaded_legacy = true;
+
+  finish_contexts(g_ceph_context, waiting_for_load);
+}
+
+
+// ----------------
+// SAVE
+
+namespace {
+/**
+ * Completion for a sessionmap save.  On success it reports the saved
+ * version to SessionMap::_save_finish(); any non-zero result is handed
+ * to the rank's write-error handler instead.
+ */
+class C_IO_SM_Save : public SessionMapIOContext {
+  version_t version;
+
+public:
+  C_IO_SM_Save(SessionMap *cm, version_t v)
+    : SessionMapIOContext(cm), version(v) {}
+
+  void finish(int r) override {
+    if (r == 0) {
+      sessionmap->_save_finish(version);
+      return;
+    }
+    get_mds()->handle_write_error(r);
+  }
+
+  void print(ostream& out) const override {
+    out << "session_save";
+  }
+};
+} // anonymous namespace
+
+/**
+ * Write the dirty part of the sessionmap to this rank's sessionmap object
+ * in a single RADOS mutation: the header, the dirty sessions, and the
+ * removal of keys for sessions recorded in null_sessions.
+ *
+ * @param onsave context queued in commit_waiters for the version being
+ *               written (or for the in-flight commit, see below)
+ * @param needv if non-zero and a commit at or beyond this version is
+ *              already in flight, no new write is issued and onsave is
+ *              attached to that in-flight commit
+ */
+void SessionMap::save(MDSContext *onsave, version_t needv)
+{
+ dout(10) << __func__ << ": needv " << needv << ", v " << version << dendl;
+
+ // A write covering needv is already in flight: just wait for it.
+ if (needv && committing >= needv) {
+ ceph_assert(committing > committed);
+ commit_waiters[committing].push_back(onsave);
+ return;
+ }
+
+ commit_waiters[version].push_back(onsave);
+
+ committing = version;
+ SnapContext snapc;
+ object_t oid = get_object_name();
+ object_locator_t oloc(mds->mdsmap->get_metadata_pool());
+
+ ObjectOperation op;
+
+ /* Compose OSD OMAP transaction for full write */
+ bufferlist header_bl;
+ encode_header(&header_bl);
+ op.omap_set_header(header_bl);
+
+ /* If we loaded a legacy sessionmap, then erase the old data. If
+ * an old-versioned MDS tries to read it, it'll fail out safely
+ * with an end_of_buffer exception */
+ if (loaded_legacy) {
+ dout(4) << __func__ << " erasing legacy sessionmap" << dendl;
+ op.truncate(0);
+ loaded_legacy = false; // only need to truncate once.
+ }
+
+ dout(20) << " updating keys:" << dendl;
+ map<string, bufferlist> to_set;
+ for(std::set<entity_name_t>::iterator i = dirty_sessions.begin();
+ i != dirty_sessions.end(); ++i) {
+ const entity_name_t name = *i;
+ // NOTE(review): operator[] would insert a null entry for an unknown name;
+ // this relies on dirty_sessions only naming sessions present in
+ // session_map (remove_session() erases the name from dirty_sessions).
+ Session *session = session_map[name];
+
+ // Only sessions in these states are written; others are skipped.
+ if (session->is_open() ||
+ session->is_closing() ||
+ session->is_stale() ||
+ session->is_killing()) {
+ dout(20) << " " << name << dendl;
+ // Serialize K
+ std::ostringstream k;
+ k << name;
+
+ // Serialize V
+ bufferlist bl;
+ session->info.encode(bl, mds->mdsmap->get_up_features());
+
+ // Add to RADOS op
+ to_set[k.str()] = bl;
+
+ session->clear_dirty_completed_requests();
+ } else {
+ dout(20) << " " << name << " (ignoring)" << dendl;
+ }
+ }
+ if (!to_set.empty()) {
+ op.omap_set(to_set);
+ }
+
+ dout(20) << " removing keys:" << dendl;
+ set<string> to_remove;
+ for(std::set<entity_name_t>::const_iterator i = null_sessions.begin();
+ i != null_sessions.end(); ++i) {
+ dout(20) << " " << *i << dendl;
+ std::ostringstream k;
+ k << *i;
+ to_remove.insert(k.str());
+ }
+ if (!to_remove.empty()) {
+ op.omap_rm_keys(to_remove);
+ }
+
+ // Everything pending has been folded into `op`; reset the tracking sets.
+ dirty_sessions.clear();
+ null_sessions.clear();
+
+ mds->objecter->mutate(oid, oloc, op, snapc,
+ ceph::real_clock::now(),
+ 0,
+ new C_OnFinisher(new C_IO_SM_Save(this, version),
+ mds->finisher));
+}
+
+/**
+ * A save of version v completed: record it as committed and complete the
+ * contexts that were waiting on that version.
+ */
+void SessionMap::_save_finish(version_t v)
+{
+  dout(10) << "_save_finish v" << v << dendl;
+  committed = v;
+
+  auto& waiters = commit_waiters[v];
+  finish_contexts(g_ceph_context, waiters);
+  commit_waiters.erase(v);
+}
+
+
+/**
+ * Deserialize sessions from the legacy encoding, then place every session
+ * on the by_state list that matches its state.
+ */
+void SessionMap::decode_legacy(bufferlist::const_iterator &p)
+{
+  // Populate `sessions`
+  SessionMapStore::decode_legacy(p);
+
+  // Update `by_state`, creating per-state lists on demand
+  for (auto& entry : session_map) {
+    Session *s = entry.second;
+    auto slot = by_state.find(s->get_state());
+    if (slot == by_state.end()) {
+      slot = by_state.emplace(s->get_state(), new xlist<Session*>).first;
+    }
+    slot->second->push_back(&s->item_session_list);
+  }
+}
+
+/**
+ * Move a session to state s.  If the state actually changes, the session
+ * is appended to the list for the new state and the open/stale session
+ * gauges are refreshed.
+ *
+ * @return the session's state sequence number after the call
+ */
+uint64_t SessionMap::set_state(Session *session, int s) {
+  if (session->state == s) {
+    // Nothing to change.
+    return session->get_state_seq();
+  }
+
+  session->set_state(s);
+
+  auto slot = by_state.find(s);
+  if (slot == by_state.end()) {
+    slot = by_state.emplace(s, new xlist<Session*>).first;
+  }
+  slot->second->push_back(&session->item_session_list);
+
+  if (session->is_open() || session->is_stale()) {
+    session->set_load_avg_decay_rate(decay_rate);
+  }
+
+  // refresh number of sessions for states which have perf
+  // counters associated
+  logger->set(l_mdssm_session_open,
+              get_session_count_in_state(Session::STATE_OPEN));
+  logger->set(l_mdssm_session_stale,
+              get_session_count_in_state(Session::STATE_STALE));
+
+  return session->get_state_seq();
+}
+
+/**
+ * Decode the legacy (single data blob) sessionmap encoding.
+ *
+ * The blob starts with a 64-bit value.  If that value is (uint64_t)-1, a
+ * versioned encoding follows: the map version, then (entity name,
+ * session) pairs until the end of the buffer.  Otherwise the value is
+ * itself the map version ("old format"), followed by a count and then
+ * encoded session info records.
+ *
+ * In the versioned format a session returned by get_or_add_session() that
+ * is closed is moved to STATE_OPEN.  In the old format every decoded
+ * session is set to STATE_OPEN and its last_cap_renew is set to the time
+ * this function was entered.
+ *
+ * @param p iterator over the legacy blob; advanced as data is consumed
+ */
+void SessionMapStore::decode_legacy(bufferlist::const_iterator& p)
+{
+  auto now = clock::now();
+  uint64_t pre;
+  decode(pre, p);
+  if (pre == (uint64_t)-1) {
+    DECODE_START_LEGACY_COMPAT_LEN(3, 3, 3, p);
+    ceph_assert(struct_v >= 2);
+
+    decode(version, p);
+
+    while (!p.end()) {
+      entity_inst_t inst;
+      decode(inst.name, p);
+      Session *s = get_or_add_session(inst);
+      if (s->is_closed()) {
+        s->set_state(Session::STATE_OPEN);
+        s->set_load_avg_decay_rate(decay_rate);
+      }
+      s->decode(p);
+    }
+
+    DECODE_FINISH(p);
+  } else {
+    // --- old format ----
+    version = pre;
+
+    // this is a meaningless upper bound.  can be ignored.
+    __u32 n;
+    decode(n, p);
+
+    while (n-- && !p.end()) {
+      // Remember where this record starts so it can be decoded a second
+      // time into an already-existing session.
+      auto p2 = p;
+      Session *s = new Session(ConnectionRef());
+      s->info.decode(p);
+      {
+        auto& name = s->info.inst.name;
+        auto it = session_map.find(name);
+        if (it != session_map.end()) {
+          // eager client connected too fast!  aie.
+          dout(10) << " already had session for " << name << ", recovering" << dendl;
+          delete s;
+          s = it->second;
+          p = p2;
+          s->info.decode(p);
+        } else {
+          // No session under this name yet.  `it` equals end() here and
+          // must not be dereferenced, so insert through the map itself.
+          session_map[name] = s;
+        }
+      }
+      s->set_state(Session::STATE_OPEN);
+      s->set_load_avg_decay_rate(decay_rate);
+      s->last_cap_renew = now;
+    }
+  }
+}
+
+/**
+ * Dump this session to a Formatter: identity, state, lease/cap counts,
+ * request statistics, the recall/release counters and finally the
+ * session info.  The load average is emitted only for open or stale
+ * sessions.
+ */
+void Session::dump(Formatter *f) const
+{
+  // Identity and state
+  f->dump_int("id", info.inst.name.num());
+  f->dump_object("entity", info.inst);
+  f->dump_string("state", get_state_name());
+
+  // Resource counts
+  f->dump_int("num_leases", leases.size());
+  f->dump_int("num_caps", caps.size());
+
+  const bool has_load_avg = is_open() || is_stale();
+  if (has_load_avg) {
+    f->dump_unsigned("request_load_avg", get_load_avg());
+  }
+
+  // Request statistics
+  f->dump_float("uptime", get_session_uptime());
+  f->dump_unsigned("requests_in_flight", get_request_count());
+  f->dump_unsigned("completed_requests", get_num_completed_requests());
+  f->dump_bool("reconnecting", reconnecting);
+
+  // Recall / release / acquisition counters
+  f->dump_object("recall_caps", recall_caps);
+  f->dump_object("release_caps", release_caps);
+  f->dump_object("recall_caps_throttle", recall_caps_throttle);
+  f->dump_object("recall_caps_throttle2o", recall_caps_throttle2o);
+  f->dump_object("session_cache_liveness", session_cache_liveness);
+  f->dump_object("cap_acquisition", cap_acquisition);
+
+  info.dump(f);
+}
+
+/**
+ * Dump all sessions as an array section named "sessions", one "session"
+ * object per entry.
+ */
+void SessionMapStore::dump(Formatter *f) const
+{
+  f->open_array_section("sessions");
+  for (const auto& entry : session_map) {
+    const auto& sess = *entry.second;
+    f->dump_object("session", sess);
+  }
+  f->close_section(); // Sessions
+}
+
+/**
+ * Append test instances to ls; for now just one default-constructed store.
+ */
+void SessionMapStore::generate_test_instances(list<SessionMapStore*>& ls)
+{
+  // pretty boring for now
+  SessionMapStore *blank = new SessionMapStore();
+  ls.push_back(blank);
+}
+
+/**
+ * Remove every session from the map and advance the version, logging the
+ * map contents before and after.
+ */
+void SessionMap::wipe()
+{
+  dout(1) << "wipe start" << dendl;
+  dump();
+
+  // remove_session() erases the entry, so keep taking the first one.
+  while (!session_map.empty()) {
+    remove_session(session_map.begin()->second);
+  }
+  version = ++projected;
+
+  dout(1) << "wipe result" << dendl;
+  dump();
+  dout(1) << "wipe done" << dendl;
+}
+
+/**
+ * Clear the pending, preallocated and used inode sets of every session,
+ * then advance the version.
+ */
+void SessionMap::wipe_ino_prealloc()
+{
+  for (auto& entry : session_map) {
+    Session *s = entry.second;
+    s->pending_prealloc_inos.clear();
+    s->info.prealloc_inos.clear();
+    s->info.used_inos.clear();
+  }
+  projected = ++version;
+}
+
+/**
+ * Register a new session: insert it into session_map (its name must not
+ * be present yet), append it to the list for its current state, take a
+ * reference on it and update the session perf counters.
+ */
+void SessionMap::add_session(Session *s)
+{
+  dout(10) << __func__ << " s=" << s << " name=" << s->info.inst.name << dendl;
+
+  ceph_assert(session_map.count(s->info.inst.name) == 0);
+  session_map[s->info.inst.name] = s;
+
+  // Find (or lazily create) the list for this state.
+  auto slot = by_state.find(s->state);
+  if (slot == by_state.end()) {
+    slot = by_state.emplace(s->state, new xlist<Session*>).first;
+  }
+  slot->second->push_back(&s->item_session_list);
+  s->get();
+
+  update_average_birth_time(*s);
+
+  logger->set(l_mdssm_session_count, session_map.size());
+  logger->inc(l_mdssm_session_add);
+}
+
+/**
+ * Unregister a session: drop it from its state list and from session_map,
+ * move its name from dirty_sessions to null_sessions, release the
+ * reference on it and update the session perf counters.
+ */
+void SessionMap::remove_session(Session *s)
+{
+  dout(10) << __func__ << " s=" << s << " name=" << s->info.inst.name << dendl;
+
+  update_average_birth_time(*s, false);
+
+  s->trim_completed_requests(0);
+  s->item_session_list.remove_myself();
+
+  // `name` refers into *s, so it is only used before the put() below.
+  const auto& name = s->info.inst.name;
+  session_map.erase(name);
+  dirty_sessions.erase(name);
+  null_sessions.insert(name);
+  s->put();
+
+  logger->set(l_mdssm_session_count, session_map.size());
+  logger->inc(l_mdssm_session_remove);
+}
+
+/**
+ * Move the session to the back of the list for its state and stamp
+ * last_cap_renew with the current time.
+ */
+void SessionMap::touch_session(Session *session)
+{
+  dout(10) << __func__ << " s=" << session << " name=" << session->info.inst.name << dendl;
+
+  // Move to the back of the session list for this state (should
+  // already be on a list courtesy of add_session and set_state)
+  ceph_assert(session->item_session_list.is_on_list());
+
+  auto slot = by_state.find(session->state);
+  if (slot == by_state.end()) {
+    slot = by_state.emplace(session->state, new xlist<Session*>).first;
+  }
+  slot->second->push_back(&session->item_session_list);
+
+  session->last_cap_renew = clock::now();
+}
+
+/**
+ * Record a session as dirty (and no longer null).  Does nothing if the
+ * session is already dirty.
+ *
+ * @param may_save if true and the dirty set has reached
+ *                 mds_sessionmap_keys_per_op entries, a save() is issued
+ *                 first
+ */
+void SessionMap::_mark_dirty(Session *s, bool may_save)
+{
+  const auto& name = s->info.inst.name;
+  if (dirty_sessions.count(name)) {
+    return;
+  }
+
+  if (may_save &&
+      dirty_sessions.size() >= g_conf()->mds_sessionmap_keys_per_op) {
+    // Pre-empt the usual save() call from journal segment trim, in
+    // order to avoid building up an oversized OMAP update operation
+    // from too many sessions modified at once
+    save(new C_MDSInternalNoop, version);
+  }
+
+  null_sessions.erase(name);
+  dirty_sessions.insert(name);
+}
+
+/**
+ * Mark a session dirty, bump the map version and pop that version from
+ * the session's projected versions.
+ */
+void SessionMap::mark_dirty(Session *s, bool may_save)
+{
+  dout(20) << __func__ << " s=" << s << " name=" << s->info.inst.name
+           << " v=" << version << dendl;
+
+  _mark_dirty(s, may_save);
+  ++version;
+  s->pop_pv(version);
+}
+
+/**
+ * Replay variant of mark_dirty(): mark the session dirty without allowing
+ * a pre-emptive save, then advance the version via
+ * replay_advance_version().
+ */
+void SessionMap::replay_dirty_session(Session *s)
+{
+  dout(20) << __func__ << " s=" << s << " name=" << s->info.inst.name
+           << " v=" << version << dendl;
+
+  const bool may_save = false;
+  _mark_dirty(s, may_save);
+
+  replay_advance_version();
+}
+
+/**
+ * Bump the version by one and set `projected` to the new version.
+ */
+void SessionMap::replay_advance_version()
+{
+  projected = ++version;
+}
+
+/**
+ * Replay the opening of a batch of client sessions.
+ *
+ * For each client in client_map the session is fetched or created and any
+ * matching metadata from client_metadata_map is merged in.  The first
+ * `already_saved` clients are skipped; the remaining ones are set to
+ * STATE_OPEN and replayed as dirty, advancing `version` once per session.
+ *
+ * If the versions are inconsistent, or a session counted as already saved
+ * is found closed, the error path runs: it asserts that mds_wipe_sessions
+ * is set, wipes the sessionmap and forces its version to event_cmapv.
+ *
+ * @param event_cmapv sessionmap version expected after the replay
+ *                    (presumably recorded with the journal event --
+ *                    confirm against callers)
+ * @param client_map clients whose sessions are to be opened
+ * @param client_metadata_map optional per-client metadata to merge
+ */
+void SessionMap::replay_open_sessions(version_t event_cmapv,
+ map<client_t,entity_inst_t>& client_map,
+ map<client_t,client_metadata_t>& client_metadata_map)
+{
+ unsigned already_saved;
+
+ // Even dirtying every session in the batch could not reach event_cmapv.
+ if (version + client_map.size() < event_cmapv)
+ goto bad;
+
+ // Server::finish_force_open_sessions() marks sessions dirty one by one.
+ // Marking a session dirty may flush all existing dirty sessions. So it's
+ // possible that some sessions are already saved in sessionmap.
+ already_saved = client_map.size() - (event_cmapv - version);
+ for (const auto& p : client_map) {
+ Session *s = get_or_add_session(p.second);
+ auto q = client_metadata_map.find(p.first);
+ if (q != client_metadata_map.end())
+ s->info.client_metadata.merge(q->second);
+
+ if (already_saved > 0) {
+ // A session counted as already saved is expected not to be closed.
+ if (s->is_closed())
+ goto bad;
+
+ --already_saved;
+ continue;
+ }
+
+ set_state(s, Session::STATE_OPEN);
+ replay_dirty_session(s);
+ }
+ return;
+
+bad:
+ mds->clog->error() << "error replaying open sessions(" << client_map.size()
+ << ") sessionmap v " << event_cmapv << " table " << version;
+ // Recovery by wiping is only permitted when mds_wipe_sessions is set.
+ ceph_assert(g_conf()->mds_wipe_sessions);
+ mds->sessionmap.wipe();
+ mds->sessionmap.set_version(event_cmapv);
+}
+
+/**
+ * Advance the projected version by one and push it onto the session's
+ * projected versions.
+ *
+ * @return the new projected version
+ */
+version_t SessionMap::mark_projected(Session *s)
+{
+  dout(20) << __func__ << " s=" << s << " name=" << s->info.inst.name
+           << " pv=" << projected << " -> " << projected + 1 << dendl;
+  ++projected;
+  const version_t pv = projected;
+  s->push_pv(pv);
+  return pv;
+}
+
+namespace {
+/**
+ * Completion for one of the batched writes issued by save_if_dirty().
+ * On success the wrapped context is completed; any non-zero result goes
+ * to the rank's write-error handler instead.
+ */
+class C_IO_SM_Save_One : public SessionMapIOContext {
+  MDSContext *on_safe;
+
+public:
+  C_IO_SM_Save_One(SessionMap *cm, MDSContext *on_safe_)
+    : SessionMapIOContext(cm), on_safe(on_safe_) {}
+
+  void finish(int r) override {
+    if (r == 0) {
+      on_safe->complete(r);
+      return;
+    }
+    get_mds()->handle_write_error(r);
+  }
+
+  void print(ostream& out) const override {
+    out << "session_save_one";
+  }
+};
+} // anonymous namespace
+
+
+/**
+ * Write out those of the given sessions whose completed_requests changed
+ * since they were last written and which are not already queued in
+ * dirty_sessions.  Writes are issued directly as OMAP set operations,
+ * batched to at most mds_sessionmap_keys_per_op keys each, without
+ * touching the sessionmap header.
+ *
+ * @param tgt_sessions candidate session names; names no longer present in
+ *                     session_map are ignored
+ * @param gather_bld gather builder (must not be null); one sub-context is
+ *                   created per write issued
+ */
+void SessionMap::save_if_dirty(const std::set<entity_name_t> &tgt_sessions,
+ MDSGatherBuilder *gather_bld)
+{
+ ceph_assert(gather_bld != NULL);
+
+ std::vector<entity_name_t> write_sessions;
+
+ // Decide which sessions require a write
+ for (std::set<entity_name_t>::iterator i = tgt_sessions.begin();
+ i != tgt_sessions.end(); ++i) {
+ const entity_name_t &session_id = *i;
+
+ if (session_map.count(session_id) == 0) {
+ // Session isn't around any more, never mind.
+ continue;
+ }
+
+ Session *session = session_map[session_id];
+ if (!session->has_dirty_completed_requests()) {
+ // Session hasn't had completed_requests
+ // modified since last write, no need to
+ // write it now.
+ continue;
+ }
+
+ if (dirty_sessions.count(session_id) > 0) {
+ // Session is already dirtied, will be written, no
+ // need to pre-empt that.
+ continue;
+ }
+ // Okay, passed all our checks, now we write
+ // this session out. The version we write
+ // into the OMAP may now be higher-versioned
+ // than the version in the header, but that's
+ // okay because it's never a problem to have
+ // an overly-fresh copy of a session.
+ write_sessions.push_back(*i);
+ }
+
+ dout(4) << __func__ << ": writing " << write_sessions.size() << dendl;
+
+ // Batch writes into mds_sessionmap_keys_per_op
+ // NOTE(review): kpo is used as a modulus below, so this assumes the
+ // configured value is non-zero -- confirm.
+ const uint32_t kpo = g_conf()->mds_sessionmap_keys_per_op;
+ map<string, bufferlist> to_set;
+ for (uint32_t i = 0; i < write_sessions.size(); ++i) {
+ const entity_name_t &session_id = write_sessions[i];
+ Session *session = session_map[session_id];
+ session->clear_dirty_completed_requests();
+
+ // Serialize K
+ std::ostringstream k;
+ k << session_id;
+
+ // Serialize V
+ bufferlist bl;
+ session->info.encode(bl, mds->mdsmap->get_up_features());
+
+ // Add to RADOS op
+ to_set[k.str()] = bl;
+
+ // Complete this write transaction?
+ // Flush on the last session overall, or on every kpo-th session.
+ if (i == write_sessions.size() - 1
+ || i % kpo == kpo - 1) {
+ ObjectOperation op;
+ op.omap_set(to_set);
+ to_set.clear(); // clear to start a new transaction
+
+ SnapContext snapc;
+ object_t oid = get_object_name();
+ object_locator_t oloc(mds->mdsmap->get_metadata_pool());
+ MDSContext *on_safe = gather_bld->new_sub();
+ mds->objecter->mutate(oid, oloc, op, snapc,
+ ceph::real_clock::now(), 0,
+ new C_OnFinisher(
+ new C_IO_SM_Save_One(this, on_safe),
+ mds->finisher));
+ }
+ }
+}
+
+// =================
+// Session
+
+#undef dout_prefix
+#define dout_prefix *_dout << "Session "
+
+/**
+ * Count the entries of the `requests` member list by walking it, because
+ * elist does not have a size() method.
+ *
+ * O(N) runtime.
+ */
+size_t Session::get_request_count() const
+{
+  size_t count = 0;
+  auto it = requests.begin();
+  while (!it.end()) {
+    ++count;
+    ++it;
+  }
+  return count;
+}
+
+/**
+ * Called in response to a CEPH_MSG_CLIENT_CAPRELEASE message, with
+ * n_caps equal to the number of caps that were released in the message.
+ * Updates the counters that track how many caps a client has released
+ * since it was last instructed to RECALL_STATE: recall_caps is hit with
+ * the negated count and release_caps with the count itself.
+ */
+void Session::notify_cap_release(size_t n_caps)
+{
+  const double released = (double)n_caps;
+  recall_caps.hit(-released);
+  release_caps.hit(n_caps);
+}
+
+/**
+ * Called when a CEPH_MSG_CLIENT_SESSION->CEPH_SESSION_RECALL_STATE
+ * message is sent to the client.  Updates the recall-related counters so
+ * that health metrics can be generated if the session doesn't see a
+ * commensurate number of calls to ::notify_cap_release.
+ *
+ * @param new_limit cap limit sent to the client; must be below the
+ *                  session's current number of caps
+ * @return the number of caps being recalled when new_limit differs from
+ *         recall_limit, otherwise 0
+ */
+uint64_t Session::notify_recall_sent(size_t new_limit)
+{
+  const auto num_caps = caps.size();
+  ceph_assert(new_limit < num_caps);  // Behaviour of Server::recall_client_state
+  const auto count = num_caps-new_limit;
+
+  // Report zero when the limit is unchanged.
+  const uint64_t new_change = (recall_limit != new_limit) ? count : 0;
+
+  /* Always hit the session counter as a RECALL message is still sent to the
+   * client and we do not want the MDS to burn its global counter tokens on a
+   * session that is not releasing caps (i.e. allow the session counter to
+   * throttle future RECALL messages).
+   */
+  recall_caps_throttle.hit(count);
+  recall_caps_throttle2o.hit(count);
+  recall_caps.hit(count);
+  return new_change;
+}
+
+/**
+ * Use client metadata to generate a somewhat-friendlier
+ * name for the client than its session ID.
+ *
+ * This is *not* guaranteed to be unique, and any machine
+ * consumers of session-related output should always use
+ * the session ID as the primary key and use this only
+ * as a presentation hint.
+ *
+ * Result stored in human_name:
+ *  - "<hostname>"       when the client reported a "hostname" metadata field
+ *  - "<hostname>:<id>"  additionally, when a short non-default entity ID is set
+ *  - "<numeric id>"     when no hostname is available
+ */
+void Session::_update_human_name()
+{
+  const auto hostname = info.client_metadata.find("hostname");
+  if (hostname == info.client_metadata.end()) {
+    // Fallback, refer to clients by numeric ID (e.g. 4567 for client.4567)
+    human_name = stringify(info.inst.name.num());
+    return;
+  }
+
+  // Happy path, refer to clients by hostname
+  human_name = hostname->second;
+  if (info.auth_name.has_default_id()) {
+    return;
+  }
+
+  // When a non-default entity ID is set by the user, assume they
+  // would like to see it in references to the client, if it's
+  // reasonably short.  Limit the length because we don't want
+  // to put e.g. uuid-generated names into a "human readable"
+  // rendering.
+  // Unsigned to match std::string::size(); avoids a mixed-sign comparison.
+  constexpr size_t arbitrarily_short = 16;
+  const auto &entity_id = info.auth_name.get_id();
+  if (entity_id.size() < arbitrarily_short) {
+    human_name += ':';
+    human_name += entity_id;
+  }
+}
+
+// Decode the durable session info from `p`, then regenerate the derived
+// human-readable name, which is computed from the decoded client metadata.
+void Session::decode(bufferlist::const_iterator &p)
+{
+ info.decode(p);
+
+ _update_human_name();
+}
+
+/**
+ * Check whether this session may perform the operation described by `mask`
+ * on inode `in` on behalf of the given caller identity.
+ *
+ * @return 0 if permitted; -EIO if the inode is a directory whose layout
+ *         carries a pool namespace and the client connection lacks
+ *         CEPH_FEATURE_FS_FILE_LAYOUT_V2; -EACCES if the session's auth
+ *         caps do not allow the access.
+ */
+int Session::check_access(CInode *in, unsigned mask,
+			  int caller_uid, int caller_gid,
+			  const vector<uint64_t> *caller_gid_list,
+			  int new_uid, int new_gid)
+{
+  // Inode of the directory holding `in` (none for base inodes).
+  CInode *parent_dir_inode = NULL;
+  if (!in->is_base()) {
+    parent_dir_inode = in->get_projected_parent_dn()->get_dir()->get_inode();
+  }
+
+  // Inodes sitting in a stray directory are checked against the path they
+  // had before being moved there.
+  string path;
+  const bool in_stray = parent_dir_inode && parent_dir_inode->is_stray();
+  if (in_stray) {
+    path = in->get_projected_inode()->stray_prior_path;
+    dout(20) << __func__ << " stray_prior_path " << path << dendl;
+  } else {
+    in->make_path_string(path, true);
+    dout(20) << __func__ << " path " << path << dendl;
+  }
+  if (path.length()) {
+    path = path.substr(1); // drop leading /
+  }
+
+  const bool dir_has_pool_ns =
+    in->inode.is_dir() &&
+    in->inode.has_layout() &&
+    in->inode.layout.pool_ns.length();
+  if (dir_has_pool_ns &&
+      !connection->has_feature(CEPH_FEATURE_FS_FILE_LAYOUT_V2)) {
+    dout(10) << __func__ << " client doesn't support FS_FILE_LAYOUT_V2" << dendl;
+    return -EIO;
+  }
+
+  const bool capable =
+    auth_caps.is_capable(path, in->inode.uid, in->inode.gid, in->inode.mode,
+			 caller_uid, caller_gid, caller_gid_list, mask,
+			 new_uid, new_gid,
+			 info.inst.addr);
+  return capable ? 0 : -EACCES;
+}
+
+// Track total and per-session request load: bump the map-wide load
+// counter, publish the total and the per-session average (over sessions in
+// the open, stale and closing states) to the perf counters, then bump the
+// given session's own counter.
+void SessionMap::hit_session(Session *session) {
+  const uint64_t active_sessions =
+    get_session_count_in_state(Session::STATE_OPEN) +
+    get_session_count_in_state(Session::STATE_STALE) +
+    get_session_count_in_state(Session::STATE_CLOSING);
+  ceph_assert(active_sessions != 0);
+
+  const double total = total_load_avg.hit();
+  const double per_session = total / active_sessions;
+
+  logger->set(l_mdssm_total_load, (uint64_t)total);
+  logger->set(l_mdssm_avg_load, (uint64_t)per_session);
+
+  session->hit_session();
+}
+
+/**
+ * React to runtime configuration changes that affect decay counters.
+ *
+ * For every decay-rate option named in `changed`, the affected counters are
+ * re-created with the new rate on each session that is currently in the
+ * open or stale state (plus the map-wide load counter for the request load
+ * option).
+ */
+void SessionMap::handle_conf_change(const std::set<std::string>& changed)
+{
+  // Invoke `fn` on each session filed under STATE_OPEN, then STATE_STALE.
+  auto for_each_live_session = [this](auto fn) {
+    const int live_states[] = {Session::STATE_OPEN, Session::STATE_STALE};
+    for (const int st : live_states) {
+      const auto bucket = by_state.find(st);
+      if (bucket == by_state.end()) {
+        continue;
+      }
+      for (const auto &session : *(bucket->second)) {
+        fn(session);
+      }
+    }
+  };
+  const auto was_changed = [&changed](const char *key) {
+    return changed.count(key) != 0;
+  };
+
+  if (was_changed("mds_request_load_average_decay_rate")) {
+    const auto rate = g_conf().get_val<double>("mds_request_load_average_decay_rate");
+
+    decay_rate = rate;
+    total_load_avg = DecayCounter(rate);
+
+    for_each_live_session([rate](auto s) {
+      s->set_load_avg_decay_rate(rate);
+    });
+  }
+  if (was_changed("mds_recall_max_decay_rate")) {
+    const auto rate = g_conf().get_val<double>("mds_recall_max_decay_rate");
+    for_each_live_session([rate](auto s) {
+      s->recall_caps_throttle = DecayCounter(rate);
+    });
+  }
+  if (was_changed("mds_recall_warning_decay_rate")) {
+    const auto rate = g_conf().get_val<double>("mds_recall_warning_decay_rate");
+    for_each_live_session([rate](auto s) {
+      s->recall_caps = DecayCounter(rate);
+      s->release_caps = DecayCounter(rate);
+    });
+  }
+  if (was_changed("mds_session_cache_liveness_decay_rate")) {
+    const auto rate = g_conf().get_val<double>("mds_session_cache_liveness_decay_rate");
+    for_each_live_session([rate](auto s) {
+      s->session_cache_liveness = DecayCounter(rate);
+      s->session_cache_liveness.hit(s->caps.size()); /* so the MDS doesn't immediately start trimming a new session */
+    });
+  }
+  if (was_changed("mds_session_cap_acquisition_decay_rate")) {
+    const auto rate = g_conf().get_val<double>("mds_session_cap_acquisition_decay_rate");
+    for_each_live_session([rate](auto s) {
+      s->cap_acquisition = DecayCounter(rate);
+    });
+  }
+}
+
+// Publish the average session uptime (seconds elapsed since avg_birth_time)
+// to the perf counters.  Does nothing while the session map is empty.
+void SessionMap::update_average_session_age() {
+  if (session_map.empty()) {
+    return;
+  }
+
+  const std::chrono::duration<double> elapsed = clock::now() - avg_birth_time;
+  logger->set(l_mdssm_avg_session_uptime, (uint64_t)elapsed.count());
+}
+
+/**
+ * Populate this filter from a list of "key=value" expressions.
+ *
+ * Recognised keys:
+ *  - client_metadata.<field>  match a freeform client metadata field
+ *  - auth_name                match the client entity ID
+ *  - state                    match the session state name
+ *  - id                       match the numeric client ID (base 10)
+ *  - reconnecting             strict boolean: true/false/1/0
+ *
+ * @param args filter expressions to parse
+ * @param ss   receives a human-readable message on failure; must not be null
+ * @return 0 on success, -EINVAL on the first malformed expression
+ */
+int SessionFilter::parse(
+    const std::vector<std::string> &args,
+    std::stringstream *ss)
+{
+  ceph_assert(ss != NULL);
+
+  // Keys that start with this are to be taken as referring
+  // to freeform client metadata fields.
+  const std::string metadata_prefix("client_metadata.");
+
+  /**
+   * Strict boolean parser. Allow true/false/0/1.
+   * Returns 0 and stores the value in *out; anything else is -EINVAL.
+   *
+   * The return type must be int: with a bool return type the -EINVAL
+   * error code would be collapsed to `true`.
+   */
+  auto parse_bool = [](std::string_view bstr, bool *out) -> int
+  {
+    ceph_assert(out != nullptr);
+
+    if (bstr == "true" || bstr == "1") {
+      *out = true;
+      return 0;
+    } else if (bstr == "false" || bstr == "0") {
+      *out = false;
+      return 0;
+    } else {
+      return -EINVAL;
+    }
+  };
+
+  for (const auto &s : args) {
+    dout(20) << __func__ << " parsing filter '" << s << "'" << dendl;
+
+    // NOTE(review): find() yields either npos or an index below s.size(),
+    // so the second test can never fire; an empty value ("k=") is accepted.
+    auto eq = s.find("=");
+    if (eq == std::string::npos || eq == s.size()) {
+      *ss << "Invalid filter '" << s << "'";
+      return -EINVAL;
+    }
+
+    auto k = s.substr(0, eq);
+    auto v = s.substr(eq + 1);
+
+    dout(20) << __func__ << " parsed k='" << k << "', v='" << v << "'" << dendl;
+
+    if (k.compare(0, metadata_prefix.size(), metadata_prefix) == 0
+        && k.size() > metadata_prefix.size()) {
+      // Filter on arbitrary metadata key (no fixed schema for this,
+      // so anything after the dot is a valid field to filter on)
+      auto metadata_key = k.substr(metadata_prefix.size());
+      metadata.insert(std::make_pair(metadata_key, v));
+    } else if (k == "auth_name") {
+      // Filter on client entity name
+      auth_name = v;
+    } else if (k == "state") {
+      state = v;
+    } else if (k == "id") {
+      std::string err;
+      id = strict_strtoll(v.c_str(), 10, &err);
+      if (!err.empty()) {
+        *ss << err;
+        return -EINVAL;
+      }
+    } else if (k == "reconnecting") {
+      bool bval;
+      const int r = parse_bool(v, &bval);
+      if (r == 0) {
+        set_reconnecting(bval);
+      } else {
+        *ss << "Invalid boolean value '" << v << "'";
+        return -EINVAL;
+      }
+    } else {
+      *ss << "Invalid filter key '" << k << "'";
+      return -EINVAL;
+    }
+  }
+
+  return 0;
+}
+
+/**
+ * Test whether `session` satisfies every criterion set on this filter.
+ *
+ * Criteria left at their empty/zero defaults are ignored.  `is_reconnecting`
+ * is only consulted when a reconnecting criterion has been set.
+ *
+ * @return true if all active criteria match, false otherwise
+ */
+bool SessionFilter::match(
+    const Session &session,
+    std::function<bool(client_t)> is_reconnecting) const
+{
+  // Every requested metadata field must be present with the exact value.
+  const auto &client_meta = session.info.client_metadata;
+  for (const auto &[key, wanted] : metadata) {
+    auto found = client_meta.find(key);
+    if (found == client_meta.end() || found->second != wanted) {
+      return false;
+    }
+  }
+
+  const bool auth_mismatch =
+    !auth_name.empty() && auth_name != session.info.auth_name.get_id();
+  if (auth_mismatch) {
+    return false;
+  }
+
+  const bool state_mismatch =
+    !state.empty() && state != session.get_state_name();
+  if (state_mismatch) {
+    return false;
+  }
+
+  const bool id_mismatch =
+    id != 0 && id != session.info.inst.name.num();
+  if (id_mismatch) {
+    return false;
+  }
+
+  if (reconnecting.first) {
+    const bool am_reconnecting = is_reconnecting(session.info.inst.name.num());
+    if (am_reconnecting != reconnecting.second) {
+      return false;
+    }
+  }
+
+  return true;
+}
+
+// Print a session as its human name, followed by the client id in
+// parentheses whenever the human name is not just the stringified client id.
+std::ostream& operator<<(std::ostream &out, const Session &s)
+{
+  const std::string &label = s.get_human_name();
+  out << label;
+  if (label != stringify(s.get_client())) {
+    out << " (" << std::dec << s.get_client() << ")";
+  }
+  return out;
+}
+
diff --git a/src/mds/SessionMap.h b/src/mds/SessionMap.h
new file mode 100644
index 00000000..dd7721cc
--- /dev/null
+++ b/src/mds/SessionMap.h
@@ -0,0 +1,838 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#ifndef CEPH_MDS_SESSIONMAP_H
+#define CEPH_MDS_SESSIONMAP_H
+
+#include <set>
+using std::set;
+
+#include "include/unordered_map.h"
+
+#include "include/Context.h"
+#include "include/xlist.h"
+#include "include/elist.h"
+#include "include/interval_set.h"
+#include "mdstypes.h"
+#include "mds/MDSAuthCaps.h"
+#include "common/perf_counters.h"
+#include "common/DecayCounter.h"
+
+class CInode;
+struct MDRequestImpl;
+
+#include "CInode.h"
+#include "Capability.h"
+#include "MDSContext.h"
+#include "msg/Message.h"
+
+// Perf counter indices for the session map, passed to the PerfCounters
+// logger (e.g. logger->set(l_mdssm_session_count, ...)).  l_mdssm_first and
+// l_mdssm_last bracket the range of valid indices.
+enum {
+ l_mdssm_first = 5500,
+ l_mdssm_session_count,
+ l_mdssm_session_add,
+ l_mdssm_session_remove,
+ l_mdssm_session_open,
+ l_mdssm_session_stale,
+ l_mdssm_total_load,
+ l_mdssm_avg_load,
+ l_mdssm_avg_session_uptime,
+ l_mdssm_last,
+};
+
+/*
+ * session
+ */
+
+class Session : public RefCountedObject {
+ // -- state etc --
+public:
+ /*
+
+ <deleted> <-- closed <------------+
+ ^ | |
+ | v |
+ killing <-- opening <----+ |
+ ^ | | |
+ | v | |
+ stale <--> open --> closing ---+
+
+ + additional dimension of 'importing' (with counter)
+
+ */
+
+ using clock = ceph::coarse_mono_clock;
+ using time = ceph::coarse_mono_time;
+
+
+ enum {
+ STATE_CLOSED = 0,
+ STATE_OPENING = 1, // journaling open
+ STATE_OPEN = 2,
+ STATE_CLOSING = 3, // journaling close
+ STATE_STALE = 4,
+ STATE_KILLING = 5
+ };
+
+ // Map a STATE_* value to its lowercase name; unknown values give "???".
+ static std::string_view get_state_name(int s) {
+   // Table is indexed directly by the STATE_* value.
+   static_assert(STATE_CLOSED == 0 && STATE_OPENING == 1 && STATE_OPEN == 2 &&
+                 STATE_CLOSING == 3 && STATE_STALE == 4 && STATE_KILLING == 5,
+                 "state name table must match the STATE_* numbering");
+   static constexpr std::string_view names[] = {
+     "closed", "opening", "open", "closing", "stale", "killing",
+   };
+   if (s < STATE_CLOSED || s > STATE_KILLING) {
+     return "???";
+   }
+   return names[s];
+ }
+
+ void dump(Formatter *f) const;
+
+private:
+ int state = STATE_CLOSED;
+ bool reconnecting = false;
+ uint64_t state_seq = 0;
+ int importing_count = 0;
+ friend class SessionMap;
+
+ // Human (friendly) name is soft state generated from client metadata
+ void _update_human_name();
+ std::string human_name;
+
+ // Versions in which this session was projected: used to verify
+ // that appropriate mark_dirty calls follow.
+ std::deque<version_t> projected;
+
+ // request load average for this session
+ DecayCounter load_avg;
+
+ // Ephemeral state for tracking progress of capability recalls
+ // caps being recalled recently by this session; used for Beacon warnings
+ DecayCounter recall_caps;
+ // caps that have been released
+ DecayCounter release_caps;
+ // throttle on caps recalled
+ DecayCounter recall_caps_throttle;
+ // second order throttle that prevents recalling too quickly
+ DecayCounter recall_caps_throttle2o;
+ // New limit in SESSION_RECALL
+ uint32_t recall_limit = 0;
+
+ // session caps liveness
+ DecayCounter session_cache_liveness;
+
+ // cap acquisition via readdir
+ DecayCounter cap_acquisition;
+
+ // session start time -- used to track average session time
+ // note that this is initialized in the constructor rather
+ // than at the time of adding a session to the sessionmap
+ // as journal replay of sessionmap will not call add_session().
+ time birth_time;
+
+public:
+ Session *reclaiming_from = nullptr;
+
+ void push_pv(version_t pv)
+ {
+ ceph_assert(projected.empty() || projected.back() != pv);
+ projected.push_back(pv);
+ }
+
+ void pop_pv(version_t v)
+ {
+ ceph_assert(!projected.empty());
+ ceph_assert(projected.front() == v);
+ projected.pop_front();
+ }
+
+ int get_state() const { return state; }
+ // Switch to `new_state`; state_seq is bumped only on an actual change.
+ void set_state(int new_state)
+ {
+   if (state == new_state) {
+     return;
+   }
+   state = new_state;
+   state_seq++;
+ }
+
+ void set_reconnecting(bool s) { reconnecting = s; }
+
+ void decode(bufferlist::const_iterator &p);
+ template<typename T>
+ void set_client_metadata(T&& meta)
+ {
+ info.client_metadata = std::forward<T>(meta);
+ _update_human_name();
+ }
+
+ const std::string& get_human_name() const {return human_name;}
+
+ session_info_t info; ///< durable bits
+
+ MDSAuthCaps auth_caps;
+
+protected:
+ ConnectionRef connection;
+public:
+ xlist<Session*>::item item_session_list;
+
+ list<Message::ref> preopen_out_queue; ///< messages for client, queued before they connect
+
+ /* This is mutable to allow get_request_count to be const. elist does not
+ * support const iterators yet.
+ */
+ mutable elist<MDRequestImpl*> requests;
+ size_t get_request_count() const;
+
+ interval_set<inodeno_t> pending_prealloc_inos; // journaling prealloc, will be added to prealloc_inos
+
+ void notify_cap_release(size_t n_caps);
+ uint64_t notify_recall_sent(size_t new_limit);
+ auto get_recall_caps_throttle() const {
+ return recall_caps_throttle.get();
+ }
+ auto get_recall_caps_throttle2o() const {
+ return recall_caps_throttle2o.get();
+ }
+ auto get_recall_caps() const {
+ return recall_caps.get();
+ }
+ auto get_release_caps() const {
+ return release_caps.get();
+ }
+ auto get_session_cache_liveness() const {
+ return session_cache_liveness.get();
+ }
+ auto get_cap_acquisition() const {
+ return cap_acquisition.get();
+ }
+
+ // First inode number in the preallocated set, or 0 if the set is empty.
+ // Does not consume the number.
+ inodeno_t next_ino() const {
+   return info.prealloc_inos.empty()
+     ? inodeno_t(0)
+     : inodeno_t(info.prealloc_inos.range_start());
+ }
+ // Consume one preallocated inode number and record it as used.
+ // A non-zero `ino` is honoured only if it is in the preallocated set;
+ // otherwise the first preallocated number is taken.  The preallocated
+ // set must not be empty (asserted).
+ inodeno_t take_ino(inodeno_t ino = 0) {
+   ceph_assert(!info.prealloc_inos.empty());
+
+   const bool use_requested = ino && info.prealloc_inos.contains(ino);
+   if (!use_requested) {
+     ino = info.prealloc_inos.range_start();
+   }
+   info.prealloc_inos.erase(ino);
+   info.used_inos.insert(ino, 1);
+   return ino;
+ }
+ int get_num_projected_prealloc_inos() const {
+ return info.prealloc_inos.size() + pending_prealloc_inos.size();
+ }
+
+ client_t get_client() const {
+ return info.get_client();
+ }
+
+ std::string_view get_state_name() const { return get_state_name(state); }
+ uint64_t get_state_seq() const { return state_seq; }
+ bool is_closed() const { return state == STATE_CLOSED; }
+ bool is_opening() const { return state == STATE_OPENING; }
+ bool is_open() const { return state == STATE_OPEN; }
+ bool is_closing() const { return state == STATE_CLOSING; }
+ bool is_stale() const { return state == STATE_STALE; }
+ bool is_killing() const { return state == STATE_KILLING; }
+
+ void inc_importing() {
+ ++importing_count;
+ }
+ void dec_importing() {
+ ceph_assert(importing_count > 0);
+ --importing_count;
+ }
+ bool is_importing() const { return importing_count > 0; }
+
+ void set_load_avg_decay_rate(double rate) {
+ ceph_assert(is_open() || is_stale());
+ load_avg = DecayCounter(rate);
+ }
+ uint64_t get_load_avg() const {
+ return (uint64_t)load_avg.get();
+ }
+ void hit_session() {
+ load_avg.adjust();
+ }
+
+ double get_session_uptime() const {
+ chrono::duration<double> uptime = clock::now() - birth_time;
+ return uptime.count();
+ }
+
+ time get_birth_time() const {
+ return birth_time;
+ }
+
+ // -- caps --
+private:
+ uint32_t cap_gen = 0;
+ version_t cap_push_seq = 0; // cap push seq #
+ map<version_t, MDSContext::vec > waitfor_flush; // flush session messages
+
+public:
+ xlist<Capability*> caps; // inodes with caps; front=most recently used
+ xlist<ClientLease*> leases; // metadata leases to clients
+ time last_cap_renew = clock::zero();
+ time last_seen = clock::zero();
+
+ void inc_cap_gen() { ++cap_gen; }
+ uint32_t get_cap_gen() const { return cap_gen; }
+
+ version_t inc_push_seq() { return ++cap_push_seq; }
+ version_t get_push_seq() const { return cap_push_seq; }
+
+ version_t wait_for_flush(MDSContext* c) {
+ waitfor_flush[get_push_seq()].push_back(c);
+ return get_push_seq();
+ }
+ // Move every flush waiter registered at a push seq <= `seq` onto the end
+ // of `ls` (in ascending seq order) and drop them from waitfor_flush.
+ void finish_flush(version_t seq, MDSContext::vec& ls) {
+   auto it = waitfor_flush.begin();
+   while (it != waitfor_flush.end() && it->first <= seq) {
+     auto& waiters = it->second;
+     ls.insert(ls.end(), waiters.begin(), waiters.end());
+     it = waitfor_flush.erase(it);
+   }
+ }
+
+ void touch_readdir_cap(uint32_t count) {
+ cap_acquisition.hit(count);
+ }
+
+ void touch_cap(Capability *cap) {
+ session_cache_liveness.hit(1.0);
+ caps.push_front(&cap->item_session_caps);
+ }
+
+ void touch_cap_bottom(Capability *cap) {
+ session_cache_liveness.hit(1.0);
+ caps.push_back(&cap->item_session_caps);
+ }
+
+ void touch_lease(ClientLease *r) {
+ session_cache_liveness.hit(1.0);
+ leases.push_back(&r->item_session_lease);
+ }
+
+ bool is_any_flush_waiter() {
+ return !waitfor_flush.empty();
+ }
+
+ // -- leases --
+ uint32_t lease_seq = 0;
+
+ // -- completed requests --
+private:
+ // Has completed_requests been modified since the last time we
+ // wrote this session out?
+ bool completed_requests_dirty = false;
+
+ unsigned num_trim_flushes_warnings = 0;
+ unsigned num_trim_requests_warnings = 0;
+public:
+ void add_completed_request(ceph_tid_t t, inodeno_t created) {
+ info.completed_requests[t] = created;
+ completed_requests_dirty = true;
+ }
+ // Forget completed requests with a tid below `mintid`; mintid == 0 means
+ // forget all of them.  Marks the session's completed requests dirty when
+ // anything was removed.  Returns whether anything was removed.
+ bool trim_completed_requests(ceph_tid_t mintid) {
+   auto& done = info.completed_requests;
+   // The map is ordered by tid, so the entries to drop form a prefix.
+   const auto stop = (mintid == 0) ? done.end() : done.lower_bound(mintid);
+   const bool erased_any = (stop != done.begin());
+   done.erase(done.begin(), stop);
+
+   if (erased_any) {
+     completed_requests_dirty = true;
+   }
+   return erased_any;
+ }
+ // Report whether `tid` is recorded as a completed request.  On a hit, and
+ // if `pcreated` is non-null, the inode number stored with it is copied out.
+ bool have_completed_request(ceph_tid_t tid, inodeno_t *pcreated) const {
+   const auto found = info.completed_requests.find(tid);
+   const bool present = (found != info.completed_requests.end());
+   if (present && pcreated) {
+     *pcreated = found->second;
+   }
+   return present;
+ }
+
+ void add_completed_flush(ceph_tid_t tid) {
+ info.completed_flushes.insert(tid);
+ }
+ // Forget completed flush tids below `mintid`; mintid == 0 means forget all
+ // of them.  Removal sets completed_requests_dirty (the same flag used for
+ // completed requests).  Returns whether anything was removed.
+ bool trim_completed_flushes(ceph_tid_t mintid) {
+   auto& flushed = info.completed_flushes;
+   bool erased_any = false;
+   auto it = flushed.begin();
+   while (it != flushed.end() && (mintid == 0 || *it < mintid)) {
+     it = flushed.erase(it);
+     erased_any = true;
+   }
+   if (erased_any) {
+     completed_requests_dirty = true;
+   }
+   return erased_any;
+ }
+ bool have_completed_flush(ceph_tid_t tid) const {
+ return info.completed_flushes.count(tid);
+ }
+
+ uint64_t get_num_caps() const {
+ return caps.size();
+ }
+
+ unsigned get_num_completed_flushes() const { return info.completed_flushes.size(); }
+ unsigned get_num_trim_flushes_warnings() const {
+ return num_trim_flushes_warnings;
+ }
+ void inc_num_trim_flushes_warnings() { ++num_trim_flushes_warnings; }
+ void reset_num_trim_flushes_warnings() { num_trim_flushes_warnings = 0; }
+
+ unsigned get_num_completed_requests() const { return info.completed_requests.size(); }
+ unsigned get_num_trim_requests_warnings() const {
+ return num_trim_requests_warnings;
+ }
+ void inc_num_trim_requests_warnings() { ++num_trim_requests_warnings; }
+ void reset_num_trim_requests_warnings() { num_trim_requests_warnings = 0; }
+
+ bool has_dirty_completed_requests() const
+ {
+ return completed_requests_dirty;
+ }
+
+ void clear_dirty_completed_requests()
+ {
+ completed_requests_dirty = false;
+ }
+
+ int check_access(CInode *in, unsigned mask, int caller_uid, int caller_gid,
+ const vector<uint64_t> *gid_list, int new_uid, int new_gid);
+
+ Session() = delete;
+ // Build a session for connection `con` (which may be a null reference).
+ // The decay counters take their rates from the current configuration,
+ // except the second-order recall throttle which uses a fixed rate of 0.5.
+ // birth_time is stamped here, at construction.  The connection is
+ // installed via set_connection(), which also fills in the peer identity
+ // fields of `info` when the connection is non-null.
+ Session(ConnectionRef con) :
+ recall_caps(g_conf().get_val<double>("mds_recall_warning_decay_rate")),
+ release_caps(g_conf().get_val<double>("mds_recall_warning_decay_rate")),
+ recall_caps_throttle(g_conf().get_val<double>("mds_recall_max_decay_rate")),
+ recall_caps_throttle2o(0.5),
+ session_cache_liveness(g_conf().get_val<double>("mds_session_cache_liveness_decay_rate")),
+ cap_acquisition(g_conf().get_val<double>("mds_session_cap_acquisition_decay_rate")),
+ birth_time(clock::now()),
+ auth_caps(g_ceph_context),
+ item_session_list(this),
+ requests(member_offset(MDRequestImpl, item_session_request))
+ {
+ set_connection(std::move(con));
+ }
+ // A session destroyed in STATE_CLOSED unlinks itself from whatever
+ // session list it is on; in any other state it must already be off every
+ // list (asserted).  Any messages still queued for the client are dropped.
+ ~Session() override {
+ if (state == STATE_CLOSED) {
+ item_session_list.remove_myself();
+ } else {
+ ceph_assert(!item_session_list.is_on_list());
+ }
+ preopen_out_queue.clear();
+ }
+
+ // Adopt `con` as this session's connection.  When it is non-null, the
+ // auth name, socket address and entity name in `info` are refreshed from
+ // the peer; a null connection leaves `info` untouched.
+ void set_connection(ConnectionRef con) {
+   connection = std::move(con);
+   if (!connection) {
+     return;
+   }
+   info.auth_name = connection->get_peer_entity_name();
+   info.inst.addr = connection->get_peer_socket_addr();
+   info.inst.name = entity_name_t(connection->get_peer_type(),
+                                  connection->get_peer_global_id());
+ }
+ const ConnectionRef& get_connection() const {
+ return connection;
+ }
+
+ // Reset per-session bookkeeping: drop pending preallocated inode numbers,
+ // clear the metadata held in `info` (via clear_meta()), and zero the cap
+ // push sequence and last cap renew time.
+ void clear() {
+ pending_prealloc_inos.clear();
+ info.clear_meta();
+
+ cap_push_seq = 0;
+ last_cap_renew = clock::zero();
+ }
+};
+
+/**
+ * A set of criteria for selecting sessions.  Criteria left at their
+ * defaults (empty map/strings, id 0, reconnecting unset) match everything.
+ */
+class SessionFilter
+{
+protected:
+  // First is whether to filter, second is filter value
+  std::pair<bool, bool> reconnecting{false, false};
+
+public:
+  // Client metadata fields that must be present with exactly these values.
+  std::map<std::string, std::string> metadata;
+  // Required client entity ID; empty means "any".
+  std::string auth_name;
+  // Required session state name; empty means "any".
+  std::string state;
+  // Required numeric client ID; 0 means "any".
+  int64_t id = 0;
+
+  SessionFilter() = default;
+
+  // True if `session` satisfies every active criterion.
+  bool match(
+      const Session &session,
+      std::function<bool(client_t)> is_reconnecting) const;
+  // Fill the filter from "key=value" strings; 0 on success, -EINVAL (with
+  // a message in *ss) on malformed input.
+  int parse(const std::vector<std::string> &args, std::stringstream *ss);
+  // Enable the reconnecting criterion and set the value it must equal.
+  void set_reconnecting(bool v)
+  {
+    reconnecting = std::make_pair(true, v);
+  }
+};
+
+/*
+ * session map
+ */
+
+class MDSRank;
+
+/**
+ * Encapsulate the serialized state associated with SessionMap. Allows
+ * encode/decode outside of live MDS instance.
+ */
+class SessionMapStore {
+public:
+  using clock = Session::clock;
+  using time = Session::time;
+
+protected:
+  version_t version = 0;
+  ceph::unordered_map<entity_name_t, Session*> session_map;
+  PerfCounters *logger = nullptr;
+
+  // total request load avg
+  double decay_rate;
+  DecayCounter total_load_avg;
+
+public:
+  mds_rank_t rank = MDS_RANK_NONE;
+
+  version_t get_version() const {return version;}
+
+  virtual void encode_header(bufferlist *header_bl);
+  virtual void decode_header(bufferlist &header_bl);
+  virtual void decode_values(std::map<std::string, bufferlist> &session_vals);
+  virtual void decode_legacy(bufferlist::const_iterator& blp);
+  void dump(Formatter *f) const;
+
+  void set_rank(mds_rank_t r)
+  {
+    rank = r;
+  }
+
+  // Return the session registered under i.name, creating one if needed.
+  // A new session has no connection, takes `i` as its instance, has its
+  // cap renewal time stamped with the current time, and is counted in the
+  // perf counters when a logger is attached.
+  Session* get_or_add_session(const entity_inst_t& i) {
+    const auto existing = session_map.find(i.name);
+    if (existing != session_map.end()) {
+      return existing->second;
+    }
+
+    Session *s = new Session(ConnectionRef());
+    session_map[i.name] = s;
+    s->info.inst = i;
+    s->last_cap_renew = Session::clock::now();
+    if (logger) {
+      logger->set(l_mdssm_session_count, session_map.size());
+      logger->inc(l_mdssm_session_add);
+    }
+    return s;
+  }
+
+  static void generate_test_instances(list<SessionMapStore*>& ls);
+
+  // Empty the session map (the map entries only; nothing else is reset).
+  void reset_state()
+  {
+    session_map.clear();
+  }
+
+  // decay_rate is read from the configuration and seeds total_load_avg.
+  SessionMapStore()
+    : decay_rate(g_conf().get_val<double>("mds_request_load_average_decay_rate")),
+      total_load_avg(decay_rate) {
+  }
+  virtual ~SessionMapStore() {}
+};
+
+class SessionMap : public SessionMapStore {
+public:
+ MDSRank *mds;
+
+protected:
+ version_t projected = 0, committing = 0, committed = 0;
+public:
+ map<int,xlist<Session*>* > by_state;
+ uint64_t set_state(Session *session, int state);
+ map<version_t, MDSContext::vec > commit_waiters;
+ void update_average_session_age();
+
+ SessionMap() = delete;
+ explicit SessionMap(MDSRank *m) : mds(m) {}
+
+ // Free the per-state session lists, then unregister and free the perf
+ // counters if they were created.
+ ~SessionMap() override
+ {
+   for (auto& entry : by_state) {
+     delete entry.second;
+   }
+
+   if (logger) {
+     g_ceph_context->get_perfcounters_collection()->remove(logger);
+     delete logger;
+   }
+ }
+
+ void register_perfcounters();
+
+ void set_version(const version_t v)
+ {
+ version = projected = v;
+ }
+
+ void set_projected(const version_t v)
+ {
+ projected = v;
+ }
+
+ version_t get_projected() const
+ {
+ return projected;
+ }
+
+ version_t get_committed() const
+ {
+ return committed;
+ }
+
+ version_t get_committing() const
+ {
+ return committing;
+ }
+
+ // sessions
+ void decode_legacy(bufferlist::const_iterator& blp) override;
+ bool empty() const { return session_map.empty(); }
+ const auto& get_sessions() const {
+ return session_map;
+ }
+
+ // True if at least one session is currently filed under `state`.
+ bool is_any_state(int state) const {
+   const auto it = by_state.find(state);
+   return it != by_state.end() && !it->second->empty();
+ }
+
+ // True if any session is in a state other than STATE_CLOSED.
+ bool have_unclosed_sessions() const {
+   const int unclosed_states[] = {
+     Session::STATE_OPENING,
+     Session::STATE_OPEN,
+     Session::STATE_CLOSING,
+     Session::STATE_STALE,
+     Session::STATE_KILLING,
+   };
+   for (const int st : unclosed_states) {
+     if (is_any_state(st)) {
+       return true;
+     }
+   }
+   return false;
+ }
+ bool have_session(entity_name_t w) const {
+ return session_map.count(w);
+ }
+ // Look up the session for entity `w`; nullptr if there is none.
+ Session* get_session(entity_name_t w) {
+   const auto it = session_map.find(w);
+   if (it == session_map.end()) {
+     return nullptr;
+   }
+   return it->second;
+ }
+ // Const lookup of the session for entity `w`; null if there is none.
+ const Session* get_session(entity_name_t w) const {
+   const auto it = session_map.find(w);
+   return it == session_map.end() ? NULL : it->second;
+ }
+
+ void add_session(Session *s);
+ void remove_session(Session *s);
+ void touch_session(Session *session);
+
+ // Session at the front of the list for `state`, or null when no session
+ // is in that state.
+ Session *get_oldest_session(int state) {
+   const auto it = by_state.find(state);
+   const bool has_any = it != by_state.end() && !it->second->empty();
+   return has_any ? it->second->front() : nullptr;
+ }
+
+ void dump();
+
+ template<typename F>
+ void get_client_sessions(F&& f) const {
+ for (const auto& p : session_map) {
+ auto& session = p.second;
+ if (session->info.inst.name.is_client())
+ f(session);
+ }
+ }
+ template<typename C>
+ void get_client_session_set(C& c) const {
+ auto f = [&c](auto& s) {
+ c.insert(s);
+ };
+ get_client_sessions(f);
+ }
+
+ // helpers
+ // Instance (name + address) recorded for entity `w`.  A session for `w`
+ // must exist (asserted).
+ entity_inst_t& get_inst(entity_name_t w) {
+   const auto it = session_map.find(w);
+   ceph_assert(it != session_map.end());
+   return it->second->info.inst;
+ }
+ // Current cap push sequence number of the session belonging to `client`.
+ // NOTE(review): the result of get_session() is dereferenced unchecked, so
+ // this presumably requires that a session for `client` exists -- confirm
+ // against callers.
+ version_t get_push_seq(client_t client) {
+ return get_session(entity_name_t::CLIENT(client.v))->get_push_seq();
+ }
+ bool have_completed_request(metareqid_t rid) {
+ Session *session = get_session(rid.name);
+ return session && session->have_completed_request(rid.tid, NULL);
+ }
+ void trim_completed_requests(entity_name_t c, ceph_tid_t tid) {
+ Session *session = get_session(c);
+ ceph_assert(session);
+ session->trim_completed_requests(tid);
+ }
+
+ void wipe();
+ void wipe_ino_prealloc();
+
+ // -- loading, saving --
+ inodeno_t ino;
+ MDSContext::vec waiting_for_load;
+
+ object_t get_object_name() const;
+
+ void load(MDSContext *onload);
+ void _load_finish(
+ int operation_r,
+ int header_r,
+ int values_r,
+ bool first,
+ bufferlist &header_bl,
+ std::map<std::string, bufferlist> &session_vals,
+ bool more_session_vals);
+
+ void load_legacy();
+ void _load_legacy_finish(int r, bufferlist &bl);
+
+ void save(MDSContext *onsave, version_t needv=0);
+ void _save_finish(version_t v);
+
+protected:
+ std::set<entity_name_t> dirty_sessions;
+ std::set<entity_name_t> null_sessions;
+ bool loaded_legacy = false;
+ void _mark_dirty(Session *session, bool may_save);
+public:
+
+ /**
+ * Advance the version, and mark this session
+ * as dirty within the new version.
+ *
+ * Dirty means journalled but needing writeback
+ * to the backing store. Must have called
+ * mark_projected previously for this session.
+ */
+ void mark_dirty(Session *session, bool may_save=true);
+
+ /**
+ * Advance the projected version, and mark this
+ * session as projected within the new version
+ *
+ * Projected means the session is updated in memory
+ * but we're waiting for the journal write of the update
+ * to finish. Must subsequently call mark_dirty
+ * for sessions in the same global order as calls
+ * to mark_projected.
+ */
+ version_t mark_projected(Session *session);
+
+ /**
+ * During replay, advance versions to account
+ * for a session modification, and mark the
+ * session dirty.
+ */
+ void replay_dirty_session(Session *session);
+
+ /**
+ * During replay, if a session no longer present
+ * would have consumed a version, advance `version`
+ * and `projected` to account for that.
+ */
+ void replay_advance_version();
+
+ /**
+ * During replay, open sessions, advance versions and
+ * mark these sessions as dirty.
+ */
+ void replay_open_sessions(version_t event_cmapv,
+ map<client_t,entity_inst_t>& client_map,
+ map<client_t,client_metadata_t>& client_metadata_map);
+
+ /**
+ * For these session IDs, if a session exists with this ID, and it has
+ * dirty completed_requests, then persist it immediately
+ * (ahead of usual project/dirty versioned writes
+ * of the map).
+ */
+ void save_if_dirty(const std::set<entity_name_t> &tgt_sessions,
+ MDSGatherBuilder *gather_bld);
+
+private:
+ time avg_birth_time = clock::zero();
+
+ // Number of sessions currently filed under `state` (0 if none).
+ uint64_t get_session_count_in_state(int state) {
+   const auto it = by_state.find(state);
+   if (it == by_state.end() || it->second->empty()) {
+     return 0;
+   }
+   return it->second->size();
+ }
+
+ // Incrementally maintain avg_birth_time, the mean birth time of the
+ // sessions in session_map, as session `s` is added (added=true) or
+ // removed (added=false).  `sessions` is session_map.size() at call time.
+ // NOTE(review): the formulas imply the map already contains `s` when
+ // adding and still contains it when removing -- confirm against
+ // add_session()/remove_session().  A call with an empty map would divide
+ // by zero in the `added` branch.
+ void update_average_birth_time(const Session &s, bool added=true) {
+ uint32_t sessions = session_map.size();
+ time birth_time = s.get_birth_time();
+
+ // Only one session: the average is that session's birth time, or reset
+ // to zero when it is the one being removed.
+ if (sessions == 1) {
+ avg_birth_time = added ? birth_time : clock::zero();
+ return;
+ }
+
+ if (added) {
+ // new_avg = old_avg * (n-1)/n + birth/n, dividing before multiplying
+ avg_birth_time = clock::time_point(
+ ((avg_birth_time - clock::zero()) / sessions) * (sessions - 1) +
+ (birth_time - clock::zero()) / sessions);
+ } else {
+ // new_avg = old_avg * n/(n-1) - birth/(n-1)
+ avg_birth_time = clock::time_point(
+ ((avg_birth_time - clock::zero()) / (sessions - 1)) * sessions -
+ (birth_time - clock::zero()) / (sessions - 1));
+ }
+ }
+
+public:
+ void hit_session(Session *session);
+ void handle_conf_change(const std::set <std::string> &changed);
+};
+
+std::ostream& operator<<(std::ostream &out, const Session &s);
+
+
+#endif
diff --git a/src/mds/SimpleLock.cc b/src/mds/SimpleLock.cc
new file mode 100644
index 00000000..c4c0ae0d
--- /dev/null
+++ b/src/mds/SimpleLock.cc
@@ -0,0 +1,43 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2015 Red Hat
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+
+#include "SimpleLock.h"
+#include "Mutation.h"
+
+// Dump this lock's state to `f` (must not be null).  Nothing is emitted
+// when is_sync_and_unlocked() reports true.
+void SimpleLock::dump(Formatter *f) const {
+  ceph_assert(f != NULL);
+  if (is_sync_and_unlocked()) {
+    return;
+  }
+
+  // The gather_set array section is always emitted; it is empty when there
+  // is no extended ("more") state.
+  f->open_array_section("gather_set");
+  if (have_more()) {
+    for (const auto &rank : more()->gather_set) {
+      f->dump_int("rank", rank);
+    }
+  }
+  f->close_section();
+
+  f->dump_string("state", get_state_name(get_state()));
+  f->dump_bool("is_leased", is_leased());
+  f->dump_int("num_rdlocks", get_num_rdlocks());
+  f->dump_int("num_wrlocks", get_num_wrlocks());
+  f->dump_int("num_xlocks", get_num_xlocks());
+
+  // The xlock_by object section is likewise always emitted, and is empty
+  // when nothing holds the xlock.
+  f->open_object_section("xlock_by");
+  if (get_xlock_by()) {
+    get_xlock_by()->dump(f);
+  }
+  f->close_section();
+}
diff --git a/src/mds/SimpleLock.h b/src/mds/SimpleLock.h
new file mode 100644
index 00000000..2d719b27
--- /dev/null
+++ b/src/mds/SimpleLock.h
@@ -0,0 +1,720 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+
+#ifndef CEPH_SIMPLELOCK_H
+#define CEPH_SIMPLELOCK_H
+
+#include <memory>
+
+#include <boost/intrusive_ptr.hpp>
+
+#include "MDSCacheObject.h"
+#include "MDSContext.h"
+
+// -- lock types --
+// see CEPH_LOCK_*
+
+
+struct MutationImpl;
+typedef boost::intrusive_ptr<MutationImpl> MutationRef;
+
+extern "C" {
+#include "locks.h"
+}
+
+
+#define CAP_ANY 0
+#define CAP_LONER 1
+#define CAP_XLOCKER 2
+
+// Associates a CEPH_LOCK_* type id with the state machine table used for it.
+struct LockType {
+  int type;
+  const sm_t *sm;
+
+  // Picks the state machine from the lock type; a type that sm_for() does
+  // not list leaves sm null.
+  explicit LockType(int t) : type(t), sm(sm_for(t)) {}
+
+private:
+  // Lookup of the state machine table for lock type t.
+  static const sm_t *sm_for(int t) {
+    switch (t) {
+    case CEPH_LOCK_DN:
+    case CEPH_LOCK_IAUTH:
+    case CEPH_LOCK_ILINK:
+    case CEPH_LOCK_IXATTR:
+    case CEPH_LOCK_ISNAP:
+    case CEPH_LOCK_IFLOCK:
+    case CEPH_LOCK_IPOLICY:
+      return &sm_simplelock;
+    case CEPH_LOCK_IDFT:
+    case CEPH_LOCK_INEST:
+      return &sm_scatterlock;
+    case CEPH_LOCK_IFILE:
+      return &sm_filelock;
+    case CEPH_LOCK_DVERSION:
+    case CEPH_LOCK_IVERSION:
+      return &sm_locallock;
+    default:
+      return nullptr;
+    }
+  }
+};
+
+
+class SimpleLock {
+public:
+ LockType *type;
+
+ // Map a LOCK_* state constant to its printable name.
+ // An unrecognized value does not return a name: ceph_abort() is called.
+ // NOTE(review): names of the form "a->b" presumably label the intermediate
+ // state while moving from a to b -- confirm against locks.h.
+ static std::string_view get_state_name(int n) {
+ switch (n) {
+ case LOCK_UNDEF: return "UNDEF";
+ case LOCK_SYNC: return "sync";
+ case LOCK_LOCK: return "lock";
+
+ case LOCK_PREXLOCK: return "prexlock";
+ case LOCK_XLOCK: return "xlock";
+ case LOCK_XLOCKDONE: return "xlockdone";
+ case LOCK_XLOCKSNAP: return "xlocksnap";
+ case LOCK_LOCK_XLOCK: return "lock->xlock";
+
+ case LOCK_SYNC_LOCK: return "sync->lock";
+ case LOCK_LOCK_SYNC: return "lock->sync";
+ case LOCK_REMOTEXLOCK: return "remote_xlock";
+ case LOCK_EXCL: return "excl";
+ case LOCK_EXCL_SYNC: return "excl->sync";
+ case LOCK_EXCL_LOCK: return "excl->lock";
+ case LOCK_SYNC_EXCL: return "sync->excl";
+ case LOCK_LOCK_EXCL: return "lock->excl";
+
+ case LOCK_XSYN: return "xsyn";
+ case LOCK_XSYN_EXCL: return "xsyn->excl";
+ case LOCK_EXCL_XSYN: return "excl->xsyn";
+ case LOCK_XSYN_SYNC: return "xsyn->sync";
+ case LOCK_XSYN_LOCK: return "xsyn->lock";
+ case LOCK_XSYN_MIX: return "xsyn->mix";
+
+ case LOCK_SYNC_MIX: return "sync->mix";
+ case LOCK_SYNC_MIX2: return "sync->mix(2)";
+ case LOCK_LOCK_TSYN: return "lock->tsyn";
+
+ case LOCK_MIX_LOCK: return "mix->lock";
+ case LOCK_MIX_LOCK2: return "mix->lock(2)";
+ case LOCK_MIX: return "mix";
+ case LOCK_MIX_TSYN: return "mix->tsyn";
+
+ case LOCK_TSYN_MIX: return "tsyn->mix";
+ case LOCK_TSYN_LOCK: return "tsyn->lock";
+ case LOCK_TSYN: return "tsyn";
+
+ case LOCK_MIX_SYNC: return "mix->sync";
+ case LOCK_MIX_SYNC2: return "mix->sync(2)";
+ case LOCK_EXCL_MIX: return "excl->mix";
+ case LOCK_MIX_EXCL: return "mix->excl";
+
+ case LOCK_PRE_SCAN: return "*->scan";
+ case LOCK_SCAN: return "scan";
+
+ case LOCK_SNAP_SYNC: return "snap->sync";
+
+ // The return after ceph_abort() only gives this path a return value.
+ default: ceph_abort(); return std::string_view();
+ }
+ }
+
+ // Map a CEPH_LOCK_* type constant to its short printable name.
+ // An unrecognized value calls ceph_abort() instead of returning a name.
+ static std::string_view get_lock_type_name(int t) {
+ switch (t) {
+ case CEPH_LOCK_DN: return "dn";
+ case CEPH_LOCK_DVERSION: return "dversion";
+ case CEPH_LOCK_IVERSION: return "iversion";
+ case CEPH_LOCK_IFILE: return "ifile";
+ case CEPH_LOCK_IAUTH: return "iauth";
+ case CEPH_LOCK_ILINK: return "ilink";
+ case CEPH_LOCK_IDFT: return "idft";
+ case CEPH_LOCK_INEST: return "inest";
+ case CEPH_LOCK_IXATTR: return "ixattr";
+ case CEPH_LOCK_ISNAP: return "isnap";
+ case CEPH_LOCK_INO: return "ino";
+ case CEPH_LOCK_IFLOCK: return "iflock";
+ case CEPH_LOCK_IPOLICY: return "ipolicy";
+ // The return after ceph_abort() only gives this path a return value.
+ default: ceph_abort(); return std::string_view();
+ }
+ }
+
+ // Map a LOCK_AC_* action constant to its printable name.
+ // Unlike the state/type name lookups in this class, an unrecognized value
+ // does not abort: it yields the placeholder "???".
+ static std::string_view get_lock_action_name(int a) {
+ switch (a) {
+ case LOCK_AC_SYNC: return "sync";
+ case LOCK_AC_MIX: return "mix";
+ case LOCK_AC_LOCK: return "lock";
+ case LOCK_AC_LOCKFLUSHED: return "lockflushed";
+
+ case LOCK_AC_SYNCACK: return "syncack";
+ case LOCK_AC_MIXACK: return "mixack";
+ case LOCK_AC_LOCKACK: return "lockack";
+
+ case LOCK_AC_REQSCATTER: return "reqscatter";
+ case LOCK_AC_REQUNSCATTER: return "requnscatter";
+ case LOCK_AC_NUDGE: return "nudge";
+ case LOCK_AC_REQRDLOCK: return "reqrdlock";
+ default: return "???";
+ }
+ }
+
+ // waiting
+ static const uint64_t WAIT_RD = (1<<0); // to read
+ static const uint64_t WAIT_WR = (1<<1); // to write
+ static const uint64_t WAIT_XLOCK = (1<<2); // to xlock (** dup)
+ static const uint64_t WAIT_STABLE = (1<<2); // for a stable state
+ static const uint64_t WAIT_REMOTEXLOCK = (1<<3); // for a remote xlock
+ static const int WAIT_BITS = 4;
+ static const uint64_t WAIT_ALL = ((1<<WAIT_BITS)-1);
+
+
+protected:
+ // parent (what i lock)
+ MDSCacheObject *parent;
+
+ // lock state
+ __s16 state;
+ __s16 state_flags;
+
+ enum {
+ LEASED = 1 << 0,
+ NEED_RECOVER = 1 << 1,
+ };
+
+private:
+ int num_rdlock;
+
+ // XXX not in mempool
+ // Extended lock state kept behind a pointer: more() allocates it on first
+ // use and try_clear_more() frees it again once empty() is true.
+ struct unstable_bits_t {
+   set<__s32> gather_set; // auth+rep. >= 0 is mds, < 0 is client
+
+   // local state
+   int num_wrlock = 0, num_xlock = 0;
+   MutationRef xlock_by;
+   client_t xlock_by_client = -1;
+   client_t excl_client = -1;
+
+   // True when every field still holds its default value. Declared const
+   // because it only inspects the fields, so it can also be called through
+   // a const pointer or reference.
+   bool empty() const {
+     return
+       gather_set.empty() &&
+       num_wrlock == 0 &&
+       num_xlock == 0 &&
+       xlock_by.get() == NULL &&
+       xlock_by_client == -1 &&
+       excl_client == -1;
+   }
+
+   unstable_bits_t() {}
+ };
+
+ mutable std::unique_ptr<unstable_bits_t> _unstable;
+
+ bool have_more() const { return _unstable ? true : false; }
+ // Return the extended state, allocating it on first use. This works from
+ // const methods because _unstable is declared mutable. Callers that must
+ // not trigger the allocation check have_more() first.
+ unstable_bits_t *more() const {
+   if (!_unstable)
+     _unstable = std::make_unique<unstable_bits_t>();
+   return _unstable.get();
+ }
+ void try_clear_more() {
+ if (_unstable && _unstable->empty()) {
+ _unstable.reset();
+ }
+ }
+
+public:
+
+ client_t get_excl_client() const {
+ return have_more() ? more()->excl_client : -1;
+ }
+ // Store `c` as the exclusive client. A negative value with no extended
+ // state allocated is a no-op: get_excl_client() already reports -1 then,
+ // so nothing needs to be allocated just to record it.
+ void set_excl_client(client_t c) {
+   const bool clearing = c < 0;
+   if (clearing && !have_more())
+     return;
+   unstable_bits_t *bits = more();
+   bits->excl_client = c;
+ }
+
+ SimpleLock(MDSCacheObject *o, LockType *lt) :
+ type(lt),
+ parent(o),
+ state(LOCK_SYNC),
+ state_flags(0),
+ num_rdlock(0)
+ {}
+ virtual ~SimpleLock() {}
+
+ virtual bool is_scatterlock() const {
+ return false;
+ }
+ virtual bool is_locallock() const {
+ return false;
+ }
+
+ // parent
+ MDSCacheObject *get_parent() { return parent; }
+ int get_type() const { return type->type; }
+ const sm_t* get_sm() const { return type->sm; }
+
+ // Bit offset applied to this lock's WAIT_* masks before they are handed to
+ // the parent object: 8 plus one WAIT_BITS-wide slot per lock type.
+ // Lock types without a slot abort.
+ int get_wait_shift() const {
+   int slot = 0;
+   switch (get_type()) {
+   case CEPH_LOCK_DN:       slot = 0;  break;
+   case CEPH_LOCK_DVERSION: slot = 1;  break;
+   case CEPH_LOCK_IAUTH:    slot = 2;  break;
+   case CEPH_LOCK_ILINK:    slot = 3;  break;
+   case CEPH_LOCK_IDFT:     slot = 4;  break;
+   case CEPH_LOCK_IFILE:    slot = 5;  break;
+   case CEPH_LOCK_IVERSION: slot = 6;  break;
+   case CEPH_LOCK_IXATTR:   slot = 7;  break;
+   case CEPH_LOCK_ISNAP:    slot = 8;  break;
+   case CEPH_LOCK_INEST:    slot = 9;  break;
+   case CEPH_LOCK_IFLOCK:   slot = 10; break;
+   case CEPH_LOCK_IPOLICY:  slot = 11; break;
+   default:
+     ceph_abort();
+   }
+   return 8 + slot * SimpleLock::WAIT_BITS;
+ }
+
+ int get_cap_shift() const {
+ switch (get_type()) {
+ case CEPH_LOCK_IAUTH: return CEPH_CAP_SAUTH;
+ case CEPH_LOCK_ILINK: return CEPH_CAP_SLINK;
+ case CEPH_LOCK_IFILE: return CEPH_CAP_SFILE;
+ case CEPH_LOCK_IXATTR: return CEPH_CAP_SXATTR;
+ default: return 0;
+ }
+ }
+ // Mask with the low N bits set, where N is CEPH_CAP_FILE_BITS for the
+ // ifile lock and CEPH_CAP_SIMPLE_BITS for every other lock type.
+ int get_cap_mask() const {
+   const int width = (get_type() == CEPH_LOCK_IFILE) ? CEPH_CAP_FILE_BITS
+                                                     : CEPH_CAP_SIMPLE_BITS;
+   return (1 << width) - 1;
+ }
+
+ // Less-than comparator over lock pointers. Ordering keys, in sequence:
+ //   1. locks whose type is not above CEPH_LOCK_DN come before the others
+ //      (dn < inode),
+ //   2. the parent object, via MDSCacheObject::is_lt(),
+ //   3. for the same parent, the numeric lock type.
+ struct ptr_lt {
+   bool operator()(const SimpleLock* l, const SimpleLock* r) const {
+     const bool l_above_dn = l->type->type > CEPH_LOCK_DN;
+     const bool r_above_dn = r->type->type > CEPH_LOCK_DN;
+     if (l_above_dn != r_above_dn)
+       return !l_above_dn;  // only the side not above DN sorts first
+
+     if (l->parent->is_lt(r->parent))
+       return true;
+     return l->parent == r->parent && l->type->type < r->type->type;
+   }
+ };
+
+ void decode_locked_state(const bufferlist& bl) {
+ parent->decode_lock_state(type->type, bl);
+ }
+ void encode_locked_state(bufferlist& bl) {
+ parent->encode_lock_state(type->type, bl);
+ }
+ void finish_waiters(uint64_t mask, int r=0) {
+ parent->finish_waiting(mask << get_wait_shift(), r);
+ }
+ void take_waiting(uint64_t mask, MDSContext::vec& ls) {
+ parent->take_waiting(mask << get_wait_shift(), ls);
+ }
+ void add_waiter(uint64_t mask, MDSContext *c) {
+ parent->add_waiter((mask << get_wait_shift()) | MDSCacheObject::WAIT_ORDERED, c);
+ }
+ bool is_waiter_for(uint64_t mask) const {
+ return parent->is_waiter_for(mask << get_wait_shift());
+ }
+
+
+
+ // state
+ int get_state() const { return state; }
+ int set_state(int s) {
+ state = s;
+ //assert(!is_stable() || gather_set.size() == 0); // gather should be empty in stable states.
+ return s;
+ }
+ // Adopt state `s` on a replica during rejoin (asserts the parent is not
+ // auth). If the resulting state is stable, all waiters on this lock are
+ // moved into `waiters`.
+ void set_state_rejoin(int s, MDSContext::vec& waiters, bool survivor) {
+   ceph_assert(!get_parent()->is_auth());
+
+   // Only a survivor whose replica lock was still in SYNC can skip recovery.
+   // If the lock was in any other state when the object's auth mds failed,
+   // the auth mds may take an xlock and change the object while replaying
+   // unsafe requests.
+   const bool untouched = survivor && state == LOCK_SYNC;
+   if (!untouched)
+     mark_need_recover();
+
+   state = s;
+
+   if (!is_stable())
+     return;
+   take_waiting(SimpleLock::WAIT_ALL, waiters);
+ }
+
+ bool is_stable() const {
+ return get_sm()->states[state].next == 0;
+ }
+ bool is_unstable_and_locked() const {
+ if (is_stable())
+ return false;
+ return is_rdlocked() || is_wrlocked() || is_xlocked();
+ }
+ int get_next_state() {
+ return get_sm()->states[state].next;
+ }
+
+
+ // True only when the state is LOCK_SYNC and there is no rdlock, client
+ // lease, wrlock or xlock on this lock.
+ bool is_sync_and_unlocked() const {
+   if (get_state() != LOCK_SYNC)
+     return false;
+   const bool held = is_rdlocked() || is_leased() || is_wrlocked() || is_xlocked();
+   return !held;
+ }
+
+
+ /*
+ bool fw_rdlock_to_auth() {
+ return get_sm()->states[state].can_rdlock == FW;
+ }
+ */
+ bool req_rdlock_from_auth() {
+ return get_sm()->states[state].can_rdlock == REQ;
+ }
+
+ // gather set
+ static set<int32_t> empty_gather_set;
+
+ // int32_t: <0 is client, >=0 is MDS rank
+ const set<int32_t>& get_gather_set() const {
+ return have_more() ? more()->gather_set : empty_gather_set;
+ }
+
+ // Seed the gather set with the key of every entry in the parent's replica
+ // map. Entries are bound by const reference so no per-iteration copy of
+ // the map element is made. more() is deliberately called inside the loop:
+ // with no replicas, the extended state is never allocated.
+ void init_gather() {
+   for (const auto& p : parent->get_replicas()) {
+     more()->gather_set.insert(p.first);
+   }
+ }
+ bool is_gathering() const {
+ return have_more() && !more()->gather_set.empty();
+ }
+ bool is_gathering(int32_t i) const {
+ return have_more() && more()->gather_set.count(i);
+ }
+ void clear_gather() {
+ if (have_more())
+ more()->gather_set.clear();
+ }
+ void remove_gather(int32_t i) {
+ if (have_more())
+ more()->gather_set.erase(i);
+ }
+
+
+
+ virtual bool is_dirty() const { return false; }
+ virtual bool is_stale() const { return false; }
+ virtual bool is_flushing() const { return false; }
+ virtual bool is_flushed() const { return false; }
+ virtual void clear_flushed() { }
+
+ // can_*
+ // Evaluate the current state's can_lease rule for `client`:
+ // ANY always passes, AUTH passes when the parent is auth, XCL passes for
+ // a non-negative client that is the recorded xlock client.
+ bool can_lease(client_t client) const {
+   const auto rule = get_sm()->states[state].can_lease;
+   if (rule == ANY)
+     return true;
+   if (rule == AUTH && parent->is_auth())
+     return true;
+   return rule == XCL && client >= 0 && get_xlock_by_client() == client;
+ }
+ // Evaluate the current state's can_read rule for `client`:
+ // ANY always passes, AUTH passes when the parent is auth, XCL passes for
+ // a non-negative client that is the recorded xlock client.
+ bool can_read(client_t client) const {
+   const auto rule = get_sm()->states[state].can_read;
+   if (rule == ANY)
+     return true;
+   if (rule == AUTH && parent->is_auth())
+     return true;
+   return rule == XCL && client >= 0 && get_xlock_by_client() == client;
+ }
+ // Evaluate the current state's can_read_projected rule for `client`:
+ // ANY always passes, AUTH passes when the parent is auth, XCL passes for
+ // a non-negative client that is the recorded xlock client.
+ bool can_read_projected(client_t client) const {
+   const auto rule = get_sm()->states[state].can_read_projected;
+   if (rule == ANY)
+     return true;
+   if (rule == AUTH && parent->is_auth())
+     return true;
+   return rule == XCL && client >= 0 && get_xlock_by_client() == client;
+ }
+ // Evaluate the current state's can_rdlock rule for `client`:
+ // ANY always passes, AUTH passes when the parent is auth, XCL passes for
+ // a non-negative client that is the recorded xlock client.
+ bool can_rdlock(client_t client) const {
+   const auto rule = get_sm()->states[state].can_rdlock;
+   if (rule == ANY)
+     return true;
+   if (rule == AUTH && parent->is_auth())
+     return true;
+   return rule == XCL && client >= 0 && get_xlock_by_client() == client;
+ }
+ // Evaluate the current state's can_wrlock rule for `client`:
+ // ANY always passes, AUTH passes when the parent is auth, XCL passes for
+ // a non-negative client that is either the recorded xlock client or the
+ // recorded exclusive client.
+ bool can_wrlock(client_t client) const {
+   const auto rule = get_sm()->states[state].can_wrlock;
+   if (rule == ANY)
+     return true;
+   if (rule == AUTH && parent->is_auth())
+     return true;
+   if (rule != XCL || !(client >= 0))
+     return false;
+   return get_xlock_by_client() == client || get_excl_client() == client;
+ }
+ // Evaluate the current state's can_force_wrlock rule for `client`:
+ // ANY always passes, AUTH passes when the parent is auth, XCL passes for
+ // a non-negative client that is either the recorded xlock client or the
+ // recorded exclusive client.
+ bool can_force_wrlock(client_t client) const {
+   const auto rule = get_sm()->states[state].can_force_wrlock;
+   if (rule == ANY)
+     return true;
+   if (rule == AUTH && parent->is_auth())
+     return true;
+   if (rule != XCL || !(client >= 0))
+     return false;
+   return get_xlock_by_client() == client || get_excl_client() == client;
+ }
+ // Evaluate the current state's can_xlock rule for `client`:
+ // ANY always passes, AUTH passes when the parent is auth, XCL passes for
+ // a non-negative client that is the recorded xlock client.
+ bool can_xlock(client_t client) const {
+   const auto rule = get_sm()->states[state].can_xlock;
+   if (rule == ANY)
+     return true;
+   if (rule == AUTH && parent->is_auth())
+     return true;
+   return rule == XCL && client >= 0 && get_xlock_by_client() == client;
+ }
+
+ // rdlock
+ bool is_rdlocked() const { return num_rdlock > 0; }
+ int get_rdlock() {
+ if (!num_rdlock)
+ parent->get(MDSCacheObject::PIN_LOCK);
+ return ++num_rdlock;
+ }
+ // Drop one rdlock and return how many remain. When the count reaches zero
+ // the PIN_LOCK reference that get_rdlock() took on the parent is released.
+ int put_rdlock() {
+   ceph_assert(num_rdlock>0);
+   const int remaining = --num_rdlock;
+   if (remaining == 0)
+     parent->put(MDSCacheObject::PIN_LOCK);
+   return remaining;
+ }
+ int get_num_rdlocks() const {
+ return num_rdlock;
+ }
+
+ // wrlock
+ void get_wrlock(bool force=false) {
+ //assert(can_wrlock() || force);
+ if (more()->num_wrlock == 0)
+ parent->get(MDSCacheObject::PIN_LOCK);
+ ++more()->num_wrlock;
+ }
+ // Drop one wrlock. When the count reaches zero, release the parent's
+ // PIN_LOCK reference and free the extended state if nothing else uses it.
+ void put_wrlock() {
+   unstable_bits_t *bits = more();
+   if (--bits->num_wrlock != 0)
+     return;
+   parent->put(MDSCacheObject::PIN_LOCK);
+   try_clear_more();
+ }
+ bool is_wrlocked() const {
+ return have_more() && more()->num_wrlock > 0;
+ }
+ int get_num_wrlocks() const {
+ return have_more() ? more()->num_wrlock : 0;
+ }
+
+ // xlock
+ // Take an xlock on behalf of mutation `who` / client `client`.
+ // Asserts that no mutation is currently recorded as holder and that the
+ // state is XLOCK or LOCK, or that this is a locallock. Each call takes one
+ // PIN_LOCK reference on the parent.
+ void get_xlock(MutationRef who, client_t client) {
+   ceph_assert(get_xlock_by() == MutationRef());
+   ceph_assert(state == LOCK_XLOCK || is_locallock() ||
+	       state == LOCK_LOCK /* if we are a slave */);
+   parent->get(MDSCacheObject::PIN_LOCK);
+
+   unstable_bits_t *bits = more();
+   bits->num_xlock++;
+   bits->xlock_by = who;
+   bits->xlock_by_client = client;
+ }
+ void set_xlock_done() {
+ ceph_assert(more()->xlock_by);
+ ceph_assert(state == LOCK_XLOCK || is_locallock() ||
+ state == LOCK_LOCK /* if we are a slave */);
+ if (!is_locallock())
+ state = LOCK_XLOCKDONE;
+ more()->xlock_by.reset();
+ }
+ // Drop one xlock and the PIN_LOCK reference on the parent that came with
+ // it. When the last xlock goes away, the recorded holder and client are
+ // cleared and the extended state is freed if nothing else uses it.
+ void put_xlock() {
+   ceph_assert(state == LOCK_XLOCK || state == LOCK_XLOCKDONE ||
+	       state == LOCK_XLOCKSNAP || state == LOCK_LOCK_XLOCK ||
+	       state == LOCK_LOCK || /* if we are a master of a slave */
+	       is_locallock());
+   unstable_bits_t *bits = more();
+   --bits->num_xlock;
+   parent->put(MDSCacheObject::PIN_LOCK);
+   if (bits->num_xlock != 0)
+     return;
+
+   bits->xlock_by.reset();
+   bits->xlock_by_client = -1;
+   try_clear_more();
+ }
+ bool is_xlocked() const {
+ return have_more() && more()->num_xlock > 0;
+ }
+ int get_num_xlocks() const {
+ return have_more() ? more()->num_xlock : 0;
+ }
+ client_t get_xlock_by_client() const {
+ return have_more() ? more()->xlock_by_client : -1;
+ }
+ bool is_xlocked_by_client(client_t c) const {
+ return have_more() ? more()->xlock_by_client == c : false;
+ }
+ MutationRef get_xlock_by() const {
+ return have_more() ? more()->xlock_by : MutationRef();
+ }
+
+ // lease
+ bool is_leased() const {
+ return state_flags & LEASED;
+ }
+ void get_client_lease() {
+ ceph_assert(!is_leased());
+ state_flags |= LEASED;
+ }
+ void put_client_lease() {
+ ceph_assert(is_leased());
+ state_flags &= ~LEASED;
+ }
+
+ bool needs_recover() const {
+ return state_flags & NEED_RECOVER;
+ }
+ void mark_need_recover() {
+ state_flags |= NEED_RECOVER;
+ }
+ void clear_need_recover() {
+ state_flags &= ~NEED_RECOVER;
+ }
+
+ // encode/decode
+ // Serialize the lock: the state followed by the gather set, inside the
+ // envelope opened by ENCODE_START(2, 2, ...). get_gather_set() supplies the
+ // shared empty set when no extended state exists, so an (empty) set is
+ // always written.
+ void encode(bufferlist& bl) const {
+   ENCODE_START(2, 2, bl);
+   encode(state, bl);
+   encode(get_gather_set(), bl);
+   ENCODE_FINISH(bl);
+ }
+ void decode(bufferlist::const_iterator& p) {
+ DECODE_START(2, p);
+ decode(state, p);
+ set<__s32> g;
+ decode(g, p);
+ if (!g.empty())
+ more()->gather_set.swap(g);
+ DECODE_FINISH(p);
+ }
+ void encode_state_for_replica(bufferlist& bl) const {
+ __s16 s = get_replica_state();
+ using ceph::encode;
+ encode(s, bl);
+ }
+ void decode_state(bufferlist::const_iterator& p, bool is_new=true) {
+ using ceph::decode;
+ __s16 s;
+ decode(s, p);
+ if (is_new)
+ state = s;
+ }
+ void decode_state_rejoin(bufferlist::const_iterator& p, MDSContext::vec& waiters, bool survivor) {
+ __s16 s;
+ using ceph::decode;
+ decode(s, p);
+ set_state_rejoin(s, waiters, survivor);
+ }
+
+
+ // caps
+ bool is_loner_mode() const {
+ return get_sm()->states[state].loner;
+ }
+ int gcaps_allowed_ever() const {
+ return parent->is_auth() ? get_sm()->allowed_ever_auth : get_sm()->allowed_ever_replica;
+ }
+ // Caps permitted for the `who` class (CAP_ANY / CAP_LONER / CAP_XLOCKER)
+ // in state `s`; a negative `s` means the current state.
+ // Note that is_loner_mode() always consults the current state, even when
+ // an explicit `s` is supplied.
+ int gcaps_allowed(int who, int s=-1) const {
+   if (s < 0)
+     s = state;
+
+   // Replicas only ever get the replica caps of the state.
+   if (!parent->is_auth())
+     return get_sm()->states[s].replica_caps;
+
+   const auto &entry = get_sm()->states[s];
+   if (get_xlock_by_client() >= 0 && who == CAP_XLOCKER)
+     return entry.xlocker_caps | entry.caps;  // xlocker always gets more
+   if (is_loner_mode() && who == CAP_ANY)
+     return entry.caps;
+   return entry.loner_caps | entry.caps;  // loner always gets more
+ }
+ int gcaps_careful() const {
+ if (get_num_wrlocks())
+ return get_sm()->careful;
+ return 0;
+ }
+
+
+ int gcaps_xlocker_mask(client_t client) const {
+ if (client == get_xlock_by_client())
+ return type->type == CEPH_LOCK_IFILE ? 0xf : (CEPH_CAP_GSHARED|CEPH_CAP_GEXCL);
+ return 0;
+ }
+
+ // simplelock specifics
+ int get_replica_state() const {
+ return get_sm()->states[state].replica_state;
+ }
+ void export_twiddle() {
+ clear_gather();
+ state = get_replica_state();
+ }
+
+ // Remove `from` from the gather set if it is in it. Returns true only when
+ // that removal left the gather set empty.
+ bool remove_replica(int from) {
+   if (!is_gathering(from))
+     return false;
+   remove_gather(from);
+   return !is_gathering();
+ }
+ // While the lock is unstable, drop `from` and `to` from the gather set and
+ // return true if nothing is left to gather. A stable lock is left untouched
+ // and yields false.
+ //
+ // The former trailing "!is_stable() && !is_gathering()" test could never
+ // be true once the first block had run, so it has been removed.
+ bool do_import(int from, int to) {
+   if (is_stable())
+     return false;
+   remove_gather(from);
+   remove_gather(to);
+   return !is_gathering();
+ }
+
+ // Print "<type> <state>" followed by optional markers: " g=" gather set,
+ // " l" leased, " r=" / " w=" / " x=" lock counts, and " by <mutation>"
+ // when an xlock holder is recorded. No surrounding parentheses.
+ void _print(ostream& out) const {
+   out << get_lock_type_name(get_type()) << " ";
+   out << get_state_name(get_state());
+
+   const set<int32_t>& gathering = get_gather_set();
+   if (!gathering.empty())
+     out << " g=" << gathering;
+   if (is_leased())
+     out << " l";
+   if (is_rdlocked())
+     out << " r=" << get_num_rdlocks();
+   if (is_wrlocked())
+     out << " w=" << get_num_wrlocks();
+   if (is_xlocked()) {
+     out << " x=" << get_num_xlocks();
+     const MutationRef holder = get_xlock_by();
+     if (holder)
+       out << " by " << holder;
+   }
+   /*if (is_stable())
+     out << " stable";
+   else
+     out << " unstable";
+   */
+ }
+
+ /**
+ * Write bare values (caller must be in an object section)
+ * to formatter, or nothing if is_sync_and_unlocked.
+ */
+ void dump(Formatter *f) const;
+
+ virtual void print(ostream& out) const {
+ out << "(";
+ _print(out);
+ out << ")";
+ }
+};
+WRITE_CLASS_ENCODER(SimpleLock)
+
+inline ostream& operator<<(ostream& out, const SimpleLock& l)
+{
+ l.print(out);
+ return out;
+}
+
+
+#endif
diff --git a/src/mds/SnapClient.cc b/src/mds/SnapClient.cc
new file mode 100644
index 00000000..fa1f56b4
--- /dev/null
+++ b/src/mds/SnapClient.cc
@@ -0,0 +1,316 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#include "MDSMap.h"
+#include "MDSRank.h"
+#include "msg/Messenger.h"
+#include "messages/MMDSTableRequest.h"
+#include "SnapClient.h"
+
+#include "common/config.h"
+#include "include/ceph_assert.h"
+
+#define dout_context g_ceph_context
+#define dout_subsys ceph_subsys_mds
+#undef dout_prefix
+#define dout_prefix *_dout << "mds." << mds->get_nodeid() << ".snapclient "
+
+// Re-issue the table query if anyone is waiting for a version or a sync is
+// still outstanding. The wanted version is the highest version waited for
+// (or 1 when nobody waits), never below the cached version.
+void SnapClient::resend_queries()
+{
+  const bool have_waiters = !waiting_for_version.empty();
+  const bool sync_pending = !synced && sync_reqid > 0;
+  if (!have_waiters && !sync_pending)
+    return;
+
+  const version_t lowest =
+    have_waiters ? waiting_for_version.rbegin()->first : 1;
+  const version_t want = std::max<version_t>(cached_version, lowest);
+  refresh(want, NULL);
+
+  // An unfinished sync now waits for the request that was just sent.
+  if (!synced)
+    sync_reqid = last_reqid;
+}
+
+// Apply a table server reply (also used for notify-prep messages, see
+// handle_notify_prep) to the local cache:
+//  1. decode the payload ('U' = already up to date, 'F' = full contents),
+//  2. drop committing tids that are no longer pending on the server,
+//  3. mark the client synced if this answers the sync request,
+//  4. wake waiters whose wanted version is now cached.
+void SnapClient::handle_query_result(const MMDSTableRequest::const_ref &m)
+{
+ dout(10) << __func__ << " " << *m << dendl;
+
+ char type;
+ using ceph::decode;
+ auto p = m->bl.cbegin();
+ decode(type, p);
+
+ switch (type) {
+ case 'U': // uptodate
+ ceph_assert(cached_version == m->get_tid());
+ break;
+ case 'F': // full
+ {
+ // Replace the cached tables wholesale with the decoded contents.
+ decode(cached_snaps, p);
+ decode(cached_pending_update, p);
+ decode(cached_pending_destroy, p);
+
+ snapid_t last_created, last_destroyed;
+ decode(last_created, p);
+ decode(last_destroyed, p);
+
+ // The cached high-water marks only ever move forward.
+ if (last_created > cached_last_created)
+ cached_last_created = last_created;
+ if (last_destroyed > cached_last_destroyed)
+ cached_last_destroyed = last_destroyed;
+
+ cached_version = m->get_tid();
+ }
+ break;
+ default:
+ ceph_abort();
+ };
+
+ if (!committing_tids.empty()) {
+ // Walk committing tids up to the cached version. Note: this `p` is a
+ // set iterator that shadows the bufferlist iterator declared above.
+ for (auto p = committing_tids.begin();
+ p != committing_tids.end() && *p <= cached_version; ) {
+ if (cached_pending_update.count(*p)) {
+ if (cached_pending_update[*p].snapid > cached_last_created)
+ cached_last_created = cached_pending_update[*p].snapid;
+ ++p;
+ } else if (cached_pending_destroy.count(*p)) {
+ if (cached_pending_destroy[*p].second > cached_last_destroyed)
+ cached_last_destroyed = cached_pending_destroy[*p].second;
+ ++p;
+ } else {
+ // pending update/destroy have been committed.
+ // erase(p++) advances before the erased element is invalidated.
+ committing_tids.erase(p++);
+ }
+ }
+ }
+
+ // Only a query reply whose reqid is at or past sync_reqid completes a sync.
+ if (m->op == TABLESERVER_OP_QUERY_REPLY && m->reqid >= sync_reqid)
+ synced = true;
+
+ if (synced && !waiting_for_version.empty()) {
+ MDSContext::vec finished;
+ // The map is ordered by version, so stop at the first entry that wants
+ // a version newer than what is cached.
+ while (!waiting_for_version.empty()) {
+ auto it = waiting_for_version.begin();
+ if (it->first > cached_version)
+ break;
+ auto& v = it->second;
+ finished.insert(finished.end(), v.begin(), v.end());
+ waiting_for_version.erase(it);
+ }
+ if (!finished.empty())
+ mds->queue_waiters(finished);
+ }
+}
+
+// Handle a notify-prep message: its payload is applied through
+// handle_query_result(), then a TABLESERVER_OP_NOTIFY_ACK carrying the same
+// tid is sent back over the connection the message arrived on.
+void SnapClient::handle_notify_prep(const MMDSTableRequest::const_ref &m)
+{
+ dout(10) << __func__ << " " << *m << dendl;
+ handle_query_result(m);
+ auto ack = MMDSTableRequest::create(table, TABLESERVER_OP_NOTIFY_ACK, 0, m->get_tid());
+ mds->send_message(ack, m->get_connection());
+}
+
+// Note that transaction `tid` is committing. The tid is remembered in
+// committing_tids when nothing is cached yet or while it is still listed as
+// a pending update/destroy; in the latter case the matching high-water mark
+// is advanced too. A tid older than the cached version that is not pending
+// is ignored; anything else aborts.
+void SnapClient::notify_commit(version_t tid)
+{
+  dout(10) << __func__ << " tid " << tid << dendl;
+
+  ceph_assert(cached_version == 0 || cached_version >= tid);
+  if (cached_version == 0) {
+    committing_tids.insert(tid);
+    return;
+  }
+
+  auto upd = cached_pending_update.find(tid);
+  if (upd != cached_pending_update.end()) {
+    committing_tids.insert(tid);
+    if (upd->second.snapid > cached_last_created)
+      cached_last_created = upd->second.snapid;
+    return;
+  }
+
+  auto des = cached_pending_destroy.find(tid);
+  if (des != cached_pending_destroy.end()) {
+    committing_tids.insert(tid);
+    if (des->second.second > cached_last_destroyed)
+      cached_last_destroyed = des->second.second;
+    return;
+  }
+
+  if (cached_version > tid) {
+    // no need to record the tid if it has already been committed.
+    return;
+  }
+  ceph_abort();
+}
+
+// Ask the table server for its contents. `onfinish` (if any) is queued
+// under version `want`. When the server is not ready, nothing is sent and
+// last_reqid is left unchanged.
+void SnapClient::refresh(version_t want, MDSContext *onfinish)
+{
+  dout(10) << __func__ << " want " << want << dendl;
+
+  ceph_assert(want >= cached_version);
+  if (onfinish)
+    waiting_for_version[want].push_back(onfinish);
+
+  if (!server_ready)
+    return;
+
+  // Payload: the query kind 'F' followed by the version currently cached.
+  using ceph::encode;
+  auto query = MMDSTableRequest::create(table, TABLESERVER_OP_QUERY, ++last_reqid, 0);
+  char kind = 'F';
+  encode(kind, query->bl);
+  encode(cached_version, query->bl);
+
+  const mds_rank_t server = mds->mdsmap->get_tableserver();
+  mds->send_message_mds(query, server);
+}
+
+// Start a (re)sync: queue `onfinish`, clear the synced flag and remember
+// which request id must be answered before the client counts as synced.
+void SnapClient::sync(MDSContext *onfinish)
+{
+  dout(10) << __func__ << dendl;
+
+  refresh(std::max<version_t>(cached_version, 1), onfinish);
+  synced = false;
+
+  if (server_ready) {
+    // refresh() just sent a query numbered last_reqid.
+    sync_reqid = last_reqid;
+    return;
+  }
+  // Nothing was sent; wait for the next id to be issued (wrapping to 1).
+  sync_reqid = (last_reqid == ~0ULL) ? 1 : last_reqid + 1;
+}
+
+// Add to `result` every cached snapid, then apply the effect of each
+// committing transaction in tid order: pending updates add their snapid,
+// pending destroys remove theirs. Requires a populated cache.
+void SnapClient::get_snaps(set<snapid_t>& result) const
+{
+  ceph_assert(cached_version > 0);
+  for (auto it = cached_snaps.begin(); it != cached_snaps.end(); ++it)
+    result.insert(it->first);
+
+  for (auto tid : committing_tids) {
+    auto upd = cached_pending_update.find(tid);
+    if (upd != cached_pending_update.end())
+      result.insert(upd->second.snapid);
+
+    auto des = cached_pending_destroy.find(tid);
+    if (des != cached_pending_destroy.end())
+      result.erase(des->second.first);
+  }
+}
+
+// Return the members of `snaps` that exist according to the cache plus the
+// committing transactions (updates add, destroys remove). An empty input is
+// returned as is, without logging.
+set<snapid_t> SnapClient::filter(const set<snapid_t>& snaps) const
+{
+  ceph_assert(cached_version > 0);
+  if (snaps.empty())
+    return snaps;
+
+  set<snapid_t> result;
+
+  for (auto it = snaps.begin(); it != snaps.end(); ++it) {
+    if (cached_snaps.count(*it))
+      result.insert(*it);
+  }
+
+  for (auto tid : committing_tids) {
+    auto upd = cached_pending_update.find(tid);
+    if (upd != cached_pending_update.end() && snaps.count(upd->second.snapid))
+      result.insert(upd->second.snapid);
+
+    auto des = cached_pending_destroy.find(tid);
+    if (des != cached_pending_destroy.end())
+      result.erase(des->second.first);
+  }
+
+  dout(10) << __func__ << " " << snaps << " -> " << result << dendl;
+  return result;
+}
+
+// Look up the info for `snapid`. The cached table is consulted first; the
+// first committing transaction that touches this snapid then overrides it
+// (a pending update supplies its info, a pending destroy yields NULL).
+const SnapInfo* SnapClient::get_snap_info(snapid_t snapid) const
+{
+  ceph_assert(cached_version > 0);
+
+  const SnapInfo* result = NULL;
+  auto cached = cached_snaps.find(snapid);
+  if (cached != cached_snaps.end())
+    result = &cached->second;
+
+  for (auto t = committing_tids.begin(); t != committing_tids.end(); ++t) {
+    auto upd = cached_pending_update.find(*t);
+    if (upd != cached_pending_update.end() && upd->second.snapid == snapid) {
+      result = &upd->second;
+      break;
+    }
+
+    auto des = cached_pending_destroy.find(*t);
+    if (des != cached_pending_destroy.end() && des->second.first == snapid) {
+      result = NULL;
+      break;
+    }
+  }
+
+  dout(10) << __func__ << " snapid " << snapid << " -> " << result << dendl;
+  return result;
+}
+
+// Collect info pointers for the requested `snaps` and merge them into
+// `infomap`. Entries already present in `infomap` are kept, since
+// map::insert does not overwrite.
+void SnapClient::get_snap_infos(map<snapid_t, const SnapInfo*>& infomap,
+				const set<snapid_t>& snaps) const
+{
+  ceph_assert(cached_version > 0);
+
+  if (snaps.empty())
+    return;
+
+  map<snapid_t, const SnapInfo*> found;
+  for (auto s = snaps.begin(); s != snaps.end(); ++s) {
+    auto cached = cached_snaps.find(*s);
+    if (cached != cached_snaps.end())
+      found[*s] = &cached->second;
+  }
+
+  // Committing transactions override the cached table, in tid order.
+  for (auto tid : committing_tids) {
+    auto upd = cached_pending_update.find(tid);
+    if (upd != cached_pending_update.end() && snaps.count(upd->second.snapid))
+      found[upd->second.snapid] = &upd->second;
+
+    auto des = cached_pending_destroy.find(tid);
+    if (des != cached_pending_destroy.end())
+      found.erase(des->second.first);
+  }
+
+  infomap.insert(found.begin(), found.end());
+}
+
+// Dump the effective snapshot table (cache plus committing transactions)
+// into `f` as a "snapclient" object. Returns -EINVAL without writing
+// anything when the client is not synced, 0 otherwise.
+int SnapClient::dump_cache(Formatter *f) const
+{
+  if (!is_synced()) {
+    dout(5) << "dump_cache: not synced" << dendl;
+    return -EINVAL;
+  }
+
+  map<snapid_t, const SnapInfo*> effective;
+  for (auto it = cached_snaps.begin(); it != cached_snaps.end(); ++it)
+    effective[it->first] = &it->second;
+
+  for (auto tid : committing_tids) {
+    auto upd = cached_pending_update.find(tid);
+    if (upd != cached_pending_update.end())
+      effective[upd->second.snapid] = &upd->second;
+
+    auto des = cached_pending_destroy.find(tid);
+    if (des != cached_pending_destroy.end())
+      effective.erase(des->second.first);
+  }
+
+  f->open_object_section("snapclient");
+
+  f->dump_int("last_created", get_last_created());
+  f->dump_int("last_destroyed", get_last_destroyed());
+
+  f->open_array_section("snaps");
+  for (const auto& entry : effective) {
+    f->open_object_section("snap");
+    entry.second->dump(f);
+    f->close_section();
+  }
+  f->close_section();  // snaps
+
+  f->close_section();  // snapclient
+
+  return 0;
+}
diff --git a/src/mds/SnapClient.h b/src/mds/SnapClient.h
new file mode 100644
index 00000000..c0d595ba
--- /dev/null
+++ b/src/mds/SnapClient.h
@@ -0,0 +1,114 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#ifndef CEPH_SNAPCLIENT_H
+#define CEPH_SNAPCLIENT_H
+
+#include <string_view>
+
+#include "MDSTableClient.h"
+#include "snap.h"
+#include "MDSContext.h"
+
+class MDSRank;
+class LogSegment;
+
+// Table client for TABLE_SNAP. Keeps a local copy of the snapshot table
+// and answers lookups from that copy combined with committing transactions.
+class SnapClient : public MDSTableClient {
+  // Version of the cached tables; 0 until a full result has been applied.
+  version_t cached_version;
+  // High-water marks; they are only ever raised.
+  snapid_t cached_last_created, cached_last_destroyed;
+  map<snapid_t, SnapInfo> cached_snaps;
+  map<version_t, SnapInfo> cached_pending_update;
+  // tid -> (snapid removed from lookups, candidate for last_destroyed)
+  map<version_t, pair<snapid_t,snapid_t> > cached_pending_destroy;
+
+  // Transactions reported through notify_commit() that still matter locally.
+  set<version_t> committing_tids;
+
+  // Contexts to run once cached_version reaches the key.
+  map<version_t, MDSContext::vec > waiting_for_version;
+
+  // Request id whose reply completes the current sync, and the sync flag.
+  uint64_t sync_reqid;
+  bool synced;
+
+public:
+  explicit SnapClient(MDSRank *m) :
+    MDSTableClient(m, TABLE_SNAP),
+    cached_version(0), cached_last_created(0), cached_last_destroyed(0),
+    sync_reqid(0), synced(false) {}
+
+  void resend_queries() override;
+  void handle_query_result(const MMDSTableRequest::const_ref &m) override;
+  void handle_notify_prep(const MMDSTableRequest::const_ref &m) override;
+  void notify_commit(version_t tid) override;
+
+  // Each prepare_* helper encodes an op code followed by its arguments and
+  // hands the buffer to _prepare().
+
+  // TABLE_OP_CREATE with (dirino, name, stamp).
+  void prepare_create(inodeno_t dirino, std::string_view name, utime_t stamp,
+		      version_t *pstid, bufferlist *pbl, MDSContext *onfinish) {
+    bufferlist payload;
+    __u32 opcode = TABLE_OP_CREATE;
+    encode(opcode, payload);
+    encode(dirino, payload);
+    encode(name, payload);
+    encode(stamp, payload);
+    _prepare(payload, pstid, pbl, onfinish);
+  }
+
+  // TABLE_OP_CREATE with only an inode number.
+  void prepare_create_realm(inodeno_t ino, version_t *pstid, bufferlist *pbl, MDSContext *onfinish) {
+    bufferlist payload;
+    __u32 opcode = TABLE_OP_CREATE;
+    encode(opcode, payload);
+    encode(ino, payload);
+    _prepare(payload, pstid, pbl, onfinish);
+  }
+
+  // TABLE_OP_DESTROY with (ino, snapid).
+  void prepare_destroy(inodeno_t ino, snapid_t snapid, version_t *pstid, bufferlist *pbl, MDSContext *onfinish) {
+    bufferlist payload;
+    __u32 opcode = TABLE_OP_DESTROY;
+    encode(opcode, payload);
+    encode(ino, payload);
+    encode(snapid, payload);
+    _prepare(payload, pstid, pbl, onfinish);
+  }
+
+  // TABLE_OP_UPDATE with (ino, snapid, name, stamp); no reply buffer is
+  // passed to _prepare().
+  void prepare_update(inodeno_t ino, snapid_t snapid, std::string_view name, utime_t stamp,
+		      version_t *pstid, MDSContext *onfinish) {
+    bufferlist payload;
+    __u32 opcode = TABLE_OP_UPDATE;
+    encode(opcode, payload);
+    encode(ino, payload);
+    encode(snapid, payload);
+    encode(name, payload);
+    encode(stamp, payload);
+    _prepare(payload, pstid, NULL, onfinish);
+  }
+
+  version_t get_cached_version() const { return cached_version; }
+  void refresh(version_t want, MDSContext *onfinish);
+
+  void sync(MDSContext *onfinish);
+
+  bool is_synced() const { return synced; }
+  // Queue `c` until the client is synced; must not be called once synced.
+  void wait_for_sync(MDSContext *c) {
+    ceph_assert(!synced);
+    const version_t key = std::max<version_t>(cached_version, 1);
+    waiting_for_version[key].push_back(c);
+  }
+
+  snapid_t get_last_created() const { return cached_last_created; }
+  snapid_t get_last_destroyed() const { return cached_last_destroyed; }
+
+  void get_snaps(set<snapid_t>& snaps) const;
+  set<snapid_t> filter(const set<snapid_t>& snaps) const;
+  const SnapInfo* get_snap_info(snapid_t snapid) const;
+  void get_snap_infos(map<snapid_t, const SnapInfo*>& infomap, const set<snapid_t>& snaps) const;
+
+  int dump_cache(Formatter *f) const;
+};
+
+#endif
diff --git a/src/mds/SnapRealm.cc b/src/mds/SnapRealm.cc
new file mode 100644
index 00000000..4ef775dc
--- /dev/null
+++ b/src/mds/SnapRealm.cc
@@ -0,0 +1,726 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#include "SnapRealm.h"
+#include "MDCache.h"
+#include "MDSRank.h"
+#include "SnapClient.h"
+
+#include <string_view>
+
+
+/*
+ * SnapRealm
+ */
+
+#define dout_context g_ceph_context
+#define dout_subsys ceph_subsys_mds
+#undef dout_prefix
+#define dout_prefix _prefix(_dout, mdcache->mds->get_nodeid(), inode, srnode.seq, this) // expands using SnapRealm members, so dout is only usable inside SnapRealm member functions
+// Write the per-realm log prefix (mds rank, realm inode number, seq and
+// realm pointer) to *_dout and return the stream for chaining.
+static ostream& _prefix(std::ostream *_dout, int whoami, const CInode *inode,
+                        uint64_t seq, const SnapRealm *realm) {
+  std::ostream& os = *_dout;
+  os << " mds." << whoami << ".cache.snaprealm(" << inode->ino();
+  os << " seq " << seq << " " << realm << ") ";
+  return os;
+}
+
+// Human-readable dump of a realm: inode number, seq/last_created/created,
+// current_parent_since (only when it differs from created), the snap sets,
+// any past_parents intervals, and the realm's address.
+ostream& operator<<(ostream& out, const SnapRealm& realm)
+{
+  const sr_t& sr = realm.srnode;
+
+  out << "snaprealm(" << realm.inode->ino()
+      << " seq " << sr.seq
+      << " lc " << sr.last_created
+      << " cr " << sr.created;
+  if (sr.created != sr.current_parent_since)
+    out << " cps " << sr.current_parent_since;
+  out << " snaps=" << sr.snaps;
+  out << " past_parent_snaps=" << sr.past_parent_snaps;
+
+  if (!sr.past_parents.empty()) {
+    out << " past_parents=(";
+    bool need_comma = false;
+    for (const auto& link : sr.past_parents) {
+      if (need_comma)
+        out << ",";
+      need_comma = true;
+      // printed as <first>-<last>=<ino>
+      out << link.second.first << "-" << link.first
+          << "=" << link.second.ino;
+    }
+    out << ")";
+  }
+
+  if (sr.is_parent_global())
+    out << " global ";
+  out << " " << &realm << ")";
+  return out;
+}
+
+// Construct a realm attached to inode `in`. The realm is flagged global when
+// the inode number equals MDS_INO_GLOBAL_SNAPREALM.
+SnapRealm::SnapRealm(MDCache *c, CInode *in) :
+  mdcache(c), inode(in),
+  global(in->ino() == MDS_INO_GLOBAL_SNAPREALM),
+  parent(nullptr),
+  num_open_past_parents(0), inodes_with_caps(0)
+{
+}
+
+// Record that past parent realm `parent` is open for the interval ending at
+// `last`. The first interval recorded for a given parent also registers this
+// realm in the parent's open_past_children and takes PIN_PASTSNAPPARENT on the
+// parent's inode. Adding the same (parent, last) pair twice asserts.
+void SnapRealm::add_open_past_parent(SnapRealm *parent, snapid_t last)
+{
+  const inodeno_t parent_ino = parent->inode->ino();
+  auto found = open_past_parents.find(parent_ino);
+  if (found == open_past_parents.end()) {
+    auto& slot = open_past_parents[parent_ino];
+    slot.first = parent;
+    slot.second.insert(last);
+    parent->open_past_children.insert(this);
+    parent->inode->get(CInode::PIN_PASTSNAPPARENT);
+  } else {
+    set<snapid_t>& lasts = found->second.second;
+    ceph_assert(lasts.count(last) == 0);
+    lasts.insert(last);
+  }
+  ++num_open_past_parents;
+}
+
+// Drop the open-past-parent record for (ino, last); both must exist. When the
+// last interval for that parent goes away, the parent entry is erased, this
+// realm is removed from the parent's open_past_children and the
+// PIN_PASTSNAPPARENT reference on the parent's inode is released.
+void SnapRealm::remove_open_past_parent(inodeno_t ino, snapid_t last)
+{
+  auto entry = open_past_parents.find(ino);
+  ceph_assert(entry != open_past_parents.end());
+
+  set<snapid_t>& lasts = entry->second.second;
+  auto hit = lasts.find(last);
+  ceph_assert(hit != lasts.end());
+  lasts.erase(hit);
+  --num_open_past_parents;
+
+  if (!lasts.empty())
+    return;
+
+  // `lasts` must not be used past this point: its map entry is erased.
+  SnapRealm *old_parent = entry->second.first;
+  open_past_parents.erase(entry);
+  old_parent->open_past_children.erase(this);
+  old_parent->inode->put(CInode::PIN_PASTSNAPPARENT);
+}
+
+// Completion used by SnapRealm::_open_parents() when a past parent inode is
+// not in cache. It holds PIN_OPENINGSNAPPARENTS on the realm's inode from
+// construction until finish() has run. On completion it retries
+// _open_parents(); a negative result first drops the missing past_parents
+// entry via _remove_missing_parent().
+struct C_SR_RetryOpenParents : public MDSContext {
+  SnapRealm *sr;
+  snapid_t first, last, parent_last;
+  inodeno_t parent;
+  MDSContext* fin;
+
+  C_SR_RetryOpenParents(SnapRealm *s, snapid_t f, snapid_t l, snapid_t pl,
+                        inodeno_t p, MDSContext *c) :
+    sr(s), first(f), last(l), parent_last(pl), parent(p), fin(c) {
+    sr->inode->get(CInode::PIN_OPENINGSNAPPARENTS);
+  }
+
+  MDSRank *get_mds() override { return sr->mdcache->mds; }
+
+  void finish(int r) override {
+    if (r < 0)
+      sr->_remove_missing_parent(parent_last, parent, r);
+
+    // Only complete the caller's context when everything opened in this pass.
+    const bool all_open = sr->_open_parents(fin, first, last);
+    if (all_open && fin)
+      fin->complete(0);
+
+    sr->inode->put(CInode::PIN_OPENINGSNAPPARENTS);
+  }
+};
+
+// Remove the past_parents entry keyed by `snapid`, if present, and mark
+// past_parents dirty. `parent` and `err` are used for logging only.
+void SnapRealm::_remove_missing_parent(snapid_t snapid, inodeno_t parent, int err)
+{
+  auto it = srnode.past_parents.find(snapid);
+  if (it == srnode.past_parents.end()) {
+    dout(10) << __func__ << " " << parent << " not found" << dendl;
+    return;
+  }
+
+  dout(10) << __func__ << " " << parent << " [" << it->second.first << ","
+           << it->first << "] errno " << err << dendl;
+  srnode.past_parents.erase(it);
+  past_parents_dirty = true;
+}
+
+// Try to open the current parent chain and every past parent realm.
+//
+// Returns true when the realm is (now) marked open. Returns false when a
+// recursive call returned false, or when a past parent inode was not in cache
+// and an open_ino() was issued with a C_SR_RetryOpenParents continuation
+// wrapping `finish`.
+bool SnapRealm::_open_parents(MDSContext *finish, snapid_t first, snapid_t last)
+{
+  dout(10) << "open_parents [" << first << "," << last << "]" << dendl;
+  if (open)
+    return true;
+
+  // make sure my current parents' parents are open...
+  if (parent) {
+    dout(10) << " current parent [" << srnode.current_parent_since << ",head] is " << *parent
+             << " on " << *parent->inode << dendl;
+    if (last >= srnode.current_parent_since) {
+      const snapid_t from = std::max(first, srnode.current_parent_since);
+      if (!parent->_open_parents(finish, from, last))
+        return false;
+    }
+  }
+
+  if (!srnode.past_parent_snaps.empty())
+    ceph_assert(mdcache->mds->snapclient->get_cached_version() > 0);
+
+  const bool skip_past_parents =
+    !srnode.past_parents.empty() && mdcache->mds->allows_multimds_snaps();
+  if (skip_past_parents) {
+    dout(10) << " skip non-empty past_parents since multimds_snaps is allowed" << dendl;
+    open = true;
+    return true;
+  }
+
+  // and my past parents too!
+  ceph_assert(srnode.past_parents.size() >= num_open_past_parents);
+  if (srnode.past_parents.size() > num_open_past_parents) {
+    auto it = srnode.past_parents.begin();
+    while (it != srnode.past_parents.end()) {
+      const snapid_t link_last = it->first;
+      const snaplink_t& link = it->second;
+      dout(10) << " past_parent [" << link.first << "," << link_last << "] is "
+               << link.ino << dendl;
+
+      CInode *past_in = mdcache->get_inode(link.ino);
+      if (!past_in) {
+        // not in cache: fetch it and retry from the completion
+        auto *retry = new C_SR_RetryOpenParents(this, first, last, link_last,
+                                                link.ino, finish);
+        mdcache->open_ino(link.ino, mdcache->mds->mdsmap->get_metadata_pool(), retry);
+        return false;
+      }
+
+      if (past_in->state_test(CInode::STATE_PURGING)) {
+        dout(10) << " skip purging past_parent " << *past_in << dendl;
+        it = srnode.past_parents.erase(it);  // `link` is invalid from here on
+        past_parents_dirty = true;
+        continue;
+      }
+
+      ceph_assert(past_in->snaprealm);  // hmm!
+      if (!past_in->snaprealm->_open_parents(finish, link.first, link_last))
+        return false;
+
+      auto pinned = open_past_parents.find(link.ino);
+      const bool already_recorded =
+        pinned != open_past_parents.end() &&
+        pinned->second.second.count(link_last) != 0;
+      if (!already_recorded)
+        add_open_past_parent(past_in->snaprealm, link_last);
+
+      ++it;
+    }
+  }
+
+  open = true;
+  return true;
+}
+
+// Wrapper around _open_parents() using its default snap range. When it
+// succeeds immediately the supplied context is deleted, not completed.
+bool SnapRealm::open_parents(MDSContext *retryorfinish) {
+  const bool opened = _open_parents(retryorfinish);
+  if (opened)
+    delete retryorfinish;
+  return opened;
+}
+
+// Check, without triggering any fetch, whether all past parents overlapping
+// [first,last] are already recorded in open_past_parents (recursively).
+// On success the mutable `open` flag is set so later calls return early.
+bool SnapRealm::have_past_parents_open(snapid_t first, snapid_t last) const
+{
+  dout(10) << "have_past_parents_open [" << first << "," << last << "]" << dendl;
+  if (open)
+    return true;
+
+  if (!srnode.past_parent_snaps.empty())
+    ceph_assert(mdcache->mds->snapclient->get_cached_version() > 0);
+
+  const bool skip_past_parents =
+    !srnode.past_parents.empty() && mdcache->mds->allows_multimds_snaps();
+  if (skip_past_parents) {
+    dout(10) << " skip non-empty past_parents since multimds_snaps is allowed" << dendl;
+    open = true;
+    return true;
+  }
+
+  auto it = srnode.past_parents.lower_bound(first);
+  while (it != srnode.past_parents.end() && it->second.first <= last) {
+    const snaplink_t& link = it->second;
+    dout(10) << " past parent [" << link.first << "," << it->first << "] was "
+             << link.ino << dendl;
+
+    auto pinned = open_past_parents.find(link.ino);
+    if (pinned == open_past_parents.end()) {
+      dout(10) << " past parent " << link.ino << " is not open" << dendl;
+      return false;
+    }
+
+    // clamp the query range to the interval this parent covered
+    SnapRealm *past_realm = pinned->second.first;
+    if (!past_realm->have_past_parents_open(std::max(first, link.first),
+                                            std::min(last, it->first)))
+      return false;
+    ++it;
+  }
+
+  open = true;
+  return true;
+}
+
+// Release every open past parent: subtract its intervals from
+// num_open_past_parents, drop the PIN_PASTSNAPPARENT reference, unregister
+// this realm from the parent's open_past_children, then clear the map.
+void SnapRealm::close_parents()
+{
+  for (auto& entry : open_past_parents) {
+    SnapRealm *past_realm = entry.second.first;
+    num_open_past_parents -= entry.second.second.size();
+    past_realm->inode->put(CInode::PIN_PASTSNAPPARENT);
+    past_realm->open_past_children.erase(this);
+  }
+  open_past_parents.clear();
+}
+
+
+/*
+ * get list of snaps for this realm. we must include parents' snaps
+ * for the intervals during which they were our parent.
+ */
+// Rebuild cached_snaps. The global realm takes the snap client's full set.
+// Any other realm combines its own snaps, the snaps inherited from past
+// parents (past_parent_snaps when non-empty, otherwise the past_parents
+// intervals) and the current parent's snaps from current_parent_since onward.
+// cached_seq and cached_last_created are raised as newer snaps are seen.
+void SnapRealm::build_snap_set() const
+{
+  dout(10) << "build_snap_set on " << *this << dendl;
+
+  cached_snaps.clear();
+
+  if (global) {
+    mdcache->mds->snapclient->get_snaps(cached_snaps);
+    return;
+  }
+
+  // include my snaps
+  for (auto it = srnode.snaps.begin(); it != srnode.snaps.end(); ++it)
+    cached_snaps.insert(it->first);
+
+  if (srnode.past_parent_snaps.empty()) {
+    // include snaps for parents
+    for (const auto& link : srnode.past_parents) {
+      const CInode *past_in = mdcache->get_inode(link.second.ino);
+      ceph_assert(past_in);  // call open_parents first!
+      ceph_assert(past_in->snaprealm);
+
+      const set<snapid_t>& theirs = past_in->snaprealm->get_snaps();
+      snapid_t newest = 0;
+      auto s = theirs.lower_bound(link.second.first);
+      while (s != theirs.end() && *s <= link.first) {
+        cached_snaps.insert(*s);
+        newest = *s;
+        ++s;
+      }
+      cached_seq = std::max(cached_seq, newest);
+      cached_last_created = std::max(cached_last_created, newest);
+    }
+  } else {
+    const set<snapid_t> kept = mdcache->mds->snapclient->filter(srnode.past_parent_snaps);
+    if (!kept.empty()) {
+      const snapid_t newest = *kept.rbegin();
+      cached_seq = std::max(cached_seq, newest);
+      cached_last_created = std::max(cached_last_created, newest);
+    }
+    cached_snaps.insert(kept.begin(), kept.end());
+  }
+
+  const snapid_t parent_seq = parent ? parent->get_newest_seq() : snapid_t(0);
+  if (parent_seq < srnode.current_parent_since)
+    return;
+
+  auto& parent_snaps = parent->get_snaps();
+  cached_snaps.insert(parent_snaps.lower_bound(srnode.current_parent_since),
+                      parent_snaps.end());
+  cached_seq = std::max(cached_seq, parent_seq);
+  cached_last_created = std::max(cached_last_created, parent->get_last_created());
+}
+
+// Rebuild the cached snap data if it is stale. The cache counts as fresh
+// when cached_seq has reached the reference seq and cached_last_destroyed
+// matches the snap client's value. For the global realm, or a realm whose
+// parent is global, the reference values come from the snap client; otherwise
+// they come from srnode. Asserts that past parents are open.
+void SnapRealm::check_cache() const
+{
+  ceph_assert(have_past_parents_open());
+
+  snapid_t seq;
+  snapid_t last_created;
+  const snapid_t last_destroyed = mdcache->mds->snapclient->get_last_destroyed();
+  if (global || srnode.is_parent_global()) {
+    last_created = mdcache->mds->snapclient->get_last_created();
+    seq = std::max(last_created, last_destroyed);
+  } else {
+    last_created = srnode.last_created;
+    seq = srnode.seq;
+  }
+
+  const bool fresh = cached_seq >= seq && cached_last_destroyed == last_destroyed;
+  if (fresh)
+    return;
+
+  cached_snap_context.clear();
+
+  cached_seq = seq;
+  cached_last_created = last_created;
+  cached_last_destroyed = last_destroyed;
+
+  // inherit the parent's subvolume inode, else use our own if we are one
+  cached_subvolume_ino = 0;
+  if (parent)
+    cached_subvolume_ino = parent->get_subvolume_ino();
+  if (!cached_subvolume_ino && srnode.is_subvolume())
+    cached_subvolume_ino = inode->ino();
+
+  build_snap_set();
+  build_snap_trace();
+
+  dout(10) << "check_cache rebuilt " << cached_snaps
+           << " seq " << seq
+           << " cached_seq " << cached_seq
+           << " cached_last_created " << cached_last_created
+           << " cached_last_destroyed " << cached_last_destroyed
+           << ")" << dendl;
+}
+
+const set<snapid_t>& SnapRealm::get_snaps() const
+{
+ check_cache(); // refreshes cached_snaps first if the cache is stale
+ dout(10) << "get_snaps " << cached_snaps
+ << " (seq " << srnode.seq << " cached_seq " << cached_seq << ")"
+ << dendl;
+ return cached_snaps; // reference to the mutable cache member, not a copy
+}
+
+/*
+ * build vector in reverse sorted order
+ */
+// Return the cached SnapContext, filling it lazily. A zero seq marks it as
+// not yet built; it is then populated with cached_seq and the cached snap ids
+// in descending order.
+const SnapContext& SnapRealm::get_snap_context() const
+{
+  check_cache();
+
+  if (!cached_snap_context.seq) {
+    cached_snap_context.seq = cached_seq;
+    // reverse iteration over the ordered set yields newest-first
+    cached_snap_context.snaps.assign(cached_snaps.rbegin(), cached_snaps.rend());
+  }
+
+  return cached_snap_context;
+}
+
+// Collect SnapInfo pointers for every snapshot in [first,last] that is
+// visible from this realm: our own snaps, snaps inherited from past parents,
+// and, through recursion, the current parent's snaps from
+// current_parent_since onward. Results are added to `infomap`.
+void SnapRealm::get_snap_info(map<snapid_t, const SnapInfo*>& infomap, snapid_t first, snapid_t last)
+{
+  const set<snapid_t>& all_snaps = get_snaps();
+  dout(10) << "get_snap_info snaps " << all_snaps << dendl;
+
+  // include my snaps within interval [first,last]
+  auto mine = srnode.snaps.lower_bound(first);  // first element >= first
+  while (mine != srnode.snaps.end() && mine->first <= last) {
+    infomap[mine->first] = &mine->second;
+    ++mine;
+  }
+
+  if (srnode.past_parent_snaps.empty()) {
+    // include snaps for parents during intervals that intersect [first,last]
+    for (auto link = srnode.past_parents.lower_bound(first);
+         link != srnode.past_parents.end() &&
+           link->first >= first && link->second.first <= last;
+         ++link) {
+      CInode *past_in = mdcache->get_inode(link->second.ino);
+      ceph_assert(past_in);  // call open_parents first!
+      ceph_assert(past_in->snaprealm);
+      past_in->snaprealm->get_snap_info(infomap,
+                                        std::max(first, link->second.first),
+                                        std::min(last, link->first));
+    }
+  } else {
+    set<snapid_t> wanted;
+    auto s = srnode.past_parent_snaps.lower_bound(first);  // first element >= first
+    while (s != srnode.past_parent_snaps.end() && *s <= last) {
+      wanted.insert(*s);
+      ++s;
+    }
+
+    map<snapid_t, const SnapInfo*> from_table;
+    mdcache->mds->snapclient->get_snap_infos(from_table, wanted);
+    // insert() does not overwrite entries already present in infomap
+    infomap.insert(from_table.begin(), from_table.end());
+  }
+
+  if (srnode.current_parent_since <= last && parent)
+    parent->get_snap_info(infomap, std::max(first, srnode.current_parent_since), last);
+}
+
+// Return the name of snapshot `snapid` as seen from directory `atino`: the
+// short name when `atino` is the inode the snapshot belongs to, otherwise the
+// long name. Looks in our own snaps, then past parents, then the current
+// parent. Asserts when the snapshot is not covered by any of them.
+std::string_view SnapRealm::get_snapname(snapid_t snapid, inodeno_t atino)
+{
+  auto mine = srnode.snaps.find(snapid);
+  if (mine != srnode.snaps.end()) {
+    const SnapInfo& info = mine->second;
+    if (atino != inode->ino())
+      return info.get_long_name();
+    return info.name;
+  }
+
+  if (srnode.past_parent_snaps.empty()) {
+    auto link = srnode.past_parents.lower_bound(snapid);
+    if (link != srnode.past_parents.end() && link->second.first <= snapid) {
+      CInode *past_in = mdcache->get_inode(link->second.ino);
+      ceph_assert(past_in);  // call open_parents first!
+      ceph_assert(past_in->snaprealm);
+      return past_in->snaprealm->get_snapname(snapid, atino);
+    }
+  } else if (srnode.past_parent_snaps.count(snapid)) {
+    const SnapInfo *sinfo = mdcache->mds->snapclient->get_snap_info(snapid);
+    if (sinfo) {
+      if (atino != sinfo->ino)
+        return sinfo->get_long_name();
+      return sinfo->name;
+    }
+  }
+
+  // not ours and not from a past parent: must belong to the current parent
+  ceph_assert(srnode.current_parent_since <= snapid);
+  ceph_assert(parent);
+  return parent->get_snapname(snapid, atino);
+}
+
+// Resolve snapshot name `n`, as seen from directory `atino`, to a snap id
+// within [first,last]; returns 0 when nothing matches.
+//
+// A name of the form "_<name>_<ino>" is parsed into a short name and an inode
+// number. When `atino` is not the inode that owns a candidate snapshot, the
+// candidate matches only if both parsed parts agree; when it is the owner,
+// the candidate's name must equal `n`. The search covers our own snaps, then
+// past parents, then (recursively) the current parent.
+snapid_t SnapRealm::resolve_snapname(std::string_view n, inodeno_t atino, snapid_t first, snapid_t last)
+{
+  // first try me
+  dout(10) << "resolve_snapname '" << n << "' in [" << first << "," << last << "]" << dendl;
+
+  bool actual = (atino == inode->ino());
+  string pname;
+  inodeno_t pino;
+  if (n.length() && n[0] == '_') {
+    size_t next_ = n.find_last_of('_');
+    if (next_ > 1 && next_ + 1 < n.length()) {
+      pname = n.substr(1, next_ - 1);
+      // A string_view is not guaranteed to be NUL-terminated, so copy the
+      // numeric suffix into a string before passing it to atoll(); parsing
+      // n.data() directly could read past the end of the view.
+      const string ino_str(n.substr(next_ + 1));
+      pino = atoll(ino_str.c_str());
+      dout(10) << " " << n << " parses to name '" << pname << "' dirino " << pino << dendl;
+    }
+  }
+
+  for (auto p = srnode.snaps.lower_bound(first); // first element >= first
+       p != srnode.snaps.end() && p->first <= last;
+       ++p) {
+    dout(15) << " ? " << p->second << dendl;
+    //if (num && p->second.snapid == num)
+    //return p->first;
+    if (actual && p->second.name == n)
+      return p->first;
+    if (!actual && p->second.name == pname && p->second.ino == pino)
+      return p->first;
+  }
+
+  if (!srnode.past_parent_snaps.empty()) {
+    set<snapid_t> snaps;
+    for (auto p = srnode.past_parent_snaps.lower_bound(first); // first element >= first
+         p != srnode.past_parent_snaps.end() && *p <= last;
+         ++p)
+      snaps.insert(*p);
+
+    map<snapid_t, const SnapInfo*> _infomap;
+    mdcache->mds->snapclient->get_snap_infos(_infomap, snaps);
+
+    for (auto& it : _infomap) {
+      dout(15) << " ? " << *it.second << dendl;
+      // each inherited snapshot may belong to a different inode, so decide
+      // per entry whether `atino` is its owner
+      actual = (it.second->ino == atino);
+      if (actual && it.second->name == n)
+        return it.first;
+      if (!actual && it.second->name == pname && it.second->ino == pino)
+        return it.first;
+    }
+  } else {
+    // include snaps for parents during intervals that intersect [first,last]
+    for (map<snapid_t, snaplink_t>::iterator p = srnode.past_parents.lower_bound(first);
+         p != srnode.past_parents.end() && p->first >= first && p->second.first <= last;
+         ++p) {
+      CInode *oldparent = mdcache->get_inode(p->second.ino);
+      ceph_assert(oldparent);  // call open_parents first!
+      ceph_assert(oldparent->snaprealm);
+      snapid_t r = oldparent->snaprealm->resolve_snapname(n, atino,
+                                                          std::max(first, p->second.first),
+                                                          std::min(last, p->first));
+      if (r)
+        return r;
+    }
+  }
+
+  if (parent && srnode.current_parent_since <= last)
+    return parent->resolve_snapname(n, atino, std::max(first, srnode.current_parent_since), last);
+  return 0;
+}
+
+
+// Recompute the current parent realm: the global snaprealm when srnode says
+// the parent is global, otherwise the realm found from the directory inode
+// containing this inode's parent dentry (null when there is no parent dentry).
+// If it changed, move this realm between the open_children sets and
+// invalidate the cached snaps.
+void SnapRealm::adjust_parent()
+{
+  SnapRealm *newparent = nullptr;
+  if (srnode.is_parent_global()) {
+    newparent = mdcache->get_global_snaprealm();
+  } else if (CDentry *pdn = inode->get_parent_dn()) {
+    newparent = pdn->get_dir()->get_inode()->find_snaprealm();
+  }
+
+  if (newparent == parent)
+    return;
+
+  dout(10) << "adjust_parent " << parent << " -> " << newparent << dendl;
+  if (parent)
+    parent->open_children.erase(this);
+  parent = newparent;
+  if (parent)
+    parent->open_children.insert(this);
+
+  invalidate_cached_snaps();
+}
+
+// Hand over to the newly created realm `child` everything of ours that lies
+// beneath child's inode. For a non-directory child (or when this realm is on
+// an mdsdir) only the child's own inode is moved, and only if it is in a
+// realm. For a directory child, the open child realms and the inodes with
+// caps that sit under child's inode are moved.
+void SnapRealm::split_at(SnapRealm *child)
+{
+  CInode *child_in = child->inode;
+  dout(10) << "split_at " << *child
+           << " on " << *child_in << dendl;
+
+  if (inode->is_mdsdir() || !child_in->is_dir()) {
+    // it's not a dir.
+    if (!child_in->containing_realm) {
+      // no caps, nothing to move/split.
+      dout(20) << " split no-op, no caps to move on file " << *child_in << dendl;
+      ceph_assert(!child_in->is_any_caps());
+    } else {
+      // - no open children.
+      // - only need to move this child's inode's caps.
+      child_in->move_to_realm(child);
+    }
+    return;
+  }
+
+  // it's a dir.
+
+  // split open_children
+  dout(10) << " open_children are " << open_children << dendl;
+  auto rit = open_children.begin();
+  while (rit != open_children.end()) {
+    SnapRealm *realm = *rit;
+    const bool goes_to_child =
+      realm != child && child_in->is_ancestor_of(realm->inode);
+    if (!goes_to_child) {
+      dout(20) << " keeping child realm " << *realm << " on " << *realm->inode << dendl;
+      ++rit;
+      continue;
+    }
+    dout(20) << " child gets child realm " << *realm << " on " << *realm->inode << dendl;
+    realm->parent = child;
+    child->open_children.insert(realm);
+    rit = open_children.erase(rit);
+  }
+
+  // split inodes_with_caps
+  auto cit = inodes_with_caps.begin(member_offset(CInode, item_caps));
+  while (!cit.end()) {
+    CInode *in = *cit;
+    ++cit;  // advance before `in` is possibly moved out of this realm
+    // does inode fall within the child realm?
+    if (child_in->is_ancestor_of(in)) {
+      dout(20) << " child gets " << *in << dendl;
+      in->move_to_realm(child);
+    } else {
+      dout(20) << " keeping " << *in << dendl;
+    }
+  }
+}
+
+// Fold this realm into `newparent` (the current parent when null is passed):
+// re-parent every open child realm, move every inode with caps, then close
+// the realm through the owning inode. Requires that no past children still
+// have this realm open.
+void SnapRealm::merge_to(SnapRealm *newparent)
+{
+  if (!newparent)
+    newparent = parent;
+  dout(10) << "merge to " << *newparent << " on " << *newparent->inode << dendl;
+
+  ceph_assert(open_past_children.empty());
+
+  dout(10) << " open_children are " << open_children << dendl;
+  for (auto it = open_children.begin(); it != open_children.end(); ++it) {
+    SnapRealm *realm = *it;
+    dout(20) << " child realm " << *realm << " on " << *realm->inode << dendl;
+    newparent->open_children.insert(realm);
+    realm->parent = newparent;
+  }
+  open_children.clear();
+
+  for (auto cit = inodes_with_caps.begin(member_offset(CInode, item_caps));
+       !cit.end(); ) {
+    CInode *in = *cit;
+    ++cit;  // advance before the inode is moved to the other realm
+    in->move_to_realm(newparent);
+  }
+  ceph_assert(inodes_with_caps.empty());
+
+  // delete this
+  inode->close_snaprealm();
+}
+
+const bufferlist& SnapRealm::get_snap_trace() const
+{
+ check_cache(); // rebuilds cached_snap_trace (via build_snap_trace) when the cache is stale
+ return cached_snap_trace; // reference to the mutable cache member, not a copy
+}
+
+// Rebuild cached_snap_trace: an encoded SnapRealmInfo for this realm followed
+// by the parent's trace, if there is a parent. Snap id vectors are emitted
+// newest-first. The global realm encodes only its cached snaps.
+void SnapRealm::build_snap_trace() const
+{
+  cached_snap_trace.clear();
+
+  if (global) {
+    SnapRealmInfo info(inode->ino(), 0, cached_seq, 0);
+    info.my_snaps.assign(cached_snaps.rbegin(), cached_snaps.rend());
+
+    dout(10) << "build_snap_trace my_snaps " << info.my_snaps << dendl;
+    encode(info, cached_snap_trace);
+    return;
+  }
+
+  SnapRealmInfo info(inode->ino(), srnode.created, srnode.seq, srnode.current_parent_since);
+  if (parent) {
+    info.h.parent = parent->inode->ino();
+
+    // snaps inherited from earlier parents
+    set<snapid_t> prior;
+    if (!srnode.past_parent_snaps.empty()) {
+      prior = mdcache->mds->snapclient->filter(srnode.past_parent_snaps);
+      if (srnode.is_parent_global())
+        prior.erase(prior.lower_bound(srnode.current_parent_since), prior.end());
+    } else if (!srnode.past_parents.empty()) {
+      const set<snapid_t>& all_snaps = get_snaps();
+      for (const auto& link : srnode.past_parents) {
+        auto s = all_snaps.lower_bound(link.second.first);
+        for (; s != all_snaps.end() && *s <= link.first; ++s) {
+          // snaps we own are reported through my_snaps instead
+          if (!srnode.snaps.count(*s))
+            prior.insert(*s);
+        }
+      }
+    }
+
+    if (!prior.empty()) {
+      info.prior_parent_snaps.assign(prior.rbegin(), prior.rend());
+      dout(10) << "build_snap_trace prior_parent_snaps from [1," << *prior.rbegin() << "] "
+               << info.prior_parent_snaps << dendl;
+    }
+  }
+
+  info.my_snaps.reserve(srnode.snaps.size());
+  for (auto it = srnode.snaps.rbegin(); it != srnode.snaps.rend(); ++it)
+    info.my_snaps.push_back(it->first);
+  dout(10) << "build_snap_trace my_snaps " << info.my_snaps << dendl;
+
+  encode(info, cached_snap_trace);
+
+  if (parent)
+    cached_snap_trace.append(parent->get_snap_trace());
+}
+
+// Convert any remaining past_parents intervals into the flat
+// past_parent_snaps set, then drop every entry of past_parent_snaps that is
+// no longer present in the cached snap set.
+void SnapRealm::prune_past_parents()
+{
+  dout(10) << "prune_past_parents" << dendl;
+  check_cache();
+
+  // convert past_parents to past_parent_snaps
+  if (!srnode.past_parents.empty()) {
+    auto s = cached_snaps.begin();
+    while (s != cached_snaps.end() && *s < srnode.current_parent_since) {
+      if (srnode.snaps.count(*s) == 0)
+        srnode.past_parent_snaps.insert(*s);
+      ++s;
+    }
+    srnode.past_parents.clear();
+    past_parents_dirty = true;
+  }
+
+  auto it = srnode.past_parent_snaps.begin();
+  while (it != srnode.past_parent_snaps.end()) {
+    if (cached_snaps.count(*it)) {
+      dout(10) << "prune_past_parents keeping " << *it << dendl;
+      ++it;
+    } else {
+      dout(10) << "prune_past_parents pruning " << *it << dendl;
+      it = srnode.past_parent_snaps.erase(it);
+    }
+  }
+}
+
diff --git a/src/mds/SnapRealm.h b/src/mds/SnapRealm.h
new file mode 100644
index 00000000..582daa2d
--- /dev/null
+++ b/src/mds/SnapRealm.h
@@ -0,0 +1,166 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#ifndef CEPH_MDS_SNAPREALM_H
+#define CEPH_MDS_SNAPREALM_H
+
+#include <string_view>
+
+#include "mdstypes.h"
+#include "snap.h"
+#include "include/xlist.h"
+#include "include/elist.h"
+#include "common/snap_types.h"
+#include "MDSContext.h"
+
+/*
+ * SnapRealm: snapshot realm state attached to an inode. It holds the sr_t
+ * record, links to the current parent, open children and open past parents,
+ * and lazily rebuilt caches derived from them (snap set, snap context,
+ * snap trace).
+ *
+ * The cache members are mutable and refreshed by check_cache(), so the
+ * read-only accessors below are const.
+ */
+struct SnapRealm {
+protected:
+  // cache
+  mutable snapid_t cached_seq;           // max seq over self and all past+present parents.
+  mutable snapid_t cached_last_created;  // max last_created over all past+present parents
+  mutable snapid_t cached_last_destroyed;
+  mutable set<snapid_t> cached_snaps;
+  mutable SnapContext cached_snap_context;
+  mutable bufferlist cached_snap_trace;
+  mutable inodeno_t cached_subvolume_ino = 0;
+
+  // Rebuild the cached_* members above when they are stale.
+  void check_cache() const;
+
+public:
+  // realm state
+  sr_t srnode;
+
+  // in-memory state
+  MDCache *mdcache;
+  CInode *inode;
+
+  mutable bool open = false;        // set to true once all past_parents are opened
+  bool past_parents_dirty = false;
+  bool global;
+
+  SnapRealm *parent;
+  set<SnapRealm*> open_children;    // active children that are currently open
+  set<SnapRealm*> open_past_children;  // past children that have pinned me
+  map<inodeno_t, pair<SnapRealm*, set<snapid_t> > > open_past_parents;  // these are explicitly pinned.
+  unsigned num_open_past_parents;
+
+  elist<CInode*> inodes_with_caps;             // for efficient realm splits
+  map<client_t, xlist<Capability*>* > client_caps;   // to identify clients who need snap notifications
+
+  SnapRealm(MDCache *c, CInode *in);
+
+  // True if one of this realm's own snapshots (srnode.snaps) has this name.
+  bool exists(std::string_view name) const {
+    for (const auto& p : srnode.snaps) {
+      if (p.second.name == name)
+        return true;
+    }
+    return false;
+  }
+
+  bool _open_parents(MDSContext *retryorfinish, snapid_t first=1, snapid_t last=CEPH_NOSNAP);
+  bool open_parents(MDSContext *retryorfinish);
+  void _remove_missing_parent(snapid_t snapid, inodeno_t parent, int err);
+  bool have_past_parents_open(snapid_t first=1, snapid_t last=CEPH_NOSNAP) const;
+  void add_open_past_parent(SnapRealm *parent, snapid_t last);
+  void remove_open_past_parent(inodeno_t ino, snapid_t last);
+  void close_parents();
+
+  void prune_past_parents();
+  bool has_past_parents() const {
+    return !srnode.past_parent_snaps.empty() ||
+      !srnode.past_parents.empty();
+  }
+
+  void build_snap_set() const;
+  void get_snap_info(map<snapid_t, const SnapInfo*>& infomap, snapid_t first=0, snapid_t last=CEPH_NOSNAP);
+
+  const bufferlist& get_snap_trace() const;
+  void build_snap_trace() const;
+
+  std::string_view get_snapname(snapid_t snapid, inodeno_t atino);
+  snapid_t resolve_snapname(std::string_view name, inodeno_t atino, snapid_t first=0, snapid_t last=CEPH_NOSNAP);
+
+  const set<snapid_t>& get_snaps() const;
+  const SnapContext& get_snap_context() const;
+
+  // Force the next check_cache() to rebuild by resetting cached_seq.
+  void invalidate_cached_snaps() {
+    cached_seq = 0;
+  }
+  snapid_t get_last_created() const {
+    check_cache();
+    return cached_last_created;
+  }
+  snapid_t get_last_destroyed() const {
+    check_cache();
+    return cached_last_destroyed;
+  }
+  // Largest snap id in the cached set, or 0 when there are none.
+  snapid_t get_newest_snap() const {
+    check_cache();
+    if (cached_snaps.empty())
+      return 0;
+    else
+      return *cached_snaps.rbegin();
+  }
+  snapid_t get_newest_seq() const {
+    check_cache();
+    return cached_seq;
+  }
+
+  // Smallest snap id strictly greater than `follows`, or CEPH_NOSNAP.
+  snapid_t get_snap_following(snapid_t follows) const {
+    const set<snapid_t>& s = get_snaps();  // get_snaps() runs check_cache()
+    set<snapid_t>::const_iterator p = s.upper_bound(follows);
+    if (p != s.end())
+      return *p;
+    return CEPH_NOSNAP;
+  }
+
+  // True if at least one snap id lies within [first,last].
+  bool has_snaps_in_range(snapid_t first, snapid_t last) const {
+    const set<snapid_t>& s = get_snaps();  // get_snaps() runs check_cache()
+    set<snapid_t>::const_iterator p = s.lower_bound(first);
+    return (p != s.end() && *p <= last);
+  }
+
+  inodeno_t get_subvolume_ino() const {
+    check_cache();
+    return cached_subvolume_ino;
+  }
+
+  void adjust_parent();
+
+  void split_at(SnapRealm *child);
+  void merge_to(SnapRealm *newparent);
+
+  // Link `cap` into the per-client list, creating the list on first use.
+  // The list is heap-allocated here and freed by remove_cap().
+  void add_cap(client_t client, Capability *cap) {
+    auto client_caps_entry = client_caps.find(client);
+    if (client_caps_entry == client_caps.end())
+      client_caps_entry = client_caps.emplace(client,
+                                              new xlist<Capability*>).first;
+    client_caps_entry->second->push_back(&cap->item_snaprealm_caps);
+  }
+  // Unlink `cap`; when the client's list becomes empty it is deleted and
+  // the client's map entry is erased.
+  void remove_cap(client_t client, Capability *cap) {
+    cap->item_snaprealm_caps.remove_myself();
+    auto found = client_caps.find(client);
+    if (found != client_caps.end() && found->second->empty()) {
+      delete found->second;
+      client_caps.erase(found);
+    }
+  }
+};
+
+ostream& operator<<(ostream& out, const SnapRealm &realm); // stream formatter for SnapRealm; implemented in SnapRealm.cc
+
+#endif
diff --git a/src/mds/SnapServer.cc b/src/mds/SnapServer.cc
new file mode 100644
index 00000000..d9690d40
--- /dev/null
+++ b/src/mds/SnapServer.cc
@@ -0,0 +1,476 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#include "SnapServer.h"
+#include "MDSRank.h"
+#include "osd/OSDMap.h"
+#include "osdc/Objecter.h"
+#include "mon/MonClient.h"
+
+#include "include/types.h"
+#include "messages/MMDSTableRequest.h"
+#include "messages/MRemoveSnaps.h"
+
+#include "msg/Messenger.h"
+
+#include "common/config.h"
+#include "include/ceph_assert.h"
+
+#define dout_context g_ceph_context
+#define dout_subsys ceph_subsys_mds
+#undef dout_prefix
+#define dout_prefix *_dout << "mds." << rank << ".snap "
+
+
+ void SnapServer::reset_state()
+ {
+   // Forget every snapshot and every in-flight transaction.
+   snaps.clear();
+   need_to_purge.clear();
+   pending_update.clear();
+   pending_destroy.clear();
+   pending_noop.clear();
+
+   last_snap = 1;  /* snapid 1 reserved for initial root snaprealm */
+
+   // find any removed snapshot in data pools
+   if (mds) {  // only if I'm running in a live MDS
+     snapid_t highest_pool_seq = 0;
+     mds->objecter->with_osdmap([&](const OSDMap& osdmap) {
+       for (const auto pool_id : mds->mdsmap->get_data_pools()) {
+         const pg_pool_t *pool = osdmap.get_pg_pool(pool_id);
+         // If pool isn't in OSDMap yet then it can't have any snaps
+         // needing removal, so there is nothing to look at.
+         if (pool && pool->snap_seq > highest_pool_seq) {
+           highest_pool_seq = pool->snap_seq;
+         }
+       }
+     });
+     // Never hand out a snapid at or below one a data pool already used.
+     if (highest_pool_seq > last_snap)
+       last_snap = highest_pool_seq;
+   }
+
+   last_created = last_snap;
+   last_destroyed = last_snap;
+   snaprealm_v2_since = last_snap + 1;
+
+   MDSTableServer::reset_state();
+ }
+
+
+// SERVER
+
+ void SnapServer::_prepare(const bufferlist& bl, uint64_t reqid, mds_rank_t bymds, bufferlist& out)
+ {
+   using ceph::decode;
+   using ceph::encode;
+
+   // The request starts with a 32-bit opcode; the rest of the payload
+   // depends on it. Whatever is prepared is parked under the current
+   // table `version` until _commit() or _rollback() is called for it.
+   auto it = bl.cbegin();
+   __u32 op;
+   decode(op, it);
+
+   switch (op) {
+   case TABLE_OP_CREATE:
+     {
+       SnapInfo sn;
+       decode(sn.ino, it);
+       if (it.end()) {
+         // Nothing follows the ino: record a no-op transaction.
+         pending_noop.insert(version);
+         dout(10) << "prepare v" << version << " noop" << dendl;
+       } else {
+         decode(sn.name, it);
+         decode(sn.stamp, it);
+         sn.snapid = ++last_snap;
+         pending_update[version] = sn;
+         dout(10) << "prepare v" << version << " create " << sn << dendl;
+       }
+
+       // Both flavours answer with the current last_snap.
+       encode(last_snap, out);
+     }
+     break;
+
+   case TABLE_OP_DESTROY:
+     {
+       inodeno_t ino;
+       snapid_t doomed;
+       decode(ino, it);  // not used, currently.
+       decode(doomed, it);
+
+       // bump last_snap... we use it as a version value on the snaprealm.
+       ++last_snap;
+
+       pending_destroy[version] = pair<snapid_t,snapid_t>(doomed, last_snap);
+       dout(10) << "prepare v" << version << " destroy " << doomed << " seq " << last_snap << dendl;
+
+       encode(last_snap, out);
+     }
+     break;
+
+   case TABLE_OP_UPDATE:
+     {
+       // The caller supplies the complete SnapInfo, snapid included.
+       SnapInfo sn;
+       decode(sn.ino, it);
+       decode(sn.snapid, it);
+       decode(sn.name, it);
+       decode(sn.stamp, it);
+
+       pending_update[version] = sn;
+       dout(10) << "prepare v" << version << " update " << sn << dendl;
+     }
+     break;
+
+   default:
+     ceph_abort();
+   }
+   //dump();
+ }
+
+ /*
+  * Fill in the reply payload for the prepared transaction `tid`.
+  *
+  * pbl may be null, in which case nothing is encoded and the call only
+  * verifies that `tid` is a known pending transaction:
+  *  - pending create:  encodes the new snapid (a pending update of a
+  *    snapshot that already exists encodes nothing)
+  *  - pending destroy: encodes the seq recorded at prepare time
+  *  - pending noop:    encodes the current last_snap
+  *
+  * An unknown tid aborts the daemon.
+  */
+ void SnapServer::_get_reply_buffer(version_t tid, bufferlist *pbl) const
+ {
+   using ceph::encode;
+   auto p = pending_update.find(tid);
+   if (p != pending_update.end()) {
+     if (pbl && !snaps.count(p->second.snapid)) // create
+       encode(p->second.snapid, *pbl);
+     return;
+   }
+   auto q = pending_destroy.find(tid);
+   if (q != pending_destroy.end()) {
+     if (pbl)
+       encode(q->second.second, *pbl);
+     return;
+   }
+   auto r = pending_noop.find(tid);
+   if (r != pending_noop.end()) {
+     if (pbl)
+       encode(last_snap, *pbl);
+     return;
+   }
+   // Use the always-on abort macro, as the rest of this file does, so an
+   // unknown tid is never silently ignored in builds where plain assert()
+   // is compiled out.
+   ceph_abort_msg("tid not found");
+ }
+
+ void SnapServer::_commit(version_t tid, MMDSTableRequest::const_ref req)
+ {
+   // A tid lives in exactly one of the three pending containers; apply
+   // whichever one holds it and abort if none does.
+   auto upd = pending_update.find(tid);
+   if (upd != pending_update.end()) {
+     SnapInfo &info = upd->second;
+     const bool existing = snaps.count(info.snapid) > 0;
+     string opname = existing ? "update" : "create";
+     if (existing) {
+       // An update with an unset stamp keeps the stamp already stored.
+       if (info.stamp == utime_t())
+         info.stamp = snaps[info.snapid].stamp;
+     } else if (info.snapid > last_created) {
+       last_created = info.snapid;
+     }
+     dout(7) << "commit " << tid << " " << opname << " " << info << dendl;
+     snaps[info.snapid] = info;
+     pending_update.erase(upd);
+     //dump();
+     return;
+   }
+
+   auto del = pending_destroy.find(tid);
+   if (del != pending_destroy.end()) {
+     snapid_t sn = del->second.first;
+     snapid_t seq = del->second.second;
+     dout(7) << "commit " << tid << " destroy " << sn << " seq " << seq << dendl;
+     snaps.erase(sn);
+     if (seq > last_destroyed)
+       last_destroyed = seq;
+
+     // Both the removed snapid and the seq are queued for purging in
+     // every data pool.
+     for (const auto pool_id : mds->mdsmap->get_data_pools()) {
+       auto &queued = need_to_purge[pool_id];
+       queued.insert(sn);
+       queued.insert(seq);
+     }
+
+     pending_destroy.erase(del);
+     //dump();
+     return;
+   }
+
+   if (pending_noop.count(tid)) {
+     dout(7) << "commit " << tid << " noop" << dendl;
+     pending_noop.erase(tid);
+     //dump();
+     return;
+   }
+
+   ceph_abort();
+ }
+
+ void SnapServer::_rollback(version_t tid)
+ {
+   // Discard the prepared transaction `tid` from whichever pending
+   // container holds it; an unknown tid aborts.
+   auto upd = pending_update.find(tid);
+   if (upd != pending_update.end()) {
+     const SnapInfo &info = upd->second;
+     // Reported as "update" when the snapid is already in the table.
+     string opname = snaps.count(info.snapid) ? "update" : "create";
+     dout(7) << "rollback " << tid << " " << opname << " " << info << dendl;
+     pending_update.erase(upd);
+     //dump();
+     return;
+   }
+
+   auto del = pending_destroy.find(tid);
+   if (del != pending_destroy.end()) {
+     dout(7) << "rollback " << tid << " destroy " << del->second << dendl;
+     pending_destroy.erase(del);
+     //dump();
+     return;
+   }
+
+   if (pending_noop.count(tid)) {
+     dout(7) << "rollback " << tid << " noop" << dendl;
+     pending_noop.erase(tid);
+     //dump();
+     return;
+   }
+
+   ceph_abort();
+ }
+
+ void SnapServer::_server_update(bufferlist& bl)
+ {
+   using ceph::decode;
+
+   // Payload: pool id -> snapids that no longer need purging.
+   map<int, vector<snapid_t> > purge;
+   auto it = bl.cbegin();
+   decode(purge, it);
+
+   dout(7) << "_server_update purged " << purge << dendl;
+   for (const auto& pool_entry : purge) {
+     auto& outstanding = need_to_purge[pool_entry.first];
+     for (const auto& snapid : pool_entry.second)
+       outstanding.erase(snapid);
+     // Drop pools that have nothing left to purge.
+     if (outstanding.empty())
+       need_to_purge.erase(pool_entry.first);
+   }
+ }
+
+ bool SnapServer::_notify_prep(version_t tid)
+ {
+   using ceph::encode;
+   ceph_assert(version == tid);
+
+   // Full ('F') snapshot of the table state, using the same layout that
+   // handle_query() sends for a full reply.
+   bufferlist payload;
+   char type = 'F';
+   encode(type, payload);
+   encode(snaps, payload);
+   encode(pending_update, payload);
+   encode(pending_destroy, payload);
+   encode(last_created, payload);
+   encode(last_destroyed, payload);
+
+   // One NOTIFY_PREP message per active client rank.
+   for (auto &client : active_clients) {
+     auto m = MMDSTableRequest::create(table, TABLESERVER_OP_NOTIFY_PREP, 0, version);
+     m->bl = payload;
+     mds->send_message_mds(m, client);
+   }
+   return true;
+ }
+
+ void SnapServer::handle_query(const MMDSTableRequest::const_ref &req)
+ {
+   using ceph::encode;
+   using ceph::decode;
+
+   // The query begins with a one-byte opcode.
+   auto it = req->bl.cbegin();
+   char op;
+   decode(op, it);
+
+   auto reply = MMDSTableRequest::create(table, TABLESERVER_OP_QUERY_REPLY, req->reqid, version);
+
+   switch (op) {
+   case 'F': { // full
+     version_t have_version;
+     decode(have_version, it);
+     ceph_assert(have_version <= version);
+
+     // 'U' when the requester is already at our version, otherwise 'F'
+     // followed by the whole table state.
+     const bool up_to_date = (have_version == version);
+     char type = up_to_date ? 'U' : 'F';
+     encode(type, reply->bl);
+     if (!up_to_date) {
+       encode(snaps, reply->bl);
+       encode(pending_update, reply->bl);
+       encode(pending_destroy, reply->bl);
+       encode(last_created, reply->bl);
+       encode(last_destroyed, reply->bl);
+     }
+     // FIXME: implement incremental change
+     break;
+   }
+   default:
+     ceph_abort();
+   }
+
+   mds->send_message(reply, req->get_connection());
+ }
+
+ void SnapServer::check_osd_map(bool force)
+ {
+   // Unless forced, skip the scan when the table version has not moved
+   // since the previous check.
+   const bool unchanged = (version == last_checked_osdmap);
+   if (unchanged && !force) {
+     dout(10) << "check_osd_map - version unchanged" << dendl;
+     return;
+   }
+   dout(10) << "check_osd_map need_to_purge=" << need_to_purge << dendl;
+
+   map<int, vector<snapid_t> > all_purge;   // still has to be removed
+   map<int, vector<snapid_t> > all_purged;  // already gone per the OSDMap
+
+   mds->objecter->with_osdmap(
+     [this, &all_purged, &all_purge](const OSDMap& osdmap) {
+       for (const auto& entry : need_to_purge) {
+         const int pool_id = entry.first;
+         const auto& snapids = entry.second;
+         const pg_pool_t *pool = osdmap.get_pg_pool(pool_id);
+         if (!pool) {
+           // The pool is gone. So are the snapshots.
+           all_purged[pool_id].assign(snapids.begin(), snapids.end());
+           continue;
+         }
+
+         for (const auto& snapid : snapids) {
+           if (!pool->is_removed_snap(snapid)) {
+             all_purge[pool_id].push_back(snapid);
+             continue;
+           }
+           dout(10) << " osdmap marks " << snapid << " as removed" << dendl;
+           all_purged[pool_id].push_back(snapid);
+         }
+       }
+     });
+
+   if (!all_purged.empty()) {
+     // prepare to remove from need_to_purge list
+     using ceph::encode;
+     bufferlist bl;
+     encode(all_purged, bl);
+     do_server_update(bl);
+   }
+
+   if (!all_purge.empty()) {
+     // Ask the monitors to remove whatever is still outstanding.
+     dout(10) << "requesting removal of " << all_purge << dendl;
+     auto m = MRemoveSnaps::create(all_purge);
+     mon_client->send_mon_message(m.detach());
+   }
+
+   last_checked_osdmap = version;
+ }
+
+
+ void SnapServer::dump(Formatter *f) const
+ {
+   // Emits one "snapserver" object holding the counters followed by each
+   // container, in a fixed order.
+   f->open_object_section("snapserver");
+
+   f->dump_int("last_snap", last_snap);
+   f->dump_int("last_created", last_created);
+   f->dump_int("last_destroyed", last_destroyed);
+
+   f->open_array_section("pending_noop");
+   for (const auto& v : pending_noop) {
+     f->dump_unsigned("version", v);
+   }
+   f->close_section();
+
+   f->open_array_section("snaps");
+   for (const auto& entry : snaps) {
+     f->open_object_section("snap");
+     entry.second.dump(f);
+     f->close_section();
+   }
+   f->close_section();
+
+   // need_to_purge is an object keyed by the decimal pool id, each value
+   // being an array of snapids.
+   f->open_object_section("need_to_purge");
+   for (const auto& pool_entry : need_to_purge) {
+     const std::string pool_id = std::to_string(pool_entry.first);
+     f->open_array_section(pool_id.c_str());
+     for (const auto& snapid : pool_entry.second) {
+       f->dump_unsigned("snapid", snapid.val);
+     }
+     f->close_section();
+   }
+   f->close_section();
+
+   f->open_array_section("pending_update");
+   for (const auto& entry : pending_update) {
+     f->open_object_section("snap");
+     f->dump_unsigned("version", entry.first);
+     f->open_object_section("snapinfo");
+     entry.second.dump(f);
+     f->close_section();
+     f->close_section();
+   }
+   f->close_section();
+
+   f->open_array_section("pending_destroy");
+   for (const auto& entry : pending_destroy) {
+     f->open_object_section("snap");
+     f->dump_unsigned("version", entry.first);
+     f->dump_unsigned("removed_snap", entry.second.first);
+     f->dump_unsigned("seq", entry.second.second);
+     f->close_section();
+   }
+   f->close_section();
+
+   f->close_section();
+ }
+
+ void SnapServer::generate_test_instances(list<SnapServer*>& ls)
+ {
+   // Borrow the last SnapInfo test instance as sample data, then free
+   // all the instances SnapInfo generated for us.
+   list<SnapInfo*> samples;
+   SnapInfo::generate_test_instances(samples);
+   SnapInfo populated_snapinfo = *(samples.back());
+   for (SnapInfo *sample : samples) {
+     delete sample;
+   }
+
+   // Instance 1: default-constructed.
+   ls.push_back(new SnapServer());
+
+   // Instance 2: one entry in each container.
+   SnapServer *populated = new SnapServer();
+   populated->last_snap = 123;
+   populated->snaps[456] = populated_snapinfo;
+   populated->need_to_purge[2].insert(012);  // octal literal, i.e. snapid 10
+   populated->pending_update[234] = populated_snapinfo;
+   populated->pending_destroy[345].first = 567;
+   populated->pending_destroy[345].second = 768;
+   populated->pending_noop.insert(890);
+   ls.push_back(populated);
+ }
+
+ bool SnapServer::force_update(snapid_t last, snapid_t v2_since,
+                               map<snapid_t, SnapInfo>& _snaps)
+ {
+   // Overwrite the table with externally supplied state. last_snap and
+   // snaprealm_v2_since only ever move forward; the snap map is replaced
+   // whenever it differs. Returns true if anything changed, in which case
+   // all pending/purge bookkeeping is dropped as well.
+   bool changed = false;
+
+   if (last > last_snap) {
+     derr << " updating last_snap " << last_snap << " -> " << last << dendl;
+     last_snap = last;
+     last_created = last;
+     last_destroyed = last;
+     changed = true;
+   }
+
+   if (v2_since > snaprealm_v2_since) {
+     derr << " updating snaprealm_v2_since " << snaprealm_v2_since
+          << " -> " << v2_since << dendl;
+     snaprealm_v2_since = v2_since;
+     changed = true;
+   }
+
+   if (snaps != _snaps) {
+     derr << " updating snaps {" << snaps << "} -> {" << _snaps << "}" << dendl;
+     snaps = _snaps;
+     changed = true;
+   }
+
+   if (!changed)
+     return false;
+
+   need_to_purge.clear();
+   pending_update.clear();
+   pending_destroy.clear();
+   pending_noop.clear();
+   MDSTableServer::reset_state();
+   return true;
+ }
diff --git a/src/mds/SnapServer.h b/src/mds/SnapServer.h
new file mode 100644
index 00000000..f0a92ce8
--- /dev/null
+++ b/src/mds/SnapServer.h
@@ -0,0 +1,149 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#ifndef CEPH_SNAPSERVER_H
+#define CEPH_SNAPSERVER_H
+
+#include "MDSTableServer.h"
+#include "snap.h"
+
+class MDSRank;
+class MonClient;
+
+ class SnapServer : public MDSTableServer { // table server for TABLE_SNAP: hands out snapids and tracks snapshots and their purging
+ protected:
+ MonClient *mon_client = nullptr; // used by check_osd_map() to send MRemoveSnaps; stays null for the default ctor
+ snapid_t last_snap = 0; // most recently allocated id; also bumped on destroy to serve as a seq
+ snapid_t last_created, last_destroyed; // highest committed create snapid / destroy seq
+ snapid_t snaprealm_v2_since; // CEPH_NOSNAP when decoded from struct_v < 5, see upgrade_format()
+ map<snapid_t, SnapInfo> snaps; // committed snapshots by snapid
+ map<int, set<snapid_t> > need_to_purge; // pool id -> snapids queued by _commit(), drained by _server_update()
+
+ map<version_t, SnapInfo> pending_update; // prepared create/update transactions, keyed by table version
+ map<version_t, pair<snapid_t,snapid_t> > pending_destroy; // (removed_snap, seq)
+ set<version_t> pending_noop; // prepared create requests that carried only an ino
+
+ version_t last_checked_osdmap; // table version at the time of the last check_osd_map() pass
+
+ bool root_scrubbed = false; // all snaprealms under root are converted?
+ bool mdsdir_scrubbed = false; // all snaprealms under ~mds0 are converted?
+
+ void encode_server_state(bufferlist& bl) const override { // field order below is the persisted format; keep in sync with decode_server_state()
+ ENCODE_START(5, 3, bl);
+ encode(last_snap, bl);
+ encode(snaps, bl);
+ encode(need_to_purge, bl);
+ encode(pending_update, bl);
+ encode(pending_destroy, bl);
+ encode(pending_noop, bl);
+ encode(last_created, bl);
+ encode(last_destroyed, bl);
+ encode(snaprealm_v2_since, bl);
+ ENCODE_FINISH(bl);
+ }
+ void decode_server_state(bufferlist::const_iterator& bl) override { // accepts older encodings; fields missing from them get the fallbacks below
+ DECODE_START_LEGACY_COMPAT_LEN(5, 3, 3, bl);
+ decode(last_snap, bl);
+ decode(snaps, bl);
+ decode(need_to_purge, bl);
+ decode(pending_update, bl);
+ if (struct_v >= 2)
+ decode(pending_destroy, bl);
+ else { // struct_v 1 stored only the removed snapid; the seq half of each pair stays default-constructed
+ map<version_t, snapid_t> t;
+ decode(t, bl);
+ for (map<version_t, snapid_t>::iterator p = t.begin(); p != t.end(); ++p)
+ pending_destroy[p->first].first = p->second;
+ }
+ decode(pending_noop, bl);
+ if (struct_v >= 4) {
+ decode(last_created, bl);
+ decode(last_destroyed, bl);
+ } else { // before struct_v 4 these were not stored; fall back to last_snap
+ last_created = last_snap;
+ last_destroyed = last_snap;
+ }
+ if (struct_v >= 5)
+ decode(snaprealm_v2_since, bl);
+ else
+ snaprealm_v2_since = CEPH_NOSNAP; // sentinel that upgrade_format() later replaces
+
+ DECODE_FINISH(bl);
+ }
+
+ // server bits
+ void _prepare(const bufferlist &bl, uint64_t reqid, mds_rank_t bymds, bufferlist &out) override;
+ void _get_reply_buffer(version_t tid, bufferlist *pbl) const override;
+ void _commit(version_t tid, MMDSTableRequest::const_ref req) override;
+ void _rollback(version_t tid) override;
+ void _server_update(bufferlist& bl) override;
+ bool _notify_prep(version_t tid) override;
+ void handle_query(const MMDSTableRequest::const_ref &m) override;
+
+ public:
+ SnapServer(MDSRank *m, MonClient *monc)
+ : MDSTableServer(m, TABLE_SNAP), mon_client(monc), last_checked_osdmap(0) {}
+ SnapServer() : MDSTableServer(NULL, TABLE_SNAP), last_checked_osdmap(0) {} // no MDS and no MonClient attached
+
+ void reset_state() override;
+
+ bool upgrade_format() { // returns true if anything was changed
+ // upgraded from old filesystem
+ ceph_assert(is_active());
+ ceph_assert(last_snap > 0);
+ bool upgraded = false;
+ if (get_version() == 0) {
+ // version 0 confuses snapclient code
+ reset();
+ upgraded = true;
+ }
+ if (snaprealm_v2_since == CEPH_NOSNAP) {
+ // new snapshots will have new format snaprealms
+ snaprealm_v2_since = last_snap + 1;
+ upgraded = true;
+ }
+ return upgraded;
+ }
+
+ void check_osd_map(bool force);
+
+ void mark_base_recursively_scrubbed(inodeno_t ino) { // accepts only the root ino or this rank's mdsdir ino; anything else aborts
+ if (ino == MDS_INO_ROOT)
+ root_scrubbed = true;
+ else if (ino == MDS_INO_MDSDIR(rank))
+ mdsdir_scrubbed = true;
+ else
+ ceph_abort();
+ }
+ bool can_allow_multimds_snaps() const { // true if both bases were scrubbed, or no snapshot has an id below snaprealm_v2_since
+ return (root_scrubbed && mdsdir_scrubbed) ||
+ snaps.empty() || snaps.begin()->first >= snaprealm_v2_since;
+ }
+
+ void encode(bufferlist& bl) const { // thin wrapper over encode_server_state()
+ encode_server_state(bl);
+ }
+ void decode(bufferlist::const_iterator& bl) { // thin wrapper over decode_server_state()
+ decode_server_state(bl);
+ }
+
+ void dump(Formatter *f) const;
+ static void generate_test_instances(list<SnapServer*>& ls);
+
+ bool force_update(snapid_t last, snapid_t v2_since,
+ map<snapid_t, SnapInfo>& _snaps);
+ };
+WRITE_CLASS_ENCODER(SnapServer)
+
+#endif
diff --git a/src/mds/StrayManager.cc b/src/mds/StrayManager.cc
new file mode 100644
index 00000000..444e4ccc
--- /dev/null
+++ b/src/mds/StrayManager.cc
@@ -0,0 +1,759 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2015 Red Hat
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+
+#include "common/perf_counters.h"
+
+#include "mds/MDSRank.h"
+#include "mds/MDCache.h"
+#include "mds/MDLog.h"
+#include "mds/CDir.h"
+#include "mds/CDentry.h"
+#include "events/EUpdate.h"
+#include "messages/MClientRequest.h"
+
+#include "StrayManager.h"
+
+#define dout_context g_ceph_context
+#define dout_subsys ceph_subsys_mds
+#undef dout_prefix
+#define dout_prefix _prefix(_dout, mds)
+ // Writes the "mds.<nodeid>.cache.strays " prefix to the debug stream
+ // (invoked through the dout_prefix macro).
+ static ostream& _prefix(std::ostream *_dout, MDSRank *mds) {
+   std::ostream& out = *_dout;
+   out << "mds." << mds->get_nodeid() << ".cache.strays ";
+   return out;
+ }
+
+ // Base for I/O completions owned by a StrayManager; get_mds() hands
+ // back the manager's MDS rank.
+ class StrayManagerIOContext : public virtual MDSIOContextBase {
+ public:
+   explicit StrayManagerIOContext(StrayManager *sm_) : sm(sm_) {}
+ protected:
+   MDSRank *get_mds() override { return sm->mds; }
+   StrayManager *sm;
+ };
+
+ // Base for journal completions owned by a StrayManager; get_mds()
+ // hands back the manager's MDS rank.
+ class StrayManagerLogContext : public virtual MDSLogContextBase {
+ public:
+   explicit StrayManagerLogContext(StrayManager *sm_) : sm(sm_) {}
+ protected:
+   MDSRank *get_mds() override { return sm->mds; }
+   StrayManager *sm;
+ };
+
+ // Base for plain MDSContext callbacks owned by a StrayManager;
+ // get_mds() hands back the manager's MDS rank.
+ class StrayManagerContext : public virtual MDSContext {
+ public:
+   explicit StrayManagerContext(StrayManager *sm_) : sm(sm_) {}
+ protected:
+   MDSRank *get_mds() override { return sm->mds; }
+   StrayManager *sm;
+ };
+
+
+/**
+ * Context wrapper for _purge_stray_purged completion
+ */
+class C_IO_PurgeStrayPurged : public StrayManagerIOContext {
+ CDentry *dn;
+ bool only_head;
+public:
+ C_IO_PurgeStrayPurged(StrayManager *sm_, CDentry *d, bool oh) :
+ StrayManagerIOContext(sm_), dn(d), only_head(oh) { }
+ void finish(int r) override {
+ ceph_assert(r == 0 || r == -ENOENT);
+ sm->_purge_stray_purged(dn, only_head);
+ }
+ void print(ostream& out) const override {
+ CInode *in = dn->get_projected_linkage()->get_inode();
+ out << "purge_stray(" << in->ino() << ")";
+ }
+};
+
+
+ void StrayManager::purge(CDentry *dn)
+ {
+   CInode *in = dn->get_projected_linkage()->get_inode();
+   dout(10) << __func__ << " " << *dn << " " << *in << dendl;
+   ceph_assert(!dn->is_replicated());
+
+   // CHEAT.  there's no real need to journal our intent to purge, since
+   // that is implicit in the dentry's presence and non-use in the stray
+   // dir.  on recovery, we'll need to re-eval all strays anyway.
+
+   // Describe the work for the purge queue.
+   PurgeItem item;
+   item.ino = in->inode.ino;
+   item.stamp = ceph_clock_now();
+
+   if (in->is_dir()) {
+     item.action = PurgeItem::PURGE_DIR;
+     item.fragtree = in->dirfragtree;
+   } else {
+     item.action = PurgeItem::PURGE_FILE;
+
+     // Use the realm's snap context if there is one, else an empty one.
+     SnapContext nullsnapc;
+     const SnapContext *snapc = &nullsnapc;
+     if (SnapRealm *realm = in->find_snaprealm()) {
+       dout(10) << " realm " << *realm << dendl;
+       snapc = &realm->get_snap_context();
+     } else {
+       dout(10) << " NO realm, using null context" << dendl;
+       ceph_assert(in->last == CEPH_NOSNAP);
+     }
+
+     uint64_t purge_to = 0;
+     if (in->is_file()) {
+       purge_to = in->inode.get_max_size();
+       purge_to = std::max(in->inode.size, purge_to);
+       // when truncating a file, the filer does not delete stripe objects that are
+       // truncated to zero. so we need to purge stripe objects up to the max size
+       // the file has ever been.
+       purge_to = std::max(in->inode.max_size_ever, purge_to);
+     }
+
+     auto projected = in->get_projected_inode();
+
+     item.size = purge_to;
+     item.layout = projected->layout;
+     item.old_pools.clear();
+     for (const auto &pool : projected->old_pools)
+       item.old_pools.insert(pool);
+     item.snapc = *snapc;
+   }
+
+   // only_head=false: completion takes the full purge path.
+   purge_queue.push(item, new C_IO_PurgeStrayPurged(this, dn, false));
+ }
+
+ // Journal completion for the "purge_stray" EUpdate; forwards to
+ // _purge_stray_logged() with the values captured at submit time.
+ class C_PurgeStrayLogged : public StrayManagerLogContext {
+   CDentry *dn;
+   version_t pdv;
+   LogSegment *ls;
+ public:
+   C_PurgeStrayLogged(StrayManager *sm_, CDentry *d, version_t v, LogSegment *s)
+     : StrayManagerLogContext(sm_), dn(d), pdv(v), ls(s) {}
+
+   void finish(int r) override {
+     sm->_purge_stray_logged(dn, pdv, ls);
+   }
+ };
+
+ // Journal completion for the "purge_stray truncate" EUpdate; forwards
+ // to _truncate_stray_logged().
+ class C_TruncateStrayLogged : public StrayManagerLogContext {
+   CDentry *dn;
+   LogSegment *ls;
+ public:
+   C_TruncateStrayLogged(StrayManager *sm, CDentry *d, LogSegment *s)
+     : StrayManagerLogContext(sm), dn(d), ls(s) {}
+
+   void finish(int r) override {
+     sm->_truncate_stray_logged(dn, ls);
+   }
+ };
+
+ void StrayManager::_purge_stray_purged( // runs from C_IO_PurgeStrayPurged::finish(); journals either a head truncate or the dentry removal
+ CDentry *dn, bool only_head)
+ {
+ CInode *in = dn->get_projected_linkage()->get_inode();
+ dout(10) << "_purge_stray_purged " << *dn << " " << *in << dendl;
+
+ logger->inc(l_mdc_strays_enqueued);
+ num_strays_enqueuing--; // balances the increment done in enqueue()
+ logger->set(l_mdc_num_strays_enqueuing, num_strays_enqueuing);
+
+ if (only_head) {
+ /* This was a ::truncate */
+ EUpdate *le = new EUpdate(mds->mdlog, "purge_stray truncate");
+ mds->mdlog->start_entry(le);
+
+ auto &pi = in->project_inode(); // zero every size-related field on the projected inode
+ pi.inode.size = 0;
+ pi.inode.max_size_ever = 0;
+ pi.inode.client_ranges.clear();
+ pi.inode.truncate_size = 0;
+ pi.inode.truncate_from = 0;
+ pi.inode.version = in->pre_dirty();
+
+ le->metablob.add_dir_context(dn->dir);
+ le->metablob.add_primary_dentry(dn, in, true);
+
+ mds->mdlog->submit_entry(le,
+ new C_TruncateStrayLogged(
+ this, dn, mds->mdlog->get_current_segment()));
+ } else {
+ if (in->get_num_ref() != (int)in->is_dirty() ||
+ dn->get_num_ref() != (int)dn->is_dirty() + !!in->get_num_ref() + 1/*PIN_PURGING*/) {
+ // Nobody should be taking new references to an inode while it
+ // is being purged; any extra reference found here is fatal.
+
+ derr << "Rogue reference after purge to " << *dn << dendl;
+ ceph_abort_msg("rogue reference to purging inode");
+ }
+
+ // kill dentry.
+ version_t pdv = dn->pre_dirty();
+ dn->push_projected_linkage(); // NULL
+
+ EUpdate *le = new EUpdate(mds->mdlog, "purge_stray");
+ mds->mdlog->start_entry(le);
+
+ // update dirfrag fragstat, rstat
+ CDir *dir = dn->get_dir();
+ fnode_t *pf = dir->project_fnode();
+ pf->version = dir->pre_dirty();
+ if (in->is_dir())
+ pf->fragstat.nsubdirs--;
+ else
+ pf->fragstat.nfiles--;
+ pf->rstat.sub(in->inode.accounted_rstat);
+
+ le->metablob.add_dir_context(dn->dir);
+ EMetaBlob::dirlump& dl = le->metablob.add_dir(dn->dir, true);
+ le->metablob.add_null_dentry(dl, dn, true);
+ le->metablob.add_destroyed_inode(in->ino());
+
+ mds->mdlog->submit_entry(le, new C_PurgeStrayLogged(this, dn, pdv,
+ mds->mdlog->get_current_segment()));
+ }
+ }
+
+ void StrayManager::_purge_stray_logged(CDentry *dn, version_t pdv, LogSegment *ls) // runs from C_PurgeStrayLogged::finish(); drops the dentry linkage and the inode from cache
+ {
+ CInode *in = dn->get_linkage()->get_inode(); // non-projected linkage: still points at the inode here
+ dout(10) << "_purge_stray_logged " << *dn << " " << *in << dendl;
+
+ ceph_assert(!in->state_test(CInode::STATE_RECOVERING));
+
+ bool new_dn = dn->is_new(); // sampled up front; decides below whether the dentry is removed too
+
+ // unlink
+ ceph_assert(dn->get_projected_linkage()->is_null());
+ dn->dir->unlink_inode(dn, !new_dn);
+ dn->pop_projected_linkage();
+ dn->mark_dirty(pdv, ls);
+
+ dn->dir->pop_and_dirty_projected_fnode(ls);
+
+ in->state_clear(CInode::STATE_ORPHAN);
+ dn->state_clear(CDentry::STATE_PURGING | CDentry::STATE_PURGINGPINNED);
+ dn->put(CDentry::PIN_PURGING); // releases the pin taken in enqueue()
+
+ // drop dentry?
+ if (new_dn) {
+ dout(20) << " dn is new, removing" << dendl;
+ dn->mark_clean();
+ dn->dir->remove_dentry(dn);
+ }
+
+ // drop inode
+ inodeno_t ino = in->ino(); // saved because `in` is handed to remove_inode() just below
+ if (in->is_dirty())
+ in->mark_clean();
+ mds->mdcache->remove_inode(in);
+
+ if (mds->is_stopping())
+ mds->mdcache->shutdown_export_stray_finish(ino);
+ }
+
+ void StrayManager::enqueue(CDentry *dn, bool trunc)
+ {
+   CDentry::linkage_t *dnl = dn->get_projected_linkage();
+   ceph_assert(dnl);
+   CInode *in = dnl->get_inode();
+   ceph_assert(in);
+
+   /* We consider a stray to be purging as soon as it is enqueued, to avoid
+    * enqueuing it twice */
+   dn->state_set(CDentry::STATE_PURGING);
+   in->state_set(CInode::STATE_PURGING);
+
+   /* We must clear this as soon as enqueuing it, to prevent the journal
+    * expiry code from seeing a dirty parent and trying to write a backtrace */
+   if (!trunc && in->is_dirty_parent()) {
+     in->clear_dirty_parent();
+   }
+
+   dout(20) << __func__ << ": purging dn: " << *dn << dendl;
+
+   // Take PIN_PURGING once only; STATE_PURGINGPINNED records that we
+   // already hold it.
+   const bool already_pinned = dn->state_test(CDentry::STATE_PURGINGPINNED);
+   if (!already_pinned) {
+     dn->get(CDentry::PIN_PURGING);
+     dn->state_set(CDentry::STATE_PURGINGPINNED);
+   }
+
+   ++num_strays_enqueuing;
+   logger->set(l_mdc_num_strays_enqueuing, num_strays_enqueuing);
+
+   // Resources are available, acquire them and execute the purge
+   _enqueue(dn, trunc);
+
+   dout(10) << __func__ << ": purging this dentry immediately: "
+            << *dn << dendl;
+ }
+
+ // Passed to SnapRealm::open_parents() by _enqueue(); re-enters
+ // _enqueue() with the same arguments when it fires.
+ class C_OpenSnapParents : public StrayManagerContext {
+   CDentry *dn;
+   bool trunc;
+ public:
+   C_OpenSnapParents(StrayManager *sm_, CDentry *dn_, bool t)
+     : StrayManagerContext(sm_), dn(dn_), trunc(t) {}
+
+   void finish(int r) override {
+     sm->_enqueue(dn, trunc);
+   }
+ };
+
+ void StrayManager::_enqueue(CDentry *dn, bool trunc)
+ {
+   ceph_assert(started);
+
+   CInode *in = dn->get_linkage()->get_inode();
+   auto realm = in->snaprealm;
+   if (realm && !realm->have_past_parents_open()) {
+     // Past parents must be open first. If open_parents() cannot finish
+     // right away, C_OpenSnapParents calls us again later.
+     if (!realm->open_parents(new C_OpenSnapParents(this, dn, trunc))) {
+       // this can happen if the dentry had been trimmed from cache.
+       return;
+     }
+   }
+
+   if (trunc)
+     truncate(dn);
+   else
+     purge(dn);
+ }
+
+ void StrayManager::queue_delayed(CDentry *dn)
+ {
+   // Ignored before activate(), and for a dentry that is in the middle
+   // of eval_stray().
+   if (!started || dn->state_test(CDentry::STATE_EVALUATINGSTRAY))
+     return;
+
+   // Already queued: nothing to do.
+   if (dn->item_stray.is_on_list())
+     return;
+
+   delayed_eval_stray.push_back(&dn->item_stray);
+   ++num_strays_delayed;
+   logger->set(l_mdc_num_strays_delayed, num_strays_delayed);
+ }
+
+ void StrayManager::advance_delayed()
+ {
+   if (!started)
+     return;
+
+   // Drain the delayed list, evaluating each dentry that still has a
+   // linkage.
+   while (!delayed_eval_stray.empty()) {
+     CDentry *dn = delayed_eval_stray.front();
+     dn->item_stray.remove_myself();
+     --num_strays_delayed;
+
+     if (!dn->get_projected_linkage()->is_null()) {
+       eval_stray(dn);
+       continue;
+     }
+
+     /* A special case: a stray dentry can go null if its inode is being
+      * re-linked into another MDS's stray dir during a shutdown migration. */
+     dout(4) << __func__ << ": delayed dentry is now null: " << *dn << dendl;
+   }
+   logger->set(l_mdc_num_strays_delayed, num_strays_delayed);
+ }
+
+ void StrayManager::set_num_strays(uint64_t num) // sets the stray count and publishes it to the perf counter
+ {
+ ceph_assert(!started); // only legal before activate() has run
+ num_strays = num;
+ logger->set(l_mdc_num_strays, num_strays);
+ }
+
+ void StrayManager::notify_stray_created() // bumps the stray count, its gauge, and the strays-created counter
+ {
+ num_strays++;
+ logger->set(l_mdc_num_strays, num_strays);
+ logger->inc(l_mdc_strays_created);
+ }
+
+ void StrayManager::notify_stray_removed() // decrements the stray count and republishes the gauge
+ {
+ num_strays--; // NOTE(review): no guard against going below zero here -- confirm callers always pair this with a create
+ logger->set(l_mdc_num_strays, num_strays);
+ }
+
+ // Callback that re-runs eval_stray() on the stored dentry.
+ struct C_EvalStray : public StrayManagerContext {
+   CDentry *dn;
+
+   C_EvalStray(StrayManager *sm_, CDentry *d)
+     : StrayManagerContext(sm_), dn(d) {}
+
+   void finish(int r) override { sm->eval_stray(dn); }
+ };
+
+ // Passed to SnapRealm::open_parents() by _eval_stray(); re-runs
+ // eval_stray() on the stored dentry when it fires.
+ struct C_MDC_EvalStray : public StrayManagerContext {
+   CDentry *dn;
+
+   C_MDC_EvalStray(StrayManager *sm_, CDentry *d)
+     : StrayManagerContext(sm_), dn(d) {}
+
+   void finish(int r) override { sm->eval_stray(dn); }
+ };
+
+ bool StrayManager::_eval_stray(CDentry *dn) // returns true only when the nlink==0 path ran to completion; false whenever the stray must wait
+ {
+ dout(10) << "eval_stray " << *dn << dendl;
+ CDentry::linkage_t *dnl = dn->get_projected_linkage();
+ ceph_assert(dnl->is_primary());
+ dout(10) << " inode is " << *dnl->get_inode() << dendl;
+ CInode *in = dnl->get_inode();
+ ceph_assert(in);
+ ceph_assert(!in->state_test(CInode::STATE_REJOINUNDEF));
+
+ // The only dentries eligible for purging are those
+ // in the stray directories
+ ceph_assert(dn->get_dir()->get_inode()->is_stray());
+
+ // Inode may not pass through this function if it
+ // was already identified for purging (i.e. cannot
+ // call eval_stray() after purge())
+ ceph_assert(!dn->state_test(CDentry::STATE_PURGING));
+
+ if (!dn->is_auth())
+ return false;
+
+ if (!started)
+ return false;
+
+ if (dn->item_stray.is_on_list()) { // being evaluated now, so take it off the delayed list
+ dn->item_stray.remove_myself();
+ num_strays_delayed--;
+ logger->set(l_mdc_num_strays_delayed, num_strays_delayed);
+ }
+
+ // purge?
+ if (in->inode.nlink == 0) {
+ // past snaprealm parents imply snapped dentry remote links.
+ // only important for directories. normal file data snaps are handled
+ // by the object store.
+ if (in->snaprealm) {
+ if (!in->snaprealm->have_past_parents_open() &&
+ !in->snaprealm->open_parents(new C_MDC_EvalStray(this, dn))) {
+ return false; // C_MDC_EvalStray calls eval_stray() again when it fires
+ }
+ in->snaprealm->prune_past_parents();
+ in->purge_stale_snap_data(in->snaprealm->get_snaps());
+ }
+ if (in->is_dir()) {
+ if (in->snaprealm && in->snaprealm->has_past_parents()) {
+ dout(20) << " directory has past parents "
+ << in->snaprealm << dendl;
+ if (in->state_test(CInode::STATE_MISSINGOBJS)) {
+ mds->clog->error() << "previous attempt at committing dirfrag of ino "
+ << in->ino() << " has failed, missing object";
+ mds->handle_write_error(-ENOENT);
+ }
+ return false; // not until some snaps are deleted.
+ }
+
+ mds->mdcache->clear_dirty_bits_for_stray(in);
+
+ if (!in->remote_parents.empty()) {
+ // unlink any stale remote snap dentry.
+ for (auto it = in->remote_parents.begin(); it != in->remote_parents.end(); ) {
+ CDentry *remote_dn = *it;
+ ++it; // advance first; NOTE(review): presumably unlink_remote() drops remote_dn from remote_parents -- confirm
+ ceph_assert(remote_dn->last != CEPH_NOSNAP);
+ remote_dn->unlink_remote(remote_dn->get_linkage());
+ }
+ }
+ }
+ if (dn->is_replicated()) {
+ dout(20) << " replicated" << dendl;
+ return false;
+ }
+ if (dn->is_any_leases() || in->is_any_caps()) {
+ dout(20) << " caps | leases" << dendl;
+ return false; // wait
+ }
+ if (in->state_test(CInode::STATE_NEEDSRECOVER) ||
+ in->state_test(CInode::STATE_RECOVERING)) {
+ dout(20) << " pending recovery" << dendl;
+ return false; // don't mess with file size probing
+ }
+ if (in->get_num_ref() > (int)in->is_dirty() + (int)in->is_dirty_parent()) { // refs beyond the dirty / dirty-parent ones block the purge
+ dout(20) << " too many inode refs" << dendl;
+ return false;
+ }
+ if (dn->get_num_ref() > (int)dn->is_dirty() + !!in->get_num_ref()) {
+ dout(20) << " too many dn refs" << dendl;
+ return false;
+ }
+ // don't purge multiversion inode with snap data
+ if (in->snaprealm && in->snaprealm->has_past_parents() &&
+ !in->old_inodes.empty()) {
+ // A file with snapshots: we will truncate the HEAD revision
+ // but leave the metadata intact.
+ ceph_assert(!in->is_dir());
+ dout(20) << " file has past parents "
+ << in->snaprealm << dendl;
+ if (in->is_file() && in->get_projected_inode()->size > 0) {
+ enqueue(dn, true); // truncate head objects
+ }
+ } else {
+ // A straightforward file, ready to be purged. Enqueue it.
+ if (in->is_dir()) {
+ in->close_dirfrags();
+ }
+
+ enqueue(dn, false);
+ }
+
+ return true;
+ } else {
+ /*
+ * Where a stray has some links, they should be remotes, check
+ * if we can do anything with them if we happen to have them in
+ * cache.
+ */
+ _eval_stray_remote(dn, NULL);
+ return false;
+ }
+ }
+
+ void StrayManager::activate() // flips `started` (checked by the queue_/eval_ entry points) and activates the purge queue
+ {
+ dout(10) << __func__ << dendl;
+ started = true;
+ purge_queue.activate();
+ }
+
+ bool StrayManager::eval_stray(CDentry *dn)
+ {
+   // STATE_EVALUATINGSTRAY acts as a re-entrancy guard: a nested call
+   // for the same dentry returns false without doing anything.
+   const auto guard = CDentry::STATE_EVALUATINGSTRAY;
+   if (dn->state_test(guard))
+     return false;
+
+   dn->state_set(guard);
+   const bool result = _eval_stray(dn);
+   dn->state_clear(guard);
+   return result;
+ }
+
+ void StrayManager::eval_remote(CDentry *remote_dn)
+ {
+   dout(10) << __func__ << " " << *remote_dn << dendl;
+
+   CDentry::linkage_t *dnl = remote_dn->get_projected_linkage();
+   ceph_assert(dnl->is_remote());
+
+   CInode *in = dnl->get_inode();
+   if (!in) {
+     dout(20) << __func__ << ": no inode, cannot evaluate" << dendl;
+     return;
+   }
+
+   if (remote_dn->last != CEPH_NOSNAP) {
+     dout(20) << __func__ << ": snap dentry, cannot evaluate" << dendl;
+     return;
+   }
+
+   // refers to stray?
+   CDentry *primary_dn = in->get_projected_parent_dn();
+   ceph_assert(primary_dn != NULL);
+   const bool primary_is_stray =
+     primary_dn->get_dir()->get_inode()->is_stray();
+   if (!primary_is_stray) {
+     dout(20) << __func__ << ": inode's primary dn not stray" << dendl;
+     return;
+   }
+   // Pass this remote dentry as the hint.
+   _eval_stray_remote(primary_dn, remote_dn);
+ }
+
+// Context that re-runs StrayManager::eval_remote() on a dentry when it
+// is finished. It takes a PIN_PTRWAITER reference on the dentry in the
+// constructor and drops it at the end of finish().
+class C_RetryEvalRemote : public StrayManagerContext {
+ CDentry *dn;
+ public:
+ C_RetryEvalRemote(StrayManager *sm_, CDentry *dn_) :
+ StrayManagerContext(sm_), dn(dn_) {
+ dn->get(CDentry::PIN_PTRWAITER);
+ }
+ void finish(int r) override {
+ // only retry if the dentry's projected linkage is (still) remote;
+ // eval_remote() asserts this
+ if (dn->get_projected_linkage()->is_remote())
+ sm->eval_remote(dn);
+ dn->put(CDentry::PIN_PTRWAITER);
+ }
+};
+
+/**
+ * Decide what to do with a stray whose inode still has links.
+ *
+ * Depending on who is auth for the chosen remote dentry, the stray is
+ * reintegrated into it (reintegrate_stray), migrated towards its auth
+ * MDS (migrate_stray), or left alone.
+ *
+ * @param stray_dn  primary dentry of the inode; must be non-NULL and
+ *                  live in a stray directory
+ * @param remote_dn hint for which remote dentry to use; when NULL one is
+ *                  picked from the inode's remote_parents
+ */
+void StrayManager::_eval_stray_remote(CDentry *stray_dn, CDentry *remote_dn)
+{
+  // validate the pointer before it is dereferenced for logging
+  ceph_assert(stray_dn != NULL);
+  dout(20) << __func__ << " " << *stray_dn << dendl;
+  ceph_assert(stray_dn->get_dir()->get_inode()->is_stray());
+  CDentry::linkage_t *stray_dnl = stray_dn->get_projected_linkage();
+  ceph_assert(stray_dnl->is_primary());
+  CInode *stray_in = stray_dnl->get_inode();
+  ceph_assert(stray_in->inode.nlink >= 1);
+  ceph_assert(stray_in->last == CEPH_NOSNAP);
+
+  /* If no remote_dn hinted, pick one arbitrarily */
+  if (remote_dn == NULL) {
+    for (const auto &dn : stray_in->remote_parents) {
+      // only head, non-projected remote dentries are candidates
+      if (dn->last == CEPH_NOSNAP && !dn->is_projected()) {
+        if (dn->is_auth()) {
+          // prefer a dentry we are auth for; stop at the first one whose
+          // dir can be auth-pinned right now
+          remote_dn = dn;
+          if (remote_dn->dir->can_auth_pin())
+            break;
+        } else if (!remote_dn) {
+          // non-auth candidate: remember it only as a fallback
+          remote_dn = dn;
+        }
+      }
+    }
+    if (!remote_dn) {
+      dout(20) << __func__ << ": not reintegrating (no remote parents in cache)" << dendl;
+      return;
+    }
+  }
+  ceph_assert(remote_dn->last == CEPH_NOSNAP);
+  // NOTE: we repeat this check in _rename(), since our submission path is racey.
+  if (!remote_dn->is_projected()) {
+    if (remote_dn->is_auth()) {
+      if (remote_dn->dir->can_auth_pin()) {
+        reintegrate_stray(stray_dn, remote_dn);
+      } else {
+        // retry via eval_remote() once the dir is unfrozen
+        remote_dn->dir->add_waiter(CDir::WAIT_UNFREEZE, new C_RetryEvalRemote(this, remote_dn));
+        dout(20) << __func__ << ": not reintegrating (can't authpin remote parent)" << dendl;
+      }
+
+    } else if (stray_dn->is_auth()) {
+      // remote_dn is not auth here (previous branch), but the stray is:
+      // move the stray to the MDS that is auth for remote_dn
+      migrate_stray(stray_dn, remote_dn->authority().first);
+    } else {
+      dout(20) << __func__ << ": not reintegrating" << dendl;
+    }
+  } else {
+    // don't do anything if the remote parent is projected, or we may
+    // break user-visible semantics!
+    dout(20) << __func__ << ": not reintegrating (projected)" << dendl;
+  }
+}
+
+// Reintegrate a stray into one of its remote dentries by sending a
+// CEPH_MDS_OP_RENAME request (source: straydn's path, destination: rdn's
+// path) to the first MDS in rdn's authority pair.
+void StrayManager::reintegrate_stray(CDentry *straydn, CDentry *rdn)
+{
+ dout(10) << __func__ << " " << *straydn << " into " << *rdn << dendl;
+
+ logger->inc(l_mdc_strays_reintegrated);
+
+ // rename the stray dentry onto the remote dentry.
+ filepath src;
+ straydn->make_path(src);
+ filepath dst;
+ rdn->make_path(dst);
+
+ auto req = MClientRequest::create(CEPH_MDS_OP_RENAME);
+ req->set_filepath(dst);
+ req->set_filepath2(src);
+ req->set_tid(mds->issue_tid());
+
+ mds->send_message_mds(req, rdn->authority().first);
+}
+
+// Move a stray dentry to rank `to` by sending that rank a
+// CEPH_MDS_OP_RENAME request. dn must be linked to an inode and must
+// live in a stray directory.
+void StrayManager::migrate_stray(CDentry *dn, mds_rank_t to)
+{
+ CInode *in = dn->get_projected_linkage()->get_inode();
+ ceph_assert(in);
+ CInode *diri = dn->dir->get_inode();
+ ceph_assert(diri->is_stray());
+ dout(10) << "migrate_stray from mds." << MDS_INO_STRAY_OWNER(diri->inode.ino)
+ << " to mds." << to
+ << " " << *dn << " " << *in << dendl;
+
+ logger->inc(l_mdc_strays_migrated);
+
+ // rename it to another mds.
+ filepath src;
+ dn->make_path(src);
+ // the source path is expected to have exactly two components
+ ceph_assert(src.depth() == 2);
+
+ // destination: the same two path components, rooted at the target
+ // rank's mdsdir inode
+ filepath dst(MDS_INO_MDSDIR(to));
+ dst.push_dentry(src[0]);
+ dst.push_dentry(src[1]);
+
+ auto req = MClientRequest::create(CEPH_MDS_OP_RENAME);
+ req->set_filepath(dst);
+ req->set_filepath2(src);
+ req->set_tid(mds->issue_tid());
+
+ mds->send_message_mds(req, to);
+}
+
+// Construct an inactive manager: counters start at zero, no logger is
+// set (see set_logger()) and `started` is false. mds must be non-NULL;
+// purge_queue_ is stored by reference.
+StrayManager::StrayManager(MDSRank *mds, PurgeQueue &purge_queue_)
+ : delayed_eval_stray(member_offset(CDentry, item_stray)),
+ mds(mds), logger(NULL), started(false), num_strays(0),
+ num_strays_delayed(0), num_strays_enqueuing(0),
+ purge_queue(purge_queue_)
+{
+ ceph_assert(mds != NULL);
+}
+
+// Push a PurgeItem::TRUNCATE_FILE item for dn's inode onto the purge
+// queue. The item's size is the largest of in->inode.get_max_size(),
+// in->inode.size and in->inode.max_size_ever, and must be non-zero.
+// dn must be linked to an inode and must not be replicated.
+void StrayManager::truncate(CDentry *dn)
+{
+ const CDentry::linkage_t *dnl = dn->get_projected_linkage();
+ const CInode *in = dnl->get_inode();
+ ceph_assert(in);
+ dout(10) << __func__ << ": " << *dn << " " << *in << dendl;
+ ceph_assert(!dn->is_replicated());
+
+ const SnapRealm *realm = in->find_snaprealm();
+ ceph_assert(realm);
+ dout(10) << " realm " << *realm << dendl;
+ const SnapContext *snapc = &realm->get_snap_context();
+
+ uint64_t to = in->inode.get_max_size();
+ to = std::max(in->inode.size, to);
+ // when truncating a file, the filer does not delete stripe objects that are
+ // truncated to zero. so we need to purge stripe objects up to the max size
+ // the file has ever been.
+ to = std::max(in->inode.max_size_ever, to);
+
+ ceph_assert(to > 0);
+
+ PurgeItem item;
+ item.action = PurgeItem::TRUNCATE_FILE;
+ item.ino = in->inode.ino;
+ item.layout = in->inode.layout;
+ item.snapc = *snapc;
+ item.size = to;
+ item.stamp = ceph_clock_now();
+
+ // NOTE(review): the trailing `true` is presumably the only_head flag
+ // later passed to _purge_stray_purged() -- confirm against
+ // C_IO_PurgeStrayPurged.
+ purge_queue.push(item, new C_IO_PurgeStrayPurged(
+ this, dn, true));
+}
+
+// Callback run once the truncation of a stray's inode has been logged:
+// applies the projected inode, clears the PURGING state on inode and
+// dentry, drops the PIN_PURGING reference and re-evaluates the stray.
+void StrayManager::_truncate_stray_logged(CDentry *dn, LogSegment *ls)
+{
+ CInode *in = dn->get_projected_linkage()->get_inode();
+
+ dout(10) << __func__ << ": " << *dn << " " << *in << dendl;
+
+ in->pop_and_dirty_projected_inode(ls);
+
+ in->state_clear(CInode::STATE_PURGING);
+ dn->state_clear(CDentry::STATE_PURGING | CDentry::STATE_PURGINGPINNED);
+ dn->put(CDentry::PIN_PURGING);
+
+ eval_stray(dn);
+
+ // if that evaluation did not put the dentry back into PURGING state
+ // and this MDS is stopping, report the stray to the cache's shutdown
+ // export logic
+ if (!dn->state_test(CDentry::STATE_PURGING) && mds->is_stopping())
+ mds->mdcache->shutdown_export_stray_finish(in->ino());
+}
+
diff --git a/src/mds/StrayManager.h b/src/mds/StrayManager.h
new file mode 100644
index 00000000..53e42110
--- /dev/null
+++ b/src/mds/StrayManager.h
@@ -0,0 +1,197 @@
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2015 Red Hat
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#ifndef STRAY_MANAGER_H
+#define STRAY_MANAGER_H
+
+#include "include/elist.h"
+#include <list>
+#include "mds/PurgeQueue.h"
+
+class MDSRank;
+class PerfCounters;
+class CInode;
+class CDentry;
+
+class StrayManager
+{
+ protected:
+ // Has passed through eval_stray and still has refs
+ elist<CDentry*> delayed_eval_stray;
+
+ // strays that have been trimmed from cache
+ std::set<std::string> trimmed_strays;
+
+ // Global references for doing I/O
+ MDSRank *mds;
+ PerfCounters *logger;
+
+ bool started;
+
+ // Stray dentries for this rank (including those not in cache)
+ uint64_t num_strays;
+
+ // Stray dentries queued for delayed evaluation
+ // NOTE(review): presumably tracks the size of delayed_eval_stray --
+ // confirm against queue_delayed()/advance_delayed().
+ uint64_t num_strays_delayed;
+
+ // Entries that have entered enqueue() but not been persistently
+ // recorded by PurgeQueue yet
+ uint64_t num_strays_enqueuing;
+
+ PurgeQueue &purge_queue;
+
+ void truncate(CDentry *dn);
+
+ /**
+ * Purge a dentry from a stray directory. This function
+ * is called once eval_stray is satisfied and StrayManager
+ * throttling is also satisfied. There is no going back
+ * at this stage!
+ */
+ void purge(CDentry *dn);
+
+ /**
+ * Completion handler for a Filer::purge on a stray inode.
+ */
+ void _purge_stray_purged(CDentry *dn, bool only_head);
+
+ void _purge_stray_logged(CDentry *dn, version_t pdv, LogSegment *ls);
+
+ /**
+ * Callback: we have logged the update to an inode's metadata
+ * reflecting its newly-zeroed length.
+ */
+ void _truncate_stray_logged(CDentry *dn, LogSegment *ls);
+
+ friend class StrayManagerIOContext;
+ friend class StrayManagerLogContext;
+ friend class StrayManagerContext;
+
+ friend class C_StraysFetched;
+ friend class C_OpenSnapParents;
+ friend class C_PurgeStrayLogged;
+ friend class C_TruncateStrayLogged;
+ friend class C_IO_PurgeStrayPurged;
+
+
+ // Call this on a dentry that has been identified as
+ // eligible for purging. It will be passed on to PurgeQueue.
+ void enqueue(CDentry *dn, bool trunc);
+
+ // Final part of enqueue() which we may have to retry
+ // after opening snap parents.
+ void _enqueue(CDentry *dn, bool trunc);
+
+
+ /**
+ * When hard links exist to an inode whose primary dentry
+ * is unlinked, the inode gets a stray primary dentry.
+ *
+ * We may later "reintegrate" the inode into a remaining
+ * non-stray dentry (one of what was previously a remote
+ * dentry) by issuing a rename from the stray to the other
+ * dentry.
+ */
+ void reintegrate_stray(CDentry *dn, CDentry *rlink);
+
+ /**
+ * Evaluate a stray dentry for purging or reintegration.
+ *
+ * purging: If the inode has no linkage, and no more references, then
+ * we may decide to purge it.
+ *
+ * reintegration: If the inode still has linkage, then it means someone else
+ * (a hard link) is still referring to it, and we should
+ * think about reintegrating that inode into the remote dentry.
+ *
+ * @returns true if the dentry will be purged (caller should never
+ * take more refs after this happens), else false.
+ */
+ bool _eval_stray(CDentry *dn);
+
+ /**
+ * Reintegrate or migrate a stray whose inode still has links.
+ *
+ * @param stray_dn the stray (primary) dentry
+ * @param remote_dn (optional, may be NULL) hint for which remote
+ * dentry to reintegrate into
+ */
+ void _eval_stray_remote(CDentry *stray_dn, CDentry *remote_dn);
+
+ // My public interface is for consumption by MDCache
+ public:
+ explicit StrayManager(MDSRank *mds, PurgeQueue &purge_queue_);
+ void set_logger(PerfCounters *l) {logger = l;}
+ void activate();
+
+ bool eval_stray(CDentry *dn);
+
+ void set_num_strays(uint64_t num);
+ uint64_t get_num_strays() const { return num_strays; }
+
+ /**
+ * Queue dentry for later evaluation. (evaluate it while not in the
+ * middle of another metadata operation)
+ */
+ void queue_delayed(CDentry *dn);
+
+ /**
+ * Eval strays in the delayed_eval_stray list
+ */
+ void advance_delayed();
+
+ /**
+ * Remote dentry potentially points to a stray. When it is touched,
+ * call in here to evaluate it for migration (move a stray residing
+ * on another MDS to this MDS) or reintegration (move a stray dentry's
+ * inode into a non-stray hardlink dentry and clean up the stray).
+ *
+ * The stray dentry is not passed in: it is looked up as the projected
+ * primary dentry of the inode that remote_dn links to.
+ *
+ * @param remote_dn the remote dentry that was touched in an operation
+ * that led us here: this is used as a hint for
+ * which remote to reintegrate into if there are
+ * multiple remotes.
+ */
+ void eval_remote(CDentry *remote_dn);
+
+ /**
+ * Given a dentry within one of my stray directories,
+ * send it off to a stray directory in another MDS.
+ *
+ * This is for use:
+ * * Case A: when shutting down a rank, we migrate strays
+ * away from ourselves rather than waiting for purge
+ * * Case B: when a client request has a trace that refers to
+ * a stray inode on another MDS, we migrate that inode from
+ * there to here, in order that we can later re-integrate it
+ * here.
+ *
+ * In case B, the receiver should be calling into eval_stray
+ * on completion of mv (i.e. inode put), resulting in a subsequent
+ * reintegration.
+ */
+ void migrate_stray(CDentry *dn, mds_rank_t dest);
+
+ /**
+ * Update stats to reflect a newly created stray dentry. Needed
+ * because stats on strays live here, but creation happens
+ * in Server or MDCache. For our purposes "creation" includes
+ * loading a stray from a dirfrag and migrating a stray from
+ * another MDS, in addition to creations per-se.
+ */
+ void notify_stray_created();
+
+ /**
+ * Update stats to reflect a removed stray dentry. Needed because
+ * stats on strays live here, but removal happens in Server or
+ * MDCache. Also includes migration (rename) of strays from
+ * this MDS to another MDS.
+ */
+ void notify_stray_removed();
+};
+
+#endif // STRAY_MANAGER_H
diff --git a/src/mds/balancers/greedyspill.lua b/src/mds/balancers/greedyspill.lua
new file mode 100644
index 00000000..20576cdb
--- /dev/null
+++ b/src/mds/balancers/greedyspill.lua
@@ -0,0 +1,49 @@
+-- Per-rank metrics that are dumped to the balancer log
+local metrics = {"auth.meta_load", "all.meta_load", "req_rate", "queue_len", "cpu_load_avg"}
+
+-- Metric for balancing is the workload; also dumps metrics.
+-- Sets the `load` field of every rank entry to its "all.meta_load".
+local function mds_load()
+  for rank, entry in pairs(mds) do
+    local line = "MDS"..rank..": < "
+    for _, name in ipairs(metrics) do
+      line = line..name.."="..entry[name].." "
+    end
+    entry.load = entry["all.meta_load"]
+    BAL_LOG(5, line.."> load="..entry.load)
+  end
+end
+
+-- Shed load when you have load and your neighbor doesn't.
+-- Returns true when this rank's load is above 0.01 and the load of rank
+-- whoami+1 is below 0.01; returns false otherwise, including when there
+-- is no rank whoami+1.
+local function when()
+  if not mds[whoami+1] then
+    -- i'm the last rank
+    BAL_LOG(5, "when: not migrating! I am the last rank, nothing to spill to.");
+    return false
+  end
+  -- declared local so the two values do not leak into the global
+  -- environment of the script
+  local my_load = mds[whoami]["load"]
+  local his_load = mds[whoami+1]["load"]
+  if my_load > 0.01 and his_load < 0.01 then
+    BAL_LOG(5, "when: migrating! my_load="..my_load.." hisload="..his_load)
+    return true
+  end
+  BAL_LOG(5, "when: not migrating! my_load="..my_load.." hisload="..his_load)
+  return false
+end
+
+-- Shed half your load to your neighbor.
+-- The neighbor is the entry at index whoami+1, the same index that
+-- when() checks for existence and load.
+-- Writes into the targets table passed in and returns it.
+local function where(targets)
+ targets[whoami+1] = mds[whoami]["load"]/2
+ return targets
+end
+
+-- Start with a target of 0 for every rank present in mds
+local targets = {}
+for rank in pairs(mds) do
+ targets[rank] = 0
+end
+
+-- Log the metrics and compute each rank's load, then fill in a spill
+-- target only if when() says we should migrate
+mds_load()
+if when() then
+ where(targets)
+end
+
+-- The script evaluates to the per-rank targets table
+return targets
diff --git a/src/mds/cephfs_features.h b/src/mds/cephfs_features.h
new file mode 100644
index 00000000..66752af2
--- /dev/null
+++ b/src/mds/cephfs_features.h
@@ -0,0 +1,51 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#ifndef CEPHFS_FEATURES_H
+#define CEPHFS_FEATURES_H
+
+// Please add feature bits for later ceph releases and update
+// Server::update_required_client_features().
+
+// The first 5 bits are reserved for old ceph releases.
+#define CEPHFS_FEATURE_JEWEL 5
+#define CEPHFS_FEATURE_KRAKEN 6
+#define CEPHFS_FEATURE_LUMINOUS 7
+#define CEPHFS_FEATURE_MIMIC 8
+#define CEPHFS_FEATURE_REPLY_ENCODING 9
+#define CEPHFS_FEATURE_RECLAIM_CLIENT 10
+#define CEPHFS_FEATURE_LAZY_CAP_WANTED 11
+#define CEPHFS_FEATURE_MULTI_RECONNECT 12
+// NOTE: NAUTILUS has the same value (12) as MULTI_RECONNECT, i.e. it
+// names the same bit rather than a distinct feature.
+#define CEPHFS_FEATURE_NAUTILUS 12
+
+// Initializer list of every feature bit defined above, including the
+// reserved bits 0-4. Bit 12 appears twice because MULTI_RECONNECT and
+// NAUTILUS share a value.
+#define CEPHFS_FEATURES_ALL { \
+ 0, 1, 2, 3, 4, \
+ CEPHFS_FEATURE_JEWEL, \
+ CEPHFS_FEATURE_KRAKEN, \
+ CEPHFS_FEATURE_LUMINOUS, \
+ CEPHFS_FEATURE_MIMIC, \
+ CEPHFS_FEATURE_REPLY_ENCODING, \
+ CEPHFS_FEATURE_RECLAIM_CLIENT, \
+ CEPHFS_FEATURE_LAZY_CAP_WANTED, \
+ CEPHFS_FEATURE_MULTI_RECONNECT, \
+ CEPHFS_FEATURE_NAUTILUS, \
+}
+
+#define CEPHFS_FEATURES_MDS_SUPPORTED CEPHFS_FEATURES_ALL
+#define CEPHFS_FEATURES_MDS_REQUIRED {}
+
+#define CEPHFS_FEATURES_CLIENT_SUPPORTED CEPHFS_FEATURES_ALL
+#define CEPHFS_FEATURES_CLIENT_REQUIRED {}
+
+#endif
diff --git a/src/mds/events/ECommitted.h b/src/mds/events/ECommitted.h
new file mode 100644
index 00000000..0459f9d0
--- /dev/null
+++ b/src/mds/events/ECommitted.h
@@ -0,0 +1,43 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#ifndef CEPH_MDS_ECOMMITTED_H
+#define CEPH_MDS_ECOMMITTED_H
+
+#include "../LogEvent.h"
+#include "EMetaBlob.h"
+
+// Log event of type EVENT_COMMITTED; its only payload is a request id.
+class ECommitted : public LogEvent {
+public:
+ metareqid_t reqid; // request id carried by this event
+
+ ECommitted() : LogEvent(EVENT_COMMITTED) { }
+ explicit ECommitted(metareqid_t r) :
+ LogEvent(EVENT_COMMITTED), reqid(r) { }
+
+ void print(ostream& out) const override {
+ out << "ECommitted " << reqid;
+ }
+
+ void encode(bufferlist &bl, uint64_t features) const override;
+ void decode(bufferlist::const_iterator &bl) override;
+ void dump(Formatter *f) const override;
+ static void generate_test_instances(list<ECommitted*>& ls);
+
+ // intentionally a no-op for this event type
+ void update_segment() override {}
+ void replay(MDSRank *mds) override;
+};
+WRITE_CLASS_ENCODER_FEATURES(ECommitted)
+
+#endif
diff --git a/src/mds/events/EExport.h b/src/mds/events/EExport.h
new file mode 100644
index 00000000..94e39a84
--- /dev/null
+++ b/src/mds/events/EExport.h
@@ -0,0 +1,58 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#ifndef CEPH_EEXPORT_H
+#define CEPH_EEXPORT_H
+
+#include "common/config.h"
+#include "include/types.h"
+
+#include "../MDSRank.h"
+
+#include "EMetaBlob.h"
+#include "../LogEvent.h"
+
+// Log event of type EVENT_EXPORT: records the base dirfrag of an export,
+// its bounds, the target rank and an EMetaBlob.
+class EExport : public LogEvent {
+public:
+ EMetaBlob metablob; // exported dir
+protected:
+ dirfrag_t base;
+ set<dirfrag_t> bounds;
+ mds_rank_t target;
+
+public:
+ EExport() :
+ LogEvent(EVENT_EXPORT), target(MDS_RANK_NONE) { }
+ // mdlog is accepted but not used by this constructor; bounds starts
+ // empty and is filled in through get_bounds().
+ EExport(MDLog *mdlog, CDir *dir, mds_rank_t t) :
+ LogEvent(EVENT_EXPORT),
+ base(dir->dirfrag()), target(t) { }
+
+ // mutable access to the bounds set
+ set<dirfrag_t> &get_bounds() { return bounds; }
+
+ void print(ostream& out) const override {
+ out << "EExport " << base << " to mds." << target << " " << metablob;
+ }
+
+ EMetaBlob *get_metablob() override { return &metablob; }
+
+ void encode(bufferlist& bl, uint64_t features) const override;
+ void decode(bufferlist::const_iterator &bl) override;
+ void dump(Formatter *f) const override;
+ static void generate_test_instances(list<EExport*>& ls);
+ void replay(MDSRank *mds) override;
+
+};
+WRITE_CLASS_ENCODER_FEATURES(EExport)
+
+#endif
diff --git a/src/mds/events/EFragment.h b/src/mds/events/EFragment.h
new file mode 100644
index 00000000..90d9238b
--- /dev/null
+++ b/src/mds/events/EFragment.h
@@ -0,0 +1,81 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#ifndef CEPH_MDS_EFRAGMENT_H
+#define CEPH_MDS_EFRAGMENT_H
+
+#include "../LogEvent.h"
+#include "EMetaBlob.h"
+
+// Encodable wrapper around a single fnode_t. EFragment::add_orig_frag()
+// appends encoded instances of it to the event's rollback buffer.
+struct dirfrag_rollback {
+ fnode_t fnode;
+ dirfrag_rollback() { }
+ void encode(bufferlist& bl) const;
+ void decode(bufferlist::const_iterator& bl);
+};
+WRITE_CLASS_ENCODER(dirfrag_rollback)
+
+// Log event of type EVENT_FRAGMENT describing one step (see the OP_*
+// values) of a dirfrag split or merge on directory inode `ino`.
+class EFragment : public LogEvent {
+public:
+ EMetaBlob metablob;
+ __u8 op{0}; // one of the OP_* values below
+ inodeno_t ino;
+ frag_t basefrag;
+ __s32 bits{0}; // positive for split (from basefrag), negative for merge (to basefrag)
+ frag_vec_t orig_frags; // frags recorded via add_orig_frag()
+ bufferlist rollback; // encoded dirfrag_rollback entries, appended by add_orig_frag()
+
+ EFragment() : LogEvent(EVENT_FRAGMENT) { }
+ // mdlog is accepted but not used by this constructor
+ EFragment(MDLog *mdlog, int o, dirfrag_t df, int b) :
+ LogEvent(EVENT_FRAGMENT),
+ op(o), ino(df.ino), basefrag(df.frag), bits(b) { }
+
+ void print(ostream& out) const override {
+ out << "EFragment " << op_name(op) << " " << ino << " " << basefrag << " by " << bits << " " << metablob;
+ }
+
+ enum {
+ OP_PREPARE = 1,
+ OP_COMMIT = 2,
+ OP_ROLLBACK = 3,
+ OP_FINISH = 4 // finish deleting orphan dirfrags
+ };
+ // Human-readable name for an OP_* value; "???" for anything else.
+ static std::string_view op_name(int o) {
+ switch (o) {
+ case OP_PREPARE: return "prepare";
+ case OP_COMMIT: return "commit";
+ case OP_ROLLBACK: return "rollback";
+ case OP_FINISH: return "finish";
+ default: return "???";
+ }
+ }
+
+ // Record an original frag; if drb is given, its encoding is appended
+ // to the rollback buffer.
+ void add_orig_frag(frag_t df, dirfrag_rollback *drb=NULL) {
+ using ceph::encode;
+ orig_frags.push_back(df);
+ if (drb)
+ encode(*drb, rollback);
+ }
+
+ EMetaBlob *get_metablob() override { return &metablob; }
+
+ void encode(bufferlist &bl, uint64_t features) const override;
+ void decode(bufferlist::const_iterator &bl) override;
+ void dump(Formatter *f) const override;
+ static void generate_test_instances(list<EFragment*>& ls);
+ void replay(MDSRank *mds) override;
+};
+WRITE_CLASS_ENCODER_FEATURES(EFragment)
+
+#endif
diff --git a/src/mds/events/EImportFinish.h b/src/mds/events/EImportFinish.h
new file mode 100644
index 00000000..699c0527
--- /dev/null
+++ b/src/mds/events/EImportFinish.h
@@ -0,0 +1,53 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#ifndef CEPH_EIMPORTFINISH_H
+#define CEPH_EIMPORTFINISH_H
+
+#include "common/config.h"
+#include "include/types.h"
+
+#include "../MDSRank.h"
+#include "../LogEvent.h"
+
+// Log event of type EVENT_IMPORTFINISH: records the base dirfrag of an
+// import and whether the import succeeded.
+class EImportFinish : public LogEvent {
+ protected:
+  dirfrag_t base; // imported dir
+  bool success;
+
+ public:
+  EImportFinish() :
+    LogEvent(EVENT_IMPORTFINISH), base(), success(false) { }
+  EImportFinish(CDir *dir, bool s) :
+    LogEvent(EVENT_IMPORTFINISH), base(dir->dirfrag()), success(s) { }
+
+  // Prints "EImportFinish <base> success" or "EImportFinish <base> failed".
+  void print(ostream& out) const override {
+    out << "EImportFinish " << base << (success ? " success" : " failed");
+  }
+
+  void encode(bufferlist& bl, uint64_t features) const override;
+  void decode(bufferlist::const_iterator &bl) override;
+  void dump(Formatter *f) const override;
+  static void generate_test_instances(list<EImportFinish*>& ls);
+
+  void replay(MDSRank *mds) override;
+};
+WRITE_CLASS_ENCODER_FEATURES(EImportFinish)
+
+#endif
diff --git a/src/mds/events/EImportStart.h b/src/mds/events/EImportStart.h
new file mode 100644
index 00000000..276469e8
--- /dev/null
+++ b/src/mds/events/EImportStart.h
@@ -0,0 +1,61 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#ifndef CEPH_EIMPORTSTART_H
+#define CEPH_EIMPORTSTART_H
+
+#include "common/config.h"
+#include "include/types.h"
+
+class MDLog;
+class MDSRank;
+
+#include "EMetaBlob.h"
+#include "../LogEvent.h"
+
+// Log event of type EVENT_IMPORTSTART: records the base dirfrag of an
+// import, its bounds, the rank it comes from, an EMetaBlob and an
+// encoded client map.
+class EImportStart : public LogEvent {
+protected:
+ dirfrag_t base;
+ vector<dirfrag_t> bounds;
+ mds_rank_t from;
+
+public:
+ EMetaBlob metablob;
+ bufferlist client_map; // encoded map<__u32,entity_inst_t>
+ version_t cmapv{0};
+
+ // log is accepted but not used by this constructor
+ EImportStart(MDLog *log, dirfrag_t di, const vector<dirfrag_t>& b, mds_rank_t f) :
+ LogEvent(EVENT_IMPORTSTART),
+ base(di), bounds(b), from(f) { }
+ EImportStart() :
+ LogEvent(EVENT_IMPORTSTART), from(MDS_RANK_NONE) { }
+
+ void print(ostream& out) const override {
+ out << "EImportStart " << base << " from mds." << from << " " << metablob;
+ }
+
+ EMetaBlob *get_metablob() override { return &metablob; }
+
+ void encode(bufferlist &bl, uint64_t features) const override;
+ void decode(bufferlist::const_iterator &bl) override;
+ void dump(Formatter *f) const override;
+ static void generate_test_instances(list<EImportStart*>& ls);
+
+ void update_segment() override;
+ void replay(MDSRank *mds) override;
+
+};
+WRITE_CLASS_ENCODER_FEATURES(EImportStart)
+
+#endif
diff --git a/src/mds/events/EMetaBlob.h b/src/mds/events/EMetaBlob.h
new file mode 100644
index 00000000..ac09a8fe
--- /dev/null
+++ b/src/mds/events/EMetaBlob.h
@@ -0,0 +1,600 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#ifndef CEPH_MDS_EMETABLOB_H
+#define CEPH_MDS_EMETABLOB_H
+
+#include <string_view>
+
+#include <stdlib.h>
+
+#include "../CInode.h"
+#include "../CDir.h"
+#include "../CDentry.h"
+#include "../LogSegment.h"
+
+#include "include/interval_set.h"
+
+class MDSRank;
+class MDLog;
+class LogSegment;
+struct MDSlaveUpdate;
+
+/*
+ * a bunch of metadata in the journal
+ */
+
+/* notes:
+ *
+ * - make sure you adjust the inode.version for any modified inode you
+ * journal. CDir and CDentry maintain a projected_version, but CInode
+ * doesn't, since the journaled inode usually has to be modified
+ * manually anyway (to delay the change in the MDS's cache until after
+ * it is journaled).
+ *
+ */
+
+
+class EMetaBlob {
+
+public:
+ /* fullbit - a regular dentry + inode
+ *
+ * We encode this one a bit weirdly, just because (also, it's marginally faster
+ * on multiple encodes, which I think can happen):
+ * Encode a bufferlist on struct creation with all data members, without a struct_v.
+ * When encode is called, encode struct_v and then append the bufferlist.
+ * Decode straight into the appropriate variables.
+ *
+ * So, if you add members, encode them in the constructor and then change
+ * the struct_v in the encode function!
+ */
+ struct fullbit {
+   // bits stored in `state`
+   static const int STATE_DIRTY = (1<<0);
+   static const int STATE_DIRTYPARENT = (1<<1);
+   static const int STATE_DIRTYPOOL = (1<<2);
+   static const int STATE_NEED_SNAPFLUSH = (1<<3);
+   std::string dn; // dentry
+   snapid_t dnfirst, dnlast;
+   version_t dnv{0};
+   CInode::mempool_inode inode; // XXX should not be part of mempool; wait for std::pmr to simplify
+   fragtree_t dirfragtree;
+   CInode::mempool_xattr_map xattrs;
+   std::string symlink;
+   snapid_t oldest_snap;
+   bufferlist snapbl;
+   __u8 state{0};
+   CInode::mempool_old_inode_map old_inodes; // XXX should not be part of mempool; wait for std::pmr to simplify
+
+   // symlink is only kept when the inode is a symlink, dirfragtree only
+   // when it is a directory, and old_inodes only when oi is non-NULL.
+   fullbit(std::string_view d, snapid_t df, snapid_t dl,
+           version_t v, const CInode::mempool_inode& i, const fragtree_t &dft,
+           const CInode::mempool_xattr_map &xa, std::string_view sym,
+           snapid_t os, const bufferlist &sbl, __u8 st,
+           const CInode::mempool_old_inode_map *oi = NULL) :
+     dn(d), dnfirst(df), dnlast(dl), dnv(v), inode(i), xattrs(xa),
+     oldest_snap(os), state(st)
+   {
+     if (i.is_symlink())
+       symlink = sym;
+     if (i.is_dir())
+       dirfragtree = dft;
+     if (oi)
+       old_inodes = *oi;
+     snapbl = sbl;
+   }
+   explicit fullbit(bufferlist::const_iterator &p) {
+     decode(p);
+   }
+   fullbit() {}
+   fullbit(const fullbit&) = delete;
+   ~fullbit() {}
+   fullbit& operator=(const fullbit&) = delete;
+
+   void encode(bufferlist& bl, uint64_t features) const;
+   void decode(bufferlist::const_iterator &bl);
+   void dump(Formatter *f) const;
+   static void generate_test_instances(list<EMetaBlob::fullbit*>& ls);
+
+   void update_inode(MDSRank *mds, CInode *in);
+   bool is_dirty() const { return (state & STATE_DIRTY); }
+   bool is_dirty_parent() const { return (state & STATE_DIRTYPARENT); }
+   bool is_dirty_pool() const { return (state & STATE_DIRTYPOOL); }
+   bool need_snapflush() const { return (state & STATE_NEED_SNAPFLUSH); }
+
+   void print(ostream& out) const {
+     // state is a __u8: streaming it directly would emit a raw character,
+     // so widen it to print the numeric flag value.
+     out << " fullbit dn " << dn << " [" << dnfirst << "," << dnlast << "] dnv " << dnv
+         << " inode " << inode.ino
+         << " state=" << static_cast<int>(state) << std::endl;
+   }
+   // Describes the dirty flags as "dirty", "dirty_parent" and
+   // "dirty_pool" joined with '+'. dirty_pool is only reported together
+   // with dirty_parent.
+   string state_string() const {
+     string state_string;
+     bool marked_already = false;
+     if (is_dirty()) {
+       state_string.append("dirty");
+       marked_already = true;
+     }
+     if (is_dirty_parent()) {
+       state_string.append(marked_already ? "+dirty_parent" : "dirty_parent");
+       if (is_dirty_pool())
+         state_string.append("+dirty_pool");
+     }
+     return state_string;
+   }
+ };
+ WRITE_CLASS_ENCODER_FEATURES(fullbit)
+
+ /* remotebit - a dentry + remote inode link (i.e. just an ino)
+ */
+ struct remotebit {
+ std::string dn; // dentry name
+ snapid_t dnfirst, dnlast;
+ version_t dnv;
+ inodeno_t ino; // the remote inode being linked to
+ unsigned char d_type;
+ bool dirty;
+
+ remotebit(std::string_view d, snapid_t df, snapid_t dl, version_t v, inodeno_t i, unsigned char dt, bool dr) :
+ dn(d), dnfirst(df), dnlast(dl), dnv(v), ino(i), d_type(dt), dirty(dr) { }
+ explicit remotebit(bufferlist::const_iterator &p) { decode(p); }
+ remotebit(): dnfirst(0), dnlast(0), dnv(0), ino(0),
+ d_type('\0'), dirty(false) {}
+
+ void encode(bufferlist& bl) const;
+ void decode(bufferlist::const_iterator &bl);
+ // One-line summary; d_type is not included in the output.
+ void print(ostream& out) const {
+ out << " remotebit dn " << dn << " [" << dnfirst << "," << dnlast << "] dnv " << dnv
+ << " ino " << ino
+ << " dirty=" << dirty << std::endl;
+ }
+ void dump(Formatter *f) const;
+ static void generate_test_instances(list<remotebit*>& ls);
+ };
+ WRITE_CLASS_ENCODER(remotebit)
+
+ /*
+ * nullbit - a null dentry
+ */
+ struct nullbit {
+ std::string dn; // dentry name
+ snapid_t dnfirst, dnlast;
+ version_t dnv;
+ bool dirty;
+
+ nullbit(std::string_view d, snapid_t df, snapid_t dl, version_t v, bool dr) :
+ dn(d), dnfirst(df), dnlast(dl), dnv(v), dirty(dr) { }
+ explicit nullbit(bufferlist::const_iterator &p) { decode(p); }
+ nullbit(): dnfirst(0), dnlast(0), dnv(0), dirty(false) {}
+
+ void encode(bufferlist& bl) const;
+ void decode(bufferlist::const_iterator &bl);
+ void dump(Formatter *f) const;
+ static void generate_test_instances(list<nullbit*>& ls);
+ // One-line summary of the dentry name, snap range, version and dirty flag.
+ void print(ostream& out) const {
+ out << " nullbit dn " << dn << " [" << dnfirst << "," << dnlast << "] dnv " << dnv
+ << " dirty=" << dirty << std::endl;
+ }
+ };
+ WRITE_CLASS_ENCODER(nullbit)
+
+
+ /* dirlump - contains metadata for any dir we have contents for.
+ */
+public:
+ struct dirlump {
+ static const int STATE_COMPLETE = (1<<1);
+ static const int STATE_DIRTY = (1<<2); // dirty due to THIS journal item, that is!
+ static const int STATE_NEW = (1<<3); // new directory
+ static const int STATE_IMPORTING = (1<<4); // importing directory
+ static const int STATE_DIRTYDFT = (1<<5); // dirty dirfragtree
+
+ //version_t dirv;
+ fnode_t fnode;
+ __u32 state;
+ __u32 nfull, nremote, nnull;
+
+ private:
+ mutable bufferlist dnbl;
+ mutable bool dn_decoded;
+ mutable list<fullbit> dfull;
+ mutable vector<remotebit> dremote;
+ mutable vector<nullbit> dnull;
+
+ public:
+ dirlump() : state(0), nfull(0), nremote(0), nnull(0), dn_decoded(true) { }
+ dirlump(const dirlump&) = delete;
+ dirlump& operator=(const dirlump&) = delete;
+
+ bool is_complete() const { return state & STATE_COMPLETE; }
+ void mark_complete() { state |= STATE_COMPLETE; }
+ bool is_dirty() const { return state & STATE_DIRTY; }
+ void mark_dirty() { state |= STATE_DIRTY; }
+ bool is_new() const { return state & STATE_NEW; }
+ void mark_new() { state |= STATE_NEW; }
+ bool is_importing() { return state & STATE_IMPORTING; }
+ void mark_importing() { state |= STATE_IMPORTING; }
+ bool is_dirty_dft() { return state & STATE_DIRTYDFT; }
+ void mark_dirty_dft() { state |= STATE_DIRTYDFT; }
+
+ const list<fullbit> &get_dfull() const { return dfull; }
+ list<fullbit> &_get_dfull() { return dfull; }
+ const vector<remotebit> &get_dremote() const { return dremote; }
+ const vector<nullbit> &get_dnull() const { return dnull; }
+
+ template< class... Args>
+ void add_dfull(Args&&... args) {
+ dfull.emplace_back(std::forward<Args>(args)...);
+ }
+ template< class... Args>
+ void add_dremote(Args&&... args) {
+ dremote.emplace_back(std::forward<Args>(args)...);
+ }
+ template< class... Args>
+ void add_dnull(Args&&... args) {
+ dnull.emplace_back(std::forward<Args>(args)...);
+ }
+
+ void print(dirfrag_t dirfrag, ostream& out) const { // one header line, then every full, remote and null dentry
+ out << "dirlump " << dirfrag << " v " << fnode.version
+ << " state " << state
+ << " num " << nfull << "/" << nremote << "/" << nnull
+ << std::endl;
+ _decode_bits(); // decode dnbl into the dentry lists first if that has not happened yet
+ for (const auto& p : dfull)
+ p.print(out);
+ for (const auto& p : dremote)
+ p.print(out);
+ for (const auto& p : dnull)
+ p.print(out);
+ }
+
+ string state_string() const { // '+'-joined names of the complete/dirty/new flags that are set
+ string names;
+ auto append_flag = [&names](const char *flag) {
+ if (!names.empty())
+ names.append("+");
+ names.append(flag);
+ };
+ if (is_complete())
+ append_flag("complete");
+ if (is_dirty())
+ append_flag("dirty");
+ if (is_new())
+ append_flag("new");
+ // the importing and dirty-dft bits are not reported here
+ return names;
+ }
+
+ // if this changes, update the versioning in encode for it!
+ void _encode_bits(uint64_t features) const { // serialize dfull, dremote and dnull into dnbl, in that order
+ using ceph::encode;
+ if (!dn_decoded) return; // lists were never decoded; presumably dnbl still holds their encoded form
+ encode(dfull, dnbl, features); // only the full dentries take the feature bits
+ encode(dremote, dnbl);
+ encode(dnull, dnbl);
+ } // NOTE(review): dnbl is not cleared before encoding into it — confirm each lump is encoded only once
+ void _decode_bits() const { // lazily decode dnbl into dfull, dremote and dnull (all mutable members)
+ using ceph::decode;
+ if (dn_decoded) return; // already decoded (or built in memory): nothing to do
+ auto p = dnbl.cbegin();
+ decode(dfull, p); // same order as _encode_bits
+ decode(dremote, p);
+ decode(dnull, p);
+ dn_decoded = true;
+ }
+
+ void encode(bufferlist& bl, uint64_t features) const;
+ void decode(bufferlist::const_iterator &bl);
+ void dump(Formatter *f) const;
+ static void generate_test_instances(list<dirlump*>& ls);
+ };
+ WRITE_CLASS_ENCODER_FEATURES(dirlump)
+
+ // my lumps. preserve the order we added them in a list.
+ vector<dirfrag_t> lump_order;
+ map<dirfrag_t, dirlump> lump_map;
+ list<fullbit> roots;
+public:
+ vector<pair<__u8,version_t> > table_tids; // tableclient transactions
+
+ inodeno_t opened_ino;
+public:
+ inodeno_t renamed_dirino;
+ vector<frag_t> renamed_dir_frags;
+private:
+
+ // ino (pre)allocation. may involve both inotable AND session state.
+ version_t inotablev, sessionmapv;
+ inodeno_t allocated_ino; // inotable
+ interval_set<inodeno_t> preallocated_inos; // inotable + session
+ inodeno_t used_preallocated_ino; // session
+ entity_name_t client_name; // session
+
+ // inodes i've truncated
+ vector<inodeno_t> truncate_start; // start truncate
+ map<inodeno_t, LogSegment::seq_t> truncate_finish; // finished truncate (started in segment blah)
+
+public:
+ vector<inodeno_t> destroyed_inodes;
+private:
+
+ // idempotent op(s)
+ vector<pair<metareqid_t,uint64_t> > client_reqs;
+ vector<pair<metareqid_t,uint64_t> > client_flushes;
+
+ public:
+ void encode(bufferlist& bl, uint64_t features) const;
+ void decode(bufferlist::const_iterator& bl);
+ void get_inodes(std::set<inodeno_t> &inodes) const;
+ void get_paths(std::vector<std::string> &paths) const;
+ void get_dentries(std::map<dirfrag_t, std::set<std::string> > &dentries) const;
+ entity_name_t get_client_name() const {return client_name;}
+
+ void dump(Formatter *f) const;
+ static void generate_test_instances(list<EMetaBlob*>& ls);
+ // soft stateadd
+ uint64_t last_subtree_map;
+ uint64_t event_seq;
+
+ // for replay, in certain cases
+ //LogSegment *_segment;
+
+ EMetaBlob() : opened_ino(0), renamed_dirino(0),
+ inotablev(0), sessionmapv(0), allocated_ino(0),
+ last_subtree_map(0), event_seq(0)
+ {}
+ EMetaBlob(const EMetaBlob&) = delete;
+ ~EMetaBlob() { }
+ EMetaBlob& operator=(const EMetaBlob&) = delete;
+
+ void print(ostream& out) { // non-const overload: prints every dirlump, following lump_order
+ for (const auto &p : lump_order)
+ lump_map[p].print(p, out); // map operator[] is non-const, so this overload cannot be const as written
+ }
+
+ void add_client_req(metareqid_t r, uint64_t tid=0) { // append (request id, tid) to client_reqs
+ client_reqs.emplace_back(r, tid);
+ }
+ void add_client_flush(metareqid_t r, uint64_t tid=0) { // append (request id, tid) to client_flushes
+ client_flushes.emplace_back(r, tid);
+ }
+
+ void add_table_transaction(int table, version_t tid) { // append (table, tid) to table_tids; table is stored as __u8
+ table_tids.emplace_back(table, tid);
+ }
+
+ void add_opened_ino(inodeno_t ino) {
+ ceph_assert(!opened_ino);
+ opened_ino = ino;
+ }
+
+ void set_ino_alloc(inodeno_t alloc, // stored in allocated_ino
+ inodeno_t used_prealloc, // stored in used_preallocated_ino
+ const interval_set<inodeno_t>& prealloc, // only copied from, hence const
+ entity_name_t client, // stored in client_name
+ version_t sv, version_t iv) { // sv -> sessionmapv, iv -> inotablev
+ allocated_ino = alloc;
+ used_preallocated_ino = used_prealloc;
+ preallocated_inos = prealloc;
+ client_name = client;
+ sessionmapv = sv;
+ inotablev = iv;
+ }
+
+ void add_truncate_start(inodeno_t ino) {
+ truncate_start.push_back(ino);
+ }
+ void add_truncate_finish(inodeno_t ino, uint64_t segoff) {
+ truncate_finish[ino] = segoff;
+ }
+
+ bool rewrite_truncate_finish(MDSRank const *mds, std::map<uint64_t, uint64_t> const &old_to_new);
+
+ void add_destroyed_inode(inodeno_t ino) {
+ destroyed_inodes.push_back(ino);
+ }
+
+ void add_null_dentry(CDentry *dn, bool dirty) {
+ add_null_dentry(add_dir(dn->get_dir(), false), dn, dirty);
+ }
+ void add_null_dentry(dirlump& lump, CDentry *dn, bool dirty) { // append a nullbit for dn to lump
+ // add the dir
+ lump.nnull++; // counter is bumped by hand alongside the dnull list
+ lump.add_dnull(dn->get_name(), dn->first, dn->last,
+ dn->get_projected_version(), dirty);
+ }
+
+ void add_remote_dentry(CDentry *dn, bool dirty) {
+ add_remote_dentry(add_dir(dn->get_dir(), false), dn, dirty, 0, 0);
+ }
+ void add_remote_dentry(CDentry *dn, bool dirty, inodeno_t rino, int rdt) {
+ add_remote_dentry(add_dir(dn->get_dir(), false), dn, dirty, rino, rdt);
+ }
+ void add_remote_dentry(dirlump& lump, CDentry *dn, bool dirty, // append a remotebit for dn to lump
+ inodeno_t rino=0, unsigned char rdt=0) {
+ if (!rino) { // no remote ino given: take ino and d_type from dn's projected linkage
+ rino = dn->get_projected_linkage()->get_remote_ino();
+ rdt = dn->get_projected_linkage()->get_remote_d_type();
+ }
+ lump.nremote++; // counter is bumped by hand alongside the dremote list
+ lump.add_dremote(dn->get_name(), dn->first, dn->last,
+ dn->get_projected_version(), rino, rdt, dirty);
+ }
+
+ // journal a primary dentry together with its inode (returns nothing)
+ void add_primary_dentry(CDentry *dn, CInode *in, bool dirty,
+ bool dirty_parent=false, bool dirty_pool=false,
+ bool need_snapflush=false) {
+ __u8 state = 0;
+ if (dirty) state |= fullbit::STATE_DIRTY;
+ if (dirty_parent) state |= fullbit::STATE_DIRTYPARENT;
+ if (dirty_pool) state |= fullbit::STATE_DIRTYPOOL;
+ if (need_snapflush) state |= fullbit::STATE_NEED_SNAPFLUSH;
+ add_primary_dentry(add_dir(dn->get_dir(), false), dn, in, state);
+ }
+ void add_primary_dentry(dirlump& lump, CDentry *dn, CInode *in, __u8 state) { // append a fullbit for dn and its inode to lump
+ if (!in) // no inode given: use the one from dn's projected linkage
+ in = dn->get_projected_linkage()->get_inode();
+
+ // make note of where this inode was last journaled
+ in->last_journaled = event_seq;
+ //cout << "journaling " << in->inode.ino << " at " << my_offset << std::endl;
+
+ const auto pi = in->get_projected_inode();
+ if ((state & fullbit::STATE_DIRTY) && pi->is_backtrace_updated()) // dirty inode with an updated backtrace is also flagged DIRTYPARENT
+ state |= fullbit::STATE_DIRTYPARENT;
+
+ bufferlist snapbl;
+ const sr_t *sr = in->get_projected_srnode();
+ if (sr) // snapbl stays empty when there is no projected srnode
+ sr->encode(snapbl);
+
+ lump.nfull++; // counter is bumped by hand alongside the dfull list
+ lump.add_dfull(dn->get_name(), dn->first, dn->last, dn->get_projected_version(),
+ *pi, in->dirfragtree, *in->get_projected_xattrs(), in->symlink,
+ in->oldest_snap, snapbl, state, &in->old_inodes);
+ }
+
+ // convenience: primary or remote? figure it out.
+ void add_dentry(CDentry *dn, bool dirty) {
+ dirlump& lump = add_dir(dn->get_dir(), false);
+ add_dentry(lump, dn, dirty, false, false);
+ }
+ void add_import_dentry(CDentry *dn) {
+ bool dirty_parent = false;
+ bool dirty_pool = false;
+ if (dn->get_linkage()->is_primary()) {
+ dirty_parent = dn->get_linkage()->get_inode()->is_dirty_parent();
+ dirty_pool = dn->get_linkage()->get_inode()->is_dirty_pool();
+ }
+ dirlump& lump = add_dir(dn->get_dir(), false);
+ add_dentry(lump, dn, dn->is_dirty(), dirty_parent, dirty_pool);
+ }
+ void add_dentry(dirlump& lump, CDentry *dn, bool dirty, bool dirty_parent, bool dirty_pool) { // dispatch on dn's projected linkage
+ // primary or remote
+ if (dn->get_projected_linkage()->is_remote()) {
+ add_remote_dentry(dn, dirty); // NOTE(review): `lump` is never used here; each callee looks the lump up again via add_dir()
+ return;
+ } else if (dn->get_projected_linkage()->is_null()) {
+ add_null_dentry(dn, dirty);
+ return;
+ }
+ ceph_assert(dn->get_projected_linkage()->is_primary()); // not remote and not null, so it must be primary
+ add_primary_dentry(dn, 0, dirty, dirty_parent, dirty_pool); // null inode: the callee resolves it from the projected linkage
+ }
+
+ void add_root(bool dirty, CInode *in) { // (re)record inode `in` in the roots list
+ in->last_journaled = event_seq;
+ //cout << "journaling " << in->inode.ino << " at " << my_offset << std::endl;
+
+ const auto& pi = *(in->get_projected_inode());
+ const auto& pdft = in->dirfragtree;
+ const auto& px = *(in->get_projected_xattrs());
+
+ bufferlist snapbl;
+ const sr_t *sr = in->get_projected_srnode();
+ if (sr) // snapbl stays empty when there is no projected srnode
+ sr->encode(snapbl);
+
+ for (auto p = roots.begin(); p != roots.end(); ++p) { // drop an earlier entry for the same ino, if any
+ if (p->inode.ino == in->ino()) {
+ roots.erase(p);
+ break; // at most one entry is removed; p is not used after the erase
+ }
+ }
+
+ string empty; // roots are recorded with an empty dentry name and version 0
+ roots.emplace_back(empty, in->first, in->last, 0, pi, pdft, px, in->symlink,
+ in->oldest_snap, snapbl, (dirty ? fullbit::STATE_DIRTY : 0),
+ &in->old_inodes);
+ }
+
+ dirlump& add_dir(CDir *dir, bool dirty, bool complete=false) {
+ return add_dir(dir->dirfrag(), dir->get_projected_fnode(), dir->get_projected_version(),
+ dirty, complete);
+ }
+ dirlump& add_new_dir(CDir *dir) {
+ return add_dir(dir->dirfrag(), dir->get_projected_fnode(), dir->get_projected_version(),
+ true, true, true); // dirty AND complete AND new
+ }
+ dirlump& add_import_dir(CDir *dir) {
+ // dirty=false would be okay in some cases
+ return add_dir(dir->dirfrag(), dir->get_projected_fnode(), dir->get_projected_version(),
+ dir->is_dirty(), dir->is_complete(), false, true, dir->is_dirty_dft());
+ }
+ dirlump& add_fragmented_dir(CDir *dir, bool dirty, bool dirtydft) {
+ return add_dir(dir->dirfrag(), dir->get_projected_fnode(), dir->get_projected_version(),
+ dirty, false, false, false, dirtydft);
+ }
+ dirlump& add_dir(dirfrag_t df, const fnode_t *pf, version_t pv, bool dirty, // find or create the lump for df and refresh it
+ bool complete=false, bool isnew=false,
+ bool importing=false, bool dirty_dft=false) {
+ auto [pos, inserted] = lump_map.try_emplace(df); // one lookup instead of count() followed by operator[]
+ if (inserted) // first time this dirfrag is seen: remember insertion order
+ lump_order.push_back(df);
+ dirlump& l = pos->second;
+ l.fnode = *pf; // fnode and version are overwritten on every call
+ l.fnode.version = pv;
+ if (complete) l.mark_complete(); // flags are only ever set here, never cleared
+ if (dirty) l.mark_dirty();
+ if (isnew) l.mark_new();
+ if (importing) l.mark_importing();
+ if (dirty_dft) l.mark_dirty_dft();
+ return l;
+ }
+
+ static const int TO_AUTH_SUBTREE_ROOT = 0; // default.
+ static const int TO_ROOT = 1;
+
+ void add_dir_context(CDir *dir, int mode = TO_AUTH_SUBTREE_ROOT);
+
+ bool empty() const { // true when none of the fields checked below hold anything; const because it only reads
+ return roots.empty() && lump_order.empty() && table_tids.empty() &&
+ truncate_start.empty() && truncate_finish.empty() &&
+ destroyed_inodes.empty() && client_reqs.empty() &&
+ opened_ino == 0 && inotablev == 0 && sessionmapv == 0;
+ } // NOTE(review): client_flushes and renamed_dirino are not consulted — confirm that is intended
+
+ void print(ostream& out) const { // one-line summary; this is the overload operator<< calls
+ out << "[metablob";
+ if (!lump_order.empty()) // first dirfrag added plus the total lump count
+ out << " " << lump_order.front() << ", " << lump_map.size() << " dirs";
+ if (!table_tids.empty())
+ out << " table_tids=" << table_tids;
+ if (allocated_ino || preallocated_inos.size()) {
+ if (allocated_ino)
+ out << " alloc_ino=" << allocated_ino;
+ if (preallocated_inos.size())
+ out << " prealloc_ino=" << preallocated_inos;
+ if (used_preallocated_ino) // only reached when one of the two conditions above holds
+ out << " used_prealloc_ino=" << used_preallocated_ino;
+ out << " v" << inotablev;
+ }
+ out << "]";
+ }
+
+ void update_segment(LogSegment *ls);
+ void replay(MDSRank *mds, LogSegment *ls, MDSlaveUpdate *su=NULL);
+};
+WRITE_CLASS_ENCODER_FEATURES(EMetaBlob)
+WRITE_CLASS_ENCODER_FEATURES(EMetaBlob::fullbit)
+WRITE_CLASS_ENCODER(EMetaBlob::remotebit)
+WRITE_CLASS_ENCODER(EMetaBlob::nullbit)
+WRITE_CLASS_ENCODER_FEATURES(EMetaBlob::dirlump)
+
+inline ostream& operator<<(ostream& out, const EMetaBlob& t) {
+ t.print(out);
+ return out;
+}
+
+#endif
diff --git a/src/mds/events/ENoOp.h b/src/mds/events/ENoOp.h
new file mode 100644
index 00000000..1bf5161e
--- /dev/null
+++ b/src/mds/events/ENoOp.h
@@ -0,0 +1,35 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#ifndef CEPH_MDS_ENOOP_H
+#define CEPH_MDS_ENOOP_H
+
+#include "../LogEvent.h"
+
+class ENoOp : public LogEvent { // log event of type EVENT_NOOP whose only state is a pad size
+ uint32_t pad_size; // set by the constructors; consumed by encode()/decode(), which are defined elsewhere
+
+public:
+ ENoOp() : LogEvent(EVENT_NOOP), pad_size(0) { }
+ explicit ENoOp(uint32_t size_) : LogEvent(EVENT_NOOP), pad_size(size_){ }
+
+ void encode(bufferlist& bl, uint64_t features) const override;
+ void decode(bufferlist::const_iterator& bl) override;
+ void dump(Formatter *f) const override {} // emits nothing
+
+ void replay(MDSRank *mds) override;
+};
+WRITE_CLASS_ENCODER_FEATURES(ENoOp)
+
+#endif
diff --git a/src/mds/events/EOpen.h b/src/mds/events/EOpen.h
new file mode 100644
index 00000000..192745d9
--- /dev/null
+++ b/src/mds/events/EOpen.h
@@ -0,0 +1,61 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#ifndef CEPH_MDS_EOPEN_H
+#define CEPH_MDS_EOPEN_H
+
+#include "../LogEvent.h"
+#include "EMetaBlob.h"
+
+class EOpen : public LogEvent { // EVENT_OPEN: a metablob plus the lists of inodes recorded as open
+public:
+ EMetaBlob metablob;
+ vector<inodeno_t> inos; // head inodes (see add_clean_inode / add_ino)
+ vector<vinodeno_t> snap_inos; // inodes whose `last` is not CEPH_NOSNAP
+
+ EOpen() : LogEvent(EVENT_OPEN) { }
+ explicit EOpen(MDLog *mdlog) : // mdlog is not used by this constructor
+ LogEvent(EVENT_OPEN) { }
+
+ void print(ostream& out) const override {
+ out << "EOpen " << metablob << ", " << inos.size() << " open files"; // snap_inos are not counted here
+ }
+
+ EMetaBlob *get_metablob() override { return &metablob; }
+
+ void add_clean_inode(CInode *in) { // journal `in` as clean and record it as open; base inodes are ignored
+ if (!in->is_base()) {
+ metablob.add_dir_context(in->get_projected_parent_dn()->get_dir());
+ metablob.add_primary_dentry(in->get_projected_parent_dn(), 0, false); // null inode, dirty=false
+ if (in->last == CEPH_NOSNAP)
+ inos.push_back(in->ino());
+ else
+ snap_inos.push_back(in->vino());
+ }
+ }
+ void add_ino(inodeno_t ino) { // record an ino without touching the metablob
+ inos.push_back(ino);
+ }
+
+ void encode(bufferlist& bl, uint64_t features) const override;
+ void decode(bufferlist::const_iterator& bl) override;
+ void dump(Formatter *f) const override;
+ static void generate_test_instances(list<EOpen*>& ls);
+
+ void update_segment() override;
+ void replay(MDSRank *mds) override;
+};
+WRITE_CLASS_ENCODER_FEATURES(EOpen)
+
+#endif
diff --git a/src/mds/events/EResetJournal.h b/src/mds/events/EResetJournal.h
new file mode 100644
index 00000000..3004978a
--- /dev/null
+++ b/src/mds/events/EResetJournal.h
@@ -0,0 +1,39 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+
+#ifndef CEPH_MDS_ERESETJOURNAL_H
+#define CEPH_MDS_ERESETJOURNAL_H
+
+#include "../LogEvent.h"
+
+// generic log event
+class EResetJournal : public LogEvent { // EVENT_RESETJOURNAL: carries no fields of its own
+ public:
+ EResetJournal() : LogEvent(EVENT_RESETJOURNAL) { }
+ ~EResetJournal() override {}
+
+ void encode(bufferlist& bl, uint64_t features) const override;
+ void decode(bufferlist::const_iterator& bl) override;
+ void dump(Formatter *f) const override;
+ static void generate_test_instances(list<EResetJournal*>& ls);
+ void print(ostream& out) const override { // prints just the event name
+ out << "EResetJournal";
+ }
+
+ void replay(MDSRank *mds) override; // defined elsewhere
+};
+WRITE_CLASS_ENCODER_FEATURES(EResetJournal)
+
+#endif
diff --git a/src/mds/events/ESession.h b/src/mds/events/ESession.h
new file mode 100644
index 00000000..0b65765e
--- /dev/null
+++ b/src/mds/events/ESession.h
@@ -0,0 +1,67 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#ifndef CEPH_MDS_ESESSION_H
+#define CEPH_MDS_ESESSION_H
+
+#include "common/config.h"
+#include "include/types.h"
+
+#include "../LogEvent.h"
+
+class ESession : public LogEvent { // EVENT_SESSION: one client session being opened or closed
+ protected:
+ entity_inst_t client_inst;
+ bool open; // open or close
+ version_t cmapv{0}; // client map version
+
+ interval_set<inodeno_t> inos; // only filled by the constructor that takes an interval_set
+ version_t inotablev{0};
+
+ // Client metadata stored during open
+ client_metadata_t client_metadata;
+
+ public:
+ ESession() : LogEvent(EVENT_SESSION), open(false) { }
+ ESession(const entity_inst_t& inst, bool o, version_t v, // variant carrying client metadata; inos stays empty
+ const client_metadata_t& cm) :
+ LogEvent(EVENT_SESSION),
+ client_inst(inst), open(o), cmapv(v), inotablev(0),
+ client_metadata(cm) { }
+ ESession(const entity_inst_t& inst, bool o, version_t v, // variant carrying inos and an inotable version; client_metadata is default-constructed
+ const interval_set<inodeno_t>& i, version_t iv) :
+ LogEvent(EVENT_SESSION),
+ client_inst(inst), open(o), cmapv(v), inos(i), inotablev(iv) { }
+
+ void encode(bufferlist& bl, uint64_t features) const override;
+ void decode(bufferlist::const_iterator& bl) override;
+ void dump(Formatter *f) const override;
+ static void generate_test_instances(list<ESession*>& ls);
+
+ void print(ostream& out) const override { // "ESession <inst> open|close cmapv <v>", plus ino info when inos is non-empty
+ if (open)
+ out << "ESession " << client_inst << " open cmapv " << cmapv;
+ else
+ out << "ESession " << client_inst << " close cmapv " << cmapv;
+ if (inos.size())
+ out << " (" << inos.size() << " inos, v" << inotablev << ")";
+ }
+
+ void update_segment() override;
+ void replay(MDSRank *mds) override;
+ entity_inst_t get_client_inst() const {return client_inst;} // returns a copy
+};
+WRITE_CLASS_ENCODER_FEATURES(ESession)
+
+#endif
diff --git a/src/mds/events/ESessions.h b/src/mds/events/ESessions.h
new file mode 100644
index 00000000..aa0eeff8
--- /dev/null
+++ b/src/mds/events/ESessions.h
@@ -0,0 +1,61 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#ifndef CEPH_MDS_ESESSIONS_H
+#define CEPH_MDS_ESESSIONS_H
+
+#include "common/config.h"
+#include "include/types.h"
+
+#include "../LogEvent.h"
+
+class ESessions : public LogEvent { // EVENT_SESSIONS: a whole client map with per-client metadata
+protected:
+ version_t cmapv; // client map version
+ bool old_style_encode; // selects decode_old() over decode_new(); see decode()
+
+public:
+ map<client_t,entity_inst_t> client_map;
+ map<client_t,client_metadata_t> client_metadata_map;
+
+ ESessions() : LogEvent(EVENT_SESSIONS), cmapv(0), old_style_encode(false) { }
+ ESessions(version_t pv, map<client_t,entity_inst_t>&& cm, // both maps are moved into the event
+ map<client_t,client_metadata_t>&& cmm) :
+ LogEvent(EVENT_SESSIONS),
+ cmapv(pv), old_style_encode(false),
+ client_map(std::move(cm)),
+ client_metadata_map(std::move(cmm)) {}
+
+ void mark_old_encoding() { old_style_encode = true; } // has to be called before decode() to take effect
+
+ void encode(bufferlist &bl, uint64_t features) const override;
+ void decode_old(bufferlist::const_iterator &bl);
+ void decode_new(bufferlist::const_iterator &bl);
+ void decode(bufferlist::const_iterator &bl) override { // dispatch on the flag set by mark_old_encoding()
+ if (old_style_encode) decode_old(bl);
+ else decode_new(bl);
+ }
+ void dump(Formatter *f) const override;
+ static void generate_test_instances(list<ESessions*>& ls);
+
+ void print(ostream& out) const override {
+ out << "ESessions " << client_map.size() << " opens cmapv " << cmapv;
+ }
+
+ void update_segment() override;
+ void replay(MDSRank *mds) override;
+};
+WRITE_CLASS_ENCODER_FEATURES(ESessions)
+
+#endif
diff --git a/src/mds/events/ESlaveUpdate.h b/src/mds/events/ESlaveUpdate.h
new file mode 100644
index 00000000..23ca430b
--- /dev/null
+++ b/src/mds/events/ESlaveUpdate.h
@@ -0,0 +1,157 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#ifndef CEPH_MDS_ESLAVEUPDATE_H
+#define CEPH_MDS_ESLAVEUPDATE_H
+
+#include <string_view>
+
+#include "../LogEvent.h"
+#include "EMetaBlob.h"
+
+/*
+ * rollback records, for remote/slave updates, which may need to be manually
+ * rolled back during journal replay. (or while active if master fails, but in
+ * that case these records aren't needed.)
+ */
+struct link_rollback { // rollback record for a link operation (see the comment block above)
+ metareqid_t reqid;
+ inodeno_t ino;
+ bool was_inc; // presumably whether the link count was incremented — confirm against the encoder's users
+ utime_t old_ctime;
+ utime_t old_dir_mtime;
+ utime_t old_dir_rctime;
+ bufferlist snapbl;
+
+ link_rollback() : ino(0), was_inc(false) {} // remaining members use their own default constructors
+
+ void encode(bufferlist& bl) const;
+ void decode(bufferlist::const_iterator& bl);
+ void dump(Formatter *f) const;
+ static void generate_test_instances(list<link_rollback*>& ls);
+};
+WRITE_CLASS_ENCODER(link_rollback)
+
+/*
+ * this is only used on an empty dir with a dirfrag on a remote node.
+ * we are auth for nothing. all we need to do is relink the directory
+ * in the hierarchy properly during replay to avoid breaking the
+ * subtree map.
+ */
+struct rmdir_rollback { // rollback record for rmdir: source and destination dirfrag plus dentry name
+ metareqid_t reqid;
+ dirfrag_t src_dir;
+ string src_dname;
+ dirfrag_t dest_dir;
+ string dest_dname;
+ bufferlist snapbl;
+
+ void encode(bufferlist& bl) const; // no user-declared constructor: members are default-initialized
+ void decode(bufferlist::const_iterator& bl);
+ void dump(Formatter *f) const;
+ static void generate_test_instances(list<rmdir_rollback*>& ls);
+};
+WRITE_CLASS_ENCODER(rmdir_rollback)
+
+struct rename_rollback { // rollback record for rename: three dentry records plus snapshot blobs
+ struct drec { // saved state of one dentry and its dirfrag
+ dirfrag_t dirfrag;
+ utime_t dirfrag_old_mtime;
+ utime_t dirfrag_old_rctime;
+ inodeno_t ino, remote_ino;
+ string dname;
+ char remote_d_type;
+ utime_t old_ctime;
+
+ drec() : remote_d_type((char)S_IFREG) {} // NOTE(review): S_IFREG is a file-mode mask and may not survive the cast to char — confirm the intended default
+
+ void encode(bufferlist& bl) const;
+ void decode(bufferlist::const_iterator& bl);
+ void dump(Formatter *f) const;
+ static void generate_test_instances(list<drec*>& ls);
+ };
+ WRITE_CLASS_MEMBER_ENCODER(drec)
+
+ metareqid_t reqid;
+ drec orig_src, orig_dest;
+ drec stray; // we know this is null, but we want dname, old mtime/rctime
+ utime_t ctime;
+ bufferlist srci_snapbl;
+ bufferlist desti_snapbl;
+
+ void encode(bufferlist& bl) const;
+ void decode(bufferlist::const_iterator& bl);
+ void dump(Formatter *f) const;
+ static void generate_test_instances(list<rename_rollback*>& ls);
+};
+WRITE_CLASS_ENCODER(rename_rollback::drec)
+WRITE_CLASS_ENCODER(rename_rollback)
+
+
+class ESlaveUpdate : public LogEvent { // EVENT_SLAVEUPDATE: one step (op) of a slave-side link/rename/rmdir
+public:
+ const static int OP_PREPARE = 1; // values for `op`
+ const static int OP_COMMIT = 2;
+ const static int OP_ROLLBACK = 3;
+
+ const static int LINK = 1; // values for `origop`
+ const static int RENAME = 2;
+ const static int RMDIR = 3;
+
+ /*
+ * we journal a rollback metablob that contains the unmodified metadata
+ * too, because we may be updating previously dirty metadata, which
+ * will allow old log segments to be trimmed. if we end up rolling back,
+ * those updates could be lost.. so we re-journal the unmodified metadata,
+ * and replay will apply _either_ commit or rollback.
+ */
+ EMetaBlob commit;
+ bufferlist rollback;
+ string type;
+ metareqid_t reqid;
+ mds_rank_t master;
+ __u8 op; // prepare, commit, rollback (OP_* above)
+ __u8 origop; // link | rename | rmdir
+
+ ESlaveUpdate() : LogEvent(EVENT_SLAVEUPDATE), master(0), op(0), origop(0) { }
+ ESlaveUpdate(MDLog *mdlog, std::string_view s, metareqid_t ri, int mastermds, int o, int oo) : // mdlog is not used here
+ LogEvent(EVENT_SLAVEUPDATE),
+ type(s),
+ reqid(ri),
+ master(mastermds),
+ op(o), origop(oo) { }
+
+ void print(ostream& out) const override { // "<type> <op> <origop name> <reqid> for mds.<master><commit blob>"
+ if (type.length()) out << type << " ";
+ out << " " << (int)op;
+ if (origop == LINK) out << " link";
+ if (origop == RENAME) out << " rename";
+ if (origop == RMDIR) out << " rmdir"; // previously rmdir updates printed no operation name
+ out << " " << reqid;
+ out << " for mds." << master;
+ out << commit;
+ }
+
+ EMetaBlob *get_metablob() override { return &commit; }
+
+ void encode(bufferlist& bl, uint64_t features) const override;
+ void decode(bufferlist::const_iterator& bl) override;
+ void dump(Formatter *f) const override;
+ static void generate_test_instances(list<ESlaveUpdate*>& ls);
+
+ void replay(MDSRank *mds) override;
+};
+WRITE_CLASS_ENCODER_FEATURES(ESlaveUpdate)
+
+#endif
diff --git a/src/mds/events/ESubtreeMap.h b/src/mds/events/ESubtreeMap.h
new file mode 100644
index 00000000..08d4a581
--- /dev/null
+++ b/src/mds/events/ESubtreeMap.h
@@ -0,0 +1,48 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#ifndef CEPH_MDS_ESUBTREEMAP_H
+#define CEPH_MDS_ESUBTREEMAP_H
+
+#include "../LogEvent.h"
+#include "EMetaBlob.h"
+
+class ESubtreeMap : public LogEvent { // EVENT_SUBTREEMAP: a metablob plus a subtree map
+public:
+ EMetaBlob metablob;
+ map<dirfrag_t, vector<dirfrag_t> > subtrees; // presumably subtree root -> its bounds — confirm against the code that fills it
+ set<dirfrag_t> ambiguous_subtrees;
+ uint64_t expire_pos;
+ uint64_t event_seq;
+
+ ESubtreeMap() : LogEvent(EVENT_SUBTREEMAP), expire_pos(0), event_seq(0) { }
+
+ void print(ostream& out) const override { // counts of both subtree containers, then the metablob summary
+ out << "ESubtreeMap " << subtrees.size() << " subtrees "
+ << ", " << ambiguous_subtrees.size() << " ambiguous "
+ << metablob;
+ }
+
+ EMetaBlob *get_metablob() override { return &metablob; }
+
+ void encode(bufferlist& bl, uint64_t features) const override;
+ void decode(bufferlist::const_iterator& bl) override;
+ void dump(Formatter *f) const override;
+ static void generate_test_instances(list<ESubtreeMap*>& ls);
+
+ void replay(MDSRank *mds) override;
+};
+WRITE_CLASS_ENCODER_FEATURES(ESubtreeMap)
+
+#endif
diff --git a/src/mds/events/ETableClient.h b/src/mds/events/ETableClient.h
new file mode 100644
index 00000000..bf3e752d
--- /dev/null
+++ b/src/mds/events/ETableClient.h
@@ -0,0 +1,49 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#ifndef CEPH_MDS_ETABLECLIENT_H
+#define CEPH_MDS_ETABLECLIENT_H
+
+#include "common/config.h"
+#include "include/types.h"
+
+#include "../mds_table_types.h"
+#include "../LogEvent.h"
+
+struct ETableClient : public LogEvent { // EVENT_TABLECLIENT: (table, op, tid) triple
+ __u16 table;
+ __s16 op;
+ version_t tid;
+
+ ETableClient() : LogEvent(EVENT_TABLECLIENT), table(0), op(0), tid(0) { }
+ ETableClient(int t, int o, version_t ti) : // int arguments are narrowed to the __u16/__s16 members
+ LogEvent(EVENT_TABLECLIENT),
+ table(t), op(o), tid(ti) { }
+
+ void encode(bufferlist& bl, uint64_t features) const override;
+ void decode(bufferlist::const_iterator& bl) override;
+ void dump(Formatter *f) const override;
+ static void generate_test_instances(list<ETableClient*>& ls);
+
+ void print(ostream& out) const override { // table name, op name, and the tid when it is non-zero
+ out << "ETableClient " << get_mdstable_name(table) << " " << get_mdstableserver_opname(op);
+ if (tid) out << " tid " << tid;
+ }
+
+ //void update_segment();
+ void replay(MDSRank *mds) override;
+};
+WRITE_CLASS_ENCODER_FEATURES(ETableClient)
+
+#endif
diff --git a/src/mds/events/ETableServer.h b/src/mds/events/ETableServer.h
new file mode 100644
index 00000000..0005b132
--- /dev/null
+++ b/src/mds/events/ETableServer.h
@@ -0,0 +1,59 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#ifndef CEPH_MDS_ETABLESERVER_H
+#define CEPH_MDS_ETABLESERVER_H
+
+#include "common/config.h"
+#include "include/types.h"
+
+#include "../mds_table_types.h"
+#include "../LogEvent.h"
+
+struct ETableServer : public LogEvent { // EVENT_TABLESERVER: one table-server operation and its optional mutation blob
+ __u16 table;
+ __s16 op;
+ uint64_t reqid;
+ mds_rank_t bymds; // MDS_RANK_NONE by default
+ bufferlist mutation; // not set by either constructor; callers fill it in
+ version_t tid;
+ version_t version;
+
+ ETableServer() : LogEvent(EVENT_TABLESERVER), table(0), op(0),
+ reqid(0), bymds(MDS_RANK_NONE), tid(0), version(0) { }
+ ETableServer(int t, int o, uint64_t ri, mds_rank_t m, version_t ti, version_t v) : // int arguments are narrowed to the __u16/__s16 members
+ LogEvent(EVENT_TABLESERVER),
+ table(t), op(o), reqid(ri), bymds(m), tid(ti), version(v) { }
+
+ void encode(bufferlist& bl, uint64_t features) const override;
+ void decode(bufferlist::const_iterator& bl) override;
+ void dump(Formatter *f) const override;
+ static void generate_test_instances(list<ETableServer*>& ls);
+
+ void print(ostream& out) const override { // table and op names, then each optional field only when it is set
+ out << "ETableServer " << get_mdstable_name(table)
+ << " " << get_mdstableserver_opname(op);
+ if (reqid) out << " reqid " << reqid;
+ if (bymds >= 0) out << " mds." << bymds; // negative ranks are not printed
+ if (tid) out << " tid " << tid;
+ if (version) out << " version " << version;
+ if (mutation.length()) out << " mutation=" << mutation.length() << " bytes";
+ }
+
+ void update_segment() override;
+ void replay(MDSRank *mds) override;
+};
+WRITE_CLASS_ENCODER_FEATURES(ETableServer)
+
+#endif
diff --git a/src/mds/events/EUpdate.h b/src/mds/events/EUpdate.h
new file mode 100644
index 00000000..dc710d52
--- /dev/null
+++ b/src/mds/events/EUpdate.h
@@ -0,0 +1,55 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#ifndef CEPH_MDS_EUPDATE_H
+#define CEPH_MDS_EUPDATE_H
+
+#include <string_view>
+
+#include "../LogEvent.h"
+#include "EMetaBlob.h"
+
+class EUpdate : public LogEvent { // EVENT_UPDATE: a typed metadata update carried in a metablob
+public:
+ EMetaBlob metablob;
+ string type; // free-form label given to the constructor
+ bufferlist client_map;
+ version_t cmapv;
+ metareqid_t reqid;
+ bool had_slaves;
+
+ EUpdate() : LogEvent(EVENT_UPDATE), cmapv(0), had_slaves(false) { }
+ EUpdate(MDLog *mdlog, std::string_view s) : // mdlog is not used by this constructor
+ LogEvent(EVENT_UPDATE),
+ type(s), cmapv(0), had_slaves(false) { }
+
+ void print(ostream& out) const override {
+ if (type.length()) // with an empty type the "EUpdate" prefix is omitted as well
+ out << "EUpdate " << type << " ";
+ out << metablob;
+ }
+
+ EMetaBlob *get_metablob() override { return &metablob; }
+
+ void encode(bufferlist& bl, uint64_t features) const override;
+ void decode(bufferlist::const_iterator& bl) override;
+ void dump(Formatter *f) const override;
+ static void generate_test_instances(list<EUpdate*>& ls);
+
+ void update_segment() override;
+ void replay(MDSRank *mds) override;
+};
+WRITE_CLASS_ENCODER_FEATURES(EUpdate)
+
+#endif
diff --git a/src/mds/flock.cc b/src/mds/flock.cc
new file mode 100644
index 00000000..2382322b
--- /dev/null
+++ b/src/mds/flock.cc
@@ -0,0 +1,596 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+#include <errno.h>
+
+#include "common/debug.h"
+#include "mdstypes.h"
+#include "mds/flock.h"
+
+#define dout_subsys ceph_subsys_mds
+
+static multimap<ceph_filelock, ceph_lock_state_t*> global_waiting_locks;
+
+/*
+ * Drop one entry for (fl, lock_state) from global_waiting_locks, if present.
+ *
+ * equal_range() is used instead of find(): multimap::find() may return any
+ * of several equivalent keys, so scanning forward from its result could miss
+ * a matching entry stored earlier in the run of duplicates.
+ *
+ * @param fl          the waiting lock used as the key
+ * @param lock_state  the lock state that registered the waiter
+ */
+static void remove_global_waiting(ceph_filelock &fl, ceph_lock_state_t *lock_state)
+{
+  auto range = global_waiting_locks.equal_range(fl);
+  for (auto p = range.first; p != range.second; ++p) {
+    if (p->second == lock_state) {
+      // only one entry is removed per call, matching one add_waiting()
+      global_waiting_locks.erase(p);
+      return;
+    }
+  }
+}
+
+// For fcntl-type lock states, unregister every remaining waiter from
+// global_waiting_locks before this object goes away.
+ceph_lock_state_t::~ceph_lock_state_t()
+{
+  if (type != CEPH_LOCK_FCNTL)
+    return;
+  for (auto& waiter : waiting_locks)
+    remove_global_waiting(waiter.second, this);
+}
+
+/*
+ * Report whether a lock with the same start, length and owner as fl is
+ * queued in waiting_locks.
+ *
+ * waiting_locks is keyed by start offset, so only the entries whose key
+ * equals fl.start are candidates.  equal_range() visits all of them;
+ * multimap::find() is not required to return the first of several equal keys.
+ *
+ * @param fl  the lock to look for
+ * @returns true if a matching waiter exists
+ */
+bool ceph_lock_state_t::is_waiting(const ceph_filelock &fl) const
+{
+  auto range = waiting_locks.equal_range(fl.start);
+  for (auto p = range.first; p != range.second; ++p) {
+    if (p->second.length == fl.length &&
+        ceph_filelock_owner_equal(p->second, fl))
+      return true;
+  }
+  return false;
+}
+
+/*
+ * Remove the first waiter matching fl (same start, length and owner) from
+ * waiting_locks, unregister it from global_waiting_locks for fcntl-type
+ * states, and decrement the owning client's waiting count (erasing the
+ * counter when it reaches zero).  Does nothing if no waiter matches.
+ *
+ * equal_range() is used because multimap::find() may return any of several
+ * entries with the same key.  The client id is copied before the erase so
+ * the bookkeeping does not read fl after the element is gone, which matters
+ * if the caller passed a reference to the stored element itself.
+ *
+ * @param fl  the waiting lock to remove
+ */
+void ceph_lock_state_t::remove_waiting(const ceph_filelock& fl)
+{
+  auto range = waiting_locks.equal_range(fl.start);
+  for (auto p = range.first; p != range.second; ++p) {
+    if (p->second.length != fl.length ||
+        !ceph_filelock_owner_equal(p->second, fl))
+      continue;
+
+    const client_t client = (client_t)fl.client;
+    if (type == CEPH_LOCK_FCNTL) {
+      remove_global_waiting(p->second, this);
+    }
+    waiting_locks.erase(p);
+    --client_waiting_lock_counts[client];
+    if (!client_waiting_lock_counts[client]) {
+      client_waiting_lock_counts.erase(client);
+    }
+    return;
+  }
+}
+
+/*
+ * Check whether blocking on fl would close a wait-for cycle.
+ *
+ * Only fcntl-type lock states are examined; any other type returns false.
+ * The owners of the conflicting locks are collected, and for every lock
+ * those owners are themselves waiting on (per global_waiting_locks) the
+ * check recurses.  A cycle is reported when a conflicting lock is owned by
+ * first_fl's owner.  Recursion stops at MAX_DEADLK_DEPTH.
+ *
+ * @param fl                the lock that is about to wait
+ * @param overlapping_locks held locks overlapping fl
+ * @param first_fl          the lock that started the search (NULL at depth 0)
+ * @param depth             current recursion depth
+ * @returns true if a cycle back to first_fl's owner was found
+ */
+bool ceph_lock_state_t::is_deadlock(const ceph_filelock& fl,
+ list<multimap<uint64_t, ceph_filelock>::iterator>&
+ overlapping_locks,
+ const ceph_filelock *first_fl, unsigned depth) const
+{
+ ldout(cct,15) << "is_deadlock " << fl << dendl;
+
+ // only for posix lock
+ if (type != CEPH_LOCK_FCNTL)
+ return false;
+
+ // find conflict locks' owners
+ set<ceph_filelock> lock_owners;
+ for (auto p = overlapping_locks.begin();
+ p != overlapping_locks.end();
+ ++p) {
+
+ // two shared locks do not conflict with each other
+ if (fl.type == CEPH_LOCK_SHARED &&
+ (*p)->second.type == CEPH_LOCK_SHARED)
+ continue;
+
+ // circle detected
+ if (first_fl && ceph_filelock_owner_equal(*first_fl, (*p)->second)) {
+ ldout(cct,15) << " detect deadlock" << dendl;
+ return true;
+ }
+
+ // keep only the owner identity: range and type are zeroed so the entry
+ // can serve as a lower_bound() key into global_waiting_locks below
+ ceph_filelock tmp = (*p)->second;
+ tmp.start = 0;
+ tmp.length = 0;
+ tmp.type = 0;
+ lock_owners.insert(tmp);
+ }
+
+ // bound the search
+ if (depth >= MAX_DEADLK_DEPTH)
+ return false;
+
+ first_fl = first_fl ? first_fl : &fl;
+ for (auto p = lock_owners.begin();
+ p != lock_owners.end();
+ ++p) {
+ ldout(cct,15) << " conflict lock owner " << *p << dendl;
+ // is the conflicting lock's owner itself waiting for some other lock?
+ for (auto q = global_waiting_locks.lower_bound(*p);
+ q != global_waiting_locks.end();
+ ++q) {
+ if (!ceph_filelock_owner_equal(q->first, *p))
+ break;
+
+ // find what blocks that waiter in its own lock state, ignoring the
+ // waiter's own locks
+ list<multimap<uint64_t, ceph_filelock>::iterator>
+ _overlapping_locks, _self_overlapping_locks;
+ ceph_lock_state_t& state = *(q->second);
+ if (state.get_overlapping_locks(q->first, _overlapping_locks)) {
+ state.split_by_owner(q->first, _overlapping_locks, _self_overlapping_locks);
+ }
+ if (!_overlapping_locks.empty()) {
+ if (is_deadlock(q->first, _overlapping_locks, first_fl, depth + 1))
+ return true;
+ }
+ }
+ }
+ return false;
+}
+
+// Queue fl as a waiter: index it by start offset, bump the client's waiting
+// count, and for fcntl-type states also register it in global_waiting_locks.
+void ceph_lock_state_t::add_waiting(const ceph_filelock& fl)
+{
+  waiting_locks.emplace(fl.start, fl);
+  client_waiting_lock_counts[(client_t)fl.client] += 1;
+  if (type != CEPH_LOCK_FCNTL)
+    return;
+  global_waiting_locks.emplace(fl, this);
+}
+
+/*
+ * Try to acquire new_lock.
+ *
+ * Overlapping held locks are split into those owned by the requester and
+ * those owned by others.  If others' locks block the request (any overlap
+ * for an exclusive request, an exclusive overlap for a shared request) the
+ * lock is not set; when wait_on_fail is true and replay is false it is
+ * either queued as a waiter or, if is_deadlock() reports a cycle,
+ * *deadlock is set to true instead.  Otherwise the requester's own
+ * overlapping/neighboring locks are adjusted and new_lock is inserted.
+ *
+ * @param new_lock     lock to set; may be widened by adjust_locks()
+ * @param wait_on_fail queue the lock as a waiter when it is blocked
+ * @param replay       when true, a blocked lock is never queued
+ * @param deadlock     written (to true) only when a deadlock is detected;
+ *                     it is dereferenced unconditionally on that path
+ * @returns true if the lock was inserted into held_locks
+ */
+bool ceph_lock_state_t::add_lock(ceph_filelock& new_lock,
+ bool wait_on_fail, bool replay,
+ bool *deadlock)
+{
+ ldout(cct,15) << "add_lock " << new_lock << dendl;
+ bool ret = false;
+ list<multimap<uint64_t, ceph_filelock>::iterator>
+ overlapping_locks, self_overlapping_locks, neighbor_locks;
+
+ // first, get any overlapping locks and split them into owned-by-us and not
+ if (get_overlapping_locks(new_lock, overlapping_locks, &neighbor_locks)) {
+ ldout(cct,15) << "got overlapping lock, splitting by owner" << dendl;
+ split_by_owner(new_lock, overlapping_locks, self_overlapping_locks);
+ }
+ if (!overlapping_locks.empty()) { //overlapping locks owned by others :(
+ if (CEPH_LOCK_EXCL == new_lock.type) {
+ //can't set, we want an exclusive
+ ldout(cct,15) << "overlapping lock, and this lock is exclusive, can't set"
+ << dendl;
+ if (wait_on_fail && !replay) {
+ if (is_deadlock(new_lock, overlapping_locks))
+ *deadlock = true;
+ else
+ add_waiting(new_lock);
+ }
+ } else { //shared lock, check for any exclusive locks blocking us
+ if (contains_exclusive_lock(overlapping_locks)) { //blocked :(
+ ldout(cct,15) << " blocked by exclusive lock in overlapping_locks" << dendl;
+ if (wait_on_fail && !replay) {
+ if (is_deadlock(new_lock, overlapping_locks))
+ *deadlock = true;
+ else
+ add_waiting(new_lock);
+ }
+ } else {
+ //yay, we can insert a shared lock
+ ldout(cct,15) << "inserting shared lock" << dendl;
+ // the request may have been queued earlier; it is no longer waiting
+ remove_waiting(new_lock);
+ adjust_locks(self_overlapping_locks, new_lock, neighbor_locks);
+ held_locks.insert(pair<uint64_t, ceph_filelock>(new_lock.start, new_lock));
+ ret = true;
+ }
+ }
+ } else { //no overlapping locks except our own
+ remove_waiting(new_lock);
+ adjust_locks(self_overlapping_locks, new_lock, neighbor_locks);
+ ldout(cct,15) << "no conflicts, inserting " << new_lock << dendl;
+ held_locks.insert(pair<uint64_t, ceph_filelock>
+ (new_lock.start, new_lock));
+ ret = true;
+ }
+ if (ret) {
+ // account for the lock just inserted into held_locks
+ ++client_held_lock_counts[(client_t)new_lock.client];
+ }
+ return ret;
+}
+
+/*
+ * Test whether testing_lock could be granted.  If a lock held by another
+ * owner blocks it, testing_lock is overwritten with that blocking lock;
+ * otherwise only its type is set to CEPH_LOCK_UNLOCK.
+ */
+void ceph_lock_state_t::look_for_lock(ceph_filelock& testing_lock)
+{
+  list<multimap<uint64_t, ceph_filelock>::iterator> others, mine;
+  if (get_overlapping_locks(testing_lock, others)) {
+    split_by_owner(testing_lock, others, mine);
+  }
+
+  // pick the lock that blocks us, if any
+  const ceph_filelock *blocker = nullptr;
+  if (!others.empty()) {
+    if (CEPH_LOCK_EXCL == testing_lock.type) {
+      // an exclusive request is blocked by any foreign overlap
+      blocker = &others.front()->second;
+    } else {
+      // a shared request is blocked only by an exclusive lock
+      blocker = contains_exclusive_lock(others);
+    }
+  }
+
+  if (blocker) {
+    testing_lock = *blocker;
+  } else {
+    // nothing foreign is in the way (only our own locks, or none)
+    testing_lock.type = CEPH_LOCK_UNLOCK;
+  }
+}
+
+/*
+ * Release the byte range described by removal_lock from the locks held by
+ * the same owner.  Each overlapping lock of that owner is trimmed, split in
+ * two, or erased as required, and client_held_lock_counts is kept in step.
+ * A length of 0 means "to end of file" for both the removal range and the
+ * existing locks.
+ *
+ * @param removal_lock    owner and range to unlock
+ * @param activated_locks not written by this function body
+ */
+void ceph_lock_state_t::remove_lock(ceph_filelock removal_lock,
+ list<ceph_filelock>& activated_locks)
+{
+ list<multimap<uint64_t, ceph_filelock>::iterator> overlapping_locks,
+ self_overlapping_locks;
+ if (get_overlapping_locks(removal_lock, overlapping_locks)) {
+ ldout(cct,15) << "splitting by owner" << dendl;
+ split_by_owner(removal_lock, overlapping_locks, self_overlapping_locks);
+ } else ldout(cct,15) << "attempt to remove lock at " << removal_lock.start
+ << " but no locks there!" << dendl;
+ bool remove_to_end = (0 == removal_lock.length);
+ uint64_t removal_start = removal_lock.start;
+ // only meaningful when !remove_to_end
+ uint64_t removal_end = removal_start + removal_lock.length - 1;
+ __s64 old_lock_client = 0;
+ ceph_filelock *old_lock;
+
+ ldout(cct,15) << "examining " << self_overlapping_locks.size()
+ << " self-overlapping locks for removal" << dendl;
+ for (list<multimap<uint64_t, ceph_filelock>::iterator>::iterator
+ iter = self_overlapping_locks.begin();
+ iter != self_overlapping_locks.end();
+ ++iter) {
+ ldout(cct,15) << "self overlapping lock " << (*iter)->second << dendl;
+ old_lock = &(*iter)->second;
+ bool old_lock_to_end = (0 == old_lock->length);
+ // only meaningful when !old_lock_to_end
+ uint64_t old_lock_end = old_lock->start + old_lock->length - 1;
+ // copied now because old_lock may be erased below
+ old_lock_client = old_lock->client;
+ if (remove_to_end) {
+ // removal runs to EOF: keep only the part before removal_start
+ if (old_lock->start < removal_start) {
+ old_lock->length = removal_start - old_lock->start;
+ } else {
+ ldout(cct,15) << "erasing " << (*iter)->second << dendl;
+ held_locks.erase(*iter);
+ --client_held_lock_counts[old_lock_client];
+ }
+ } else if (old_lock_to_end) {
+ // old lock runs to EOF: the tail after the removed range survives
+ ceph_filelock append_lock = *old_lock;
+ append_lock.start = removal_end+1;
+ held_locks.insert(pair<uint64_t, ceph_filelock>
+ (append_lock.start, append_lock));
+ ++client_held_lock_counts[(client_t)old_lock->client];
+ if (old_lock->start >= removal_start) {
+ ldout(cct,15) << "erasing " << (*iter)->second << dendl;
+ held_locks.erase(*iter);
+ --client_held_lock_counts[old_lock_client];
+ } else old_lock->length = removal_start - old_lock->start;
+ } else {
+ // both ranges are finite
+ if (old_lock_end > removal_end) {
+ // part of the old lock survives after the removed range
+ ceph_filelock append_lock = *old_lock;
+ append_lock.start = removal_end + 1;
+ append_lock.length = old_lock_end - append_lock.start + 1;
+ held_locks.insert(pair<uint64_t, ceph_filelock>
+ (append_lock.start, append_lock));
+ ++client_held_lock_counts[(client_t)old_lock->client];
+ }
+ if (old_lock->start < removal_start) {
+ old_lock->length = removal_start - old_lock->start;
+ } else {
+ ldout(cct,15) << "erasing " << (*iter)->second << dendl;
+ held_locks.erase(*iter);
+ --client_held_lock_counts[old_lock_client];
+ }
+ }
+ // drop the per-client counter once it reaches zero
+ if (!client_held_lock_counts[old_lock_client]) {
+ client_held_lock_counts.erase(old_lock_client);
+ }
+ }
+}
+
+/*
+ * Drop every held and waiting lock that belongs to client, together with
+ * the client's counters.
+ *
+ * @returns true only if the client had held locks recorded; removing
+ *          waiters alone does not set the result.
+ */
+bool ceph_lock_state_t::remove_all_from (client_t client)
+{
+  bool cleared_any = false;
+
+  if (client_held_lock_counts.count(client)) {
+    for (auto held = held_locks.begin(); held != held_locks.end(); ) {
+      if ((client_t)held->second.client == client)
+        held = held_locks.erase(held);
+      else
+        ++held;
+    }
+    client_held_lock_counts.erase(client);
+    cleared_any = true;
+  }
+
+  if (client_waiting_lock_counts.count(client)) {
+    for (auto waiting = waiting_locks.begin(); waiting != waiting_locks.end(); ) {
+      if ((client_t)waiting->second.client != client) {
+        ++waiting;
+        continue;
+      }
+      // fcntl waiters are also tracked globally for deadlock detection
+      if (type == CEPH_LOCK_FCNTL) {
+        remove_global_waiting(waiting->second, this);
+      }
+      waiting = waiting_locks.erase(waiting);
+    }
+    client_waiting_lock_counts.erase(client);
+  }
+
+  return cleared_any;
+}
+
+/*
+ * Reconcile the requester's existing locks with new_lock before new_lock is
+ * inserted into held_locks.
+ *
+ * For each overlapping lock of the same owner: if it has the same type it is
+ * absorbed into new_lock (new_lock's start/length are widened and the old
+ * lock is erased); if it has a different type it is trimmed, split or erased
+ * so that it no longer overlaps new_lock.  Afterwards same-type neighbors
+ * that touch new_lock are coalesced into it.  client_held_lock_counts is
+ * updated for every insert/erase performed here.
+ *
+ * A length of 0 means "to end of file".
+ *
+ * The merged end offset is kept as uint64_t: storing it in an int (as the
+ * code previously did) truncates offsets that do not fit in 32 bits and
+ * yields a wrong merged length.
+ *
+ * @param old_locks      same-owner locks overlapping new_lock; the iterators
+ *                       may be invalidated here
+ * @param new_lock       the lock about to be inserted; modified in place
+ * @param neighbor_locks same-owner locks adjacent to new_lock
+ */
+void ceph_lock_state_t::adjust_locks(list<multimap<uint64_t, ceph_filelock>::iterator> old_locks,
+                                     ceph_filelock& new_lock,
+                                     list<multimap<uint64_t, ceph_filelock>::iterator>
+                                     neighbor_locks)
+{
+  ldout(cct,15) << "adjust_locks" << dendl;
+  bool new_lock_to_end = (0 == new_lock.length);
+  __s64 old_lock_client = 0;
+  ceph_filelock *old_lock;
+  for (auto iter = old_locks.begin(); iter != old_locks.end(); ++iter) {
+    old_lock = &(*iter)->second;
+    ldout(cct,15) << "adjusting lock: " << *old_lock << dendl;
+    bool old_lock_to_end = (0 == old_lock->length);
+    uint64_t old_lock_start = old_lock->start;
+    uint64_t old_lock_end = old_lock->start + old_lock->length - 1;
+    uint64_t new_lock_start = new_lock.start;
+    uint64_t new_lock_end = new_lock.start + new_lock.length - 1;
+    // copied now because old_lock may be erased below
+    old_lock_client = old_lock->client;
+    if (new_lock_to_end || old_lock_to_end) {
+      //special code path to deal with a length set at 0
+      ldout(cct,15) << "one lock extends forever" << dendl;
+      if (old_lock->type == new_lock.type) {
+        //just unify them in new lock, remove old lock
+        ldout(cct,15) << "same lock type, unifying" << dendl;
+        new_lock.start = (new_lock_start < old_lock_start) ? new_lock_start :
+          old_lock_start;
+        new_lock.length = 0;
+        held_locks.erase(*iter);
+        --client_held_lock_counts[old_lock_client];
+      } else { //not same type, have to keep any remains of old lock around
+        ldout(cct,15) << "shrinking old lock" << dendl;
+        if (new_lock_to_end) {
+          if (old_lock_start < new_lock_start) {
+            old_lock->length = new_lock_start - old_lock_start;
+          } else {
+            held_locks.erase(*iter);
+            --client_held_lock_counts[old_lock_client];
+          }
+        } else { //old lock extends past end of new lock
+          ceph_filelock appended_lock = *old_lock;
+          appended_lock.start = new_lock_end + 1;
+          held_locks.insert(pair<uint64_t, ceph_filelock>
+                            (appended_lock.start, appended_lock));
+          ++client_held_lock_counts[(client_t)old_lock->client];
+          if (old_lock_start < new_lock_start) {
+            old_lock->length = new_lock_start - old_lock_start;
+          } else {
+            held_locks.erase(*iter);
+            --client_held_lock_counts[old_lock_client];
+          }
+        }
+      }
+    } else {
+      if (old_lock->type == new_lock.type) { //just merge them!
+        ldout(cct,15) << "merging locks, they're the same type" << dendl;
+        new_lock.start = (old_lock_start < new_lock_start ) ? old_lock_start :
+          new_lock_start;
+        // must stay 64-bit: file offsets do not fit in an int
+        uint64_t new_end = (new_lock_end > old_lock_end) ? new_lock_end :
+          old_lock_end;
+        new_lock.length = new_end - new_lock.start + 1;
+        ldout(cct,15) << "erasing lock " << (*iter)->second << dendl;
+        held_locks.erase(*iter);
+        --client_held_lock_counts[old_lock_client];
+      } else { //we'll have to update sizes and maybe make new locks
+        ldout(cct,15) << "locks aren't same type, changing sizes" << dendl;
+        if (old_lock_end > new_lock_end) { //add extra lock after new_lock
+          ceph_filelock appended_lock = *old_lock;
+          appended_lock.start = new_lock_end + 1;
+          appended_lock.length = old_lock_end - appended_lock.start + 1;
+          held_locks.insert(pair<uint64_t, ceph_filelock>
+                            (appended_lock.start, appended_lock));
+          ++client_held_lock_counts[(client_t)old_lock->client];
+        }
+        if (old_lock_start < new_lock_start) {
+          old_lock->length = new_lock_start - old_lock_start;
+        } else { //old_lock starts inside new_lock, so remove it
+          //if it extended past new_lock_end it's been replaced
+          held_locks.erase(*iter);
+          --client_held_lock_counts[old_lock_client];
+        }
+      }
+    }
+    if (!client_held_lock_counts[old_lock_client]) {
+      client_held_lock_counts.erase(old_lock_client);
+    }
+  }
+
+  //make sure to coalesce neighboring locks
+  for (auto iter = neighbor_locks.begin(); iter != neighbor_locks.end(); ++iter) {
+    old_lock = &(*iter)->second;
+    old_lock_client = old_lock->client;
+    ldout(cct,15) << "lock to coalesce: " << *old_lock << dendl;
+    /* because if it's a neighboring lock there can't be any self-overlapping
+       locks that covered it */
+    if (old_lock->type == new_lock.type) { //merge them
+      if (0 == new_lock.length) {
+        if (old_lock->start + old_lock->length == new_lock.start) {
+          new_lock.start = old_lock->start;
+        } else ceph_abort(); /* if there's no end to new_lock, the neighbor
+                                HAS TO be to left side */
+      } else if (0 == old_lock->length) {
+        if (new_lock.start + new_lock.length == old_lock->start) {
+          new_lock.length = 0;
+        } else ceph_abort(); //same as before, but reversed
+      } else {
+        if (old_lock->start + old_lock->length == new_lock.start) {
+          new_lock.start = old_lock->start;
+          new_lock.length = old_lock->length + new_lock.length;
+        } else if (new_lock.start + new_lock.length == old_lock->start) {
+          new_lock.length = old_lock->length + new_lock.length;
+        }
+      }
+      held_locks.erase(*iter);
+      --client_held_lock_counts[old_lock_client];
+    }
+    if (!client_held_lock_counts[old_lock_client]) {
+      client_held_lock_counts.erase(old_lock_client);
+    }
+  }
+}
+
+/*
+ * Return the entry whose key equals start, or failing that the entry just
+ * before the first key greater than start.  Returns lock_map.end() only
+ * when the map is empty; if start is 0 or every key is greater than start,
+ * the result of lower_bound() is returned unchanged.
+ *
+ * The end() check happens before the key is read: lower_bound() returns
+ * end() whenever every key is smaller than start, and dereferencing end()
+ * is undefined behaviour.
+ *
+ * @param start     offset to search for
+ * @param lock_map  map keyed by lock start offset
+ */
+multimap<uint64_t, ceph_filelock>::iterator
+ceph_lock_state_t::get_lower_bound(uint64_t start,
+                                   multimap<uint64_t, ceph_filelock>& lock_map)
+{
+  multimap<uint64_t, ceph_filelock>::iterator lower_bound =
+    lock_map.lower_bound(start);
+  const bool exact_match = (lower_bound != lock_map.end())
+    && (lower_bound->first == start);
+  if (!exact_match
+      && (start != 0)
+      && (lower_bound != lock_map.begin())) --lower_bound;
+  if (lock_map.end() == lower_bound)
+    ldout(cct,15) << "get_lower_bound returning end()" << dendl;
+  else ldout(cct,15) << "get_lower_bound returning iterator pointing to "
+                     << lower_bound->second << dendl;
+  return lower_bound;
+}
+
+/*
+ * Return the last entry whose key is <= end.  If every key is greater than
+ * end the first entry is returned; lock_map.end() is returned only for an
+ * empty map.
+ */
+multimap<uint64_t, ceph_filelock>::iterator
+ceph_lock_state_t::get_last_before(uint64_t end,
+                                   multimap<uint64_t, ceph_filelock>& lock_map)
+{
+  auto last = lock_map.upper_bound(end);
+  if (last != lock_map.begin())
+    --last;
+
+  if (last == lock_map.end()) {
+    ldout(cct,15) << "get_last_before returning end()" << dendl;
+  } else {
+    ldout(cct,15) << "get_last_before returning iterator pointing to "
+                  << last->second << dendl;
+  }
+  return last;
+}
+
+/*
+ * Does the lock at iter cover any byte in [start, end]?
+ * The lock begins at its map key and spans `length` bytes; a length of 0
+ * means it extends to the end of the file.
+ */
+bool ceph_lock_state_t::share_space(
+    multimap<uint64_t, ceph_filelock>::iterator& iter,
+    uint64_t start, uint64_t end)
+{
+  const auto lock_start = iter->first;
+  const auto lock_length = iter->second.length;
+
+  bool ret;
+  if (lock_start >= start) {
+    // lock begins inside or after the range: overlaps iff it begins by `end`
+    ret = (lock_start <= end);
+  } else {
+    // lock begins before the range: overlaps iff it reaches `start`
+    ret = ((lock_start + lock_length - 1) >= start) || (0 == lock_length);
+  }
+  ldout(cct,15) << "share_space got start: " << start << ", end: " << end
+                << ", lock: " << iter->second << ", returning " << ret << dendl;
+  return ret;
+}
+
+/*
+ * Collect the held locks that overlap `lock`, and optionally the same-owner
+ * locks that merely touch it.
+ *
+ * held_locks is walked backwards from the last entry starting at or before
+ * the end of `lock`.  Results are pushed to the front of the output lists,
+ * so they end up ordered by ascending start offset.
+ *
+ * @param lock            range (and owner, for the neighbor test) to check
+ * @param overlaps        receives iterators to overlapping held locks
+ * @param self_neighbors  if non-NULL, receives same-owner locks that do not
+ *                        overlap `lock` but overlap it widened by one byte
+ *                        on each side
+ * @returns true if at least one overlapping lock was found
+ */
+bool ceph_lock_state_t::get_overlapping_locks(const ceph_filelock& lock,
+ list<multimap<uint64_t,
+ ceph_filelock>::iterator> & overlaps,
+ list<multimap<uint64_t,
+ ceph_filelock>::iterator> *self_neighbors)
+{
+ ldout(cct,15) << "get_overlapping_locks" << dendl;
+ // create a lock starting one earlier and ending one later
+ // to check for neighbors
+ ceph_filelock neighbor_check_lock = lock;
+ if (neighbor_check_lock.start != 0) {
+ neighbor_check_lock.start = neighbor_check_lock.start - 1;
+ if (neighbor_check_lock.length)
+ neighbor_check_lock.length = neighbor_check_lock.length + 2;
+ } else {
+ // cannot extend to the left of offset 0; a zero length (to EOF) stays zero
+ if (neighbor_check_lock.length)
+ neighbor_check_lock.length = neighbor_check_lock.length + 1;
+ }
+ //find the last held lock starting at the point after lock
+ uint64_t endpoint = lock.start;
+ if (lock.length) {
+ endpoint += lock.length;
+ } else {
+ endpoint = uint64_t(-1); // max offset
+ }
+ multimap<uint64_t, ceph_filelock>::iterator iter =
+ get_last_before(endpoint, held_locks);
+ bool cont = iter != held_locks.end();
+ while(cont) {
+ if (share_space(iter, lock)) {
+ overlaps.push_front(iter);
+ } else if (self_neighbors &&
+ ceph_filelock_owner_equal(neighbor_check_lock, iter->second) &&
+ share_space(iter, neighbor_check_lock)) {
+ self_neighbors->push_front(iter);
+ }
+ if ((iter->first < lock.start) && (CEPH_LOCK_EXCL == iter->second.type)) {
+ //can't be any more overlapping locks or they'd interfere with this one
+ cont = false;
+ } else if (held_locks.begin() == iter) cont = false;
+ else --iter;
+ }
+ return !overlaps.empty();
+}
+
+/*
+ * Collect the waiting locks whose range overlaps `lock`.
+ *
+ * waiting_locks is walked backwards from the last entry that starts at or
+ * before the final byte of `lock`; matches are pushed to the front of
+ * `overlaps`, so the result is ordered by ascending start offset.
+ *
+ * Two fixes relative to the earlier version:
+ *  - the iterator is no longer decremented once it has reached begin()
+ *    (decrementing begin() is undefined behaviour);
+ *  - a zero length means "to end of file", so the scan starts from the
+ *    maximum offset instead of lock.start - 1.
+ *
+ * @param lock      the range to compare against
+ * @param overlaps  receives iterators to overlapping waiting locks
+ * @returns true if at least one waiting lock overlaps
+ */
+bool ceph_lock_state_t::get_waiting_overlaps(const ceph_filelock& lock,
+                                             list<multimap<uint64_t,
+                                                 ceph_filelock>::iterator>&
+                                             overlaps)
+{
+  ldout(cct,15) << "get_waiting_overlaps" << dendl;
+  uint64_t last_byte = lock.start;
+  if (lock.length) {
+    last_byte += lock.length - 1;
+  } else {
+    last_byte = uint64_t(-1); // max offset
+  }
+  multimap<uint64_t, ceph_filelock>::iterator iter =
+    get_last_before(last_byte, waiting_locks);
+  bool cont = iter != waiting_locks.end();
+  while(cont) {
+    if (share_space(iter, lock)) overlaps.push_front(iter);
+    if (waiting_locks.begin() == iter) cont = false;
+    else --iter;
+  }
+  return !overlaps.empty();
+}
+
+/*
+ * Move every entry of `locks` that has the same owner as `owner` into
+ * `owned_locks` (appended in encounter order); entries with a different
+ * owner stay in `locks`.
+ */
+void ceph_lock_state_t::split_by_owner(const ceph_filelock& owner,
+                                       list<multimap<uint64_t,
+                                           ceph_filelock>::iterator>& locks,
+                                       list<multimap<uint64_t,
+                                           ceph_filelock>::iterator>&
+                                       owned_locks)
+{
+  ldout(cct,15) << "owner lock: " << owner << dendl;
+  for (auto cur = locks.begin(); cur != locks.end(); ) {
+    const ceph_filelock& candidate = (*cur)->second;
+    ldout(cct,15) << "comparing to " << candidate << dendl;
+    if (!ceph_filelock_owner_equal(candidate, owner)) {
+      ldout(cct,15) << "failure, something not equal in this group "
+                    << candidate.client << ":" << owner.client << ","
+                    << candidate.owner << ":" << owner.owner << ","
+                    << candidate.pid << ":" << owner.pid << dendl;
+      ++cur;
+      continue;
+    }
+    ldout(cct,15) << "success, pushing to owned_locks" << dendl;
+    owned_locks.push_back(*cur);
+    cur = locks.erase(cur);
+  }
+}
+
+// Return a pointer to the first exclusive lock referenced by `locks`,
+// or NULL if none of them is exclusive.
+ceph_filelock *
+ceph_lock_state_t::contains_exclusive_lock(list<multimap<uint64_t,
+                                               ceph_filelock>::iterator>& locks)
+{
+  for (auto& entry : locks) {
+    if (CEPH_LOCK_EXCL == entry->second.type)
+      return &entry->second;
+  }
+  return NULL;
+}
diff --git a/src/mds/flock.h b/src/mds/flock.h
new file mode 100644
index 00000000..ef1793f4
--- /dev/null
+++ b/src/mds/flock.h
@@ -0,0 +1,290 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+#ifndef CEPH_MDS_FLOCK_H
+#define CEPH_MDS_FLOCK_H
+
+#include <errno.h>
+
+#include "common/debug.h"
+#include "mdstypes.h"
+
+
+// Print one ceph_filelock on a single line, terminated by a newline.
+// '\n' is written instead of std::endl so that printing a lock does not
+// force a flush of the stream; the emitted characters are the same.
+inline ostream& operator<<(ostream& out, const ceph_filelock& l) {
+  out << "start: " << l.start << ", length: " << l.length
+      << ", client: " << l.client << ", owner: " << l.owner
+      << ", pid: " << l.pid << ", type: " << (int)l.type
+      << '\n';
+  return out;
+}
+
+// Two locks have the same owner when client and owner match and, for locks
+// from old clients, the pid matches as well.
+inline bool ceph_filelock_owner_equal(const ceph_filelock& l, const ceph_filelock& r)
+{
+  if (l.client != r.client)
+    return false;
+  if (l.owner != r.owner)
+    return false;
+  // The file lock is from old client if the most significant bit of
+  // 'owner' is not set. Old clients use both 'owner' and 'pid' to
+  // identify the owner of lock.
+  const bool owner_is_sufficient = (l.owner & (1ULL << 63)) != 0;
+  return owner_is_sufficient || l.pid == r.pid;
+}
+
+// Three-way comparison of lock owners: by client, then owner, then (only
+// when the top bit of 'owner' is clear) pid.  Returns -1, 0 or 1.
+inline int ceph_filelock_owner_compare(const ceph_filelock& l, const ceph_filelock& r)
+{
+  if (l.client > r.client) return 1;
+  if (l.client < r.client) return -1;
+
+  if (l.owner > r.owner) return 1;
+  if (l.owner < r.owner) return -1;
+
+  // owner alone identifies the holder when its most significant bit is set
+  if (l.owner & (1ULL << 63))
+    return 0;
+
+  if (l.pid > r.pid) return 1;
+  if (l.pid < r.pid) return -1;
+  return 0;
+}
+
+// Three-way comparison of whole locks: owner first (see
+// ceph_filelock_owner_compare), then start, length and type.
+// Returns a negative, zero or positive value.
+inline int ceph_filelock_compare(const ceph_filelock& l, const ceph_filelock& r)
+{
+  if (int by_owner = ceph_filelock_owner_compare(l, r))
+    return by_owner;
+
+  if (l.start > r.start) return 1;
+  if (l.start < r.start) return -1;
+
+  if (l.length > r.length) return 1;
+  if (l.length < r.length) return -1;
+
+  if (l.type > r.type) return 1;
+  if (l.type < r.type) return -1;
+  return 0;
+}
+
+// Strict ordering of locks, derived from ceph_filelock_compare().
+inline bool operator<(const ceph_filelock& l, const ceph_filelock& r)
+{
+ return ceph_filelock_compare(l, r) < 0;
+}
+
+// Locks are equal when ceph_filelock_compare() reports no difference.
+inline bool operator==(const ceph_filelock& l, const ceph_filelock& r) {
+ return ceph_filelock_compare(l, r) == 0;
+}
+
+// Negation of operator== above.
+inline bool operator!=(const ceph_filelock& l, const ceph_filelock& r) {
+ return ceph_filelock_compare(l, r) != 0;
+}
+
+/*
+ * Tracks the held and waiting byte-range locks of one lock type, together
+ * with per-client counts of each.  Only held_locks and
+ * client_held_lock_counts are encoded/decoded.
+ */
+class ceph_lock_state_t {
+ CephContext *cct;
+ int type;
+public:
+ explicit ceph_lock_state_t(CephContext *cct_, int type_) : cct(cct_), type(type_) {}
+ ~ceph_lock_state_t();
+ multimap<uint64_t, ceph_filelock> held_locks; // current locks
+ multimap<uint64_t, ceph_filelock> waiting_locks; // locks waiting for other locks
+ // both of the above are keyed by starting offset
+ map<client_t, int> client_held_lock_counts;
+ map<client_t, int> client_waiting_lock_counts;
+
+ /**
+ * Check if a lock is on the waiting_locks list.
+ *
+ * @param fl The filelock to check for
+ * @returns True if the lock is waiting, false otherwise
+ */
+ bool is_waiting(const ceph_filelock &fl) const;
+ /**
+ * Remove a lock from the waiting_locks list
+ *
+ * @param fl The filelock to remove
+ */
+ void remove_waiting(const ceph_filelock& fl);
+ /*
+ * Try to set a new lock. If it's blocked and wait_on_fail is true
+ * (and replay is false), add the lock to waiting_locks, unless doing so
+ * would deadlock.
+ * The lock needs to be of type CEPH_LOCK_EXCL or CEPH_LOCK_SHARED.
+ * This may merge previous locks, or convert the type of already-owned
+ * locks.
+ *
+ * @param new_lock The lock to set
+ * @param wait_on_fail whether to wait until the lock can be set.
+ * Otherwise it fails immediately when blocked.
+ * @param replay if true, a blocked lock is never added to waiting_locks
+ * @param deadlock set to true when a blocked lock is not queued because
+ * a deadlock was detected; not written otherwise
+ *
+ * @returns true if set, false if not set.
+ */
+ bool add_lock(ceph_filelock& new_lock, bool wait_on_fail, bool replay,
+ bool *deadlock);
+ /**
+ * See if a lock is blocked by existing locks. If the lock is blocked,
+ * it will be set to the value of the first blocking lock. Otherwise,
+ * it will be returned unchanged, except for setting the type field
+ * to CEPH_LOCK_UNLOCK.
+ *
+ * @param testing_lock The lock to check for conflicts on.
+ */
+ void look_for_lock(ceph_filelock& testing_lock);
+
+ /*
+ * Remove lock(s) described in removal_lock. This may involve splitting a
+ * previous lock or making a previous lock smaller.
+ *
+ * @param removal_lock The lock to remove
+ * @param activated_locks A return parameter, holding activated wait locks.
+ */
+ void remove_lock(const ceph_filelock removal_lock,
+ list<ceph_filelock>& activated_locks);
+
+ /*
+ * Remove every held and waiting lock belonging to the given client.
+ *
+ * @param client The client whose locks are dropped
+ * @returns true if the client had any held locks recorded
+ */
+ bool remove_all_from(client_t client);
+private:
+ // recursion limit for is_deadlock()
+ static const unsigned MAX_DEADLK_DEPTH = 5;
+
+ /**
+ * Check if adding the lock causes deadlock
+ *
+ * @param fl The blocking filelock
+ * @param overlapping_locks list of all overlapping locks
+ * @param first_fl The lock that started the check (NULL on the first call)
+ * @param depth recursion call depth
+ */
+ bool is_deadlock(const ceph_filelock& fl,
+ list<multimap<uint64_t, ceph_filelock>::iterator>&
+ overlapping_locks,
+ const ceph_filelock *first_fl=NULL, unsigned depth=0) const;
+
+ /**
+ * Add a lock to the waiting_locks list
+ *
+ * @param fl The filelock to add
+ */
+ void add_waiting(const ceph_filelock& fl);
+
+ /**
+ * Adjust old locks owned by a single process so that process can set
+ * a new lock of different type. Handle any changes needed to the old locks
+ * (and the new lock) so that once the new lock is inserted into the
+ * held_locks list the process has a coherent, non-fragmented set of lock
+ * ranges. Make sure any overlapping locks are combined, trimmed, and removed
+ * as needed.
+ * This function should only be called once you know the lock will be
+ * inserted, as it DOES adjust new_lock. You can call this function
+ * on an empty list, in which case it does nothing.
+ * This function does not remove elements from old_locks, so regard the list
+ * as bad information following function invocation.
+ *
+ * @param new_lock The new lock the process has requested.
+ * @param old_locks list of all locks currently held by same
+ * client/process that overlap new_lock.
+ * @param neighbor_locks locks owned by same process that neighbor new_lock on
+ * left or right side.
+ */
+ void adjust_locks(list<multimap<uint64_t, ceph_filelock>::iterator> old_locks,
+ ceph_filelock& new_lock,
+ list<multimap<uint64_t, ceph_filelock>::iterator>
+ neighbor_locks);
+
+ //get the lock starting at "start", or else the one just before the
+ //first lock that starts after "start"
+ multimap<uint64_t, ceph_filelock>::iterator
+ get_lower_bound(uint64_t start,
+ multimap<uint64_t, ceph_filelock>& lock_map);
+ //get the latest-starting lock whose start offset is <= "end"
+ multimap<uint64_t, ceph_filelock>::iterator
+ get_last_before(uint64_t end,
+ multimap<uint64_t, ceph_filelock>& lock_map);
+
+ /*
+ * See if an iterator's lock covers any of the same bounds as a given range
+ * Rules: locks cover "length" bytes from "start", so the last covered
+ * byte is at start + length - 1.
+ * If the length is 0, the lock covers from "start" to the end of the file.
+ */
+ bool share_space(multimap<uint64_t, ceph_filelock>::iterator& iter,
+ uint64_t start, uint64_t end);
+
+ // Convenience overload: derives the inclusive [start, end] range from lock.
+ bool share_space(multimap<uint64_t, ceph_filelock>::iterator& iter,
+ const ceph_filelock &lock) {
+ uint64_t end = lock.start;
+ if (lock.length) {
+ end += lock.length - 1;
+ } else { // zero length means end of file
+ end = uint64_t(-1);
+ }
+ return share_space(iter, lock.start, end);
+ }
+ /*
+ * Get a list of all held locks overlapping with the given lock's range.
+ * lock: the lock to compare with.
+ * overlaps: an empty list, to be filled.
+ * self_neighbors: if non-NULL, filled with locks of the same owner that
+ * are adjacent to (but do not overlap) the given lock.
+ * Returns: true if at least one lock overlaps.
+ */
+ bool get_overlapping_locks(const ceph_filelock& lock,
+ list<multimap<uint64_t,
+ ceph_filelock>::iterator> & overlaps,
+ list<multimap<uint64_t,
+ ceph_filelock>::iterator> *self_neighbors);
+
+
+ // Same as above, without collecting neighbors.
+ bool get_overlapping_locks(const ceph_filelock& lock,
+ list<multimap<uint64_t, ceph_filelock>::iterator>& overlaps) {
+ return get_overlapping_locks(lock, overlaps, NULL);
+ }
+
+ /**
+ * Get a list of all waiting locks that overlap with the given lock's range.
+ * lock: specifies the range to compare with
+ * overlaps: an empty list, to be filled
+ * Returns: true if at least one waiting_lock overlaps
+ */
+ bool get_waiting_overlaps(const ceph_filelock& lock,
+ list<multimap<uint64_t,
+ ceph_filelock>::iterator>& overlaps);
+ /*
+ * split a list of locks up by whether they're owned by same
+ * process as given lock
+ * owner: the owning lock
+ * locks: the list of locks (obtained from get_overlapping_locks, probably)
+ * Will have all locks owned by owner removed
+ * owned_locks: an empty list, to be filled with the locks owned by owner
+ */
+ void split_by_owner(const ceph_filelock& owner,
+ list<multimap<uint64_t,
+ ceph_filelock>::iterator> & locks,
+ list<multimap<uint64_t,
+ ceph_filelock>::iterator> & owned_locks);
+
+ // Returns the first exclusive lock in the list, or NULL if there is none.
+ ceph_filelock *contains_exclusive_lock(list<multimap<uint64_t,
+ ceph_filelock>::iterator>& locks);
+
+public:
+ // Only the held locks and their per-client counts are serialized;
+ // waiting locks are not part of the encoding.
+ void encode(bufferlist& bl) const {
+ using ceph::encode;
+ encode(held_locks, bl);
+ encode(client_held_lock_counts, bl);
+ }
+ void decode(bufferlist::const_iterator& bl) {
+ using ceph::decode;
+ decode(held_locks, bl);
+ decode(client_held_lock_counts, bl);
+ }
+ // True when no locks and no per-client counters remain.
+ bool empty() const {
+ return held_locks.empty() && waiting_locks.empty() &&
+ client_held_lock_counts.empty() &&
+ client_waiting_lock_counts.empty();
+ }
+};
+WRITE_CLASS_ENCODER(ceph_lock_state_t)
+
+
+// Dump a summary of the lock state followed by every held and waiting lock.
+inline ostream& operator<<(ostream &out, const ceph_lock_state_t &l) {
+  out << "ceph_lock_state_t. held_locks.size()=" << l.held_locks.size()
+      << ", waiting_locks.size()=" << l.waiting_locks.size()
+      << ", client_held_lock_counts -- " << l.client_held_lock_counts
+      << "\n client_waiting_lock_counts -- " << l.client_waiting_lock_counts
+      << "\n held_locks -- ";
+  for (const auto& held : l.held_locks)
+    out << held.second;
+  out << "\n waiting_locks -- ";
+  for (const auto& waiting : l.waiting_locks)
+    out << waiting.second << "\n";
+  return out;
+}
+
+#endif
diff --git a/src/mds/inode_backtrace.cc b/src/mds/inode_backtrace.cc
new file mode 100644
index 00000000..50e986a1
--- /dev/null
+++ b/src/mds/inode_backtrace.cc
@@ -0,0 +1,163 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "inode_backtrace.h"
+
+#include "common/Formatter.h"
+
+/* inode_backpointer_t */
+
+// Serialize dirino, dname and version, in that order, inside an
+// ENCODE_START(2, 2, ...) / ENCODE_FINISH envelope.
+void inode_backpointer_t::encode(bufferlist& bl) const
+{
+ ENCODE_START(2, 2, bl);
+ encode(dirino, bl);
+ encode(dname, bl);
+ encode(version, bl);
+ ENCODE_FINISH(bl);
+}
+
+// Counterpart of encode(): reads dirino, dname and version in the same
+// order from a versioned envelope.
+void inode_backpointer_t::decode(bufferlist::const_iterator& bl)
+{
+ DECODE_START_LEGACY_COMPAT_LEN(2, 2, 2, bl);
+ decode(dirino, bl);
+ decode(dname, bl);
+ decode(version, bl);
+ DECODE_FINISH(bl);
+}
+
+// Read the three fields back-to-back with no DECODE_START/DECODE_FINISH
+// envelope; used by inode_backtrace_t::decode() for struct_v < 4.
+void inode_backpointer_t::decode_old(bufferlist::const_iterator& bl)
+{
+ using ceph::decode;
+ decode(dirino, bl);
+ decode(dname, bl);
+ decode(version, bl);
+}
+
+// Emit the three fields to the formatter as "dirino", "dname" and "version".
+void inode_backpointer_t::dump(Formatter *f) const
+{
+ f->dump_unsigned("dirino", dirino);
+ f->dump_string("dname", dname);
+ f->dump_unsigned("version", version);
+}
+
+// Append two heap-allocated samples to ls: a default-constructed one and a
+// populated one.  The list receives raw pointers to both.
+void inode_backpointer_t::generate_test_instances(list<inode_backpointer_t*>& ls)
+{
+  ls.push_back(new inode_backpointer_t);
+
+  ls.push_back(new inode_backpointer_t);
+  inode_backpointer_t *populated = ls.back();
+  populated->dirino = 1;
+  populated->dname = "foo";
+  populated->version = 123;
+}
+
+
+/*
+ * inode_backtrace_t
+ */
+
+// Serialize ino, ancestors, pool and old_pools, in that order, inside an
+// ENCODE_START(5, 4, ...) / ENCODE_FINISH envelope.
+void inode_backtrace_t::encode(bufferlist& bl) const
+{
+ ENCODE_START(5, 4, bl);
+ encode(ino, bl);
+ encode(ancestors, bl);
+ encode(pool, bl);
+ encode(old_pools, bl);
+ ENCODE_FINISH(bl);
+}
+
+/*
+ * Decode a backtrace, handling older encodings:
+ *  - struct_v < 3: nothing is decoded and the function returns early
+ *    (DECODE_FINISH is not reached on this path);
+ *  - struct_v == 3: ancestors are a count followed by envelope-less
+ *    backpointers (decode_old);
+ *  - struct_v >= 4: ancestors use the regular container encoding;
+ *  - struct_v >= 5: pool and old_pools follow.
+ */
+void inode_backtrace_t::decode(bufferlist::const_iterator& bl)
+{
+ DECODE_START_LEGACY_COMPAT_LEN(5, 4, 4, bl);
+ if (struct_v < 3)
+ return; // sorry, the old data was crap
+ decode(ino, bl);
+ if (struct_v >= 4) {
+ decode(ancestors, bl);
+ } else {
+ __u32 n;
+ decode(n, bl);
+ while (n--) {
+ ancestors.push_back(inode_backpointer_t());
+ ancestors.back().decode_old(bl);
+ }
+ }
+ if (struct_v >= 5) {
+ decode(pool, bl);
+ decode(old_pools, bl);
+ }
+ DECODE_FINISH(bl);
+}
+
+// Emit ino, the ancestors array (one "backpointer" object each), pool and
+// the old_pools array to the formatter.
+void inode_backtrace_t::dump(Formatter *f) const
+{
+  f->dump_unsigned("ino", ino);
+
+  f->open_array_section("ancestors");
+  for (const auto& ancestor : ancestors) {
+    f->open_object_section("backpointer");
+    ancestor.dump(f);
+    f->close_section();
+  }
+  f->close_section();
+
+  f->dump_int("pool", pool);
+
+  f->open_array_section("old_pools");
+  for (const auto& old_pool : old_pools) {
+    f->dump_int("old_pool", old_pool);
+  }
+  f->close_section();
+}
+
+// Append two heap-allocated samples to ls: a default-constructed one and
+// one with a single ancestor, pool 0 and two old pools.
+void inode_backtrace_t::generate_test_instances(list<inode_backtrace_t*>& ls)
+{
+  ls.push_back(new inode_backtrace_t);
+
+  ls.push_back(new inode_backtrace_t);
+  inode_backtrace_t *sample = ls.back();
+  sample->ino = 1;
+
+  sample->ancestors.push_back(inode_backpointer_t());
+  inode_backpointer_t& parent = sample->ancestors.back();
+  parent.dirino = 123;
+  parent.dname = "bar";
+  parent.version = 456;
+
+  sample->pool = 0;
+  sample->old_pools.insert(10);
+  sample->old_pools.insert(7);
+}
+
+/*
+ * Compare this backtrace with another, ancestor by ancestor, over the
+ * common prefix of the two ancestor lists.
+ *
+ * @param other      backtrace to compare against
+ * @param equivalent out: starts true; cleared when a later ancestor names a
+ *                   different dentry, or when the traces are divergent
+ * @param divergent  out: set when the first ancestor names a different
+ *                   dentry, or when version ordering flips between ancestors
+ * @returns 1 / -1 / 0 according to the last version difference seen
+ *          (this newer / other newer / no difference); 0 if either trace
+ *          has no ancestors
+ */
+int inode_backtrace_t::compare(const inode_backtrace_t& other,
+ bool *equivalent, bool *divergent) const
+{
+ int min_size = std::min(ancestors.size(),other.ancestors.size());
+ *equivalent = true;
+ *divergent = false;
+ if (min_size == 0)
+ return 0;
+ int comparator = 0;
+ // the immediate parent decides the initial ordering
+ if (ancestors[0].version > other.ancestors[0].version)
+ comparator = 1;
+ else if (ancestors[0].version < other.ancestors[0].version)
+ comparator = -1;
+ if (ancestors[0].dirino != other.ancestors[0].dirino ||
+ ancestors[0].dname != other.ancestors[0].dname)
+ *divergent = true;
+ for (int i = 1; i < min_size; ++i) {
+ if (*divergent) {
+ /**
+ * we already know the dentries and versions are
+ * incompatible; no point checking farther
+ */
+ break;
+ }
+ if (ancestors[i].dirino != other.ancestors[i].dirino ||
+ ancestors[i].dname != other.ancestors[i].dname) {
+ *equivalent = false;
+ return comparator;
+ } else if (ancestors[i].version > other.ancestors[i].version) {
+ // ordering contradicts an earlier ancestor
+ if (comparator < 0)
+ *divergent = true;
+ comparator = 1;
+ } else if (ancestors[i].version < other.ancestors[i].version) {
+ if (comparator > 0)
+ *divergent = true;
+ comparator = -1;
+ }
+ }
+ // divergent traces are never reported as equivalent
+ if (*divergent)
+ *equivalent = false;
+ return comparator;
+}
diff --git a/src/mds/inode_backtrace.h b/src/mds/inode_backtrace.h
new file mode 100644
index 00000000..7c60865c
--- /dev/null
+++ b/src/mds/inode_backtrace.h
@@ -0,0 +1,97 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+#ifndef CEPH_INODE_BACKTRACE_H
+#define CEPH_INODE_BACKTRACE_H
+
+#include <string_view>
+
+#include "mdstypes.h"
+
+namespace ceph {
+ class Formatter;
+}
+
+/** metadata backpointers **/
+
+/*
+ * - inode_backpointer_t is just the _pointer_ portion; it doesn't
+ * tell us who we point _from_.
+ *
+ * - it _does_ include a version of the source object, so we can look
+ * at two different pointers (from the same inode) and tell which is
+ * newer.
+ */
+struct inode_backpointer_t {
+  inodeno_t dirino;        // ino of the directory holding the linking dentry
+  string dname;            // name of the linking dentry
+  version_t version = 0;   // child's version when the backpointer was created
+
+  inode_backpointer_t() {}
+  inode_backpointer_t(inodeno_t i, std::string_view d, version_t v)
+    : dirino(i), dname(d), version(v) {}
+
+  // serialization / introspection helpers
+  void encode(bufferlist& bl) const;
+  void decode(bufferlist::const_iterator &bl);
+  void decode_old(bufferlist::const_iterator &bl);
+  void dump(Formatter *f) const;
+  static void generate_test_instances(list<inode_backpointer_t*>& ls);
+};
+WRITE_CLASS_ENCODER(inode_backpointer_t)
+
+// Backpointers are equal when directory ino, version and dentry name all match.
+inline bool operator==(const inode_backpointer_t& l, const inode_backpointer_t& r) {
+  if (!(l.dirino == r.dirino))
+    return false;
+  if (!(l.version == r.version))
+    return false;
+  return l.dname == r.dname;
+}
+
+// Prints a backpointer as "<dirino/dname vVERSION>".
+inline ostream& operator<<(ostream& out, const inode_backpointer_t& ib) {
+  out << "<" << ib.dirino << "/" << ib.dname;
+  out << " v" << ib.version << ">";
+  return out;
+}
+
+/*
+ * inode_backtrace_t is the complete ancestor backtrace for a given inode.
+ * we include who _we_ are, so that the backtrace can stand alone (as, say,
+ * an xattr on an object).
+ */
+struct inode_backtrace_t {
+  inodeno_t ino;                          // the inode this backtrace belongs to
+  vector<inode_backpointer_t> ancestors;  // chain of backpointers
+  int64_t pool = -1;                      // -1 until a pool is assigned
+  // a set (not a list) so repeated layouts, e.g. setlayout 0, 1, 0,
+  // do not produce duplicate entries
+  set<int64_t> old_pools;
+
+  inode_backtrace_t() {}
+
+  void encode(bufferlist& bl) const;
+  void decode(bufferlist::const_iterator &bl);
+  void dump(Formatter *f) const;
+  static void generate_test_instances(list<inode_backtrace_t*>& ls);
+
+  /**
+   * Compare this backtrace with another one *for the same inode*.
+   * @pre Both backtraces describe the same inode
+   *
+   * @param other The backtrace to compare against
+   * @param equivalent Output flag: set to true when the other backtrace
+   * is equivalent to ours (has the same dentries)
+   * @param divergent Output flag: set to true when the backtraces have
+   * differing entries that the versions do not account for
+   *
+   * @returns 1 if we are newer than the other, 0 if equal, -1 if older
+   */
+  int compare(const inode_backtrace_t& other,
+              bool *equivalent, bool *divergent) const;
+};
+WRITE_CLASS_ENCODER(inode_backtrace_t)
+
+// Prints a backtrace as "(pool)ino:ancestors//old_pools".
+inline ostream& operator<<(ostream& out, const inode_backtrace_t& it) {
+  out << "(" << it.pool << ")" << it.ino;
+  out << ":" << it.ancestors;
+  out << "//" << it.old_pools;
+  return out;
+}
+
+// Backtraces are equal when ino, pool, old pools and ancestors all match.
+inline bool operator==(const inode_backtrace_t& l,
+                       const inode_backtrace_t& r) {
+  if (!(l.ino == r.ino))
+    return false;
+  if (!(l.pool == r.pool))
+    return false;
+  if (!(l.old_pools == r.old_pools))
+    return false;
+  return l.ancestors == r.ancestors;
+}
+
+#endif
+
diff --git a/src/mds/journal.cc b/src/mds/journal.cc
new file mode 100644
index 00000000..3eb24af2
--- /dev/null
+++ b/src/mds/journal.cc
@@ -0,0 +1,3170 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#include "common/config.h"
+#include "osdc/Journaler.h"
+#include "events/ESubtreeMap.h"
+#include "events/ESession.h"
+#include "events/ESessions.h"
+
+#include "events/EMetaBlob.h"
+#include "events/EResetJournal.h"
+#include "events/ENoOp.h"
+
+#include "events/EUpdate.h"
+#include "events/ESlaveUpdate.h"
+#include "events/EOpen.h"
+#include "events/ECommitted.h"
+
+#include "events/EExport.h"
+#include "events/EImportStart.h"
+#include "events/EImportFinish.h"
+#include "events/EFragment.h"
+
+#include "events/ETableClient.h"
+#include "events/ETableServer.h"
+
+#include "include/stringify.h"
+
+#include "LogSegment.h"
+
+#include "MDSRank.h"
+#include "MDLog.h"
+#include "MDCache.h"
+#include "Server.h"
+#include "Migrator.h"
+#include "Mutation.h"
+
+#include "InoTable.h"
+#include "MDSTableClient.h"
+#include "MDSTableServer.h"
+
+#include "Locker.h"
+
+#define dout_context g_ceph_context
+#define dout_subsys ceph_subsys_mds
+#undef dout_prefix
+#define dout_prefix *_dout << "mds." << mds->get_nodeid() << ".journal "
+
+
+// -----------------------
+// LogSegment
+
+void LogSegment::try_to_expire(MDSRank *mds, MDSGatherBuilder &gather_bld, int op_prio)
+{ /*
+ * Queue, on gather_bld, one sub-context for every piece of state this
+ * segment still needs written out or acknowledged: dirty dirfrags and
+ * inodes, uncommitted master/slave ops and fragments, scatterlock
+ * flushes, open-file rejournaling, backtraces, inotable/sessionmap/table
+ * saves and pending truncates.  If nothing had to be queued the segment is
+ * logged as "success"; otherwise the journal is flushed and the segment is
+ * logged as "waiting" (presumably the caller waits on gather_bld and
+ * retries -- confirm against the caller).
+ * op_prio is forwarded to the dirfrag commit and backtrace store calls.
+ */
+ set<CDir*> commit; // dirfrags that need to be committed (deduplicated)
+
+ dout(6) << "LogSegment(" << seq << "/" << offset << ").try_to_expire" << dendl;
+
+ ceph_assert(g_conf()->mds_kill_journal_expire_at != 1); // asserts if the config option is set to 1
+
+ // commit dirs: collect the dirfrags behind every new/dirty dirfrag, dirty dentry and dirty inode
+ for (elist<CDir*>::iterator p = new_dirfrags.begin(); !p.end(); ++p) {
+ dout(20) << " new_dirfrag " << **p << dendl;
+ ceph_assert((*p)->is_auth());
+ commit.insert(*p);
+ }
+ for (elist<CDir*>::iterator p = dirty_dirfrags.begin(); !p.end(); ++p) {
+ dout(20) << " dirty_dirfrag " << **p << dendl;
+ ceph_assert((*p)->is_auth());
+ commit.insert(*p);
+ }
+ for (elist<CDentry*>::iterator p = dirty_dentries.begin(); !p.end(); ++p) {
+ dout(20) << " dirty_dentry " << **p << dendl;
+ ceph_assert((*p)->is_auth());
+ commit.insert((*p)->get_dir());
+ }
+ for (elist<CInode*>::iterator p = dirty_inodes.begin(); !p.end(); ++p) {
+ dout(20) << " dirty_inode " << **p << dendl;
+ ceph_assert((*p)->is_auth());
+ if ((*p)->is_base()) {
+ (*p)->store(gather_bld.new_sub()); // base inodes are stored directly
+ } else
+ commit.insert((*p)->get_parent_dn()->get_dir()); // others: commit the dirfrag of the parent dentry
+ }
+
+ if (!commit.empty()) {
+ for (set<CDir*>::iterator p = commit.begin();
+ p != commit.end();
+ ++p) {
+ CDir *dir = *p;
+ ceph_assert(dir->is_auth());
+ if (dir->can_auth_pin()) {
+ dout(15) << "try_to_expire committing " << *dir << dendl;
+ dir->commit(0, gather_bld.new_sub(), false, op_prio);
+ } else {
+ // cannot auth-pin right now: wait for the dirfrag to unfreeze instead
+ dout(15) << "try_to_expire waiting for unfreeze on " << *dir << dendl;
+ dir->add_waiter(CDir::WAIT_UNFREEZE, gather_bld.new_sub());
+ }
+ }
+ }
+
+ // master ops with possibly uncommitted slaves
+ for (set<metareqid_t>::iterator p = uncommitted_masters.begin();
+ p != uncommitted_masters.end();
+ ++p) {
+ dout(10) << "try_to_expire waiting for slaves to ack commit on " << *p << dendl;
+ mds->mdcache->wait_for_uncommitted_master(*p, gather_bld.new_sub());
+ }
+
+ // slave ops that haven't been committed
+ for (set<metareqid_t>::iterator p = uncommitted_slaves.begin();
+ p != uncommitted_slaves.end();
+ ++p) {
+ dout(10) << "try_to_expire waiting for master to ack OP_FINISH on " << *p << dendl;
+ mds->mdcache->wait_for_uncommitted_slave(*p, gather_bld.new_sub());
+ }
+
+ // uncommitted fragments
+ for (set<dirfrag_t>::iterator p = uncommitted_fragments.begin();
+ p != uncommitted_fragments.end();
+ ++p) {
+ dout(10) << "try_to_expire waiting for uncommitted fragment " << *p << dendl;
+ mds->mdcache->wait_for_uncommitted_fragment(*p, gather_bld.new_sub());
+ }
+
+ // nudge scatterlocks (filelock, dirfragtreelock and nestlock of the listed inodes)
+ for (elist<CInode*>::iterator p = dirty_dirfrag_dir.begin(); !p.end(); ++p) {
+ CInode *in = *p;
+ dout(10) << "try_to_expire waiting for dirlock flush on " << *in << dendl;
+ mds->locker->scatter_nudge(&in->filelock, gather_bld.new_sub());
+ }
+ for (elist<CInode*>::iterator p = dirty_dirfrag_dirfragtree.begin(); !p.end(); ++p) {
+ CInode *in = *p;
+ dout(10) << "try_to_expire waiting for dirfragtreelock flush on " << *in << dendl;
+ mds->locker->scatter_nudge(&in->dirfragtreelock, gather_bld.new_sub());
+ }
+ for (elist<CInode*>::iterator p = dirty_dirfrag_nest.begin(); !p.end(); ++p) {
+ CInode *in = *p;
+ dout(10) << "try_to_expire waiting for nest flush on " << *in << dendl;
+ mds->locker->scatter_nudge(&in->nestlock, gather_bld.new_sub());
+ }
+
+ ceph_assert(g_conf()->mds_kill_journal_expire_at != 2);
+
+ // open files and snap inodes
+ if (!open_files.empty()) {
+ ceph_assert(!mds->mdlog->is_capped()); // hmm FIXME
+ EOpen *le = 0; // created lazily, only if some inode must be rejournaled
+ LogSegment *ls = mds->mdlog->get_current_segment();
+ ceph_assert(ls != this);
+ elist<CInode*>::iterator p = open_files.begin(member_offset(CInode, item_open_file));
+ while (!p.end()) {
+ CInode *in = *p;
+ ++p; // advance first: the current item is moved or removed from this list below
+ if (in->last != CEPH_NOSNAP && in->is_auth() && !in->client_snap_caps.empty()) {
+ // journal snap inodes that need flush. This simplifies the mds failover handling
+ dout(20) << "try_to_expire requeueing snap needflush inode " << *in << dendl;
+ if (!le) {
+ le = new EOpen(mds->mdlog);
+ mds->mdlog->start_entry(le);
+ }
+ le->add_clean_inode(in);
+ ls->open_files.push_back(&in->item_open_file);
+ } else {
+ // open files are tracked by open file table, no need to journal them again
+ in->item_open_file.remove_myself();
+ }
+ }
+ if (le) {
+ mds->mdlog->submit_entry(le);
+ mds->mdlog->wait_for_safe(gather_bld.new_sub());
+ dout(10) << "try_to_expire waiting for open files to rejournal" << dendl;
+ }
+ }
+
+ ceph_assert(g_conf()->mds_kill_journal_expire_at != 3);
+
+ // backtraces to be stored/updated
+ for (elist<CInode*>::iterator p = dirty_parent_inodes.begin(); !p.end(); ++p) {
+ CInode *in = *p;
+ ceph_assert(in->is_auth());
+ if (in->can_auth_pin()) {
+ dout(15) << "try_to_expire waiting for storing backtrace on " << *in << dendl;
+ in->store_backtrace(gather_bld.new_sub(), op_prio);
+ } else {
+ dout(15) << "try_to_expire waiting for unfreeze on " << *in << dendl;
+ in->add_waiter(CInode::WAIT_UNFREEZE, gather_bld.new_sub());
+ }
+ }
+
+ ceph_assert(g_conf()->mds_kill_journal_expire_at != 4);
+
+ // idalloc: save the inotable if this segment needs a newer version than the committed one
+ if (inotablev > mds->inotable->get_committed_version()) {
+ dout(10) << "try_to_expire saving inotable table, need " << inotablev
+ << ", committed is " << mds->inotable->get_committed_version()
+ << " (" << mds->inotable->get_committing_version() << ")"
+ << dendl;
+ mds->inotable->save(gather_bld.new_sub(), inotablev);
+ }
+
+ // sessionmap: same check against the committed sessionmap version
+ if (sessionmapv > mds->sessionmap.get_committed()) {
+ dout(10) << "try_to_expire saving sessionmap, need " << sessionmapv
+ << ", committed is " << mds->sessionmap.get_committed()
+ << " (" << mds->sessionmap.get_committing() << ")"
+ << dendl;
+ mds->sessionmap.save(gather_bld.new_sub(), sessionmapv);
+ }
+
+ // updates to sessions for completed_requests
+ mds->sessionmap.save_if_dirty(touched_sessions, &gather_bld);
+ touched_sessions.clear();
+
+ // pending commit atids
+ for (map<int, ceph::unordered_set<version_t> >::iterator p = pending_commit_tids.begin();
+ p != pending_commit_tids.end();
+ ++p) {
+ MDSTableClient *client = mds->get_table_client(p->first);
+ ceph_assert(client);
+ for (ceph::unordered_set<version_t>::iterator q = p->second.begin();
+ q != p->second.end();
+ ++q) {
+ dout(10) << "try_to_expire " << get_mdstable_name(p->first) << " transaction " << *q
+ << " pending commit (not yet acked), waiting" << dendl;
+ ceph_assert(!client->has_committed(*q));
+ client->wait_for_ack(*q, gather_bld.new_sub());
+ }
+ }
+
+ // table servers
+ for (map<int, version_t>::iterator p = tablev.begin();
+ p != tablev.end();
+ ++p) {
+ MDSTableServer *server = mds->get_table_server(p->first);
+ ceph_assert(server);
+ if (p->second > server->get_committed_version()) {
+ dout(10) << "try_to_expire waiting for " << get_mdstable_name(p->first)
+ << " to save, need " << p->second << dendl;
+ server->save(gather_bld.new_sub());
+ }
+ }
+
+ // truncating
+ for (set<CInode*>::iterator p = truncating_inodes.begin();
+ p != truncating_inodes.end();
+ ++p) {
+ dout(10) << "try_to_expire waiting for truncate of " << **p << dendl;
+ (*p)->add_waiter(CInode::WAIT_TRUNC, gather_bld.new_sub());
+ }
+
+ if (gather_bld.has_subs()) {
+ // something is still outstanding: flush the journal and report "waiting"
+ dout(6) << "LogSegment(" << seq << "/" << offset << ").try_to_expire waiting" << dendl;
+ mds->mdlog->flush();
+ } else {
+ ceph_assert(g_conf()->mds_kill_journal_expire_at != 5);
+ dout(6) << "LogSegment(" << seq << "/" << offset << ").try_to_expire success" << dendl;
+ }
+}
+
+
+// -----------------------
+// EMetaBlob
+
+void EMetaBlob::add_dir_context(CDir *dir, int mode)
+{ /*
+ * Walk upward from dir through projected parent dentries, deciding which
+ * ancestor dentries must be added to this blob, then add them
+ * outermost-first via add_dentry().  The walk stops when a dirfrag already
+ * has a lump in this blob or when an inode has no projected parent; in
+ * TO_AUTH_SUBTREE_ROOT mode it can also stop at an unambiguous auth subtree
+ * root or at an inode already journaled in this blob.
+ */
+ MDSRank *mds = dir->cache->mds;
+
+ list<CDentry*> parents; // dentries that will definitely be added
+
+ // it may be okay not to include the maybe items, if
+ // - we journaled the maybe child inode in this segment
+ // - that subtree turns out to be unambiguously auth
+ list<CDentry*> maybe;
+ bool maybenot = false; // true while newly visited parents go to 'maybe' instead of 'parents'
+
+ while (true) {
+ // already have this dir? (we must always add in order)
+ if (lump_map.count(dir->dirfrag())) {
+ dout(20) << "EMetaBlob::add_dir_context(" << dir << ") have lump " << dir->dirfrag() << dendl;
+ break;
+ }
+
+ // stop at root/stray
+ CInode *diri = dir->get_inode();
+ CDentry *parent = diri->get_projected_parent_dn();
+
+ if (mode == TO_AUTH_SUBTREE_ROOT) {
+ // subtree root?
+ if (dir->is_subtree_root()) {
+ // match logic in MDCache::create_subtree_map()
+ if (dir->get_dir_auth().first == mds->get_nodeid()) {
+ mds_authority_t parent_auth = parent ? parent->authority() : CDIR_AUTH_UNDEF;
+ if (parent_auth.first == dir->get_dir_auth().first) {
+ // parent has the same primary auth as this subtree root: only expected for ambiguous/transient subtrees
+ if (parent_auth.second == CDIR_AUTH_UNKNOWN &&
+ !dir->is_ambiguous_dir_auth() &&
+ !dir->state_test(CDir::STATE_EXPORTBOUND) &&
+ !dir->state_test(CDir::STATE_AUXSUBTREE) &&
+ !diri->state_test(CInode::STATE_AMBIGUOUSAUTH)) {
+ dout(0) << "EMetaBlob::add_dir_context unexpected subtree " << *dir << dendl;
+ ceph_abort();
+ }
+ dout(20) << "EMetaBlob::add_dir_context(" << dir << ") ambiguous or transient subtree " << dendl;
+ } else {
+ // it's an auth subtree, we don't need maybe (if any), and we're done.
+ dout(20) << "EMetaBlob::add_dir_context(" << dir << ") reached unambig auth subtree, don't need " << maybe
+ << " at " << *dir << dendl;
+ maybe.clear();
+ break;
+ }
+ } else {
+ dout(20) << "EMetaBlob::add_dir_context(" << dir << ") reached ambig or !auth subtree, need " << maybe
+ << " at " << *dir << dendl;
+ // we need the maybe list after all!
+ parents.splice(parents.begin(), maybe);
+ maybenot = false;
+ }
+ }
+
+ // was the inode journaled in this blob?
+ if (event_seq && diri->last_journaled == event_seq) {
+ dout(20) << "EMetaBlob::add_dir_context(" << dir << ") already have diri this blob " << *diri << dendl;
+ break;
+ }
+
+ // have we journaled this inode since the last subtree map?
+ if (!maybenot && last_subtree_map && diri->last_journaled >= last_subtree_map) {
+ dout(20) << "EMetaBlob::add_dir_context(" << dir << ") already have diri in this segment ("
+ << diri->last_journaled << " >= " << last_subtree_map << "), setting maybenot flag "
+ << *diri << dendl;
+ maybenot = true;
+ }
+ }
+
+ if (!parent) // no projected parent dentry: nothing further up to add
+ break;
+
+ if (maybenot) {
+ dout(25) << "EMetaBlob::add_dir_context(" << dir << ") maybe " << *parent << dendl;
+ maybe.push_front(parent);
+ } else {
+ dout(25) << "EMetaBlob::add_dir_context(" << dir << ") definitely " << *parent << dendl;
+ parents.push_front(parent);
+ }
+
+ dir = parent->get_dir(); // continue with the dirfrag containing the parent dentry
+ }
+
+ parents.splice(parents.begin(), maybe); // any 'maybe' entries left over are needed after all
+
+ dout(20) << "EMetaBlob::add_dir_context final: " << parents << dendl;
+ for (list<CDentry*>::iterator p = parents.begin(); p != parents.end(); ++p) {
+ ceph_assert((*p)->get_projected_linkage()->is_primary());
+ add_dentry(*p, false);
+ }
+}
+
+/**
+ * Propagate this blob's table versions to the log segment.  Only non-zero
+ * inotable / sessionmap versions are copied onto ls.
+ */
+void EMetaBlob::update_segment(LogSegment *ls)
+{
+  // dirty inode mtimes
+  //  -> handled directly by Server.cc, replay()
+
+  // alloc table update?
+  if (inotablev) {
+    ls->inotablev = inotablev;
+  }
+  if (sessionmapv) {
+    ls->sessionmapv = sessionmapv;
+  }
+
+  // truncated inodes
+  //  -> handled directly by Server.cc
+
+  // client requests
+  //   note the newest request per client
+  //if (!client_reqs.empty())
+  //  ls->last_client_tid[client_reqs.rbegin()->client] = client_reqs.rbegin()->tid);
+}
+
+// EMetaBlob::fullbit
+
+void EMetaBlob::fullbit::encode(bufferlist& bl, uint64_t features) const { // serialize this fullbit into bl; field order must mirror decode()
+ ENCODE_START(8, 5, bl); // presumably struct version 8, compat version 5 (Ceph versioned-encoding macro)
+ encode(dn, bl);
+ encode(dnfirst, bl);
+ encode(dnlast, bl);
+ encode(dnv, bl);
+ encode(inode, bl, features);
+ encode(xattrs, bl);
+ if (inode.is_symlink()) // the symlink target is only written for symlinks
+ encode(symlink, bl);
+ if (inode.is_dir()) { // directories write their fragtree and snap blob here
+ encode(dirfragtree, bl);
+ encode(snapbl, bl);
+ }
+ encode(state, bl);
+ if (old_inodes.empty()) { // a bool flag precedes the optional old_inodes map
+ encode(false, bl);
+ } else {
+ encode(true, bl);
+ encode(old_inodes, bl, features);
+ }
+ if (!inode.is_dir()) // non-directories write the snap blob after old_inodes instead
+ encode(snapbl, bl);
+ encode(oldest_snap, bl); // decode() reads this back only when struct_v >= 8
+ ENCODE_FINISH(bl);
+}
+
+void EMetaBlob::fullbit::decode(bufferlist::const_iterator &bl) { // inverse of encode(); also accepts older struct versions
+ DECODE_START_LEGACY_COMPAT_LEN(7, 5, 5, bl); // NOTE(review): declares v7 although encode() writes v8 and struct_v >= 8 is handled below -- confirm whether this should be 8
+ decode(dn, bl);
+ decode(dnfirst, bl);
+ decode(dnlast, bl);
+ decode(dnv, bl);
+ decode(inode, bl);
+ decode_noshare(xattrs, bl);
+ if (inode.is_symlink())
+ decode(symlink, bl);
+ if (inode.is_dir()) {
+ decode(dirfragtree, bl);
+ decode(snapbl, bl);
+ if ((struct_v == 2) || (struct_v == 3)) { // only v2/v3 carried a separate optional dir layout
+ bool dir_layout_exists;
+ decode(dir_layout_exists, bl);
+ if (dir_layout_exists) {
+ __u8 dir_struct_v;
+ decode(dir_struct_v, bl); // default_file_layout version
+ decode(inode.layout, bl); // and actual layout, that we care about
+ }
+ }
+ }
+ if (struct_v >= 6) {
+ decode(state, bl);
+ } else {
+ // before v6 only a single dirty flag was stored; map it onto the state bits
+ bool dirty;
+ decode(dirty, bl);
+ state = dirty ? EMetaBlob::fullbit::STATE_DIRTY : 0;
+ }
+
+ if (struct_v >= 3) {
+ bool old_inodes_present;
+ decode(old_inodes_present, bl);
+ if (old_inodes_present) {
+ decode(old_inodes, bl);
+ }
+ }
+ if (!inode.is_dir()) {
+ if (struct_v >= 7) // snap blob for non-directories is read from v7 on
+ decode(snapbl, bl);
+ }
+ if (struct_v >= 8)
+ decode(oldest_snap, bl);
+ else
+ oldest_snap = CEPH_NOSNAP; // default when the field is absent from the encoding
+
+ DECODE_FINISH(bl);
+}
+
+/**
+ * Write a description of this fullbit to the formatter: dentry identity,
+ * the inode, its xattrs, type-specific fields, state and any old inodes.
+ */
+void EMetaBlob::fullbit::dump(Formatter *f) const
+{
+  // dentry identity
+  f->dump_string("dentry", dn);
+  f->dump_stream("snapid.first") << dnfirst;
+  f->dump_stream("snapid.last") << dnlast;
+  f->dump_int("dentry version", dnv);
+
+  f->open_object_section("inode");
+  inode.dump(f);
+  f->close_section(); // inode
+
+  // xattr values are raw buffers; copy each one into a string for dumping
+  f->open_object_section("xattrs");
+  for (const auto &[name, value] : xattrs) {
+    std::string text(value.c_str(), value.length());
+    f->dump_string(name.c_str(), text);
+  }
+  f->close_section(); // xattrs
+
+  if (inode.is_symlink()) {
+    f->dump_string("symlink", symlink);
+  }
+  if (inode.is_dir()) {
+    f->dump_stream("frag tree") << dirfragtree;
+    const char *has_snapbl = snapbl.length() ? "true" : "false";
+    f->dump_string("has_snapbl", has_snapbl);
+    if (inode.has_layout()) {
+      f->open_object_section("file layout policy");
+      // FIXME
+      f->dump_string("layout", "the layout exists");
+      f->close_section(); // file layout policy
+    }
+  }
+  f->dump_string("state", state_string());
+
+  if (old_inodes.empty())
+    return;
+
+  f->open_array_section("old inodes");
+  for (const auto &[snapid, old_inode] : old_inodes) {
+    f->open_object_section("inode");
+    f->dump_int("snapid", snapid);
+    old_inode.dump(f);
+    f->close_section(); // inode
+  }
+  f->close_section(); // old inodes
+}
+
+/**
+ * Append one sample fullbit, built from default-constructed components,
+ * to ls.
+ */
+void EMetaBlob::fullbit::generate_test_instances(list<EMetaBlob::fullbit*>& ls)
+{
+  CInode::mempool_inode blank_inode;
+  fragtree_t blank_fragtree;
+  CInode::mempool_xattr_map no_xattrs;
+  bufferlist no_snapbl;
+
+  ls.push_back(new fullbit("/testdn", 0, 0, 0,
+                           blank_inode, blank_fragtree, no_xattrs, "", 0, no_snapbl,
+                           false, NULL));
+}
+
+/**
+ * Copy the state recorded in this fullbit onto the cache inode in, then
+ * check the result: a regular file with an invalid layout marks the MDS
+ * as damaged.
+ */
+void EMetaBlob::fullbit::update_inode(MDSRank *mds, CInode *in)
+{
+  in->inode = inode;
+  in->xattrs = xattrs;
+  in->maybe_export_pin();
+
+  if (in->inode.is_dir()) {
+    const bool dft_changed = !(in->dirfragtree == dirfragtree);
+    if (dft_changed) {
+      dout(10) << "EMetaBlob::fullbit::update_inode dft " << in->dirfragtree << " -> "
+               << dirfragtree << " on " << *in << dendl;
+      in->dirfragtree = dirfragtree;
+      in->force_dirfrags();
+      if (in->has_dirfrags() && in->authority() == CDIR_AUTH_UNDEF) {
+        // close empty dirfrags that the cache says can be trimmed
+        list<CDir*> nested;
+        in->get_nested_dirfrags(nested);
+        for (CDir *dir : nested) {
+          if (dir->get_num_any() != 0)
+            continue;
+          if (!mds->mdcache->can_trim_non_auth_dirfrag(dir))
+            continue;
+          dout(10) << " closing empty non-auth dirfrag " << *dir << dendl;
+          in->close_dirfrag(dir->get_frag());
+        }
+      }
+    }
+  } else if (in->inode.is_symlink()) {
+    in->symlink = symlink;
+  }
+
+  // in->first must lie past the newest old inode
+  in->old_inodes = old_inodes;
+  if (!in->old_inodes.empty()) {
+    snapid_t floor_first = in->old_inodes.rbegin()->first + 1;
+    if (floor_first > in->first)
+      in->first = floor_first;
+  }
+
+  /*
+   * we can do this before linking the inode because the split_at would
+   * be a no-op: we have no children (namely open snaprealms) to
+   * divvy up
+   */
+  in->oldest_snap = oldest_snap;
+  in->decode_snap_blob(snapbl);
+
+  /*
+   * In case there was anything malformed in the journal that we are
+   * replaying, do sanity checks on the inodes we're replaying and
+   * go damaged instead of letting any trash into a live cache
+   */
+  // Files must have valid layouts with a pool set
+  if (in->is_file() &&
+      (in->inode.layout.pool_id == -1 || !in->inode.layout.is_valid())) {
+    dout(0) << "EMetaBlob.replay invalid layout on ino " << *in
+            << ": " << in->inode.layout << dendl;
+    std::ostringstream msg;
+    msg << "Invalid layout for inode " << in->ino() << " in journal";
+    mds->clog->error() << msg.str();
+    mds->damaged();
+    ceph_abort();  // Should be unreachable because damaged() calls respawn()
+  }
+}
+
+// EMetaBlob::remotebit
+
+void EMetaBlob::remotebit::encode(bufferlist& bl) const // serialize this remote dentry record; order must mirror decode()
+{
+ ENCODE_START(2, 2, bl); // presumably struct version 2, compat version 2
+ encode(dn, bl);
+ encode(dnfirst, bl);
+ encode(dnlast, bl);
+ encode(dnv, bl);
+ encode(ino, bl); // inode number the remote dentry refers to
+ encode(d_type, bl);
+ encode(dirty, bl);
+ ENCODE_FINISH(bl);
+}
+
+void EMetaBlob::remotebit::decode(bufferlist::const_iterator &bl) // inverse of encode(): reads the fields in the same order
+{
+ DECODE_START_LEGACY_COMPAT_LEN(2, 2, 2, bl);
+ decode(dn, bl);
+ decode(dnfirst, bl);
+ decode(dnlast, bl);
+ decode(dnv, bl);
+ decode(ino, bl);
+ decode(d_type, bl);
+ decode(dirty, bl);
+ DECODE_FINISH(bl);
+}
+
+/**
+ * Write a description of this remote dentry record to the formatter.
+ *
+ * The d_type field is reported as a file-type name.  A value that maps to
+ * no known file type is reported as "unknown".  The previous plain
+ * assert() made the outcome depend on NDEBUG: it aborted in assert-enabled
+ * builds and emitted an empty string otherwise.  d_type comes from decoded
+ * journal data, so a dump routine should report a bad value, not crash.
+ */
+void EMetaBlob::remotebit::dump(Formatter *f) const
+{
+  f->dump_string("dentry", dn);
+  f->dump_int("snapid.first", dnfirst);
+  f->dump_int("snapid.last", dnlast);
+  f->dump_int("dentry version", dnv);
+  f->dump_int("inodeno", ino);
+
+  const uint32_t type = DTTOIF(d_type) & S_IFMT; // convert to type entries
+  string type_string;
+  switch (type) {
+  case S_IFREG:
+    type_string = "file"; break;
+  case S_IFLNK:
+    type_string = "symlink"; break;
+  case S_IFDIR:
+    type_string = "directory"; break;
+  case S_IFIFO:
+    type_string = "fifo"; break;
+  case S_IFCHR:
+    type_string = "chr"; break;
+  case S_IFBLK:
+    type_string = "blk"; break;
+  case S_IFSOCK:
+    type_string = "sock"; break;
+  default:
+    // unrecognised d_type: report it explicitly, independent of NDEBUG
+    type_string = "unknown"; break;
+  }
+  f->dump_string("d_type", type_string);
+  f->dump_string("dirty", dirty ? "true" : "false");
+}
+
+// Append one sample remotebit (a regular-file d_type, not dirty) to ls.
+void EMetaBlob::remotebit::
+generate_test_instances(list<EMetaBlob::remotebit*>& ls)
+{
+  ls.push_back(new remotebit("/test/dn", 0, 10, 15, 1, IFTODT(S_IFREG), false));
+}
+
+// EMetaBlob::nullbit
+
+void EMetaBlob::nullbit::encode(bufferlist& bl) const // serialize this null dentry record; order must mirror decode()
+{
+ ENCODE_START(2, 2, bl); // presumably struct version 2, compat version 2
+ encode(dn, bl);
+ encode(dnfirst, bl);
+ encode(dnlast, bl);
+ encode(dnv, bl);
+ encode(dirty, bl);
+ ENCODE_FINISH(bl);
+}
+
+void EMetaBlob::nullbit::decode(bufferlist::const_iterator &bl) // inverse of encode(): reads the fields in the same order
+{
+ DECODE_START_LEGACY_COMPAT_LEN(2, 2, 2, bl);
+ decode(dn, bl);
+ decode(dnfirst, bl);
+ decode(dnlast, bl);
+ decode(dnv, bl);
+ decode(dirty, bl);
+ DECODE_FINISH(bl);
+}
+
+// Write the dentry name, snapid range, version and dirty flag to the formatter.
+void EMetaBlob::nullbit::dump(Formatter *f) const
+{
+  f->dump_string("dentry", dn);
+  f->dump_int("snapid.first", dnfirst);
+  f->dump_int("snapid.last", dnlast);
+  f->dump_int("dentry version", dnv);
+
+  const char *dirty_text = dirty ? "true" : "false";
+  f->dump_string("dirty", dirty_text);
+}
+
+// Append two sample nullbits to ls: a clean one followed by a dirty one.
+void EMetaBlob::nullbit::generate_test_instances(list<nullbit*>& ls)
+{
+  nullbit *clean = new nullbit("/test/dentry", 0, 10, 15, false);
+  nullbit *dirtied = new nullbit("/test/dirty", 10, 20, 25, true);
+  for (nullbit *sample : {clean, dirtied})
+    ls.push_back(sample);
+}
+
+// EMetaBlob::dirlump
+
+void EMetaBlob::dirlump::encode(bufferlist& bl, uint64_t features) const // serialize this dirlump; order must mirror decode()
+{
+ ENCODE_START(2, 2, bl); // presumably struct version 2, compat version 2
+ encode(fnode, bl);
+ encode(state, bl);
+ encode(nfull, bl);
+ encode(nremote, bl);
+ encode(nnull, bl);
+ _encode_bits(features); // presumably (re)builds dnbl from the bit lists before it is written -- confirm in the header
+ encode(dnbl, bl);
+ ENCODE_FINISH(bl);
+}
+
+void EMetaBlob::dirlump::decode(bufferlist::const_iterator &bl) // reads the header fields and the still-encoded dentry blob
+{
+ DECODE_START_LEGACY_COMPAT_LEN(2, 2, 2, bl)
+ decode(fnode, bl);
+ decode(state, bl);
+ decode(nfull, bl);
+ decode(nremote, bl);
+ decode(nnull, bl);
+ decode(dnbl, bl); // the bits stay encoded inside dnbl for now
+ dn_decoded = false; // don't decode bits unless we need them.
+ DECODE_FINISH(bl);
+}
+
+/**
+ * Write a description of this dirlump to the formatter: fnode, state, the
+ * three bit counters, then the full / remote / null bits.  The bits are
+ * decoded first if that has not happened yet.
+ */
+void EMetaBlob::dirlump::dump(Formatter *f) const
+{
+  if (!dn_decoded)
+    _decode_bits();
+
+  f->open_object_section("fnode");
+  fnode.dump(f);
+  f->close_section(); // fnode
+  f->dump_string("state", state_string());
+  f->dump_int("nfull", nfull);
+  f->dump_int("nremote", nremote);
+  f->dump_int("nnull", nnull);
+
+  // Each bit list is dumped as an array of objects, one object per bit.
+  auto dump_bits = [f](const char *array_name, const char *item_name,
+                       const auto& bits) {
+    f->open_array_section(array_name);
+    for (const auto& bit : bits) {
+      f->open_object_section(item_name);
+      bit.dump(f);
+      f->close_section(); // one bit
+    }
+    f->close_section(); // whole array
+  };
+
+  dump_bits("full bits", "fullbit", dfull);
+  dump_bits("remote bits", "remotebit", dremote);
+  dump_bits("null bits", "null bit", dnull);
+}
+
+// Append a single default-constructed dirlump to ls.
+void EMetaBlob::dirlump::generate_test_instances(list<dirlump*>& ls)
+{
+  dirlump *blank = new dirlump();
+  ls.push_back(blank);
+}
+
+/**
+ * EMetaBlob proper
+ */
+void EMetaBlob::encode(bufferlist& bl, uint64_t features) const // serialize the whole metablob; order must mirror decode()
+{
+ ENCODE_START(8, 5, bl); // presumably struct version 8, compat version 5
+ encode(lump_order, bl);
+ encode(lump_map, bl, features);
+ encode(roots, bl, features);
+ encode(table_tids, bl);
+ encode(opened_ino, bl);
+ encode(allocated_ino, bl);
+ encode(used_preallocated_ino, bl);
+ encode(preallocated_inos, bl);
+ encode(client_name, bl);
+ encode(inotablev, bl);
+ encode(sessionmapv, bl);
+ encode(truncate_start, bl);
+ encode(truncate_finish, bl);
+ encode(destroyed_inodes, bl);
+ encode(client_reqs, bl);
+ encode(renamed_dirino, bl);
+ encode(renamed_dir_frags, bl);
+ {
+ // make MDSRank use v6 format happy
+ int64_t i = -1; // placeholder values; decode() reads and ignores both for struct_v >= 6
+ bool b = false;
+ encode(i, bl);
+ encode(b, bl);
+ }
+ encode(client_flushes, bl); // decode() reads this back only when struct_v >= 8
+ ENCODE_FINISH(bl);
+}
+void EMetaBlob::decode(bufferlist::const_iterator &bl) // inverse of encode(); also accepts older struct versions
+{
+ DECODE_START_LEGACY_COMPAT_LEN(7, 5, 5, bl); // NOTE(review): declares v7 although encode() writes v8 and struct_v >= 8 is handled below -- confirm whether this should be 8
+ decode(lump_order, bl);
+ decode(lump_map, bl);
+ if (struct_v >= 4) {
+ decode(roots, bl);
+ } else {
+ // before v4 at most one root was stored, wrapped in its own bufferlist
+ bufferlist rootbl;
+ decode(rootbl, bl);
+ if (rootbl.length()) {
+ auto p = rootbl.cbegin();
+ roots.emplace_back(p);
+ }
+ }
+ decode(table_tids, bl);
+ decode(opened_ino, bl);
+ decode(allocated_ino, bl);
+ decode(used_preallocated_ino, bl);
+ decode(preallocated_inos, bl);
+ decode(client_name, bl);
+ decode(inotablev, bl);
+ decode(sessionmapv, bl);
+ decode(truncate_start, bl);
+ decode(truncate_finish, bl);
+ decode(destroyed_inodes, bl);
+ if (struct_v >= 2) {
+ decode(client_reqs, bl);
+ } else {
+ // v1 stored bare request ids; pair each one with 0 as the second element
+ list<metareqid_t> r;
+ decode(r, bl);
+ while (!r.empty()) {
+ client_reqs.push_back(pair<metareqid_t,uint64_t>(r.front(), 0));
+ r.pop_front();
+ }
+ }
+ if (struct_v >= 3) {
+ decode(renamed_dirino, bl);
+ decode(renamed_dir_frags, bl);
+ }
+ if (struct_v >= 6) {
+ // ignore
+ int64_t i; // placeholders written by encode(); values are discarded
+ bool b;
+ decode(i, bl);
+ decode(b, bl);
+ }
+ if (struct_v >= 8) {
+ decode(client_flushes, bl);
+ }
+ DECODE_FINISH(bl);
+}
+
+
+/**
+ * Get all inodes touched by this metablob. Includes the 'bits' within
+ * dirlumps, and the inodes of the dirs themselves.
+ */
+void EMetaBlob::get_inodes(
+    std::set<inodeno_t> &inodes) const
+{
+  // Visit every dirlump: record the directory's own ino, then the inos
+  // referenced by its full and remote bits.
+  for (const auto& [dirfrag, lump] : lump_map) {
+    inodes.insert(dirfrag.ino);
+
+    // the bits must be decoded before they can be inspected
+    lump._decode_bits();
+
+    for (const auto& full : lump.get_dfull()) {
+      inodes.insert(full.inode.ino);
+    }
+    for (const auto& remote : lump.get_dremote()) {
+      inodes.insert(remote.ino);
+    }
+  }
+}
+
+
+/**
+ * Get a map of dirfrag to set of dentries in that dirfrag which are
+ * touched in this operation.
+ */
+void EMetaBlob::get_dentries(std::map<dirfrag_t, std::set<std::string> > &dentries) const
+{
+  for (const auto& [dirfrag, lump] : lump_map) {
+    // the bits must be decoded before they can be inspected
+    lump._decode_bits();
+
+    // Record the dentry name of every full, remote and null bit.  An entry
+    // for the dirfrag is only created once a first name is inserted.
+    for (const auto& full : lump.get_dfull()) {
+      dentries[dirfrag].insert(full.dn);
+    }
+    for (const auto& remote : lump.get_dremote()) {
+      dentries[dirfrag].insert(remote.dn);
+    }
+    for (const auto& null : lump.get_dnull()) {
+      dentries[dirfrag].insert(null.dn);
+    }
+  }
+}
+
+
+
+/**
+ * Calculate all paths that we can infer are touched by this metablob. Only uses
+ * information local to this metablob so it may only be the path within the
+ * subtree.
+ */
+void EMetaBlob::get_paths(
+    std::vector<std::string> &paths) const
+{
+  // Each dentry has a 'location' which is a 2-tuple of parent inode and dentry name
+  typedef std::pair<inodeno_t, std::string> Location;
+
+  // Whenever we see a dentry within a dirlump, we remember it as a child of
+  // the dirlump's inode
+  std::map<inodeno_t, std::list<std::string> > children;
+
+  // Whenever we see a location for an inode, remember it: this allows us to
+  // build a path given an inode
+  std::map<inodeno_t, Location> ino_locations;
+
+  // Special case: operations on root inode populate roots but not dirlumps
+  if (lump_map.empty() && !roots.empty()) {
+    paths.push_back("/");
+    return;
+  }
+
+  // First pass
+  // ==========
+  // Build a tiny local metadata cache for the path structure in this metablob
+  for (std::map<dirfrag_t, dirlump>::const_iterator i = lump_map.begin(); i != lump_map.end(); ++i) {
+    inodeno_t const dir_ino = i->first.ino;
+    dirlump const &dl = i->second;
+    dl._decode_bits();
+
+    for (const auto& iter : dl.get_dfull()) {
+      std::string_view dentry = iter.dn;
+      children[dir_ino].emplace_back(dentry);
+      ino_locations[iter.inode.ino] = Location(dir_ino, dentry);
+    }
+
+    for (const auto& iter : dl.get_dremote()) {
+      std::string_view dentry = iter.dn;
+      children[dir_ino].emplace_back(dentry);
+    }
+
+    for (const auto& iter : dl.get_dnull()) {
+      std::string_view dentry = iter.dn;
+      children[dir_ino].emplace_back(dentry);
+    }
+  }
+
+  std::vector<Location> leaf_locations;
+
+  // Second pass
+  // ===========
+  // Output paths for all childless nodes in the metablob
+  for (std::map<dirfrag_t, dirlump>::const_iterator i = lump_map.begin(); i != lump_map.end(); ++i) {
+    inodeno_t const dir_ino = i->first.ino;
+    dirlump const &dl = i->second;
+    dl._decode_bits();
+
+    for (const auto& iter : dl.get_dfull()) {
+      std::string_view dentry = iter.dn;
+      // a full bit is a leaf only if no dirlump listed children under its inode
+      if (children.find(iter.inode.ino) == children.end()) {
+        leaf_locations.push_back(Location(dir_ino, dentry));
+      }
+    }
+
+    for (const auto& iter : dl.get_dremote()) {
+      std::string_view dentry = iter.dn;
+      leaf_locations.push_back(Location(dir_ino, dentry));
+    }
+
+    for (const auto& iter : dl.get_dnull()) {
+      std::string_view dentry = iter.dn;
+      leaf_locations.push_back(Location(dir_ino, dentry));
+    }
+  }
+
+  // For all the leaf locations identified, generate paths by walking the
+  // parent chain recorded in ino_locations.
+  for (std::vector<Location>::iterator i = leaf_locations.begin(); i != leaf_locations.end(); ++i) {
+    Location const &leaf = *i;
+    std::string path = leaf.second;
+
+    // Directory inodes already prepended to this path.  A well-formed blob
+    // never revisits one, but a malformed blob could place an inode inside
+    // itself (directly or through a cycle), and the walk would then never
+    // terminate.  Stop at the first repeat and keep the partial path.
+    std::set<inodeno_t> visited;
+
+    std::map<inodeno_t, Location>::iterator iter = ino_locations.find(leaf.first);
+    while (iter != ino_locations.end() && visited.insert(iter->first).second) {
+      Location const &parent_loc = iter->second;
+      if (!path.empty()) {
+        path = parent_loc.second + "/" + path;
+      } else {
+        path = parent_loc.second + path;
+      }
+      iter = ino_locations.find(parent_loc.first);
+    }
+
+    paths.push_back(path);
+  }
+}
+
+
+ // Write every field of this metablob to the given Formatter, one section
+ // per field group. Section and key names are part of the dump output and
+ // are kept exactly as they were.
+ void EMetaBlob::dump(Formatter *f) const
+ {
+   // Directory lumps, walked in lump_order and looked up in lump_map.
+   f->open_array_section("lumps");
+   for (const auto& frag : lump_order) {
+     f->open_object_section("lump");
+
+     f->open_object_section("dirfrag");
+     f->dump_stream("dirfrag") << frag;
+     f->close_section();
+
+     f->open_object_section("dirlump");
+     lump_map.at(frag).dump(f);
+     f->close_section();
+
+     f->close_section();
+   }
+   f->close_section();
+
+   // Root entries.
+   f->open_array_section("roots");
+   for (const auto& root : roots) {
+     f->open_object_section("root");
+     root.dump(f);
+     f->close_section();
+   }
+   f->close_section();
+
+   // Table client transactions (pairs stored in table_tids).
+   f->open_array_section("tableclient tranactions");
+   for (const auto& entry : table_tids) {
+     f->open_object_section("transaction");
+     f->dump_int("tid", entry.first);
+     f->dump_int("version", entry.second);
+     f->close_section();
+   }
+   f->close_section();
+
+   // Directory rename bookkeeping.
+   f->dump_int("renamed directory inodeno", renamed_dirino);
+
+   f->open_array_section("renamed directory fragments");
+   for (const auto& frag : renamed_dir_frags) {
+     f->dump_int("frag", frag);
+   }
+   f->close_section();
+
+   // Table versions and inode allocation state.
+   f->dump_int("inotable version", inotablev);
+   f->dump_int("SessionMap version", sessionmapv);
+   f->dump_int("allocated ino", allocated_ino);
+
+   f->dump_stream("preallocated inos") << preallocated_inos;
+   f->dump_int("used preallocated ino", used_preallocated_ino);
+
+   f->open_object_section("client name");
+   client_name.dump(f);
+   f->close_section();
+
+   // Truncates that start / finish in this blob.
+   f->open_array_section("inodes starting a truncate");
+   for (const auto& started : truncate_start) {
+     f->dump_int("inodeno", started);
+   }
+   f->close_section();
+
+   f->open_array_section("inodes finishing a truncated");
+   for (const auto& finished : truncate_finish) {
+     f->open_object_section("inode+segment");
+     f->dump_int("inodeno", finished.first);
+     f->dump_int("truncate starting segment", finished.second);
+     f->close_section();
+   }
+   f->close_section();
+
+   // Inodes destroyed by this blob.
+   f->open_array_section("destroyed inodes");
+   for (const auto& gone : destroyed_inodes) {
+     f->dump_int("inodeno", gone);
+   }
+   f->close_section();
+
+   // Client requests recorded with this blob.
+   f->open_array_section("client requests");
+   for (const auto& req : client_reqs) {
+     f->open_object_section("Client request");
+     f->dump_stream("request ID") << req.first;
+     f->dump_int("oldest request on client", req.second);
+     f->close_section();
+   }
+   f->close_section();
+ }
+
+ // Append a single default-constructed EMetaBlob to the caller's list.
+ void EMetaBlob::generate_test_instances(list<EMetaBlob*>& ls)
+ {
+   EMetaBlob *blank = new EMetaBlob();
+   ls.push_back(blank);
+ }
+
+ void EMetaBlob::replay(MDSRank *mds, LogSegment *logseg, MDSlaveUpdate *slaveup)
+ { /*
+ * Applies this journaled metablob to the MDS in-memory state during replay.
+ *
+ * mds     - rank whose mdcache, locker, inotable, sessionmap and mdlog are updated.
+ * logseg  - segment that dirtied items are attached to; asserted non-null below.
+ * slaveup - may be null; when set, old dirs and unlinked inodes are recorded on it
+ *           instead of being trimmed/removed here.
+ *
+ * Order of work: roots, per-dirlump dirfrag + full/remote/null dentries, rename
+ * subtree fixups, never-relinked inodes, table client transactions, opened ino,
+ * inotable and sessionmap versions, truncates, destroyed inodes, client requests
+ * and flushes, then update_segment().
+ *
+ * The mds_kill_journal_replay_at asserts (values 1-4) abort at fixed points of
+ * this function; presumably a fault-injection hook for tests - confirm.
+ */
+ dout(10) << "EMetaBlob.replay " << lump_map.size() << " dirlumps by " << client_name << dendl;
+
+ ceph_assert(logseg);
+
+ ceph_assert(g_conf()->mds_kill_journal_replay_at != 1);
+ // Root inodes: create any that are not in the cache, then refresh from the journaled copy.
+ for (auto& p : roots) {
+ CInode *in = mds->mdcache->get_inode(p.inode.ino);
+ bool isnew = in ? false:true;
+ if (!in)
+ in = new CInode(mds->mdcache, false, 2, CEPH_NOSNAP);
+ p.update_inode(mds, in);
+
+ if (isnew)
+ mds->mdcache->add_inode(in);
+ if (p.is_dirty()) in->_mark_dirty(logseg);
+ dout(10) << "EMetaBlob.replay " << (isnew ? " added root ":" updated root ") << *in << dendl;
+ }
+ // Directory rename: look up the renamed inode (it may not be cached yet) and sanity-check null dentries.
+ CInode *renamed_diri = 0;
+ CDir *olddir = 0;
+ if (renamed_dirino) {
+ renamed_diri = mds->mdcache->get_inode(renamed_dirino);
+ if (renamed_diri)
+ dout(10) << "EMetaBlob.replay renamed inode is " << *renamed_diri << dendl;
+ else
+ dout(10) << "EMetaBlob.replay don't have renamed ino " << renamed_dirino << dendl;
+
+ int nnull = 0;
+ for (const auto& lp : lump_order) {
+ dirlump &lump = lump_map[lp];
+ if (lump.nnull) {
+ dout(10) << "EMetaBlob.replay found null dentry in dir " << lp << dendl;
+ nnull += lump.nnull;
+ }
+ }
+ ceph_assert(nnull <= 1); // at most one null dentry across all lumps when a dir rename is journaled
+ }
+
+ // keep track of any inodes we unlink and don't relink elsewhere
+ map<CInode*, CDir*> unlinked; // inode -> dir it was unlinked from
+ set<CInode*> linked; // inodes from 'unlinked' that were linked again during this replay
+
+ // walk through my dirs (in order!)
+ int count = 0; // items processed so far; the heartbeat is reset every 1000
+ for (const auto& lp : lump_order) {
+ dout(10) << "EMetaBlob.replay dir " << lp << dendl;
+ dirlump &lump = lump_map[lp];
+
+ // the dir
+ CDir *dir = mds->mdcache->get_force_dirfrag(lp, true);
+ if (!dir) {
+ // hmm. do i have the inode?
+ CInode *diri = mds->mdcache->get_inode((lp).ino);
+ if (!diri) {
+ if (MDS_INO_IS_MDSDIR(lp.ino)) {
+ ceph_assert(MDS_INO_MDSDIR(mds->get_nodeid()) != lp.ino); // must not be this rank's own mdsdir
+ diri = mds->mdcache->create_system_inode(lp.ino, S_IFDIR|0755);
+ diri->state_clear(CInode::STATE_AUTH);
+ dout(10) << "EMetaBlob.replay created base " << *diri << dendl;
+ } else {
+ dout(0) << "EMetaBlob.replay missing dir ino " << lp.ino << dendl;
+ mds->clog->error() << "failure replaying journal (EMetaBlob)";
+ mds->damaged();
+ ceph_abort(); // Should be unreachable because damaged() calls respawn()
+ }
+ }
+
+ // create the dirfrag
+ dir = diri->get_or_open_dirfrag(mds->mdcache, lp.frag);
+
+ if (MDS_INO_IS_BASE(lp.ino))
+ mds->mdcache->adjust_subtree_auth(dir, CDIR_AUTH_UNDEF);
+
+ dout(10) << "EMetaBlob.replay added dir " << *dir << dendl;
+ }
+ dir->set_version( lump.fnode.version );
+ dir->fnode = lump.fnode; // take the journaled fnode wholesale
+
+ if (lump.is_importing()) {
+ dir->state_set(CDir::STATE_AUTH);
+ dir->state_clear(CDir::STATE_COMPLETE);
+ }
+ if (lump.is_dirty()) {
+ dir->_mark_dirty(logseg);
+ // rstat / fragstat that differ from their accounted_* copies mark the matching scatterlock as updated
+ if (!(dir->fnode.rstat == dir->fnode.accounted_rstat)) {
+ dout(10) << "EMetaBlob.replay dirty nestinfo on " << *dir << dendl;
+ mds->locker->mark_updated_scatterlock(&dir->inode->nestlock);
+ logseg->dirty_dirfrag_nest.push_back(&dir->inode->item_dirty_dirfrag_nest);
+ } else {
+ dout(10) << "EMetaBlob.replay clean nestinfo on " << *dir << dendl;
+ }
+ if (!(dir->fnode.fragstat == dir->fnode.accounted_fragstat)) {
+ dout(10) << "EMetaBlob.replay dirty fragstat on " << *dir << dendl;
+ mds->locker->mark_updated_scatterlock(&dir->inode->filelock);
+ logseg->dirty_dirfrag_dir.push_back(&dir->inode->item_dirty_dirfrag_dir);
+ } else {
+ dout(10) << "EMetaBlob.replay clean fragstat on " << *dir << dendl;
+ }
+ }
+ if (lump.is_dirty_dft()) {
+ dout(10) << "EMetaBlob.replay dirty dirfragtree on " << *dir << dendl;
+ dir->state_set(CDir::STATE_DIRTYDFT);
+ mds->locker->mark_updated_scatterlock(&dir->inode->dirfragtreelock);
+ logseg->dirty_dirfrag_dirfragtree.push_back(&dir->inode->item_dirty_dirfrag_dirfragtree);
+ }
+ if (lump.is_new())
+ dir->mark_new(logseg);
+ if (lump.is_complete())
+ dir->mark_complete();
+
+ dout(10) << "EMetaBlob.replay updated dir " << *dir << dendl;
+
+ // decode bits
+ lump._decode_bits();
+
+ // full dentry+inode pairs
+ for (auto& fb : lump._get_dfull()) {
+ CDentry *dn = dir->lookup_exact_snap(fb.dn, fb.dnlast);
+ if (!dn) {
+ dn = dir->add_null_dentry(fb.dn, fb.dnfirst, fb.dnlast);
+ dn->set_version(fb.dnv);
+ if (fb.is_dirty()) dn->_mark_dirty(logseg);
+ dout(10) << "EMetaBlob.replay added (full) " << *dn << dendl;
+ } else {
+ dn->set_version(fb.dnv);
+ if (fb.is_dirty()) dn->_mark_dirty(logseg);
+ dout(10) << "EMetaBlob.replay for [" << fb.dnfirst << "," << fb.dnlast << "] had " << *dn << dendl;
+ dn->first = fb.dnfirst;
+ ceph_assert(dn->last == fb.dnlast);
+ }
+ if (lump.is_importing())
+ dn->state_set(CDentry::STATE_AUTH);
+ // Find or create the inode for this dentry, then make dn its primary link.
+ CInode *in = mds->mdcache->get_inode(fb.inode.ino, fb.dnlast);
+ if (!in) {
+ in = new CInode(mds->mdcache, dn->is_auth(), fb.dnfirst, fb.dnlast);
+ fb.update_inode(mds, in);
+ mds->mdcache->add_inode(in);
+ if (!dn->get_linkage()->is_null()) {
+ if (dn->get_linkage()->is_primary()) {
+ unlinked[dn->get_linkage()->get_inode()] = dir;
+ stringstream ss;
+ ss << "EMetaBlob.replay FIXME had dentry linked to wrong inode " << *dn
+ << " " << *dn->get_linkage()->get_inode() << " should be " << fb.inode.ino;
+ dout(0) << ss.str() << dendl;
+ mds->clog->warn(ss);
+ }
+ dir->unlink_inode(dn, false);
+ }
+ if (unlinked.count(in))
+ linked.insert(in);
+ dir->link_primary_inode(dn, in);
+ dout(10) << "EMetaBlob.replay added " << *in << dendl;
+ } else {
+ in->first = fb.dnfirst;
+ fb.update_inode(mds, in);
+ if (dn->get_linkage()->get_inode() != in && in->get_parent_dn()) { // inode is currently linked under a different dentry
+ dout(10) << "EMetaBlob.replay unlinking " << *in << dendl;
+ unlinked[in] = in->get_parent_dir();
+ in->get_parent_dir()->unlink_inode(in->get_parent_dn());
+ }
+ if (dn->get_linkage()->get_inode() != in) {
+ if (!dn->get_linkage()->is_null()) { // note: might be remote. as with stray reintegration.
+ if (dn->get_linkage()->is_primary()) {
+ unlinked[dn->get_linkage()->get_inode()] = dir;
+ stringstream ss;
+ ss << "EMetaBlob.replay FIXME had dentry linked to wrong inode " << *dn
+ << " " << *dn->get_linkage()->get_inode() << " should be " << fb.inode.ino;
+ dout(0) << ss.str() << dendl;
+ mds->clog->warn(ss);
+ }
+ dir->unlink_inode(dn, false);
+ }
+ if (unlinked.count(in))
+ linked.insert(in);
+ dir->link_primary_inode(dn, in);
+ dout(10) << "EMetaBlob.replay linked " << *in << dendl;
+ } else {
+ dout(10) << "EMetaBlob.replay for [" << fb.dnfirst << "," << fb.dnlast << "] had " << *in << dendl;
+ }
+ ceph_assert(in->first == fb.dnfirst ||
+ (in->is_multiversion() && in->first > fb.dnfirst));
+ }
+ if (fb.is_dirty())
+ in->_mark_dirty(logseg);
+ if (fb.is_dirty_parent())
+ in->mark_dirty_parent(logseg, fb.is_dirty_pool());
+ if (fb.need_snapflush())
+ logseg->open_files.push_back(&in->item_open_file);
+ if (dn->is_auth()) // the inode's AUTH state follows its dentry
+ in->state_set(CInode::STATE_AUTH);
+ else
+ in->state_clear(CInode::STATE_AUTH);
+ ceph_assert(g_conf()->mds_kill_journal_replay_at != 2);
+
+ if (!(++count % 1000))
+ mds->heartbeat_reset();
+ }
+
+ // remote dentries
+ for (const auto& rb : lump.get_dremote()) {
+ CDentry *dn = dir->lookup_exact_snap(rb.dn, rb.dnlast);
+ if (!dn) {
+ dn = dir->add_remote_dentry(rb.dn, rb.ino, rb.d_type, rb.dnfirst, rb.dnlast);
+ dn->set_version(rb.dnv);
+ if (rb.dirty) dn->_mark_dirty(logseg);
+ dout(10) << "EMetaBlob.replay added " << *dn << dendl;
+ } else {
+ if (!dn->get_linkage()->is_null()) {
+ dout(10) << "EMetaBlob.replay unlinking " << *dn << dendl;
+ if (dn->get_linkage()->is_primary()) {
+ unlinked[dn->get_linkage()->get_inode()] = dir;
+ stringstream ss;
+ ss << "EMetaBlob.replay FIXME had dentry linked to wrong inode " << *dn
+ << " " << *dn->get_linkage()->get_inode() << " should be remote " << rb.ino;
+ dout(0) << ss.str() << dendl; // NOTE(review): unlike the full-dentry path, no clog->warn(ss) here - confirm intended
+ }
+ dir->unlink_inode(dn, false);
+ }
+ dir->link_remote_inode(dn, rb.ino, rb.d_type);
+ dn->set_version(rb.dnv);
+ if (rb.dirty) dn->_mark_dirty(logseg);
+ dout(10) << "EMetaBlob.replay for [" << rb.dnfirst << "," << rb.dnlast << "] had " << *dn << dendl;
+ dn->first = rb.dnfirst;
+ ceph_assert(dn->last == rb.dnlast);
+ }
+ if (lump.is_importing())
+ dn->state_set(CDentry::STATE_AUTH);
+
+ if (!(++count % 1000))
+ mds->heartbeat_reset();
+ }
+
+ // null dentries
+ for (const auto& nb : lump.get_dnull()) {
+ CDentry *dn = dir->lookup_exact_snap(nb.dn, nb.dnlast);
+ if (!dn) {
+ dn = dir->add_null_dentry(nb.dn, nb.dnfirst, nb.dnlast);
+ dn->set_version(nb.dnv);
+ if (nb.dirty) dn->_mark_dirty(logseg);
+ dout(10) << "EMetaBlob.replay added (nullbit) " << *dn << dendl;
+ } else {
+ dn->first = nb.dnfirst;
+ if (!dn->get_linkage()->is_null()) {
+ dout(10) << "EMetaBlob.replay unlinking " << *dn << dendl;
+ CInode *in = dn->get_linkage()->get_inode();
+ // For renamed inode, We may call CInode::force_dirfrag() later.
+ // CInode::force_dirfrag() doesn't work well when inode is detached
+ // from the hierarchy.
+ if (!renamed_diri || renamed_diri != in) {
+ if (dn->get_linkage()->is_primary())
+ unlinked[in] = dir;
+ dir->unlink_inode(dn);
+ }
+ }
+ dn->set_version(nb.dnv);
+ if (nb.dirty) dn->_mark_dirty(logseg);
+ dout(10) << "EMetaBlob.replay had " << *dn << dendl;
+ ceph_assert(dn->last == nb.dnlast);
+ }
+ olddir = dir; // remember the last dir that carried a null dentry
+ if (lump.is_importing())
+ dn->state_set(CDentry::STATE_AUTH);
+
+ // Make null dentries the first things we trim
+ dout(10) << "EMetaBlob.replay pushing to bottom of lru " << *dn << dendl;
+ // NOTE(review): only the log statement above is present; no LRU call is made in this loop.
+ if (!(++count % 1000))
+ mds->heartbeat_reset();
+ }
+ }
+
+ ceph_assert(g_conf()->mds_kill_journal_replay_at != 3);
+ // Directory rename: adjust subtrees for the renamed inode and open any import bounds.
+ if (renamed_dirino) {
+ if (renamed_diri) {
+ ceph_assert(unlinked.count(renamed_diri));
+ ceph_assert(linked.count(renamed_diri));
+ olddir = unlinked[renamed_diri]; // overrides the null-dentry guess with the dir it was actually unlinked from
+ } else {
+ // we imported a diri we haven't seen before
+ renamed_diri = mds->mdcache->get_inode(renamed_dirino);
+ ceph_assert(renamed_diri); // it was in the metablob
+ }
+
+ if (olddir) {
+ if (olddir->authority() != CDIR_AUTH_UNDEF &&
+ renamed_diri->authority() == CDIR_AUTH_UNDEF) {
+ ceph_assert(slaveup); // auth to non-auth, must be slave prepare
+ frag_vec_t leaves;
+ renamed_diri->dirfragtree.get_leaves(leaves);
+ for (const auto& leaf : leaves) {
+ CDir *dir = renamed_diri->get_dirfrag(leaf);
+ ceph_assert(dir);
+ if (dir->get_dir_auth() == CDIR_AUTH_UNDEF)
+ // preserve subtree bound until slave commit
+ slaveup->olddirs.insert(dir->inode);
+ else
+ dir->state_set(CDir::STATE_AUTH);
+
+ if (!(++count % 1000))
+ mds->heartbeat_reset();
+ }
+ }
+
+ mds->mdcache->adjust_subtree_after_rename(renamed_diri, olddir, false);
+
+ // see if we can discard the subtree we renamed out of
+ CDir *root = mds->mdcache->get_subtree_root(olddir);
+ if (root->get_dir_auth() == CDIR_AUTH_UNDEF) {
+ if (slaveup) // preserve the old dir until slave commit
+ slaveup->olddirs.insert(olddir->inode);
+ else
+ mds->mdcache->try_trim_non_auth_subtree(root);
+ }
+ }
+
+ // if we are the srci importer, we'll also have some dirfrags we have to open up...
+ if (renamed_diri->authority() != CDIR_AUTH_UNDEF) {
+ for (const auto& p : renamed_dir_frags) {
+ CDir *dir = renamed_diri->get_dirfrag(p);
+ if (dir) {
+ // we already had the inode before, and we already adjusted this subtree accordingly.
+ dout(10) << " already had+adjusted rename import bound " << *dir << dendl;
+ ceph_assert(olddir);
+ continue;
+ }
+ dir = renamed_diri->get_or_open_dirfrag(mds->mdcache, p);
+ dout(10) << " creating new rename import bound " << *dir << dendl;
+ dir->state_clear(CDir::STATE_AUTH);
+ mds->mdcache->adjust_subtree_auth(dir, CDIR_AUTH_UNDEF);
+
+ if (!(++count % 1000))
+ mds->heartbeat_reset();
+ }
+ }
+
+ // rename may overwrite an empty directory and move it into stray dir.
+ unlinked.erase(renamed_diri);
+ for (map<CInode*, CDir*>::iterator p = unlinked.begin(); p != unlinked.end(); ++p) {
+ if (!linked.count(p->first)) // only inodes that were relinked are adjusted here
+ continue;
+ ceph_assert(p->first->is_dir());
+ mds->mdcache->adjust_subtree_after_rename(p->first, p->second, false);
+
+ if (!(++count % 1000))
+ mds->heartbeat_reset();
+ }
+ }
+ // Inodes that were unlinked and never relinked: hand them to slaveup, or drop them from the cache.
+ if (!unlinked.empty()) {
+ for (set<CInode*>::iterator p = linked.begin(); p != linked.end(); ++p)
+ unlinked.erase(*p);
+ dout(10) << " unlinked set contains " << unlinked << dendl;
+ for (map<CInode*, CDir*>::iterator p = unlinked.begin(); p != unlinked.end(); ++p) {
+ CInode *in = p->first;
+ if (slaveup) { // preserve unlinked inodes until slave commit
+ slaveup->unlinked.insert(in);
+ if (in->snaprealm)
+ in->snaprealm->adjust_parent();
+ } else
+ mds->mdcache->remove_inode_recursive(in);
+
+ if (!(++count % 1000))
+ mds->heartbeat_reset();
+ }
+ }
+
+ // table client transactions
+ for (const auto& p : table_tids) {
+ dout(10) << "EMetaBlob.replay noting " << get_mdstable_name(p.first)
+ << " transaction " << p.second << dendl;
+ MDSTableClient *client = mds->get_table_client(p.first);
+ if (client) // silently skipped when there is no client for this table
+ client->got_journaled_agree(p.second, logseg);
+
+ if (!(++count % 1000))
+ mds->heartbeat_reset();
+ }
+
+ // opened ino?
+ if (opened_ino) {
+ CInode *in = mds->mdcache->get_inode(opened_ino);
+ ceph_assert(in);
+ dout(10) << "EMetaBlob.replay noting opened inode " << *in << dendl;
+ logseg->open_files.push_back(&in->item_open_file);
+ }
+
+ // allocated_inos
+ if (inotablev) {
+ if (mds->inotable->get_version() >= inotablev) { // table already at or past this event: nothing to apply
+ dout(10) << "EMetaBlob.replay inotable tablev " << inotablev
+ << " <= table " << mds->inotable->get_version() << dendl;
+ } else {
+ dout(10) << "EMetaBlob.replay inotable v " << inotablev
+ << " - 1 == table " << mds->inotable->get_version()
+ << " allocated+used " << allocated_ino
+ << " prealloc " << preallocated_inos
+ << dendl;
+ if (allocated_ino)
+ mds->inotable->replay_alloc_id(allocated_ino);
+ if (preallocated_inos.size())
+ mds->inotable->replay_alloc_ids(preallocated_inos);
+
+ // [repair bad inotable updates]
+ if (inotablev > mds->inotable->get_version()) {
+ mds->clog->error() << "journal replay inotablev mismatch "
+ << mds->inotable->get_version() << " -> " << inotablev;
+ mds->inotable->force_replay_version(inotablev);
+ }
+
+ ceph_assert(inotablev == mds->inotable->get_version());
+ }
+ }
+ if (sessionmapv) {
+ unsigned diff = (used_preallocated_ino && !preallocated_inos.empty()) ? 2 : 1; // expected version step: 2 when an ino was used and new ones preallocated, else 1
+ if (mds->sessionmap.get_version() >= sessionmapv) {
+ dout(10) << "EMetaBlob.replay sessionmap v " << sessionmapv
+ << " <= table " << mds->sessionmap.get_version() << dendl;
+ } else if (mds->sessionmap.get_version() + diff == sessionmapv) {
+ dout(10) << "EMetaBlob.replay sessionmap v " << sessionmapv
+ << " - " << diff << " == table " << mds->sessionmap.get_version()
+ << " prealloc " << preallocated_inos
+ << " used " << used_preallocated_ino
+ << dendl;
+ Session *session = mds->sessionmap.get_session(client_name);
+ if (session) {
+ dout(20) << " (session prealloc " << session->info.prealloc_inos << ")" << dendl;
+ if (used_preallocated_ino) {
+ if (!session->info.prealloc_inos.empty()) {
+ inodeno_t next = session->next_ino();
+ inodeno_t i = session->take_ino(used_preallocated_ino);
+ if (next != i)
+ mds->clog->warn() << " replayed op " << client_reqs << " used ino " << i
+ << " but session next is " << next;
+ ceph_assert(i == used_preallocated_ino);
+ session->info.used_inos.clear();
+ }
+ mds->sessionmap.replay_dirty_session(session);
+ }
+ if (!preallocated_inos.empty()) {
+ session->info.prealloc_inos.insert(preallocated_inos);
+ mds->sessionmap.replay_dirty_session(session);
+ }
+
+ } else {
+ dout(10) << "EMetaBlob.replay no session for " << client_name << dendl;
+ if (used_preallocated_ino) // no session to dirty: advance the version once per step instead
+ mds->sessionmap.replay_advance_version();
+
+ if (!preallocated_inos.empty())
+ mds->sessionmap.replay_advance_version();
+ }
+ ceph_assert(sessionmapv == mds->sessionmap.get_version());
+ } else {
+ mds->clog->error() << "EMetaBlob.replay sessionmap v " << sessionmapv
+ << " - " << diff << " > table " << mds->sessionmap.get_version();
+ ceph_assert(g_conf()->mds_wipe_sessions); // a version gap is only tolerated when mds_wipe_sessions is set
+ mds->sessionmap.wipe();
+ mds->sessionmap.set_version(sessionmapv);
+ }
+ }
+
+ // truncating inodes
+ for (const auto& ino : truncate_start) {
+ CInode *in = mds->mdcache->get_inode(ino);
+ ceph_assert(in);
+ mds->mdcache->add_recovered_truncate(in, logseg);
+
+ if (!(++count % 1000))
+ mds->heartbeat_reset();
+ }
+ for (const auto& p : truncate_finish) {
+ LogSegment *ls = mds->mdlog->get_segment(p.second);
+ if (ls) { // skipped when the starting segment is not found in mdlog
+ CInode *in = mds->mdcache->get_inode(p.first);
+ ceph_assert(in);
+ mds->mdcache->remove_recovered_truncate(in, ls);
+ }
+
+ if (!(++count % 1000))
+ mds->heartbeat_reset();
+ }
+
+ // destroyed inodes
+ if (!destroyed_inodes.empty()) {
+ for (vector<inodeno_t>::iterator p = destroyed_inodes.begin();
+ p != destroyed_inodes.end();
+ ++p) {
+ CInode *in = mds->mdcache->get_inode(*p);
+ if (in) {
+ dout(10) << "EMetaBlob.replay destroyed " << *p << ", dropping " << *in << dendl;
+ CDentry *parent = in->get_parent_dn(); // captured before the inode is removed
+ mds->mdcache->remove_inode(in);
+ if (parent) {
+ dout(10) << "EMetaBlob.replay unlinked from dentry " << *parent << dendl;
+ ceph_assert(parent->get_linkage()->is_null());
+ }
+ } else {
+ dout(10) << "EMetaBlob.replay destroyed " << *p << ", not in cache" << dendl;
+ }
+
+ if (!(++count % 1000))
+ mds->heartbeat_reset();
+ }
+ mds->mdcache->open_file_table.note_destroyed_inos(logseg->seq, destroyed_inodes);
+ }
+
+ // client requests
+ for (const auto& p : client_reqs) {
+ if (p.first.name.is_client()) {
+ dout(10) << "EMetaBlob.replay request " << p.first << " trim_to " << p.second << dendl;
+ inodeno_t created = allocated_ino ? allocated_ino : used_preallocated_ino; // ino recorded with the completed request
+ // if we allocated an inode, there should be exactly one client request id.
+ ceph_assert(created == inodeno_t() || client_reqs.size() == 1);
+
+ Session *session = mds->sessionmap.get_session(p.first.name);
+ if (session) {
+ session->add_completed_request(p.first.tid, created);
+ if (p.second)
+ session->trim_completed_requests(p.second);
+ }
+ }
+
+ if (!(++count % 1000))
+ mds->heartbeat_reset();
+ }
+
+ // client flushes
+ for (const auto& p : client_flushes) {
+ if (p.first.name.is_client()) {
+ dout(10) << "EMetaBlob.replay flush " << p.first << " trim_to " << p.second << dendl;
+ Session *session = mds->sessionmap.get_session(p.first.name);
+ if (session) {
+ session->add_completed_flush(p.first.tid);
+ if (p.second)
+ session->trim_completed_flushes(p.second);
+ }
+ }
+
+ if (!(++count % 1000))
+ mds->heartbeat_reset();
+ }
+
+ // update segment
+ update_segment(logseg);
+
+ ceph_assert(g_conf()->mds_kill_journal_replay_at != 4);
+ }
+
+// -----------------------
+// ESession
+
+ // Stamp this event's log segment with cmapv and, when inos is non-empty
+ // and inotablev is set, with inotablev as well.
+ void ESession::update_segment()
+ {
+   auto segment = get_segment();
+   segment->sessionmapv = cmapv;
+   if (inos.size() != 0 && inotablev) {
+     segment->inotablev = inotablev;
+   }
+ }
+
+ // Replay a session open/close event: bring the sessionmap to cmapv, release
+ // the journaled inos from the inotable when present, then update the segment.
+ void ESession::replay(MDSRank *mds)
+ {
+   if (mds->sessionmap.get_version() >= cmapv) {
+     // Sessionmap is already at or past this event.
+     dout(10) << "ESession.replay sessionmap " << mds->sessionmap.get_version()
+              << " >= " << cmapv << ", noop" << dendl;
+   } else if (mds->sessionmap.get_version() + 1 != cmapv) {
+     // Version gap: only tolerated when mds_wipe_sessions is set.
+     mds->clog->error() << "ESession.replay sessionmap v " << cmapv
+                        << " - 1 > table " << mds->sessionmap.get_version();
+     ceph_assert(g_conf()->mds_wipe_sessions);
+     mds->sessionmap.wipe();
+     mds->sessionmap.set_version(cmapv);
+   } else {
+     // Exactly one version behind: apply the open or close.
+     dout(10) << "ESession.replay sessionmap " << mds->sessionmap.get_version()
+              << " < " << cmapv << " " << (open ? "open":"close") << " " << client_inst << dendl;
+     Session *session = nullptr;
+     if (!open) {
+       session = mds->sessionmap.get_session(client_inst.name);
+       if (!session) {
+         // There should always be a session here, but a known bug can leave none.
+         mds->clog->error() << "replayed stray Session close event for " << client_inst
+                            << " from time " << stamp << ", ignoring";
+       } else if (session->get_connection() == NULL) {
+         dout(10) << " removed session " << session->info.inst << dendl;
+         mds->sessionmap.remove_session(session);
+         session = nullptr;
+       } else {
+         // The client has reconnected: keep the Session object but reset it.
+         session->clear();
+         dout(10) << " reset session " << session->info.inst << " (they reconnected)" << dendl;
+       }
+     } else {
+       session = mds->sessionmap.get_or_add_session(client_inst);
+       mds->sessionmap.set_state(session, Session::STATE_OPEN);
+       session->set_client_metadata(client_metadata);
+       dout(10) << " opened session " << session->info.inst << dendl;
+     }
+
+     // Either path must move the sessionmap forward by one version.
+     if (session) {
+       mds->sessionmap.replay_dirty_session(session);
+     } else {
+       mds->sessionmap.replay_advance_version();
+     }
+     ceph_assert(mds->sessionmap.get_version() == cmapv);
+   }
+
+   if (inos.size() && inotablev) {
+     if (mds->inotable->get_version() < inotablev) {
+       dout(10) << "ESession.replay inotable " << mds->inotable->get_version()
+                << " < " << inotablev << " " << (open ? "add":"remove") << dendl;
+       ceph_assert(!open);  // for now
+       mds->inotable->replay_release_ids(inos);
+       ceph_assert(mds->inotable->get_version() == inotablev);
+     } else {
+       dout(10) << "ESession.replay inotable " << mds->inotable->get_version()
+                << " >= " << inotablev << ", noop" << dendl;
+     }
+   }
+
+   update_segment();
+ }
+
+ void ESession::encode(bufferlist &bl, uint64_t features) const
+ { // Appends this event's fields to bl; the order matches the reads in ESession::decode().
+ ENCODE_START(5, 5, bl); // NOTE(review): presumably (struct version, compat version, buffer) - confirm against the macro
+ encode(stamp, bl);
+ encode(client_inst, bl, features); // the only field encoded with the feature bits
+ encode(open, bl);
+ encode(cmapv, bl);
+ encode(inos, bl);
+ encode(inotablev, bl);
+ encode(client_metadata, bl); // decode() reads the full struct for struct_v >= 5; v4 carried only kv_map
+ ENCODE_FINISH(bl);
+ }
+
+ // Read an ESession from bl, handling the older layouts selected by struct_v.
+ void ESession::decode(bufferlist::const_iterator &bl)
+ {
+   DECODE_START_LEGACY_COMPAT_LEN(5, 3, 3, bl);
+   if (struct_v >= 2) {
+     decode(stamp, bl);
+   }
+   decode(client_inst, bl);
+   decode(open, bl);
+   decode(cmapv, bl);
+   decode(inos, bl);
+   decode(inotablev, bl);
+   // v5 and later carry the whole client_metadata; v4 carried only its kv_map;
+   // anything older has no client metadata at all.
+   if (struct_v >= 5) {
+     decode(client_metadata, bl);
+   } else if (struct_v == 4) {
+     decode(client_metadata.kv_map, bl);
+   }
+   DECODE_FINISH(bl);
+ }
+
+ // Write this session event's fields to the Formatter.
+ void ESession::dump(Formatter *f) const
+ {
+   const char *open_text = "false";
+   if (open) {
+     open_text = "true";
+   }
+
+   f->dump_stream("client instance") << client_inst;
+   f->dump_string("open", open_text);
+   f->dump_int("client map version", cmapv);
+   f->dump_stream("inos") << inos;
+   f->dump_int("inotable version", inotablev);
+
+   f->open_object_section("client_metadata");
+   client_metadata.dump(f);
+   f->close_section();
+ }
+
+ // Append a single default-constructed ESession to the caller's list.
+ void ESession::generate_test_instances(list<ESession*>& ls)
+ {
+   ESession *blank = new ESession;
+   ls.push_back(blank);
+ }
+
+// -----------------------
+// ESessions
+
+ void ESessions::encode(bufferlist &bl, uint64_t features) const
+ { // Appends this event's fields to bl; the order matches the reads in ESessions::decode_new().
+ ENCODE_START(2, 1, bl); // NOTE(review): presumably (struct version, compat version, buffer) - confirm against the macro
+ encode(client_map, bl, features); // the only field encoded with the feature bits
+ encode(cmapv, bl);
+ encode(stamp, bl);
+ encode(client_metadata_map, bl); // decode_new() reads this only when struct_v >= 2
+ ENCODE_FINISH(bl);
+ }
+
+ // Read the layout that has no version header: client_map, cmapv, and a
+ // stamp only when bytes remain in the buffer.
+ void ESessions::decode_old(bufferlist::const_iterator &bl)
+ {
+   using ceph::decode;
+   decode(client_map, bl);
+   decode(cmapv, bl);
+   if (bl.end()) {
+     return;  // nothing left: this encoding carries no stamp
+   }
+   decode(stamp, bl);
+ }
+
+ // Read the versioned layout; client_metadata_map is present from struct_v 2.
+ void ESessions::decode_new(bufferlist::const_iterator &bl)
+ {
+   DECODE_START(2, bl);
+   decode(client_map, bl);
+   decode(cmapv, bl);
+   decode(stamp, bl);
+   if (struct_v >= 2) {
+     decode(client_metadata_map, bl);
+   }
+   DECODE_FINISH(bl);
+ }
+
+ // Write the client map version and each client map entry to the Formatter.
+ void ESessions::dump(Formatter *f) const
+ {
+   f->dump_int("client map version", cmapv);
+
+   f->open_array_section("client map");
+   for (const auto& entry : client_map) {
+     f->open_object_section("client");
+     f->dump_int("client id", entry.first.v);
+     f->dump_stream("client entity") << entry.second;
+     f->close_section();
+   }
+   f->close_section();
+ }
+
+ // Append a single default-constructed ESessions to the caller's list.
+ void ESessions::generate_test_instances(list<ESessions*>& ls)
+ {
+   ESessions *blank = new ESessions();
+   ls.push_back(blank);
+ }
+
+ // Stamp this event's log segment with the sessionmap version it carries.
+ void ESessions::update_segment()
+ {
+   auto segment = get_segment();
+   segment->sessionmapv = cmapv;
+ }
+
+ // Open the journaled client sessions unless the sessionmap is already at
+ // or past cmapv; the segment is updated in both cases.
+ void ESessions::replay(MDSRank *mds)
+ {
+   if (mds->sessionmap.get_version() < cmapv) {
+     dout(10) << "ESessions.replay sessionmap " << mds->sessionmap.get_version()
+              << " < " << cmapv << dendl;
+     mds->sessionmap.replay_open_sessions(cmapv, client_map, client_metadata_map);
+   } else {
+     dout(10) << "ESessions.replay sessionmap " << mds->sessionmap.get_version()
+              << " >= " << cmapv << ", noop" << dendl;
+   }
+   update_segment();
+ }
+
+
+// -----------------------
+// ETableServer
+
+ void ETableServer::encode(bufferlist& bl, uint64_t features) const
+ { // Appends this event's fields to bl; the order matches the reads in ETableServer::decode().
+ ENCODE_START(3, 3, bl); // NOTE(review): presumably (struct version, compat version, buffer) - confirm against the macro
+ encode(stamp, bl);
+ encode(table, bl);
+ encode(op, bl);
+ encode(reqid, bl);
+ encode(bymds, bl);
+ encode(mutation, bl);
+ encode(tid, bl);
+ encode(version, bl);
+ ENCODE_FINISH(bl);
+ } // 'features' is not used by any encode call in this function
+
+ // Read an ETableServer from bl; the stamp is only present from struct_v 2.
+ void ETableServer::decode(bufferlist::const_iterator &bl)
+ {
+   DECODE_START_LEGACY_COMPAT_LEN(3, 3, 3, bl);
+   if (struct_v >= 2) {
+     decode(stamp, bl);
+   }
+   decode(table, bl);
+   decode(op, bl);
+   decode(reqid, bl);
+   decode(bymds, bl);
+   decode(mutation, bl);
+   decode(tid, bl);
+   decode(version, bl);
+   DECODE_FINISH(bl);
+ }
+
+ void ETableServer::dump(Formatter *f) const
+ { // Writes the scalar fields to the Formatter; stamp and mutation are encoded but not dumped here.
+ f->dump_int("table id", table);
+ f->dump_int("op", op);
+ f->dump_int("request id", reqid);
+ f->dump_int("by mds", bymds);
+ f->dump_int("tid", tid);
+ f->dump_int("version", version);
+ }
+
+ // Append a single default-constructed ETableServer to the caller's list.
+ void ETableServer::generate_test_instances(list<ETableServer*>& ls)
+ {
+   ETableServer *blank = new ETableServer();
+   ls.push_back(blank);
+ }
+
+
+ // Record this event's table version on its log segment, keyed by table id.
+ void ETableServer::update_segment()
+ {
+   auto segment = get_segment();
+   segment->tablev[table] = version;
+ }
+
+ // Re-apply one journaled table-server operation. Does nothing when this rank
+ // has no server for the table or the server is already at or past `version`;
+ // otherwise the server must be exactly one version behind.
+ void ETableServer::replay(MDSRank *mds)
+ {
+   MDSTableServer *server = mds->get_table_server(table);
+   if (server == nullptr) {
+     return;
+   }
+
+   if (server->get_version() >= version) {
+     dout(10) << "ETableServer.replay " << get_mdstable_name(table)
+              << " " << get_mdstableserver_opname(op)
+              << " event " << version
+              << " <= table " << server->get_version() << dendl;
+     return;
+   }
+
+   dout(10) << " ETableServer.replay " << get_mdstable_name(table)
+            << " " << get_mdstableserver_opname(op)
+            << " event " << version << " - 1 == table " << server->get_version() << dendl;
+   ceph_assert(version-1 == server->get_version());
+
+   switch (op) {
+   case TABLESERVER_OP_PREPARE:
+     {
+       server->_note_prepare(bymds, reqid, true);
+       // _prepare writes into a fresh buffer, which then replaces mutation.
+       bufferlist prepared;
+       server->_prepare(mutation, reqid, bymds, prepared);
+       mutation = std::move(prepared);
+     }
+     break;
+
+   case TABLESERVER_OP_COMMIT:
+     server->_commit(tid, MMDSTableRequest::ref());
+     server->_note_commit(tid, true);
+     break;
+
+   case TABLESERVER_OP_ROLLBACK:
+     server->_rollback(tid);
+     server->_note_rollback(tid, true);
+     break;
+
+   case TABLESERVER_OP_SERVER_UPDATE:
+     server->_server_update(mutation);
+     server->_note_server_update(mutation, true);
+     break;
+
+   default:
+     mds->clog->error() << "invalid tableserver op in ETableServer";
+     mds->damaged();
+     ceph_abort();  // Should be unreachable because damaged() calls respawn()
+   }
+
+   // Every handled op must leave the server exactly at this event's version.
+   ceph_assert(version == server->get_version());
+   update_segment();
+ }
+
+
+// ---------------------
+// ETableClient
+
+ void ETableClient::encode(bufferlist& bl, uint64_t features) const
+ { // Appends this event's fields to bl; the order matches the reads in ETableClient::decode().
+ ENCODE_START(3, 3, bl); // NOTE(review): presumably (struct version, compat version, buffer) - confirm against the macro
+ encode(stamp, bl);
+ encode(table, bl);
+ encode(op, bl);
+ encode(tid, bl);
+ ENCODE_FINISH(bl);
+ } // 'features' is not used by any encode call in this function
+
+ // Read an ETableClient from bl; the stamp is only present from struct_v 2.
+ void ETableClient::decode(bufferlist::const_iterator &bl)
+ {
+   DECODE_START_LEGACY_COMPAT_LEN(3, 3, 3, bl);
+   if (struct_v >= 2) {
+     decode(stamp, bl);
+   }
+   decode(table, bl);
+   decode(op, bl);
+   decode(tid, bl);
+   DECODE_FINISH(bl);
+ }
+
+ void ETableClient::dump(Formatter *f) const
+ { // Writes table, op and tid to the Formatter; stamp is encoded but not dumped here.
+ f->dump_int("table", table);
+ f->dump_int("op", op);
+ f->dump_int("tid", tid);
+ }
+
+ // Append a single default-constructed ETableClient to the caller's list.
+ void ETableClient::generate_test_instances(list<ETableClient*>& ls)
+ {
+   ETableClient *blank = new ETableClient();
+   ls.push_back(blank);
+ }
+
+ // Replay a journaled table-client ack: when this rank has a client for the
+ // table, the op must be TABLESERVER_OP_ACK and the ack is passed to it.
+ void ETableClient::replay(MDSRank *mds)
+ {
+   dout(10) << " ETableClient.replay " << get_mdstable_name(table)
+            << " op " << get_mdstableserver_opname(op)
+            << " tid " << tid << dendl;
+
+   MDSTableClient *client = mds->get_table_client(table);
+   if (client) {
+     ceph_assert(op == TABLESERVER_OP_ACK);
+     client->got_journaled_ack(tid);
+   }
+ }
+
+
+// -----------------------
+// ESnap
+/*
+void ESnap::update_segment()
+{
+ get_segment()->tablev[TABLE_SNAP] = version;
+}
+
+void ESnap::replay(MDSRank *mds)
+{
+ if (mds->snaptable->get_version() >= version) {
+ dout(10) << "ESnap.replay event " << version
+ << " <= table " << mds->snaptable->get_version() << dendl;
+ return;
+ }
+
+ dout(10) << " ESnap.replay event " << version
+ << " - 1 == table " << mds->snaptable->get_version() << dendl;
+ ceph_assert(version-1 == mds->snaptable->get_version());
+
+ if (create) {
+ version_t v;
+ snapid_t s = mds->snaptable->create(snap.dirino, snap.name, snap.stamp, &v);
+ ceph_assert(s == snap.snapid);
+ } else {
+ mds->snaptable->remove(snap.snapid);
+ }
+
+ ceph_assert(version == mds->snaptable->get_version());
+}
+*/
+
+
+
+// -----------------------
+// EUpdate
+
+ void EUpdate::encode(bufferlist &bl, uint64_t features) const
+ { // Appends this event's fields to bl; the order matches the reads in EUpdate::decode().
+ ENCODE_START(4, 4, bl); // NOTE(review): presumably (struct version, compat version, buffer) - confirm against the macro
+ encode(stamp, bl);
+ encode(type, bl);
+ encode(metablob, bl, features); // the only field encoded with the feature bits
+ encode(client_map, bl); // EUpdate::replay() decodes this buffer when its length is non-zero
+ encode(cmapv, bl);
+ encode(reqid, bl);
+ encode(had_slaves, bl);
+ ENCODE_FINISH(bl);
+ }
+
+ // Read an EUpdate from bl; stamp is present from struct_v 2 and cmapv
+ // from struct_v 3.
+ void EUpdate::decode(bufferlist::const_iterator &bl)
+ {
+   DECODE_START_LEGACY_COMPAT_LEN(4, 4, 4, bl);
+   if (struct_v >= 2) {
+     decode(stamp, bl);
+   }
+   decode(type, bl);
+   decode(metablob, bl);
+   decode(client_map, bl);
+   if (struct_v >= 3) {
+     decode(cmapv, bl);
+   }
+   decode(reqid, bl);
+   decode(had_slaves, bl);
+   DECODE_FINISH(bl);
+ }
+
+ // Write the embedded metablob followed by this update's own fields.
+ void EUpdate::dump(Formatter *f) const
+ {
+   f->open_object_section("metablob");
+   metablob.dump(f);
+   f->close_section();
+
+   const char *slaves_text = "false";
+   if (had_slaves) {
+     slaves_text = "true";
+   }
+
+   f->dump_string("type", type);
+   f->dump_int("client map length", client_map.length());
+   f->dump_int("client map version", cmapv);
+   f->dump_stream("reqid") << reqid;
+   f->dump_string("had slaves", slaves_text);
+ }
+
+// Append one default-constructed EUpdate to ls; the list takes the raw pointer.
+void EUpdate::generate_test_instances(list<EUpdate*>& ls)
+{
+  EUpdate *sample = new EUpdate();
+  ls.push_back(sample);
+}
+
+
+// Record this event's effects on its log segment: the metablob's own
+// bookkeeping, the session map version (only when a client map is
+// attached) and the request id (only when the update had slaves).
+void EUpdate::update_segment()
+{
+  auto&& seg = get_segment();
+  metablob.update_segment(seg);
+
+  if (client_map.length() != 0) {
+    seg->sessionmapv = cmapv;
+  }
+
+  if (had_slaves) {
+    seg->uncommitted_masters.insert(reqid);
+  }
+}
+
+// Replay an EUpdate: apply the metablob, re-register the request as an
+// uncommitted master when it had slaves, reopen client sessions if the
+// attached client map is newer than the session map, then update the
+// segment bookkeeping.
+void EUpdate::replay(MDSRank *mds)
+{
+  auto&& segment = get_segment();
+  metablob.replay(mds, segment);
+
+  if (had_slaves) {
+    dout(10) << "EUpdate.replay " << reqid << " had slaves, expecting a matching ECommitted" << dendl;
+    segment->uncommitted_masters.insert(reqid);
+    set<mds_rank_t> slaves;  // passed empty
+    mds->mdcache->add_uncommitted_master(reqid, segment, slaves, true);
+  }
+
+  if (client_map.length()) {
+    if (mds->sessionmap.get_version() < cmapv) {
+      dout(10) << "EUpdate.replay sessionmap " << mds->sessionmap.get_version()
+               << " < " << cmapv << dendl;
+      // open client sessions?
+      map<client_t,entity_inst_t> cm;
+      map<client_t,client_metadata_t> cmm;
+      auto blp = client_map.cbegin();
+      using ceph::decode;
+      decode(cm, blp);
+      // the metadata map is decoded only when bytes remain after cm
+      if (!blp.end()) {
+        decode(cmm, blp);
+      }
+      mds->sessionmap.replay_open_sessions(cmapv, cm, cmm);
+    } else {
+      // session map is already at or past this version; nothing to open
+      dout(10) << "EUpdate.replay sessionmap v " << cmapv
+               << " <= table " << mds->sessionmap.get_version() << dendl;
+    }
+  }
+  update_segment();
+}
+
+
+// ------------------------
+// EOpen
+
+// Serialize this EOpen as a versioned struct (version 4, compat 3).
+// snap_inos is the last field written; EOpen::decode() reads it only
+// for struct_v >= 4.
+void EOpen::encode(bufferlist &bl, uint64_t features) const
+{
+  ENCODE_START(4, 3, bl);
+  encode(stamp, bl);
+  encode(metablob, bl, features);
+  encode(inos, bl);
+  encode(snap_inos, bl);
+  ENCODE_FINISH(bl);
+}
+
+// Decode an EOpen. stamp is read only for struct_v >= 2 and snap_inos
+// only for struct_v >= 4.
+//
+// The decoder's declared version is 4 to match EOpen::encode(), which
+// writes version 4, and because this function reads the v4-only field.
+// It was previously declared as 3, unlike the other encode/decode pairs
+// in this file.
+void EOpen::decode(bufferlist::const_iterator &bl)
+{
+  DECODE_START_LEGACY_COMPAT_LEN(4, 3, 3, bl);
+  if (struct_v >= 2) {
+    decode(stamp, bl);
+  }
+  decode(metablob, bl);
+  decode(inos, bl);
+  if (struct_v >= 4) {
+    decode(snap_inos, bl);
+  }
+  DECODE_FINISH(bl);
+}
+
+// Dump the metablob followed by the list of inode numbers in inos.
+// snap_inos is not dumped here.
+void EOpen::dump(Formatter *f) const
+{
+  f->open_object_section("metablob");
+  metablob.dump(f);
+  f->close_section(); // metablob
+
+  f->open_array_section("inos involved");
+  for (const auto &ino : inos) {
+    f->dump_int("ino", ino);
+  }
+  f->close_section(); // inos
+}
+
+// Append two instances to ls: a default one and one with ino 0 added.
+void EOpen::generate_test_instances(list<EOpen*>& ls)
+{
+  ls.push_back(new EOpen());
+
+  EOpen *with_ino = new EOpen();
+  ls.push_back(with_ino);
+  with_ino->add_ino(0);
+}
+
+// Deliberately a no-op: nothing is recorded on the segment here.
+// (The original author left this as an open question: "??")
+void EOpen::update_segment()
+{
+}
+
+// Replay an EOpen: apply the metablob, then attach every listed inode
+// (plain and snapshotted) to this segment's open_files list. Every inode
+// must be in the cache after the metablob is applied; a miss is fatal.
+void EOpen::replay(MDSRank *mds)
+{
+  dout(10) << "EOpen.replay " << dendl;
+  auto&& segment = get_segment();
+  metablob.replay(mds, segment);
+
+  // note which segments inodes belong to, so we don't have to start rejournaling them
+  for (const auto &ino : inos) {
+    CInode *inode = mds->mdcache->get_inode(ino);
+    if (inode == nullptr) {
+      dout(0) << "EOpen.replay ino " << ino << " not in metablob" << dendl;
+      ceph_assert(inode);
+    }
+    segment->open_files.push_back(&inode->item_open_file);
+  }
+
+  // same treatment for the snapshotted inodes
+  for (const auto &vino : snap_inos) {
+    CInode *inode = mds->mdcache->get_inode(vino);
+    if (inode == nullptr) {
+      dout(0) << "EOpen.replay ino " << vino << " not in metablob" << dendl;
+      ceph_assert(inode);
+    }
+    segment->open_files.push_back(&inode->item_open_file);
+  }
+}
+
+
+// -----------------------
+// ECommitted
+
+// Replay an ECommitted: if the request is still tracked as an uncommitted
+// master, drop it from both its log segment and the cache's table.
+// Otherwise just log that the original op was never seen.
+void ECommitted::replay(MDSRank *mds)
+{
+  auto &masters = mds->mdcache->uncommitted_masters;
+  auto found = masters.find(reqid);
+  if (found == masters.end()) {
+    dout(10) << "ECommitted.replay " << reqid << " -- didn't see original op" << dendl;
+    return;
+  }
+
+  dout(10) << "ECommitted.replay " << reqid << dendl;
+  found->second.ls->uncommitted_masters.erase(reqid);
+  masters.erase(found);
+}
+
+// Serialize stamp and reqid as a versioned struct (version 3, compat 3).
+// The features argument is not used by this encoder.
+void ECommitted::encode(bufferlist& bl, uint64_t features) const
+{
+  ENCODE_START(3, 3, bl);
+  encode(stamp, bl);
+  encode(reqid, bl);
+  ENCODE_FINISH(bl);
+}
+
+// Decode an ECommitted; stamp is read only for struct_v >= 2.
+void ECommitted::decode(bufferlist::const_iterator& bl)
+{
+  DECODE_START_LEGACY_COMPAT_LEN(3, 3, 3, bl);
+  if (struct_v >= 2) {
+    decode(stamp, bl);
+  }
+  decode(reqid, bl);
+  DECODE_FINISH(bl);
+}
+
+// Dump the event timestamp and the committed request id.
+void ECommitted::dump(Formatter *f) const
+{
+  f->dump_stream("stamp") << stamp;
+  f->dump_stream("reqid") << reqid;
+}
+
+// Append two instances to ls: a default one and one with a fixed stamp
+// and request id.
+void ECommitted::generate_test_instances(list<ECommitted*>& ls)
+{
+  ls.push_back(new ECommitted);
+
+  ECommitted *populated = new ECommitted;
+  ls.push_back(populated);
+  populated->stamp = utime_t(1, 2);
+  populated->reqid = metareqid_t(entity_name_t::CLIENT(123), 456);
+}
+
+// -----------------------
+// ESlaveUpdate
+
+// Serialize a link_rollback as a versioned struct (version 3, compat 2).
+// snapbl is written last; link_rollback::decode() reads it only for
+// struct_v >= 3.
+void link_rollback::encode(bufferlist &bl) const
+{
+  ENCODE_START(3, 2, bl);
+  encode(reqid, bl);
+  encode(ino, bl);
+  encode(was_inc, bl);
+  encode(old_ctime, bl);
+  encode(old_dir_mtime, bl);
+  encode(old_dir_rctime, bl);
+  encode(snapbl, bl);
+  ENCODE_FINISH(bl);
+}
+
+// Decode a link_rollback; snapbl is read only for struct_v >= 3.
+void link_rollback::decode(bufferlist::const_iterator &bl)
+{
+  DECODE_START_LEGACY_COMPAT_LEN(3, 2, 2, bl);
+  decode(reqid, bl);
+  decode(ino, bl);
+  decode(was_inc, bl);
+  decode(old_ctime, bl);
+  decode(old_dir_mtime, bl);
+  decode(old_dir_rctime, bl);
+  if (struct_v >= 3) {
+    decode(snapbl, bl);
+  }
+  DECODE_FINISH(bl);
+}
+
+// Dump the rollback record's scalar fields. snapbl is not dumped.
+void link_rollback::dump(Formatter *f) const
+{
+  const char *inc_text = was_inc ? "true" : "false";
+
+  f->dump_stream("metareqid") << reqid;
+  f->dump_int("ino", ino);
+  f->dump_string("was incremented", inc_text);
+  f->dump_stream("old_ctime") << old_ctime;
+  f->dump_stream("old_dir_mtime") << old_dir_mtime;
+  f->dump_stream("old_dir_rctime") << old_dir_rctime;
+}
+
+// Append one default-constructed link_rollback to ls.
+void link_rollback::generate_test_instances(list<link_rollback*>& ls)
+{
+  link_rollback *sample = new link_rollback();
+  ls.push_back(sample);
+}
+
+// Serialize an rmdir_rollback as a versioned struct (version 3, compat 2).
+// snapbl is written last; rmdir_rollback::decode() reads it only for
+// struct_v >= 3.
+void rmdir_rollback::encode(bufferlist& bl) const
+{
+  ENCODE_START(3, 2, bl);
+  encode(reqid, bl);
+  encode(src_dir, bl);
+  encode(src_dname, bl);
+  encode(dest_dir, bl);
+  encode(dest_dname, bl);
+  encode(snapbl, bl);
+  ENCODE_FINISH(bl);
+}
+
+// Decode an rmdir_rollback; snapbl is read only for struct_v >= 3.
+void rmdir_rollback::decode(bufferlist::const_iterator& bl)
+{
+  DECODE_START_LEGACY_COMPAT_LEN(3, 2, 2, bl);
+  decode(reqid, bl);
+  decode(src_dir, bl);
+  decode(src_dname, bl);
+  decode(dest_dir, bl);
+  decode(dest_dname, bl);
+  if (struct_v >= 3) {
+    decode(snapbl, bl);
+  }
+  DECODE_FINISH(bl);
+}
+
+// Dump the request id plus the source and destination dir/dname pairs.
+// snapbl is not dumped.
+void rmdir_rollback::dump(Formatter *f) const
+{
+  f->dump_stream("metareqid") << reqid;
+
+  // source
+  f->dump_stream("source directory") << src_dir;
+  f->dump_string("source dname", src_dname);
+
+  // destination
+  f->dump_stream("destination directory") << dest_dir;
+  f->dump_string("destination dname", dest_dname);
+}
+
+// Append one default-constructed rmdir_rollback to ls.
+void rmdir_rollback::generate_test_instances(list<rmdir_rollback*>& ls)
+{
+  rmdir_rollback *sample = new rmdir_rollback();
+  ls.push_back(sample);
+}
+
+// Serialize one dentry record as a versioned struct (version 2, compat 2).
+// Field order matches rename_rollback::drec::decode().
+void rename_rollback::drec::encode(bufferlist &bl) const
+{
+  ENCODE_START(2, 2, bl);
+  // directory fragment and its previous times
+  encode(dirfrag, bl);
+  encode(dirfrag_old_mtime, bl);
+  encode(dirfrag_old_rctime, bl);
+  // dentry target
+  encode(ino, bl);
+  encode(remote_ino, bl);
+  encode(dname, bl);
+  encode(remote_d_type, bl);
+  encode(old_ctime, bl);
+  ENCODE_FINISH(bl);
+}
+
+// Decode one dentry record; every field is read unconditionally, in the
+// order written by rename_rollback::drec::encode().
+void rename_rollback::drec::decode(bufferlist::const_iterator &bl)
+{
+  DECODE_START_LEGACY_COMPAT_LEN(2, 2, 2, bl);
+  // directory fragment and its previous times
+  decode(dirfrag, bl);
+  decode(dirfrag_old_mtime, bl);
+  decode(dirfrag_old_rctime, bl);
+  // dentry target
+  decode(ino, bl);
+  decode(remote_ino, bl);
+  decode(dname, bl);
+  decode(remote_d_type, bl);
+  decode(old_ctime, bl);
+  DECODE_FINISH(bl);
+}
+
+// Dump one dentry record. remote_d_type is converted to a file-type
+// value and reported as "file", "symlink", "directory" or
+// "UNKNOWN-<n>" for anything else.
+void rename_rollback::drec::dump(Formatter *f) const
+{
+  f->dump_stream("directory fragment") << dirfrag;
+  f->dump_stream("directory old mtime") << dirfrag_old_mtime;
+  f->dump_stream("directory old rctime") << dirfrag_old_rctime;
+  f->dump_int("ino", ino);
+  f->dump_int("remote ino", remote_ino);
+  f->dump_string("dname", dname);
+
+  const uint32_t type = DTTOIF(remote_d_type) & S_IFMT; // convert to type entries
+  string type_string;
+  if (type == S_IFREG) {
+    type_string = "file";
+  } else if (type == S_IFLNK) {
+    type_string = "symlink";
+  } else if (type == S_IFDIR) {
+    type_string = "directory";
+  } else {
+    type_string = "UNKNOWN-" + stringify((int)type);
+  }
+  f->dump_string("remote dtype", type_string);
+  f->dump_stream("old ctime") << old_ctime;
+}
+
+// Append one drec whose remote_d_type is set to the regular-file type.
+void rename_rollback::drec::generate_test_instances(list<drec*>& ls)
+{
+  drec *sample = new drec();
+  ls.push_back(sample);
+  sample->remote_d_type = IFTODT(S_IFREG);
+}
+
+// Serialize a rename_rollback as a versioned struct (version 3, compat 2).
+// The two snap buffers are written last; rename_rollback::decode() reads
+// them only for struct_v >= 3.
+void rename_rollback::encode(bufferlist &bl) const
+{
+  ENCODE_START(3, 2, bl);
+  encode(reqid, bl);
+  encode(orig_src, bl);
+  encode(orig_dest, bl);
+  encode(stray, bl);
+  encode(ctime, bl);
+  encode(srci_snapbl, bl);
+  encode(desti_snapbl, bl);
+  ENCODE_FINISH(bl);
+}
+
+// Decode a rename_rollback; srci_snapbl and desti_snapbl are read only
+// for struct_v >= 3.
+void rename_rollback::decode(bufferlist::const_iterator &bl)
+{
+  DECODE_START_LEGACY_COMPAT_LEN(3, 2, 2, bl);
+  decode(reqid, bl);
+  decode(orig_src, bl);
+  decode(orig_dest, bl);
+  decode(stray, bl);
+  decode(ctime, bl);
+  const bool has_snap_blobs = struct_v >= 3;
+  if (has_snap_blobs) {
+    decode(srci_snapbl, bl);
+    decode(desti_snapbl, bl);
+  }
+  DECODE_FINISH(bl);
+}
+
+// Dump the request id, the three dentry records (original source,
+// original destination, stray) as nested objects, then ctime.
+void rename_rollback::dump(Formatter *f) const
+{
+  f->dump_stream("request id") << reqid;
+
+  f->open_object_section("original src drec");
+  orig_src.dump(f);
+  f->close_section(); // original src drec
+
+  f->open_object_section("original dest drec");
+  orig_dest.dump(f);
+  f->close_section(); // original dest drec
+
+  f->open_object_section("stray drec");
+  stray.dump(f);
+  f->close_section(); // stray drec
+
+  f->dump_stream("ctime") << ctime;
+}
+
+// Append one rename_rollback whose three dentry records all carry the
+// regular-file d_type.
+void rename_rollback::generate_test_instances(list<rename_rollback*>& ls)
+{
+  rename_rollback *sample = new rename_rollback();
+  ls.push_back(sample);
+  sample->orig_src.remote_d_type = IFTODT(S_IFREG);
+  sample->orig_dest.remote_d_type = IFTODT(S_IFREG);
+  sample->stray.remote_d_type = IFTODT(S_IFREG);
+}
+
+// Serialize an ESlaveUpdate as a versioned struct (version 3, compat 3).
+// Only the commit metablob is encoded with the feature bits.
+void ESlaveUpdate::encode(bufferlist &bl, uint64_t features) const
+{
+  ENCODE_START(3, 3, bl);
+  encode(stamp, bl);
+  encode(type, bl);
+  encode(reqid, bl);
+  encode(master, bl);
+  encode(op, bl);
+  encode(origop, bl);
+  encode(commit, bl, features);
+  encode(rollback, bl);
+  ENCODE_FINISH(bl);
+}
+
+// Decode an ESlaveUpdate; stamp is read only for struct_v >= 2.
+void ESlaveUpdate::decode(bufferlist::const_iterator &bl)
+{
+  DECODE_START_LEGACY_COMPAT_LEN(3, 3, 3, bl);
+  if (struct_v >= 2) {
+    decode(stamp, bl);
+  }
+  decode(type, bl);
+  decode(reqid, bl);
+  decode(master, bl);
+  decode(op, bl);
+  decode(origop, bl);
+  decode(commit, bl);
+  decode(rollback, bl);
+  DECODE_FINISH(bl);
+}
+
+// Dump the commit metablob (under the "metablob" key), the rollback
+// blob's length, and the scalar fields.
+void ESlaveUpdate::dump(Formatter *f) const
+{
+  f->open_object_section("metablob");
+  commit.dump(f);
+  f->close_section(); // metablob
+
+  // only the size of the rollback blob is reported
+  f->dump_int("rollback length", rollback.length());
+
+  f->dump_string("type", type);
+  f->dump_stream("metareqid") << reqid;
+  f->dump_int("master", master);
+  f->dump_int("op", op);
+  f->dump_int("original op", origop);
+}
+
+// Append one default-constructed ESlaveUpdate to ls.
+void ESlaveUpdate::generate_test_instances(list<ESlaveUpdate*>& ls)
+{
+  ESlaveUpdate *sample = new ESlaveUpdate();
+  ls.push_back(sample);
+}
+
+// Replay a slave update according to its op:
+//  - PREPARE:  apply the commit blob and register the request as an
+//              uncommitted slave, keeping the rollback blob.
+//  - COMMIT:   finish the uncommitted slave.
+//  - ROLLBACK: apply the commit blob (which then holds the rollback
+//              changes) and finish the uncommitted slave.
+// Any other op marks the rank damaged.
+void ESlaveUpdate::replay(MDSRank *mds)
+{
+  auto&& segment = get_segment();
+  switch (op) {
+  case ESlaveUpdate::OP_PREPARE: {
+    dout(10) << "ESlaveUpdate.replay prepare " << reqid << " for mds." << master
+             << ": applying commit, saving rollback info" << dendl;
+    // the raw pointer is handed to the cache via add_uncommitted_slave()
+    MDSlaveUpdate *su = new MDSlaveUpdate(origop, rollback);
+    commit.replay(mds, segment, su);
+    mds->mdcache->add_uncommitted_slave(reqid, segment, master, su);
+    break;
+  }
+
+  case ESlaveUpdate::OP_COMMIT: {
+    dout(10) << "ESlaveUpdate.replay commit " << reqid << " for mds." << master << dendl;
+    mds->mdcache->finish_uncommitted_slave(reqid, false);
+    break;
+  }
+
+  case ESlaveUpdate::OP_ROLLBACK: {
+    dout(10) << "ESlaveUpdate.replay abort " << reqid << " for mds." << master
+             << ": applying rollback commit blob" << dendl;
+    commit.replay(mds, segment);
+    mds->mdcache->finish_uncommitted_slave(reqid, false);
+    break;
+  }
+
+  default:
+    mds->clog->error() << "invalid op in ESlaveUpdate";
+    mds->damaged();
+    ceph_abort();  // Should be unreachable because damaged() calls respawn()
+  }
+}
+
+
+// -----------------------
+// ESubtreeMap
+
+// Serialize an ESubtreeMap as a versioned struct (version 6, compat 5).
+// ESubtreeMap::decode() gates the later fields on struct_v.
+void ESubtreeMap::encode(bufferlist& bl, uint64_t features) const
+{
+  ENCODE_START(6, 5, bl);
+  encode(stamp, bl);
+  encode(metablob, bl, features);
+  encode(subtrees, bl);
+  encode(ambiguous_subtrees, bl);
+  encode(expire_pos, bl);
+  encode(event_seq, bl);
+  ENCODE_FINISH(bl);
+}
+
+// Decode an ESubtreeMap. Fields are gated on the struct version that
+// introduced them: stamp (2), expire_pos (3), ambiguous_subtrees (4),
+// event_seq (6). The read order follows the encoded layout, which is
+// why the v4 field is read before the v3 field.
+void ESubtreeMap::decode(bufferlist::const_iterator &bl)
+{
+  DECODE_START_LEGACY_COMPAT_LEN(6, 5, 5, bl);
+  if (struct_v >= 2) {
+    decode(stamp, bl);
+  }
+  decode(metablob, bl);
+  decode(subtrees, bl);
+  if (struct_v >= 4) {
+    decode(ambiguous_subtrees, bl);
+  }
+  if (struct_v >= 3) {
+    decode(expire_pos, bl);
+  }
+  if (struct_v >= 6) {
+    decode(event_seq, bl);
+  }
+  DECODE_FINISH(bl);
+}
+
+// Dump the metablob, each subtree root with its bound dirfrags, the
+// ambiguous subtree set, and the expire position.
+void ESubtreeMap::dump(Formatter *f) const
+{
+  f->open_object_section("metablob");
+  metablob.dump(f);
+  f->close_section(); // metablob
+
+  f->open_array_section("subtrees");
+  for (const auto &tree : subtrees) {
+    f->open_object_section("tree");
+    f->dump_stream("root dirfrag") << tree.first;
+    for (const auto &bound : tree.second) {
+      f->dump_stream("bound dirfrag") << bound;
+    }
+    f->close_section(); // tree
+  }
+  f->close_section(); // subtrees
+
+  f->open_array_section("ambiguous subtrees");
+  for (const auto &df : ambiguous_subtrees) {
+    f->dump_stream("dirfrag") << df;
+  }
+  f->close_section(); // ambiguous subtrees
+
+  f->dump_int("expire position", expire_pos);
+}
+
+// Append one default-constructed ESubtreeMap to ls.
+void ESubtreeMap::generate_test_instances(list<ESubtreeMap*>& ls)
+{
+  ESubtreeMap *sample = new ESubtreeMap();
+  ls.push_back(sample);
+}
+
+// Replay an ESubtreeMap.
+//
+// First, the journaler's expire position is advanced if this event
+// carries a later one. Then one of two things happens:
+//  - the cache already has subtrees: the journaled map is only checked
+//    against the cache; each mismatch is reported to the cluster log and
+//    counted, and the function returns without changing the cache's
+//    subtree map;
+//  - otherwise: the metablob is replayed and the subtree auth and
+//    ambiguous-import state are rebuilt from the journaled map.
+void ESubtreeMap::replay(MDSRank *mds)
+{
+  if (expire_pos && expire_pos > mds->mdlog->journaler->get_expire_pos()) {
+    mds->mdlog->journaler->set_expire_pos(expire_pos);
+  }
+
+  // suck up the subtree map?
+  if (mds->mdcache->is_subtrees()) {
+    dout(10) << "ESubtreeMap.replay -- i already have import map; verifying" << dendl;
+    int errors = 0;
+
+    for (auto &tree : subtrees) {
+      const dirfrag_t &root = tree.first;
+      auto &bound_frags = tree.second;
+
+      CDir *dir = mds->mdcache->get_dirfrag(root);
+      if (!dir) {
+        mds->clog->error() << " replayed ESubtreeMap at " << get_start_off()
+                           << " subtree root " << root << " not in cache";
+        ++errors;
+        continue;
+      }
+
+      if (!mds->mdcache->is_subtree(dir)) {
+        mds->clog->error() << " replayed ESubtreeMap at " << get_start_off()
+                           << " subtree root " << root << " not a subtree in cache";
+        ++errors;
+        continue;
+      }
+      if (dir->get_dir_auth().first != mds->get_nodeid()) {
+        mds->clog->error() << " replayed ESubtreeMap at " << get_start_off()
+                           << " subtree root " << root
+                           << " is not mine in cache (it's " << dir->get_dir_auth() << ")";
+        ++errors;
+        continue;
+      }
+
+      for (auto &bf : bound_frags) {
+        mds->mdcache->get_force_dirfrag(bf, true);
+      }
+
+      // every journaled bound must be a cache bound; whatever is left in
+      // 'bounds' afterwards is a bound the journal does not know about
+      set<CDir*> bounds;
+      mds->mdcache->get_subtree_bounds(dir, bounds);
+      for (auto &bf : bound_frags) {
+        CDir *b = mds->mdcache->get_dirfrag(bf);
+        if (!b) {
+          mds->clog->error() << " replayed ESubtreeMap at " << get_start_off()
+                             << " subtree " << root << " bound " << bf << " not in cache";
+          ++errors;
+          continue;
+        }
+        if (bounds.count(b) == 0) {
+          mds->clog->error() << " replayed ESubtreeMap at " << get_start_off()
+                             << " subtree " << root << " bound " << bf << " not a bound in cache";
+          ++errors;
+          continue;
+        }
+        bounds.erase(b);
+      }
+      for (CDir *extra : bounds) {
+        mds->clog->error() << " replayed ESubtreeMap at " << get_start_off()
+                           << " subtree " << root << " has extra bound in cache " << extra->dirfrag();
+        ++errors;
+      }
+
+      // journal and cache must agree on whether this import is ambiguous
+      if (ambiguous_subtrees.count(root)) {
+        if (!mds->mdcache->have_ambiguous_import(root)) {
+          mds->clog->error() << " replayed ESubtreeMap at " << get_start_off()
+                             << " subtree " << root << " is ambiguous but is not in our cache";
+          ++errors;
+        }
+      } else {
+        if (mds->mdcache->have_ambiguous_import(root)) {
+          mds->clog->error() << " replayed ESubtreeMap at " << get_start_off()
+                             << " subtree " << root << " is not ambiguous but is in our cache";
+          ++errors;
+        }
+      }
+    }
+
+    // reverse check: every cache subtree owned by this rank must be journaled
+    std::vector<CDir*> dirs;
+    mds->mdcache->get_subtrees(dirs);
+    for (const auto& dir : dirs) {
+      if (dir->get_dir_auth().first != mds->get_nodeid()) {
+        continue;
+      }
+      if (subtrees.count(dir->dirfrag()) == 0) {
+        mds->clog->error() << " replayed ESubtreeMap at " << get_start_off()
+                           << " does not include cache subtree " << dir->dirfrag();
+        ++errors;
+      }
+    }
+
+    if (errors) {
+      dout(0) << "journal subtrees: " << subtrees << dendl;
+      dout(0) << "journal ambig_subtrees: " << ambiguous_subtrees << dendl;
+      mds->mdcache->show_subtrees();
+      // mismatches are fatal only when mds_debug_subtrees is set
+      ceph_assert(!g_conf()->mds_debug_subtrees || errors == 0);
+    }
+    return;
+  }
+
+  dout(10) << "ESubtreeMap.replay -- reconstructing (auth) subtree spanning tree" << dendl;
+
+  // first, stick the spanning tree in my cache
+  //metablob.print(*_dout);
+  metablob.replay(mds, get_segment());
+
+  // restore import/export maps
+  for (auto &tree : subtrees) {
+    CDir *dir = mds->mdcache->get_dirfrag(tree.first);
+    ceph_assert(dir);
+    if (ambiguous_subtrees.count(tree.first)) {
+      // ambiguous!
+      mds->mdcache->add_ambiguous_import(tree.first, tree.second);
+      mds->mdcache->adjust_bounded_subtree_auth(dir, tree.second,
+                                                mds_authority_t(mds->get_nodeid(), mds->get_nodeid()));
+    } else {
+      // not ambiguous
+      mds->mdcache->adjust_bounded_subtree_auth(dir, tree.second, mds->get_nodeid());
+    }
+  }
+
+  mds->mdcache->recalc_auth_bits(true);
+
+  mds->mdcache->show_subtrees();
+}
+
+
+
+// -----------------------
+// EFragment
+
+// Replay a fragment event according to its op:
+//  - PREPARE:  record the uncommitted fragment and, if the inode is
+//              already cached, refragment it.
+//  - ROLLBACK: undo the refragmentation on a cached inode (by negated
+//              bits for old-format events without orig_frags, otherwise
+//              by forcing each original fragment), then roll back the
+//              uncommitted fragment.
+//  - COMMIT / FINISH: finish the uncommitted fragment.
+// The metablob is replayed afterwards in every case.
+void EFragment::replay(MDSRank *mds)
+{
+  dout(10) << "EFragment.replay " << op_name(op) << " " << ino << " " << basefrag << " by " << bits << dendl;
+
+  list<CDir*> resultfrags;
+  MDSContext::vec waiters;
+
+  // in may be NULL if it wasn't in our cache yet.  if it's a prepare
+  // it will be once we replay the metablob, but first we need to
+  // refragment anything we already have in the cache.
+  CInode *in = mds->mdcache->get_inode(ino);
+
+  const dirfrag_t base_df(ino, basefrag);
+  auto&& segment = get_segment();
+  switch (op) {
+  case OP_PREPARE: {
+    mds->mdcache->add_uncommitted_fragment(base_df, bits, orig_frags, segment, &rollback);
+
+    if (in) {
+      mds->mdcache->adjust_dir_fragments(in, basefrag, bits, resultfrags, waiters, true);
+    }
+    break;
+  }
+
+  case OP_ROLLBACK: {
+    frag_vec_t old_frags;
+    if (in) {
+      // capture the current leaves before they are changed below
+      in->dirfragtree.get_leaves_under(basefrag, old_frags);
+      if (orig_frags.empty()) {
+        // old format EFragment
+        mds->mdcache->adjust_dir_fragments(in, basefrag, -bits, resultfrags, waiters, true);
+      } else {
+        for (const auto& fg : orig_frags) {
+          mds->mdcache->force_dir_fragment(in, fg);
+        }
+      }
+    }
+    mds->mdcache->rollback_uncommitted_fragment(base_df, std::move(old_frags));
+    break;
+  }
+
+  case OP_COMMIT:
+  case OP_FINISH:
+    mds->mdcache->finish_uncommitted_fragment(base_df, op);
+    break;
+
+  default:
+    ceph_abort();
+  }
+
+  metablob.replay(mds, segment);
+  if (in && g_conf()->mds_debug_frag) {
+    in->verify_dirfrags();
+  }
+}
+
+// Serialize an EFragment as a versioned struct (version 5, compat 4).
+// orig_frags and rollback are written last; EFragment::decode() reads
+// them only for struct_v >= 5.
+void EFragment::encode(bufferlist &bl, uint64_t features) const
+{
+  ENCODE_START(5, 4, bl);
+  encode(stamp, bl);
+  encode(op, bl);
+  encode(ino, bl);
+  encode(basefrag, bl);
+  encode(bits, bl);
+  encode(metablob, bl, features);
+  encode(orig_frags, bl);
+  encode(rollback, bl);
+  ENCODE_FINISH(bl);
+}
+
+// Decode an EFragment. stamp is read for struct_v >= 2, op for
+// struct_v >= 3, and orig_frags plus rollback for struct_v >= 5.
+void EFragment::decode(bufferlist::const_iterator &bl)
+{
+  DECODE_START_LEGACY_COMPAT_LEN(5, 4, 4, bl);
+  if (struct_v >= 2) {
+    decode(stamp, bl);
+  }
+  if (struct_v >= 3) {
+    decode(op, bl);
+  }
+  decode(ino, bl);
+  decode(basefrag, bl);
+  decode(bits, bl);
+  decode(metablob, bl);
+  const bool has_rollback_info = struct_v >= 5;
+  if (has_rollback_info) {
+    decode(orig_frags, bl);
+    decode(rollback, bl);
+  }
+  DECODE_FINISH(bl);
+}
+
+// Dump the op name, inode, base fragment and bit count. The metablob
+// dump is left disabled, as in the original.
+void EFragment::dump(Formatter *f) const
+{
+  /*f->open_object_section("Metablob");
+  metablob.dump(f); // sadly we don't have this; dunno if we'll get it
+  f->close_section();*/
+
+  f->dump_string("op", op_name(op));
+  f->dump_stream("ino") << ino;
+  f->dump_stream("base frag") << basefrag;
+  f->dump_int("bits", bits);
+}
+
+// Append two instances to ls: a default one and an OP_PREPARE event for
+// ino 1 with bits 5.
+void EFragment::generate_test_instances(list<EFragment*>& ls)
+{
+  ls.push_back(new EFragment);
+
+  EFragment *prepare = new EFragment;
+  ls.push_back(prepare);
+  prepare->op = OP_PREPARE;
+  prepare->ino = 1;
+  prepare->bits = 5;
+}
+
+// Serialize the single fnode field as a versioned struct (version 1, compat 1).
+void dirfrag_rollback::encode(bufferlist &bl) const
+{
+  ENCODE_START(1, 1, bl);
+  encode(fnode, bl);
+  ENCODE_FINISH(bl);
+}
+
+// Decode the single fnode field written by dirfrag_rollback::encode().
+void dirfrag_rollback::decode(bufferlist::const_iterator &bl)
+{
+  DECODE_START(1, bl);
+  decode(fnode, bl);
+  DECODE_FINISH(bl);
+}
+
+
+
+// =========================================================================
+
+// -----------------------
+// EExport
+
+// Replay an export: apply the metablob, resolve the base dirfrag and all
+// bound dirfrags in the cache (each must exist), set the bounded
+// subtree's auth to CDIR_AUTH_UNDEF, then try to trim the now non-auth
+// subtree.
+void EExport::replay(MDSRank *mds)
+{
+  dout(10) << "EExport.replay " << base << dendl;
+  auto&& segment = get_segment();
+  metablob.replay(mds, segment);
+
+  CDir *dir = mds->mdcache->get_dirfrag(base);
+  ceph_assert(dir);
+
+  set<CDir*> realbounds;
+  for (const auto &bound_df : bounds) {
+    CDir *bd = mds->mdcache->get_dirfrag(bound_df);
+    ceph_assert(bd);
+    realbounds.insert(bd);
+  }
+
+  // adjust auth away
+  mds->mdcache->adjust_bounded_subtree_auth(dir, realbounds, CDIR_AUTH_UNDEF);
+
+  mds->mdcache->try_trim_non_auth_subtree(dir);
+}
+
+// Serialize an EExport as a versioned struct (version 4, compat 3).
+// target is written last; EExport::decode() reads it only for
+// struct_v >= 4.
+void EExport::encode(bufferlist& bl, uint64_t features) const
+{
+  ENCODE_START(4, 3, bl);
+  encode(stamp, bl);
+  encode(metablob, bl, features);
+  encode(base, bl);
+  encode(bounds, bl);
+  encode(target, bl);
+  ENCODE_FINISH(bl);
+}
+
+// Decode an EExport. stamp is read only for struct_v >= 2 and target
+// only for struct_v >= 4.
+//
+// The decoder's declared version is 4 to match EExport::encode(), which
+// writes version 4, and because this function reads the v4-only field.
+// It was previously declared as 3, unlike the other encode/decode pairs
+// in this file.
+void EExport::decode(bufferlist::const_iterator &bl)
+{
+  DECODE_START_LEGACY_COMPAT_LEN(4, 3, 3, bl);
+  if (struct_v >= 2) {
+    decode(stamp, bl);
+  }
+  decode(metablob, bl);
+  decode(base, bl);
+  decode(bounds, bl);
+  if (struct_v >= 4) {
+    decode(target, bl);
+  }
+  DECODE_FINISH(bl);
+}
+
+// Dump the stamp (as a floating point value), the base dirfrag and the
+// bound dirfrags. The metablob dump is left disabled, as in the original.
+void EExport::dump(Formatter *f) const
+{
+  f->dump_float("stamp", static_cast<double>(stamp));
+  /*f->open_object_section("Metablob");
+  metablob.dump(f); // sadly we don't have this; dunno if we'll get it
+  f->close_section();*/
+  f->dump_stream("base dirfrag") << base;
+
+  f->open_array_section("bounds dirfrags");
+  for (const auto &bound_df : bounds) {
+    f->dump_stream("dirfrag") << bound_df;
+  }
+  f->close_section(); // bounds dirfrags
+}
+
+// Append one default-constructed EExport to ls.
+void EExport::generate_test_instances(list<EExport*>& ls)
+{
+  ls.push_back(new EExport());
+}
+
+
+// -----------------------
+// EImportStart
+
+// Record this event's session map version on its log segment
+// (unconditionally, unlike EUpdate::update_segment()).
+void EImportStart::update_segment()
+{
+  auto&& seg = get_segment();
+  seg->sessionmapv = cmapv;
+}
+
+// Replay the start of an import: apply the metablob, register the import
+// as ambiguous, give the bounded subtree an (us, us) authority pair so
+// it is not trimmed, reopen client sessions if the attached client map
+// is newer than the session map, then update the segment bookkeeping.
+void EImportStart::replay(MDSRank *mds)
+{
+  dout(10) << "EImportStart.replay " << base << " bounds " << bounds << dendl;
+  //metablob.print(*_dout);
+  auto&& segment = get_segment();
+  metablob.replay(mds, segment);
+
+  // put in ambiguous import list
+  mds->mdcache->add_ambiguous_import(base, bounds);
+
+  // set auth partially to us so we don't trim it
+  CDir *dir = mds->mdcache->get_dirfrag(base);
+  ceph_assert(dir);
+
+  set<CDir*> realbounds;
+  for (const auto &bound_df : bounds) {
+    CDir *bd = mds->mdcache->get_dirfrag(bound_df);
+    ceph_assert(bd);
+    // a bound that is not itself a subtree root loses its AUTH state bit
+    if (!bd->is_subtree_root()) {
+      bd->state_clear(CDir::STATE_AUTH);
+    }
+    realbounds.insert(bd);
+  }
+
+  mds->mdcache->adjust_bounded_subtree_auth(dir, realbounds,
+                                            mds_authority_t(mds->get_nodeid(), mds->get_nodeid()));
+
+  // open client sessions?
+  if (mds->sessionmap.get_version() < cmapv) {
+    dout(10) << "EImportStart.replay sessionmap " << mds->sessionmap.get_version()
+             << " < " << cmapv << dendl;
+    map<client_t,entity_inst_t> cm;
+    map<client_t,client_metadata_t> cmm;
+    auto blp = client_map.cbegin();
+    using ceph::decode;
+    decode(cm, blp);
+    // the metadata map is decoded only when bytes remain after cm
+    if (!blp.end()) {
+      decode(cmm, blp);
+    }
+    mds->sessionmap.replay_open_sessions(cmapv, cm, cmm);
+  } else {
+    dout(10) << "EImportStart.replay sessionmap " << mds->sessionmap.get_version()
+             << " >= " << cmapv << ", noop" << dendl;
+  }
+  update_segment();
+}
+
+// Serialize an EImportStart as a versioned struct (version 4, compat 3).
+// from is written last; EImportStart::decode() reads it only for
+// struct_v >= 4.
+void EImportStart::encode(bufferlist &bl, uint64_t features) const
+{
+  ENCODE_START(4, 3, bl);
+  encode(stamp, bl);
+  encode(base, bl);
+  encode(metablob, bl, features);
+  encode(bounds, bl);
+  encode(cmapv, bl);
+  encode(client_map, bl);
+  encode(from, bl);
+  ENCODE_FINISH(bl);
+}
+
+// Decode an EImportStart. stamp is read only for struct_v >= 2 and from
+// only for struct_v >= 4.
+//
+// The decoder's declared version is 4 to match EImportStart::encode(),
+// which writes version 4, and because this function reads the v4-only
+// field. It was previously declared as 3, unlike the other encode/decode
+// pairs in this file.
+void EImportStart::decode(bufferlist::const_iterator &bl)
+{
+  DECODE_START_LEGACY_COMPAT_LEN(4, 3, 3, bl);
+  if (struct_v >= 2) {
+    decode(stamp, bl);
+  }
+  decode(base, bl);
+  decode(metablob, bl);
+  decode(bounds, bl);
+  decode(cmapv, bl);
+  decode(client_map, bl);
+  if (struct_v >= 4) {
+    decode(from, bl);
+  }
+  DECODE_FINISH(bl);
+}
+
+// Dump the base dirfrag and the list of boundary dirfrags.
+void EImportStart::dump(Formatter *f) const
+{
+  f->dump_stream("base dirfrag") << base;
+
+  f->open_array_section("boundary dirfrags");
+  for (const auto &bound_df : bounds) {
+    f->dump_stream("frag") << bound_df;
+  }
+  f->close_section();
+}
+
+// Append one default-constructed EImportStart to ls.
+void EImportStart::generate_test_instances(list<EImportStart*>& ls)
+{
+  EImportStart *sample = new EImportStart;
+  ls.push_back(sample);
+}
+
+// -----------------------
+// EImportFinish
+
+// Replay the end of an import. The base must currently be an ambiguous
+// import; if it is not, the journal is treated as damaged. On success
+// the ambiguous import is finished; on failure the bounded subtree's
+// auth is set to CDIR_AUTH_UNDEF, the import is cancelled and the
+// subtree is trimmed if possible.
+void EImportFinish::replay(MDSRank *mds)
+{
+  if (!mds->mdcache->have_ambiguous_import(base)) {
+    // this shouldn't happen unless this is an old journal
+    dout(10) << "EImportFinish.replay " << base << " success=" << success
+             << " on subtree not marked as ambiguous"
+             << dendl;
+    mds->clog->error() << "failure replaying journal (EImportFinish)";
+    mds->damaged();
+    ceph_abort();  // Should be unreachable because damaged() calls respawn()
+  }
+
+  dout(10) << "EImportFinish.replay " << base << " success=" << success << dendl;
+  if (success) {
+    mds->mdcache->finish_ambiguous_import(base);
+    return;
+  }
+
+  // failed import: hand the subtree back and drop the ambiguous record
+  CDir *dir = mds->mdcache->get_dirfrag(base);
+  ceph_assert(dir);
+  vector<dirfrag_t> bounds;
+  mds->mdcache->get_ambiguous_import_bounds(base, bounds);
+  mds->mdcache->adjust_bounded_subtree_auth(dir, bounds, CDIR_AUTH_UNDEF);
+  mds->mdcache->cancel_ambiguous_import(dir);
+  mds->mdcache->try_trim_non_auth_subtree(dir);
+}
+
+// Serialize stamp, base and success as a versioned struct (version 3,
+// compat 3). The features argument is not used by this encoder.
+void EImportFinish::encode(bufferlist& bl, uint64_t features) const
+{
+  ENCODE_START(3, 3, bl);
+  encode(stamp, bl);
+  encode(base, bl);
+  encode(success, bl);
+  ENCODE_FINISH(bl);
+}
+
+// Decode an EImportFinish; stamp is read only for struct_v >= 2.
+void EImportFinish::decode(bufferlist::const_iterator &bl)
+{
+  DECODE_START_LEGACY_COMPAT_LEN(3, 3, 3, bl);
+  if (struct_v >= 2) {
+    decode(stamp, bl);
+  }
+  decode(base, bl);
+  decode(success, bl);
+  DECODE_FINISH(bl);
+}
+
+// Dump the base dirfrag and whether the import succeeded ("true"/"false").
+void EImportFinish::dump(Formatter *f) const
+{
+  const char *success_text = success ? "true" : "false";
+  f->dump_stream("base dirfrag") << base;
+  f->dump_string("success", success_text);
+}
+// Append two instances to ls: a default one and one with success set.
+void EImportFinish::generate_test_instances(list<EImportFinish*>& ls)
+{
+  ls.push_back(new EImportFinish);
+
+  EImportFinish *succeeded = new EImportFinish;
+  ls.push_back(succeeded);
+  succeeded->success = true;
+}
+
+
+// ------------------------
+// EResetJournal
+
+// Serialize the timestamp as a versioned struct (version 2, compat 2).
+// The features argument is not used by this encoder.
+void EResetJournal::encode(bufferlist& bl, uint64_t features) const
+{
+  ENCODE_START(2, 2, bl);
+  encode(stamp, bl);
+  ENCODE_FINISH(bl);
+}
+
+// Decode the timestamp; it is read unconditionally.
+void EResetJournal::decode(bufferlist::const_iterator &bl)
+{
+  DECODE_START_LEGACY_COMPAT_LEN(2, 2, 2, bl);
+  decode(stamp, bl);
+  DECODE_FINISH(bl);
+}
+
+// Dump the event's timestamp.
+void EResetJournal::dump(Formatter *f) const
+{
+  f->dump_stream("timestamp") << stamp;
+}
+
+// Append one default-constructed EResetJournal to ls.
+void EResetJournal::generate_test_instances(list<EResetJournal*>& ls)
+{
+  EResetJournal *sample = new EResetJournal();
+  ls.push_back(sample);
+}
+
+// Replay a journal reset: wipe the session map, reset the inode table,
+// then claim subtree authority for this rank over its own "my"
+// directory and, when the MDS map names this rank as root, over the
+// root directory as well.
+void EResetJournal::replay(MDSRank *mds)
+{
+  dout(1) << "EResetJournal" << dendl;
+
+  mds->sessionmap.wipe();
+  mds->inotable->replay_reset();
+
+  const bool i_am_root = (mds->mdsmap->get_root() == mds->get_nodeid());
+  if (i_am_root) {
+    CDir *root_frag = mds->mdcache->get_root()->get_or_open_dirfrag(mds->mdcache, frag_t());
+    mds->mdcache->adjust_subtree_auth(root_frag, mds->get_nodeid());
+  }
+
+  CDir *my_frag = mds->mdcache->get_myin()->get_or_open_dirfrag(mds->mdcache, frag_t());
+  mds->mdcache->adjust_subtree_auth(my_frag, mds->get_nodeid());
+
+  mds->mdcache->recalc_auth_bits(true);
+
+  mds->mdcache->show_subtrees();
+}
+
+
+// Serialize an ENoOp as a versioned struct (version 2, compat 2):
+// pad_size followed by pad_size bytes of 0xff.
+void ENoOp::encode(bufferlist &bl, uint64_t features) const
+{
+  ENCODE_START(2, 2, bl);
+  encode(pad_size, bl);
+  uint8_t const pad = 0xff;
+  for (unsigned int written = 0; written != pad_size; ++written) {
+    encode(pad, bl);
+  }
+  ENCODE_FINISH(bl);
+}
+
+
+// Decode an ENoOp: read pad_size, then skip exactly that many bytes.
+// Throws buffer::end_of_buffer when the bytes remaining do not equal
+// pad_size.
+void ENoOp::decode(bufferlist::const_iterator &bl)
+{
+  DECODE_START(2, bl);
+  decode(pad_size, bl);
+  if (bl.get_remaining() == pad_size) {
+    bl.advance(pad_size);
+  } else {
+    // This is spiritually an assertion, but expressing in a way that will let
+    // journal debug tools catch it and recognise a malformed entry.
+    throw buffer::end_of_buffer();
+  }
+  DECODE_FINISH(bl);
+}
+
+
+// Replaying an ENoOp changes nothing; it only logs the padding size.
+void ENoOp::replay(MDSRank *mds)
+{
+  dout(4) << "ENoOp::replay, " << pad_size << " bytes skipped in journal" << dendl;
+}
+
+/**
+ * If re-formatting an old journal that used absolute log position
+ * references as segment sequence numbers, use this function to update
+ * it.
+ *
+ * Each truncate_finish entry whose sequence number appears in old_to_new
+ * is rewritten to the mapped value; entries without a mapping are kept
+ * unchanged.
+ *
+ * @param mds
+ * MDSRank instance, just used for logging
+ * @param old_to_new
+ * Map of old journal segment sequence numbers to new journal segment sequence numbers
+ *
+ * @return
+ * True if the event was modified.
+ */
+bool EMetaBlob::rewrite_truncate_finish(MDSRank const *mds,
+    std::map<LogSegment::seq_t, LogSegment::seq_t> const &old_to_new)
+{
+  bool modified = false;
+  map<inodeno_t, LogSegment::seq_t> remapped;
+  for (const auto& entry : truncate_finish) {
+    const auto hit = old_to_new.find(entry.second);
+    if (hit == old_to_new.end()) {
+      dout(20) << __func__ << " no segment seq mapping found for "
+               << entry.second << dendl;
+      remapped.insert(entry);
+      continue;
+    }
+
+    dout(20) << __func__ << " applying segment seq mapping "
+             << entry.second << " -> " << hit->second << dendl;
+    remapped.emplace(entry.first, hit->second);
+    modified = true;
+  }
+  truncate_finish.swap(remapped);
+
+  return modified;
+}
diff --git a/src/mds/locks.c b/src/mds/locks.c
new file mode 100644
index 00000000..25646fdd
--- /dev/null
+++ b/src/mds/locks.c
@@ -0,0 +1,162 @@
+#include "include/int_types.h"
+
+#include <string.h>
+#include <fcntl.h>
+
+#include "locks.h"
+
+/* Duplicated from ceph_fs.h, which we cannot include into a C file. */
+/* Each capability is a distinct bit so values can be OR-ed into masks. */
+#define CEPH_CAP_GSHARED 1 /* client can read */
+#define CEPH_CAP_GEXCL 2 /* client can read and update */
+#define CEPH_CAP_GCACHE 4 /* (file) client can cache reads */
+#define CEPH_CAP_GRD 8 /* (file) client can read */
+#define CEPH_CAP_GWR 16 /* (file) client can write */
+#define CEPH_CAP_GBUFFER 32 /* (file) client can buffer writes */
+#define CEPH_CAP_GWREXTEND 64 /* (file) client can extend EOF */
+#define CEPH_CAP_GLAZYIO 128 /* (file) client can perform lazy io */
+
+/*
+ * State table for the simple lock, indexed by LOCK_* state.
+ * Each row initialises struct sm_state_t positionally, i.e. in field order:
+ * next (the "stable" column; 0 means the state is stable), loner,
+ * replica_state, can_read, can_read_projected, can_rdlock, can_wrlock,
+ * can_force_wrlock, can_lease, can_xlock, caps, loner_caps, xlocker_caps,
+ * replica_caps.  States without a row are zero-initialised.
+ */
+static const struct sm_state_t simplelock[LOCK_MAX] = {
+ // stable loner rep state r rp rd wr fwr l x caps,other
+ [LOCK_SYNC] = { 0, false, LOCK_SYNC, ANY, 0, ANY, 0, 0, ANY, 0, CEPH_CAP_GSHARED,0,0,CEPH_CAP_GSHARED },
+ [LOCK_LOCK_SYNC] = { LOCK_SYNC, false, LOCK_LOCK, AUTH, XCL, XCL, 0, 0, XCL, 0, 0,0,0,0 },
+ [LOCK_EXCL_SYNC] = { LOCK_SYNC, true, LOCK_LOCK, 0, 0, 0, 0, XCL, 0, 0, 0,CEPH_CAP_GSHARED,0,0 },
+ [LOCK_SNAP_SYNC] = { LOCK_SYNC, false, LOCK_LOCK, 0, 0, 0, 0, AUTH,0, 0, 0,0,0,0 },
+
+ [LOCK_LOCK] = { 0, false, LOCK_LOCK, AUTH, 0, REQ, 0, 0, 0, 0, 0,0,0,0 },
+ [LOCK_SYNC_LOCK] = { LOCK_LOCK, false, LOCK_LOCK, ANY, 0, 0, 0, 0, 0, 0, 0,0,0,0 },
+ [LOCK_EXCL_LOCK] = { LOCK_LOCK, false, LOCK_LOCK, 0, 0, 0, 0, XCL, 0, 0, 0,0,0,0 },
+
+ [LOCK_PREXLOCK] = { LOCK_LOCK, false, LOCK_LOCK, 0, XCL, 0, 0, 0, 0, ANY, 0,0,0,0 },
+ [LOCK_XLOCK] = { LOCK_SYNC, false, LOCK_LOCK, 0, XCL, 0, 0, 0, 0, 0, 0,0,0,0 },
+ [LOCK_XLOCKDONE] = { LOCK_SYNC, false, LOCK_LOCK, XCL, XCL, XCL, 0, 0, XCL, 0, 0,0,CEPH_CAP_GSHARED,0 },
+ [LOCK_LOCK_XLOCK]= { LOCK_PREXLOCK,false,LOCK_LOCK,0, XCL, 0, 0, 0, 0, XCL, 0,0,0,0 },
+
+ [LOCK_EXCL] = { 0, true, LOCK_LOCK, 0, 0, REQ, XCL, 0, 0, 0, 0,CEPH_CAP_GEXCL|CEPH_CAP_GSHARED,0,0 },
+ [LOCK_SYNC_EXCL] = { LOCK_EXCL, true, LOCK_LOCK, ANY, 0, 0, 0, 0, 0, 0, 0,CEPH_CAP_GSHARED,0,0 },
+ [LOCK_LOCK_EXCL] = { LOCK_EXCL, false, LOCK_LOCK, AUTH, 0, 0, 0, 0, 0, 0, CEPH_CAP_GSHARED,0,0,0 },
+
+ [LOCK_REMOTEXLOCK]={ LOCK_LOCK, false, LOCK_LOCK, 0, 0, 0, 0, 0, 0, 0, 0,0,0,0 },
+
+};
+
+/*
+ * Exported descriptor for the simple lock: the state table above plus its
+ * capability masks.  This is the only descriptor in this file that sets
+ * can_remote_xlock to 1.
+ */
+const struct sm_t sm_simplelock = {
+ .states = simplelock,
+ .allowed_ever_auth = CEPH_CAP_GSHARED | CEPH_CAP_GEXCL,
+ .allowed_ever_replica = CEPH_CAP_GSHARED,
+ .careful = CEPH_CAP_GSHARED | CEPH_CAP_GEXCL,
+ .can_remote_xlock = 1,
+};
+
+
+// lock state machine states:
+// Sync -- Lock -- sCatter
+// Tempsync _/
+// (out of date)
+
+/*
+ * State table for the scatter lock, indexed by LOCK_* state.  Rows fill
+ * struct sm_state_t positionally (next, loner, replica_state, the seven
+ * can_* permissions, then caps, loner_caps, xlocker_caps, replica_caps).
+ * Only LOCK_SYNC carries any capability bits in this table.
+ */
+static const struct sm_state_t scatterlock[LOCK_MAX] = {
+ // stable loner rep state r rp rd wr fwr l x caps,other
+ [LOCK_SYNC] = { 0, false, LOCK_SYNC, ANY, 0, ANY, 0, 0, ANY, 0, CEPH_CAP_GSHARED,0,0,CEPH_CAP_GSHARED },
+ [LOCK_LOCK_SYNC] = { LOCK_SYNC, false, LOCK_LOCK, AUTH, 0, 0, 0, 0, 0, 0, 0,0,0,0 },
+ [LOCK_MIX_SYNC] = { LOCK_SYNC, false, LOCK_LOCK, 0, 0, 0, 0, 0, 0, 0, 0,0,0,0 },
+ [LOCK_SNAP_SYNC] = { LOCK_SYNC, false, LOCK_LOCK, 0, 0, 0, 0, AUTH,0, 0, 0,0,0,0 },
+
+ [LOCK_LOCK] = { 0, false, LOCK_LOCK, AUTH, 0, REQ, AUTH,0, 0, ANY, 0,0,0,0 },
+ [LOCK_SYNC_LOCK] = { LOCK_LOCK, false, LOCK_LOCK, ANY, 0, 0, 0, 0, 0, 0, 0,0,0,0 },
+ [LOCK_MIX_LOCK] = { LOCK_LOCK, false, LOCK_MIX, 0, 0, 0, 0, 0, 0, 0, 0,0,0,0 },
+ [LOCK_MIX_LOCK2] = { LOCK_LOCK, false, LOCK_LOCK, 0, 0, 0, 0, 0, 0, 0, 0,0,0,0 },
+ [LOCK_TSYN_LOCK] = { LOCK_LOCK, false, LOCK_LOCK, AUTH, 0, 0, 0, 0, 0, 0, 0,0,0,0 },
+
+ [LOCK_TSYN] = { 0, false, LOCK_LOCK, AUTH, 0, AUTH,0, 0, 0, 0, 0,0,0,0 },
+ [LOCK_LOCK_TSYN] = { LOCK_TSYN, false, LOCK_LOCK, 0, 0, 0, 0, 0, 0, 0, 0,0,0,0 },
+ [LOCK_MIX_TSYN] = { LOCK_TSYN, false, LOCK_LOCK, 0, 0, 0, 0, 0, 0, 0, 0,0,0,0 },
+
+ [LOCK_MIX] = { 0, false, LOCK_MIX, 0, 0, REQ, ANY, 0, 0, 0, 0,0,0,0 },
+ [LOCK_TSYN_MIX] = { LOCK_MIX, false, LOCK_LOCK, 0, 0, 0, 0, 0, 0, 0, 0,0,0,0 },
+ [LOCK_SYNC_MIX] = { LOCK_MIX, false, LOCK_SYNC_MIX2,ANY,0, 0, 0, 0, 0, 0, 0,0,0,0 },
+ [LOCK_SYNC_MIX2] = { LOCK_MIX, false, 0, 0, 0, 0, 0, 0, 0, 0, 0,0,0,0 },
+};
+
+/*
+ * Exported descriptor for the scatter lock.  Uses the same capability masks
+ * as sm_simplelock but does not allow remote xlock.
+ */
+const struct sm_t sm_scatterlock = {
+ .states = scatterlock,
+ .allowed_ever_auth = CEPH_CAP_GSHARED | CEPH_CAP_GEXCL,
+ .allowed_ever_replica = CEPH_CAP_GSHARED,
+ .careful = CEPH_CAP_GSHARED | CEPH_CAP_GEXCL,
+ .can_remote_xlock = 0,
+};
+
+/*
+ * State table for the file lock, indexed by LOCK_* state.  Rows fill
+ * struct sm_state_t positionally: next ("stable" column, 0 = stable), loner,
+ * replica_state, can_read, can_read_projected, can_rdlock, can_wrlock,
+ * can_force_wrlock, can_lease, can_xlock, then the four capability masks in
+ * the order any, loner, xlocker, replica.
+ * NOTE(review): unlike simplelock/scatterlock this array is not static.
+ */
+const struct sm_state_t filelock[LOCK_MAX] = {
+ // stable loner rep state r rp rd wr fwr l x caps(any,loner,xlocker,replica)
+ [LOCK_SYNC] = { 0, false, LOCK_SYNC, ANY, 0, ANY, 0, 0, ANY, 0, CEPH_CAP_GSHARED|CEPH_CAP_GCACHE|CEPH_CAP_GRD|CEPH_CAP_GLAZYIO,0,0,CEPH_CAP_GSHARED|CEPH_CAP_GCACHE|CEPH_CAP_GRD },
+ [LOCK_LOCK_SYNC] = { LOCK_SYNC, false, LOCK_LOCK, AUTH, 0, 0, 0, 0, 0, 0, CEPH_CAP_GCACHE,0,0,0 },
+ [LOCK_EXCL_SYNC] = { LOCK_SYNC, true, LOCK_LOCK, 0, 0, 0, 0, XCL, 0, 0, 0,CEPH_CAP_GSHARED|CEPH_CAP_GCACHE|CEPH_CAP_GRD,0,0 },
+ [LOCK_MIX_SYNC] = { LOCK_SYNC, false, LOCK_MIX_SYNC2,0,0, 0, 0, 0, 0, 0, CEPH_CAP_GRD|CEPH_CAP_GLAZYIO,0,0,CEPH_CAP_GRD },
+ [LOCK_MIX_SYNC2] = { LOCK_SYNC, false, 0, 0, 0, 0, 0, 0, 0, 0, CEPH_CAP_GRD|CEPH_CAP_GLAZYIO,0,0,CEPH_CAP_GRD },
+ [LOCK_SNAP_SYNC] = { LOCK_SYNC, false, LOCK_LOCK, 0, 0, 0, 0, AUTH,0, 0, 0,0,0,0 },
+ [LOCK_XSYN_SYNC] = { LOCK_SYNC, true, LOCK_LOCK, AUTH, 0, AUTH,0, 0, 0, 0, 0,CEPH_CAP_GCACHE,0,0 },
+
+ [LOCK_LOCK] = { 0, false, LOCK_LOCK, AUTH, 0, REQ, AUTH,0, 0, 0, CEPH_CAP_GCACHE|CEPH_CAP_GBUFFER,0,0,0 },
+ [LOCK_SYNC_LOCK] = { LOCK_LOCK, false, LOCK_LOCK, ANY, 0, REQ, 0, 0, 0, 0, CEPH_CAP_GCACHE,0,0,0 },
+ [LOCK_EXCL_LOCK] = { LOCK_LOCK, false, LOCK_LOCK, 0, 0, 0, 0, XCL, 0, 0, CEPH_CAP_GCACHE|CEPH_CAP_GBUFFER,0,0,0 },
+ [LOCK_MIX_LOCK] = { LOCK_LOCK, false, LOCK_MIX, 0, 0, REQ, 0, 0, 0, 0, 0,0,0,0 },
+ [LOCK_MIX_LOCK2] = { LOCK_LOCK, false, LOCK_LOCK, 0, 0, REQ, 0, 0, 0, 0, 0,0,0,0 },
+ [LOCK_XSYN_LOCK] = { LOCK_LOCK, true, LOCK_LOCK, AUTH, 0, 0, XCL, 0, 0, 0, 0,CEPH_CAP_GCACHE|CEPH_CAP_GBUFFER,0,0 },
+
+ [LOCK_PREXLOCK] = { LOCK_LOCK, false, LOCK_LOCK, 0, XCL, 0, 0, 0, 0, ANY, CEPH_CAP_GCACHE|CEPH_CAP_GBUFFER,0,0,0 },
+ [LOCK_XLOCK] = { LOCK_LOCK, false, LOCK_LOCK, 0, XCL, 0, 0, 0, 0, 0, CEPH_CAP_GCACHE|CEPH_CAP_GBUFFER,0,0,0 },
+ [LOCK_XLOCKDONE] = { LOCK_LOCK, false, LOCK_LOCK, XCL, XCL, XCL, 0, 0, XCL, 0, CEPH_CAP_GCACHE|CEPH_CAP_GBUFFER,0,CEPH_CAP_GSHARED,0 },
+ [LOCK_XLOCKSNAP] = { LOCK_LOCK, false, LOCK_LOCK, 0, XCL, 0, 0, 0, 0, 0, CEPH_CAP_GCACHE,0,0,0 },
+ [LOCK_LOCK_XLOCK]= { LOCK_PREXLOCK,false,LOCK_LOCK,0, XCL, 0, 0, 0, 0, XCL, CEPH_CAP_GCACHE|CEPH_CAP_GBUFFER,0,0,0 },
+
+ [LOCK_MIX] = { 0, false, LOCK_MIX, 0, 0, REQ, ANY, 0, 0, 0, CEPH_CAP_GRD|CEPH_CAP_GWR|CEPH_CAP_GLAZYIO,0,0,CEPH_CAP_GRD },
+ [LOCK_SYNC_MIX] = { LOCK_MIX, false, LOCK_SYNC_MIX2,ANY,0, 0, 0, 0, 0, 0, CEPH_CAP_GRD|CEPH_CAP_GLAZYIO,0,0,CEPH_CAP_GRD },
+ [LOCK_SYNC_MIX2] = { LOCK_MIX, false, 0, 0, 0, 0, 0, 0, 0, 0, CEPH_CAP_GRD|CEPH_CAP_GLAZYIO,0,0,CEPH_CAP_GRD },
+ [LOCK_EXCL_MIX] = { LOCK_MIX, true, LOCK_LOCK, 0, 0, 0, XCL, 0, 0, 0, 0,CEPH_CAP_GRD|CEPH_CAP_GWR,0,0 },
+ [LOCK_XSYN_MIX] = { LOCK_MIX, true, LOCK_LOCK, 0, 0, 0, XCL, 0, 0, 0, 0,0,0,0 },
+
+ [LOCK_EXCL] = { 0, true, LOCK_LOCK, 0, 0, XCL, XCL, 0, 0, 0, 0,CEPH_CAP_GSHARED|CEPH_CAP_GEXCL|CEPH_CAP_GCACHE|CEPH_CAP_GRD|CEPH_CAP_GWR|CEPH_CAP_GBUFFER,0,0 },
+ [LOCK_SYNC_EXCL] = { LOCK_EXCL, true, LOCK_LOCK, ANY, 0, 0, 0, 0, 0, 0, 0,CEPH_CAP_GSHARED|CEPH_CAP_GCACHE|CEPH_CAP_GRD,0,0 },
+ [LOCK_MIX_EXCL] = { LOCK_EXCL, true, LOCK_LOCK, 0, 0, 0, XCL, 0, 0, 0, 0,CEPH_CAP_GRD|CEPH_CAP_GWR,0,0 },
+ [LOCK_LOCK_EXCL] = { LOCK_EXCL, true, LOCK_LOCK, AUTH, 0, 0, 0, 0, 0, 0, 0,CEPH_CAP_GCACHE|CEPH_CAP_GBUFFER,0,0 },
+ [LOCK_XSYN_EXCL] = { LOCK_EXCL, true, LOCK_LOCK, AUTH, 0, XCL, 0, 0, 0, 0, 0,CEPH_CAP_GCACHE|CEPH_CAP_GBUFFER,0,0 },
+
+ [LOCK_XSYN] = { 0, true, LOCK_LOCK, AUTH, AUTH,AUTH,XCL, 0, 0, 0, 0,CEPH_CAP_GCACHE|CEPH_CAP_GBUFFER,0,0 },
+ [LOCK_EXCL_XSYN] = { LOCK_XSYN, false, LOCK_LOCK, 0, 0, XCL, 0, 0, 0, 0, 0,CEPH_CAP_GCACHE|CEPH_CAP_GBUFFER,0,0 },
+
+ [LOCK_PRE_SCAN] = { LOCK_SCAN, false, LOCK_LOCK, 0, 0, 0, 0, 0, 0, 0, 0,0,0,0 },
+ [LOCK_SCAN] = { LOCK_LOCK, false, LOCK_LOCK, 0, 0, 0, 0, 0, 0, 0, 0,0,0,0 },
+};
+
+/*
+ * Exported descriptor for the file lock.  The auth mask contains every
+ * CEPH_CAP_G* bit defined in this file; the replica mask is limited to the
+ * shared/cache/read/lazyio bits.
+ */
+const struct sm_t sm_filelock = {
+ .states = filelock,
+ .allowed_ever_auth = (CEPH_CAP_GSHARED |
+ CEPH_CAP_GEXCL |
+ CEPH_CAP_GCACHE |
+ CEPH_CAP_GRD |
+ CEPH_CAP_GWR |
+ CEPH_CAP_GWREXTEND |
+ CEPH_CAP_GBUFFER |
+ CEPH_CAP_GLAZYIO),
+ .allowed_ever_replica = (CEPH_CAP_GSHARED |
+ CEPH_CAP_GCACHE |
+ CEPH_CAP_GRD |
+ CEPH_CAP_GLAZYIO),
+ .careful = (CEPH_CAP_GSHARED |
+ CEPH_CAP_GEXCL |
+ CEPH_CAP_GCACHE |
+ CEPH_CAP_GBUFFER),
+ .can_remote_xlock = 0,
+};
+
+
+/*
+ * State table for the local lock.  Only LOCK_LOCK has a row: it is stable
+ * (next == 0) and grants no capability bits.
+ */
+const struct sm_state_t locallock[LOCK_MAX] = {
+ // stable loner rep state r rp rd wr fwr l x caps(any,loner,xlocker,replica)
+ [LOCK_LOCK] = { 0, false, LOCK_LOCK, ANY, 0, ANY, 0, 0, ANY, AUTH,0,0,0,0 },
+};
+
+/* Exported descriptor for the local lock: all capability masks are empty. */
+const struct sm_t sm_locallock = {
+ .states = locallock,
+ .allowed_ever_auth = 0,
+ .allowed_ever_replica = 0,
+ .careful = 0,
+ .can_remote_xlock = 0,
+};
diff --git a/src/mds/locks.h b/src/mds/locks.h
new file mode 100644
index 00000000..e6fdc1cf
--- /dev/null
+++ b/src/mds/locks.h
@@ -0,0 +1,126 @@
+#ifndef CEPH_MDS_LOCKS_H
+#define CEPH_MDS_LOCKS_H
+#include <stdbool.h>
+
+/*
+ * One row of a lock state machine table.  Rows in locks.c are initialised
+ * positionally, so the field order here is part of the table format.
+ * The can_* fields hold 0 or one of ANY / AUTH / XCL / REQ.
+ */
+struct sm_state_t {
+ int next; // 0 if stable
+ bool loner;
+ int replica_state; // LOCK_* state recorded for replicas in this state
+ char can_read;
+ char can_read_projected;
+ char can_rdlock;
+ char can_wrlock;
+ char can_force_wrlock;
+ char can_lease;
+ char can_xlock;
+ int caps; // CEPH_CAP_G* mask, "any" column of the tables
+ int loner_caps; // CEPH_CAP_G* mask, "loner" column
+ int xlocker_caps; // CEPH_CAP_G* mask, "xlocker" column
+ int replica_caps; // CEPH_CAP_G* mask, "replica" column
+};
+
+/*
+ * Descriptor of one lock type: its state table plus capability masks.
+ * NOTE(review): the exact meaning of "careful" is not visible in this file;
+ * confirm against the code that consumes sm_t.
+ */
+struct sm_t {
+ const struct sm_state_t *states; // table indexed by LOCK_* state
+ int allowed_ever_auth; // CEPH_CAP_G* mask
+ int allowed_ever_replica; // CEPH_CAP_G* mask
+ int careful; // CEPH_CAP_G* mask
+ int can_remote_xlock; // used as a 0/1 flag by the instances in locks.c
+};
+
+// Permission levels stored in the can_* fields of struct sm_state_t
+// (0 is used in the tables where none of these applies).
+#define ANY 1 // auth or replica
+#define AUTH 2 // auth only
+#define XCL 3 // auth or exclusive client
+//#define FW 4 // fw to auth, if replica
+#define REQ 5 // req state change from auth, if replica
+
+// Lock type descriptors defined in locks.c.
+extern const struct sm_t sm_simplelock;
+extern const struct sm_t sm_filelock;
+extern const struct sm_t sm_scatterlock;
+extern const struct sm_t sm_locallock;
+
+
+
+// -- lock states --
+// sync <-> lock
+// These values index the sm_state_t tables in locks.c, which are sized
+// LOCK_MAX; LOCK_MAX must therefore stay the last enumerator.
+// LOCK_UNDEF is 0, the same value the tables use for "no next state".
+enum {
+ LOCK_UNDEF = 0,
+
+ // auth rep
+ LOCK_SYNC, // AR R . RD L . / C . R RD L . / C .
+ LOCK_LOCK, // AR R . .. . X / . . . .. . . / . .
+
+ LOCK_PREXLOCK, // A . . .. . . / . . (lock)
+ LOCK_XLOCK, // A . . .. . . / . . (lock)
+ LOCK_XLOCKDONE, // A r p rd l x / . . (lock) <-- by same client only!!
+ LOCK_XLOCKSNAP, // also revoke Fb
+ LOCK_LOCK_XLOCK,
+
+ LOCK_SYNC_LOCK, // AR R . .. . . / . . R .. . . / . .
+ LOCK_LOCK_SYNC, // A R p rd l . / . . (lock) <-- lc by same client only
+
+ LOCK_EXCL, // A . . .. . . / c x * (lock)
+ LOCK_EXCL_SYNC, // A . . .. . . / c . * (lock)
+ LOCK_EXCL_LOCK, // A . . .. . . / . . (lock)
+ LOCK_SYNC_EXCL, // Ar R . .. . . / c . * (sync->lock)
+ LOCK_LOCK_EXCL, // A R . .. . . / . . (lock)
+
+ LOCK_REMOTEXLOCK, // on NON-auth
+
+ // * = loner mode
+
+ LOCK_MIX,
+ LOCK_SYNC_MIX,
+ LOCK_SYNC_MIX2,
+ LOCK_LOCK_MIX,
+ LOCK_EXCL_MIX,
+ LOCK_MIX_SYNC,
+ LOCK_MIX_SYNC2,
+ LOCK_MIX_LOCK,
+ LOCK_MIX_LOCK2,
+ LOCK_MIX_EXCL,
+
+ LOCK_TSYN,
+ LOCK_TSYN_LOCK,
+ LOCK_TSYN_MIX,
+ LOCK_LOCK_TSYN,
+ LOCK_MIX_TSYN,
+
+ LOCK_PRE_SCAN,
+ LOCK_SCAN,
+
+ LOCK_SNAP_SYNC,
+
+ LOCK_XSYN,
+ LOCK_XSYN_EXCL,
+ LOCK_EXCL_XSYN,
+ LOCK_XSYN_SYNC,
+ LOCK_XSYN_LOCK,
+ LOCK_XSYN_MIX,
+
+ LOCK_MAX,
+};
+
+// -------------------------
+// lock actions
+//
+// The sign of an action code encodes its target: negative codes are for
+// replicas, positive codes are for the auth.  The two predicate macros at
+// the end rely on that convention.
+
+// for replicas
+#define LOCK_AC_SYNC -1
+#define LOCK_AC_MIX -2
+#define LOCK_AC_LOCK -3
+#define LOCK_AC_LOCKFLUSHED -4
+
+// for auth
+#define LOCK_AC_SYNCACK 1
+#define LOCK_AC_MIXACK 2
+#define LOCK_AC_LOCKACK 3
+
+#define LOCK_AC_REQSCATTER 7
+#define LOCK_AC_REQUNSCATTER 8
+#define LOCK_AC_NUDGE 9
+#define LOCK_AC_REQRDLOCK 10
+
+// True when action code a is addressed to a replica / to the auth.
+#define LOCK_AC_FOR_REPLICA(a) ((a) < 0)
+#define LOCK_AC_FOR_AUTH(a) ((a) > 0)
+
+
+#endif
diff --git a/src/mds/mds_table_types.h b/src/mds/mds_table_types.h
new file mode 100644
index 00000000..bfb2baa9
--- /dev/null
+++ b/src/mds/mds_table_types.h
@@ -0,0 +1,81 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#ifndef CEPH_MDSTABLETYPES_H
+#define CEPH_MDSTABLETYPES_H
+
+// MDS TABLES
+
+#include <string_view>
+
+// Identifiers of the MDS tables; get_mdstable_name() maps them to names.
+enum {
+ TABLE_ANCHOR,
+ TABLE_SNAP,
+};
+
+// Return the printable name of MDS table t; aborts on an unknown id.
+inline std::string_view get_mdstable_name(int t) {
+  if (t == TABLE_ANCHOR)
+    return "anchortable";
+  if (t == TABLE_SNAP)
+    return "snaptable";
+  ceph_abort();
+  return std::string_view();
+}
+
+// Table server opcodes; get_mdstableserver_opname() maps them to names.
+// NOTE(review): some opcodes are negative; presumably the sign marks the
+// message direction -- confirm against the table client/server code.
+enum {
+ TABLESERVER_OP_QUERY = 1,
+ TABLESERVER_OP_QUERY_REPLY = -2,
+ TABLESERVER_OP_PREPARE = 3,
+ TABLESERVER_OP_AGREE = -4,
+ TABLESERVER_OP_COMMIT = 5,
+ TABLESERVER_OP_ACK = -6,
+ TABLESERVER_OP_ROLLBACK = 7,
+ TABLESERVER_OP_SERVER_UPDATE = 8,
+ TABLESERVER_OP_SERVER_READY = -9,
+ TABLESERVER_OP_NOTIFY_ACK = 10,
+ TABLESERVER_OP_NOTIFY_PREP = -11,
+};
+
+// Return the printable name of table server opcode op; aborts on an
+// unknown opcode.
+inline std::string_view get_mdstableserver_opname(int op) {
+  std::string_view opname;
+  switch (op) {
+  case TABLESERVER_OP_QUERY:         opname = "query"; break;
+  case TABLESERVER_OP_QUERY_REPLY:   opname = "query_reply"; break;
+  case TABLESERVER_OP_PREPARE:       opname = "prepare"; break;
+  case TABLESERVER_OP_AGREE:         opname = "agree"; break;
+  case TABLESERVER_OP_COMMIT:        opname = "commit"; break;
+  case TABLESERVER_OP_ACK:           opname = "ack"; break;
+  case TABLESERVER_OP_ROLLBACK:      opname = "rollback"; break;
+  case TABLESERVER_OP_SERVER_UPDATE: opname = "server_update"; break;
+  case TABLESERVER_OP_SERVER_READY:  opname = "server_ready"; break;
+  case TABLESERVER_OP_NOTIFY_ACK:    opname = "notify_ack"; break;
+  case TABLESERVER_OP_NOTIFY_PREP:   opname = "notify_prep"; break;
+  default:
+    ceph_abort();
+    break;
+  }
+  return opname;
+}
+
+// Table operation kinds; get_mdstable_opname() maps them to names.
+enum {
+ TABLE_OP_CREATE,
+ TABLE_OP_UPDATE,
+ TABLE_OP_DESTROY,
+};
+
+// Return the printable name of table operation op; aborts on an unknown
+// value.
+inline std::string_view get_mdstable_opname(int op) {
+  if (op == TABLE_OP_CREATE)
+    return "create";
+  if (op == TABLE_OP_UPDATE)
+    return "update";
+  if (op == TABLE_OP_DESTROY)
+    return "destroy";
+  ceph_abort();
+  return std::string_view();
+}
+
+#endif
diff --git a/src/mds/mdstypes.cc b/src/mds/mdstypes.cc
new file mode 100644
index 00000000..a55c8559
--- /dev/null
+++ b/src/mds/mdstypes.cc
@@ -0,0 +1,895 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "mdstypes.h"
+#include "MDSContext.h"
+#include "common/Formatter.h"
+
+// Sentinel GID value 0, named "none".
+const mds_gid_t MDS_GID_NONE = mds_gid_t(0);
+
+
+/*
+ * frag_info_t
+ */
+
+// Serialise this frag_info_t inside an ENCODE_START(3, 2) envelope.
+// The field order below is the wire format and must match decode().
+void frag_info_t::encode(bufferlist &bl) const
+{
+ ENCODE_START(3, 2, bl);
+ encode(version, bl);
+ encode(mtime, bl);
+ encode(nfiles, bl);
+ encode(nsubdirs, bl);
+ encode(change_attr, bl);
+ ENCODE_FINISH(bl);
+}
+
+// Deserialise a frag_info_t.  change_attr only exists from struct version 3
+// on; for older encodings it is reset to 0.
+void frag_info_t::decode(bufferlist::const_iterator &bl)
+{
+ DECODE_START_LEGACY_COMPAT_LEN(3, 2, 2, bl);
+ decode(version, bl);
+ decode(mtime, bl);
+ decode(nfiles, bl);
+ decode(nsubdirs, bl);
+ if (struct_v >= 3)
+ decode(change_attr, bl);
+ else
+ change_attr = 0;
+ DECODE_FINISH(bl);
+}
+
+// Write version, mtime and the file/subdir counts to f.
+// Note that change_attr is encoded but not dumped here.
+void frag_info_t::dump(Formatter *f) const
+{
+ f->dump_unsigned("version", version);
+ f->dump_stream("mtime") << mtime;
+ f->dump_unsigned("num_files", nfiles);
+ f->dump_unsigned("num_subdirs", nsubdirs);
+}
+
+// Append two heap-allocated test instances to ls: a default-constructed one
+// and one with every dumped field set.  The caller owns the pointers.
+void frag_info_t::generate_test_instances(list<frag_info_t*>& ls)
+{
+  ls.push_back(new frag_info_t);
+
+  auto *populated = new frag_info_t;
+  populated->version = 1;
+  populated->mtime = utime_t(2, 3);
+  populated->nfiles = 4;
+  populated->nsubdirs = 5;
+  ls.push_back(populated);
+}
+
+// Compact one-line rendering: "f()" for a default-valued object, otherwise
+// "f(v<version>[ m<mtime>][ <size>=<nfiles>+<nsubdirs>])".
+ostream& operator<<(ostream &out, const frag_info_t &f)
+{
+  if (f == frag_info_t()) {
+    out << "f()";
+    return out;
+  }
+
+  out << "f(v" << f.version;
+  if (f.mtime != utime_t()) {
+    out << " m" << f.mtime;
+  }
+  if (f.nfiles || f.nsubdirs) {
+    out << " " << f.size() << "=" << f.nfiles << "+" << f.nsubdirs;
+  }
+  return out << ")";
+}
+
+
+/*
+ * nest_info_t
+ */
+
+// Serialise this nest_info_t inside an ENCODE_START(3, 2) envelope.
+// The field order below is the wire format and must match decode().
+void nest_info_t::encode(bufferlist &bl) const
+{
+ ENCODE_START(3, 2, bl);
+ encode(version, bl);
+ encode(rbytes, bl);
+ encode(rfiles, bl);
+ encode(rsubdirs, bl);
+ {
+ // removed field
+ // a zero placeholder is still written so the layout stays unchanged
+ int64_t ranchors = 0;
+ encode(ranchors, bl);
+ }
+ encode(rsnaps, bl);
+ encode(rctime, bl);
+ ENCODE_FINISH(bl);
+}
+
+// Deserialise a nest_info_t, reading fields in the order encode() wrote them.
+void nest_info_t::decode(bufferlist::const_iterator &bl)
+{
+ DECODE_START_LEGACY_COMPAT_LEN(3, 2, 2, bl);
+ decode(version, bl);
+ decode(rbytes, bl);
+ decode(rfiles, bl);
+ decode(rsubdirs, bl);
+ {
+ // placeholder for the removed ranchors field: read and discarded
+ int64_t ranchors;
+ decode(ranchors, bl);
+ }
+ decode(rsnaps, bl);
+ decode(rctime, bl);
+ DECODE_FINISH(bl);
+}
+
+// Write all recursive statistics of this object to f.
+void nest_info_t::dump(Formatter *f) const
+{
+ f->dump_unsigned("version", version);
+ f->dump_unsigned("rbytes", rbytes);
+ f->dump_unsigned("rfiles", rfiles);
+ f->dump_unsigned("rsubdirs", rsubdirs);
+ f->dump_unsigned("rsnaps", rsnaps);
+ f->dump_stream("rctime") << rctime;
+}
+
+// Append two heap-allocated test instances to ls: a default-constructed one
+// and a populated one.  The caller owns the pointers.
+void nest_info_t::generate_test_instances(list<nest_info_t*>& ls)
+{
+  ls.push_back(new nest_info_t);
+
+  auto *populated = new nest_info_t;
+  populated->version = 1;
+  populated->rbytes = 2;
+  populated->rfiles = 3;
+  populated->rsubdirs = 4;
+  populated->rsnaps = 6;
+  populated->rctime = utime_t(7, 8);
+  ls.push_back(populated);
+}
+
+// Compact one-line rendering: "n()" for a default-valued object, otherwise
+// "n(v<version>" followed by the non-default fields and ")".
+ostream& operator<<(ostream &out, const nest_info_t &n)
+{
+  if (n == nest_info_t()) {
+    out << "n()";
+    return out;
+  }
+
+  out << "n(v" << n.version;
+  if (n.rctime != utime_t()) {
+    out << " rc" << n.rctime;
+  }
+  if (n.rbytes) {
+    out << " b" << n.rbytes;
+  }
+  if (n.rsnaps) {
+    out << " rs" << n.rsnaps;
+  }
+  if (n.rfiles || n.rsubdirs) {
+    out << " " << n.rsize() << "=" << n.rfiles << "+" << n.rsubdirs;
+  }
+  return out << ")";
+}
+
+/*
+ * quota_info_t
+ */
+// Write both quota limits to f as signed integers.
+void quota_info_t::dump(Formatter *f) const
+{
+ f->dump_int("max_bytes", max_bytes);
+ f->dump_int("max_files", max_files);
+}
+
+// Append two heap-allocated test instances to ls: a default-constructed one
+// and one with both limits set to 16.  The caller owns the pointers.
+void quota_info_t::generate_test_instances(list<quota_info_t *>& ls)
+{
+  ls.push_back(new quota_info_t);
+
+  auto *limited = new quota_info_t;
+  limited->max_bytes = 16;
+  limited->max_files = 16;
+  ls.push_back(limited);
+}
+
+// Render as "quota(max_bytes = <n> max_files = <m>)".
+ostream& operator<<(ostream &out, const quota_info_t &n)
+{
+  out << "quota(";
+  out << "max_bytes = " << n.max_bytes;
+  out << " max_files = " << n.max_files;
+  out << ")";
+  return out;
+}
+
+/*
+ * client_writeable_range_t
+ */
+
+// Serialise the byte range (first, last) and follows inside an
+// ENCODE_START(2, 2) envelope.  Order must match decode().
+void client_writeable_range_t::encode(bufferlist &bl) const
+{
+ ENCODE_START(2, 2, bl);
+ encode(range.first, bl);
+ encode(range.last, bl);
+ encode(follows, bl);
+ ENCODE_FINISH(bl);
+}
+
+// Deserialise the fields in the order encode() wrote them.
+void client_writeable_range_t::decode(bufferlist::const_iterator& bl)
+{
+ DECODE_START_LEGACY_COMPAT_LEN(2, 2, 2, bl);
+ decode(range.first, bl);
+ decode(range.last, bl);
+ decode(follows, bl);
+ DECODE_FINISH(bl);
+}
+
+// Dump the range as a nested "byte range" object, followed by "follows"
+// at the enclosing level.
+void client_writeable_range_t::dump(Formatter *f) const
+{
+ f->open_object_section("byte range");
+ f->dump_unsigned("first", range.first);
+ f->dump_unsigned("last", range.last);
+ f->close_section();
+ f->dump_unsigned("follows", follows);
+}
+
+// Append two heap-allocated test instances to ls: a default-constructed one
+// and a populated one.  The caller owns the pointers.
+void client_writeable_range_t::generate_test_instances(list<client_writeable_range_t*>& ls)
+{
+  ls.push_back(new client_writeable_range_t);
+
+  auto *populated = new client_writeable_range_t;
+  populated->range.first = 123;
+  populated->range.last = 456;
+  populated->follows = 12;
+  ls.push_back(populated);
+}
+
+// Render as "<first>-<last>@<follows>".
+ostream& operator<<(ostream& out, const client_writeable_range_t& r)
+{
+  out << r.range.first << '-' << r.range.last;
+  out << "@" << r.follows;
+  return out;
+}
+
+/*
+ * inline_data_t
+ */
+// Encode the version followed by the inline data buffer.  When no buffer is
+// attached (blp is null) an empty bufferlist is written in its place.
+void inline_data_t::encode(bufferlist &bl) const
+{
+  using ceph::encode;
+  encode(version, bl);
+  if (!blp) {
+    encode(bufferlist(), bl);
+    return;
+  }
+  encode(*blp, bl);
+}
+// Decode the version and a length-prefixed payload.  A zero length releases
+// any existing buffer instead of decoding into it.
+void inline_data_t::decode(bufferlist::const_iterator &p)
+{
+  using ceph::decode;
+  decode(version, p);
+
+  uint32_t inline_len;
+  decode(inline_len, p);
+  if (inline_len == 0) {
+    free_data();
+  } else {
+    decode_nohead(inline_len, get_data(), p);
+  }
+}
+
+
+/*
+ * fnode_t
+ */
+// Serialise this fnode_t inside an ENCODE_START(4, 3) envelope.
+// The field order below is the wire format and must match decode().
+void fnode_t::encode(bufferlist &bl) const
+{
+ ENCODE_START(4, 3, bl);
+ encode(version, bl);
+ encode(snap_purged_thru, bl);
+ encode(fragstat, bl);
+ encode(accounted_fragstat, bl);
+ encode(rstat, bl);
+ encode(accounted_rstat, bl);
+ encode(damage_flags, bl);
+ encode(recursive_scrub_version, bl);
+ encode(recursive_scrub_stamp, bl);
+ encode(localized_scrub_version, bl);
+ encode(localized_scrub_stamp, bl);
+ ENCODE_FINISH(bl);
+}
+
+// Deserialise an fnode_t.  Fields added in later struct versions are only
+// read when the encoded struct_v says they are present; otherwise they keep
+// whatever value the object already holds.
+void fnode_t::decode(bufferlist::const_iterator &bl)
+{
+  // Decoder version 4 matches ENCODE_START(4, 3, ...) in fnode_t::encode:
+  // this code understands the v4 scrub fields read below.  The first macro
+  // argument is the decoder's own version, used for the compat check.
+  DECODE_START_LEGACY_COMPAT_LEN(4, 2, 2, bl);
+  decode(version, bl);
+  decode(snap_purged_thru, bl);
+  decode(fragstat, bl);
+  decode(accounted_fragstat, bl);
+  decode(rstat, bl);
+  decode(accounted_rstat, bl);
+  if (struct_v >= 3) {
+    decode(damage_flags, bl);
+  }
+  if (struct_v >= 4) {
+    decode(recursive_scrub_version, bl);
+    decode(recursive_scrub_stamp, bl);
+    decode(localized_scrub_version, bl);
+    decode(localized_scrub_stamp, bl);
+  }
+  DECODE_FINISH(bl);
+}
+
+// Dump version, snap_purged_thru and the four stat members, each stat as
+// its own nested object.  damage_flags and the scrub fields are encoded but
+// not dumped here.
+void fnode_t::dump(Formatter *f) const
+{
+ f->dump_unsigned("version", version);
+ f->dump_unsigned("snap_purged_thru", snap_purged_thru);
+
+ f->open_object_section("fragstat");
+ fragstat.dump(f);
+ f->close_section();
+
+ f->open_object_section("accounted_fragstat");
+ accounted_fragstat.dump(f);
+ f->close_section();
+
+ f->open_object_section("rstat");
+ rstat.dump(f);
+ f->close_section();
+
+ f->open_object_section("accounted_rstat");
+ accounted_rstat.dump(f);
+ f->close_section();
+}
+
+// Append two heap-allocated test instances to ls: a default-constructed one
+// and one populated from the frag_info_t / nest_info_t test instances.
+// The caller owns the pointers placed in ls.
+void fnode_t::generate_test_instances(list<fnode_t*>& ls)
+{
+  ls.push_back(new fnode_t);
+  ls.push_back(new fnode_t);
+  ls.back()->version = 1;
+  ls.back()->snap_purged_thru = 2;
+
+  list<frag_info_t*> fls;
+  frag_info_t::generate_test_instances(fls);
+  ls.back()->fragstat = *fls.back();
+  ls.back()->accounted_fragstat = *fls.front();
+  // The helper instances were copied by value above; free them so they do
+  // not leak.
+  for (auto *p : fls)
+    delete p;
+
+  list<nest_info_t*> nls;
+  nest_info_t::generate_test_instances(nls);
+  ls.back()->rstat = *nls.front();
+  ls.back()->accounted_rstat = *nls.back();
+  for (auto *p : nls)
+    delete p;
+}
+
+
+/*
+ * old_rstat_t
+ */
+// Serialise first, rstat and accounted_rstat inside an ENCODE_START(2, 2)
+// envelope.  Order must match decode().
+void old_rstat_t::encode(bufferlist& bl) const
+{
+ ENCODE_START(2, 2, bl);
+ encode(first, bl);
+ encode(rstat, bl);
+ encode(accounted_rstat, bl);
+ ENCODE_FINISH(bl);
+}
+
+// Deserialise the fields in the order encode() wrote them.
+void old_rstat_t::decode(bufferlist::const_iterator& bl)
+{
+ DECODE_START_LEGACY_COMPAT_LEN(2, 2, 2, bl);
+ decode(first, bl);
+ decode(rstat, bl);
+ decode(accounted_rstat, bl);
+ DECODE_FINISH(bl);
+}
+
+// Dump first under the key "snapid", then both rstat members as nested
+// objects.
+void old_rstat_t::dump(Formatter *f) const
+{
+ f->dump_unsigned("snapid", first);
+ f->open_object_section("rstat");
+ rstat.dump(f);
+ f->close_section();
+ f->open_object_section("accounted_rstat");
+ accounted_rstat.dump(f);
+ f->close_section();
+}
+
+// Append two heap-allocated test instances to ls: a default-constructed one
+// and one populated from the nest_info_t test instances.
+// The caller owns the pointers placed in ls.
+void old_rstat_t::generate_test_instances(list<old_rstat_t*>& ls)
+{
+  ls.push_back(new old_rstat_t());
+  ls.push_back(new old_rstat_t());
+  ls.back()->first = 12;
+
+  list<nest_info_t*> nls;
+  nest_info_t::generate_test_instances(nls);
+  ls.back()->rstat = *nls.back();
+  ls.back()->accounted_rstat = *nls.front();
+  // The helper instances were copied by value above; free them so they do
+  // not leak.
+  for (auto *p : nls)
+    delete p;
+}
+
+/*
+ * feature_bitset_t
+ */
+// Build the bitset from a plain integer.  A zero value leaves the block
+// vector empty; otherwise value is split into bits_per_block-sized blocks,
+// lowest bits first.
+feature_bitset_t::feature_bitset_t(unsigned long value)
+{
+  if (!value)
+    return;
+
+  const size_t value_bits = sizeof(value) * 8;
+  for (size_t shift = 0; shift < value_bits; shift += bits_per_block) {
+    _vec.push_back((block_type)(value >> shift));
+  }
+}
+
+// Build the bitset from a list of bit indices.  The list must be sorted in
+// non-decreasing order (asserted); its last element sizes the block vector.
+feature_bitset_t::feature_bitset_t(const vector<size_t>& array)
+{
+  if (array.empty())
+    return;
+
+  // Enough blocks to hold the highest (i.e. last) bit index.
+  const size_t nblocks = (array.back() + bits_per_block) / bits_per_block;
+  _vec.resize(nblocks, 0);
+
+  size_t highest = 0;
+  for (auto& bit : array) {
+    if (bit <= highest)
+      ceph_assert(bit == highest);
+    else
+      highest = bit;
+    const size_t block = bit / bits_per_block;
+    const size_t offset = bit % bits_per_block;
+    _vec[block] |= (block_type)1 << offset;
+  }
+}
+
+// Clear every bit that is set in other.  Only the blocks present in both
+// operands are touched.
+feature_bitset_t& feature_bitset_t::operator-=(const feature_bitset_t& other)
+{
+  size_t idx = 0;
+  while (idx < _vec.size() && idx < other._vec.size()) {
+    _vec[idx] &= ~other._vec[idx];
+    ++idx;
+  }
+  return *this;
+}
+
+// Encode as a 32-bit byte length followed by the raw blocks (no per-vector
+// header, since the length has already been written).
+void feature_bitset_t::encode(bufferlist& bl) const {
+ using ceph::encode;
+ using ceph::encode_nohead;
+ uint32_t len = _vec.size() * sizeof(block_type);
+ encode(len, bl);
+ encode_nohead(_vec, bl);
+}
+
+// Decode a byte length followed by that many bytes of block data.  The
+// length does not have to be a multiple of the block size: whole blocks are
+// decoded first, and any trailing bytes are copied into a zero-initialised
+// buffer and appended as one more block.
+void feature_bitset_t::decode(bufferlist::const_iterator &p) {
+ using ceph::decode;
+ using ceph::decode_nohead;
+ uint32_t len;
+ decode(len, p);
+
+ _vec.clear();
+ if (len >= sizeof(block_type))
+ decode_nohead(len / sizeof(block_type), _vec, p);
+
+ if (len % sizeof(block_type)) {
+ ceph_le64 buf{};
+ p.copy(len % sizeof(block_type), (char*)&buf);
+ _vec.push_back((block_type)buf);
+ }
+}
+
+// Print the bitset as "0x" followed by every block in zero-padded hex,
+// highest block first.  An empty bitset prints just "0x".
+// The stream's format flags and fill character are restored before
+// returning, so the caller's formatting state is not disturbed.
+void feature_bitset_t::print(ostream& out) const
+{
+  const std::ios_base::fmtflags saved_flags(out.flags());
+  out << "0x";
+  // The fill character is not part of fmtflags, so it has to be saved and
+  // restored separately; otherwise '0' would leak into later output.
+  const auto saved_fill = out.fill('0');
+  for (size_t i = _vec.size(); i-- > 0; )
+    out << std::setw(sizeof(block_type) * 2) << std::hex << _vec[i];
+  out.fill(saved_fill);
+  out.flags(saved_flags);
+}
+
+/*
+ * client_metadata_t
+ */
+// Serialise kv_map then features inside an ENCODE_START(2, 1) envelope.
+void client_metadata_t::encode(bufferlist& bl) const
+{
+ ENCODE_START(2, 1, bl);
+ encode(kv_map, bl);
+ encode(features, bl);
+ ENCODE_FINISH(bl);
+}
+
+// Deserialise kv_map, and features when the encoding is version 2 or newer.
+// For older encodings features is left untouched.
+void client_metadata_t::decode(bufferlist::const_iterator& p)
+{
+ DECODE_START(2, p);
+ decode(kv_map, p);
+ if (struct_v >= 2)
+ decode(features, p);
+ DECODE_FINISH(p);
+}
+
+// Dump features, then every kv_map entry as a string keyed by its own name.
+void client_metadata_t::dump(Formatter *f) const
+{
+ f->dump_stream("features") << features;
+ for (const auto& [name, val] : kv_map)
+ f->dump_string(name.c_str(), val);
+}
+
+/*
+ * session_info_t
+ */
+// Serialise this session_info_t inside an ENCODE_START(7, 7) envelope.
+// features is forwarded only to the encoding of inst.
+void session_info_t::encode(bufferlist& bl, uint64_t features) const
+{
+ ENCODE_START(7, 7, bl);
+ encode(inst, bl, features);
+ encode(completed_requests, bl);
+ encode(prealloc_inos, bl); // hacky, see below.
+ // (decode() folds used_inos back into prealloc_inos)
+ encode(used_inos, bl);
+ encode(completed_flushes, bl);
+ encode(auth_name, bl);
+ encode(client_metadata, bl);
+ ENCODE_FINISH(bl);
+}
+
+// Deserialise a session_info_t, accepting every struct version handled
+// below.  After decoding, used_inos is merged into prealloc_inos and then
+// cleared, so a decoded object never carries used inos.
+void session_info_t::decode(bufferlist::const_iterator& p)
+{
+  DECODE_START_LEGACY_COMPAT_LEN(7, 2, 2, p);
+  decode(inst, p);
+
+  if (struct_v > 2) {
+    decode(completed_requests, p);
+  } else {
+    // Old encodings stored only the set of tids; map each one to a
+    // default inodeno_t.
+    set<ceph_tid_t> tids;
+    decode(tids, p);
+    for (const auto& tid : tids) {
+      completed_requests[tid] = inodeno_t();
+    }
+  }
+
+  decode(prealloc_inos, p);
+  decode(used_inos, p);
+  prealloc_inos.insert(used_inos);
+  used_inos.clear();
+
+  // Versions 4..6 stored the bare key/value map at this position; from
+  // version 7 on the full client_metadata is stored at the end instead.
+  if (struct_v >= 4 && struct_v < 7) {
+    decode(client_metadata.kv_map, p);
+  }
+  if (struct_v >= 5) {
+    decode(completed_flushes, p);
+  }
+  if (struct_v >= 6) {
+    decode(auth_name, p);
+  }
+  if (struct_v >= 7) {
+    decode(client_metadata, p);
+  }
+  DECODE_FINISH(p);
+}
+
+// Dump inst, the completed requests, both inode interval sets (as arrays of
+// start/length objects) and the client metadata.
+void session_info_t::dump(Formatter *f) const
+{
+ f->dump_stream("inst") << inst;
+
+ f->open_array_section("completed_requests");
+ for (const auto& [tid, ino] : completed_requests) {
+ f->open_object_section("request");
+ f->dump_unsigned("tid", tid);
+ f->dump_stream("created_ino") << ino;
+ f->close_section();
+ }
+ f->close_section();
+
+ f->open_array_section("prealloc_inos");
+ for (const auto& [start, len] : prealloc_inos) {
+ f->open_object_section("ino_range");
+ f->dump_unsigned("start", start);
+ f->dump_unsigned("length", len);
+ f->close_section();
+ }
+ f->close_section();
+
+ f->open_array_section("used_inos");
+ for (const auto& [start, len] : used_inos) {
+ f->open_object_section("ino_range");
+ f->dump_unsigned("start", start);
+ f->dump_unsigned("length", len);
+ f->close_section();
+ }
+ f->close_section();
+
+ f->dump_object("client_metadata", client_metadata);
+}
+
+// Append two heap-allocated test instances to ls: a default-constructed one
+// and a populated one.  The caller owns the pointers.
+void session_info_t::generate_test_instances(list<session_info_t*>& ls)
+{
+  ls.push_back(new session_info_t);
+
+  auto *populated = new session_info_t;
+  populated->inst = entity_inst_t(entity_name_t::MDS(12), entity_addr_t());
+  populated->completed_requests.insert(make_pair(234, inodeno_t(111222)));
+  populated->completed_requests.insert(make_pair(237, inodeno_t(222333)));
+  populated->prealloc_inos.insert(333, 12);
+  populated->prealloc_inos.insert(377, 112);
+  // we can't add used inos; they're cleared on decode
+  ls.push_back(populated);
+}
+
+
+/*
+ * string_snap_t
+ */
+// Serialise name then snapid inside an ENCODE_START(2, 2) envelope.
+void string_snap_t::encode(bufferlist& bl) const
+{
+ ENCODE_START(2, 2, bl);
+ encode(name, bl);
+ encode(snapid, bl);
+ ENCODE_FINISH(bl);
+}
+
+// Deserialise name then snapid, mirroring encode().
+void string_snap_t::decode(bufferlist::const_iterator& bl)
+{
+ DECODE_START_LEGACY_COMPAT_LEN(2, 2, 2, bl);
+ decode(name, bl);
+ decode(snapid, bl);
+ DECODE_FINISH(bl);
+}
+
+// Write name and snapid to f.
+void string_snap_t::dump(Formatter *f) const
+{
+ f->dump_string("name", name);
+ f->dump_unsigned("snapid", snapid);
+}
+
+// Append three heap-allocated test instances to ls: a default-constructed
+// one followed by two named snapshots.  The caller owns the pointers.
+void string_snap_t::generate_test_instances(list<string_snap_t*>& ls)
+{
+  ls.push_back(new string_snap_t);
+
+  auto *foo = new string_snap_t;
+  foo->name = "foo";
+  foo->snapid = 123;
+  ls.push_back(foo);
+
+  auto *bar = new string_snap_t;
+  bar->name = "bar";
+  bar->snapid = 456;
+  ls.push_back(bar);
+}
+
+
+/*
+ * MDSCacheObjectInfo
+ */
+// Serialise ino, dirfrag, dname and snapid inside an ENCODE_START(2, 2)
+// envelope.  Order must match decode().
+void MDSCacheObjectInfo::encode(bufferlist& bl) const
+{
+ ENCODE_START(2, 2, bl);
+ encode(ino, bl);
+ encode(dirfrag, bl);
+ encode(dname, bl);
+ encode(snapid, bl);
+ ENCODE_FINISH(bl);
+}
+
+// Deserialise the fields in the order encode() wrote them.
+void MDSCacheObjectInfo::decode(bufferlist::const_iterator& p)
+{
+ DECODE_START_LEGACY_COMPAT_LEN(2, 2, 2, p);
+ decode(ino, p);
+ decode(dirfrag, p);
+ decode(dname, p);
+ decode(snapid, p);
+ DECODE_FINISH(p);
+}
+
+// Write all four fields to f; dname is dumped under the key "name".
+void MDSCacheObjectInfo::dump(Formatter *f) const
+{
+ f->dump_unsigned("ino", ino);
+ f->dump_stream("dirfrag") << dirfrag;
+ f->dump_string("name", dname);
+ f->dump_unsigned("snapid", snapid);
+}
+
+// Append three heap-allocated test instances to ls: a default-constructed
+// one followed by two populated ones.  The caller owns the pointers.
+void MDSCacheObjectInfo::generate_test_instances(list<MDSCacheObjectInfo*>& ls)
+{
+  ls.push_back(new MDSCacheObjectInfo);
+
+  auto *head = new MDSCacheObjectInfo;
+  head->ino = 1;
+  head->dirfrag = dirfrag_t(2, 3);
+  head->dname = "fooname";
+  head->snapid = CEPH_NOSNAP;
+  ls.push_back(head);
+
+  auto *snapped = new MDSCacheObjectInfo;
+  snapped->ino = 121;
+  snapped->dirfrag = dirfrag_t(222, 0);
+  snapped->dname = "bar foo";
+  snapped->snapid = 21322;
+  ls.push_back(snapped);
+}
+
+/*
+ * mds_table_pending_t
+ */
+// Serialise reqid, mds and tid inside an ENCODE_START(2, 2) envelope.
+void mds_table_pending_t::encode(bufferlist& bl) const
+{
+ ENCODE_START(2, 2, bl);
+ encode(reqid, bl);
+ encode(mds, bl);
+ encode(tid, bl);
+ ENCODE_FINISH(bl);
+}
+
+// Deserialise the fields in the order encode() wrote them.
+void mds_table_pending_t::decode(bufferlist::const_iterator& bl)
+{
+ DECODE_START_LEGACY_COMPAT_LEN(2, 2, 2, bl);
+ decode(reqid, bl);
+ decode(mds, bl);
+ decode(tid, bl);
+ DECODE_FINISH(bl);
+}
+
+// Write reqid, mds and tid to f, all as unsigned values.
+void mds_table_pending_t::dump(Formatter *f) const
+{
+ f->dump_unsigned("reqid", reqid);
+ f->dump_unsigned("mds", mds);
+ f->dump_unsigned("tid", tid);
+}
+
+// Append two heap-allocated test instances to ls: a default-constructed one
+// and a populated one.  The caller owns the pointers.
+void mds_table_pending_t::generate_test_instances(list<mds_table_pending_t*>& ls)
+{
+  ls.push_back(new mds_table_pending_t);
+
+  auto *populated = new mds_table_pending_t;
+  populated->reqid = 234;
+  populated->mds = 2;
+  populated->tid = 35434;
+  ls.push_back(populated);
+}
+
+
+/*
+ * inode_load_vec_t
+ */
+// Serialise every element of vec back to back inside an ENCODE_START(2, 2)
+// envelope.  No element count is written.
+void inode_load_vec_t::encode(bufferlist &bl) const
+{
+ ENCODE_START(2, 2, bl);
+ for (const auto &i : vec) {
+ encode(i, bl);
+ }
+ ENCODE_FINISH(bl);
+}
+
+// Deserialise into the existing elements of vec.  Since no count is stored,
+// exactly as many elements are read as vec currently holds.
+void inode_load_vec_t::decode(bufferlist::const_iterator &p)
+{
+ DECODE_START_LEGACY_COMPAT_LEN(2, 2, 2, p);
+ for (auto &i : vec) {
+ decode(i, p);
+ }
+ DECODE_FINISH(p);
+}
+
+// Dump every element of vec as an object inside a "Decay Counters" array.
+void inode_load_vec_t::dump(Formatter *f) const
+{
+ f->open_array_section("Decay Counters");
+ for (const auto &i : vec) {
+ f->open_object_section("Decay Counter");
+ i.dump(f);
+ f->close_section();
+ }
+ f->close_section();
+}
+
+// Append one heap-allocated instance built from a default DecayRate.
+// The caller owns the pointer.
+void inode_load_vec_t::generate_test_instances(list<inode_load_vec_t*>& ls)
+{
+ ls.push_back(new inode_load_vec_t(DecayRate()));
+}
+
+
+/*
+ * dirfrag_load_vec_t
+ */
+// Dump every element of vec as an object inside a "Decay Counters" array.
+void dirfrag_load_vec_t::dump(Formatter *f) const
+{
+ f->open_array_section("Decay Counters");
+ for (const auto &i : vec) {
+ f->open_object_section("Decay Counter");
+ i.dump(f);
+ f->close_section();
+ }
+ f->close_section();
+}
+
+// Dump the aggregate meta_load() plus the current value of each named
+// popularity counter as floats.  The rate parameter is not used by this
+// body.
+void dirfrag_load_vec_t::dump(Formatter *f, const DecayRate& rate) const
+{
+ f->dump_float("meta_load", meta_load());
+ f->dump_float("IRD", get(META_POP_IRD).get());
+ f->dump_float("IWR", get(META_POP_IWR).get());
+ f->dump_float("READDIR", get(META_POP_READDIR).get());
+ f->dump_float("FETCH", get(META_POP_FETCH).get());
+ f->dump_float("STORE", get(META_POP_STORE).get());
+}
+
+// Append one heap-allocated instance built from a default DecayRate.
+// The caller owns the pointer.
+void dirfrag_load_vec_t::generate_test_instances(std::list<dirfrag_load_vec_t*>& ls)
+{
+ ls.push_back(new dirfrag_load_vec_t(DecayRate()));
+}
+
+/*
+ * mds_load_t
+ */
+// Serialise this mds_load_t inside an ENCODE_START(2, 2) envelope.
+// The field order below is the wire format and must match decode().
+void mds_load_t::encode(bufferlist &bl) const {
+ ENCODE_START(2, 2, bl);
+ encode(auth, bl);
+ encode(all, bl);
+ encode(req_rate, bl);
+ encode(cache_hit_rate, bl);
+ encode(queue_len, bl);
+ encode(cpu_load_avg, bl);
+ ENCODE_FINISH(bl);
+}
+
+// Deserialise the fields in the order encode() wrote them.
+void mds_load_t::decode(bufferlist::const_iterator &bl) {
+ DECODE_START_LEGACY_COMPAT_LEN(2, 2, 2, bl);
+ decode(auth, bl);
+ decode(all, bl);
+ decode(req_rate, bl);
+ decode(cache_hit_rate, bl);
+ decode(queue_len, bl);
+ decode(cpu_load_avg, bl);
+ DECODE_FINISH(bl);
+}
+
+// Dump the scalar load figures as floats, then the auth and all load
+// vectors as nested objects.
+void mds_load_t::dump(Formatter *f) const
+{
+ f->dump_float("request rate", req_rate);
+ f->dump_float("cache hit rate", cache_hit_rate);
+ f->dump_float("queue length", queue_len);
+ f->dump_float("cpu load", cpu_load_avg);
+ f->open_object_section("auth dirfrag");
+ auth.dump(f);
+ f->close_section();
+ f->open_object_section("all dirfrags");
+ all.dump(f);
+ f->close_section();
+}
+
+// Append one heap-allocated instance built from a default DecayRate.
+// The caller owns the pointer.
+void mds_load_t::generate_test_instances(std::list<mds_load_t*>& ls)
+{
+ ls.push_back(new mds_load_t(DecayRate()));
+}
+
+/*
+ * cap_reconnect_t
+ */
+// Versioned encoding (ENCODE_START(2, 1)): the legacy payload written by
+// encode_old() followed by snap_follows.
+void cap_reconnect_t::encode(bufferlist& bl) const {
+ ENCODE_START(2, 1, bl);
+ encode_old(bl); // extract out when something changes
+ encode(snap_follows, bl);
+ ENCODE_FINISH(bl);
+}
+
+// Legacy (unversioned) encoding: path, capinfo, then the raw flock buffer.
+// capinfo.flock_len is refreshed from flockbl first so the decoder knows how
+// many bytes follow.
+// NOTE(review): this assigns a member from a const method, so capinfo is
+// presumably declared mutable -- confirm in the header.
+void cap_reconnect_t::encode_old(bufferlist& bl) const {
+ using ceph::encode;
+ encode(path, bl);
+ capinfo.flock_len = flockbl.length();
+ encode(capinfo, bl);
+ encode_nohead(flockbl, bl);
+}
+
+// Versioned decoding: the legacy payload, then snap_follows when the
+// encoding is version 2 or newer.
+void cap_reconnect_t::decode(bufferlist::const_iterator& bl) {
+ DECODE_START(2, bl);
+ decode_old(bl); // extract out when something changes
+ if (struct_v >= 2)
+ decode(snap_follows, bl);
+ DECODE_FINISH(bl);
+}
+
+// Legacy decoding: path, capinfo, then capinfo.flock_len bytes of flock data.
+void cap_reconnect_t::decode_old(bufferlist::const_iterator& bl) {
+ using ceph::decode;
+ decode(path, bl);
+ decode(capinfo, bl);
+ decode_nohead(capinfo.flock_len, flockbl, bl);
+}
+
+// Dump the path and the capinfo fields; the wanted/issued cap masks are
+// rendered through ccap_string(), and the presence of file locks is reported
+// as the string "true" or "false".
+void cap_reconnect_t::dump(Formatter *f) const
+{
+ f->dump_string("path", path);
+ f->dump_int("cap_id", capinfo.cap_id);
+ f->dump_string("cap wanted", ccap_string(capinfo.wanted));
+ f->dump_string("cap issued", ccap_string(capinfo.issued));
+ f->dump_int("snaprealm", capinfo.snaprealm);
+ f->dump_int("path base ino", capinfo.pathbase);
+ f->dump_string("has file locks", capinfo.flock_len ? "true" : "false");
+}
+
+// Append one heap-allocated test instance to ls.  The caller owns the
+// pointer.
+void cap_reconnect_t::generate_test_instances(list<cap_reconnect_t*>& ls)
+{
+  auto *sample = new cap_reconnect_t;
+  sample->path = "/test/path";
+  sample->capinfo.cap_id = 1;
+  ls.push_back(sample);
+}
+
+/*
+ * snaprealm_reconnect_t
+ */
+// Versioned encoding (ENCODE_START(1, 1)) wrapping the legacy payload.
+void snaprealm_reconnect_t::encode(bufferlist& bl) const {
+ ENCODE_START(1, 1, bl);
+ encode_old(bl); // extract out when something changes
+ ENCODE_FINISH(bl);
+}
+
+// Legacy (unversioned) encoding: just the realm record.
+void snaprealm_reconnect_t::encode_old(bufferlist& bl) const {
+ using ceph::encode;
+ encode(realm, bl);
+}
+
+// Versioned decoding wrapping the legacy payload.
+void snaprealm_reconnect_t::decode(bufferlist::const_iterator& bl) {
+ DECODE_START(1, bl);
+ decode_old(bl); // extract out when something changes
+ DECODE_FINISH(bl);
+}
+
+// Legacy decoding: just the realm record.
+void snaprealm_reconnect_t::decode_old(bufferlist::const_iterator& bl) {
+ using ceph::decode;
+ decode(realm, bl);
+}
+
+// Write the realm's ino, seq and parent to f as signed integers.
+void snaprealm_reconnect_t::dump(Formatter *f) const
+{
+ f->dump_int("ino", realm.ino);
+ f->dump_int("seq", realm.seq);
+ f->dump_int("parent", realm.parent);
+}
+
+// Append one heap-allocated test instance to ls.  The caller owns the
+// pointer.
+void snaprealm_reconnect_t::generate_test_instances(list<snaprealm_reconnect_t*>& ls)
+{
+  auto *sample = new snaprealm_reconnect_t;
+  sample->realm.ino = 0x10000000001ULL;
+  sample->realm.seq = 2;
+  sample->realm.parent = 1;
+  ls.push_back(sample);
+}
+
+
+// Render a role as "<fscid>:<rank>".
+ostream& operator<<(ostream &out, const mds_role_t &role)
+{
+  return out << role.fscid << ":" << role.rank;
+}
+
diff --git a/src/mds/mdstypes.h b/src/mds/mdstypes.h
new file mode 100644
index 00000000..d241030a
--- /dev/null
+++ b/src/mds/mdstypes.h
@@ -0,0 +1,1821 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+#ifndef CEPH_MDSTYPES_H
+#define CEPH_MDSTYPES_H
+
+#include "include/int_types.h"
+
+#include <math.h>
+#include <ostream>
+#include <set>
+#include <map>
+#include <string_view>
+
+#include "common/config.h"
+#include "common/Clock.h"
+#include "common/DecayCounter.h"
+#include "common/entity_name.h"
+
+#include "include/Context.h"
+#include "include/frag.h"
+#include "include/xlist.h"
+#include "include/interval_set.h"
+#include "include/compact_map.h"
+#include "include/compact_set.h"
+#include "include/fs_types.h"
+
+#include "inode_backtrace.h"
+
+#include <boost/spirit/include/qi.hpp>
+#include <boost/pool/pool.hpp>
+#include "include/ceph_assert.h"
+#include <boost/serialization/strong_typedef.hpp>
+
+#define CEPH_FS_ONDISK_MAGIC "ceph fs volume v011"
+
+#define MDS_PORT_CACHE 0x200
+#define MDS_PORT_LOCKER 0x300
+#define MDS_PORT_MIGRATOR 0x400
+
+#define MAX_MDS 0x100
+#define NUM_STRAY 10
+
+#define MDS_INO_ROOT 1
+
+// No longer created but recognised in existing filesystems
+// so that we don't try to fragment it.
+#define MDS_INO_CEPH 2
+
+#define MDS_INO_GLOBAL_SNAPREALM 3
+
+#define MDS_INO_MDSDIR_OFFSET (1*MAX_MDS)
+#define MDS_INO_STRAY_OFFSET (6*MAX_MDS)
+
+// Locations for journal data
+#define MDS_INO_LOG_OFFSET (2*MAX_MDS)
+#define MDS_INO_LOG_BACKUP_OFFSET (3*MAX_MDS)
+#define MDS_INO_LOG_POINTER_OFFSET (4*MAX_MDS)
+#define MDS_INO_PURGE_QUEUE (5*MAX_MDS)
+
+#define MDS_INO_SYSTEM_BASE ((6*MAX_MDS) + (MAX_MDS * NUM_STRAY))
+
+// Inode number for stray slot `i` of owner `x`; the inverse mappings are
+// MDS_INO_STRAY_OWNER and MDS_INO_STRAY_INDEX.
+#define MDS_INO_STRAY(x,i) (MDS_INO_STRAY_OFFSET+((((unsigned)(x))*NUM_STRAY)+((unsigned)(i))))
+// Inode number of the mdsdir of owner `x` (inverse: MDS_INO_MDSDIR_OWNER).
+// The argument is parenthesized before the cast so that an expression
+// argument is cast as a whole rather than only its first operand.
+#define MDS_INO_MDSDIR(x) (MDS_INO_MDSDIR_OFFSET+((unsigned)(x)))
+
+#define MDS_INO_IS_STRAY(i) ((i) >= MDS_INO_STRAY_OFFSET && (i) < (MDS_INO_STRAY_OFFSET+(MAX_MDS*NUM_STRAY)))
+#define MDS_INO_IS_MDSDIR(i) ((i) >= MDS_INO_MDSDIR_OFFSET && (i) < (MDS_INO_MDSDIR_OFFSET+MAX_MDS))
+#define MDS_INO_MDSDIR_OWNER(i) (signed ((unsigned (i)) - MDS_INO_MDSDIR_OFFSET))
+#define MDS_INO_IS_BASE(i) ((i) == MDS_INO_ROOT || (i) == MDS_INO_GLOBAL_SNAPREALM || MDS_INO_IS_MDSDIR(i))
+#define MDS_INO_STRAY_OWNER(i) (signed (((unsigned (i)) - MDS_INO_STRAY_OFFSET) / NUM_STRAY))
+#define MDS_INO_STRAY_INDEX(i) (((unsigned (i)) - MDS_INO_STRAY_OFFSET) % NUM_STRAY)
+
+#define MDS_TRAVERSE_FORWARD 1
+#define MDS_TRAVERSE_DISCOVER 2 // skips permissions checks etc.
+#define MDS_TRAVERSE_DISCOVERXLOCK 3 // succeeds on (foreign?) null, xlocked dentries.
+
+
+typedef int32_t mds_rank_t;
+constexpr mds_rank_t MDS_RANK_NONE = -1;
+
+BOOST_STRONG_TYPEDEF(uint64_t, mds_gid_t)
+extern const mds_gid_t MDS_GID_NONE;
+
+typedef int32_t fs_cluster_id_t;
+constexpr fs_cluster_id_t FS_CLUSTER_ID_NONE = -1;
+// The namespace ID of the anonymous default filesystem from legacy systems
+constexpr fs_cluster_id_t FS_CLUSTER_ID_ANONYMOUS = 0;
+
+// A (filesystem cluster id, rank) pair.
+class mds_role_t
+{
+public:
+  fs_cluster_id_t fscid;
+  mds_rank_t rank;
+
+  // Default: both members set to their respective *_NONE constants.
+  mds_role_t()
+    : fscid(FS_CLUSTER_ID_NONE), rank(MDS_RANK_NONE)
+  {}
+  mds_role_t(fs_cluster_id_t fscid_, mds_rank_t rank_)
+    : fscid(fscid_), rank(rank_)
+  {}
+
+  // Lexicographic ordering: fscid first, rank as the tie-breaker.
+  bool operator<(mds_role_t const &rhs) const
+  {
+    if (fscid != rhs.fscid)
+      return fscid < rhs.fscid;
+    return rank < rhs.rank;
+  }
+
+  // Only the rank is examined; fscid is ignored.
+  bool is_none() const
+  {
+    return rank == MDS_RANK_NONE;
+  }
+};
+std::ostream& operator<<(std::ostream &out, const mds_role_t &role);
+
+
+// CAPS
+
+// Render a set of generic cap bits as a string with one letter per set
+// bit, always in the order s, x, c, r, w, b, a, l.
+inline string gcap_string(int cap)
+{
+  const struct { int bit; char letter; } generic_caps[] = {
+    { CEPH_CAP_GSHARED,   's' },
+    { CEPH_CAP_GEXCL,     'x' },
+    { CEPH_CAP_GCACHE,    'c' },
+    { CEPH_CAP_GRD,       'r' },
+    { CEPH_CAP_GWR,       'w' },
+    { CEPH_CAP_GBUFFER,   'b' },
+    { CEPH_CAP_GWREXTEND, 'a' },
+    { CEPH_CAP_GLAZYIO,   'l' },
+  };
+
+  string out;
+  for (const auto &gc : generic_caps) {
+    if (cap & gc.bit)
+      out += gc.letter;
+  }
+  return out;
+}
+// Render a full cap word as a string: "p" for the pin bit, then for each
+// non-empty field a tag letter (A, L, X, F) followed by gcap_string() of
+// that field's bits.  Returns "-" when nothing is set.
+inline string ccap_string(int cap)
+{
+  string out;
+  if (cap & CEPH_CAP_PIN)
+    out += "p";
+
+  // these three fields are masked to two bits each
+  const struct { int shift; char tag; } two_bit_fields[] = {
+    { CEPH_CAP_SAUTH,  'A' },
+    { CEPH_CAP_SLINK,  'L' },
+    { CEPH_CAP_SXATTR, 'X' },
+  };
+  for (const auto &fld : two_bit_fields) {
+    const int bits = (cap >> fld.shift) & 3;
+    if (bits) {
+      out += fld.tag;
+      out += gcap_string(bits);
+    }
+  }
+
+  // the file field takes everything above its shift, unmasked
+  const int file_bits = cap >> CEPH_CAP_SFILE;
+  if (file_bits) {
+    out += 'F';
+    out += gcap_string(file_bits);
+  }
+
+  if (out.empty())
+    out = "-";
+  return out;
+}
+
+
+// Common base of frag_info_t and nest_info_t; carries only a version
+// counter, which starts at 0.
+struct scatter_info_t {
+ version_t version = 0;
+
+ scatter_info_t() {}
+};
+
+// Per-fragment statistics: newest mtime / change_attr seen plus counts of
+// files and subdirectories.
+struct frag_info_t : public scatter_info_t {
+  // this frag
+  utime_t mtime;
+  uint64_t change_attr = 0;
+  int64_t nfiles = 0;        // files
+  int64_t nsubdirs = 0;      // subdirs
+
+  frag_info_t() {}
+
+  // total number of entries (files + subdirs)
+  int64_t size() const { return nfiles + nsubdirs; }
+
+  // reset every member (including the inherited version) to its default
+  void zero() {
+    *this = frag_info_t();
+  }
+
+  // *this += cur - acc;
+  // mtime and change_attr only ever move forward; the optional out-flags
+  // are set to true (never cleared) when the respective value advanced.
+  void add_delta(const frag_info_t &cur, const frag_info_t &acc, bool *touched_mtime=0, bool *touched_chattr=0) {
+    const bool mtime_advanced = cur.mtime > mtime;
+    if (mtime_advanced) {
+      mtime = cur.mtime;
+      if (touched_mtime)
+        *touched_mtime = true;
+    }
+    const bool chattr_advanced = cur.change_attr > change_attr;
+    if (chattr_advanced) {
+      change_attr = cur.change_attr;
+      if (touched_chattr)
+        *touched_chattr = true;
+    }
+    nfiles += cur.nfiles - acc.nfiles;
+    nsubdirs += cur.nsubdirs - acc.nsubdirs;
+  }
+
+  // Accumulate other into *this: counts are summed, mtime/change_attr
+  // take the larger of the two values.
+  void add(const frag_info_t& other) {
+    nfiles += other.nfiles;
+    nsubdirs += other.nsubdirs;
+    if (other.mtime > mtime)
+      mtime = other.mtime;
+    if (other.change_attr > change_attr)
+      change_attr = other.change_attr;
+  }
+
+  // True when the counts match o exactly and our mtime is not newer than
+  // o's.  version and change_attr are not considered.
+  bool same_sums(const frag_info_t &o) const {
+    if (nfiles != o.nfiles || nsubdirs != o.nsubdirs)
+      return false;
+    return mtime <= o.mtime;
+  }
+
+  void encode(bufferlist &bl) const;
+  void decode(bufferlist::const_iterator& bl);
+  void dump(Formatter *f) const;
+  static void generate_test_instances(list<frag_info_t*>& ls);
+};
+WRITE_CLASS_ENCODER(frag_info_t)
+
+// Member-wise equality over every field, including the version inherited
+// from scatter_info_t.  Fields are compared explicitly rather than with
+// memcmp() so the result never depends on padding bytes or on the struct
+// staying free of non-trivial members.
+inline bool operator==(const frag_info_t &l, const frag_info_t &r) {
+  return l.version == r.version &&
+    l.mtime == r.mtime &&
+    l.change_attr == r.change_attr &&
+    l.nfiles == r.nfiles &&
+    l.nsubdirs == r.nsubdirs;
+}
+inline bool operator!=(const frag_info_t &l, const frag_info_t &r) {
+  return !(l == r);
+}
+
+std::ostream& operator<<(std::ostream &out, const frag_info_t &f);
+
+
+// Recursive statistics: newest rctime plus recursive byte, file, subdir
+// and snapshot counts.
+struct nest_info_t : public scatter_info_t {
+  // this frag + children
+  utime_t rctime;
+  int64_t rbytes = 0;
+  int64_t rfiles = 0;
+  int64_t rsubdirs = 0;
+  // total number of entries (files + subdirs), recursively
+  int64_t rsize() const { return rfiles + rsubdirs; }
+
+  int64_t rsnaps = 0;
+
+  nest_info_t() {}
+
+  // reset every member (including the inherited version) to its default
+  void zero() {
+    *this = nest_info_t();
+  }
+
+  // subtract other's counters; rctime is still only moved forward
+  void sub(const nest_info_t &other) {
+    add(other, -1);
+  }
+  // Add fac * other's counters to ours; rctime takes the later of the two
+  // regardless of fac.
+  void add(const nest_info_t &other, int fac=1) {
+    rbytes += fac*other.rbytes;
+    rfiles += fac*other.rfiles;
+    rsubdirs += fac*other.rsubdirs;
+    rsnaps += fac*other.rsnaps;
+    if (other.rctime > rctime)
+      rctime = other.rctime;
+  }
+
+  // *this += cur - acc;
+  void add_delta(const nest_info_t &cur, const nest_info_t &acc) {
+    rbytes += cur.rbytes - acc.rbytes;
+    rfiles += cur.rfiles - acc.rfiles;
+    rsubdirs += cur.rsubdirs - acc.rsubdirs;
+    rsnaps += cur.rsnaps - acc.rsnaps;
+    if (cur.rctime > rctime)
+      rctime = cur.rctime;
+  }
+
+  // True when all four counters match o exactly and our rctime is not
+  // newer than o's.  version is not considered.
+  bool same_sums(const nest_info_t &o) const {
+    const bool counters_match =
+      rbytes == o.rbytes &&
+      rfiles == o.rfiles &&
+      rsubdirs == o.rsubdirs &&
+      rsnaps == o.rsnaps;
+    if (!counters_match)
+      return false;
+    return rctime <= o.rctime;
+  }
+
+  void encode(bufferlist &bl) const;
+  void decode(bufferlist::const_iterator& bl);
+  void dump(Formatter *f) const;
+  static void generate_test_instances(list<nest_info_t*>& ls);
+};
+WRITE_CLASS_ENCODER(nest_info_t)
+
+// Member-wise equality over every field, including the version inherited
+// from scatter_info_t.  Fields are compared explicitly rather than with
+// memcmp() so the result never depends on padding bytes or on the struct
+// staying free of non-trivial members.
+inline bool operator==(const nest_info_t &l, const nest_info_t &r) {
+  return l.version == r.version &&
+    l.rctime == r.rctime &&
+    l.rbytes == r.rbytes &&
+    l.rfiles == r.rfiles &&
+    l.rsubdirs == r.rsubdirs &&
+    l.rsnaps == r.rsnaps;
+}
+inline bool operator!=(const nest_info_t &l, const nest_info_t &r) {
+  return !(l == r);
+}
+
+std::ostream& operator<<(std::ostream &out, const nest_info_t &n);
+
+
+// An (inode number, snapshot id) pair.
+struct vinodeno_t {
+ inodeno_t ino;
+ snapid_t snapid;
+ vinodeno_t() {}
+ vinodeno_t(inodeno_t i, snapid_t s) : ino(i), snapid(s) {}
+
+ // Serialized as ino followed by snapid, with no version framing.
+ void encode(bufferlist& bl) const {
+ using ceph::encode;
+ encode(ino, bl);
+ encode(snapid, bl);
+ }
+ // Reads the fields back in the same order encode() wrote them.
+ void decode(bufferlist::const_iterator& p) {
+ using ceph::decode;
+ decode(ino, p);
+ decode(snapid, p);
+ }
+};
+WRITE_CLASS_ENCODER(vinodeno_t)
+
+// Two vinodeno_t are equal when both ino and snapid match.
+inline bool operator==(const vinodeno_t &l, const vinodeno_t &r) {
+  if (!(l.ino == r.ino))
+    return false;
+  return l.snapid == r.snapid;
+}
+inline bool operator!=(const vinodeno_t &l, const vinodeno_t &r) {
+  return !(l == r);
+}
+// Lexicographic ordering: ino first, snapid as the tie-breaker.
+inline bool operator<(const vinodeno_t &l, const vinodeno_t &r) {
+  if (l.ino < r.ino)
+    return true;
+  return l.ino == r.ino && l.snapid < r.snapid;
+}
+
+// Byte and file limits; both default to 0.
+struct quota_info_t
+{
+ int64_t max_bytes = 0;
+ int64_t max_files = 0;
+
+ quota_info_t() {}
+
+ // Encoded as max_bytes then max_files inside an
+ // ENCODE_START(1, 1, ...) / ENCODE_FINISH frame.
+ void encode(bufferlist& bl) const {
+ ENCODE_START(1, 1, bl);
+ encode(max_bytes, bl);
+ encode(max_files, bl);
+ ENCODE_FINISH(bl);
+ }
+ // Reads the two fields back in the order encode() wrote them.
+ void decode(bufferlist::const_iterator& p) {
+ DECODE_START_LEGACY_COMPAT_LEN(1, 1, 1, p);
+ decode(max_bytes, p);
+ decode(max_files, p);
+ DECODE_FINISH(p);
+ }
+
+ void dump(Formatter *f) const;
+ static void generate_test_instances(list<quota_info_t *>& ls);
+
+ // Neither limit may be negative.
+ bool is_valid() const {
+ return max_bytes >=0 && max_files >=0;
+ }
+ // True when at least one of the limits is non-zero.
+ bool is_enable() const {
+ return max_bytes || max_files;
+ }
+};
+WRITE_CLASS_ENCODER(quota_info_t)
+
+// Member-wise equality.  Fields are compared explicitly rather than with
+// memcmp() so the result never depends on padding bytes or on the struct
+// staying free of non-trivial members.
+inline bool operator==(const quota_info_t &l, const quota_info_t &r) {
+  return l.max_bytes == r.max_bytes &&
+    l.max_files == r.max_files;
+}
+
+ostream& operator<<(ostream &out, const quota_info_t &n);
+
+namespace std {
+  // Hash of a vinodeno_t: XOR of the hash of its ino and the hash of its
+  // snapid (the latter hashed as a uint64_t).
+  template<> struct hash<vinodeno_t> {
+    size_t operator()(const vinodeno_t &vino) const {
+      const size_t ino_hash = hash<inodeno_t>()(vino.ino);
+      const size_t snap_hash = hash<uint64_t>()(vino.snapid);
+      return ino_hash ^ snap_hash;
+    }
+  };
+} // namespace std
+
+
+
+
+// Print as "<ino>.head" when snapid is CEPH_NOSNAP, "<ino>.<snapid>" for
+// any other non-zero snapid, and just "<ino>" when snapid is zero.
+inline std::ostream& operator<<(std::ostream &out, const vinodeno_t &vino) {
+  out << vino.ino;
+  if (vino.snapid == CEPH_NOSNAP) {
+    out << ".head";
+    return out;
+  }
+  if (vino.snapid)
+    out << '.' << vino.snapid;
+  return out;
+}
+
+
+/*
+ * client_writeable_range_t
+ */
+// A byte interval a client may write to, together with a snapid.
+struct client_writeable_range_t {
+ // Plain [first, last] pair; both bounds default to 0.
+ struct byte_range_t {
+ uint64_t first = 0, last = 0; // interval client can write to
+ byte_range_t() {}
+ };
+
+ byte_range_t range;
+ snapid_t follows = 0; // aka "data+metadata flushed thru"
+
+ client_writeable_range_t() {}
+
+ void encode(bufferlist &bl) const;
+ void decode(bufferlist::const_iterator& bl);
+ void dump(Formatter *f) const;
+ static void generate_test_instances(std::list<client_writeable_range_t*>& ls);
+};
+
+// Decode a bare byte_range_t: first, then last, with no version framing.
+// inode_t::decode() uses this for encodings with struct_v < 3.
+inline void decode(client_writeable_range_t::byte_range_t& range, bufferlist::const_iterator& bl) {
+ decode(range.first, bl);
+ decode(range.last, bl);
+}
+
+WRITE_CLASS_ENCODER(client_writeable_range_t)
+
+std::ostream& operator<<(std::ostream& out, const client_writeable_range_t& r);
+
+// Equal when both range bounds and the follows snapid all match.
+inline bool operator==(const client_writeable_range_t& l,
+                       const client_writeable_range_t& r) {
+  if (l.range.first != r.range.first)
+    return false;
+  if (l.range.last != r.range.last)
+    return false;
+  return l.follows == r.follows;
+}
+
+// Versioned inline data whose buffer is allocated lazily: blp stays null
+// until get_data() is first called.
+struct inline_data_t {
+private:
+  std::unique_ptr<bufferlist> blp;
+public:
+  version_t version = 1;
+
+  // drop the buffer (if any); length() becomes 0
+  void free_data() {
+    blp.reset();
+  }
+  // return the buffer, allocating an empty one on first use
+  bufferlist& get_data() {
+    if (blp == nullptr)
+      blp.reset(new bufferlist);
+    return *blp;
+  }
+  // number of bytes held; 0 when no buffer has been allocated
+  size_t length() const {
+    if (!blp)
+      return 0;
+    return blp->length();
+  }
+
+  inline_data_t() {}
+  // deep copy: the buffer contents are copied, not shared via the pointer
+  inline_data_t(const inline_data_t& o) : version(o.version) {
+    if (o.blp)
+      get_data() = *o.blp;
+  }
+  inline_data_t& operator=(const inline_data_t& o) {
+    version = o.version;
+    if (!o.blp) {
+      free_data();
+      return *this;
+    }
+    get_data() = *o.blp;
+    return *this;
+  }
+  // Compares the data only; version does not take part.  A missing buffer
+  // and an empty buffer compare equal.
+  bool operator==(const inline_data_t& o) const {
+    const size_t my_len = length();
+    if (my_len != o.length())
+      return false;
+    if (my_len == 0)
+      return true;
+    return *blp == *o.blp;
+  }
+  bool operator!=(const inline_data_t& o) const {
+    return !(*this == o);
+  }
+  void encode(bufferlist &bl) const;
+  void decode(bufferlist::const_iterator& bl);
+};
+WRITE_CLASS_ENCODER(inline_data_t)
+
+// Kinds of damage.  The enumerators take the default values 0, 1, 2.
+// NOTE(review): whether these are bit positions or plain values within a
+// damage_flags_t is not visible here -- confirm against the users.
+enum {
+ DAMAGE_STATS, // statistics (dirstat, size, etc)
+ DAMAGE_RSTATS, // recursive statistics (rstat, accounted_rstat)
+ DAMAGE_FRAGTREE // fragtree -- repair by searching
+};
+typedef uint32_t damage_flags_t;
+
+/*
+ * inode_t
+ */
+template<template<typename> class Allocator = std::allocator>
+struct inode_t {
+  /**
+   * ***************
+   * Do not forget to add any new fields to the compare() function.
+   * ***************
+   */
+  // base (immutable)
+  inodeno_t ino = 0;
+  uint32_t rdev = 0;    // if special file
+
+  // affected by any inode change...
+  utime_t ctime;   // inode change time
+  utime_t btime;   // birth time
+
+  // perm (namespace permissions)
+  uint32_t mode = 0;
+  uid_t uid = 0;
+  gid_t gid = 0;
+
+  // nlink
+  int32_t nlink = 0;
+
+  // file (data access)
+  ceph_dir_layout dir_layout;    // [dir only]
+  file_layout_t layout;
+  compact_set<int64_t, std::less<int64_t>, Allocator<int64_t>> old_pools;
+  uint64_t size = 0;        // on directory, # dentries
+  uint64_t max_size_ever = 0; // max size the file has ever been
+  uint32_t truncate_seq = 0;
+  uint64_t truncate_size = 0, truncate_from = 0;
+  uint32_t truncate_pending = 0;
+  utime_t mtime;   // file data modify time.
+  utime_t atime;   // file data access time.
+  uint32_t time_warp_seq = 0;  // count of (potential) mtime/atime timewarps (i.e., utimes())
+  inline_data_t inline_data; // FIXME check
+
+  // change attribute
+  uint64_t change_attr = 0;
+
+  using client_range_map = std::map<client_t,client_writeable_range_t,std::less<client_t>,Allocator<std::pair<const client_t,client_writeable_range_t>>>;
+  client_range_map client_ranges;  // client(s) can write to these ranges
+
+  // dirfrag, recursive accounting
+  frag_info_t dirstat;         // protected by my filelock
+  nest_info_t rstat;           // protected by my nestlock
+  nest_info_t accounted_rstat; // protected by parent's nestlock
+
+  quota_info_t quota;
+
+  mds_rank_t export_pin = MDS_RANK_NONE;
+
+  // special stuff
+  version_t version = 0;           // auth only
+  version_t file_data_version = 0; // auth only
+  version_t xattr_version = 0;
+
+  utime_t last_scrub_stamp;    // start time of last complete scrub
+  version_t last_scrub_version = 0;// (parent) start version of last complete scrub
+
+  version_t backtrace_version = 0;
+
+  snapid_t oldest_snap;
+
+  std::basic_string<char,std::char_traits<char>,Allocator<char>> stray_prior_path; //stores path before unlink
+
+  inode_t()
+  {
+    clear_layout();
+    // FIPS zeroization audit 20191117: this memset is not security related.
+    memset(&dir_layout, 0, sizeof(dir_layout));
+  }
+
+  // file type
+  bool is_symlink() const { return (mode & S_IFMT) == S_IFLNK; }
+  bool is_dir()     const { return (mode & S_IFMT) == S_IFDIR; }
+  bool is_file()    const { return (mode & S_IFMT) == S_IFREG; }
+
+  bool is_truncating() const { return (truncate_pending > 0); }
+  // Shrink the file from old_size to new_size (new_size must be strictly
+  // smaller).  Records the old size in truncate_from / max_size_ever and
+  // bumps both truncate_seq and truncate_pending.
+  void truncate(uint64_t old_size, uint64_t new_size) {
+    ceph_assert(new_size < old_size);
+    if (old_size > max_size_ever)
+      max_size_ever = old_size;
+    truncate_from = old_size;
+    size = new_size;
+    rstat.rbytes = new_size;
+    truncate_size = size;
+    truncate_seq++;
+    truncate_pending++;
+  }
+
+  // true when layout differs from a default-constructed file_layout_t
+  bool has_layout() const {
+    return layout != file_layout_t();
+  }
+
+  void clear_layout() {
+    layout = file_layout_t();
+  }
+
+  uint64_t get_layout_size_increment() const {
+    return layout.get_period();
+  }
+
+  bool is_dirty_rstat() const { return !(rstat == accounted_rstat); }
+
+  // Largest range.last over all client ranges; 0 when there are none.
+  uint64_t get_max_size() const {
+    uint64_t max = 0;
+    // iterate client_range_map itself: its iterator type need not match
+    // that of a std::map using the default allocator
+    for (const auto &p : client_ranges) {
+      if (p.second.range.last > max)
+        max = p.second.range.last;
+    }
+    return max;
+  }
+  // Set range.last of every client range to new_max; a new_max of 0
+  // removes all client ranges instead.
+  void set_max_size(uint64_t new_max) {
+    if (new_max == 0) {
+      client_ranges.clear();
+    } else {
+      for (auto &p : client_ranges)
+        p.second.range.last = new_max;
+    }
+  }
+
+  // Remove every client range whose follows is >= last.
+  void trim_client_ranges(snapid_t last) {
+    auto p = client_ranges.begin();
+    while (p != client_ranges.end()) {
+      if (p->second.follows >= last)
+        p = client_ranges.erase(p);
+      else
+        ++p;
+    }
+  }
+
+  bool is_backtrace_updated() const {
+    return backtrace_version == version;
+  }
+  // Record pv as the backtrace version, or the current version when pv
+  // is 0.
+  void update_backtrace(version_t pv=0) {
+    backtrace_version = pv ? pv : version;
+  }
+
+  // Remember pool l in old_pools and set backtrace_version to the
+  // current version.
+  void add_old_pool(int64_t l) {
+    backtrace_version = version;
+    old_pools.insert(l);
+  }
+
+  void encode(bufferlist &bl, uint64_t features) const;
+  void decode(bufferlist::const_iterator& bl);
+  void dump(Formatter *f) const;
+  static void generate_test_instances(std::list<inode_t*>& ls);
+  /**
+   * Compare this inode_t with another that represent *the same inode*
+   * at different points in time.
+   * @pre The inodes are the same ino
+   *
+   * @param other The inode_t to compare ourselves with
+   * @param divergent A bool pointer which will be set to true
+   * if the values are different in a way that can't be explained
+   * by one being a newer version than the other.
+   *
+   * @returns 1 if we are newer than the other, 0 if equal, -1 if older.
+   */
+  int compare(const inode_t &other, bool *divergent) const;
+private:
+  bool older_is_consistent(const inode_t &other) const;
+};
+
+// These methods may be moved back to mdstypes.cc when we have pmr
+// Serialize the inode into bl.  The sequence of encode() calls below is
+// the on-wire field order and must stay in step with decode().
+// `features` is forwarded only to the layout encoding.
+template<template<typename> class Allocator>
+void inode_t<Allocator>::encode(bufferlist &bl, uint64_t features) const
+{
+ ENCODE_START(15, 6, bl);
+
+ encode(ino, bl);
+ encode(rdev, bl);
+ encode(ctime, bl);
+
+ encode(mode, bl);
+ encode(uid, bl);
+ encode(gid, bl);
+
+ encode(nlink, bl);
+ {
+ // removed field
+ // a false placeholder is still written so the layout stays unchanged
+ bool anchored = 0;
+ encode(anchored, bl);
+ }
+
+ encode(dir_layout, bl);
+ encode(layout, bl, features);
+ encode(size, bl);
+ encode(truncate_seq, bl);
+ encode(truncate_size, bl);
+ encode(truncate_from, bl);
+ encode(truncate_pending, bl);
+ encode(mtime, bl);
+ encode(atime, bl);
+ encode(time_warp_seq, bl);
+ encode(client_ranges, bl);
+
+ encode(dirstat, bl);
+ encode(rstat, bl);
+ encode(accounted_rstat, bl);
+
+ encode(version, bl);
+ encode(file_data_version, bl);
+ encode(xattr_version, bl);
+ encode(backtrace_version, bl);
+ encode(old_pools, bl);
+ encode(max_size_ever, bl);
+ encode(inline_data, bl);
+ encode(quota, bl);
+
+ encode(stray_prior_path, bl);
+
+ encode(last_scrub_version, bl);
+ encode(last_scrub_stamp, bl);
+
+ encode(btime, bl);
+ encode(change_attr, bl);
+
+ encode(export_pin, bl);
+
+ ENCODE_FINISH(bl);
+}
+
+// Deserialize an inode from p.  Fields are read in the order encode()
+// writes them; fields that were added in later struct versions are only
+// read when struct_v is high enough and otherwise get the fallback value
+// shown in the corresponding else branch (or keep their current value
+// where there is no else branch).
+template<template<typename> class Allocator>
+void inode_t<Allocator>::decode(bufferlist::const_iterator &p)
+{
+ DECODE_START_LEGACY_COMPAT_LEN(15, 6, 6, p);
+
+ decode(ino, p);
+ decode(rdev, p);
+ decode(ctime, p);
+
+ decode(mode, p);
+ decode(uid, p);
+ decode(gid, p);
+
+ decode(nlink, p);
+ {
+ // placeholder for a removed field: read and discarded
+ bool anchored;
+ decode(anchored, p);
+ }
+
+ if (struct_v >= 4)
+ decode(dir_layout, p);
+ else {
+ // FIPS zeroization audit 20191117: this memset is not security related.
+ memset(&dir_layout, 0, sizeof(dir_layout));
+ }
+ decode(layout, p);
+ decode(size, p);
+ decode(truncate_seq, p);
+ decode(truncate_size, p);
+ decode(truncate_from, p);
+ if (struct_v >= 5)
+ decode(truncate_pending, p);
+ else
+ truncate_pending = 0;
+ decode(mtime, p);
+ decode(atime, p);
+ decode(time_warp_seq, p);
+ if (struct_v >= 3) {
+ decode(client_ranges, p);
+ } else {
+ // older encodings stored bare byte ranges; copy them into the range
+ // member of each client_ranges entry
+ map<client_t, client_writeable_range_t::byte_range_t> m;
+ decode(m, p);
+ for (map<client_t, client_writeable_range_t::byte_range_t>::iterator
+ q = m.begin(); q != m.end(); ++q)
+ client_ranges[q->first].range = q->second;
+ }
+
+ decode(dirstat, p);
+ decode(rstat, p);
+ decode(accounted_rstat, p);
+
+ decode(version, p);
+ decode(file_data_version, p);
+ decode(xattr_version, p);
+ if (struct_v >= 2)
+ decode(backtrace_version, p);
+ if (struct_v >= 7)
+ decode(old_pools, p);
+ if (struct_v >= 8)
+ decode(max_size_ever, p);
+ if (struct_v >= 9) {
+ decode(inline_data, p);
+ } else {
+ inline_data.version = CEPH_INLINE_NONE;
+ }
+ // any backtrace_version decoded above is discarded for struct_v < 10
+ if (struct_v < 10)
+ backtrace_version = 0; // force update backtrace
+ if (struct_v >= 11)
+ decode(quota, p);
+
+ if (struct_v >= 12) {
+ // decoded through a plain std::string and then assigned, since
+ // stray_prior_path uses the Allocator-specific string type
+ std::string tmp;
+ decode(tmp, p);
+ stray_prior_path = std::string_view(tmp);
+ }
+
+ if (struct_v >= 13) {
+ decode(last_scrub_version, p);
+ decode(last_scrub_stamp, p);
+ }
+ if (struct_v >= 14) {
+ decode(btime, p);
+ decode(change_attr, p);
+ } else {
+ btime = utime_t();
+ change_attr = 0;
+ }
+
+ if (struct_v >= 15) {
+ decode(export_pin, p);
+ } else {
+ export_pin = MDS_RANK_NONE;
+ }
+
+ DECODE_FINISH(p);
+}
+
+// Write the inode's fields to the formatter.
+// NOTE(review): max_size_ever, inline_data, quota, last_scrub_stamp,
+// last_scrub_version and oldest_snap are not emitted here.
+template<template<typename> class Allocator>
+void inode_t<Allocator>::dump(Formatter *f) const
+{
+ f->dump_unsigned("ino", ino);
+ f->dump_unsigned("rdev", rdev);
+ f->dump_stream("ctime") << ctime;
+ f->dump_stream("btime") << btime;
+ f->dump_unsigned("mode", mode);
+ f->dump_unsigned("uid", uid);
+ f->dump_unsigned("gid", gid);
+ f->dump_unsigned("nlink", nlink);
+
+ f->open_object_section("dir_layout");
+ ::dump(dir_layout, f);
+ f->close_section();
+
+ f->dump_object("layout", layout);
+
+ // one "pool" entry per element of old_pools
+ f->open_array_section("old_pools");
+ for (const auto &p : old_pools) {
+ f->dump_int("pool", p);
+ }
+ f->close_section();
+
+ f->dump_unsigned("size", size);
+ f->dump_unsigned("truncate_seq", truncate_seq);
+ f->dump_unsigned("truncate_size", truncate_size);
+ f->dump_unsigned("truncate_from", truncate_from);
+ f->dump_unsigned("truncate_pending", truncate_pending);
+ f->dump_stream("mtime") << mtime;
+ f->dump_stream("atime") << atime;
+ f->dump_unsigned("time_warp_seq", time_warp_seq);
+ f->dump_unsigned("change_attr", change_attr);
+ f->dump_int("export_pin", export_pin);
+
+ // one "client" object per entry: the client id followed by whatever
+ // client_writeable_range_t::dump() emits
+ f->open_array_section("client_ranges");
+ for (const auto &p : client_ranges) {
+ f->open_object_section("client");
+ f->dump_unsigned("client", p.first.v);
+ p.second.dump(f);
+ f->close_section();
+ }
+ f->close_section();
+
+ f->open_object_section("dirstat");
+ dirstat.dump(f);
+ f->close_section();
+
+ f->open_object_section("rstat");
+ rstat.dump(f);
+ f->close_section();
+
+ f->open_object_section("accounted_rstat");
+ accounted_rstat.dump(f);
+ f->close_section();
+
+ f->dump_unsigned("version", version);
+ f->dump_unsigned("file_data_version", file_data_version);
+ f->dump_unsigned("xattr_version", xattr_version);
+ f->dump_unsigned("backtrace_version", backtrace_version);
+
+ f->dump_string("stray_prior_path", stray_prior_path);
+}
+
+// Append two heap-allocated instances to ls: a default-constructed one
+// and one whose ino is 1.
+template<template<typename> class Allocator>
+void inode_t<Allocator>::generate_test_instances(list<inode_t*>& ls)
+{
+  ls.push_back(new inode_t<Allocator>);
+
+  ls.push_back(new inode_t<Allocator>);
+  inode_t<Allocator> *second = ls.back();
+  second->ino = 1;
+  // i am lazy.
+}
+
+// Order two snapshots of the same inode by their version field.
+// Returns 0 / 1 / -1 for equal / newer / older.  *divergent is always
+// written: for equal versions it is true when any of the fields listed
+// below differ; for unequal versions it is true when the older side
+// fails older_is_consistent() against the newer one.
+// NOTE(review): quota, export_pin, last_scrub_stamp, last_scrub_version,
+// oldest_snap and stray_prior_path are not part of the equal-version
+// comparison -- confirm whether that is intentional.
+template<template<typename> class Allocator>
+int inode_t<Allocator>::compare(const inode_t<Allocator> &other, bool *divergent) const
+{
+ ceph_assert(ino == other.ino);
+ *divergent = false;
+ if (version == other.version) {
+ if (rdev != other.rdev ||
+ ctime != other.ctime ||
+ btime != other.btime ||
+ mode != other.mode ||
+ uid != other.uid ||
+ gid != other.gid ||
+ nlink != other.nlink ||
+ memcmp(&dir_layout, &other.dir_layout, sizeof(dir_layout)) ||
+ layout != other.layout ||
+ old_pools != other.old_pools ||
+ size != other.size ||
+ max_size_ever != other.max_size_ever ||
+ truncate_seq != other.truncate_seq ||
+ truncate_size != other.truncate_size ||
+ truncate_from != other.truncate_from ||
+ truncate_pending != other.truncate_pending ||
+ change_attr != other.change_attr ||
+ mtime != other.mtime ||
+ atime != other.atime ||
+ time_warp_seq != other.time_warp_seq ||
+ inline_data != other.inline_data ||
+ client_ranges != other.client_ranges ||
+ !(dirstat == other.dirstat) ||
+ !(rstat == other.rstat) ||
+ !(accounted_rstat == other.accounted_rstat) ||
+ file_data_version != other.file_data_version ||
+ xattr_version != other.xattr_version ||
+ backtrace_version != other.backtrace_version) {
+ *divergent = true;
+ }
+ return 0;
+ } else if (version > other.version) {
+ *divergent = !older_is_consistent(other);
+ return 1;
+ } else {
+ ceph_assert(version < other.version);
+ // roles swapped: other is the newer one, *this the older
+ *divergent = !other.older_is_consistent(*this);
+ return -1;
+ }
+}
+
+// Called on the newer inode with the older one as `other`.  Returns true
+// only if none of the monotonic counters below is smaller in *this than
+// in `other`.
+template<template<typename> class Allocator>
+bool inode_t<Allocator>::older_is_consistent(const inode_t<Allocator> &other) const
+{
+  return max_size_ever >= other.max_size_ever &&
+    truncate_seq >= other.truncate_seq &&
+    time_warp_seq >= other.time_warp_seq &&
+    inline_data.version >= other.inline_data.version &&
+    dirstat.version >= other.dirstat.version &&
+    rstat.version >= other.rstat.version &&
+    accounted_rstat.version >= other.accounted_rstat.version &&
+    file_data_version >= other.file_data_version &&
+    xattr_version >= other.xattr_version &&
+    backtrace_version >= other.backtrace_version;
+}
+
+// Free-function encoder for inode_t: forwards to the member encode(),
+// bracketed by the ENCODE_DUMP_PRE / ENCODE_DUMP_POST macros.
+template<template<typename> class Allocator>
+inline void encode(const inode_t<Allocator> &c, ::ceph::bufferlist &bl, uint64_t features)
+{
+ ENCODE_DUMP_PRE();
+ c.encode(bl, features);
+ ENCODE_DUMP_POST(cl);
+}
+// Free-function decoder for inode_t: forwards to the member decode().
+template<template<typename> class Allocator>
+inline void decode(inode_t<Allocator> &c, ::ceph::bufferlist::const_iterator &p)
+{
+ c.decode(p);
+}
+
+// std::basic_string<char> using Allocator<char> for its storage.
+template<template<typename> class Allocator>
+using alloc_string = std::basic_string<char,std::char_traits<char>,Allocator<char>>;
+
+// compact_map from alloc_string keys to bufferptr values, with the map's
+// own element storage taken from Allocator as well.
+template<template<typename> class Allocator>
+using xattr_map = compact_map<alloc_string<Allocator>, bufferptr, std::less<alloc_string<Allocator>>, Allocator<std::pair<const alloc_string<Allocator>, bufferptr>>>; // FIXME bufferptr not in mempool
+
+// Decode an xattr map from p: a 32-bit entry count, then for each entry
+// a key followed by a 32-bit length and that many bytes, which are
+// deep-copied (copy_deep) into xattrs[key].
+template<template<typename> class Allocator>
+inline void decode_noshare(xattr_map<Allocator>& xattrs, ceph::buffer::list::const_iterator &p)
+{
+  __u32 count;
+  decode(count, p);
+  for (__u32 i = 0; i < count; ++i) {
+    alloc_string<Allocator> name;
+    decode(name, p);
+    __u32 value_len;
+    decode(value_len, p);
+    p.copy_deep(value_len, xattrs[name]);
+  }
+}
+
+/*
+ * old_inode_t
+ */
+// An inode_t together with its xattrs and a `first` snapid.
+template<template<typename> class Allocator = std::allocator>
+struct old_inode_t {
+ snapid_t first;
+ inode_t<Allocator> inode;
+ xattr_map<Allocator> xattrs;
+
+ void encode(bufferlist &bl, uint64_t features) const;
+ void decode(bufferlist::const_iterator& bl);
+ void dump(Formatter *f) const;
+ static void generate_test_instances(std::list<old_inode_t*>& ls);
+};
+
+// These methods may be moved back to mdstypes.cc when we have pmr
+// Encoded as first, inode, xattrs inside an ENCODE_START(2, 2, ...) /
+// ENCODE_FINISH frame.  `features` is forwarded to the inode encoding only.
+template<template<typename> class Allocator>
+void old_inode_t<Allocator>::encode(bufferlist& bl, uint64_t features) const
+{
+ ENCODE_START(2, 2, bl);
+ encode(first, bl);
+ encode(inode, bl, features);
+ encode(xattrs, bl);
+ ENCODE_FINISH(bl);
+}
+
+// Reads first, inode and xattrs in the order encode() wrote them; the
+// xattrs go through decode_noshare(), which deep-copies each value.
+template<template<typename> class Allocator>
+void old_inode_t<Allocator>::decode(bufferlist::const_iterator& bl)
+{
+ DECODE_START_LEGACY_COMPAT_LEN(2, 2, 2, bl);
+ decode(first, bl);
+ decode(inode, bl);
+ decode_noshare<Allocator>(xattrs, bl);
+ DECODE_FINISH(bl);
+}
+
+// Write `first`, then the inode's own dump, then an "xattrs" object with
+// one string entry per xattr (key -> raw value bytes).
+template<template<typename> class Allocator>
+void old_inode_t<Allocator>::dump(Formatter *f) const
+{
+  f->dump_unsigned("first", first);
+  inode.dump(f);
+
+  f->open_object_section("xattrs");
+  for (const auto &kv : xattrs) {
+    const auto &value = kv.second;
+    // the value is copied with an explicit length, so embedded NUL bytes
+    // are preserved
+    std::string text(value.c_str(), value.length());
+    f->dump_string(kv.first.c_str(), text);
+  }
+  f->close_section();
+}
+
+// Append two heap-allocated instances to ls: a default-constructed one
+// and one with first == 2, a copy of the last inode_t test instance, and
+// two sample xattrs.
+template<template<typename> class Allocator>
+void old_inode_t<Allocator>::generate_test_instances(std::list<old_inode_t<Allocator>*>& ls)
+{
+  ls.push_back(new old_inode_t<Allocator>);
+  ls.push_back(new old_inode_t<Allocator>);
+  ls.back()->first = 2;
+  std::list<inode_t<Allocator>*> ils;
+  inode_t<Allocator>::generate_test_instances(ils);
+  ls.back()->inode = *ils.back();
+  // the inode_t instances were only needed as a copy source; they are
+  // owned by this function, so free them instead of leaking them
+  for (auto *tmp : ils)
+    delete tmp;
+  ls.back()->xattrs["user.foo"] = buffer::copy("asdf", 4);
+  ls.back()->xattrs["user.unprintable"] = buffer::copy("\000\001\002", 3);
+}
+
+// Free-function encoder for old_inode_t: forwards to the member encode(),
+// bracketed by the ENCODE_DUMP_PRE / ENCODE_DUMP_POST macros.
+template<template<typename> class Allocator>
+inline void encode(const old_inode_t<Allocator> &c, ::ceph::bufferlist &bl, uint64_t features)
+{
+ ENCODE_DUMP_PRE();
+ c.encode(bl, features);
+ ENCODE_DUMP_POST(cl);
+}
+// Free-function decoder for old_inode_t: forwards to the member decode().
+template<template<typename> class Allocator>
+inline void decode(old_inode_t<Allocator> &c, ::ceph::bufferlist::const_iterator &p)
+{
+ c.decode(p);
+}
+
+
+/*
+ * like an inode, but for a dir frag
+ */
+// Per-dirfrag metadata: version, current and accounted frag/nest stats,
+// damage flags and scrub bookkeeping.  All numeric members default to 0.
+struct fnode_t {
+ version_t version = 0;
+ snapid_t snap_purged_thru; // the max_last_destroy snapid we've been purged thru
+ frag_info_t fragstat, accounted_fragstat;
+ nest_info_t rstat, accounted_rstat;
+ damage_flags_t damage_flags = 0;
+
+ // we know we and all our descendants have been scrubbed since this version
+ version_t recursive_scrub_version = 0;
+ utime_t recursive_scrub_stamp;
+ // version at which we last scrubbed our personal data structures
+ version_t localized_scrub_version = 0;
+ utime_t localized_scrub_stamp;
+
+ void encode(bufferlist &bl) const;
+ void decode(bufferlist::const_iterator& bl);
+ void dump(Formatter *f) const;
+ static void generate_test_instances(list<fnode_t*>& ls);
+ fnode_t() {}
+};
+WRITE_CLASS_ENCODER(fnode_t)
+
+
+// A pair of nest_info_t (rstat and accounted_rstat) tagged with a
+// `first` snapid.
+struct old_rstat_t {
+ snapid_t first;
+ nest_info_t rstat, accounted_rstat;
+
+ void encode(bufferlist& bl) const;
+ void decode(bufferlist::const_iterator& p);
+ void dump(Formatter *f) const;
+ static void generate_test_instances(list<old_rstat_t*>& ls);
+};
+WRITE_CLASS_ENCODER(old_rstat_t)
+
+// Print as "old_rstat(first <first> <rstat> <accounted_rstat>)".
+inline std::ostream& operator<<(std::ostream& out, const old_rstat_t& o) {
+  out << "old_rstat(first " << o.first;
+  out << " " << o.rstat;
+  out << " " << o.accounted_rstat;
+  out << ")";
+  return out;
+}
+
+/*
+ * feature_bitset_t
+ */
+// A growable bit set stored as a vector of 64-bit blocks.
+class feature_bitset_t {
+public:
+  typedef uint64_t block_type;
+  static const size_t bits_per_block = sizeof(block_type) * 8;
+
+  feature_bitset_t(const feature_bitset_t& other) : _vec(other._vec) {}
+  feature_bitset_t(feature_bitset_t&& other) : _vec(std::move(other._vec)) {}
+  feature_bitset_t(unsigned long value = 0);
+  feature_bitset_t(const vector<size_t>& array);
+  feature_bitset_t& operator=(const feature_bitset_t& other) {
+    _vec = other._vec;
+    return *this;
+  }
+  feature_bitset_t& operator=(feature_bitset_t&& other) {
+    _vec = std::move(other._vec);
+    return *this;
+  }
+  // true when no bit is set, whether or not any blocks are allocated
+  bool empty() const {
+    for (size_t i = 0; i < _vec.size(); ++i) {
+      if (_vec[i] != 0)
+        return false;
+    }
+    return true;
+  }
+  // true when `bit` is set; bits beyond the stored blocks read as unset
+  bool test(size_t bit) const {
+    const size_t block = bit / bits_per_block;
+    if (block >= _vec.size())
+      return false;
+    const block_type mask = (block_type)1 << (bit % bits_per_block);
+    return _vec[block] & mask;
+  }
+  // drop all blocks
+  void clear() {
+    _vec.clear();
+  }
+  feature_bitset_t& operator-=(const feature_bitset_t& other);
+  void encode(bufferlist& bl) const;
+  void decode(bufferlist::const_iterator &p);
+  void print(ostream& out) const;
+private:
+  vector<block_type> _vec;
+};
+WRITE_CLASS_ENCODER(feature_bitset_t)
+
+// Stream a feature_bitset_t by delegating to its print() member.
+inline std::ostream& operator<<(std::ostream& out, const feature_bitset_t& s) {
+ s.print(out);
+ return out;
+}
+
+/*
+ * client_metadata_t
+ */
+// Client-supplied metadata: a string key/value map plus a feature bitset.
+struct client_metadata_t {
+  using kv_map_t = std::map<std::string,std::string>;
+  using iterator = kv_map_t::const_iterator;
+
+  kv_map_t kv_map;
+  feature_bitset_t features;
+
+  client_metadata_t() {}
+  client_metadata_t(const client_metadata_t& other) :
+    kv_map(other.kv_map), features(other.features) {}
+  client_metadata_t(client_metadata_t&& other) :
+    kv_map(std::move(other.kv_map)), features(std::move(other.features)) {}
+  client_metadata_t(kv_map_t&& kv, feature_bitset_t &&f) :
+    kv_map(std::move(kv)), features(std::move(f)) {}
+  client_metadata_t(const kv_map_t& kv, const feature_bitset_t &f) :
+    kv_map(kv), features(f) {}
+  client_metadata_t& operator=(const client_metadata_t& other) {
+    kv_map = other.kv_map;
+    features = other.features;
+    return *this;
+  }
+  // Move assignment.  Because a copy assignment and a move constructor
+  // are declared, this is not generated implicitly; without it,
+  // assigning from an rvalue would fall back to a full copy.
+  client_metadata_t& operator=(client_metadata_t&& other) {
+    if (this != &other) {
+      kv_map = std::move(other.kv_map);
+      features = std::move(other.features);
+    }
+    return *this;
+  }
+
+  bool empty() const { return kv_map.empty() && features.empty(); }
+  iterator find(const std::string& key) const { return kv_map.find(key); }
+  iterator begin() const { return kv_map.begin(); }
+  iterator end() const { return kv_map.end(); }
+  void erase(iterator it) { kv_map.erase(it); }
+  // inserts an empty value when key is absent (std::map::operator[])
+  std::string& operator[](const std::string& key) { return kv_map[key]; }
+  // Add other's key/value pairs without overwriting keys we already have
+  // (std::map::insert keeps existing entries); features are replaced by
+  // other's.
+  void merge(const client_metadata_t& other) {
+    kv_map.insert(other.kv_map.begin(), other.kv_map.end());
+    features = other.features;
+  }
+  void clear() {
+    kv_map.clear();
+    features.clear();
+  }
+
+  void encode(bufferlist& bl) const;
+  void decode(bufferlist::const_iterator& p);
+  void dump(Formatter *f) const;
+};
+WRITE_CLASS_ENCODER(client_metadata_t)
+
+/*
+ * session_info_t
+ */
+// Per-session state: the peer's entity instance, completed request and
+// flush ids, inode number intervals, client metadata and the auth name.
+struct session_info_t {
+ entity_inst_t inst;
+ std::map<ceph_tid_t,inodeno_t> completed_requests;
+ interval_set<inodeno_t> prealloc_inos; // preallocated, ready to use.
+ interval_set<inodeno_t> used_inos; // journaling use
+ client_metadata_t client_metadata;
+ std::set<ceph_tid_t> completed_flushes;
+ EntityName auth_name;
+
+ // client id taken from the numeric part of the entity name
+ client_t get_client() const { return client_t(inst.name.num()); }
+ bool has_feature(size_t bit) const { return client_metadata.features.test(bit); }
+ const entity_name_t& get_source() const { return inst.name; }
+
+ // Clear everything except inst and auth_name.
+ void clear_meta() {
+ prealloc_inos.clear();
+ used_inos.clear();
+ completed_requests.clear();
+ completed_flushes.clear();
+ client_metadata.clear();
+ }
+
+ void encode(bufferlist& bl, uint64_t features) const;
+ void decode(bufferlist::const_iterator& p);
+ void dump(Formatter *f) const;
+ static void generate_test_instances(list<session_info_t*>& ls);
+};
+WRITE_CLASS_ENCODER_FEATURES(session_info_t)
+
+
+// =======
+// dentries
+
+struct dentry_key_t {
+  snapid_t snapid = 0;
+  std::string_view name;  // non-owning view: the characters must outlive this key
+  __u32 hash = 0;
+  dentry_key_t() {}
+  dentry_key_t(snapid_t s, std::string_view n, __u32 h=0) :
+    snapid(s), name(n), hash(h) {}
+
+  // A key is considered valid once it has a name or a non-zero snapid.
+  // const-qualified so it can also be queried through const references.
+  bool is_valid() const { return name.length() || snapid; }
+
+  // encode into something that can be decoded as a string.
+  // name_ (head) or name_%x (!head)
+  void encode(bufferlist& bl) const {
+    string key;
+    encode(key);          // member overload: builds the textual key
+    using ceph::encode;   // from here on encode() means the generic encoder
+    encode(key, bl);
+  }
+  // Build the textual key "<name>_head" (snapid == CEPH_NOSNAP) or
+  // "<name>_<snapid in hex>" into key, replacing its previous contents.
+  void encode(string& key) const {
+    char b[20];  // 16 hex digits of a 64-bit value plus the terminator fit
+    if (snapid != CEPH_NOSNAP) {
+      uint64_t val(snapid);
+      snprintf(b, sizeof(b), "%" PRIx64, val);
+    } else {
+      snprintf(b, sizeof(b), "%s", "head");
+    }
+    // Plain appends yield the same bytes as streaming through an
+    // ostringstream, without constructing a stream for every key.
+    key = name;
+    key += '_';
+    key += b;
+  }
+  // Read an encoded string from bl and split it into name and snapid.
+  static void decode_helper(bufferlist::const_iterator& bl, string& nm, snapid_t& sn) {
+    string key;
+    decode(key, bl);
+    decode_helper(key, nm, sn);
+  }
+  // Split "<name>_<suffix>" at the last underscore.  The suffix "head" maps
+  // to CEPH_NOSNAP; anything else is parsed as a hexadecimal snapid.
+  static void decode_helper(std::string_view key, string& nm, snapid_t& sn) {
+    size_t i = key.find_last_of('_');
+    ceph_assert(i != string::npos);
+    if (key.compare(i+1, std::string_view::npos, "head") == 0) {
+      // name_head
+      sn = CEPH_NOSNAP;
+    } else {
+      // name_%x
+      long long unsigned x = 0;
+      // sscanf needs a NUL-terminated buffer, which a string_view does not
+      // guarantee, hence the copy.  If nothing parses, x stays 0.
+      std::string x_str(key.substr(i+1));
+      sscanf(x_str.c_str(), "%llx", &x);
+      sn = x;
+    }
+    nm = key.substr(0, i);
+  }
+};
+
+// Print a dentry key as "(<name>,<snapid>)".
+inline std::ostream& operator<<(std::ostream& out, const dentry_key_t &k)
+{
+  out << "(" << k.name;
+  out << "," << k.snapid;
+  out << ")";
+  return out;
+}
+
+// Strict weak ordering for dentry keys: frag value of the hash first, then
+// the name, then the snapid.
+inline bool operator<(const dentry_key_t& k1, const dentry_key_t& k2)
+{
+  const int hash_delta = ceph_frag_value(k1.hash) - ceph_frag_value(k2.hash);
+  if (hash_delta != 0)
+    return hash_delta < 0;
+
+  const int name_delta = k1.name.compare(k2.name);
+  if (name_delta != 0)
+    return name_delta < 0;
+
+  // identical hash value and name: the snapid breaks the tie
+  return k1.snapid < k2.snapid;
+}
+
+
+/*
+ * string_snap_t is a simple (string, snapid_t) pair
+ */
+struct string_snap_t {
+ string name; // owned copy of the name (unlike dentry_key_t, which only holds a view)
+ snapid_t snapid;
+ string_snap_t() {}
+ string_snap_t(std::string_view n, snapid_t s) : name(n), snapid(s) {} // copies n into name
+
+ void encode(bufferlist& bl) const;
+ void decode(bufferlist::const_iterator& p);
+ void dump(Formatter *f) const;
+ static void generate_test_instances(list<string_snap_t*>& ls);
+};
+WRITE_CLASS_ENCODER(string_snap_t)
+
+// Order by name first; equal names are ordered by snapid.
+inline bool operator<(const string_snap_t& l, const string_snap_t& r) {
+  const int name_order = l.name.compare(r.name);
+  if (name_order != 0)
+    return name_order < 0;
+  return l.snapid < r.snapid;
+}
+
+// Print a string_snap_t as "(<name>,<snapid>)".
+inline std::ostream& operator<<(std::ostream& out, const string_snap_t &k)
+{
+  out << "(" << k.name;
+  out << "," << k.snapid;
+  out << ")";
+  return out;
+}
+
+/*
+ * mds_table_pending_t
+ *
+ * mds's requesting any pending ops. child needs to encode the corresponding
+ * pending mutation state in the table.
+ */
+struct mds_table_pending_t {
+ uint64_t reqid = 0; // all three fields default to zero
+ __s32 mds = 0; // presumably the rank of the requesting mds -- confirm against callers
+ version_t tid = 0;
+ mds_table_pending_t() {}
+ void encode(bufferlist& bl) const;
+ void decode(bufferlist::const_iterator& bl);
+ void dump(Formatter *f) const;
+ static void generate_test_instances(list<mds_table_pending_t*>& ls);
+};
+WRITE_CLASS_ENCODER(mds_table_pending_t)
+
+
+// =========
+// requests
+
+struct metareqid_t { // request id: the pair (entity name, tid)
+ entity_name_t name;
+ uint64_t tid = 0;
+ metareqid_t() {}
+ metareqid_t(entity_name_t n, ceph_tid_t t) : name(n), tid(t) {}
+ void encode(bufferlist& bl) const { // writes name, then tid; decode() reads them in the same order
+ using ceph::encode;
+ encode(name, bl);
+ encode(tid, bl);
+ }
+ void decode(bufferlist::const_iterator &p) { // mirror image of encode()
+ using ceph::decode;
+ decode(name, p);
+ decode(tid, p);
+ }
+};
+WRITE_CLASS_ENCODER(metareqid_t)
+
+// Print a request id as "<name>:<tid>".
+inline std::ostream& operator<<(std::ostream& out, const metareqid_t& r) {
+  out << r.name;
+  out << ":" << r.tid;
+  return out;
+}
+
+// Equality and ordering for metareqid_t.  Ordering is lexicographic on
+// (name, tid).
+inline bool operator==(const metareqid_t& l, const metareqid_t& r) {
+  if (!(l.name == r.name))
+    return false;
+  return l.tid == r.tid;
+}
+inline bool operator!=(const metareqid_t& l, const metareqid_t& r) {
+  if (l.name != r.name)
+    return true;
+  return l.tid != r.tid;
+}
+inline bool operator<(const metareqid_t& l, const metareqid_t& r) {
+  if (l.name < r.name)
+    return true;
+  // names must match exactly before the tid is allowed to decide
+  return l.name == r.name && l.tid < r.tid;
+}
+inline bool operator<=(const metareqid_t& l, const metareqid_t& r) {
+  if (l.name < r.name)
+    return true;
+  return l.name == r.name && l.tid <= r.tid;
+}
+// The remaining two are defined as negations of the operators above.
+inline bool operator>(const metareqid_t& l, const metareqid_t& r) {
+  return !(l <= r);
+}
+inline bool operator>=(const metareqid_t& l, const metareqid_t& r) {
+  return !(l < r);
+}
+
+namespace std {
+  // Hash for metareqid_t, covering the entity number, entity type and tid.
+  //
+  // The three parts used to be combined with a plain XOR.  XOR is symmetric,
+  // so e.g. (num, tid) and (num ^ k, tid ^ k) always landed on the same
+  // value.  The parts are now folded in one after another with an
+  // order-sensitive mixing step (the usual hash_combine recipe) so that such
+  // pairs spread out.  Hash values are only used in memory, so changing them
+  // does not affect any stored data.
+  template<> struct hash<metareqid_t> {
+    size_t operator()(const metareqid_t &r) const {
+      hash<uint64_t> H;
+      size_t seed = H(r.name.num());
+      mix(seed, H(r.name.type()));
+      mix(seed, H(r.tid));
+      return seed;
+    }
+  private:
+    // Fold v into seed; the shifts make the result depend on the order in
+    // which values are mixed in.
+    static void mix(size_t &seed, size_t v) {
+      seed ^= v + static_cast<size_t>(0x9e3779b9) + (seed << 6) + (seed >> 2);
+    }
+  };
+} // namespace std
+
+
+// cap info for client reconnect
+struct cap_reconnect_t {
+  string path;
+  mutable ceph_mds_cap_reconnect capinfo;  // mutable: may be modified through const objects
+  snapid_t snap_follows;
+  bufferlist flockbl;
+
+  // Empty record: capinfo zero-filled, snap_follows 0.
+  cap_reconnect_t() : snap_follows(0) {
+    // FIPS zeroization audit 20191117: this memset is not security related.
+    memset(&capinfo, 0, sizeof(capinfo));
+  }
+  // Fully specified record.  The contents of lb are claimed by flockbl;
+  // capinfo.flock_len is set to 0 here.
+  cap_reconnect_t(uint64_t cap_id, inodeno_t pino, std::string_view p, int w, int i,
+		  inodeno_t sr, snapid_t sf, bufferlist& lb) :
+    path(p), snap_follows(sf) {
+    capinfo.cap_id = cap_id;
+    capinfo.wanted = w;
+    capinfo.issued = i;
+    capinfo.snaprealm = sr;
+    capinfo.pathbase = pino;
+    capinfo.flock_len = 0;
+    flockbl.claim(lb);
+  }
+
+  // Current and legacy ("old") wire formats.
+  void encode(bufferlist& bl) const;
+  void decode(bufferlist::const_iterator& bl);
+  void encode_old(bufferlist& bl) const;
+  void decode_old(bufferlist::const_iterator& bl);
+
+  void dump(Formatter *f) const;
+  static void generate_test_instances(list<cap_reconnect_t*>& ls);
+};
+WRITE_CLASS_ENCODER(cap_reconnect_t)
+
+struct snaprealm_reconnect_t {
+ mutable ceph_mds_snaprealm_reconnect realm; // mutable: may be modified through const objects
+
+ snaprealm_reconnect_t() { // empty record: realm is zero-filled
+ // FIPS zeroization audit 20191117: this memset is not security related.
+ memset(&realm, 0, sizeof(realm));
+ }
+ snaprealm_reconnect_t(inodeno_t ino, snapid_t seq, inodeno_t parent) { // sets ino, seq and parent only; no memset on this path
+ realm.ino = ino;
+ realm.seq = seq;
+ realm.parent = parent;
+ }
+ void encode(bufferlist& bl) const;
+ void decode(bufferlist::const_iterator& bl);
+ void encode_old(bufferlist& bl) const; // legacy wire format
+ void decode_old(bufferlist::const_iterator& bl);
+
+ void dump(Formatter *f) const;
+ static void generate_test_instances(list<snaprealm_reconnect_t*>& ls);
+};
+WRITE_CLASS_ENCODER(snaprealm_reconnect_t)
+
+// compat for pre-FLOCK feature
+struct old_ceph_mds_cap_reconnect { // field order and sizes are the wire layout (raw-encoded below): do not reorder
+ ceph_le64 cap_id;
+ ceph_le32 wanted;
+ ceph_le32 issued;
+ ceph_le64 old_size; // the old_* fields are not copied by old_cap_reconnect_t's conversions to/from cap_reconnect_t
+ struct ceph_timespec old_mtime, old_atime;
+ ceph_le64 snaprealm;
+ ceph_le64 pathbase; /* base ino for our path to this ino */
+} __attribute__ ((packed)); // packed: no padding is inserted between the fields
+WRITE_RAW_ENCODER(old_ceph_mds_cap_reconnect)
+
+struct old_cap_reconnect_t {
+  string path;
+  old_ceph_mds_cap_reconnect capinfo;
+
+  // Down-convert from the current representation.
+  const old_cap_reconnect_t& operator=(const cap_reconnect_t& n) {
+    // cap_reconnect_t has no counterpart for old_size/old_mtime/old_atime.
+    // This struct has no constructor, so without zeroing those fields would
+    // hold whatever happened to be in memory, and encode() below copies the
+    // record out byte for byte.  Zero the record before filling it in.
+    memset(&capinfo, 0, sizeof(capinfo));
+    path = n.path;
+    capinfo.cap_id = n.capinfo.cap_id;
+    capinfo.wanted = n.capinfo.wanted;
+    capinfo.issued = n.capinfo.issued;
+    capinfo.snaprealm = n.capinfo.snaprealm;
+    capinfo.pathbase = n.capinfo.pathbase;
+    return *this;
+  }
+  // Up-convert to the current representation.  Fields that only exist in the
+  // new format keep the values cap_reconnect_t's default constructor gives
+  // them.  const-qualified: the conversion only reads this object.
+  operator cap_reconnect_t() const {
+    cap_reconnect_t n;
+    n.path = path;
+    n.capinfo.cap_id = capinfo.cap_id;
+    n.capinfo.wanted = capinfo.wanted;
+    n.capinfo.issued = capinfo.issued;
+    n.capinfo.snaprealm = capinfo.snaprealm;
+    n.capinfo.pathbase = capinfo.pathbase;
+    return n;
+  }
+
+  // Wire format: path followed by the raw capinfo record.
+  void encode(bufferlist& bl) const {
+    using ceph::encode;
+    encode(path, bl);
+    encode(capinfo, bl);
+  }
+  void decode(bufferlist::const_iterator& bl) {
+    using ceph::decode;
+    decode(path, bl);
+    decode(capinfo, bl);
+  }
+};
+WRITE_CLASS_ENCODER(old_cap_reconnect_t)
+
+
+// ================================================================
+// dir frag
+
+struct dirfrag_t { // a directory fragment: the pair (inode number, frag)
+ inodeno_t ino = 0;
+ frag_t frag;
+
+ dirfrag_t() {}
+ dirfrag_t(inodeno_t i, frag_t f) : ino(i), frag(f) { }
+
+ void encode(bufferlist& bl) const { // writes ino, then frag; decode() reads them in the same order
+ using ceph::encode;
+ encode(ino, bl);
+ encode(frag, bl);
+ }
+ void decode(bufferlist::const_iterator& bl) { // mirror image of encode()
+ using ceph::decode;
+ decode(ino, bl);
+ decode(frag, bl);
+ }
+};
+WRITE_CLASS_ENCODER(dirfrag_t)
+
+
+// Print "<ino>" for the root frag and "<ino>.<frag>" otherwise.
+inline std::ostream& operator<<(std::ostream& out, const dirfrag_t &df) {
+  out << df.ino;
+  if (df.frag.is_root())
+    return out;
+  return out << "." << df.frag;
+}
+// Lexicographic order on (ino, frag).
+inline bool operator<(dirfrag_t l, dirfrag_t r) {
+  return l.ino < r.ino ||
+         (l.ino == r.ino && l.frag < r.frag);
+}
+// Two dirfrags are equal when both the inode number and the frag match.
+inline bool operator==(dirfrag_t l, dirfrag_t r) {
+  if (!(l.ino == r.ino))
+    return false;
+  return l.frag == r.frag;
+}
+
+namespace std {
+ template<> struct hash<dirfrag_t> { // lets dirfrag_t be used as a key in unordered containers
+ size_t operator()(const dirfrag_t &df) const {
+ static rjhash<uint64_t> H; // function-local statics: constructed once, on first call
+ static rjhash<uint32_t> I;
+ return H(df.ino) ^ I(df.frag); // XOR of the two part hashes
+ }
+ };
+} // namespace std
+
+
+
+// ================================================================
+
+#define META_POP_IRD 0
+#define META_POP_IWR 1
+#define META_POP_READDIR 2
+#define META_POP_FETCH 3
+#define META_POP_STORE 4
+#define META_NPOP 5
+
+// Fixed-size vector of NUM decaying counters tracking per-inode load.
+class inode_load_vec_t {
+public:
+  using time = DecayCounter::time;
+  using clock = DecayCounter::clock;
+  static const size_t NUM = 2;
+
+  // Every counter is built from the same rate; the default constructor uses
+  // a default-constructed DecayRate.
+  inode_load_vec_t() : vec{DecayCounter(DecayRate()), DecayCounter(DecayRate())} {}
+  inode_load_vec_t(const DecayRate &rate) : vec{DecayCounter(rate), DecayCounter(rate)} {}
+
+  // Access counter t (0 <= t < NUM; the index is not range-checked).
+  DecayCounter &get(int t) {
+    return vec[t];
+  }
+  // Read-only access, matching dirfrag_load_vec_t::get() const, so counters
+  // can be inspected through a const inode_load_vec_t.
+  const DecayCounter &get(int t) const {
+    return vec[t];
+  }
+  // Reset every counter.
+  void zero() {
+    for (auto &d : vec) {
+      d.reset();
+    }
+  }
+  void encode(bufferlist &bl) const;
+  void decode(bufferlist::const_iterator& p);
+  void dump(Formatter *f) const;
+  static void generate_test_instances(list<inode_load_vec_t*>& ls);
+
+private:
+  std::array<DecayCounter, NUM> vec;
+};
+inline void encode(const inode_load_vec_t &c, bufferlist &bl) { // free-function form; forwards to the member
+ c.encode(bl);
+}
+inline void decode(inode_load_vec_t & c, bufferlist::const_iterator &p) { // free-function form; forwards to the member
+ c.decode(p);
+}
+
+// Fixed-size vector of NUM decaying counters tracking dirfrag load, one per
+// META_POP_* index.
+class dirfrag_load_vec_t {
+public:
+  using time = DecayCounter::time;
+  using clock = DecayCounter::clock;
+  static const size_t NUM = 5;
+
+  // Every counter is built from the same rate; the default constructor uses
+  // a default-constructed DecayRate.
+  dirfrag_load_vec_t() :
+    vec{DecayCounter(DecayRate()),
+        DecayCounter(DecayRate()),
+        DecayCounter(DecayRate()),
+        DecayCounter(DecayRate()),
+        DecayCounter(DecayRate())
+       }
+  {}
+  dirfrag_load_vec_t(const DecayRate &rate) :
+    vec{DecayCounter(rate), DecayCounter(rate), DecayCounter(rate), DecayCounter(rate), DecayCounter(rate)}
+  {}
+
+  // The counters are written one after another in index order; decode()
+  // reads them back the same way.
+  void encode(bufferlist &bl) const {
+    ENCODE_START(2, 2, bl);
+    for (const auto &i : vec) {
+      encode(i, bl);
+    }
+    ENCODE_FINISH(bl);
+  }
+  void decode(bufferlist::const_iterator &p) {
+    DECODE_START_LEGACY_COMPAT_LEN(2, 2, 2, p);
+    for (auto &i : vec) {
+      decode(i, p);
+    }
+    DECODE_FINISH(p);
+  }
+  void dump(Formatter *f) const;
+  void dump(Formatter *f, const DecayRate& rate) const;
+  static void generate_test_instances(std::list<dirfrag_load_vec_t*>& ls);
+
+  // Access counter t (a META_POP_* index; not range-checked).
+  const DecayCounter &get(int t) const {
+    return vec[t];
+  }
+  DecayCounter &get(int t) {
+    return vec[t];
+  }
+  // Apply the same adjustment d to every counter.
+  void adjust(double d) {
+    for (auto &i : vec) {
+      i.adjust(d);
+    }
+  }
+  // Reset every counter.
+  void zero() {
+    for (auto &i : vec) {
+      i.reset();
+    }
+  }
+  // Weighted sum of the counters: IRD and READDIR count once, IWR and FETCH
+  // twice, STORE four times.
+  double meta_load() const {
+    return
+      1*vec[META_POP_IRD].get() +
+      2*vec[META_POP_IWR].get() +
+      1*vec[META_POP_READDIR].get() +
+      2*vec[META_POP_FETCH].get() +
+      4*vec[META_POP_STORE].get();
+  }
+
+  // Element-wise add / subtract the current values of r's counters.  r is
+  // only read, so it is taken by const reference; this also accepts const
+  // objects and temporaries, which the former non-const reference rejected.
+  void add(const dirfrag_load_vec_t& r) {
+    for (size_t i=0; i<NUM; i++)
+      vec[i].adjust(r.vec[i].get());
+  }
+  void sub(const dirfrag_load_vec_t& r) {
+    for (size_t i=0; i<NUM; i++)
+      vec[i].adjust(-r.vec[i].get());
+  }
+  // Scale every counter by f.
+  void scale(double f) {
+    for (auto &i : vec) {
+      i.scale(f);
+    }
+  }
+
+private:
+  friend inline std::ostream& operator<<(std::ostream& out, const dirfrag_load_vec_t& dl);
+  std::array<DecayCounter, NUM> vec;
+};
+
+inline void encode(const dirfrag_load_vec_t &c, bufferlist &bl) { // free-function form; forwards to the member
+ c.encode(bl);
+}
+inline void decode(dirfrag_load_vec_t& c, bufferlist::const_iterator &p) { // free-function form; forwards to the member
+ c.decode(p);
+}
+
+// Print the five counters and the weighted meta load as
+// "[pop IRD:.. IWR:.. RDR:.. FET:.. STR:.. *LOAD:..]".
+// The text is assembled in a private stream so that the fixed/precision
+// settings do not leak into the caller's stream.
+inline std::ostream& operator<<(std::ostream& out, const dirfrag_load_vec_t& dl)
+{
+  std::ostringstream ss;
+  ss << std::setprecision(1) << std::fixed;
+  ss << "[pop";
+  ss << " IRD:" << dl.vec[META_POP_IRD];
+  ss << " IWR:" << dl.vec[META_POP_IWR];
+  ss << " RDR:" << dl.vec[META_POP_READDIR];
+  ss << " FET:" << dl.vec[META_POP_FETCH];
+  ss << " STR:" << dl.vec[META_POP_STORE];
+  ss << " *LOAD:" << dl.meta_load() << "]";
+  // NOTE(review): std::endl appends a newline and flushes on every insertion,
+  // including when this is nested inside mds_load_t's output -- confirm that
+  // is intended.
+  out << ss.str() << std::endl;
+  return out;
+}
+
+
+/* mds_load_t
+ * mds load
+ */
+
+struct mds_load_t {
+ using clock = dirfrag_load_vec_t::clock;
+ using time = dirfrag_load_vec_t::time;
+
+ dirfrag_load_vec_t auth; // two load vectors, printed as "auth/all" by operator<<
+ dirfrag_load_vec_t all;
+
+ mds_load_t() : auth(DecayRate()), all(DecayRate()) {} // both vectors use a default-constructed DecayRate
+ mds_load_t(const DecayRate &rate) : auth(rate), all(rate) {} // both vectors share the given rate
+
+ double req_rate = 0.0; // the scalar metrics all start at zero
+ double cache_hit_rate = 0.0;
+ double queue_len = 0.0;
+
+ double cpu_load_avg = 0.0;
+
+ double mds_load() const; // defined in MDBalancer.cc
+ void encode(bufferlist& bl) const;
+ void decode(bufferlist::const_iterator& bl);
+ void dump(Formatter *f) const;
+ static void generate_test_instances(std::list<mds_load_t*>& ls);
+};
+inline void encode(const mds_load_t &c, bufferlist &bl) { // free-function form; forwards to the member
+ c.encode(bl);
+}
+inline void decode(mds_load_t &c, bufferlist::const_iterator &p) { // free-function form; forwards to the member
+ c.decode(p);
+}
+
+// Print as "mdsload<AUTH/ALL, req R, hr H, qlen Q, cpu C>".
+inline std::ostream& operator<<(std::ostream& out, const mds_load_t& load)
+{
+  out << "mdsload<" << load.auth << "/" << load.all;
+  out << ", req " << load.req_rate;
+  out << ", hr " << load.cache_hit_rate;
+  out << ", qlen " << load.queue_len;
+  out << ", cpu " << load.cpu_load_avg;
+  out << ">";
+  return out;
+}
+
+// Remembers the last MAX distinct callers of hit() in a small ring and counts
+// hits that come from a caller not currently in the ring.
+class load_spread_t {
+public:
+  using time = DecayCounter::time;
+  using clock = DecayCounter::clock;
+  static const int MAX = 4;
+  int last[MAX];      // ring of remembered callers; -1 marks a never-used slot
+  int p = 0, n = 0;   // p: next slot to overwrite, n: number of slots in use
+  DecayCounter count;
+
+public:
+  load_spread_t() = delete;
+  load_spread_t(const DecayRate &rate) : count(rate)
+  {
+    for (int &slot : last)
+      slot = -1;
+  }
+
+  // Register a hit by `who`.  A caller already in the ring just gets the
+  // counter's last value back; a new caller is recorded (overwriting the
+  // oldest slot once the ring is full) and, unless it is the very first
+  // caller, bumps the counter.
+  double hit(int who) {
+    int idx = 0;
+    while (idx < n) {
+      if (last[idx] == who)
+        return count.get_last();
+      ++idx;
+    }
+
+    // we're new(ish)
+    last[p] = who;
+    ++p;
+    if (n < MAX)
+      ++n;
+    if (n == 1)
+      return 0.0;   // the very first caller does not count as a hit
+
+    if (p == MAX)
+      p = 0;        // wrap around to the start of the ring
+
+    return count.hit();
+  }
+  double get() const {
+    return count.get();
+  }
+};
+
+
+
+// ================================================================
+typedef std::pair<mds_rank_t, mds_rank_t> mds_authority_t;
+
+// -- authority delegation --
+// directory authority types
+// >= 0 is the auth mds
+#define CDIR_AUTH_PARENT mds_rank_t(-1) // default
+#define CDIR_AUTH_UNKNOWN mds_rank_t(-2)
+#define CDIR_AUTH_DEFAULT mds_authority_t(CDIR_AUTH_PARENT, CDIR_AUTH_UNKNOWN)
+#define CDIR_AUTH_UNDEF mds_authority_t(CDIR_AUTH_UNKNOWN, CDIR_AUTH_UNKNOWN)
+//#define CDIR_AUTH_ROOTINODE pair<int,int>( 0, -2)
+
+class MDSCacheObjectInfo { // identifies an object either by ino or by dirfrag (plus optional dname); see operator<< and operator== below
+public:
+ inodeno_t ino = 0; // non-zero selects the "by inode" form in operator<< and operator==
+ dirfrag_t dirfrag;
+ string dname; // when non-empty (and ino is 0), printed after the dirfrag
+ snapid_t snapid;
+
+ MDSCacheObjectInfo() {}
+
+ void encode(bufferlist& bl) const;
+ void decode(bufferlist::const_iterator& bl);
+ void dump(Formatter *f) const;
+ static void generate_test_instances(list<MDSCacheObjectInfo*>& ls);
+};
+
+// Print "<ino>.<snapid>" when an inode number is set; otherwise the dirfrag,
+// followed by "/<dname> snap <snapid>" when a dentry name is present.
+inline std::ostream& operator<<(std::ostream& out, const MDSCacheObjectInfo &info) {
+  if (info.ino) {
+    out << info.ino << "." << info.snapid;
+    return out;
+  }
+  out << info.dirfrag;
+  if (info.dname.length())
+    out << "/" << info.dname << " snap " << info.snapid;
+  return out;
+}
+
+// If either side carries an inode number, compare (ino, snapid); otherwise
+// compare (dirfrag, dname).
+inline bool operator==(const MDSCacheObjectInfo& l, const MDSCacheObjectInfo& r) {
+  const bool by_inode = l.ino || r.ino;
+  return by_inode
+    ? (l.ino == r.ino && l.snapid == r.snapid)
+    : (l.dirfrag == r.dirfrag && l.dname == r.dname);
+}
+WRITE_CLASS_ENCODER(MDSCacheObjectInfo)
+
+
+// parse a map of keys/values.
+namespace qi = boost::spirit::qi;
+
+template <typename Iterator>
+struct keys_and_values // Spirit Qi grammar that parses "key=value key=value ..." into a std::map<string, string>
+ : qi::grammar<Iterator, std::map<string, string>()>
+{
+ keys_and_values()
+ : keys_and_values::base_type(query) // query is the start rule
+ {
+ query = pair >> *(qi::lit(' ') >> pair); // one pair, then any number of further pairs each preceded by a single space
+ pair = key >> '=' >> value; // key and value separated by '='
+ key = qi::char_("a-zA-Z_") >> *qi::char_("a-zA-Z_0-9"); // a letter or '_' followed by letters, digits or '_'
+ value = +qi::char_("a-zA-Z0-9-_."); // one or more characters from the set; NOTE(review): confirm Spirit reads the '-' after "0-9" as a literal and not as a '9'..'_' range
+ }
+ qi::rule<Iterator, std::map<string, string>()> query;
+ qi::rule<Iterator, std::pair<string, string>()> pair;
+ qi::rule<Iterator, string()> key, value;
+};
+
+#endif
diff --git a/src/mds/snap.cc b/src/mds/snap.cc
new file mode 100644
index 00000000..e53daef2
--- /dev/null
+++ b/src/mds/snap.cc
@@ -0,0 +1,218 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004- Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#include <string_view>
+
+#include "snap.h"
+
+#include "common/Formatter.h"
+
+/*
+ * SnapInfo
+ */
+
+void SnapInfo::encode(bufferlist& bl) const // serializes snapid, ino, stamp, name; the cached long_name is not encoded
+{
+ ENCODE_START(2, 2, bl); // version numbers must stay in step with decode()
+ encode(snapid, bl); // field order is the wire format; decode() reads in the same order
+ encode(ino, bl);
+ encode(stamp, bl);
+ encode(name, bl);
+ ENCODE_FINISH(bl);
+}
+
+void SnapInfo::decode(bufferlist::const_iterator& bl) // mirror image of encode(): snapid, ino, stamp, name
+{
+ DECODE_START_LEGACY_COMPAT_LEN(2, 2, 2, bl);
+ decode(snapid, bl);
+ decode(ino, bl);
+ decode(stamp, bl);
+ decode(name, bl);
+ DECODE_FINISH(bl);
+}
+
+void SnapInfo::dump(Formatter *f) const // emits the four persistent fields into the formatter's current section
+{
+ f->dump_unsigned("snapid", snapid);
+ f->dump_unsigned("ino", ino);
+ f->dump_stream("stamp") << stamp; // stamp is written through its stream inserter
+ f->dump_string("name", name);
+}
+
+// Append two heap-allocated instances to ls: a default-constructed one and
+// one with every persistent field set.
+void SnapInfo::generate_test_instances(list<SnapInfo*>& ls)
+{
+  ls.push_back(new SnapInfo);
+  ls.push_back(new SnapInfo);
+
+  SnapInfo &sample = *ls.back();
+  sample.snapid = 1;
+  sample.ino = 2;
+  sample.stamp = utime_t(3, 4);
+  sample.name = "foo";
+}
+
+// Print as "snap(<snapid> <ino> '<name>' <stamp>)".
+ostream& operator<<(ostream& out, const SnapInfo &sn)
+{
+  out << "snap(" << sn.snapid;
+  out << " " << sn.ino;
+  out << " '" << sn.name;
+  out << "' " << sn.stamp << ")";
+  return out;
+}
+
+// Return the long form of the snapshot name, "_<name>_<ino>".
+//
+// The result is cached in long_name and rebuilt whenever the cache is empty
+// or no longer matches the current name.  The returned view refers to that
+// member, so it is only valid until long_name changes again.
+std::string_view SnapInfo::get_long_name() const
+{
+  if (long_name.empty() ||
+      long_name.compare(1, name.size(), name) ||
+      long_name.find_last_of("_") != name.size() + 1) {
+    // Only the number goes through a fixed buffer (a 64-bit value has at
+    // most 20 decimal digits).  The name is appended as a string, so a long
+    // snapshot name is no longer cut off the way it was when the whole
+    // result was formatted into an 80-byte buffer.
+    char ino_str[24];
+    snprintf(ino_str, sizeof(ino_str), "%llu", (unsigned long long)ino);
+    long_name = "_";
+    long_name += name;
+    long_name += '_';
+    long_name += ino_str;
+  }
+  return long_name;
+}
+
+/*
+ * snaplink_t
+ */
+
+void snaplink_t::encode(bufferlist& bl) const // serializes ino, then first
+{
+ ENCODE_START(2, 2, bl); // version numbers must stay in step with decode()
+ encode(ino, bl);
+ encode(first, bl);
+ ENCODE_FINISH(bl);
+}
+
+void snaplink_t::decode(bufferlist::const_iterator& bl) // mirror image of encode(): ino, then first
+{
+ DECODE_START_LEGACY_COMPAT_LEN(2, 2, 2, bl);
+ decode(ino, bl);
+ decode(first, bl);
+ DECODE_FINISH(bl);
+}
+
+void snaplink_t::dump(Formatter *f) const // emits both fields into the formatter's current section
+{
+ f->dump_unsigned("ino", ino);
+ f->dump_unsigned("first", first);
+}
+
+// Append two heap-allocated instances to ls: a default-constructed one and
+// one with both fields set.
+void snaplink_t::generate_test_instances(list<snaplink_t*>& ls)
+{
+  ls.push_back(new snaplink_t);
+  ls.push_back(new snaplink_t);
+
+  snaplink_t &sample = *ls.back();
+  sample.ino = 2;
+  sample.first = 123;
+}
+
+// Print as "<ino>@<first>".
+ostream& operator<<(ostream& out, const snaplink_t &l)
+{
+  out << l.ino;
+  out << "@" << l.first;
+  return out;
+}
+
+/*
+ * sr_t
+ */
+
+void sr_t::encode(bufferlist& bl) const // serializes every field of sr_t; the order below is the wire format
+{
+ ENCODE_START(6, 4, bl); // decode() keys its optional fields off this version (6)
+ encode(seq, bl);
+ encode(created, bl);
+ encode(last_created, bl);
+ encode(last_destroyed, bl);
+ encode(current_parent_since, bl);
+ encode(snaps, bl);
+ encode(past_parents, bl);
+ encode(past_parent_snaps, bl); // read back by decode() only when struct_v >= 5
+ encode(flags, bl); // read back by decode() only when struct_v >= 6
+ ENCODE_FINISH(bl);
+}
+
+void sr_t::decode(bufferlist::const_iterator& p) // reads the format written by encode(), tolerating older struct versions
+{
+ DECODE_START_LEGACY_COMPAT_LEN(6, 4, 4, p);
+ if (struct_v == 2) {
+ __u8 struct_v; // deliberately shadows the outer struct_v; the byte is read and discarded
+ decode(struct_v, p); // yes, really: extra byte for v2 encoding only, see 6ee52e7d.
+ }
+ decode(seq, p);
+ decode(created, p);
+ decode(last_created, p);
+ decode(last_destroyed, p);
+ decode(current_parent_since, p);
+ decode(snaps, p);
+ decode(past_parents, p);
+ if (struct_v >= 5) // past_parent_snaps is only present from version 5 on; otherwise left untouched
+ decode(past_parent_snaps, p);
+ if (struct_v >= 6) // flags is only present from version 6 on
+ decode(flags, p);
+ else
+ flags = 0; // older encodings carry no flags
+ DECODE_FINISH(p);
+}
+
+// Dump the scalar fields followed by three arrays: "snaps", "past_parents"
+// and "past_parent_snaps".  Map entries are written as objects that carry
+// the map key as "last" next to the dumped value.
+void sr_t::dump(Formatter *f) const
+{
+  f->dump_unsigned("seq", seq);
+  f->dump_unsigned("created", created);
+  f->dump_unsigned("last_created", last_created);
+  f->dump_unsigned("last_destroyed", last_destroyed);
+  f->dump_unsigned("current_parent_since", current_parent_since);
+
+  f->open_array_section("snaps");
+  for (const auto &entry : snaps) {
+    f->open_object_section("snapinfo");
+    f->dump_unsigned("last", entry.first);
+    entry.second.dump(f);
+    f->close_section();
+  }
+  f->close_section();
+
+  f->open_array_section("past_parents");
+  for (const auto &entry : past_parents) {
+    f->open_object_section("past_parent");
+    f->dump_unsigned("last", entry.first);
+    entry.second.dump(f);
+    f->close_section();
+  }
+  f->close_section();
+
+  f->open_array_section("past_parent_snaps");
+  for (const auto &snap : past_parent_snaps) {
+    f->open_object_section("snapinfo");
+    f->dump_unsigned("snapid", snap);
+    f->close_section();
+  }
+  f->close_section();
+}
+
+// Append two heap-allocated instances to ls: a default-constructed one and
+// one with its scalar fields, one snap, one past parent and two past parent
+// snaps filled in.
+void sr_t::generate_test_instances(list<sr_t*>& ls)
+{
+  ls.push_back(new sr_t);
+  ls.push_back(new sr_t);
+
+  sr_t &sample = *ls.back();
+  sample.seq = 1;
+  sample.created = 2;
+  sample.last_created = 3;
+  sample.last_destroyed = 4;
+  sample.current_parent_since = 5;
+
+  SnapInfo &snap = sample.snaps[123];
+  snap.snapid = 7;
+  snap.ino = 8;
+  snap.stamp = utime_t(9, 10);
+  snap.name = "name1";
+
+  snaplink_t &parent = sample.past_parents[12];
+  parent.ino = 12;
+  parent.first = 3;
+
+  sample.past_parent_snaps.insert(5);
+  sample.past_parent_snaps.insert(6);
+}
+
+
diff --git a/src/mds/snap.h b/src/mds/snap.h
new file mode 100644
index 00000000..41f48d80
--- /dev/null
+++ b/src/mds/snap.h
@@ -0,0 +1,115 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#ifndef CEPH_MDS_SNAP_H
+#define CEPH_MDS_SNAP_H
+
+#include <string_view>
+
+#include "mdstypes.h"
+#include "common/snap_types.h"
+
+/*
+ * generic snap descriptor.
+ */
+struct SnapInfo {
+ snapid_t snapid;
+ inodeno_t ino;
+ utime_t stamp;
+ string name;
+
+ mutable string long_name; ///< cached _$name_$ino, rebuilt on demand by get_long_name()
+
+ void encode(bufferlist &bl) const;
+ void decode(bufferlist::const_iterator &bl);
+ void dump(Formatter *f) const;
+ static void generate_test_instances(list<SnapInfo*>& ls);
+
+ std::string_view get_long_name() const; // returned view refers to the long_name member
+};
+WRITE_CLASS_ENCODER(SnapInfo)
+
+// Equality over the four persistent fields; the cached long_name is ignored.
+inline bool operator==(const SnapInfo &l, const SnapInfo &r)
+{
+  if (!(l.snapid == r.snapid))
+    return false;
+  if (!(l.ino == r.ino))
+    return false;
+  if (!(l.stamp == r.stamp))
+    return false;
+  return l.name == r.name;
+}
+
+ostream& operator<<(ostream& out, const SnapInfo &sn);
+
+
+/*
+ * SnapRealm - a subtree that shares the same set of snapshots.
+ */
+struct SnapRealm;
+class CInode;
+class MDCache;
+
+
+
+#include "Capability.h"
+
+struct snaplink_t { // pair (ino, first); printed as "ino@first" by operator<<
+ inodeno_t ino;
+ snapid_t first;
+
+ void encode(bufferlist &bl) const;
+ void decode(bufferlist::const_iterator &bl);
+ void dump(Formatter *f) const;
+ static void generate_test_instances(list<snaplink_t*>& ls);
+};
+WRITE_CLASS_ENCODER(snaplink_t)
+
+ostream& operator<<(ostream& out, const snaplink_t &l);
+
+
+// carry data about a specific version of a SnapRealm
+struct sr_t {
+ snapid_t seq; // basically, a version/seq # for changes to _this_ realm.
+ snapid_t created; // when this realm was created.
+ snapid_t last_created; // last snap created in _this_ realm.
+ snapid_t last_destroyed; // seq for last removal
+ snapid_t current_parent_since; // initialised to 1 by the constructor (the other snapids start at 0)
+ map<snapid_t, SnapInfo> snaps;
+ map<snapid_t, snaplink_t> past_parents; // key is "last" (or NOSNAP)
+ set<snapid_t> past_parent_snaps;
+
+ __u32 flags; // bitmask of the enum values below
+ enum {
+ PARENT_GLOBAL = 1 << 0,
+ SUBVOLUME = 1 << 1,
+ };
+
+ void mark_parent_global() { flags |= PARENT_GLOBAL; } // set / clear / test the PARENT_GLOBAL bit
+ void clear_parent_global() { flags &= ~PARENT_GLOBAL; }
+ bool is_parent_global() const { return flags & PARENT_GLOBAL; }
+
+ void mark_subvolume() { flags |= SUBVOLUME; } // set / clear / test the SUBVOLUME bit
+ void clear_subvolume() { flags &= ~SUBVOLUME; }
+ bool is_subvolume() const { return flags & SUBVOLUME; }
+
+ sr_t() // all snapids 0 except current_parent_since (1); no flags set
+ : seq(0), created(0),
+ last_created(0), last_destroyed(0),
+ current_parent_since(1), flags(0)
+ {}
+
+ void encode(bufferlist &bl) const;
+ void decode(bufferlist::const_iterator &bl);
+ void dump(Formatter *f) const;
+ static void generate_test_instances(list<sr_t*>& ls);
+};
+WRITE_CLASS_ENCODER(sr_t)
+
+#endif