summaryrefslogtreecommitdiffstats
path: root/src/mds/Server.cc
diff options
context:
space:
mode:
Diffstat (limited to 'src/mds/Server.cc')
-rw-r--r--src/mds/Server.cc11740
1 files changed, 11740 insertions, 0 deletions
diff --git a/src/mds/Server.cc b/src/mds/Server.cc
new file mode 100644
index 000000000..ced4ecffa
--- /dev/null
+++ b/src/mds/Server.cc
@@ -0,0 +1,11740 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#include <boost/lexical_cast.hpp>
+#include "include/ceph_assert.h" // lexical_cast includes system assert.h
+
+#include <boost/config/warning_disable.hpp>
+#include <boost/fusion/include/std_pair.hpp>
+#include <boost/range/adaptor/reversed.hpp>
+
+#include "MDSRank.h"
+#include "Server.h"
+#include "Locker.h"
+#include "MDCache.h"
+#include "MDLog.h"
+#include "Migrator.h"
+#include "MDBalancer.h"
+#include "InoTable.h"
+#include "SnapClient.h"
+#include "Mutation.h"
+#include "MetricsHandler.h"
+#include "cephfs_features.h"
+
+#include "msg/Messenger.h"
+
+#include "osdc/Objecter.h"
+
+#include "events/EUpdate.h"
+#include "events/EPeerUpdate.h"
+#include "events/ESession.h"
+#include "events/EOpen.h"
+#include "events/ECommitted.h"
+#include "events/EPurged.h"
+
+#include "include/stringify.h"
+#include "include/filepath.h"
+#include "common/errno.h"
+#include "common/Timer.h"
+#include "common/perf_counters.h"
+#include "include/compat.h"
+#include "osd/OSDMap.h"
+#include "fscrypt.h"
+
+#include <errno.h>
+
+#include <list>
+#include <regex>
+#include <string_view>
+#include <functional>
+
+#include "common/config.h"
+
+#include "msg/Message.h"
+
+#define dout_context g_ceph_context
+#define dout_subsys ceph_subsys_mds
+#undef dout_prefix
+#define dout_prefix *_dout << "mds." << mds->get_nodeid() << ".server "
+
+using namespace std;
+
+/**
+ * MDSContext base that carries a back-pointer to a Server and reports
+ * that Server's MDSRank from get_mds().
+ */
+class ServerContext : public MDSContext {
+public:
+  explicit ServerContext(Server *s) : server(s) {
+    ceph_assert(server != NULL);
+  }
+
+protected:
+  Server *server;  // checked to be non-null by the constructor
+
+  MDSRank *get_mds() override {
+    return server->mds;
+  }
+};
+
+/**
+ * BatchOp that groups getattr/lookup client requests.
+ *
+ * `mdr` is the current head request.  Requests passed to add_request() are
+ * parked in `batch_reqs` and are forwarded or answered together with the
+ * head; requests flagged `killed` are skipped.
+ */
+class Batch_Getattr_Lookup : public BatchOp {
+protected:
+  Server* server;
+  ceph::ref_t<MDRequestImpl> mdr;
+  std::vector<ceph::ref_t<MDRequestImpl>> batch_reqs;
+  int res = 0;
+
+public:
+  // Point the head request at the batch map of the object it targets: the
+  // last dentry of dn[0] for CEPH_MDS_OP_LOOKUP, the inode in[0] otherwise.
+  Batch_Getattr_Lookup(Server* s, const ceph::ref_t<MDRequestImpl>& r)
+    : server(s), mdr(r) {
+    if (mdr->client_request->get_op() != CEPH_MDS_OP_LOOKUP) {
+      mdr->batch_op_map = &mdr->in[0]->batch_ops;
+    } else {
+      mdr->batch_op_map = &mdr->dn[0].back()->batch_ops;
+    }
+  }
+
+  // Park another request behind the current head.
+  void add_request(const ceph::ref_t<MDRequestImpl>& r) override {
+    batch_reqs.emplace_back(r);
+  }
+
+  // Promote the most recently added live request to head, handing the
+  // batch map pointer over from the old head.  Killed requests are
+  // discarded on the way.  Returns nullptr when nothing is left.
+  ceph::ref_t<MDRequestImpl> find_new_head() override {
+    while (!batch_reqs.empty()) {
+      ceph::ref_t<MDRequestImpl> candidate = std::move(batch_reqs.back());
+      batch_reqs.pop_back();
+      if (!candidate->killed) {
+        candidate->batch_op_map = std::exchange(mdr->batch_op_map, nullptr);
+        mdr = candidate;
+        return mdr;
+      }
+    }
+    return nullptr;
+  }
+
+  // Forward the head and every live parked request to rank `t`.
+  void _forward(mds_rank_t t) override {
+    MDCache* const cache = server->mdcache;
+    cache->mds->forward_message_mds(mdr, t);
+    mdr->set_mds_stamp(ceph_clock_now());
+    for (auto& queued : batch_reqs) {
+      if (queued->killed)
+        continue;
+      cache->request_forward(queued, t);
+    }
+    batch_reqs.clear();
+  }
+
+  // Answer every live parked request with the head's trace and result `r`,
+  // then reply to the head itself.
+  void _respond(int r) override {
+    mdr->set_mds_stamp(ceph_clock_now());
+    for (auto& queued : batch_reqs) {
+      if (queued->killed)
+        continue;
+      queued->tracei = mdr->tracei;
+      queued->tracedn = mdr->tracedn;
+      server->respond_to_request(queued, r);
+    }
+    batch_reqs.clear();
+    server->reply_client_request(mdr, make_message<MClientReply>(*mdr->client_request, r));
+  }
+
+  void print(std::ostream& o) const override {
+    o << "[batch front=" << *mdr << "]";
+  }
+};
+
+/**
+ * MDSLogContextBase bound to a Server and, optionally, to the request
+ * being journaled.  When a request is attached, pre_finish() marks a
+ * "journal_committed: " event on it.
+ */
+class ServerLogContext : public MDSLogContextBase {
+public:
+  explicit ServerLogContext(Server *s) : server(s) {
+    ceph_assert(server != NULL);
+  }
+  explicit ServerLogContext(Server *s, MDRequestRef& r) : server(s), mdr(r) {
+    ceph_assert(server != NULL);
+  }
+
+protected:
+  Server *server;
+  MDRequestRef mdr;  // empty when constructed without a request
+
+  MDSRank *get_mds() override {
+    return server->mds;
+  }
+
+  void pre_finish(int r) override {
+    if (!mdr)
+      return;
+    mdr->mark_event("journal_committed: ");
+  }
+};
+
+/**
+ * Build the "mds_server" perf counter set and register it with the
+ * global perf counter collection.
+ *
+ * Counters are registered in three groups: session/request counters at
+ * PRIO_INTERESTING, per-operation latency averages at PRIO_USEFUL and
+ * dispatch counters at PRIO_DEBUGONLY.
+ */
+void Server::create_logger()
+{
+  // Counter that carries a short nick.
+  struct NickedCounter {
+    int key;
+    const char *name;
+    const char *description;
+    const char *nick;
+  };
+  // Counter registered with key, name and description only.
+  struct PlainCounter {
+    int key;
+    const char *name;
+    const char *description;
+  };
+
+  PerfCountersBuilder plb(g_ceph_context, "mds_server", l_mdss_first, l_mdss_last);
+
+  const NickedCounter interesting_counters[] = {
+    {l_mdss_handle_client_request, "handle_client_request", "Client requests", "hcr"},
+    {l_mdss_handle_peer_request, "handle_peer_request", "Peer requests", "hsr"},
+    {l_mdss_handle_client_session, "handle_client_session", "Client session messages", "hcs"},
+    {l_mdss_cap_revoke_eviction, "cap_revoke_eviction", "Cap Revoke Client Eviction", "cre"},
+    {l_mdss_cap_acquisition_throttle, "cap_acquisition_throttle", "Cap acquisition throttle counter", "cat"},
+  };
+  for (const auto& spec : interesting_counters) {
+    plb.add_u64_counter(spec.key, spec.name, spec.description, spec.nick,
+                        PerfCountersBuilder::PRIO_INTERESTING);
+  }
+
+  // fop latencies are useful
+  plb.set_prio_default(PerfCountersBuilder::PRIO_USEFUL);
+  const PlainCounter latency_counters[] = {
+    {l_mdss_req_lookuphash_latency, "req_lookuphash_latency", "Request type lookup hash of inode latency"},
+    {l_mdss_req_lookupino_latency, "req_lookupino_latency", "Request type lookup inode latency"},
+    {l_mdss_req_lookupparent_latency, "req_lookupparent_latency", "Request type lookup parent latency"},
+    {l_mdss_req_lookupname_latency, "req_lookupname_latency", "Request type lookup name latency"},
+    {l_mdss_req_lookup_latency, "req_lookup_latency", "Request type lookup latency"},
+    {l_mdss_req_lookupsnap_latency, "req_lookupsnap_latency", "Request type lookup snapshot latency"},
+    {l_mdss_req_getattr_latency, "req_getattr_latency", "Request type get attribute latency"},
+    {l_mdss_req_setattr_latency, "req_setattr_latency", "Request type set attribute latency"},
+    {l_mdss_req_setlayout_latency, "req_setlayout_latency", "Request type set file layout latency"},
+    {l_mdss_req_setdirlayout_latency, "req_setdirlayout_latency", "Request type set directory layout latency"},
+    {l_mdss_req_getvxattr_latency, "req_getvxattr_latency", "Request type get virtual extended attribute latency"},
+    {l_mdss_req_setxattr_latency, "req_setxattr_latency", "Request type set extended attribute latency"},
+    {l_mdss_req_rmxattr_latency, "req_rmxattr_latency", "Request type remove extended attribute latency"},
+    {l_mdss_req_readdir_latency, "req_readdir_latency", "Request type read directory latency"},
+    {l_mdss_req_setfilelock_latency, "req_setfilelock_latency", "Request type set file lock latency"},
+    {l_mdss_req_getfilelock_latency, "req_getfilelock_latency", "Request type get file lock latency"},
+    {l_mdss_req_create_latency, "req_create_latency", "Request type create latency"},
+    {l_mdss_req_open_latency, "req_open_latency", "Request type open latency"},
+    {l_mdss_req_mknod_latency, "req_mknod_latency", "Request type make node latency"},
+    {l_mdss_req_link_latency, "req_link_latency", "Request type link latency"},
+    {l_mdss_req_unlink_latency, "req_unlink_latency", "Request type unlink latency"},
+    {l_mdss_req_rmdir_latency, "req_rmdir_latency", "Request type remove directory latency"},
+    {l_mdss_req_rename_latency, "req_rename_latency", "Request type rename latency"},
+    {l_mdss_req_mkdir_latency, "req_mkdir_latency", "Request type make directory latency"},
+    {l_mdss_req_symlink_latency, "req_symlink_latency", "Request type symbolic link latency"},
+    {l_mdss_req_lssnap_latency, "req_lssnap_latency", "Request type list snapshot latency"},
+    {l_mdss_req_mksnap_latency, "req_mksnap_latency", "Request type make snapshot latency"},
+    {l_mdss_req_rmsnap_latency, "req_rmsnap_latency", "Request type remove snapshot latency"},
+    {l_mdss_req_renamesnap_latency, "req_renamesnap_latency", "Request type rename snapshot latency"},
+    {l_mdss_req_snapdiff_latency, "req_snapdiff_latency", "Request type snapshot difference latency"},
+  };
+  for (const auto& spec : latency_counters) {
+    plb.add_time_avg(spec.key, spec.name, spec.description);
+  }
+
+  plb.set_prio_default(PerfCountersBuilder::PRIO_DEBUGONLY);
+  const PlainCounter dispatch_counters[] = {
+    {l_mdss_dispatch_client_request, "dispatch_client_request", "Client requests dispatched"},
+    {l_mdss_dispatch_peer_request, "dispatch_server_request", "Server requests dispatched"},
+  };
+  for (const auto& spec : dispatch_counters) {
+    plb.add_u64_counter(spec.key, spec.name, spec.description);
+  }
+
+  logger = plb.create_perf_counters();
+  g_ceph_context->get_perfcounters_collection()->add(logger);
+}
+
+/**
+ * Construct the Server for MDS rank `m`.
+ *
+ * Caches the rank's MDCache and MDLog pointers, stores the metrics
+ * handler, and takes a snapshot of the configuration options this class
+ * reads, plus the supported feature and metric-spec bit sets.
+ */
+Server::Server(MDSRank *m, MetricsHandler *metrics_handler) :
+  mds(m),
+  mdcache(mds->mdcache), mdlog(mds->mdlog),
+  inject_rename_corrupt_dentry_first(g_conf().get_val<double>("mds_inject_rename_corrupt_dentry_first")),
+  recall_throttle(g_conf().get_val<double>("mds_recall_max_decay_rate")),
+  metrics_handler(metrics_handler)
+{
+  const auto& conf = g_conf();
+
+  // request routing / replay behaviour
+  forward_all_requests_to_auth = conf.get_val<bool>("mds_forward_all_requests_to_auth");
+  replay_unsafe_with_closed_session = conf.get_val<bool>("mds_replay_unsafe_with_closed_session");
+
+  // caps, snapshots and inode delegation
+  cap_revoke_eviction_timeout = conf.get_val<double>("mds_cap_revoke_eviction_timeout");
+  max_snaps_per_dir = conf.get_val<uint64_t>("mds_max_snaps_per_dir");
+  delegate_inos_pct = conf.get_val<uint64_t>("mds_client_delegate_inos_pct");
+  max_caps_per_client = conf.get_val<uint64_t>("mds_max_caps_per_client");
+  cap_acquisition_throttle = conf.get_val<uint64_t>("mds_session_cap_acquisition_throttle");
+  max_caps_throttle_ratio = conf.get_val<double>("mds_session_max_caps_throttle_ratio");
+  caps_throttle_retry_request_timeout = conf.get_val<double>("mds_cap_acquisition_throttle_retry_request_timeout");
+
+  // directory size limits
+  dir_max_entries = conf.get_val<uint64_t>("mds_dir_max_entries");
+  bal_fragment_size_max = conf.get_val<int64_t>("mds_bal_fragment_size_max");
+
+  // feature advertisement
+  supported_features = feature_bitset_t(CEPHFS_FEATURES_MDS_SUPPORTED);
+  supported_metric_spec = feature_bitset_t(CEPHFS_METRIC_FEATURES_ALL);
+}
+
+/**
+ * Entry point for messages routed to the Server.
+ *
+ * Reconnect messages are handled immediately.  Client requests that arrive
+ * while the MDS is not active are dropped, queued for replay, or parked
+ * until the MDS becomes active.  Everything else is handed to the handler
+ * for its message type; an unknown type aborts.
+ */
+void Server::dispatch(const cref_t<Message> &m)
+{
+  if (m->get_type() == CEPH_MSG_CLIENT_RECONNECT) {
+    handle_client_reconnect(ref_cast<MClientReconnect>(m));
+    return;
+  }
+
+  /*
+   * During the reconnect phase a client may send unsafe requests before
+   * its reconnect message.  Accepting requests from closed sessions
+   * (replay_unsafe_with_closed_session) covers this scenario:
+   *
+   * 1. In the reconnect phase, the client sends unsafe requests to the MDS.
+   * 2. The reconnect timeout is reached.  Every session that did not send
+   *    a reconnect message in time, some of which may have sent unsafe
+   *    requests, is marked as closed.  (Another situation is #31668, which
+   *    denies all client reconnect messages to speed up reboot.)
+   * 3. The unsafe requests from sessions that were late or were denied can
+   *    then still be handled in the clientreplay phase.
+   */
+  const bool closed_session_ok = replay_unsafe_with_closed_session;
+
+  // active?
+  // handle_peer_request()/handle_client_session() will wait if necessary
+  if (m->get_type() == CEPH_MSG_CLIENT_REQUEST && !mds->is_active()) {
+    const auto &req = ref_cast<MClientRequest>(m);
+    if (mds->is_reconnect() || mds->get_want_state() == CEPH_MDS_STATE_RECONNECT) {
+      Session *session = mds->get_session(req);
+      const bool session_usable =
+        session && (session->is_open() || closed_session_ok);
+      if (!session_usable) {
+        dout(5) << "session is closed, dropping " << req->get_reqid() << dendl;
+        return;
+      }
+
+      bool queue_replay = false;
+      if (req->is_replay() || req->is_async()) {
+        dout(3) << "queuing replayed op" << dendl;
+        queue_replay = true;
+        if (req->head.ino &&
+            !session->have_completed_request(req->get_reqid().tid, nullptr)) {
+          inodeno_t ino(req->head.ino);
+          mdcache->add_replay_ino_alloc(ino);
+          if (replay_unsafe_with_closed_session &&
+              session->free_prealloc_inos.contains(ino)) {
+            // don't purge inodes that will be created by later replay
+            session->free_prealloc_inos.erase(ino);
+            session->delegated_inos.insert(ino);
+          }
+        }
+      } else if (req->get_retry_attempt()) {
+        // Process completed requests in the clientreplay stage.  A completed
+        // request might have created a new file/directory; queuing it
+        // guarantees the MDS replies to the client before another request
+        // modifies that new file/directory.
+        if (session->have_completed_request(req->get_reqid().tid, NULL)) {
+          dout(3) << "queuing completed op" << dendl;
+          queue_replay = true;
+        }
+        // This request was created before the cap reconnect message, so
+        // drop any embedded cap releases.
+        req->releases.clear();
+      }
+
+      if (queue_replay) {
+        req->mark_queued_for_replay();
+        mds->enqueue_replay(new C_MDS_RetryMessage(mds, m));
+        return;
+      }
+    }
+
+    // A stopping MDS, or a clientreplay MDS looking at a request that was
+    // queued for replay, handles the request now; anything else waits.
+    const bool handle_now =
+      mds->is_stopping() ||
+      (mds->is_clientreplay() && req->is_queued_for_replay());
+    if (!handle_now) {
+      dout(3) << "not active yet, waiting" << dendl;
+      mds->wait_for_active(new C_MDS_RetryMessage(mds, m));
+      return;
+    }
+  }
+
+  switch (m->get_type()) {
+  case CEPH_MSG_CLIENT_SESSION:
+    handle_client_session(ref_cast<MClientSession>(m));
+    return;
+  case CEPH_MSG_CLIENT_REQUEST:
+    handle_client_request(ref_cast<MClientRequest>(m));
+    return;
+  case CEPH_MSG_CLIENT_RECLAIM:
+    handle_client_reclaim(ref_cast<MClientReclaim>(m));
+    return;
+  case MSG_MDS_PEER_REQUEST:
+    handle_peer_request(ref_cast<MMDSPeerRequest>(m));
+    return;
+  default:
+    derr << "Server unknown message " << m->get_type() << " from peer type " << m->get_connection()->get_peer_type() << dendl;
+    ceph_abort_msg("server unknown message " + to_string(m->get_type()) + " from peer type " + to_string(m->get_connection()->get_peer_type()));
+  }
+}
+
+
+
+// ----------------------------------------------------------
+// SESSION management
+
+/**
+ * Journal completion for a session open/close event.
+ *
+ * On finish it passes the captured session state to
+ * Server::_session_logged() and then completes the optional follow-up
+ * context `fin` with the same result code.
+ */
+class C_MDS_session_finish : public ServerLogContext {
+  Session *session;
+  uint64_t state_seq;
+  bool open;
+  version_t cmapv;
+  interval_set<inodeno_t> inos_to_free;
+  version_t inotablev = 0;
+  interval_set<inodeno_t> inos_to_purge;
+  LogSegment *ls = nullptr;
+  Context *fin;
+
+public:
+  // Variant without inode bookkeeping: the inode sets stay empty,
+  // inotablev stays 0 and no log segment is attached.
+  C_MDS_session_finish(Server *srv, Session *se, uint64_t sseq, bool s, version_t mv,
+                       Context *fin_ = nullptr)
+    : ServerLogContext(srv),
+      session(se),
+      state_seq(sseq),
+      open(s),
+      cmapv(mv),
+      fin(fin_) {}
+
+  // Variant that also carries the inodes to free / purge, the projected
+  // inotable version and the log segment.
+  C_MDS_session_finish(Server *srv, Session *se, uint64_t sseq, bool s, version_t mv,
+                       const interval_set<inodeno_t>& to_free, version_t iv,
+                       const interval_set<inodeno_t>& to_purge, LogSegment *_ls,
+                       Context *fin_ = nullptr)
+    : ServerLogContext(srv),
+      session(se),
+      state_seq(sseq),
+      open(s),
+      cmapv(mv),
+      inos_to_free(to_free),
+      inotablev(iv),
+      inos_to_purge(to_purge),
+      ls(_ls),
+      fin(fin_) {}
+
+  void finish(int r) override {
+    ceph_assert(r == 0);
+    server->_session_logged(session, state_seq, open, cmapv, inos_to_free, inotablev, inos_to_purge, ls);
+    if (!fin)
+      return;
+    fin->complete(r);
+  }
+};
+
+/**
+ * Look up the session whose client metadata "uuid" entry equals `uuid`.
+ *
+ * When two matching sessions exist they are asserted to be linked through
+ * `reclaiming_from`, and the one that is doing the reclaiming is returned.
+ * Returns nullptr when no session matches.
+ */
+Session* Server::find_session_by_uuid(std::string_view uuid)
+{
+  Session* session = nullptr;
+  for (auto& it : mds->sessionmap.get_sessions()) {
+    auto& metadata = it.second->info.client_metadata;
+    auto p = metadata.find("uuid");
+    const bool uuid_matches = (p != metadata.end()) && (p->second == uuid);
+    if (!uuid_matches)
+      continue;
+
+    if (session && session->reclaiming_from) {
+      // already holding the reclaiming session; this must be its target
+      ceph_assert(session->reclaiming_from == it.second);
+      continue;
+    }
+    if (session) {
+      // the earlier match must be the one this session is reclaiming from
+      ceph_assert(it.second->reclaiming_from == session);
+    }
+    session = it.second;
+  }
+  return session;
+}
+
+/**
+ * Start reclaiming the session identified by the uuid in `m` on behalf of
+ * `session`.
+ *
+ * The request is dropped silently unless `session` is open or stale.
+ * Replies sent to the client:
+ *   -CEPHFS_EINVAL  the message has no uuid, or its flags are anything
+ *                   other than CEPH_RECLAIM_RESET
+ *   -CEPHFS_EPERM   a session with that uuid exists but has a different
+ *                   auth_name than `session`
+ * Otherwise `session->reclaiming_from` is pointed at the target (if one
+ * was found) and finish_reclaim_session() completes the operation and
+ * sends the reply.
+ */
+void Server::reclaim_session(Session *session, const cref_t<MClientReclaim> &m)
+{
+  if (!session->is_open() && !session->is_stale()) {
+    dout(10) << "session not open, dropping this req" << dendl;
+    return;
+  }
+
+  auto reply = make_message<MClientReclaimReply>(0);
+  if (m->get_uuid().empty()) {
+    dout(10) << __func__ << " invalid message (no uuid)" << dendl;
+    reply->set_result(-CEPHFS_EINVAL);
+    mds->send_message_client(reply, session);
+    return;
+  }
+
+  unsigned flags = m->get_flags();
+  if (flags != CEPH_RECLAIM_RESET) { // currently only support reset
+    dout(10) << __func__ << " unsupported flags" << dendl;
+    reply->set_result(-CEPHFS_EINVAL);
+    mds->send_message_client(reply, session);
+    return;
+  }
+
+  Session* target = find_session_by_uuid(m->get_uuid());
+  if (target) {
+    if (session->info.auth_name != target->info.auth_name) {
+      dout(10) << __func__ << " session auth_name " << session->info.auth_name
+               << " != target auth_name " << target->info.auth_name << dendl;
+      reply->set_result(-CEPHFS_EPERM);
+      mds->send_message_client(reply, session);
+      // Stop here: without this return the denied client would still be
+      // recorded as reclaiming `target`, the target would be killed or
+      // evicted below, and the same reply would be sent a second time.
+      return;
+    }
+
+    ceph_assert(!target->reclaiming_from);
+    ceph_assert(!session->reclaiming_from);
+    session->reclaiming_from = target;
+    reply->set_addrs(entity_addrvec_t(target->info.inst.addr));
+  }
+
+  if (flags & CEPH_RECLAIM_RESET) {
+    finish_reclaim_session(session, reply);
+  } else ceph_assert(0); /* no other flags are handled at this time */
+}
+
+/**
+ * Complete a reclaim started by reclaim_session().
+ *
+ * With no pending target, `reply` (if any) is sent straight away.
+ * Otherwise the target session is killed, or evicted when it is not yet
+ * blocklisted and mds_session_blocklist_on_evict is set.  `reply` (if any)
+ * is sent from a completion context, stamped with the current OSD map
+ * epoch, provided the requesting session can still be looked up by id.
+ */
+void Server::finish_reclaim_session(Session *session, const ref_t<MClientReclaimReply> &reply)
+{
+  Session *target = session->reclaiming_from;
+  if (!target) {
+    if (reply) {
+      mds->send_message_client(reply, session);
+    }
+    return;
+  }
+  session->reclaiming_from = nullptr;
+
+  Context *send_reply = nullptr;
+  if (reply) {
+    // Capture the client id, not the pointer, and look the session up
+    // again when the context runs.
+    int64_t session_id = session->get_client().v;
+    send_reply = new LambdaContext([this, session_id, reply](int r) {
+      ceph_assert(ceph_mutex_is_locked_by_me(mds->mds_lock));
+      Session *requester = mds->sessionmap.get_session(entity_name_t::CLIENT(session_id));
+      if (requester) {
+        auto epoch = mds->objecter->with_osdmap([](const OSDMap &map){ return map.get_epoch(); });
+        reply->set_epoch(epoch);
+        mds->send_message_client(reply, requester);
+      }
+    });
+  }
+
+  const bool blocklisted = mds->objecter->with_osdmap([target](const OSDMap &map) {
+    return map.is_blocklisted(target->info.inst.addr);
+  });
+
+  const bool evict_target =
+    !blocklisted && g_conf()->mds_session_blocklist_on_evict;
+  if (evict_target) {
+    CachedStackStringStream css;
+    mds->evict_client(target->get_client().v, false, true, *css, send_reply);
+  } else {
+    kill_session(target, send_reply);
+  }
+}
+
+/**
+ * Handle an MClientReclaim message.
+ *
+ * Messages without a session, or from a session lacking MAY_READ on this
+ * file system name, are dropped.  Before the MDS reaches clientreplay the
+ * message is retried after replay.  FLAG_FINISH on its own finishes a
+ * reclaim; FLAG_FINISH combined with other flags is answered with
+ * -CEPHFS_EINVAL; any other message starts a reclaim.
+ */
+void Server::handle_client_reclaim(const cref_t<MClientReclaim> &m)
+{
+  Session *session = mds->get_session(m);
+  uint32_t flags = m->get_flags();
+  dout(3) << __func__ << " " << *m << " from " << m->get_source() << dendl;
+  ceph_assert(m->is_a_client()); // should _not_ come from an mds!
+
+  if (!session) {
+    dout(0) << " ignoring sessionless msg " << *m << dendl;
+    return;
+  }
+
+  std::string_view fs_name = mds->mdsmap->get_fs_name();
+  const bool fs_allowed =
+    fs_name.empty() || session->fs_name_capable(fs_name, MAY_READ);
+  if (!fs_allowed) {
+    dout(0) << " dropping message not allowed for this fs_name: " << *m << dendl;
+    return;
+  }
+
+  if (mds->get_state() < MDSMap::STATE_CLIENTREPLAY) {
+    mds->wait_for_replay(new C_MDS_RetryMessage(mds, m));
+    return;
+  }
+
+  if (!(flags & MClientReclaim::FLAG_FINISH)) {
+    reclaim_session(session, m);
+    return;
+  }
+
+  // FLAG_FINISH must be the only flag set.
+  if (flags ^ MClientReclaim::FLAG_FINISH) {
+    dout(0) << __func__ << " client specified FLAG_FINISH with other flags."
+      " Other flags:" << flags << dendl;
+    auto reply = make_message<MClientReclaimReply>(0);
+    reply->set_result(-CEPHFS_EINVAL);
+    mds->send_message_client(reply, session);
+    return;
+  }
+  finish_reclaim_session(session);
+}
+
+/**
+ * Handle an MClientSession message from a client.
+ *
+ * Messages without a session, or from a session lacking MAY_READ on this
+ * file system name, are answered with CEPH_SESSION_REJECT.  RENEWCAPS is
+ * always processed; CLOSE waits for the MDS to be active; every other op
+ * waits for the MDS to reach clientreplay.
+ *
+ * Ops handled:
+ *  - REQUEST_OPEN: validates the client (refuse flag, blocklist, required
+ *    features, claimed "root", duplicate "uuid") and journals an ESession
+ *    that moves the session to STATE_OPENING.
+ *  - REQUEST_RENEWCAPS: touches the session, resumes a stale one, and
+ *    acknowledges with RENEWCAPS.
+ *  - REQUEST_CLOSE: journals the close when the push seq matches.
+ *  - FLUSHMSG_ACK: completes waiters for the flush seq.
+ *  - REQUEST_FLUSH_MDLOG: flushes the MDS log when active.
+ *  - anything else: rejects the message and evicts the client.
+ */
+void Server::handle_client_session(const cref_t<MClientSession> &m)
+{
+  version_t pv;
+  Session *session = mds->get_session(m);
+
+  dout(3) << "handle_client_session " << *m << " from " << m->get_source() << dendl;
+  ceph_assert(m->is_a_client()); // should _not_ come from an mds!
+
+  if (!session) {
+    dout(0) << " ignoring sessionless msg " << *m << dendl;
+    auto reply = make_message<MClientSession>(CEPH_SESSION_REJECT);
+    reply->metadata["error_string"] = "sessionless";
+    mds->send_message(reply, m->get_connection());
+    return;
+  }
+
+  std::string_view fs_name = mds->mdsmap->get_fs_name();
+  if (!fs_name.empty() && !session->fs_name_capable(fs_name, MAY_READ)) {
+    dout(0) << " dropping message not allowed for this fs_name: " << *m << dendl;
+    auto reply = make_message<MClientSession>(CEPH_SESSION_REJECT);
+    reply->metadata["error_string"] = "client doesn't have caps for FS \"" +
+                                      std::string(fs_name) + "\"";
+    mds->send_message(std::move(reply), m->get_connection());
+    return;
+  }
+
+  if (m->get_op() == CEPH_SESSION_REQUEST_RENEWCAPS) {
+    // always handle renewcaps (state >= MDSMap::STATE_RECONNECT)
+  } else if (m->get_op() == CEPH_SESSION_REQUEST_CLOSE) {
+    // close requests need to be handled when mds is active
+    if (mds->get_state() < MDSMap::STATE_ACTIVE) {
+      mds->wait_for_active(new C_MDS_RetryMessage(mds, m));
+      return;
+    }
+  } else {
+    if (mds->get_state() < MDSMap::STATE_CLIENTREPLAY) {
+      mds->wait_for_replay(new C_MDS_RetryMessage(mds, m));
+      return;
+    }
+  }
+
+  if (logger)
+    logger->inc(l_mdss_handle_client_session);
+
+  uint64_t sseq = 0;
+  switch (m->get_op()) {
+  case CEPH_SESSION_REQUEST_OPEN:
+    if (mds->mdsmap->test_flag(CEPH_MDSMAP_REFUSE_CLIENT_SESSION)) {
+      // Note the leading space in the second literal: without it the two
+      // adjacent literals run together as "via`ceph fs set ...".
+      dout(0) << "new sessions are not permitted, enable again via"
+                 " `ceph fs set <fs_name> refuse_client_session false`" << dendl;
+      auto reply = make_message<MClientSession>(CEPH_SESSION_REJECT);
+      reply->metadata["error_string"] = "new sessions are not permitted,"
+                                        " enable again via `ceph fs set"
+                                        " <fs_name> refuse_client_session false`";
+      mds->send_message(reply, m->get_connection());
+      return;
+    }
+    if (session->is_opening() ||
+        session->is_open() ||
+        session->is_stale() ||
+        session->is_killing() ||
+        terminating_sessions) {
+      // Clients that understand session-state notifications are told the
+      // session is already open; the request itself is dropped either way.
+      if (m->supported_features.test(CEPHFS_FEATURE_NOTIFY_SESSION_STATE)) {
+        if (session->is_open() && !mds->is_stopping()) {
+          dout(10) << "currently already opened" << dendl;
+
+          auto reply = make_message<MClientSession>(CEPH_SESSION_OPEN,
+                                                    session->get_push_seq());
+          if (session->info.has_feature(CEPHFS_FEATURE_MIMIC))
+            reply->supported_features = supported_features;
+          mds->send_message_client(reply, session);
+          if (mdcache->is_readonly()) {
+            auto force_ro = make_message<MClientSession>(CEPH_SESSION_FORCE_RO);
+            mds->send_message_client(force_ro, session);
+          }
+        }
+      }
+      dout(10) << "currently " << session->get_state_name()
+               << ", dropping this req" << dendl;
+      return;
+    }
+    ceph_assert(session->is_closed() || session->is_closing());
+
+    if (mds->is_stopping()) {
+      dout(10) << "mds is stopping, dropping open req" << dendl;
+      return;
+    }
+
+    {
+      auto& addr = session->info.inst.addr;
+      session->set_client_metadata(client_metadata_t(m->metadata, m->supported_features, m->metric_spec));
+      auto& client_metadata = session->info.client_metadata;
+
+      // Emits one summary line per open attempt.  Captures `m` and
+      // `session` by value because it is also invoked from the journal
+      // completion context created further down.
+      auto log_session_status = [this, m, session](std::string_view status, std::string_view err) {
+        auto now = ceph_clock_now();
+        auto throttle_elapsed = m->get_recv_complete_stamp() - m->get_throttle_stamp();
+        auto elapsed = now - m->get_recv_stamp();
+        CachedStackStringStream css;
+        *css << "New client session:"
+             << " addr=\"" << session->info.inst.addr << "\""
+             << ",elapsed=" << elapsed
+             << ",throttled=" << throttle_elapsed
+             << ",status=\"" << status << "\"";
+        if (!err.empty()) {
+          *css << ",error=\"" << err << "\"";
+        }
+        const auto& metadata = session->info.client_metadata;
+        if (auto it = metadata.find("root"); it != metadata.end()) {
+          *css << ",root=\"" << it->second << "\"";
+        }
+        dout(2) << css->strv() << dendl;
+      };
+
+      // Sends CEPH_SESSION_REJECT (with the error text when the client has
+      // the MIMIC feature) and logs the rejection.
+      auto send_reject_message = [this, &session, &log_session_status](std::string_view err_str, unsigned flags=0) {
+        auto reject = make_message<MClientSession>(CEPH_SESSION_REJECT, 0, flags);
+        if (session->info.has_feature(CEPHFS_FEATURE_MIMIC))
+          reject->metadata["error_string"] = err_str;
+        mds->send_message_client(reject, session);
+        log_session_status("REJECTED", err_str);
+      };
+
+      bool blocklisted = mds->objecter->with_osdmap(
+        [&addr](const OSDMap &osd_map) -> bool {
+          return osd_map.is_blocklisted(addr);
+        });
+
+      if (blocklisted) {
+        dout(10) << "rejecting blocklisted client " << addr << dendl;
+        // This goes on the wire and the "blacklisted" substring is
+        // depended upon by the kernel client for detecting whether it
+        // has been blocklisted.  If mounted with recover_session=clean
+        // (since 5.4), it tries to automatically recover itself from
+        // blocklisting.
+        unsigned flags = 0;
+        flags |= MClientSession::SESSION_BLOCKLISTED;
+        send_reject_message("blocklisted (blacklisted)", flags);
+        session->clear();
+        break;
+      }
+
+      if (client_metadata.features.empty())
+        infer_supported_features(session, client_metadata);
+
+      dout(20) << __func__ << " CEPH_SESSION_REQUEST_OPEN metadata entries:" << dendl;
+      dout(20) << " features: '" << client_metadata.features << "'" << dendl;
+      dout(20) << " metric specification: [" << client_metadata.metric_spec << "]" << dendl;
+      for (const auto& p : client_metadata) {
+        dout(20) << " " << p.first << ": " << p.second << dendl;
+      }
+
+      feature_bitset_t missing_features = required_client_features;
+      missing_features -= client_metadata.features;
+      if (!missing_features.empty()) {
+        CachedStackStringStream css;
+        *css << "missing required features '" << missing_features << "'";
+        send_reject_message(css->strv());
+        mds->clog->warn() << "client session (" << session->info.inst
+                          << ") lacks required features " << missing_features
+                          << "; client supports " << client_metadata.features;
+        session->clear();
+        break;
+      }
+
+      // Special case for the 'root' metadata path; validate that the claimed
+      // root is actually within the caps of the session
+      if (auto it = client_metadata.find("root"); it != client_metadata.end()) {
+        auto claimed_root = it->second;
+        CachedStackStringStream css;
+        bool denied = false;
+        // claimed_root has a leading "/" which we strip before passing
+        // into caps check
+        if (claimed_root.empty() || claimed_root[0] != '/') {
+          denied = true;
+          // NOTE(review): "invalue" looks like a typo for "invalid"; left
+          // unchanged because this text is sent to the client and to the
+          // cluster log.
+          *css << "invalue root '" << claimed_root << "'";
+        } else if (!session->auth_caps.path_capable(claimed_root.substr(1))) {
+          denied = true;
+          *css << "non-allowable root '" << claimed_root << "'";
+        }
+
+        if (denied) {
+          // Tell the client we're rejecting their open
+          send_reject_message(css->strv());
+          mds->clog->warn() << "client session with " << css->strv()
+                            << " denied (" << session->info.inst << ")";
+          session->clear();
+          break;
+        }
+      }
+
+      if (auto it = client_metadata.find("uuid"); it != client_metadata.end()) {
+        if (find_session_by_uuid(it->second)) {
+          send_reject_message("duplicated session uuid");
+          mds->clog->warn() << "client session with duplicated session uuid '"
+                            << it->second << "' denied (" << session->info.inst << ")";
+          session->clear();
+          break;
+        }
+      }
+
+      if (session->is_closed()) {
+        mds->sessionmap.add_session(session);
+      }
+
+      pv = mds->sessionmap.mark_projected(session);
+      sseq = mds->sessionmap.set_state(session, Session::STATE_OPENING);
+      mds->sessionmap.touch_session(session);
+      auto fin = new LambdaContext([log_session_status = std::move(log_session_status)](int r){
+        ceph_assert(r == 0);
+        log_session_status("ACCEPTED", "");
+      });
+      mdlog->start_submit_entry(new ESession(m->get_source_inst(), true, pv, client_metadata),
+                                new C_MDS_session_finish(this, session, sseq, true, pv, fin));
+      mdlog->flush();
+    }
+    break;
+
+  case CEPH_SESSION_REQUEST_RENEWCAPS:
+    if (session->is_open() || session->is_stale()) {
+      mds->sessionmap.touch_session(session);
+      if (session->is_stale()) {
+        mds->sessionmap.set_state(session, Session::STATE_OPEN);
+        mds->locker->resume_stale_caps(session);
+        mds->sessionmap.touch_session(session);
+      }
+      auto reply = make_message<MClientSession>(CEPH_SESSION_RENEWCAPS, m->get_seq());
+      mds->send_message_client(reply, session);
+    } else {
+      dout(10) << "ignoring renewcaps on non open|stale session (" << session->get_state_name() << ")" << dendl;
+    }
+    break;
+
+  case CEPH_SESSION_REQUEST_CLOSE:
+    {
+      if (session->is_closed() ||
+          session->is_closing() ||
+          session->is_killing()) {
+        dout(10) << "already closed|closing|killing, dropping this req" << dendl;
+        return;
+      }
+      if (session->is_importing()) {
+        dout(10) << "ignoring close req on importing session" << dendl;
+        return;
+      }
+      ceph_assert(session->is_open() ||
+                  session->is_stale() ||
+                  session->is_opening());
+      if (m->get_seq() < session->get_push_seq()) {
+        dout(10) << "old push seq " << m->get_seq() << " < " << session->get_push_seq()
+                 << ", dropping" << dendl;
+        return;
+      }
+      // We are getting a seq that is higher than expected.
+      // Handle the same as any other seqn error.
+      //
+      if (m->get_seq() != session->get_push_seq()) {
+        dout(0) << "old push seq " << m->get_seq() << " != " << session->get_push_seq()
+                << ", BUGGY!" << dendl;
+        mds->clog->warn() << "incorrect push seq " << m->get_seq() << " != "
+                          << session->get_push_seq() << ", dropping" << " from client : " << session->get_human_name();
+        return;
+      }
+      journal_close_session(session, Session::STATE_CLOSING, NULL);
+    }
+    break;
+
+  case CEPH_SESSION_FLUSHMSG_ACK:
+    finish_flush_session(session, m->get_seq());
+    break;
+
+  case CEPH_SESSION_REQUEST_FLUSH_MDLOG:
+    if (mds->is_active())
+      mdlog->flush();
+    break;
+
+  default:
+    {
+      // The reply gets its own name so that `m` keeps referring to the
+      // received message; the log line below reports the op that was not
+      // understood.
+      auto reply = make_message<MClientSession>(CEPH_SESSION_REJECT);
+      mds->send_message_client(reply, session);
+      derr << "Server received unknown message " << m->get_type()
+           << " (session op " << m->get_op() << ")"
+           << ", closing session and blocklisting the client " << session->get_client() << dendl;
+      CachedStackStringStream css;
+      mds->evict_client(session->get_client().v, false, true, *css, nullptr);
+    }
+  }
+}
+
+/**
+ * Ask one client to acknowledge a flush.
+ *
+ * Nothing happens unless the session is open, has a connection, and that
+ * connection has CEPH_FEATURE_EXPORT_PEER.  Otherwise a new sub-context of
+ * `gather` is registered as a flush waiter and a CEPH_SESSION_FLUSHMSG
+ * carrying the resulting seq is sent to the client.
+ */
+void Server::flush_session(Session *session, MDSGatherBuilder& gather) {
+  const bool can_flush =
+    session->is_open() &&
+    session->get_connection() &&
+    session->get_connection()->has_feature(CEPH_FEATURE_EXPORT_PEER);
+  if (!can_flush) {
+    return;
+  }
+
+  version_t seq = session->wait_for_flush(gather.new_sub());
+  auto flush_msg = make_message<MClientSession>(CEPH_SESSION_FLUSHMSG, seq);
+  mds->send_message_client(flush_msg, session);
+}
+
+/**
+ * Call flush_session() for every client in `client_set`.  Each client is
+ * asserted to have a session in the session map.
+ */
+void Server::flush_client_sessions(set<client_t>& client_set, MDSGatherBuilder& gather)
+{
+  for (const client_t& client : client_set) {
+    const auto client_name = entity_name_t::CLIENT(client.v);
+    Session *session = mds->sessionmap.get_session(client_name);
+    ceph_assert(session);
+    flush_session(session, gather);
+  }
+}
+
+/**
+ * Collect the contexts the session releases for flush sequence `seq` and
+ * queue them on the MDS.
+ */
+void Server::finish_flush_session(Session *session, version_t seq)
+{
+  MDSContext::vec waiters;
+  session->finish_flush(seq, waiters);
+  mds->queue_waiters(waiters);
+}
+
+void Server::_session_logged(Session *session, uint64_t state_seq, bool open, version_t pv,
+ const interval_set<inodeno_t>& inos_to_free, version_t piv,
+ const interval_set<inodeno_t>& inos_to_purge, LogSegment *ls)
+{ /*
+ * Apply a session open (open == true) or close (open == false) to the
+ * in-memory state; the "journaled state_seq" message below suggests this
+ * runs once the matching log entry is durable (confirm against the caller).
+ *
+ * On close, inos_to_purge / inos_to_free are first removed from the
+ * session's prealloc_inos (purged ones are also recorded on ls, freed ones
+ * are released in the inotable, whose version must then equal piv). The
+ * session is then marked dirty and, provided its state seq still equals
+ * state_seq, moved to OPEN or torn down (CLOSING / KILLING). pv is only
+ * logged here.
+ */
+ dout(10) << "_session_logged " << session->info.inst
+ << " state_seq " << state_seq
+ << " " << (open ? "open":"close") << " " << pv
+ << " inos_to_free " << inos_to_free << " inotablev " << piv
+ << " inos_to_purge " << inos_to_purge << dendl;
+
+ if (!open) {  // close: settle the inode numbers handed in by the caller
+ if (inos_to_purge.size()){
+ ceph_assert(ls);  // a log segment is mandatory when there is something to purge
+ session->info.prealloc_inos.subtract(inos_to_purge);
+ ls->purging_inodes.insert(inos_to_purge);
+ if (mds->is_clientreplay() || mds->is_active() || mds->is_stopping())  // in other states the inos are only recorded on ls
+ mdcache->purge_inodes(inos_to_purge, ls);
+ }
+
+ if (inos_to_free.size()) {
+ ceph_assert(piv);
+ ceph_assert(session->is_closing() || session->is_killing() ||
+ session->is_opening()); // re-open closing session
+ session->info.prealloc_inos.subtract(inos_to_free);
+ mds->inotable->apply_release_ids(inos_to_free);
+ ceph_assert(mds->inotable->get_version() == piv);
+ }
+ session->free_prealloc_inos = session->info.prealloc_inos;  // whatever is left is free again
+ session->delegated_inos.clear();
+ }
+
+ mds->sessionmap.mark_dirty(session);
+
+ // apply
+ if (session->get_state_seq() != state_seq) {
+ dout(10) << " journaled state_seq " << state_seq << " != current " << session->get_state_seq()
+ << ", noop" << dendl;
+ // close must have been canceled (by an import?), or any number of other things..
+ } else if (open) {
+ ceph_assert(session->is_opening());
+ mds->sessionmap.set_state(session, Session::STATE_OPEN);
+ mds->sessionmap.touch_session(session);
+ metrics_handler->add_session(session);
+ ceph_assert(session->get_connection());
+ auto reply = make_message<MClientSession>(CEPH_SESSION_OPEN);
+ if (session->info.has_feature(CEPHFS_FEATURE_MIMIC)) {  // feature/metric info is only sent to clients advertising MIMIC
+ reply->supported_features = supported_features;
+ reply->metric_spec = supported_metric_spec;
+ }
+ mds->send_message_client(reply, session);
+ if (mdcache->is_readonly()) {  // follow the OPEN with FORCE_RO when the cache is read-only
+ auto m = make_message<MClientSession>(CEPH_SESSION_FORCE_RO);
+ mds->send_message_client(m, session);
+ }
+ } else if (session->is_closing() ||
+ session->is_killing()) {
+ // kill any lingering capabilities, leases, requests
+ bool killing = session->is_killing();
+ while (!session->caps.empty()) {  // relies on remove_client_cap() taking the cap off session->caps
+ Capability *cap = session->caps.front();
+ CInode *in = cap->get_inode();
+ dout(20) << " killing capability " << ccap_string(cap->issued()) << " on " << *in << dendl;
+ mds->locker->remove_client_cap(in, cap, killing);
+ }
+ while (!session->leases.empty()) {  // likewise relies on remove_client_lease() shrinking session->leases
+ ClientLease *r = session->leases.front();
+ CDentry *dn = static_cast<CDentry*>(r->parent);
+ dout(20) << " killing client lease of " << *dn << dendl;
+ dn->remove_client_lease(r, mds->locker);
+ }
+ if (client_reconnect_gather.erase(session->info.get_client())) {  // erase() is non-zero only if the client was still awaited
+ dout(20) << " removing client from reconnect set" << dendl;
+ if (client_reconnect_gather.empty()) {
+ dout(7) << " client " << session->info.inst << " was last reconnect, finishing" << dendl;
+ reconnect_gather_finish();
+ }
+ }
+ if (client_reclaim_gather.erase(session->info.get_client())) {
+ dout(20) << " removing client from reclaim set" << dendl;
+ if (client_reclaim_gather.empty()) {
+ dout(7) << " client " << session->info.inst << " was last reclaimed, finishing" << dendl;
+ mds->maybe_clientreplay_done();
+ }
+ }
+
+ if (session->is_closing()) {
+ // mark con disposable. if there is a fault, we will get a
+ // reset and clean it up. if the client hasn't received the
+ // CLOSE message yet, they will reconnect and get an
+ // ms_handle_remote_reset() and realize they had in fact closed.
+ // do this *before* sending the message to avoid a possible
+ // race.
+ if (session->get_connection()) {
+ // Conditional because terminate_sessions will indiscriminately
+ // put sessions in CLOSING whether they ever had a conn or not.
+ session->get_connection()->mark_disposable();
+ }
+
+ // reset session
+ mds->send_message_client(make_message<MClientSession>(CEPH_SESSION_CLOSE), session);
+ mds->sessionmap.set_state(session, Session::STATE_CLOSED);
+ session->clear();
+ metrics_handler->remove_session(session);
+ mds->sessionmap.remove_session(session);
+ } else if (session->is_killing()) {
+ // destroy session, close connection
+ if (session->get_connection()) {  // the state is only switched to CLOSED when a connection exists
+ session->get_connection()->mark_down();
+ mds->sessionmap.set_state(session, Session::STATE_CLOSED);
+ session->set_connection(nullptr);
+ }
+ metrics_handler->remove_session(session);
+ mds->sessionmap.remove_session(session);
+ } else {
+ ceph_abort();  // the state changed while tearing down; treated as fatal
+ }
+ } else {
+ ceph_abort();  // seq matches but the session is neither opening nor closing/killing
+ }
+}
+
+/**
+ * Force-open sessions that did not originate from an actual client
+ * connection, for example sessions
+ *  - inferred from journal replay,
+ *  - learned from other MDSs during rejoin,
+ *  - learned from other MDSs during dir/caps migration, or
+ *  - learned from other MDSs during a cross-MDS rename.
+ *
+ * Clients that the OSDMap reports as blocklisted are removed from both cm
+ * and cmm. For each remaining client, smap receives the session together
+ * with the state seq returned when it was switched to OPENING (0 when the
+ * session was already open/opening/stale), and the session's importing
+ * counter is incremented.
+ *
+ * Returns the sessionmap version projected for the last session handled
+ * (or the current projected version when cm ends up empty).
+ */
+version_t Server::prepare_force_open_sessions(map<client_t,entity_inst_t>& cm,
+                                              map<client_t,client_metadata_t>& cmm,
+                                              map<client_t, pair<Session*,uint64_t> >& smap)
+{
+  version_t pv = mds->sessionmap.get_projected();
+
+  dout(10) << "prepare_force_open_sessions " << pv
+           << " on " << cm.size() << " clients"
+           << dendl;
+
+  // weed out blocklisted clients before touching the sessionmap
+  mds->objecter->with_osdmap(
+    [this, &cm, &cmm](const OSDMap &osd_map) {
+      auto it = cm.begin();
+      while (it != cm.end()) {
+        if (!osd_map.is_blocklisted(it->second.addr)) {
+          ++it;
+          continue;
+        }
+        dout(10) << " ignoring blocklisted client." << it->first
+                 << " (" << it->second.addr << ")" << dendl;
+        cmm.erase(it->first);
+        it = cm.erase(it);
+      }
+    });
+
+  for (auto& entry : cm) {
+    const client_t& client = entry.first;
+    Session *session = mds->sessionmap.get_or_add_session(entry.second);
+    pv = mds->sessionmap.mark_projected(session);
+
+    uint64_t sseq = 0;
+    const bool needs_open = session->is_closed() ||
+                            session->is_closing() ||
+                            session->is_killing();
+    if (needs_open) {
+      sseq = mds->sessionmap.set_state(session, Session::STATE_OPENING);
+      auto md = cmm.find(client);
+      if (md != cmm.end())
+        session->info.client_metadata.merge(md->second);
+    } else {
+      ceph_assert(session->is_open() ||
+                  session->is_opening() ||
+                  session->is_stale());
+    }
+
+    smap[client] = make_pair(session, sseq);
+    session->inc_importing();
+  }
+  return pv;
+}
+
+// Second half of a forced session open: for every (session, sseq) pair in
+// smap, move the session to OPEN if sseq is non-zero and still matches the
+// session's state seq, then optionally drop the importing count and mark
+// the session dirty.
+void Server::finish_force_open_sessions(const map<client_t,pair<Session*,uint64_t> >& smap,
+                                        bool dec_import)
+{
+  /*
+   * FIXME: need to carefully consider the race conditions between a
+   * client trying to close a session and an MDS doing an import
+   * trying to force open a session...
+   */
+  dout(10) << "finish_force_open_sessions on " << smap.size() << " clients,"
+           << " initial v " << mds->sessionmap.get_version() << dendl;
+
+  for (const auto& entry : smap) {
+    Session *session = entry.second.first;
+    const uint64_t sseq = entry.second.second;
+
+    if (sseq == 0) {
+      // nothing was changed at prepare time
+      dout(10) << "force_open_sessions skipping already-open " << session->info.inst << dendl;
+      ceph_assert(session->is_open() || session->is_stale());
+    } else if (session->get_state_seq() != sseq) {
+      dout(10) << "force_open_sessions skipping changed " << session->info.inst << dendl;
+    } else {
+      dout(10) << "force_open_sessions opened " << session->info.inst << dendl;
+      mds->sessionmap.set_state(session, Session::STATE_OPEN);
+      mds->sessionmap.touch_session(session);
+      metrics_handler->add_session(session);
+
+      auto open_msg = make_message<MClientSession>(CEPH_SESSION_OPEN);
+      if (session->info.has_feature(CEPHFS_FEATURE_MIMIC)) {
+        open_msg->supported_features = supported_features;
+        open_msg->metric_spec = supported_metric_spec;
+      }
+      mds->send_message_client(open_msg, session);
+
+      if (mdcache->is_readonly()) {
+        auto ro_msg = make_message<MClientSession>(CEPH_SESSION_FORCE_RO);
+        mds->send_message_client(ro_msg, session);
+      }
+    }
+
+    if (dec_import)
+      session->dec_importing();
+
+    mds->sessionmap.mark_dirty(session);
+  }
+
+  dout(10) << __func__ << ": final v " << mds->sessionmap.get_version() << dendl;
+}
+
+// Context that resets Server::terminating_sessions when it is finished.
+class C_MDS_TerminatedSessions : public ServerContext {
+public:
+  explicit C_MDS_TerminatedSessions(Server *s) : ServerContext(s) {}
+
+private:
+  void finish(int r) override {
+    server->terminating_sessions = false;
+  }
+};
+
+// Journal a close (STATE_CLOSING) for every client session that is not
+// already closing, killing or closed. terminating_sessions stays true until
+// the log reports safe.
+void Server::terminate_sessions()
+{
+  dout(5) << "terminating all sessions..." << dendl;
+
+  terminating_sessions = true;
+
+  // kill them off.  clients will retry etc.
+  set<Session*> all_sessions;
+  mds->sessionmap.get_client_session_set(all_sessions);
+  for (Session *session : all_sessions) {
+    const bool already_going = session->is_closing() ||
+                               session->is_killing() ||
+                               session->is_closed();
+    if (!already_going)
+      journal_close_session(session, Session::STATE_CLOSING, nullptr);
+  }
+
+  mdlog->wait_for_safe(new C_MDS_TerminatedSessions(this));
+}
+
+
+void Server::find_idle_sessions()
+{ /*
+ * Look for client sessions that have not renewed caps recently.
+ *
+ * Pass 1 walks the OPEN sessions and either marks them STALE or queues them
+ * for eviction; pass 2 walks the STALE sessions and queues those past the
+ * autoclose threshold. Everything queued is then evicted (with blocklisting
+ * if mds_session_blocklist_on_timeout is set) or killed. Nothing is done if
+ * the MDS itself cleared its laggy state less than `cutoff` seconds ago.
+ */
+ auto now = clock::now();
+ auto last_cleared_laggy = mds->last_cleared_laggy();
+
+ dout(10) << "find_idle_sessions. last cleared laggy state " << last_cleared_laggy << "s ago" << dendl;
+
+ // timeout/stale
+ // (caps go stale, lease die)
+ double queue_max_age = mds->get_dispatch_queue_max_age(ceph_clock_now());
+ double cutoff = queue_max_age + mds->mdsmap->get_session_timeout();  // the dispatch queue age is added on top of the session timeout
+
+ // don't kick clients if we've been laggy
+ if (last_cleared_laggy < cutoff) {
+ dout(10) << " last cleared laggy " << last_cleared_laggy << "s ago (< cutoff " << cutoff
+ << "), not marking any client stale" << dendl;
+ return;
+ }
+
+ std::vector<Session*> to_evict;
+
+ bool defer_session_stale = g_conf().get_val<bool>("mds_defer_session_stale");
+ const auto sessions_p1 = mds->sessionmap.by_state.find(Session::STATE_OPEN);
+ if (sessions_p1 != mds->sessionmap.by_state.end() && !sessions_p1->second->empty()) {
+ std::vector<Session*> new_stale;  // collected first; state changes are applied after the walk
+
+ for (auto session : *(sessions_p1->second)) {
+ auto last_cap_renew_span = std::chrono::duration<double>(now - session->last_cap_renew).count();
+ if (last_cap_renew_span < cutoff) {
+ dout(20) << "laggiest active session is " << session->info.inst
+ << " and renewed caps recently (" << last_cap_renew_span << "s ago)" << dendl;
+ break;  // stops the walk; presumably the list is ordered oldest renewal first - TODO confirm
+ }
+
+ if (session->last_seen > session->last_cap_renew) {  // seen more recently than the last renew: measure from last_seen instead
+ last_cap_renew_span = std::chrono::duration<double>(now - session->last_seen).count();
+ if (last_cap_renew_span < cutoff) {
+ dout(20) << "laggiest active session is " << session->info.inst
+ << " and renewed caps recently (" << last_cap_renew_span << "s ago)" << dendl;
+ continue;  // only skips this session (unlike the break above)
+ }
+ }
+
+ if (last_cap_renew_span >= mds->mdsmap->get_session_autoclose()) {
+ dout(20) << "evicting session " << session->info.inst << " since autoclose "
+ "has arrived" << dendl;
+ // evict session without marking it stale
+ to_evict.push_back(session);
+ continue;
+ }
+
+ if (defer_session_stale &&
+ !session->is_any_flush_waiter() &&
+ !mds->locker->is_revoking_any_caps_from(session->get_client())) {  // deferral needs: option on, no flush waiter, no cap revoke in progress
+ dout(20) << "deferring marking session " << session->info.inst << " stale "
+ "since it holds no caps" << dendl;
+ continue;
+ }
+
+ auto it = session->info.client_metadata.find("timeout");
+ if (it != session->info.client_metadata.end()) {
+ unsigned timeout = strtoul(it->second.c_str(), nullptr, 0);  // base 0: decimal, hex (0x) or octal (0) accepted
+ if (timeout == 0) {
+ dout(10) << "skipping session " << session->info.inst
+ << ", infinite timeout specified" << dendl;
+ continue;
+ }
+ double cutoff = queue_max_age + timeout;  // shadows the outer cutoff with the client-supplied timeout
+ if (last_cap_renew_span < cutoff) {
+ dout(10) << "skipping session " << session->info.inst
+ << ", timeout (" << timeout << ") specified"
+ << " and renewed caps recently (" << last_cap_renew_span << "s ago)" << dendl;
+ continue;
+ }
+
+ // do not go through stale, evict it directly.
+ to_evict.push_back(session);
+ } else {
+ dout(10) << "new stale session " << session->info.inst
+ << " last renewed caps " << last_cap_renew_span << "s ago" << dendl;
+ new_stale.push_back(session);
+ }
+ }
+
+ for (auto session : new_stale) {
+ mds->sessionmap.set_state(session, Session::STATE_STALE);
+ if (mds->locker->revoke_stale_caps(session)) {
+ mds->locker->remove_stale_leases(session);
+ finish_flush_session(session, session->get_push_seq());
+ auto m = make_message<MClientSession>(CEPH_SESSION_STALE);
+ mds->send_message_client(m, session);
+ } else {
+ to_evict.push_back(session);  // revoke_stale_caps() returned false: evict instead of notifying
+ }
+ }
+ }
+
+ // autoclose
+ cutoff = queue_max_age + mds->mdsmap->get_session_autoclose();
+
+ // Collect a list of sessions exceeding the autoclose threshold
+ const auto sessions_p2 = mds->sessionmap.by_state.find(Session::STATE_STALE);
+ if (sessions_p2 != mds->sessionmap.by_state.end() && !sessions_p2->second->empty()) {
+ for (auto session : *(sessions_p2->second)) {
+ ceph_assert(session->is_stale());
+ auto last_cap_renew_span = std::chrono::duration<double>(now - session->last_cap_renew).count();
+ if (last_cap_renew_span < cutoff) {
+ dout(20) << "oldest stale session is " << session->info.inst
+ << " and recently renewed caps " << last_cap_renew_span << "s ago" << dendl;
+ break;
+ }
+ to_evict.push_back(session);
+ }
+ }
+
+ for (auto session: to_evict) {
+ if (session->is_importing()) {
+ dout(10) << "skipping session " << session->info.inst << ", it's being imported" << dendl;
+ continue;
+ }
+
+ auto last_cap_renew_span = std::chrono::duration<double>(now - session->last_cap_renew).count();  // always measured from last_cap_renew here, not last_seen
+ mds->clog->warn() << "evicting unresponsive client " << *session
+ << ", after " << last_cap_renew_span << " seconds";
+ dout(10) << "autoclosing stale session " << session->info.inst
+ << " last renewed caps " << last_cap_renew_span << "s ago" << dendl;
+
+ if (g_conf()->mds_session_blocklist_on_timeout) {
+ CachedStackStringStream css;  // receives evict_client()'s text output, which is not used here
+ mds->evict_client(session->get_client().v, false, true, *css, nullptr);
+ } else {
+ kill_session(session, NULL);
+ }
+ }
+}
+
+// Evict every client the Locker reports as late in answering a cap revoke
+// (later than cap_revoke_eviction_timeout). Does nothing when the timeout
+// is zero.
+void Server::evict_cap_revoke_non_responders() {
+  if (!cap_revoke_eviction_timeout) {
+    return;
+  }
+
+  auto&& late_clients = mds->locker->get_late_revoking_clients(cap_revoke_eviction_timeout);
+
+  for (auto const &client : late_clients) {
+    mds->clog->warn() << "client id " << client << " has not responded to"
+                      << " cap revoke by MDS for over " << cap_revoke_eviction_timeout
+                      << " seconds, evicting";
+    dout(1) << __func__ << ": evicting cap revoke non-responder client id "
+            << client << dendl;
+
+    CachedStackStringStream css;
+    const bool evicted = mds->evict_client(client.v, false,
+                                           g_conf()->mds_session_blocklist_on_evict,
+                                           *css, nullptr);
+    if (!evicted || !logger) {
+      continue;
+    }
+    logger->inc(l_mdss_cap_revoke_eviction);
+  }
+}
+
+// Refresh the cached copy of every option named in `changed` that this
+// class keeps a local value for.
+void Server::handle_conf_change(const std::set<std::string>& changed) {
+  const auto was_changed = [&changed](const char *opt) {
+    return changed.count(opt) > 0;
+  };
+
+  if (was_changed("mds_forward_all_requests_to_auth")) {
+    forward_all_requests_to_auth = g_conf().get_val<bool>("mds_forward_all_requests_to_auth");
+  }
+
+  if (was_changed("mds_cap_revoke_eviction_timeout")) {
+    cap_revoke_eviction_timeout = g_conf().get_val<double>("mds_cap_revoke_eviction_timeout");
+    dout(20) << __func__ << " cap revoke eviction timeout changed to "
+             << cap_revoke_eviction_timeout << dendl;
+  }
+
+  if (was_changed("mds_recall_max_decay_rate")) {
+    recall_throttle = DecayCounter(g_conf().get_val<double>("mds_recall_max_decay_rate"));
+  }
+
+  if (was_changed("mds_max_snaps_per_dir")) {
+    max_snaps_per_dir = g_conf().get_val<uint64_t>("mds_max_snaps_per_dir");
+    dout(20) << __func__ << " max snapshots per directory changed to "
+             << max_snaps_per_dir << dendl;
+  }
+
+  if (was_changed("mds_client_delegate_inos_pct")) {
+    delegate_inos_pct = g_conf().get_val<uint64_t>("mds_client_delegate_inos_pct");
+  }
+
+  if (was_changed("mds_max_caps_per_client")) {
+    max_caps_per_client = g_conf().get_val<uint64_t>("mds_max_caps_per_client");
+  }
+
+  if (was_changed("mds_session_cap_acquisition_throttle")) {
+    cap_acquisition_throttle = g_conf().get_val<uint64_t>("mds_session_cap_acquisition_throttle");
+  }
+
+  if (was_changed("mds_session_max_caps_throttle_ratio")) {
+    max_caps_throttle_ratio = g_conf().get_val<double>("mds_session_max_caps_throttle_ratio");
+  }
+
+  if (was_changed("mds_cap_acquisition_throttle_retry_request_timeout")) {
+    caps_throttle_retry_request_timeout = g_conf().get_val<double>("mds_cap_acquisition_throttle_retry_request_timeout");
+  }
+
+  if (was_changed("mds_alternate_name_max")) {
+    alternate_name_max = g_conf().get_val<Option::size_t>("mds_alternate_name_max");
+  }
+
+  if (was_changed("mds_fscrypt_last_block_max_size")) {
+    fscrypt_last_block_max_size = g_conf().get_val<Option::size_t>("mds_fscrypt_last_block_max_size");
+  }
+
+  if (was_changed("mds_dir_max_entries")) {
+    dir_max_entries = g_conf().get_val<uint64_t>("mds_dir_max_entries");
+    dout(20) << __func__ << " max entries per directory changed to "
+             << dir_max_entries << dendl;
+  }
+
+  if (was_changed("mds_bal_fragment_size_max")) {
+    bal_fragment_size_max = g_conf().get_val<int64_t>("mds_bal_fragment_size_max");
+    dout(20) << __func__ << " max fragment size changed to "
+             << bal_fragment_size_max << dendl;
+  }
+
+  if (was_changed("mds_inject_rename_corrupt_dentry_first")) {
+    inject_rename_corrupt_dentry_first = g_conf().get_val<double>("mds_inject_rename_corrupt_dentry_first");
+  }
+}
+
+/*
+ * XXX bump in the interface here, not using an MDSContext here
+ * because all the callers right now happen to use a SaferCond
+ *
+ * Kill a client session. The caller must hold mds_lock (asserted).
+ *
+ *  - opening/open/stale and not being imported: the close is journaled with
+ *    STATE_KILLING and on_safe is handed to journal_close_session().
+ *  - already closing/killing: nothing new is journaled; on_safe (if any) is
+ *    completed once the MDS log reports safe.
+ *  - closed, or currently being imported: on_safe (if any) is completed
+ *    immediately with 0.
+ */
+void Server::kill_session(Session *session, Context *on_safe)
+{
+  ceph_assert(ceph_mutex_is_locked_by_me(mds->mds_lock));
+
+  if ((session->is_opening() ||
+       session->is_open() ||
+       session->is_stale()) &&
+      !session->is_importing()) {
+    // log the client instance rather than the raw Session pointer so the
+    // message identifies the client, like the other messages in this file
+    dout(10) << "kill_session " << session->info.inst << dendl;
+    journal_close_session(session, Session::STATE_KILLING, on_safe);
+  } else {
+    dout(10) << "kill_session importing or already closing/killing "
+             << session->info.inst << dendl;
+    if (session->is_closing() ||
+        session->is_killing()) {
+      if (on_safe)
+        mdlog->wait_for_safe(new MDSInternalContextWrapper(mds, on_safe));
+    } else {
+      ceph_assert(session->is_closed() ||
+                  session->is_importing());
+      if (on_safe)
+        on_safe->complete(0);
+    }
+  }
+}
+
+// Kill every client session whose address is blocklisted in the current
+// OSDMap and return how many sessions were selected.
+size_t Server::apply_blocklist()
+{
+  std::vector<Session*> victims;
+  const auto& sessions = mds->sessionmap.get_sessions();
+
+  mds->objecter->with_osdmap(
+    [&sessions, &victims](const OSDMap& o) {
+      for (const auto& p : sessions) {
+        // Do not apply OSDMap blocklist to MDS daemons, we find out
+        // about their death via MDSMap.
+        if (p.first.is_client() &&
+            o.is_blocklisted(p.second->info.inst.addr)) {
+          victims.push_back(p.second);
+        }
+      }
+    });
+
+  for (Session *victim : victims) {
+    kill_session(victim, nullptr);
+  }
+
+  dout(10) << "apply_blocklist: killed " << victims.size() << dendl;
+
+  return victims.size();
+}
+
+// Move the session to `state`, journal an ESession close entry that releases
+// its pending/free preallocated inos, kill the session's outstanding
+// requests and finish its flush waiters. on_safe is forwarded to the
+// journal completion.
+void Server::journal_close_session(Session *session, int state, Context *on_safe)
+{
+  dout(10) << __func__ << " : "
+           << session->info.inst
+           << " pending_prealloc_inos " << session->pending_prealloc_inos
+           << " free_prealloc_inos " << session->free_prealloc_inos
+           << " delegated_inos " << session->delegated_inos << dendl;
+
+  const uint64_t sseq = mds->sessionmap.set_state(session, state);
+  const version_t pv = mds->sessionmap.mark_projected(session);
+
+  // release alloc and pending-alloc inos for this session
+  // and wipe out session state, in case the session close aborts for some reason
+  interval_set<inodeno_t> inos_to_free;
+  inos_to_free.insert(session->pending_prealloc_inos);
+  inos_to_free.insert(session->free_prealloc_inos);
+
+  // piv stays 0 unless something is actually released
+  version_t piv = 0;
+  if (inos_to_free.size() > 0) {
+    mds->inotable->project_release_ids(inos_to_free);
+    piv = mds->inotable->get_projected_version();
+  }
+
+  auto le = new ESession(session->info.inst, false, pv, inos_to_free, piv, session->delegated_inos);
+  auto fin = new C_MDS_session_finish(this, session, sseq, false, pv, inos_to_free, piv,
+                                      session->delegated_inos, mdlog->get_current_segment(), on_safe);
+  mdlog->start_submit_entry(le, fin);
+  mdlog->flush();
+
+  // clean up requests, too
+  while (!session->requests.empty()) {
+    auto mdr = MDRequestRef(*session->requests.begin());
+    mdcache->request_kill(mdr);
+  }
+
+  finish_flush_session(session, session->get_push_seq());
+}
+
+// Start waiting for client reconnects: every currently open client session
+// is added to client_reconnect_gather and flagged as reconnecting.
+// reconnect_done_ is stored and, if there is nothing to wait for, the gather
+// is finished right away.
+void Server::reconnect_clients(MDSContext *reconnect_done_)
+{
+  reconnect_done = reconnect_done_;
+
+  const auto now = clock::now();
+  set<Session*> all_sessions;
+  mds->sessionmap.get_client_session_set(all_sessions);
+  for (Session *s : all_sessions) {
+    if (!s->is_open())
+      continue;
+    client_reconnect_gather.insert(s->get_client());
+    s->set_reconnecting(true);
+    s->last_cap_renew = now;
+  }
+
+  if (client_reconnect_gather.empty()) {
+    dout(7) << "reconnect_clients -- no sessions, doing nothing." << dendl;
+    reconnect_gather_finish();
+    return;
+  }
+
+  // clients will get the mdsmap and discover we're reconnecting via the monitor.
+
+  reconnect_start = now;
+  dout(1) << "reconnect_clients -- " << client_reconnect_gather.size() << " sessions" << dendl;
+  mds->sessionmap.dump();
+}
+
+void Server::handle_client_reconnect(const cref_t<MClientReconnect> &m)
+{ /*
+ * Handle an MClientReconnect message.
+ *
+ * The message is rejected (REJECT/CLOSE reply) if it has no session, if the
+ * file system refuses client sessions, if the session is not open, if the
+ * MDS is not (or no longer) in reconnect, or if the client fails the
+ * read-only / required-feature checks. Otherwise the snaprealms and caps
+ * carried by the message are recorded with the MDCache and, once the final
+ * piece has arrived (!has_more()), the client is removed from
+ * client_reconnect_gather.
+ */
+ dout(7) << "handle_client_reconnect " << m->get_source()
+ << (m->has_more() ? " (more)" : "") << dendl;
+ client_t from = m->get_source().num();
+ Session *session = mds->get_session(m);
+ if (!session) {
+ dout(0) << " ignoring sessionless msg " << *m << dendl;
+ auto reply = make_message<MClientSession>(CEPH_SESSION_REJECT);
+ reply->metadata["error_string"] = "sessionless";
+ mds->send_message(reply, m->get_connection());  // no session, so reply on the message's connection
+ return;
+ }
+
+ if(mds->mdsmap->test_flag(CEPH_MDSMAP_REFUSE_CLIENT_SESSION)) {
+ mds->clog->warn() << "client could not reconnect as"
+ " file system flag refuse_client_session is set";
+ dout(0) << "client cannot reconnect when file system flag"
+ " refuse_client_session is set" << dendl;
+ auto reply = make_message<MClientSession>(CEPH_SESSION_CLOSE);
+ reply->metadata["error_string"] = "client cannot reconnect when file system flag"
+ " refuse_client_session is set";
+ mds->send_message(reply, m->get_connection());
+ return;
+ }
+
+ if (!session->is_open()) {
+ dout(0) << " ignoring msg from not-open session" << *m << dendl;
+ auto reply = make_message<MClientSession>(CEPH_SESSION_CLOSE);
+ mds->send_message(reply, m->get_connection());
+ return;
+ }
+
+ bool reconnect_all_deny = g_conf().get_val<bool>("mds_deny_all_reconnect");
+
+ if (!mds->is_reconnect() && mds->get_want_state() == CEPH_MDS_STATE_RECONNECT) {
+ dout(10) << " we're almost in reconnect state (mdsmap delivery race?); waiting" << dendl;
+ mds->wait_for_reconnect(new C_MDS_RetryMessage(mds, m));  // the message is retried later via C_MDS_RetryMessage
+ return;
+ }
+
+ auto delay = std::chrono::duration<double>(clock::now() - reconnect_start).count();
+ dout(10) << " reconnect_start " << reconnect_start << " delay " << delay << dendl;
+
+ bool deny = false;
+ if (reconnect_all_deny || !mds->is_reconnect() || mds->get_want_state() != CEPH_MDS_STATE_RECONNECT || reconnect_evicting) {
+ // XXX maybe in the future we can do better than this?
+ if (reconnect_all_deny) {
+ dout(1) << "mds_deny_all_reconnect was set to speed up reboot phase, ignoring reconnect, sending close" << dendl;
+ } else {
+ dout(1) << "no longer in reconnect state, ignoring reconnect, sending close" << dendl;
+ }
+ mds->clog->info() << "denied reconnect attempt (mds is "
+ << ceph_mds_state_name(mds->get_state())
+ << ") from " << m->get_source_inst()
+ << " after " << delay << " (allowed interval " << g_conf()->mds_reconnect_timeout << ")";
+ deny = true;
+ } else {
+ std::string error_str;  // stays empty when the client passes all checks
+ if (!session->is_open()) {  // NOTE(review): is_open() was already checked above with an early return
+ error_str = "session is closed";
+ } else if (mdcache->is_readonly()) {
+ error_str = "mds is readonly";
+ } else {
+ if (session->info.client_metadata.features.empty())  // no feature bits recorded: try to infer them first
+ infer_supported_features(session, session->info.client_metadata);
+
+ feature_bitset_t missing_features = required_client_features;
+ missing_features -= session->info.client_metadata.features;  // required features the client does not have
+ if (!missing_features.empty()) {
+ CachedStackStringStream css;
+ *css << "missing required features '" << missing_features << "'";
+ error_str = css->strv();
+ }
+ }
+
+ if (!error_str.empty()) {
+ deny = true;
+ dout(1) << " " << error_str << ", ignoring reconnect, sending close" << dendl;
+ mds->clog->info() << "denied reconnect attempt from "
+ << m->get_source_inst() << " (" << error_str << ")";
+ }
+ }
+
+ if (deny) {
+ auto r = make_message<MClientSession>(CEPH_SESSION_CLOSE);
+ mds->send_message_client(r, session);
+ if (session->is_open()) {
+ client_reconnect_denied.insert(session->get_client());  // reconnect_tick() compares this set with client_reconnect_gather
+ }
+ return;
+ }
+
+ if (!m->has_more()) {  // the OPEN reply is sent only for the final piece of the reconnect
+ metrics_handler->add_session(session);
+ // notify client of success with an OPEN
+ auto reply = make_message<MClientSession>(CEPH_SESSION_OPEN);
+ if (session->info.has_feature(CEPHFS_FEATURE_MIMIC)) {
+ reply->supported_features = supported_features;
+ reply->metric_spec = supported_metric_spec;
+ }
+ mds->send_message_client(reply, session);
+ mds->clog->debug() << "reconnect by " << session->info.inst << " after " << delay;
+ }
+
+ session->last_cap_renew = clock::now();
+
+ // snaprealms
+ for (const auto &r : m->realms) {
+ CInode *in = mdcache->get_inode(inodeno_t(r.realm.ino));
+ if (in && in->state_test(CInode::STATE_PURGING))
+ continue;
+ if (in) {
+ if (in->snaprealm) {
+ dout(15) << "open snaprealm (w inode) on " << *in << dendl;
+ } else {
+ // this can happen if we are non-auth or we rollback snaprealm
+ dout(15) << "open snaprealm (null snaprealm) on " << *in << dendl;
+ }
+ mdcache->add_reconnected_snaprealm(from, inodeno_t(r.realm.ino), snapid_t(r.realm.seq));
+ } else {
+ dout(15) << "open snaprealm (w/o inode) on " << inodeno_t(r.realm.ino)
+ << " seq " << r.realm.seq << dendl;
+ mdcache->add_reconnected_snaprealm(from, inodeno_t(r.realm.ino), snapid_t(r.realm.seq));  // same call as the branch above; only the log text differs
+ }
+ }
+
+ // caps
+ for (const auto &p : m->caps) {
+ // make sure our last_cap_id is MAX over all issued caps
+ if (p.second.capinfo.cap_id > mdcache->last_cap_id)
+ mdcache->last_cap_id = p.second.capinfo.cap_id;
+
+ CInode *in = mdcache->get_inode(p.first);
+ if (in && in->state_test(CInode::STATE_PURGING))
+ continue;
+ if (in && in->is_auth()) {
+ // we recovered it, and it's ours. take note.
+ dout(15) << "open cap realm " << inodeno_t(p.second.capinfo.snaprealm)
+ << " on " << *in << dendl;
+ in->reconnect_cap(from, p.second, session);
+ mdcache->add_reconnected_cap(from, p.first, p.second);
+ recover_filelocks(in, p.second.flockbl, m->get_orig_source().num());
+ continue;
+ }
+
+ if (in && !in->is_auth()) {
+ // not mine.
+ dout(10) << "non-auth " << *in << ", will pass off to authority" << dendl;
+ // add to cap export list.
+ mdcache->rejoin_export_caps(p.first, from, p.second,
+ in->authority().first, true);
+ } else {
+ // don't know if the inode is mine
+ dout(10) << "missing ino " << p.first << ", will load later" << dendl;
+ mdcache->rejoin_recovered_caps(p.first, from, p.second, MDS_RANK_NONE);
+ }
+ }
+
+ reconnect_last_seen = clock::now();  // reconnect_tick() uses this to extend the reconnect interval
+
+ if (!m->has_more()) {
+ mdcache->rejoin_recovered_client(session->get_client(), session->info.inst);
+
+ // remove from gather set
+ client_reconnect_gather.erase(from);
+ session->set_reconnecting(false);
+ if (client_reconnect_gather.empty())
+ reconnect_gather_finish();
+ }
+}
+
+// Guess a client's CephFS feature bits from its metadata ("ceph_version" /
+// "kernel_version") and from the features of its connection. When a level
+// can be inferred, client_metadata.features is set to all bits up to and
+// including that level; otherwise it is left untouched.
+void Server::infer_supported_features(Session *session, client_metadata_t& client_metadata)
+{
+  int level = -1;  // -1 means "nothing inferred yet"
+
+  const auto ceph_ver = client_metadata.find("ceph_version");
+  if (ceph_ver != client_metadata.end()) {
+    // user space client
+    const bool is_v12 = ceph_ver->second.compare(0, 16, "ceph version 12.") == 0;
+    if (is_v12) {
+      level = CEPHFS_FEATURE_LUMINOUS;
+    } else if (session->get_connection()->has_feature(CEPH_FEATURE_FS_CHANGE_ATTR)) {
+      level = CEPHFS_FEATURE_KRAKEN;
+    }
+  } else if (client_metadata.find("kernel_version") != client_metadata.end()) {
+    // kernel client
+    if (session->get_connection()->has_feature(CEPH_FEATURE_NEW_OSDOP_ENCODING)) {
+      level = CEPHFS_FEATURE_LUMINOUS;
+    }
+  }
+
+  // fall back to the file-layout feature when nothing matched above
+  if (level == -1 &&
+      session->get_connection()->has_feature(CEPH_FEATURE_FS_FILE_LAYOUT_V2)) {
+    level = CEPHFS_FEATURE_JEWEL;
+  }
+
+  if (level < 0) {
+    return;
+  }
+
+  // set bits 0..level
+  unsigned long value = (1UL << (level + 1)) - 1;
+  client_metadata.features = feature_bitset_t(value);
+  dout(10) << __func__ << " got '" << client_metadata.features << "'" << dendl;
+}
+
+// Re-read the required client features from the MDSMap and, once the MDS
+// state is at or beyond STATE_RECONNECT, evict every non-blocklisted client
+// session that lacks any of them.
+void Server::update_required_client_features()
+{
+  required_client_features = mds->mdsmap->get_required_client_features();
+  dout(7) << "required_client_features: " << required_client_features << dendl;
+
+  if (!(mds->get_state() >= MDSMap::STATE_RECONNECT)) {
+    return;
+  }
+
+  set<Session*> all_sessions;
+  mds->sessionmap.get_client_session_set(all_sessions);
+  for (Session *session : all_sessions) {
+    feature_bitset_t missing = required_client_features;
+    missing -= session->info.client_metadata.features;
+    if (missing.empty()) {
+      continue;
+    }
+
+    // already blocklisted clients are left alone
+    const bool blocklisted = mds->objecter->with_osdmap(
+      [session](const OSDMap &osd_map) -> bool {
+        return osd_map.is_blocklisted(session->info.inst.addr);
+      });
+    if (blocklisted) {
+      continue;
+    }
+
+    mds->clog->warn() << "evicting session " << *session << ", missing required features '"
+                      << missing << "'";
+    CachedStackStringStream css;
+    mds->evict_client(session->get_client().v, false,
+                      g_conf()->mds_session_blocklist_on_evict, *css);
+  }
+}
+
+// Finish the reconnect gather: complete reconnect_done now if the snap
+// client is synced, otherwise hand it to the snap client to be completed
+// after sync. reconnect_done is reset afterwards in both cases.
+void Server::reconnect_gather_finish()
+{
+  dout(7) << "reconnect_gather_finish.  failed on " << failed_reconnects << " clients" << dendl;
+  ceph_assert(reconnect_done);
+
+  if (mds->snapclient->is_synced()) {
+    reconnect_done->complete(0);
+  } else {
+    // make sure snaptable cache is populated.  snaprealms will be
+    // extensively used in rejoin stage.
+    dout(7) << " snaptable cache isn't synced, delaying state transition" << dendl;
+    mds->snapclient->wait_for_sync(reconnect_done);
+  }
+  reconnect_done = nullptr;
+}
+
+void Server::reconnect_tick()
+{ /*
+ * Periodic check of the reconnect phase.
+ *
+ * Once mds_reconnect_timeout has elapsed since reconnect_start (or at once
+ * if every awaited client has been denied under mds_deny_all_reconnect),
+ * and no awaited client was seen within the last half timeout, the clients
+ * still in client_reconnect_gather are given up on: sessions carrying a
+ * "timeout" metadata entry are moved to client_reclaim_gather, the others
+ * are evicted or killed. reconnect_gather_finish() then runs immediately,
+ * or after the evictions complete.
+ */
+ bool reject_all_reconnect = false;
+ if (reconnect_evicting) {
+ dout(7) << "reconnect_tick: waiting for evictions" << dendl;
+ return;
+ }
+
+ /*
+ * Set mds_deny_all_reconnect to reject all reconnect requests and thus
+ * load less metadata in the rejoin phase. This shortens the reboot time.
+ * Moreover, loading less metadata increases the chance that a standby with
+ * less memory can fail over.
+
+ * Why not shorten the reconnect period?
+ * Clients may send unsafe or retry requests, which haven't been
+ * completed before old mds stop, to new mds. These requests may
+ * need to be processed during new mds's clientreplay phase,
+ * see: #https://github.com/ceph/ceph/pull/29059.
+ */
+ bool reconnect_all_deny = g_conf().get_val<bool>("mds_deny_all_reconnect");
+ if (client_reconnect_gather.empty())
+ return;
+
+ if (reconnect_all_deny && (client_reconnect_gather == client_reconnect_denied))  // every awaited client has already been denied
+ reject_all_reconnect = true;
+
+ auto now = clock::now();
+ auto elapse1 = std::chrono::duration<double>(now - reconnect_start).count();  // seconds since reconnect_clients() set reconnect_start
+ if (elapse1 < g_conf()->mds_reconnect_timeout && !reject_all_reconnect)
+ return;
+
+ vector<Session*> remaining_sessions;
+ remaining_sessions.reserve(client_reconnect_gather.size());
+ for (auto c : client_reconnect_gather) {
+ Session *session = mds->sessionmap.get_session(entity_name_t::CLIENT(c.v));
+ ceph_assert(session);
+ remaining_sessions.push_back(session);
+ // client re-sends cap flush messages before the reconnect message
+ if (session->last_seen > reconnect_last_seen)
+ reconnect_last_seen = session->last_seen;
+ }
+
+ auto elapse2 = std::chrono::duration<double>(now - reconnect_last_seen).count();  // seconds since any awaited client was last heard from
+ if (elapse2 < g_conf()->mds_reconnect_timeout / 2 && !reject_all_reconnect) {
+ dout(7) << "reconnect_tick: last seen " << elapse2
+ << " seconds ago, extending reconnect interval" << dendl;
+ return;
+ }
+
+ dout(7) << "reconnect timed out, " << remaining_sessions.size()
+ << " clients have not reconnected in time" << dendl;
+
+ // If we're doing blocklist evictions, use this to wait for them before
+ // proceeding to reconnect_gather_finish
+ MDSGatherBuilder gather(g_ceph_context);
+
+ for (auto session : remaining_sessions) {
+ // Keep sessions that have specified timeout. These sessions will prevent
+ // mds from going to active. MDS goes to active after they all have been
+ // killed or reclaimed.
+ if (session->info.client_metadata.find("timeout") !=
+ session->info.client_metadata.end()) {
+ dout(1) << "reconnect keeps " << session->info.inst
+ << ", need to be reclaimed" << dendl;
+ client_reclaim_gather.insert(session->get_client());
+ continue;
+ }
+
+ dout(1) << "reconnect gives up on " << session->info.inst << dendl;
+
+ mds->clog->warn() << "evicting unresponsive client " << *session
+ << ", after waiting " << elapse1
+ << " seconds during MDS startup";
+
+ // make _session_logged() purge orphan objects of lost async/unsafe requests
+ session->delegated_inos.swap(session->free_prealloc_inos);
+
+ if (g_conf()->mds_session_blocklist_on_timeout) {
+ CachedStackStringStream css;
+ mds->evict_client(session->get_client().v, false, true, *css,
+ gather.new_sub());  // each eviction adds a sub-context to the gather
+ } else {
+ kill_session(session, NULL);
+ }
+
+ failed_reconnects++;
+ }
+ client_reconnect_gather.clear();
+ client_reconnect_denied.clear();
+
+ if (gather.has_subs()) {
+ dout(1) << "reconnect will complete once clients are evicted" << dendl;
+ gather.set_finisher(new MDSInternalContextWrapper(mds, new LambdaContext(
+ [this](int r){reconnect_gather_finish();})));
+ gather.activate();
+ reconnect_evicting = true;  // makes later ticks (and reconnect attempts) bail out early
+ } else {
+ reconnect_gather_finish();
+ }
+}
+
+// Decode the file locks carried in `locks` and re-register them on inode
+// `in` as held by `client`. The buffer holds a count followed by that many
+// locks for the fcntl lock state, then the same again for the flock lock
+// state. An empty buffer is a no-op.
+void Server::recover_filelocks(CInode *in, bufferlist locks, int64_t client)
+{
+  if (locks.length() == 0)
+    return;
+
+  ceph_filelock lock;
+  auto p = locks.cbegin();
+
+  // fcntl locks
+  int fcntl_count;
+  decode(fcntl_count, p);
+  for (int left = fcntl_count; left > 0; --left) {
+    decode(lock, p);
+    lock.client = client;
+    auto *state = in->get_fcntl_lock_state();
+    state->held_locks.insert(pair<uint64_t, ceph_filelock>(lock.start, lock));
+    ++state->client_held_lock_counts[client];
+  }
+
+  // flock locks
+  int flock_count;
+  decode(flock_count, p);
+  for (int left = flock_count; left > 0; --left) {
+    decode(lock, p);
+    lock.client = client;
+    auto *state = in->get_flock_lock_state();
+    state->held_locks.insert(pair<uint64_t, ceph_filelock>(lock.start, lock));
+    ++state->client_held_lock_counts[client];
+  }
+}
+
+/**
+ * Call this when the MDCache is oversized, to send requests to the clients
+ * to trim some caps, and consequently unpin some inodes in the MDCache so
+ * that it can trim too.
+ *
+ * Sessions are selected according to @p flags (TRIM selects every session,
+ * ENFORCE_MAX those above mds_max_caps_per_client, ENFORCE_LIVENESS those
+ * whose cache liveness is low relative to their cap count) and are visited
+ * from the largest cap count to the smallest.
+ *
+ * @param gather  optional; when non-null, flush_session() is invoked with it
+ *                for every session a recall message is sent to.
+ * @param flags   RecallFlags bitmask (STEADY, ENFORCE_MAX, ENFORCE_LIVENESS,
+ *                TRIM).
+ * @return {throttled, caps_recalled}: whether at least one session was
+ *         skipped because a recall throttle was hit, and the accumulated
+ *         result of Session::notify_recall_sent().
+ */
+std::pair<bool, uint64_t> Server::recall_client_state(MDSGatherBuilder* gather, RecallFlags flags)
+{
+  const auto now = clock::now();
+  const bool steady = !!(flags&RecallFlags::STEADY);
+  const bool enforce_max = !!(flags&RecallFlags::ENFORCE_MAX);
+  const bool enforce_liveness = !!(flags&RecallFlags::ENFORCE_LIVENESS);
+  const bool trim = !!(flags&RecallFlags::TRIM);
+
+  const auto max_caps_per_client = g_conf().get_val<uint64_t>("mds_max_caps_per_client");
+  const auto min_caps_per_client = g_conf().get_val<uint64_t>("mds_min_caps_per_client");
+  const auto recall_global_max_decay_threshold = g_conf().get_val<Option::size_t>("mds_recall_global_max_decay_threshold");
+  const auto recall_max_caps = g_conf().get_val<Option::size_t>("mds_recall_max_caps");
+  const auto recall_max_decay_threshold = g_conf().get_val<Option::size_t>("mds_recall_max_decay_threshold");
+  const auto cache_liveness_magnitude = g_conf().get_val<Option::size_t>("mds_session_cache_liveness_magnitude");
+
+  dout(7) << __func__ << ":"
+          << " min=" << min_caps_per_client
+          << " max=" << max_caps_per_client
+          << " total=" << Capability::count()
+          << " flags=" << flags
+          << dendl;
+
+  /* trim caps of sessions with the most caps first */
+  std::multimap<uint64_t, Session*> caps_session;
+  auto f = [&caps_session, enforce_max, enforce_liveness, trim, max_caps_per_client, cache_liveness_magnitude](auto& s) {
+    auto num_caps = s->caps.size();
+    auto cache_liveness = s->get_session_cache_liveness();
+    if (trim || (enforce_max && num_caps > max_caps_per_client) || (enforce_liveness && cache_liveness < (num_caps>>cache_liveness_magnitude))) {
+      caps_session.emplace(std::piecewise_construct, std::forward_as_tuple(num_caps), std::forward_as_tuple(s));
+    }
+  };
+  mds->sessionmap.get_client_sessions(std::move(f));
+
+  std::pair<bool, uint64_t> result = {false, 0};
+  auto& [throttled, caps_recalled] = result;
+  last_recall_state = now;
+  // the multimap is keyed by cap count, so walking it in reverse visits the
+  // sessions holding the most caps first
+  for (const auto& [num_caps, session] : boost::adaptors::reverse(caps_session)) {
+    if (!session->is_open() ||
+        !session->get_connection() ||
+        !session->info.inst.name.is_client())
+      continue;
+
+    dout(10) << __func__ << ":"
+             << " session " << session->info.inst
+             << " caps " << num_caps
+             << ", leases " << session->leases.size()
+             << dendl;
+
+    // target cap count: never below min_caps_per_client, and at most
+    // recall_max_caps below the current count
+    uint64_t newlim;
+    if (num_caps < recall_max_caps || (num_caps-recall_max_caps) < min_caps_per_client) {
+      newlim = min_caps_per_client;
+    } else {
+      newlim = num_caps-recall_max_caps;
+    }
+    if (num_caps > newlim) {
+      /* now limit the number of caps we recall at a time to prevent overloading ourselves */
+      uint64_t recall = std::min<uint64_t>(recall_max_caps, num_caps-newlim);
+      newlim = num_caps-recall;
+      const uint64_t session_recall_throttle = session->get_recall_caps_throttle();
+      const uint64_t session_recall_throttle2o = session->get_recall_caps_throttle2o();
+      const uint64_t global_recall_throttle = recall_throttle.get();
+      if (session_recall_throttle+recall > recall_max_decay_threshold) {
+        dout(15) << "  session recall threshold (" << recall_max_decay_threshold << ") hit at " << session_recall_throttle << "; skipping!" << dendl;
+        throttled = true;
+        continue;
+      } else if (session_recall_throttle2o+recall > recall_max_caps*2) {
+        dout(15) << "  session recall 2nd-order threshold (" << 2*recall_max_caps << ") hit at " << session_recall_throttle2o << "; skipping!" << dendl;
+        throttled = true;
+        continue;
+      } else if (global_recall_throttle+recall > recall_global_max_decay_threshold) {
+        dout(15) << "  global recall threshold (" << recall_global_max_decay_threshold << ") hit at " << global_recall_throttle << "; skipping!" << dendl;
+        throttled = true;
+        // the global throttle applies to every remaining session as well
+        break;
+      }
+
+      // now check if we've recalled caps recently and the client is unlikely to satisfy a new recall
+      if (steady) {
+        const auto session_recall = session->get_recall_caps();
+        const auto session_release = session->get_release_caps();
+        if (2*session_release < session_recall && 2*session_recall > recall_max_decay_threshold) {
+          /* The session has been unable to keep up with the number of caps
+           * recalled (by half); additionally, to prevent marking sessions
+           * we've just begun to recall from, the session_recall counter
+           * (decayed count of caps recently recalled) is **greater** than the
+           * session threshold for the session's cap recall throttle.
+           */
+          // the message must describe the '>' comparison actually tested above
+          dout(15) << "  2*session_release < session_recall"
+                      " (2*" << session_release << " < " << session_recall << ") &&"
+                      " 2*session_recall > recall_max_decay_threshold"
+                      " (2*" << session_recall << " > " << recall_max_decay_threshold << ")"
+                      " Skipping because we are unlikely to get more released." << dendl;
+          continue;
+        } else if (recall < recall_max_caps && 2*recall < session_recall) {
+          /* The number of caps recalled is less than the number we *could*
+           * recall (so there isn't much left to recall?) and the number of
+           * caps is less than the current recall_caps counter (decayed count
+           * of caps recently recalled).
+           */
+          dout(15) << "  2*recall < session_recall "
+                      " (2*" << recall << " < " << session_recall << ") &&"
+                      " recall < recall_max_caps (" << recall << " < " << recall_max_caps << ");"
+                      " Skipping because we are unlikely to get more released." << dendl;
+          continue;
+        }
+      }
+
+      dout(7) << "  recalling " << recall << " caps; session_recall_throttle = " << session_recall_throttle << "; global_recall_throttle = " << global_recall_throttle << dendl;
+
+      auto m = make_message<MClientSession>(CEPH_SESSION_RECALL_STATE);
+      m->head.max_caps = newlim;
+      mds->send_message_client(m, session);
+      if (gather) {
+        flush_session(session, *gather);
+      }
+      caps_recalled += session->notify_recall_sent(newlim);
+      recall_throttle.hit(recall);
+    }
+  }
+
+  dout(7) << "recalled" << (throttled ? " (throttled)" : "") << " " << caps_recalled << " client caps." << dendl;
+
+  return result;
+}
+
+// Send CEPH_SESSION_FORCE_RO to every client session that is currently open
+// or stale.
+void Server::force_clients_readonly()
+{
+  dout(10) << "force_clients_readonly" << dendl;
+
+  set<Session*> client_sessions;
+  mds->sessionmap.get_client_session_set(client_sessions);
+
+  for (Session *s : client_sessions) {
+    // only real clients whose session is open or stale are notified
+    if (s->info.inst.name.is_client() &&
+        (s->is_open() || s->is_stale())) {
+      mds->send_message_client(make_message<MClientSession>(CEPH_SESSION_FORCE_RO), s);
+    }
+  }
+}
+
+/*******
+ * some generic stuff for finishing off requests
+ */
+// Record the trace inode/dentry on the request, attempt an early reply, and
+// submit the log event `le` with completion `fin`.
+//
+// After submission: a request queued for replay triggers queue_one_replay();
+// otherwise rdlocks are dropped if an early reply went out, or the log is
+// flushed if it did not.
+void Server::journal_and_reply(MDRequestRef& mdr, CInode *in, CDentry *dn, LogEvent *le, MDSLogContextBase *fin)
+{
+  dout(10) << "journal_and_reply tracei " << in << " tracedn " << dn << dendl;
+  ceph_assert(!mdr->has_completed);
+
+  // remember (and pin) the trace items for the eventual reply
+  mdr->tracei = in;
+  if (in) {
+    mdr->pin(in);
+  }
+  mdr->tracedn = dn;
+  if (dn) {
+    mdr->pin(dn);
+  }
+
+  early_reply(mdr, in, dn);
+
+  mdr->committing = true;
+  submit_mdlog_entry(le, fin, mdr, __func__);
+
+  const bool queued_for_replay =
+    mdr->client_request && mdr->client_request->is_queued_for_replay();
+  if (queued_for_replay) {
+    const bool queued_next = mds->queue_one_replay();
+    dout(10) << (queued_next ? " queued next replay op" : " journaled last replay op") << dendl;
+    return;
+  }
+
+  if (mdr->did_early_reply) {
+    mds->locker->drop_rdlocks_for_early_reply(mdr.get());
+  } else {
+    mdlog->flush();
+  }
+}
+
+// Submit `le` (with completion `fin`) to the MDLog.  When `mdr` is set, the
+// request is first tagged with a "submit entry: <event>" marker.
+void Server::submit_mdlog_entry(LogEvent *le, MDSLogContextBase *fin, MDRequestRef& mdr,
+                                std::string_view event)
+{
+  if (mdr) {
+    std::string marker = "submit entry: ";
+    marker.append(event);
+    mdr->mark_event(marker);
+  }
+  mdlog->submit_entry(le, fin);
+}
+
+/*
+ * Answer the request with result code r and clean up mdr.
+ *
+ * Client requests are answered with an MClientReply (or, for a batch head,
+ * through the batch op).  Internal requests complete their finisher and are
+ * then finished in the cache; an internal op without a finisher aborts.
+ */
+void Server::respond_to_request(MDRequestRef& mdr, int r)
+{
+  if (mdr->client_request) {
+    if (!mdr->is_batch_head()) {
+      reply_client_request(mdr, make_message<MClientReply>(*mdr->client_request, r));
+    } else {
+      dout(20) << __func__ << " batch head " << *mdr << dendl;
+      mdr->release_batch_op()->respond(r);
+    }
+    return;
+  }
+
+  // neither a client request nor an internal op: nothing to do
+  if (mdr->internal_op <= -1)
+    return;
+
+  dout(10) << "respond_to_request on internal request " << mdr << dendl;
+  if (!mdr->internal_op_finish) {
+    ceph_abort_msg("trying to respond to internal op without finisher");
+  }
+  mdr->internal_op_finish->complete(r);
+  mdcache->request_finish(mdr);
+}
+
+// Account the latency `lat` of a client request against the per-op latency
+// counter matching req->get_op().  Ops without a dedicated counter are logged
+// and otherwise ignored.
+void Server::perf_gather_op_latency(const cref_t<MClientRequest> &req, utime_t lat)
+{
+  int counter = l_mdss_first;
+  switch (req->get_op()) {
+  // lookups
+  case CEPH_MDS_OP_LOOKUPHASH:       counter = l_mdss_req_lookuphash_latency;   break;
+  case CEPH_MDS_OP_LOOKUPINO:        counter = l_mdss_req_lookupino_latency;    break;
+  case CEPH_MDS_OP_LOOKUPPARENT:     counter = l_mdss_req_lookupparent_latency; break;
+  case CEPH_MDS_OP_LOOKUPNAME:       counter = l_mdss_req_lookupname_latency;   break;
+  case CEPH_MDS_OP_LOOKUP:           counter = l_mdss_req_lookup_latency;       break;
+  case CEPH_MDS_OP_LOOKUPSNAP:       counter = l_mdss_req_lookupsnap_latency;   break;
+  // attributes, layouts, xattrs
+  case CEPH_MDS_OP_GETATTR:          counter = l_mdss_req_getattr_latency;      break;
+  case CEPH_MDS_OP_SETATTR:          counter = l_mdss_req_setattr_latency;      break;
+  case CEPH_MDS_OP_SETLAYOUT:        counter = l_mdss_req_setlayout_latency;    break;
+  case CEPH_MDS_OP_SETDIRLAYOUT:     counter = l_mdss_req_setdirlayout_latency; break;
+  case CEPH_MDS_OP_GETVXATTR:        counter = l_mdss_req_getvxattr_latency;    break;
+  case CEPH_MDS_OP_SETXATTR:         counter = l_mdss_req_setxattr_latency;     break;
+  case CEPH_MDS_OP_RMXATTR:          counter = l_mdss_req_rmxattr_latency;      break;
+  // directory listing and file locks
+  case CEPH_MDS_OP_READDIR:          counter = l_mdss_req_readdir_latency;      break;
+  case CEPH_MDS_OP_SETFILELOCK:      counter = l_mdss_req_setfilelock_latency;  break;
+  case CEPH_MDS_OP_GETFILELOCK:      counter = l_mdss_req_getfilelock_latency;  break;
+  // open/create and namespace changes
+  case CEPH_MDS_OP_CREATE:           counter = l_mdss_req_create_latency;       break;
+  case CEPH_MDS_OP_OPEN:             counter = l_mdss_req_open_latency;         break;
+  case CEPH_MDS_OP_MKNOD:            counter = l_mdss_req_mknod_latency;        break;
+  case CEPH_MDS_OP_LINK:             counter = l_mdss_req_link_latency;         break;
+  case CEPH_MDS_OP_UNLINK:           counter = l_mdss_req_unlink_latency;       break;
+  case CEPH_MDS_OP_RMDIR:            counter = l_mdss_req_rmdir_latency;        break;
+  case CEPH_MDS_OP_RENAME:           counter = l_mdss_req_rename_latency;       break;
+  case CEPH_MDS_OP_MKDIR:            counter = l_mdss_req_mkdir_latency;        break;
+  case CEPH_MDS_OP_SYMLINK:          counter = l_mdss_req_symlink_latency;      break;
+  // snapshots
+  case CEPH_MDS_OP_LSSNAP:           counter = l_mdss_req_lssnap_latency;       break;
+  case CEPH_MDS_OP_MKSNAP:           counter = l_mdss_req_mksnap_latency;       break;
+  case CEPH_MDS_OP_RMSNAP:           counter = l_mdss_req_rmsnap_latency;       break;
+  case CEPH_MDS_OP_RENAMESNAP:       counter = l_mdss_req_renamesnap_latency;   break;
+  case CEPH_MDS_OP_READDIR_SNAPDIFF: counter = l_mdss_req_snapdiff_latency;     break;
+  default:
+    // no counter for this op: note it and record nothing
+    dout(1) << ": unknown client op" << dendl;
+    return;
+  }
+  logger->tinc(counter, lat);
+}
+
+// Send an "unsafe" reply to the client before the request has been journaled,
+// when that is permitted.
+//
+// No early reply is sent when mds_early_reply is disabled, when the request
+// is flagged no_early_reply, has journaled peers, allocated an ino, came from
+// an MDS, or is a replayed op.  On success mdr->did_early_reply is set and
+// the reply counters/latencies are updated.
+void Server::early_reply(MDRequestRef& mdr, CInode *tracei, CDentry *tracedn)
+{
+  if (!g_conf()->mds_early_reply) {
+    return;
+  }
+
+  if (mdr->no_early_reply) {
+    dout(10) << "early_reply - flag no_early_reply is set, not allowed." << dendl;
+    return;
+  }
+
+  if (mdr->has_more() && mdr->more()->has_journaled_peers) {
+    dout(10) << "early_reply - there are journaled peers, not allowed." << dendl;
+    return;
+  }
+
+  if (mdr->alloc_ino) {
+    dout(10) << "early_reply - allocated ino, not allowed" << dendl;
+    return;
+  }
+
+  const cref_t<MClientRequest> &req = mdr->client_request;
+  entity_inst_t client_inst = req->get_source_inst();
+  if (client_inst.name.is_mds()) {
+    return;
+  }
+
+  if (req->is_replay()) {
+    dout(10) << " no early reply on replay op" << dendl;
+    return;
+  }
+
+  auto reply = make_message<MClientReply>(*req, 0);
+  reply->set_unsafe();
+
+  // mark xlocks "done", indicating that we are exposing uncommitted changes.
+  //
+  // _rename_finish() does not send dentry link/unlink message to replicas,
+  // so for renames the xlocks on dentries are not marked "done": those
+  // xlocks keep dentries with projected linkages from getting new replicas.
+  const bool is_rename = req->get_op() == CEPH_MDS_OP_RENAME;
+  mds->locker->set_xlocks_done(mdr.get(), is_rename);
+
+  dout(10) << "early_reply " << reply->get_result()
+           << " (" << cpp_strerror(reply->get_result())
+           << ") " << *req << dendl;
+
+  if (tracei || tracedn) {
+    // forget pending cap releases for the objects included in the trace
+    if (tracei) {
+      mdr->cap_releases.erase(tracei->vino());
+    }
+    if (tracedn) {
+      mdr->cap_releases.erase(tracedn->get_dir()->get_inode()->vino());
+    }
+
+    set_trace_dist(reply, tracei, tracedn, mdr);
+  }
+
+  reply->set_extra_bl(mdr->reply_extra_bl);
+  mds->send_message_client(reply, mdr->session);
+
+  mdr->did_early_reply = true;
+
+  // reply accounting
+  mds->logger->inc(l_mds_reply);
+  utime_t elapsed = ceph_clock_now() - req->get_recv_stamp();
+  mds->logger->tinc(l_mds_reply_latency, elapsed);
+  if (elapsed >= g_conf()->mds_op_complaint_time) {
+    mds->logger->inc(l_mds_slow_reply);
+  }
+  if (client_inst.name.is_client()) {
+    mds->sessionmap.hit_session(mdr->session);
+  }
+  perf_gather_op_latency(req, elapsed);
+  dout(20) << "lat " << elapsed << dendl;
+
+  mdr->mark_event("early_replied");
+}
+
+/*
+ * Send the given reply for mdr's client request and clean up mdr.
+ *
+ * Records the request as completed in the session when appropriate, hands
+ * allocated inos to the session, attaches a trace (tracei/tracedn taken from
+ * mdr) unless an early reply already went out, advances replay if the
+ * request was queued for replay, and finally finishes the request.
+ */
+void Server::reply_client_request(MDRequestRef& mdr, const ref_t<MClientReply> &reply)
+{
+  ceph_assert(mdr.get());
+  const cref_t<MClientRequest> &req = mdr->client_request;
+
+  dout(7) << "reply_client_request " << reply->get_result()
+          << " (" << cpp_strerror(reply->get_result())
+          << ") " << *req << dendl;
+
+  mdr->mark_event("replying");
+
+  Session *session = mdr->session;
+
+  // note successful request in session map?
+  //
+  // setfilelock requests are special: they only modify state held in MDS
+  // memory, which is lost when the MDS fails.  A client re-sending a
+  // completed setfilelock request never saw the reply, so the MDS must
+  // execute it again rather than treat it as completed.
+  const bool record_completed =
+    req->may_write() && req->get_op() != CEPH_MDS_OP_SETFILELOCK &&
+    reply->get_result() == 0 && session;
+  if (record_completed) {
+    inodeno_t created = mdr->alloc_ino ? mdr->alloc_ino : mdr->used_prealloc_ino;
+    session->add_completed_request(mdr->reqid.tid, created);
+    if (mdr->ls) {
+      mdr->ls->touched_sessions.insert(session->info.inst.name);
+    }
+  }
+
+  // give any preallocated inos to the session
+  apply_allocated_inos(mdr, session);
+
+  // trace objects recorded on the request
+  CInode *tracei = mdr->tracei;
+  CDentry *tracedn = mdr->tracedn;
+
+  bool is_replay = mdr->client_request->is_replay();
+  bool did_early_reply = mdr->did_early_reply;
+  entity_inst_t client_inst = req->get_source_inst();
+
+  // reply accounting happens here only if early_reply() did not already do it
+  if (!did_early_reply && !is_replay) {
+    mds->logger->inc(l_mds_reply);
+    utime_t elapsed = ceph_clock_now() - mdr->client_request->get_recv_stamp();
+    mds->logger->tinc(l_mds_reply_latency, elapsed);
+    if (elapsed >= g_conf()->mds_op_complaint_time) {
+      mds->logger->inc(l_mds_slow_reply);
+    }
+    if (session && client_inst.name.is_client()) {
+      mds->sessionmap.hit_session(session);
+    }
+    perf_gather_op_latency(req, elapsed);
+    dout(20) << "lat " << elapsed << dendl;
+
+    if (tracei) {
+      mdr->cap_releases.erase(tracei->vino());
+    }
+    if (tracedn) {
+      mdr->cap_releases.erase(tracedn->get_dir()->get_inode()->vino());
+    }
+  }
+
+  // drop non-rdlocks before replying, so that we can issue leases
+  mdcache->request_drop_non_rdlocks(mdr);
+
+  // reply at all?
+  if (session && !client_inst.name.is_mds()) {
+    // don't issue leases if we sent an earlier reply already
+    const bool want_trace = !did_early_reply && (tracei || tracedn);
+    if (want_trace) {
+      if (!is_replay) {
+        // include metadata in reply
+        set_trace_dist(reply, tracei, tracedn, mdr);
+      } else if (tracei) {
+        mdcache->try_reconnect_cap(tracei, session);
+      }
+    }
+
+    // We can set the extra bl unconditionally: if it's already been sent in the
+    // early_reply, set_extra_bl will have claimed it and reply_extra_bl is empty
+    reply->set_extra_bl(mdr->reply_extra_bl);
+
+    reply->set_mdsmap_epoch(mds->mdsmap->get_epoch());
+    mds->send_message_client(reply, session);
+  }
+
+  // a replayed op that is done (or failed) lets the next queued one proceed
+  if (req->is_queued_for_replay() &&
+      (mdr->has_completed || reply->get_result() < 0)) {
+    const int r = reply->get_result();
+    if (r < 0) {
+      derr << "reply_client_request: failed to replay " << *req
+           << " error " << r << " (" << cpp_strerror(r) << ")" << dendl;
+      mds->clog->warn() << "failed to replay " << req->get_reqid() << " error " << r;
+    }
+    mds->queue_one_replay();
+  }
+
+  // clean up request
+  mdcache->request_finish(mdr);
+
+  // take a closer look at tracei, if it happens to be a remote link
+  if (tracei && tracedn && tracedn->get_projected_linkage()->is_remote()) {
+    mdcache->eval_remote(tracedn);
+  }
+}
+
+/*
+ * Build the reply trace for inode `in` and/or dentry `dn` and attach it to
+ * `reply`.
+ *
+ * (historical note kept from the original: pass inode OR dentry, not both,
+ * or we may get confused)
+ *
+ * trace is in reverse order (i.e. root inode comes last)
+ */
+void Server::set_trace_dist(const ref_t<MClientReply> &reply,
+                            CInode *in, CDentry *dn,
+                            MDRequestRef& mdr)
+{
+  // debugging aid: randomly send a reply without any trace
+  if (g_conf()->mds_inject_traceless_reply_probability &&
+      mdr->ls && !mdr->o_trunc &&
+      (rand() % 10000 < g_conf()->mds_inject_traceless_reply_probability * 10000.0)) {
+    dout(5) << "deliberately skipping trace for " << *reply << dendl;
+    return;
+  }
+
+  // encoded as: [dir inode, dirstat, dentry name + lease,] [target inode]
+  bufferlist trace_bl;
+  mds_rank_t whoami = mds->get_nodeid();
+  Session *session = mdr->session;
+  snapid_t snapid = mdr->snapid;
+  utime_t now = ceph_clock_now();
+
+  dout(20) << "set_trace_dist snapid " << snapid << dendl;
+
+  // realm
+  if (snapid == CEPH_NOSNAP) {
+    SnapRealm *realm = in ? in->find_snaprealm()
+                          : dn->get_dir()->get_inode()->find_snaprealm();
+    reply->snapbl = get_snap_trace(session, realm);
+    dout(10) << "set_trace_dist snaprealm " << *realm << " len=" << reply->snapbl.length() << dendl;
+  }
+
+  // dir + dentry?
+  if (!dn) {
+    reply->head.is_dentry = 0;
+  } else {
+    reply->head.is_dentry = 1;
+    CDir *dir = dn->get_dir();
+    CInode *diri = dir->get_inode();
+
+    diri->encode_inodestat(trace_bl, session, NULL, snapid);
+    dout(20) << "set_trace_dist added diri " << *diri << dendl;
+
+#ifdef MDS_VERIFY_FRAGSTAT
+    if (dir->is_complete())
+      dir->verify_fragstat();
+#endif
+    DirStat ds;
+    ds.frag = dir->get_frag();
+    ds.auth = dir->get_dir_auth().first;
+    if (dir->is_auth() && !forward_all_requests_to_auth) {
+      dir->get_dist_spec(ds.dist, whoami);
+    }
+
+    dir->encode_dirstat(trace_bl, session->info, ds);
+    dout(20) << "set_trace_dist added dir  " << *dir << dendl;
+
+    encode(dn->get_name(), trace_bl);
+    mds->locker->issue_client_lease(dn, in, mdr, now, trace_bl);
+  }
+
+  // inode
+  if (!in) {
+    reply->head.is_target = 0;
+  } else {
+    in->encode_inodestat(trace_bl, session, NULL, snapid, 0, mdr->getattr_caps);
+    dout(20) << "set_trace_dist added snap " << snapid << " in " << *in
+             << dendl;
+    reply->head.is_target = 1;
+  }
+
+  reply->set_trace(trace_bl);
+}
+
+// Entry point for an incoming MClientRequest.
+//
+// Validates the sender's session, short-circuits requests the session has
+// already completed (replays / retries), trims the session's completed
+// request list, registers an MDRequest in the cache, processes embedded cap
+// releases and finally hands the request to dispatch_client_request().
+void Server::handle_client_request(const cref_t<MClientRequest> &req)
+{
+  dout(4) << "handle_client_request " << *req << dendl;
+
+  // bump request counters on whichever perf loggers exist
+  if (mds->logger) {
+    mds->logger->inc(l_mds_request);
+  }
+  if (logger) {
+    logger->inc(l_mdss_handle_client_request);
+  }
+
+  // cache not open yet: retry this message once it is
+  if (!mdcache->is_open()) {
+    dout(5) << "waiting for root" << dendl;
+    mdcache->wait_for_open(new C_MDS_RetryMessage(mds, req));
+    return;
+  }
+
+  const bool closed_session_ok = replay_unsafe_with_closed_session;
+
+  // active session?
+  Session *session = nullptr;
+  if (req->is_a_client()) {
+    session = mds->get_session(req);
+    if (session == nullptr) {
+      dout(5) << "no session for " << req->get_source() << ", dropping" << dendl;
+    } else if ((session->is_closed() && !(mds->is_clientreplay() && closed_session_ok)) ||
+               session->is_closing() ||
+               session->is_killing()) {
+      // a closed session is tolerated only during clientreplay and only when
+      // replay_unsafe_with_closed_session is set
+      dout(5) << "session closed|closing|killing, dropping" << dendl;
+      session = nullptr;
+    }
+    if (session == nullptr) {
+      if (req->is_queued_for_replay()) {
+        mds->queue_one_replay();
+      }
+      return;
+    }
+  }
+
+  // old mdsmap?
+  if (req->get_mdsmap_epoch() < mds->mdsmap->get_epoch()) {
+    // deliberately nothing: sending the map here isn't ideal, a client with
+    // a high request rate might receive a lot of copies.
+  }
+
+  // completed request?
+  bool already_completed = false;
+  if (req->is_replay() || req->get_retry_attempt()) {
+    ceph_assert(session);
+    inodeno_t created;
+    if (session->have_completed_request(req->get_reqid().tid, &created)) {
+      already_completed = true;
+      if (!session->is_open()) {
+        return;
+      }
+      const bool open_or_create =
+        req->get_op() == CEPH_MDS_OP_OPEN || req->get_op() == CEPH_MDS_OP_CREATE;
+      // Don't send traceless reply if the completed request has created
+      // new inode. Treat the request as lookup request instead.
+      if (req->is_replay() ||
+          ((created == inodeno_t() || !mds->is_clientreplay()) && !open_or_create)) {
+        dout(5) << "already completed " << req->get_reqid() << dendl;
+        auto reply = make_message<MClientReply>(*req, 0);
+        if (created != inodeno_t()) {
+          bufferlist extra;
+          encode(created, extra);
+          reply->set_extra_bl(extra);
+        }
+        mds->send_message_client(reply, session);
+
+        if (req->is_queued_for_replay()) {
+          mds->queue_one_replay();
+        }
+        return;
+      }
+      if (!open_or_create) {
+        dout(10) << " completed request which created new inode " << created
+                 << ", convert it to lookup request" << dendl;
+        req->head.op = req->get_dentry_wanted() ? CEPH_MDS_OP_LOOKUP : CEPH_MDS_OP_GETATTR;
+        req->head.args.getattr.mask = CEPH_STAT_CAP_INODE_ALL;
+      }
+    }
+  }
+
+  // trim completed_request list
+  if (req->get_oldest_client_tid() > 0) {
+    dout(15) << " oldest_client_tid=" << req->get_oldest_client_tid() << dendl;
+    ceph_assert(session);
+    if (!session->trim_completed_requests(req->get_oldest_client_tid())) {
+      // nothing was trimmed: warn, with the threshold doubling for every
+      // warning already issued
+      if (session->get_num_completed_requests() >=
+          (g_conf()->mds_max_completed_requests << session->get_num_trim_requests_warnings())) {
+        session->inc_num_trim_requests_warnings();
+        CachedStackStringStream css;
+        *css << "client." << session->get_client() << " does not advance its oldest_client_tid ("
+             << req->get_oldest_client_tid() << "), "
+             << session->get_num_completed_requests()
+             << " completed requests recorded in session\n";
+        mds->clog->warn() << css->strv();
+        dout(20) << __func__ << " " << css->strv() << dendl;
+      }
+    } else {
+      // Sessions 'completed_requests' was dirtied, mark it to be
+      // potentially flushed at segment expiry.
+      mdlog->get_current_segment()->touched_sessions.insert(session->info.inst.name);
+
+      if (session->get_num_trim_requests_warnings() > 0 &&
+          session->get_num_completed_requests() * 2 < g_conf()->mds_max_completed_requests) {
+        session->reset_num_trim_requests_warnings();
+      }
+    }
+  }
+
+  // register + dispatch
+  MDRequestRef mdr = mdcache->request_start(req);
+  if (!mdr.get()) {
+    return;
+  }
+
+  if (session) {
+    mdr->session = session;
+    session->requests.push_back(&mdr->item_session_request);
+  }
+
+  if (already_completed) {
+    mdr->has_completed = true;
+  }
+
+  // process embedded cap releases?
+  //  (only if NOT replay!)
+  if (!req->releases.empty() && req->is_a_client() && !req->is_replay()) {
+    client_t client = req->get_source().num();
+    for (const auto &rel : req->releases) {
+      mds->locker->process_request_cap_release(mdr, client, rel.item, rel.dname);
+    }
+    req->releases.clear();
+  }
+
+  dispatch_client_request(mdr);
+}
+
+// Refresh is_full from the current OSDMap: it becomes true when the metadata
+// pool exists in the map and carries pg_pool_t::FLAG_FULL.
+void Server::handle_osd_map()
+{
+  /* Note that we check the OSDMAP_FULL flag directly rather than
+   * using osdmap_full_flag(), because we want to know "is the flag set"
+   * rather than "does the flag apply to us?" */
+  mds->objecter->with_osdmap([this](const OSDMap& osdmap) {
+    const auto *metadata_pool = osdmap.get_pg_pool(mds->get_metadata_pool());
+    is_full = metadata_pool && metadata_pool->has_flag(pg_pool_t::FLAG_FULL);
+    dout(7) << __func__ << ": full = " << is_full << " epoch = "
+            << osdmap.get_epoch() << dendl;
+  });
+}
+
+// Route a registered client request to its per-op handler.
+//
+// Before dispatching:
+//  - a killed request is dropped (a killed batch head hands over to a new
+//    head taken from its batch op, if any) and an aborted one is killed;
+//  - writes are refused with -CEPHFS_EROFS on a read-only cache;
+//  - a recorded peer error is returned to the client;
+//  - while the metadata pool is flagged full (is_full), a fixed set of ops
+//    is refused with -CEPHFS_ENOSPC unless check_access() grants MAY_FULL.
+// Unknown ops are answered with -CEPHFS_EOPNOTSUPP.
+void Server::dispatch_client_request(MDRequestRef& mdr)
+{
+  // we shouldn't be waiting on anyone.
+  ceph_assert(!mdr->has_more() || mdr->more()->waiting_on_peer.empty());
+
+  if (mdr->killed) {
+    dout(10) << "request " << *mdr << " was killed" << dendl;
+    //if the mdr is a "batch_op" and it has followers, pick a follower as
+    //the new "head of the batch ops" and go on processing the new one.
+    if (mdr->is_batch_head()) {
+      int mask = mdr->client_request->head.args.getattr.mask;
+      auto it = mdr->batch_op_map->find(mask);
+      auto new_batch_head = it->second->find_new_head();
+      if (!new_batch_head) {
+        mdr->batch_op_map->erase(it);
+        return;
+      }
+      // note: this rebinds the caller's reference to the new head
+      mdr = std::move(new_batch_head);
+    } else {
+      return;
+    }
+  } else if (mdr->aborted) {
+    mdr->aborted = false;
+    mdcache->request_kill(mdr);
+    return;
+  }
+
+  const cref_t<MClientRequest> &req = mdr->client_request;
+
+  if (logger) logger->inc(l_mdss_dispatch_client_request);
+
+  dout(7) << "dispatch_client_request " << *req << dendl;
+
+  if (req->may_write() && mdcache->is_readonly()) {
+    dout(10) << " read-only FS" << dendl;
+    respond_to_request(mdr, -CEPHFS_EROFS);
+    return;
+  }
+  if (mdr->has_more() && mdr->more()->peer_error) {
+    dout(10) << " got error from peers" << dendl;
+    respond_to_request(mdr, mdr->more()->peer_error);
+    return;
+  }
+
+  if (is_full) {
+    CInode *cur = try_get_auth_inode(mdr, req->get_filepath().get_ino());
+    if (!cur) {
+      // the request is already responded to
+      return;
+    }
+    // ops that are refused on a full pool unless the client has FULL caps
+    // (the original list tested CEPH_MDS_OP_SETLAYOUT twice; the redundant
+    // test is dropped, the set of ops is unchanged)
+    if (req->get_op() == CEPH_MDS_OP_SETLAYOUT ||
+        req->get_op() == CEPH_MDS_OP_SETDIRLAYOUT ||
+        req->get_op() == CEPH_MDS_OP_RMXATTR ||
+        req->get_op() == CEPH_MDS_OP_SETXATTR ||
+        req->get_op() == CEPH_MDS_OP_CREATE ||
+        req->get_op() == CEPH_MDS_OP_SYMLINK ||
+        req->get_op() == CEPH_MDS_OP_MKSNAP ||
+        ((req->get_op() == CEPH_MDS_OP_LINK ||
+          req->get_op() == CEPH_MDS_OP_RENAME) &&
+         (!mdr->has_more() || mdr->more()->witnessed.empty())) // haven't started peer request
+        ) {
+
+      if (check_access(mdr, cur, MAY_FULL)) {
+        dout(20) << __func__ << ": full, has FULL caps, permitting op " << ceph_mds_op_name(req->get_op()) << dendl;
+      } else {
+        dout(20) << __func__ << ": full, responding CEPHFS_ENOSPC to op " << ceph_mds_op_name(req->get_op()) << dendl;
+        respond_to_request(mdr, -CEPHFS_ENOSPC);
+        return;
+      }
+    } else {
+      dout(20) << __func__ << ": full, permitting op " << ceph_mds_op_name(req->get_op()) << dendl;
+    }
+  }
+
+  switch (req->get_op()) {
+  case CEPH_MDS_OP_LOOKUPHASH:
+  case CEPH_MDS_OP_LOOKUPINO:
+    handle_client_lookup_ino(mdr, false, false);
+    break;
+  case CEPH_MDS_OP_LOOKUPPARENT:
+    handle_client_lookup_ino(mdr, true, false);
+    break;
+  case CEPH_MDS_OP_LOOKUPNAME:
+    handle_client_lookup_ino(mdr, false, true);
+    break;
+
+    // inodes ops.
+  case CEPH_MDS_OP_LOOKUP:
+    handle_client_getattr(mdr, true);
+    break;
+
+  case CEPH_MDS_OP_LOOKUPSNAP:
+    // lookupsnap does not reference a CDentry; treat it as a getattr
+  case CEPH_MDS_OP_GETATTR:
+    handle_client_getattr(mdr, false);
+    break;
+  case CEPH_MDS_OP_GETVXATTR:
+    handle_client_getvxattr(mdr);
+    break;
+
+  case CEPH_MDS_OP_SETATTR:
+    handle_client_setattr(mdr);
+    break;
+  case CEPH_MDS_OP_SETLAYOUT:
+    handle_client_setlayout(mdr);
+    break;
+  case CEPH_MDS_OP_SETDIRLAYOUT:
+    handle_client_setdirlayout(mdr);
+    break;
+  case CEPH_MDS_OP_SETXATTR:
+    handle_client_setxattr(mdr);
+    break;
+  case CEPH_MDS_OP_RMXATTR:
+    handle_client_removexattr(mdr);
+    break;
+
+  case CEPH_MDS_OP_READDIR:
+    handle_client_readdir(mdr);
+    break;
+
+  case CEPH_MDS_OP_SETFILELOCK:
+    handle_client_file_setlock(mdr);
+    break;
+
+  case CEPH_MDS_OP_GETFILELOCK:
+    handle_client_file_readlock(mdr);
+    break;
+
+    // funky.
+  case CEPH_MDS_OP_CREATE:
+    if (mdr->has_completed)
+      handle_client_open(mdr);  // already created.. just open
+    else
+      handle_client_openc(mdr);
+    break;
+
+  case CEPH_MDS_OP_OPEN:
+    handle_client_open(mdr);
+    break;
+
+    // namespace.
+    // no prior locks.
+  case CEPH_MDS_OP_MKNOD:
+    handle_client_mknod(mdr);
+    break;
+  case CEPH_MDS_OP_LINK:
+    handle_client_link(mdr);
+    break;
+  case CEPH_MDS_OP_UNLINK:
+  case CEPH_MDS_OP_RMDIR:
+    handle_client_unlink(mdr);
+    break;
+  case CEPH_MDS_OP_RENAME:
+    handle_client_rename(mdr);
+    break;
+  case CEPH_MDS_OP_MKDIR:
+    handle_client_mkdir(mdr);
+    break;
+  case CEPH_MDS_OP_SYMLINK:
+    handle_client_symlink(mdr);
+    break;
+
+
+    // snaps
+  case CEPH_MDS_OP_LSSNAP:
+    handle_client_lssnap(mdr);
+    break;
+  case CEPH_MDS_OP_MKSNAP:
+    handle_client_mksnap(mdr);
+    break;
+  case CEPH_MDS_OP_RMSNAP:
+    handle_client_rmsnap(mdr);
+    break;
+  case CEPH_MDS_OP_RENAMESNAP:
+    handle_client_renamesnap(mdr);
+    break;
+  case CEPH_MDS_OP_READDIR_SNAPDIFF:
+    handle_client_readdir_snapdiff(mdr);
+    break;
+
+  default:
+    dout(1) << " unknown client op " << req->get_op() << dendl;
+    respond_to_request(mdr, -CEPHFS_EOPNOTSUPP);
+  }
+}
+
+
+// ---------------------------------------
+// PEER REQUESTS
+
+// Entry point for an MMDSPeerRequest sent by another MDS.
+//
+// Replies are forwarded to handle_peer_request_reply(); OP_RENAMENOTIFY is
+// acknowledged immediately.  Otherwise the message is matched against an
+// existing request with the same reqid (comparing attempt numbers) or a new
+// peer request is started, and the result is passed on to
+// dispatch_peer_request().
+void Server::handle_peer_request(const cref_t<MMDSPeerRequest> &m)
+{
+  dout(4) << "handle_peer_request " << m->get_reqid() << " from " << m->get_source() << dendl;
+  mds_rank_t from = mds_rank_t(m->get_source().num());
+
+  if (logger) {
+    logger->inc(l_mdss_handle_peer_request);
+  }
+
+  // reply?
+  if (m->is_reply()) {
+    handle_peer_request_reply(m);
+    return;
+  }
+
+  // the purpose of rename notify is enforcing causal message ordering. making sure
+  // bystanders have received all messages from rename srcdn's auth MDS.
+  if (m->get_op() == MMDSPeerRequest::OP_RENAMENOTIFY) {
+    auto ack = make_message<MMDSPeerRequest>(m->get_reqid(), m->get_attempt(), MMDSPeerRequest::OP_RENAMENOTIFYACK);
+    mds->send_message(ack, m->get_connection());
+    return;
+  }
+
+  // decode a replicated stray dentry carried by the message, if any
+  CDentry *straydn = nullptr;
+  if (m->straybl.length() > 0) {
+    mdcache->decode_replica_stray(straydn, nullptr, m->straybl, from);
+    ceph_assert(straydn);
+    m->straybl.clear();
+  }
+
+  const bool serving = mds->is_clientreplay() || mds->is_active() || mds->is_stopping();
+  if (!serving) {
+    dout(3) << "not clientreplay|active yet, waiting" << dendl;
+    mds->wait_for_replay(new C_MDS_RetryMessage(mds, m));
+    return;
+  }
+
+  // am i a new peer?
+  MDRequestRef mdr;
+  if (mdcache->have_request(m->get_reqid())) {
+    // existing?
+    mdr = mdcache->request_get(m->get_reqid());
+
+    // is my request newer?
+    if (mdr->attempt > m->get_attempt()) {
+      dout(10) << "local request " << *mdr << " attempt " << mdr->attempt << " > " << m->get_attempt()
+               << ", dropping " << *m << dendl;
+      return;
+    }
+
+    if (mdr->attempt < m->get_attempt()) {
+      // mine is old, close it out
+      dout(10) << "local request " << *mdr << " attempt " << mdr->attempt << " < " << m->get_attempt()
+               << ", closing out" << dendl;
+      mdcache->request_finish(mdr);
+      mdr.reset();
+    } else if (mdr->peer_to_mds != from) {
+      dout(10) << "local request " << *mdr << " not peer to mds." << from << dendl;
+      return;
+    }
+
+    // NOTE(review): when the stale request was closed out above, mdr is empty
+    // here and the two op checks below would use it as-is -- confirm that
+    // OP_DROPLOCKS / OP_FINISH cannot arrive with a newer attempt.
+
+    // may get these while mdr->peer_request is non-null
+    if (m->get_op() == MMDSPeerRequest::OP_DROPLOCKS) {
+      mds->locker->drop_locks(mdr.get());
+      return;
+    }
+    if (m->get_op() == MMDSPeerRequest::OP_FINISH) {
+      if (!m->is_abort()) {
+        if (m->inode_export.length() > 0) {
+          mdr->more()->inode_import = m->inode_export;
+        }
+        // finish off request.
+        mdcache->request_finish(mdr);
+      } else {
+        mdr->aborted = true;
+        if (mdr->peer_request) {
+          // only abort on-going xlock, wrlock and auth pin
+          ceph_assert(!mdr->peer_did_prepare());
+        } else {
+          mdcache->request_finish(mdr);
+        }
+      }
+      return;
+    }
+  }
+
+  if (!mdr.get()) {
+    // new?
+    if (m->get_op() == MMDSPeerRequest::OP_FINISH) {
+      dout(10) << "missing peer request for " << m->get_reqid()
+               << " OP_FINISH, must have lost race with a forward" << dendl;
+      return;
+    }
+    mdr = mdcache->request_start_peer(m->get_reqid(), m->get_attempt(), m);
+    mdr->set_op_stamp(m->op_stamp);
+  }
+  ceph_assert(mdr->peer_request == 0);     // only one at a time, please!
+
+  if (straydn) {
+    mdr->pin(straydn);
+    mdr->straydn = straydn;
+  }
+
+  // while we are in clientreplay, a sender that is not itself in
+  // clientreplay has to wait until we are active, unless this request
+  // already holds locks
+  if (mds->is_clientreplay() && !mds->mdsmap->is_clientreplay(from) &&
+      mdr->locks.empty()) {
+    dout(3) << "not active yet, waiting" << dendl;
+    mds->wait_for_active(new C_MDS_RetryMessage(mds, m));
+    return;
+  }
+
+  mdr->reset_peer_request(m);
+
+  dispatch_peer_request(mdr);
+}
+
+void Server::handle_peer_request_reply(const cref_t<MMDSPeerRequest> &m) /*
+ * Handle a reply `m` sent back by a peer MDS for a request led by this rank.
+ *
+ * Flow, as implemented below:
+ *  - when this rank is neither clientreplay, active nor stopping, the reply
+ *    is dropped unless MDCache reports an uncommitted leader entry for the
+ *    (reqid, sender) pair, in which case the message is retried after replay;
+ *  - OP_COMMITTED is forwarded to MDCache::committed_leader_peer();
+ *  - a reply whose attempt differs from the local request's is ignored;
+ *  - OP_XLOCKACK and OP_WRLOCKACK record the remote lock on the request,
+ *    clear the sender from waiting_on_peer and re-dispatch the request;
+ *  - the other ACK ops are delegated to their dedicated handlers;
+ *  - any other op ends in ceph_abort_msg().
+ */
+{
+ mds_rank_t from = mds_rank_t(m->get_source().num()); // rank number of the sender
+
+ if (!mds->is_clientreplay() && !mds->is_active() && !mds->is_stopping()) {
+ metareqid_t r = m->get_reqid();
+ if (!mdcache->have_uncommitted_leader(r, from)) {
+ dout(10) << "handle_peer_request_reply ignoring peer reply from mds."
+ << from << " reqid " << r << dendl;
+ return;
+ }
+ dout(3) << "not clientreplay|active yet, waiting" << dendl;
+ mds->wait_for_replay(new C_MDS_RetryMessage(mds, m));
+ return;
+ }
+
+ if (m->get_op() == MMDSPeerRequest::OP_COMMITTED) {
+ metareqid_t r = m->get_reqid();
+ mdcache->committed_leader_peer(r, from);
+ return;
+ }
+
+ MDRequestRef mdr = mdcache->request_get(m->get_reqid()); // NOTE(review): dereferenced right away; presumably request_get() requires the request to exist - confirm
+ if (m->get_attempt() != mdr->attempt) {
+ dout(10) << "handle_peer_request_reply " << *mdr << " ignoring reply from other attempt "
+ << m->get_attempt() << dendl;
+ return;
+ }
+
+ switch (m->get_op()) {
+ case MMDSPeerRequest::OP_XLOCKACK:
+ {
+ // identify lock, leader request
+ SimpleLock *lock = mds->locker->get_lock(m->get_lock_type(),
+ m->get_object_info());
+ mdr->more()->peers.insert(from);
+ lock->decode_locked_state(m->get_lock_data()); // state shipped by the peer alongside the ack
+ dout(10) << "got remote xlock on " << *lock << " on " << *lock->get_parent() << dendl;
+ mdr->emplace_lock(lock, MutationImpl::LockOp::XLOCK);
+ mdr->finish_locking(lock);
+ lock->get_xlock(mdr, mdr->get_client());
+
+ ceph_assert(mdr->more()->waiting_on_peer.count(from));
+ mdr->more()->waiting_on_peer.erase(from);
+ ceph_assert(mdr->more()->waiting_on_peer.empty()); // the sender must have been the only peer waited on
+ mdcache->dispatch_request(mdr);
+ }
+ break;
+
+ case MMDSPeerRequest::OP_WRLOCKACK:
+ {
+ // identify lock, leader request
+ SimpleLock *lock = mds->locker->get_lock(m->get_lock_type(),
+ m->get_object_info());
+ mdr->more()->peers.insert(from);
+ dout(10) << "got remote wrlock on " << *lock << " on " << *lock->get_parent() << dendl;
+ auto it = mdr->emplace_lock(lock, MutationImpl::LockOp::REMOTE_WRLOCK, from);
+ ceph_assert(it->is_remote_wrlock());
+ ceph_assert(it->wrlock_target == from);
+
+ mdr->finish_locking(lock);
+
+ ceph_assert(mdr->more()->waiting_on_peer.count(from));
+ mdr->more()->waiting_on_peer.erase(from);
+ ceph_assert(mdr->more()->waiting_on_peer.empty()); // the sender must have been the only peer waited on
+ mdcache->dispatch_request(mdr);
+ }
+ break;
+
+ case MMDSPeerRequest::OP_AUTHPINACK:
+ handle_peer_auth_pin_ack(mdr, m);
+ break;
+
+ case MMDSPeerRequest::OP_LINKPREPACK:
+ handle_peer_link_prep_ack(mdr, m);
+ break;
+
+ case MMDSPeerRequest::OP_RMDIRPREPACK:
+ handle_peer_rmdir_prep_ack(mdr, m);
+ break;
+
+ case MMDSPeerRequest::OP_RENAMEPREPACK:
+ handle_peer_rename_prep_ack(mdr, m);
+ break;
+
+ case MMDSPeerRequest::OP_RENAMENOTIFYACK:
+ handle_peer_rename_notify_ack(mdr, m);
+ break;
+
+ default:
+ ceph_abort_msg("unknown op " + to_string(m->get_op()) + " requested");
+ }
+}
+
+void Server::dispatch_peer_request(MDRequestRef& mdr) /*
+ * Execute the peer request currently attached to `mdr` (mdr->peer_request).
+ *
+ *  - If mdr->aborted is set, the request is finished without further work.
+ *  - OP_XLOCK / OP_WRLOCK: look up the lock, rebuild the lock vector from the
+ *    xlocks and wrlocks already held plus the requested one, and go through
+ *    Locker::acquire_locks(). When that returns false the function returns
+ *    with peer_request still set; otherwise the matching ACK is sent. An
+ *    xlock attempt on a non-auth object is only logged: no ack is sent.
+ *  - OP_UNXLOCK / OP_UNWRLOCK: release the lock and issue caps if the locker
+ *    asks for it; no ack is sent.
+ *  - The remaining ops are delegated to their handlers; an unknown op ends
+ *    in ceph_abort_msg().
+ */
+{
+ dout(7) << "dispatch_peer_request " << *mdr << " " << *mdr->peer_request << dendl;
+
+ if (mdr->aborted) {
+ dout(7) << " abort flag set, finishing" << dendl;
+ mdcache->request_finish(mdr);
+ return;
+ }
+
+ if (logger) logger->inc(l_mdss_dispatch_peer_request);
+
+ int op = mdr->peer_request->get_op();
+ switch (op) {
+ case MMDSPeerRequest::OP_XLOCK:
+ case MMDSPeerRequest::OP_WRLOCK:
+ {
+ // identify object
+ SimpleLock *lock = mds->locker->get_lock(mdr->peer_request->get_lock_type(),
+ mdr->peer_request->get_object_info());
+
+ if (!lock) {
+ dout(10) << "don't have object, dropping" << dendl;
+ ceph_abort_msg("don't have object"); // can this happen, if we auth pinned properly.
+ }
+ if (op == MMDSPeerRequest::OP_XLOCK && !lock->get_parent()->is_auth()) {
+ dout(10) << "not auth for remote xlock attempt, dropping on "
+ << *lock << " on " << *lock->get_parent() << dendl;
+ } else {
+ // use acquire_locks so that we get auth_pinning.
+ MutationImpl::LockOpVec lov;
+ for (const auto& p : mdr->locks) { // carry over the xlocks/wrlocks this request already holds
+ if (p.is_xlock())
+ lov.add_xlock(p.lock);
+ else if (p.is_wrlock())
+ lov.add_wrlock(p.lock);
+ }
+
+ int replycode = 0;
+ switch (op) {
+ case MMDSPeerRequest::OP_XLOCK:
+ lov.add_xlock(lock);
+ replycode = MMDSPeerRequest::OP_XLOCKACK;
+ break;
+ case MMDSPeerRequest::OP_WRLOCK:
+ lov.add_wrlock(lock);
+ replycode = MMDSPeerRequest::OP_WRLOCKACK;
+ break;
+ }
+
+ if (!mds->locker->acquire_locks(mdr, lov))
+ return; // peer_request is left in place here; presumably the request is retried later - confirm
+
+ // ack
+ auto r = make_message<MMDSPeerRequest>(mdr->reqid, mdr->attempt, replycode);
+ r->set_lock_type(lock->get_type());
+ lock->get_parent()->set_object_info(r->get_object_info());
+ if (replycode == MMDSPeerRequest::OP_XLOCKACK)
+ lock->encode_locked_state(r->get_lock_data()); // only xlock acks carry the lock state
+ mds->send_message(r, mdr->peer_request->get_connection());
+ }
+
+ // done.
+ mdr->reset_peer_request();
+ }
+ break;
+
+ case MMDSPeerRequest::OP_UNXLOCK:
+ case MMDSPeerRequest::OP_UNWRLOCK:
+ {
+ SimpleLock *lock = mds->locker->get_lock(mdr->peer_request->get_lock_type(),
+ mdr->peer_request->get_object_info());
+ ceph_assert(lock);
+ auto it = mdr->locks.find(lock);
+ ceph_assert(it != mdr->locks.end()); // the lock being released must be held by this request
+ bool need_issue = false;
+ switch (op) {
+ case MMDSPeerRequest::OP_UNXLOCK:
+ mds->locker->xlock_finish(it, mdr.get(), &need_issue);
+ break;
+ case MMDSPeerRequest::OP_UNWRLOCK:
+ mds->locker->wrlock_finish(it, mdr.get(), &need_issue);
+ break;
+ }
+ if (need_issue)
+ mds->locker->issue_caps(static_cast<CInode*>(lock->get_parent())); // NOTE(review): assumes the lock's parent is a CInode whenever need_issue is set - confirm
+
+ // done. no ack necessary.
+ mdr->reset_peer_request();
+ }
+ break;
+
+ case MMDSPeerRequest::OP_AUTHPIN:
+ handle_peer_auth_pin(mdr);
+ break;
+
+ case MMDSPeerRequest::OP_LINKPREP:
+ case MMDSPeerRequest::OP_UNLINKPREP:
+ handle_peer_link_prep(mdr);
+ break;
+
+ case MMDSPeerRequest::OP_RMDIRPREP:
+ handle_peer_rmdir_prep(mdr);
+ break;
+
+ case MMDSPeerRequest::OP_RENAMEPREP:
+ handle_peer_rename_prep(mdr);
+ break;
+
+ default:
+ ceph_abort_msg("unknown op "+ to_string(op)+ " received");
+ }
+}
+
+void Server::handle_peer_auth_pin(MDRequestRef& mdr) /*
+ * Serve an OP_AUTHPIN peer request attached to `mdr`.
+ *
+ * Steps:
+ *  1. fail immediately when the cache is read-only;
+ *  2. resolve every requested object in the cache (a missing one fails);
+ *  3. check that each object is auth here and can be auth pinned. When it
+ *     cannot be pinned, a nonblocking request fails with "wouldblock",
+ *     otherwise the request waits for WAIT_UNFREEZE and jumps to `blocked`;
+ *  4. optionally freeze the auth pin of the inode named by
+ *     get_authpin_freeze(), waiting for WAIT_FROZEN if that is not yet done;
+ *  5. reply with OP_AUTHPINACK: on failure local auth pins are dropped and
+ *     the rofs / wouldblock markers are set as applicable, on success the
+ *     objects are auth pinned and all auth pins held by mdr are listed.
+ *
+ * At `blocked`, a single "request blocked" ack is sent if the peer request
+ * asked to be notified, and the notify flag is cleared afterwards.
+ */
+{
+ dout(10) << "handle_peer_auth_pin " << *mdr << dendl;
+
+ // build list of objects
+ list<MDSCacheObject*> objects;
+ CInode *auth_pin_freeze = NULL;
+ bool nonblocking = mdr->peer_request->is_nonblocking();
+ bool fail = false, wouldblock = false, readonly = false;
+ ref_t<MMDSPeerRequest> reply;
+
+ if (mdcache->is_readonly()) {
+ dout(10) << " read-only FS" << dendl;
+ readonly = true;
+ fail = true;
+ }
+
+ if (!fail) {
+ for (const auto &oi : mdr->peer_request->get_authpins()) {
+ MDSCacheObject *object = mdcache->get_object(oi);
+ if (!object) {
+ dout(10) << " don't have " << oi << dendl;
+ fail = true;
+ break;
+ }
+
+ objects.push_back(object);
+ if (oi == mdr->peer_request->get_authpin_freeze())
+ auth_pin_freeze = static_cast<CInode*>(object); // NOTE(review): assumes the freeze target always names an inode - confirm
+ }
+ }
+
+ // can we auth pin them?
+ if (!fail) {
+ for (const auto& obj : objects) {
+ if (!obj->is_auth()) {
+ dout(10) << " not auth for " << *obj << dendl;
+ fail = true;
+ break;
+ }
+ if (mdr->is_auth_pinned(obj))
+ continue;
+ if (!mdr->can_auth_pin(obj)) {
+ if (nonblocking) {
+ dout(10) << " can't auth_pin (freezing?) " << *obj << " nonblocking" << dendl;
+ fail = true;
+ wouldblock = true;
+ break;
+ }
+ // wait
+ dout(10) << " waiting for authpinnable on " << *obj << dendl;
+ obj->add_waiter(CDir::WAIT_UNFREEZE, new C_MDS_RetryRequest(mdcache, mdr));
+ mdr->drop_local_auth_pins();
+
+ mds->locker->notify_freeze_waiter(obj);
+ goto blocked;
+ }
+ }
+ }
+
+ if (!fail) {
+ /* a previous freeze on a different inode is no longer wanted: undo it */
+ if (mdr->has_more() && mdr->more()->is_freeze_authpin &&
+ mdr->more()->rename_inode != auth_pin_freeze)
+ mdr->unfreeze_auth_pin(true);
+
+ /* handle_peer_rename_prep() calls freeze_inode() to wait for all other
+ * operations on the source inode to complete. This happens after all locks
+ * for the rename operation are acquired. But to acquire locks, we first
+ * need to auth pin the locks' parent objects. So there is an ABBA deadlock
+ * if someone auth pins the source inode after locks are acquired and before
+ * Server::handle_peer_rename_prep() is called.
+ * The solution is to freeze the inode and prevent other MDRequests from
+ * getting new auth pins.
+ */
+ if (auth_pin_freeze) {
+ dout(10) << " freezing auth pin on " << *auth_pin_freeze << dendl;
+ if (!mdr->freeze_auth_pin(auth_pin_freeze)) {
+ auth_pin_freeze->add_waiter(CInode::WAIT_FROZEN, new C_MDS_RetryRequest(mdcache, mdr));
+ mds->mdlog->flush();
+ goto blocked;
+ }
+ }
+ }
+
+ reply = make_message<MMDSPeerRequest>(mdr->reqid, mdr->attempt, MMDSPeerRequest::OP_AUTHPINACK);
+
+ if (fail) {
+ mdr->drop_local_auth_pins(); // just in case
+ if (readonly)
+ reply->mark_error_rofs();
+ if (wouldblock)
+ reply->mark_error_wouldblock();
+ } else {
+ // auth pin!
+ for (const auto& obj : objects) {
+ dout(10) << "auth_pinning " << *obj << dendl;
+ mdr->auth_pin(obj);
+ }
+ // return list of my auth_pins (if any)
+ for (const auto &p : mdr->object_states) {
+ if (!p.second.auth_pinned)
+ continue;
+ MDSCacheObjectInfo info;
+ p.first->set_object_info(info);
+ reply->get_authpins().push_back(info);
+ if (p.first == (MDSCacheObject*)auth_pin_freeze)
+ auth_pin_freeze->set_object_info(reply->get_authpin_freeze()); // echo back which inode got its auth pin frozen
+ }
+ }
+
+ mds->send_message_mds(reply, mdr->peer_to_mds);
+
+ // clean up this request
+ mdr->reset_peer_request();
+ return;
+
+blocked:
+ if (mdr->peer_request->should_notify_blocking()) {
+ reply = make_message<MMDSPeerRequest>(mdr->reqid, mdr->attempt, MMDSPeerRequest::OP_AUTHPINACK);
+ reply->mark_req_blocked();
+ mds->send_message_mds(reply, mdr->peer_to_mds);
+ mdr->peer_request->clear_notify_blocking(); // so a later pass through `blocked` does not notify again
+ }
+ return;
+}
+
+void Server::handle_peer_auth_pin_ack(MDRequestRef& mdr, const cref_t<MMDSPeerRequest> &ack) /*
+ * Process an OP_AUTHPINACK received from a peer for the leader request `mdr`.
+ *
+ * A "request blocked" ack only disables the lock cache and drops the locks
+ * held by `mdr`. Otherwise the remote auth-pin bookkeeping in `mdr` is brought
+ * in line with the list carried by `ack`: objects listed are recorded as
+ * remotely pinned by the sender, objects previously pinned by the sender but
+ * no longer listed are cleared. The sender is then added to the peer set and
+ * removed from waiting_on_peer, any rofs / wouldblock error is stored in
+ * peer_error, and the request is re-dispatched once no peer is outstanding.
+ */
+{
+ dout(10) << "handle_peer_auth_pin_ack on " << *mdr << " " << *ack << dendl;
+ mds_rank_t from = mds_rank_t(ack->get_source().num());
+
+ if (ack->is_req_blocked()) {
+ mdr->disable_lock_cache();
+ // peer auth pin is blocked, drop locks to avoid deadlock
+ mds->locker->drop_locks(mdr.get(), nullptr);
+ return;
+ }
+
+ // added auth pins?
+ set<MDSCacheObject*> pinned;
+ for (const auto &oi : ack->get_authpins()) {
+ MDSCacheObject *object = mdcache->get_object(oi);
+ ceph_assert(object); // we pinned it
+ dout(10) << " remote has pinned " << *object << dendl;
+ mdr->set_remote_auth_pinned(object, from);
+ if (oi == ack->get_authpin_freeze())
+ mdr->set_remote_frozen_auth_pin(static_cast<CInode *>(object));
+ pinned.insert(object);
+ }
+
+ // removed frozen auth pin ?
+ if (mdr->more()->is_remote_frozen_authpin &&
+ ack->get_authpin_freeze() == MDSCacheObjectInfo()) { // ack carries no freeze target
+ auto stat_p = mdr->find_object_state(mdr->more()->rename_inode);
+ ceph_assert(stat_p);
+ if (stat_p->remote_auth_pinned == from) {
+ mdr->more()->is_remote_frozen_authpin = false;
+ }
+ }
+
+ // removed auth pins?
+ for (auto& p : mdr->object_states) {
+ if (p.second.remote_auth_pinned == MDS_RANK_NONE)
+ continue;
+ MDSCacheObject* object = p.first;
+ if (p.second.remote_auth_pinned == from && pinned.count(object) == 0) {
+ dout(10) << " remote has unpinned " << *object << dendl;
+ mdr->_clear_remote_auth_pinned(p.second);
+ }
+ }
+
+ // note peer
+ mdr->more()->peers.insert(from);
+
+ // clear from waiting list
+ auto ret = mdr->more()->waiting_on_peer.erase(from);
+ ceph_assert(ret); // the sender must have been in the waiting set
+
+ if (ack->is_error_rofs()) {
+ mdr->more()->peer_error = -CEPHFS_EROFS;
+ } else if (ack->is_error_wouldblock()) {
+ mdr->more()->peer_error = -CEPHFS_EWOULDBLOCK;
+ }
+
+ // go again?
+ if (mdr->more()->waiting_on_peer.empty())
+ mdcache->dispatch_request(mdr);
+ else
+ dout(10) << "still waiting on peers " << mdr->more()->waiting_on_peer << dendl;
+}
+
+
+// ---------------------------------------
+// HELPERS
+
+
+/**
+ * Decide whether the client behind @p mdr may perform @p mask on @p in.
+ *
+ * The decision is delegated to the request's session, which is given the
+ * caller's uid/gid/gid list and the uid/gid carried in the request's setattr
+ * arguments. A request without a session is always allowed.
+ *
+ * @returns true when the operation may proceed; false when the session check
+ *          failed, in which case the request has already been answered with
+ *          the error code the check returned.
+ */
+bool Server::check_access(MDRequestRef& mdr, CInode *in, unsigned mask)
+{
+  // nothing to check against without a session
+  if (!mdr->session)
+    return true;
+
+  const auto& req = mdr->client_request;
+  const int rc = mdr->session->check_access(
+    in, mask,
+    req->get_caller_uid(),
+    req->get_caller_gid(),
+    &req->get_caller_gid_list(),
+    req->head.args.setattr.uid,
+    req->head.args.setattr.gid);
+  if (rc >= 0)
+    return true;
+
+  respond_to_request(mdr, rc);
+  return false;
+}
+
+/**
+ * Check that a directory fragment is still below its maximum size.
+ *
+ * @param mdr request to answer when the fragment is full
+ * @param dir fragment whose size is compared against bal_fragment_size_max
+ * @returns true when there is room left; false when the limit is reached,
+ *          in which case the request has been answered with -CEPHFS_ENOSPC.
+ */
+bool Server::check_fragment_space(MDRequestRef &mdr, CDir *dir)
+{
+  const auto cur = dir->get_frag_size();
+  const auto limit = bal_fragment_size_max;
+
+  if (cur < limit) {
+    dout(20) << "fragment " << *dir << " size " << cur << " < " << limit << dendl;
+    return true;
+  }
+
+  dout(10) << "fragment " << *dir << " size exceeds " << limit << " (CEPHFS_ENOSPC)" << dendl;
+  respond_to_request(mdr, -CEPHFS_ENOSPC);
+  return false;
+}
+
+/**
+ * Check that a directory has not reached the configured entry limit.
+ *
+ * The entry count is the sum of files and sub-directories recorded in the
+ * projected dirstat of the directory's inode. A dir_max_entries of 0
+ * disables the check.
+ *
+ * @param mdr request to answer when the limit is reached
+ * @param in  directory fragment whose inode is inspected
+ * @returns true when another entry may be added; false when the limit is
+ *          reached, in which case the request has been answered with
+ *          -CEPHFS_ENOSPC (the same error code check_fragment_space() uses).
+ */
+bool Server::check_dir_max_entries(MDRequestRef &mdr, CDir *in)
+{
+  // look the projected inode up once instead of once per counter
+  const auto& pi = in->inode->get_projected_inode();
+  const uint64_t size = pi->dirstat.nfiles + pi->dirstat.nsubdirs;
+  if (dir_max_entries && size >= dir_max_entries) {
+    dout(10) << "entries per dir " << *in << " size exceeds " << dir_max_entries << " (ENOSPC)" << dendl;
+    // reply with the CephFS wire errno, like every other reply in this file,
+    // rather than the host's ENOSPC
+    respond_to_request(mdr, -CEPHFS_ENOSPC);
+    return false;
+  }
+  return true;
+}
+
+
+CDentry* Server::prepare_stray_dentry(MDRequestRef& mdr, CInode *in) /*
+ * Return the stray dentry that `mdr` will use for inode `in`, creating a new
+ * null dentry in the stray directory when none exists yet.
+ *
+ * If mdr->straydn is already set it is returned as is (its name must match
+ * the stray name computed for `in`). On success the dentry is flagged
+ * STATE_STRAY, stored in mdr->straydn and pinned by the request.
+ *
+ * Returns nullptr when
+ *  - the stray dirfrag failed check_fragment_space() (not checked for
+ *    replayed client requests), or
+ *  - the dentry has to be created but the stray dir is frozen; locks and
+ *    local auth pins are dropped and the request is queued for retry.
+ */
+{
+ string straydname;
+ in->name_stray_dentry(straydname);
+
+ CDentry *straydn = mdr->straydn;
+ if (straydn) {
+ ceph_assert(straydn->get_name() == straydname);
+ return straydn;
+ }
+ CDir *straydir = mdcache->get_stray_dir(in);
+
+ if (!mdr->client_request->is_replay() &&
+ !check_fragment_space(mdr, straydir))
+ return nullptr;
+
+ straydn = straydir->lookup(straydname);
+ if (!straydn) {
+ if (straydir->is_frozen_dir()) {
+ dout(10) << __func__ << ": " << *straydir << " is frozen, waiting" << dendl;
+ mds->locker->drop_locks(mdr.get());
+ mdr->drop_local_auth_pins();
+ straydir->add_waiter(CInode::WAIT_UNFREEZE, new C_MDS_RetryRequest(mdcache, mdr));
+ return nullptr;
+ }
+ straydn = straydir->add_null_dentry(straydname);
+ straydn->mark_new();
+ } else {
+ ceph_assert(straydn->get_projected_linkage()->is_null()); // an existing stray dentry must not be linked to anything
+ }
+
+ straydn->state_set(CDentry::STATE_STRAY);
+ mdr->straydn = straydn;
+ mdr->pin(straydn);
+
+ return straydn;
+}
+
+/** prepare_new_inode
+ *
+ * Create a new in-memory inode for a client request, give it an inode
+ * number and fill in its initial attributes, then add it to the cache.
+ *
+ * Inode number: taken from the session's preallocated inos when the session
+ * is open, otherwise (or when none is available) projected from the inode
+ * table. A number that MDCache reports as already taken is discarded and the
+ * allocation is retried. The choice is recorded in mdr->used_prealloc_ino or
+ * mdr->alloc_ino.
+ *
+ * Attributes set here: version, xattr_version, nlink, mode, layout or
+ * dir_layout, truncate_size/seq, uid/gid (gid inherited from a setgid parent
+ * directory), b/c/m/atime from the request's op stamp, change_attr, fscrypt
+ * blobs, optional xattrs decoded from the request payload, and the inline
+ * data version.
+ *
+ * @param mdr     client request creating the inode
+ * @param dir     directory fragment that will contain the new inode
+ * @param useino  inode number suggested by the client; a mismatch with the
+ *                number actually allocated is logged as an error
+ * @param mode    file mode for the new inode
+ * @param layout  file layout for a non-directory inode; when null the
+ *                cache's default file layout is used
+ * @returns the new inode (already added to the cache)
+ */
+CInode* Server::prepare_new_inode(MDRequestRef& mdr, CDir *dir, inodeno_t useino, unsigned mode,
+ const file_layout_t *layout)
+{
+ CInode *in = new CInode(mdcache);
+ auto _inode = in->_get_inode();
+
+ // Server::prepare_force_open_sessions() can re-open session in closing
+ // state. In that corner case, session's prealloc_inos are being freed.
+ // To simplify the code, we disallow using/refilling session's prealloc_ino
+ // while session is opening.
+ bool allow_prealloc_inos = mdr->session->is_open();
+
+ inodeno_t _useino = useino;
+
+ // assign ino
+ do {
+ if (allow_prealloc_inos && (mdr->used_prealloc_ino = _inode->ino = mdr->session->take_ino(_useino))) {
+ if (mdcache->test_and_clear_taken_inos(_inode->ino)) {
+ _inode->ino = 0;
+ dout(10) << "prepare_new_inode used_prealloc " << mdr->used_prealloc_ino
+ << " (" << mdr->session->info.prealloc_inos.size() << " left)"
+ << " but has been taken, will try again!" << dendl;
+ } else {
+ mds->sessionmap.mark_projected(mdr->session);
+ dout(10) << "prepare_new_inode used_prealloc " << mdr->used_prealloc_ino
+ << " (" << mdr->session->info.prealloc_inos.size() << " left)"
+ << dendl;
+ }
+ } else {
+ mdr->alloc_ino =
+ _inode->ino = mds->inotable->project_alloc_id(_useino);
+ if (mdcache->test_and_clear_taken_inos(_inode->ino)) {
+ mds->inotable->apply_alloc_id(_inode->ino);
+ _inode->ino = 0;
+ dout(10) << "prepare_new_inode alloc " << mdr->alloc_ino
+ << " but has been taken, will try again!" << dendl;
+ } else {
+ dout(10) << "prepare_new_inode alloc " << mdr->alloc_ino << dendl;
+ }
+ }
+ _useino = 0; // the client-suggested number is only honoured on the first attempt
+ } while (!_inode->ino);
+
+ if (useino && useino != _inode->ino) {
+ dout(0) << "WARNING: client specified " << useino << " and i allocated " << _inode->ino << dendl;
+ mds->clog->error() << mdr->client_request->get_source()
+ << " specified ino " << useino
+ << " but mds." << mds->get_nodeid() << " allocated " << _inode->ino;
+ //ceph_abort(); // just for now.
+ }
+
+ // refill the session's preallocated inos once fewer than half of
+ // mds_client_prealloc_inos are projected to remain
+ if (allow_prealloc_inos &&
+ mdr->session->get_num_projected_prealloc_inos() < g_conf()->mds_client_prealloc_inos / 2) {
+ int need = g_conf()->mds_client_prealloc_inos - mdr->session->get_num_projected_prealloc_inos();
+ mds->inotable->project_alloc_ids(mdr->prealloc_inos, need);
+ ceph_assert(mdr->prealloc_inos.size()); // or else fix projected increment semantics
+ mdr->session->pending_prealloc_inos.insert(mdr->prealloc_inos);
+ mds->sessionmap.mark_projected(mdr->session);
+ dout(10) << "prepare_new_inode prealloc " << mdr->prealloc_inos << dendl;
+ }
+
+ _inode->version = 1;
+ _inode->xattr_version = 1;
+ _inode->nlink = 1; // FIXME
+
+ _inode->mode = mode;
+
+ // FIPS zeroization audit 20191117: this memset is not security related.
+ memset(&_inode->dir_layout, 0, sizeof(_inode->dir_layout));
+ if (_inode->is_dir()) {
+ _inode->dir_layout.dl_dir_hash = g_conf()->mds_default_dir_hash;
+ } else if (layout) {
+ _inode->layout = *layout;
+ } else {
+ _inode->layout = mdcache->default_file_layout;
+ }
+
+ _inode->truncate_size = -1ull; // not truncated, yet!
+ _inode->truncate_seq = 1; /* starting with 1, 0 is kept for no-truncation logic */
+
+ CInode *diri = dir->get_inode();
+ auto pip = diri->get_projected_inode();
+
+ dout(10) << oct << " dir mode 0" << pip->mode << " new mode 0" << mode << dec << dendl;
+
+ if (pip->mode & S_ISGID) { // parent has the setgid bit: inherit its group (the log text calls this "sticky")
+ dout(10) << " dir is sticky" << dendl;
+ _inode->gid = pip->gid;
+ if (S_ISDIR(mode)) {
+ dout(10) << " new dir also sticky" << dendl;
+ _inode->mode |= S_ISGID;
+ }
+ } else {
+ _inode->gid = mdr->client_request->get_owner_gid();
+ ceph_assert(_inode->gid != (unsigned)-1);
+ }
+
+ _inode->uid = mdr->client_request->get_owner_uid();
+ ceph_assert(_inode->uid != (unsigned)-1);
+
+ _inode->btime = _inode->ctime = _inode->mtime = _inode->atime =
+ mdr->get_op_stamp();
+
+ _inode->change_attr = 0;
+
+ const cref_t<MClientRequest> &req = mdr->client_request;
+
+ dout(10) << "copying fscrypt_auth len " << req->fscrypt_auth.size() << dendl;
+ _inode->fscrypt_auth = req->fscrypt_auth;
+ _inode->fscrypt_file = req->fscrypt_file;
+
+ if (req->get_data().length()) {
+ auto p = req->get_data().cbegin();
+
+ // xattrs on new inode?
+ auto _xattrs = CInode::allocate_xattr_map();
+ decode_noshare(*_xattrs, p);
+ dout(10) << "prepare_new_inode setting xattrs " << *_xattrs << dendl;
+ in->reset_xattrs(std::move(_xattrs));
+ }
+
+ // inline data stays off unless the mdsmap enables it and the client
+ // connection advertises the feature
+ if (!mds->mdsmap->get_inline_data_enabled() ||
+ !mdr->session->get_connection()->has_feature(CEPH_FEATURE_MDS_INLINE_DATA))
+ _inode->inline_data.version = CEPH_INLINE_NONE;
+
+ mdcache->add_inode(in); // add
+ dout(10) << "prepare_new_inode " << *in << dendl;
+ return in;
+}
+
+/**
+ * Record the inode numbers allocated for @p mdr in a journal metablob.
+ *
+ * Passes the request's alloc_ino, used_prealloc_ino and prealloc_inos to
+ * @p blob together with the client's name and the projected versions of the
+ * session map and the inode table.
+ */
+void Server::journal_allocated_inos(MDRequestRef& mdr, EMetaBlob *blob)
+{
+  const auto sessionmapv = mds->sessionmap.get_projected();
+  const auto inotablev = mds->inotable->get_projected_version();
+
+  dout(20) << "journal_allocated_inos sessionmapv " << sessionmapv
+	   << " inotablev " << inotablev
+	   << dendl;
+
+  blob->set_ino_alloc(mdr->alloc_ino,
+		      mdr->used_prealloc_ino,
+		      mdr->prealloc_inos,
+		      mdr->client_request->get_source(),
+		      sessionmapv,
+		      inotablev);
+}
+
+void Server::apply_allocated_inos(MDRequestRef& mdr, Session *session) /*
+ * Apply the inode-number allocations recorded on `mdr` to the inode table
+ * and to `session`:
+ *  - alloc_ino is applied to the inode table;
+ *  - prealloc_inos move from the session's pending set to its free and
+ *    info.prealloc_inos sets and are applied to the inode table;
+ *  - used_prealloc_ino is removed from the session's info.prealloc_inos.
+ * `session` must be non-null whenever one of the two session-related fields
+ * is set; the session is marked dirty in the session map in both cases.
+ */
+{
+ dout(10) << "apply_allocated_inos " << mdr->alloc_ino
+ << " / " << mdr->prealloc_inos
+ << " / " << mdr->used_prealloc_ino << dendl;
+
+ if (mdr->alloc_ino) {
+ mds->inotable->apply_alloc_id(mdr->alloc_ino);
+ }
+ if (mdr->prealloc_inos.size()) {
+ ceph_assert(session);
+ session->pending_prealloc_inos.subtract(mdr->prealloc_inos);
+ session->free_prealloc_inos.insert(mdr->prealloc_inos);
+ session->info.prealloc_inos.insert(mdr->prealloc_inos);
+ mds->sessionmap.mark_dirty(session, !mdr->used_prealloc_ino); // NOTE(review): second argument is false when the block below will mark the session dirty again; its exact meaning is not visible here - confirm
+ mds->inotable->apply_alloc_ids(mdr->prealloc_inos);
+ }
+ if (mdr->used_prealloc_ino) {
+ ceph_assert(session);
+ session->info.prealloc_inos.erase(mdr->used_prealloc_ino);
+ mds->sessionmap.mark_dirty(session);
+ }
+}
+
+struct C_MDS_TryOpenInode : public ServerContext { /*
+ * Completion context that forwards its result code, together with the
+ * stored request and inode number, to Server::_try_open_ino().
+ */
+ MDRequestRef mdr; // request to resume, forward or answer
+ inodeno_t ino; // inode number that was being opened
+ C_MDS_TryOpenInode(Server *s, MDRequestRef& r, inodeno_t i) :
+ ServerContext(s), mdr(r), ino(i) {}
+ void finish(int r) override {
+ server->_try_open_ino(mdr, r, ino);
+ }
+};
+
+/**
+ * Continue a client request once an attempt to open inode @p ino finished.
+ *
+ * @param mdr request that was waiting for the inode
+ * @param r   a rank when >= 0, otherwise a negative error code
+ * @param ino inode number that was looked up (only logged here)
+ *
+ * A non-negative @p r naming this rank re-dispatches the request locally;
+ * any other rank gets the request forwarded to it. On error the client is
+ * answered, with -CEPHFS_ENOENT and -CEPHFS_ENODATA reported as
+ * -CEPHFS_ESTALE.
+ */
+void Server::_try_open_ino(MDRequestRef& mdr, int r, inodeno_t ino)
+{
+  dout(10) << "_try_open_ino " << mdr.get() << " ino " << ino << " r=" << r << dendl;
+
+  if (r < 0) {
+    // give up
+    const bool not_found = (r == -CEPHFS_ENOENT || r == -CEPHFS_ENODATA);
+    respond_to_request(mdr, not_found ? -CEPHFS_ESTALE : r);
+    return;
+  }
+
+  // `r` is a rank from here on
+  const mds_rank_t target(r);
+  if (target != mds->get_nodeid()) {
+    mdcache->request_forward(mdr, target);
+    return;
+  }
+  dispatch_client_request(mdr);
+}
+
+class C_MDS_TryFindInode : public ServerContext { /*
+ * Completion context for an inode search among peers. On -CEPHFS_ESTALE it
+ * either answers the request (inode is in the cache and being purged) or
+ * falls back to MDCache::open_ino(); on any other result the client request
+ * is dispatched again.
+ */
+ MDRequestRef mdr;
+ MDCache *mdcache;
+ inodeno_t ino;
+public:
+ C_MDS_TryFindInode(Server *s, MDRequestRef& r, MDCache *m, inodeno_t i) :
+ ServerContext(s), mdr(r), mdcache(m), ino(i) {}
+ void finish(int r) override {
+ if (r == -CEPHFS_ESTALE) { // :( find_ino_peers failed
+ /*
+ * One known case: if the MDS crashes before the openfiletable
+ * journal could be flushed, the replacement MDS may not load
+ * some already-opened CInodes into the MDCache. When clients
+ * retry requests after reconnecting, the MDS would then return
+ * -ESTALE after failing to find the ino on all active peers.
+ *
+ * As a workaround users can run `ls -R ${mountpoint}`
+ * to list all the sub-files and sub-directories from the
+ * mountpoint.
+ *
+ * So we try to open the ino here and retry the request.
+ */
+ CInode *in = mdcache->get_inode(ino);
+ if (in && in->state_test(CInode::STATE_PURGING))
+ server->respond_to_request(mdr, r);
+ else
+ mdcache->open_ino(ino, (int64_t)-1, new C_MDS_TryOpenInode(server, mdr, ino));
+ } else {
+ server->dispatch_client_request(mdr);
+ }
+ }
+};
+
+/**
+ * Traverse the request's filepath and pin the inode it refers to.
+ *
+ * If the path is already locked (PATH_LOCKED) the previously resolved inode
+ * is returned directly.
+ *
+ * @param mdr          request being processed
+ * @param want_auth    ask the traversal for the auth copy of the inode and
+ *                     auth pin it on success
+ * @param no_want_auth do not turn want_auth on automatically (it is
+ *                     otherwise forced for snapshot paths and when
+ *                     forward_all_requests_to_auth is set)
+ * @returns the referenced inode; nullptr means the request has been handled
+ *          as appropriate: forwarded on, delayed, or the client's been
+ *          replied to.
+ */
+CInode* Server::rdlock_path_pin_ref(MDRequestRef& mdr,
+				    bool want_auth,
+				    bool no_want_auth)
+{
+  const filepath& refpath = mdr->get_filepath();
+  dout(10) << "rdlock_path_pin_ref " << *mdr << " " << refpath << dendl;
+
+  if (mdr->locking_state & MutationImpl::PATH_LOCKED)
+    return mdr->in[0];
+
+  // traverse
+  CF_MDS_RetryRequestFactory cf(mdcache, mdr, true);
+
+  const bool last_snap = refpath.is_last_snap();
+  if (!no_want_auth && (last_snap || forward_all_requests_to_auth))
+    want_auth = true;
+
+  // path and snap rdlocks are only requested for non-snapshot paths
+  int flags = last_snap ? 0 : (MDS_TRAVERSE_RDLOCK_PATH | MDS_TRAVERSE_RDLOCK_SNAP);
+  if (want_auth)
+    flags |= MDS_TRAVERSE_WANT_AUTH;
+
+  const int r = mdcache->path_traverse(mdr, cf, refpath, flags, &mdr->dn[0], &mdr->in[0]);
+  if (r > 0)
+    return nullptr; // delayed
+
+  if (r < 0) { // error
+    if (r == -CEPHFS_ESTALE) {
+      dout(10) << "FAIL on CEPHFS_ESTALE but attempting recovery" << dendl;
+      inodeno_t ino = refpath.get_ino();
+      mdcache->find_ino_peers(ino, new C_MDS_TryFindInode(this, mdr, mdcache, ino));
+    } else {
+      if (r == -CEPHFS_ENOENT && !mdr->dn[0].empty()) {
+	// hand the last traversed dentry back with the reply if the client wants it
+	if (mdr->client_request &&
+	    mdr->client_request->get_dentry_wanted())
+	  mdr->tracedn = mdr->dn[0].back();
+      } else {
+	dout(10) << "FAIL on error " << r << dendl;
+      }
+      respond_to_request(mdr, r);
+    }
+    return nullptr;
+  }
+
+  CInode *ref = mdr->in[0];
+  dout(10) << "ref is " << *ref << dendl;
+
+  if (want_auth) {
+    // auth_pin?
+    //   do NOT proceed if freezing, as cap release may defer in that case, and
+    //   we could deadlock when we try to lock @ref.
+    //   if we're already auth_pinned, continue; the release has already been processed.
+    const bool must_wait =
+      ref->is_frozen() || ref->is_frozen_auth_pin() ||
+      (ref->is_freezing() && !mdr->is_auth_pinned(ref));
+    if (must_wait) {
+      dout(7) << "waiting for !frozen/authpinnable on " << *ref << dendl;
+      ref->add_waiter(CInode::WAIT_UNFREEZE, cf.build());
+      if (mdr->is_any_remote_auth_pin())
+	mds->locker->notify_freeze_waiter(ref);
+      return nullptr;
+    }
+    mdr->auth_pin(ref);
+  }
+
+  // set and pin ref
+  mdr->pin(ref);
+  return ref;
+}
+
+
+/** rdlock_path_xlock_dentry
+ * traverse path to the directory that could/would contain dentry.
+ * make sure i am auth for that dentry (or target inode if it exists and authexist),
+ * forward as necessary. create null dentry in place (or use existing if okexist).
+ * get rdlocks on traversed dentries, xlock on new dentry.
+ *
+ * set authexist true if caller requires the target inode to be auth when it exists.
+ * the tail dentry is not always auth any more if authexist because it is impossible
+ * to ensure tail dentry and target inode are both auth in one mds. the tail dentry
+ * will not be xlocked too if authexist and the target inode exists.
+ *
+ * Replies sent from here before returning nullptr:
+ *  - CEPHFS_EINVAL for an empty path, CEPHFS_EROFS for a snapshot path or a
+ *    protected system directory, CEPHFS_EEXIST / CEPHFS_ENOTEMPTY for a path
+ *    ending in "." or ".." (depending on create);
+ *  - CEPHFS_ENOENT when the parent lives under a stray directory, or when
+ *    the dentry is null and (!create && okexist);
+ *  - CEPHFS_EEXIST when the dentry is linked and !okexist;
+ *  - any other traversal error as is (CEPHFS_ESTALE first goes through
+ *    find_ino_peers recovery).
+ *
+ * @param create      the caller intends to create the dentry
+ * @param okexist     an already linked dentry is acceptable
+ * @param authexist   see above
+ * @param want_layout adds MDS_TRAVERSE_WANT_DIRLAYOUT to the traversal flags
+ * @returns the tail dentry, or nullptr if the request was answered or delayed.
+ *          When the dentry is linked, mdr->in[0] is set to its inode.
+ */
+CDentry* Server::rdlock_path_xlock_dentry(MDRequestRef& mdr,
+ bool create, bool okexist, bool authexist,
+ bool want_layout)
+{
+ const filepath& refpath = mdr->get_filepath();
+ dout(10) << "rdlock_path_xlock_dentry " << *mdr << " " << refpath << dendl;
+
+ if (mdr->locking_state & MutationImpl::PATH_LOCKED)
+ return mdr->dn[0].back();
+
+ // figure parent dir vs dname
+ if (refpath.depth() == 0) {
+ dout(7) << "invalid path (zero length)" << dendl;
+ respond_to_request(mdr, -CEPHFS_EINVAL);
+ return nullptr;
+ }
+
+ if (refpath.is_last_snap()) {
+ respond_to_request(mdr, -CEPHFS_EROFS);
+ return nullptr;
+ }
+
+ if (refpath.is_last_dot_or_dotdot()) {
+ dout(7) << "invalid path (last dot or dot_dot)" << dendl;
+ if (create)
+ respond_to_request(mdr, -CEPHFS_EEXIST);
+ else
+ respond_to_request(mdr, -CEPHFS_ENOTEMPTY);
+ return nullptr;
+ }
+
+ // traverse to parent dir
+ CF_MDS_RetryRequestFactory cf(mdcache, mdr, true);
+ int flags = MDS_TRAVERSE_RDLOCK_SNAP | MDS_TRAVERSE_RDLOCK_PATH |
+ MDS_TRAVERSE_WANT_DENTRY | MDS_TRAVERSE_XLOCK_DENTRY |
+ MDS_TRAVERSE_WANT_AUTH;
+ if (refpath.depth() == 1 && !mdr->lock_cache_disabled)
+ flags |= MDS_TRAVERSE_CHECK_LOCKCACHE;
+ if (create)
+ flags |= MDS_TRAVERSE_RDLOCK_AUTHLOCK;
+ if (authexist)
+ flags |= MDS_TRAVERSE_WANT_INODE;
+ if (want_layout)
+ flags |= MDS_TRAVERSE_WANT_DIRLAYOUT;
+ int r = mdcache->path_traverse(mdr, cf, refpath, flags, &mdr->dn[0]);
+ if (r > 0)
+ return nullptr; // delayed
+ if (r < 0) {
+ if (r == -CEPHFS_ESTALE) {
+ dout(10) << "FAIL on CEPHFS_ESTALE but attempting recovery" << dendl;
+ inodeno_t ino = refpath.get_ino();
+ mdcache->find_ino_peers(ino, new C_MDS_TryFindInode(this, mdr, mdcache, ino));
+ return nullptr;
+ }
+ respond_to_request(mdr, r);
+ return nullptr;
+ }
+
+ CDentry *dn = mdr->dn[0].back();
+ CDir *dir = dn->get_dir();
+ CInode *diri = dir->get_inode();
+
+ if (!mdr->reqid.name.is_mds()) { // restriction applies to requests not issued by an MDS
+ if (diri->is_system() && !diri->is_root() &&
+ (!diri->is_lost_and_found() ||
+ mdr->client_request->get_op() != CEPH_MDS_OP_UNLINK)) { // only unlink inside lost+found is let through
+ respond_to_request(mdr, -CEPHFS_EROFS);
+ return nullptr;
+ }
+ }
+
+ if (!diri->is_base() && diri->get_projected_parent_dir()->inode->is_stray()) {
+ respond_to_request(mdr, -CEPHFS_ENOENT);
+ return nullptr;
+ }
+
+ CDentry::linkage_t *dnl = dn->get_projected_linkage();
+ if (dnl->is_null()) {
+ if (!create && okexist) {
+ respond_to_request(mdr, -CEPHFS_ENOENT);
+ return nullptr;
+ }
+
+ snapid_t next_snap = mdcache->get_global_snaprealm()->get_newest_seq() + 1;
+ dn->first = std::max(dn->first, next_snap); // a null dentry starts no earlier than the next snapshot id
+ } else {
+ if (!okexist) {
+ respond_to_request(mdr, -CEPHFS_EEXIST);
+ return nullptr;
+ }
+ mdr->in[0] = dnl->get_inode();
+ }
+
+ return dn;
+}
+
+/** rdlock_two_paths_xlock_destdn
+ * traverse two paths and lock the two paths in proper order.
+ * The order of taking locks is:
+ * 1. Lock directory inodes or dentries according to which trees they
+ * are under. Lock objects under fs root before objects under mdsdir.
+ * 2. Lock directory inodes or dentries according to their depth, in
+ * ascending order.
+ * 3. Lock directory inodes or dentries according to inode numbers or
+ * dentries' parent inode numbers, in ascending order.
+ * 4. Lock dentries in the same directory in order of their keys.
+ * 5. Lock non-directory inodes according to inode numbers, in ascending
+ * order.
+ *
+ * The request's first filepath names the destination dentry (mdr->dn[0]),
+ * the second filepath the source dentry (mdr->dn[1]); both must have a
+ * depth of exactly 1, otherwise the request is answered with CEPHFS_EINVAL.
+ *
+ * @param xlock_srcdn xlock the source dentry (and wrlock its directory's
+ *                    filelock/nestlock, remotely if another rank is auth)
+ *                    instead of only rdlocking it
+ * @returns (destdn, srcdn) once all locks are held and PATH_LOCKED is set;
+ *          (nullptr, nullptr) when the request was answered, delayed or is
+ *          going through CEPHFS_ESTALE recovery.
+ */
+std::pair<CDentry*, CDentry*>
+Server::rdlock_two_paths_xlock_destdn(MDRequestRef& mdr, bool xlock_srcdn)
+{
+
+ const filepath& refpath = mdr->get_filepath();
+ const filepath& refpath2 = mdr->get_filepath2();
+
+ dout(10) << "rdlock_two_paths_xlock_destdn " << *mdr << " " << refpath << " " << refpath2 << dendl;
+
+ if (mdr->locking_state & MutationImpl::PATH_LOCKED)
+ return std::make_pair(mdr->dn[0].back(), mdr->dn[1].back());
+
+ if (refpath.depth() != 1 || refpath2.depth() != 1) {
+ respond_to_request(mdr, -CEPHFS_EINVAL);
+ return std::pair<CDentry*, CDentry*>(nullptr, nullptr);
+ }
+
+ if (refpath.is_last_snap() || refpath2.is_last_snap()) {
+ respond_to_request(mdr, -CEPHFS_EROFS);
+ return std::make_pair(nullptr, nullptr);
+ }
+
+ // traverse to parent dir
+ CF_MDS_RetryRequestFactory cf(mdcache, mdr, true);
+ int flags = MDS_TRAVERSE_RDLOCK_SNAP | MDS_TRAVERSE_WANT_DENTRY | MDS_TRAVERSE_WANT_AUTH;
+ int r = mdcache->path_traverse(mdr, cf, refpath, flags, &mdr->dn[0]);
+ if (r != 0) { // r > 0 (delayed) falls through both branches and just returns
+ if (r == -CEPHFS_ESTALE) {
+ dout(10) << "CEPHFS_ESTALE on path, attempting recovery" << dendl;
+ inodeno_t ino = refpath.get_ino();
+ mdcache->find_ino_peers(ino, new C_MDS_TryFindInode(this, mdr, mdcache, ino));
+ } else if (r < 0) {
+ respond_to_request(mdr, r);
+ }
+ return std::make_pair(nullptr, nullptr);
+ }
+
+ flags = MDS_TRAVERSE_RDLOCK_SNAP2 | MDS_TRAVERSE_WANT_DENTRY | MDS_TRAVERSE_DISCOVER;
+ r = mdcache->path_traverse(mdr, cf, refpath2, flags, &mdr->dn[1]);
+ if (r != 0) { // same handling as for the first path
+ if (r == -CEPHFS_ESTALE) {
+ dout(10) << "CEPHFS_ESTALE on path2, attempting recovery" << dendl;
+ inodeno_t ino = refpath2.get_ino();
+ mdcache->find_ino_peers(ino, new C_MDS_TryFindInode(this, mdr, mdcache, ino));
+ } else if (r < 0) {
+ respond_to_request(mdr, r);
+ }
+ return std::make_pair(nullptr, nullptr);
+ }
+
+ CDentry *srcdn = mdr->dn[1].back();
+ CDir *srcdir = srcdn->get_dir();
+ CDentry *destdn = mdr->dn[0].back();
+ CDir *destdir = destdn->get_dir();
+
+ if (!mdr->reqid.name.is_mds()) {
+ if ((srcdir->get_inode()->is_system() && !srcdir->get_inode()->is_root()) ||
+ (destdir->get_inode()->is_system() && !destdir->get_inode()->is_root())) {
+ respond_to_request(mdr, -CEPHFS_EROFS);
+ return std::make_pair(nullptr, nullptr);
+ }
+ }
+
+ if (!destdir->get_inode()->is_base() &&
+ destdir->get_inode()->get_projected_parent_dir()->inode->is_stray()) {
+ respond_to_request(mdr, -CEPHFS_ENOENT);
+ return std::make_pair(nullptr, nullptr);
+ }
+
+ MutationImpl::LockOpVec lov;
+ if (srcdir->get_inode() == destdir->get_inode()) { // both dentries live under the same directory inode
+ lov.add_wrlock(&destdir->inode->filelock);
+ lov.add_wrlock(&destdir->inode->nestlock);
+ if (xlock_srcdn && srcdir != destdir) { // same inode but different dirfrags
+ mds_rank_t srcdir_auth = srcdir->authority().first;
+ if (srcdir_auth != mds->get_nodeid()) {
+ lov.add_remote_wrlock(&srcdir->inode->filelock, srcdir_auth);
+ lov.add_remote_wrlock(&srcdir->inode->nestlock, srcdir_auth);
+ }
+ }
+
+ // dentry locks are added in name order: destdn goes first only if its name sorts lower
+ if (srcdn->get_name() > destdn->get_name())
+ lov.add_xlock(&destdn->lock);
+
+ if (xlock_srcdn)
+ lov.add_xlock(&srcdn->lock);
+ else
+ lov.add_rdlock(&srcdn->lock);
+
+ if (srcdn->get_name() < destdn->get_name())
+ lov.add_xlock(&destdn->lock);
+ } else {
+ int cmp = mdr->compare_paths();
+ bool lock_destdir_first =
+ (cmp < 0 || (cmp == 0 && destdir->ino() < srcdir->ino())); // tie on path comparison is broken by inode number
+
+ if (lock_destdir_first) {
+ lov.add_wrlock(&destdir->inode->filelock);
+ lov.add_wrlock(&destdir->inode->nestlock);
+ lov.add_xlock(&destdn->lock);
+ }
+
+ if (xlock_srcdn) {
+ mds_rank_t srcdir_auth = srcdir->authority().first;
+ if (srcdir_auth == mds->get_nodeid()) {
+ lov.add_wrlock(&srcdir->inode->filelock);
+ lov.add_wrlock(&srcdir->inode->nestlock);
+ } else {
+ lov.add_remote_wrlock(&srcdir->inode->filelock, srcdir_auth);
+ lov.add_remote_wrlock(&srcdir->inode->nestlock, srcdir_auth);
+ }
+ lov.add_xlock(&srcdn->lock);
+ } else {
+ lov.add_rdlock(&srcdn->lock);
+ }
+
+ if (!lock_destdir_first) {
+ lov.add_wrlock(&destdir->inode->filelock);
+ lov.add_wrlock(&destdir->inode->nestlock);
+ lov.add_xlock(&destdn->lock);
+ }
+ }
+
+ CInode *auth_pin_freeze = nullptr;
+ // XXX any better way to do this?
+ if (xlock_srcdn && !srcdn->is_auth()) {
+ CDentry::linkage_t *srcdnl = srcdn->get_projected_linkage();
+ auth_pin_freeze = srcdnl->is_primary() ? srcdnl->get_inode() : nullptr; // only a primary-linked source inode is passed on for freezing
+ }
+ if (!mds->locker->acquire_locks(mdr, lov, auth_pin_freeze))
+ return std::make_pair(nullptr, nullptr);
+
+ if (srcdn->get_projected_linkage()->is_null()) {
+ respond_to_request(mdr, -CEPHFS_ENOENT);
+ return std::make_pair(nullptr, nullptr);
+ }
+
+ if (destdn->get_projected_linkage()->is_null()) {
+ snapid_t next_snap = mdcache->get_global_snaprealm()->get_newest_seq() + 1;
+ destdn->first = std::max(destdn->first, next_snap);
+ }
+
+ mdr->locking_state |= MutationImpl::PATH_LOCKED;
+
+ return std::make_pair(destdn, srcdn);
+}
+
+/**
+ * try_open_auth_dirfrag -- get an auth dirfrag, or hand the request off
+ *
+ * An already-open dirfrag is returned only when this rank is auth for it;
+ * otherwise the request is forwarded to the dirfrag's auth.  When the
+ * dirfrag is not open, the request is forwarded to the inode's auth if that
+ * is another rank, or parked until unfreeze if the inode is frozen;
+ * failing both, the dirfrag is opened here.
+ *
+ * @param diri base inode
+ * @param fg the exact frag we want
+ * @param mdr request
+ * @returns the dirfrag, or nullptr if the request was forwarded or queued
+ *          for retry (mdr is taken care of)
+ */
+CDir* Server::try_open_auth_dirfrag(CInode *diri, frag_t fg, MDRequestRef& mdr)
+{
+  if (CDir *open_frag = diri->get_dirfrag(fg)) {
+    // already open: usable only if we are auth for the dirfrag
+    if (open_frag->is_auth())
+      return open_frag;
+
+    mds_rank_t auth = open_frag->authority().first;
+    dout(7) << "try_open_auth_dirfrag: not auth for " << *open_frag
+            << ", fw to mds." << auth << dendl;
+    mdcache->request_forward(mdr, auth);
+    return nullptr;
+  }
+
+  // dirfrag is not open; someone else owns the inode?
+  if (!diri->is_auth()) {
+    mds_rank_t inauth = diri->authority().first;
+    dout(7) << "try_open_auth_dirfrag: not open, not inode auth, fw to mds." << inauth << dendl;
+    mdcache->request_forward(mdr, inauth);
+    return nullptr;
+  }
+
+  // dirfrag is not open and the inode is frozen: retry once it thaws
+  if (diri->is_frozen()) {
+    dout(10) << "try_open_auth_dirfrag: dir inode is frozen, waiting " << *diri << dendl;
+    ceph_assert(diri->get_parent_dir());
+    diri->add_waiter(CInode::WAIT_UNFREEZE, new C_MDS_RetryRequest(mdcache, mdr));
+    return nullptr;
+  }
+
+  // nothing in the way; open (or invent) the dirfrag locally
+  return diri->get_or_open_dirfrag(mdcache, fg);
+}
+
+
+// ===============================================================================
+// STAT
+
+// Handle a client GETATTR or LOOKUP request (is_lookup selects LOOKUP).
+//
+// Flow: optionally start or join a batch of requests that share the same
+// mask on the same dentry (LOOKUP) or inode (GETATTR); resolve the target
+// inode via rdlock_path_pin_ref(); rdlock the inode locks that cover the
+// requested mask bits for which the client does not already hold the EXCL
+// cap; check MAY_READ access; then reply with the inode (plus the dentry
+// for LOOKUP).  Every early return leaves mdr either answered or queued
+// for retry by the callee.
+void Server::handle_client_getattr(MDRequestRef& mdr, bool is_lookup)
+{
+ const cref_t<MClientRequest> &req = mdr->client_request;
+
+ if (req->get_filepath().depth() == 0 && is_lookup) {
+ // refpath can't be empty for lookup but it can for
+ // getattr (we do getattr with empty refpath for mount of '/')
+ respond_to_request(mdr, -CEPHFS_EINVAL);
+ return;
+ }
+
+ bool want_auth = false;
+ int mask = req->head.args.getattr.mask;
+ if (mask & CEPH_STAT_RSTAT)
+ want_auth = true; // set want_auth for CEPH_STAT_RSTAT mask
+
+ // Batching: only attempted for requests that are not already a batch head
+ // and that report themselves as batchable.
+ if (!mdr->is_batch_head() && mdr->can_batch()) {
+ CF_MDS_RetryRequestFactory cf(mdcache, mdr, false);
+ int r = mdcache->path_traverse(mdr, cf, mdr->get_filepath(),
+ (want_auth ? MDS_TRAVERSE_WANT_AUTH : 0),
+ &mdr->dn[0], &mdr->in[0]);
+ if (r > 0)
+ return; // delayed
+
+ if (r < 0) {
+ // fall-thru. let rdlock_path_pin_ref() check again.
+ } else if (is_lookup) {
+ // LOOKUP batches hang off the final dentry, keyed by mask.
+ CDentry* dn = mdr->dn[0].back();
+ mdr->pin(dn);
+ auto em = dn->batch_ops.emplace(std::piecewise_construct, std::forward_as_tuple(mask), std::forward_as_tuple());
+ if (em.second) {
+ // first request for this mask: it becomes the batch head and carries on
+ em.first->second = std::make_unique<Batch_Getattr_Lookup>(this, mdr);
+ } else {
+ // a batch for this mask already exists: join it and stop here
+ dout(20) << __func__ << ": LOOKUP op, wait for previous same getattr ops to respond. " << *mdr << dendl;
+ em.first->second->add_request(mdr);
+ mdr->mark_event("joining batch lookup");
+ return;
+ }
+ } else {
+ // GETATTR batches hang off the inode, keyed by mask.
+ CInode *in = mdr->in[0];
+ mdr->pin(in);
+ auto em = in->batch_ops.emplace(std::piecewise_construct, std::forward_as_tuple(mask), std::forward_as_tuple());
+ if (em.second) {
+ // first request for this mask: it becomes the batch head and carries on
+ em.first->second = std::make_unique<Batch_Getattr_Lookup>(this, mdr);
+ } else {
+ // a batch for this mask already exists: join it and stop here
+ dout(20) << __func__ << ": GETATTR op, wait for previous same getattr ops to respond. " << *mdr << dendl;
+ em.first->second->add_request(mdr);
+ mdr->mark_event("joining batch getattr");
+ return;
+ }
+ }
+ }
+
+ // nullptr means the helper did not produce an inode this time; nothing
+ // more is done with mdr here in that case.
+ CInode *ref = rdlock_path_pin_ref(mdr, want_auth, false);
+ if (!ref)
+ return;
+
+ /*
+ * if client currently holds the EXCL cap on a field, do not rdlock
+ * it; client's stat() will result in valid info if _either_ EXCL
+ * cap is held or MDS rdlocks and reads the value here.
+ *
+ * handling this case here is easier than weakening rdlock
+ * semantics... that would cause problems elsewhere.
+ */
+ client_t client = mdr->get_client();
+ int issued = 0;
+ Capability *cap = ref->get_client_cap(client);
+ // The client's issued caps only count for a head (CEPH_NOSNAP) request or
+ // a snapid not newer than cap->client_follows.
+ if (cap && (mdr->snapid == CEPH_NOSNAP ||
+ mdr->snapid <= cap->client_follows))
+ issued = cap->issued();
+
+ // FIXME
+ MutationImpl::LockOpVec lov;
+ if ((mask & CEPH_CAP_LINK_SHARED) && !(issued & CEPH_CAP_LINK_EXCL))
+ lov.add_rdlock(&ref->linklock);
+ if ((mask & CEPH_CAP_AUTH_SHARED) && !(issued & CEPH_CAP_AUTH_EXCL))
+ lov.add_rdlock(&ref->authlock);
+ if ((mask & CEPH_CAP_XATTR_SHARED) && !(issued & CEPH_CAP_XATTR_EXCL))
+ lov.add_rdlock(&ref->xattrlock);
+ if ((mask & CEPH_CAP_FILE_SHARED) && !(issued & CEPH_CAP_FILE_EXCL)) {
+ // Don't wait on unstable filelock if client is allowed to read file size.
+ // This can reduce the response time of getattr in the case that multiple
+ // clients do stat(2) and there are writers.
+ // The downside of this optimization is that mds may not issue Fs caps along
+ // with getattr reply. Client may need to send more getattr requests.
+ if (mdr->is_rdlocked(&ref->filelock)) {
+ // already rdlocked by this request (e.g. on a retry): keep it in the set
+ lov.add_rdlock(&ref->filelock);
+ } else if (ref->filelock.is_stable() ||
+ ref->filelock.get_num_wrlocks() > 0 ||
+ !ref->filelock.can_read(mdr->get_client())) {
+ /* Since we're taking advantage of an optimization here:
+ *
+ * We cannot suddenly, due to a changing condition, add this filelock as
+ * it can cause lock-order deadlocks. In this case, that condition is the
+ * lock state changes between request retries. If that happens, we need
+ * to check if we've acquired the other locks in this vector. If we have,
+ * then we need to drop those locks and retry.
+ */
+ if (mdr->is_rdlocked(&ref->linklock) ||
+ mdr->is_rdlocked(&ref->authlock) ||
+ mdr->is_rdlocked(&ref->xattrlock)) {
+ /* start over */
+ dout(20) << " dropping locks and restarting request because filelock state change" << dendl;
+ mds->locker->drop_locks(mdr.get());
+ mdr->drop_local_auth_pins();
+ mds->queue_waiter(new C_MDS_RetryRequest(mdcache, mdr));
+ return;
+ }
+ lov.add_rdlock(&ref->filelock);
+ // NOTE(review): ALL_LOCKED is cleared because the lock set just grew;
+ // presumably this makes acquire_locks() process the set again -- confirm.
+ mdr->locking_state &= ~MutationImpl::ALL_LOCKED;
+ }
+ }
+
+ if (!mds->locker->acquire_locks(mdr, lov))
+ return;
+
+ if (!check_access(mdr, ref, MAY_READ))
+ return;
+
+ utime_t now = ceph_clock_now();
+ mdr->set_mds_stamp(now);
+
+ // note which caps are requested, so we return at least a snapshot
+ // value for them. (currently this matters for xattrs and inline data)
+ mdr->getattr_caps = mask;
+
+ mds->balancer->hit_inode(ref, META_POP_IRD);
+
+ // reply
+ dout(10) << "reply to stat on " << *req << dendl;
+ mdr->tracei = ref;
+ // only LOOKUP includes the dentry in the reply trace
+ if (is_lookup)
+ mdr->tracedn = mdr->dn[0].back();
+ respond_to_request(mdr, 0);
+}
+
+// Completion handed to mdcache->open_ino() by the lookup-ino handlers; it
+// passes the outcome on to Server::_lookup_ino_2() for the saved request.
+struct C_MDS_LookupIno2 : public ServerContext {
+  MDRequestRef mdr;
+
+  C_MDS_LookupIno2(Server *srv, MDRequestRef& req)
+    : ServerContext(srv), mdr(req) {}
+
+  void finish(int result) override {
+    server->_lookup_ino_2(mdr, result);
+  }
+};
+
+/*
+ * Look up an inode by number (the request filepath carries only the ino).
+ *
+ * want_parent: reply with the parent directory inode instead of the inode.
+ * want_dentry: reply with the inode together with its parent dentry; when
+ *              filepath2 names a directory ino, the parent must match it.
+ *
+ * A request with a non-zero snapid is handed to _lookup_snap_ino().
+ */
+void Server::handle_client_lookup_ino(MDRequestRef& mdr,
+                                      bool want_parent, bool want_dentry)
+{
+  const cref_t<MClientRequest> &req = mdr->client_request;
+
+  if ((uint64_t)req->head.args.lookupino.snapid > 0) {
+    _lookup_snap_ino(mdr);
+    return;
+  }
+
+  inodeno_t ino = req->get_filepath().get_ino();
+  auto raw_ino = ino.val;
+
+  /* It's been observed [1] that a client may lookup a private ~mdsdir inode.
+   * I do not have an explanation for how that happened organically but this
+   * check will ensure that the client can no longer do that.
+   *
+   * [1] https://tracker.ceph.com/issues/49922
+   */
+  if (MDS_IS_PRIVATE_INO(raw_ino)) {
+    respond_to_request(mdr, -CEPHFS_ESTALE);
+    return;
+  }
+
+  CInode *in = mdcache->get_inode(ino);
+  if (!in) {
+    // not in cache: locate it, then C_MDS_LookupIno2 resumes the request
+    mdcache->open_ino(ino, (int64_t)-1, new C_MDS_LookupIno2(this, mdr), false);
+    return;
+  }
+  if (in->state_test(CInode::STATE_PURGING)) {
+    respond_to_request(mdr, -CEPHFS_ESTALE);
+    return;
+  }
+
+  // check for nothing (not read or write); this still applies the
+  // path check.
+  if (!check_access(mdr, in, 0))
+    return;
+
+  CDentry *dn = in->get_projected_parent_dn();
+  CInode *diri = nullptr;
+  if (dn)
+    diri = dn->get_dir()->inode;
+
+  MutationImpl::LockOpVec lov;
+  if (dn && (want_parent || want_dentry)) {
+    mdr->pin(dn);
+    lov.add_rdlock(&dn->lock);
+  }
+
+  const unsigned mask = req->head.args.lookupino.mask;
+  if (mask != 0) {
+    int issued = 0;
+    Capability *cap = in->get_client_cap(mdr->get_client());
+    if (cap && (mdr->snapid == CEPH_NOSNAP || mdr->snapid <= cap->client_follows))
+      issued = cap->issued();
+    // FIXME
+    // permission bits, ACL/security xattrs
+    if ((mask & CEPH_CAP_AUTH_SHARED) && !(issued & CEPH_CAP_AUTH_EXCL))
+      lov.add_rdlock(&in->authlock);
+    if ((mask & CEPH_CAP_XATTR_SHARED) && !(issued & CEPH_CAP_XATTR_EXCL))
+      lov.add_rdlock(&in->xattrlock);
+
+    mdr->getattr_caps = mask;
+  }
+
+  if (!lov.empty()) {
+    if (!mds->locker->acquire_locks(mdr, lov))
+      return;
+
+    // need read access to directory inode
+    if (diri && !check_access(mdr, diri, MAY_READ))
+      return;
+  }
+
+  if (want_parent) {
+    if (in->is_base()) {
+      respond_to_request(mdr, -CEPHFS_EINVAL);
+      return;
+    }
+    if (!diri || diri->is_stray()) {
+      respond_to_request(mdr, -CEPHFS_ESTALE);
+      return;
+    }
+    dout(10) << "reply to lookup_parent " << *in << dendl;
+    mdr->tracei = diri;
+    respond_to_request(mdr, 0);
+    return;
+  }
+
+  if (want_dentry) {
+    inodeno_t dirino = req->get_filepath2().get_ino();
+    const bool parent_mismatch =
+      (dirino != inodeno_t() && diri && diri->ino() != dirino);
+    if (!diri || parent_mismatch) {
+      respond_to_request(mdr, -CEPHFS_ENOENT);
+      return;
+    }
+    dout(10) << "reply to lookup_name " << *in << dendl;
+    mdr->tracei = in;
+    mdr->tracedn = dn;
+    respond_to_request(mdr, 0);
+    return;
+  }
+
+  dout(10) << "reply to lookup_ino " << *in << dendl;
+  mdr->tracei = in;
+  respond_to_request(mdr, 0);
+}
+
+// Look up a snapshotted inode by (ino, snapid) taken from a lookupino
+// request; the request may also carry a parent directory ino and a dentry
+// hash used to find the dirfrag that should hold the entry.
+//
+// Replies directly when the snapped inode is cached.  Otherwise it either
+// loads more state (opening the parent or the inode, fetching the dirfrag)
+// and lets the request be retried, or answers -CEPHFS_ESTALE / -CEPHFS_EINVAL.
+void Server::_lookup_snap_ino(MDRequestRef& mdr)
+{
+ const cref_t<MClientRequest> &req = mdr->client_request;
+
+ vinodeno_t vino;
+ vino.ino = req->get_filepath().get_ino();
+ vino.snapid = (__u64)req->head.args.lookupino.snapid;
+ inodeno_t parent_ino = (__u64)req->head.args.lookupino.parent;
+ __u32 hash = req->head.args.lookupino.hash;
+
+ dout(7) << "lookup_snap_ino " << vino << " parent " << parent_ino << " hash " << hash << dendl;
+
+ CInode *in = mdcache->lookup_snap_inode(vino);
+ if (!in) {
+ // no snapped inode cached; fall back to whatever is cached under the ino
+ in = mdcache->get_inode(vino.ino);
+ if (in) {
+ if (in->state_test(CInode::STATE_PURGING) ||
+ !in->has_snap_data(vino.snapid)) {
+ // unusable for this snapid: give up for directories or when there is
+ // no parent hint, otherwise continue with the parent-based search
+ if (in->is_dir() || !parent_ino) {
+ respond_to_request(mdr, -CEPHFS_ESTALE);
+ return;
+ }
+ in = NULL;
+ }
+ }
+ }
+
+ if (in) {
+ dout(10) << "reply to lookup_snap_ino " << *in << dendl;
+ mdr->snapid = vino.snapid;
+ mdr->tracei = in;
+ respond_to_request(mdr, 0);
+ return;
+ }
+
+ CInode *diri = NULL;
+ if (parent_ino) {
+ diri = mdcache->get_inode(parent_ino);
+ if (!diri) {
+ // parent not cached: open it; C_MDS_LookupIno2 resumes the request
+ mdcache->open_ino(parent_ino, mds->get_metadata_pool(), new C_MDS_LookupIno2(this, mdr));
+ return;
+ }
+
+ if (!diri->is_dir()) {
+ respond_to_request(mdr, -CEPHFS_EINVAL);
+ return;
+ }
+
+ // rdlock the dirfragtree before using it to map the hash to a frag
+ MutationImpl::LockOpVec lov;
+ lov.add_rdlock(&diri->dirfragtreelock);
+ if (!mds->locker->acquire_locks(mdr, lov))
+ return;
+
+ frag_t frag = diri->dirfragtree[hash];
+ CDir *dir = try_open_auth_dirfrag(diri, frag, mdr);
+ if (!dir)
+ return;
+
+ if (!dir->is_complete()) {
+ if (dir->is_frozen()) {
+ // cannot fetch now: release locks and pins, retry after unfreeze
+ mds->locker->drop_locks(mdr.get());
+ mdr->drop_local_auth_pins();
+ dir->add_waiter(CDir::WAIT_UNFREEZE, new C_MDS_RetryRequest(mdcache, mdr));
+ return;
+ }
+ // load the dirfrag contents, then retry the request
+ dir->fetch(new C_MDS_RetryRequest(mdcache, mdr), true);
+ return;
+ }
+
+ // dirfrag is complete and the snapped inode was still not found above
+ respond_to_request(mdr, -CEPHFS_ESTALE);
+ } else {
+ // no parent hint: try to open the inode itself by number
+ mdcache->open_ino(vino.ino, mds->get_metadata_pool(), new C_MDS_LookupIno2(this, mdr), false);
+ }
+}
+
+// Continuation of an ino lookup once open_ino() has finished.
+// `r` is the rank to send the request to when non-negative, otherwise an
+// error code.
+void Server::_lookup_ino_2(MDRequestRef& mdr, int r)
+{
+  inodeno_t ino = mdr->client_request->get_filepath().get_ino();
+  dout(10) << "_lookup_ino_2 " << mdr.get() << " ino " << ino << " r=" << r << dendl;
+
+  if (r < 0) {
+    // give up; "no such entry / no data" is reported to the client as ESTALE
+    const bool not_found = (r == -CEPHFS_ENOENT || r == -CEPHFS_ENODATA);
+    respond_to_request(mdr, not_found ? -CEPHFS_ESTALE : r);
+    return;
+  }
+
+  mds_rank_t target(r);
+  if (target != mds->get_nodeid()) {
+    mdcache->request_forward(mdr, target);
+    return;
+  }
+
+  // we are the target rank: run the request again locally
+  dispatch_client_request(mdr);
+}
+
+
+/*
+ * Handle a client OPEN of an existing inode.
+ *
+ * Validates the open flags against the inode type, takes the locks needed
+ * for the requested caps mask (plus an xlock on the filelock for O_TRUNC,
+ * which is then handed to do_open_truncate()), checks access, issues caps
+ * (or records immutable snap caps for a snapped open), journals an EOpen
+ * when the open file table asks for it, and replies with the inode and,
+ * if the client wants it, the dentry.
+ *
+ * This function takes responsibility for the passed mdr.
+ */
+void Server::handle_client_open(MDRequestRef& mdr)
+{
+  const cref_t<MClientRequest> &req = mdr->client_request;
+  dout(7) << "open on " << req->get_filepath() << dendl;
+
+  int flags = req->head.args.open.flags;
+  int cmode = ceph_flags_to_mode(flags);
+  if (cmode < 0) {
+    respond_to_request(mdr, -CEPHFS_EINVAL);
+    return;
+  }
+
+  // anything but a read-only open, and any O_TRUNC/O_DIRECTORY open,
+  // is resolved with want_auth set
+  bool need_auth = !file_mode_is_readonly(cmode) ||
+                   (flags & (CEPH_O_TRUNC | CEPH_O_DIRECTORY));
+
+  if ((cmode & CEPH_FILE_MODE_WR) && mdcache->is_readonly()) {
+    dout(7) << "read-only FS" << dendl;
+    respond_to_request(mdr, -CEPHFS_EROFS);
+    return;
+  }
+
+  CInode *cur = rdlock_path_pin_ref(mdr, need_auth);
+  if (!cur)
+    return;
+
+  if (cur->is_frozen() || cur->state_test(CInode::STATE_EXPORTINGCAPS)) {
+    ceph_assert(!need_auth);
+    mdr->locking_state &= ~(MutationImpl::PATH_LOCKED | MutationImpl::ALL_LOCKED);
+    // Redo the path resolution with want_auth set.  Assign to the
+    // function-level `cur`: declaring a new local here would shadow it and
+    // throw away the re-resolved inode once the null check has passed.
+    cur = rdlock_path_pin_ref(mdr, true);
+    if (!cur)
+      return;
+  }
+
+  if (!cur->is_file()) {
+    // can only open non-regular inode with mode FILE_MODE_PIN, at least for now.
+    cmode = CEPH_FILE_MODE_PIN;
+    // the inode is symlink and client wants to follow it, ignore the O_TRUNC flag.
+    if (cur->is_symlink() && !(flags & CEPH_O_NOFOLLOW))
+      flags &= ~CEPH_O_TRUNC;
+  }
+
+  dout(10) << "open flags = " << flags
+           << ", filemode = " << cmode
+           << ", need_auth = " << need_auth
+           << dendl;
+
+  // regular file?
+  /*if (!cur->inode.is_file() && !cur->inode.is_dir()) {
+    dout(7) << "not a file or dir " << *cur << dendl;
+    respond_to_request(mdr, -CEPHFS_ENXIO);                 // FIXME what error do we want?
+    return;
+    }*/
+  if ((flags & CEPH_O_DIRECTORY) && !cur->is_dir() && !cur->is_symlink()) {
+    dout(7) << "specified O_DIRECTORY on non-directory " << *cur << dendl;
+    respond_to_request(mdr, -CEPHFS_EINVAL);
+    return;
+  }
+
+  if ((flags & CEPH_O_TRUNC) && !cur->is_file()) {
+    dout(7) << "specified O_TRUNC on !(file|symlink) " << *cur << dendl;
+    // we should return -CEPHFS_EISDIR for directory, return -CEPHFS_EINVAL for other non-regular
+    respond_to_request(mdr, cur->is_dir() ? -CEPHFS_EISDIR : -CEPHFS_EINVAL);
+    return;
+  }
+
+  if (cur->get_inode()->inline_data.version != CEPH_INLINE_NONE &&
+      !mdr->session->get_connection()->has_feature(CEPH_FEATURE_MDS_INLINE_DATA)) {
+    dout(7) << "old client cannot open inline data file " << *cur << dendl;
+    respond_to_request(mdr, -CEPHFS_EPERM);
+    return;
+  }
+
+  // snapped data is read only
+  if (mdr->snapid != CEPH_NOSNAP &&
+      ((cmode & CEPH_FILE_MODE_WR) || req->may_write())) {
+    dout(7) << "snap " << mdr->snapid << " is read-only " << *cur << dendl;
+    respond_to_request(mdr, -CEPHFS_EROFS);
+    return;
+  }
+
+  MutationImpl::LockOpVec lov;
+  lov.add_rdlock(&cur->snaplock);
+
+  // caps mask the client wants filled in by the reply
+  unsigned mask = req->head.args.open.mask;
+  if (mask) {
+    Capability *cap = cur->get_client_cap(mdr->get_client());
+    int issued = 0;
+    if (cap && (mdr->snapid == CEPH_NOSNAP || mdr->snapid <= cap->client_follows))
+      issued = cap->issued();
+    // permission bits, ACL/security xattrs
+    if ((mask & CEPH_CAP_AUTH_SHARED) && (issued & CEPH_CAP_AUTH_EXCL) == 0)
+      lov.add_rdlock(&cur->authlock);
+    if ((mask & CEPH_CAP_XATTR_SHARED) && (issued & CEPH_CAP_XATTR_EXCL) == 0)
+      lov.add_rdlock(&cur->xattrlock);
+
+    mdr->getattr_caps = mask;
+  }
+
+  // O_TRUNC
+  if ((flags & CEPH_O_TRUNC) && !mdr->has_completed) {
+    ceph_assert(cur->is_auth());
+
+    lov.add_xlock(&cur->filelock);
+    if (!mds->locker->acquire_locks(mdr, lov))
+      return;
+
+    if (!check_access(mdr, cur, MAY_WRITE))
+      return;
+
+    // wait for pending truncate?
+    const auto& pi = cur->get_projected_inode();
+    if (pi->is_truncating()) {
+      dout(10) << " waiting for pending truncate from " << pi->truncate_from
+               << " to " << pi->truncate_size << " to complete on " << *cur << dendl;
+      mds->locker->drop_locks(mdr.get());
+      mdr->drop_local_auth_pins();
+      cur->add_waiter(CInode::WAIT_TRUNC, new C_MDS_RetryRequest(mdcache, mdr));
+      return;
+    }
+
+    do_open_truncate(mdr, cmode);
+    return;
+  }
+
+  // sync filelock if snapped.
+  //  this makes us wait for writers to flushsnaps, ensuring we get accurate metadata,
+  //  and that data itself is flushed so that we can read the snapped data off disk.
+  if (mdr->snapid != CEPH_NOSNAP && !cur->is_dir()) {
+    lov.add_rdlock(&cur->filelock);
+  }
+
+  if (!mds->locker->acquire_locks(mdr, lov))
+    return;
+
+  // access bits to check, kept apart from the caps mask above
+  unsigned access = MAY_READ;
+  if (cmode & CEPH_FILE_MODE_WR)
+    access |= MAY_WRITE;
+  if (!check_access(mdr, cur, access))
+    return;
+
+  utime_t now = ceph_clock_now();
+  mdr->set_mds_stamp(now);
+
+  if (cur->is_file() || cur->is_dir()) {
+    if (mdr->snapid == CEPH_NOSNAP) {
+      // register new cap
+      Capability *cap = mds->locker->issue_new_caps(cur, cmode, mdr, nullptr);
+      if (cap)
+        dout(12) << "open issued caps " << ccap_string(cap->pending())
+                 << " for " << req->get_source()
+                 << " on " << *cur << dendl;
+    } else {
+      // snapped open: no Capability is registered, the caps for the mode
+      // are just recorded on the request
+      int caps = ceph_caps_for_mode(cmode);
+      dout(12) << "open issued IMMUTABLE SNAP caps " << ccap_string(caps)
+               << " for " << req->get_source()
+               << " snapid " << mdr->snapid
+               << " on " << *cur << dendl;
+      mdr->snap_caps = caps;
+    }
+  }
+
+  // increase max_size?
+  if (cmode & CEPH_FILE_MODE_WR)
+    mds->locker->check_inode_max_size(cur);
+
+  // make sure this inode gets into the journal
+  if (cur->is_auth() && cur->last == CEPH_NOSNAP &&
+      mdcache->open_file_table.should_log_open(cur)) {
+    EOpen *le = new EOpen(mds->mdlog);
+    mdlog->start_entry(le);
+    le->add_clean_inode(cur);
+    mdlog->submit_entry(le);
+  }
+
+  // hit pop
+  if (cmode & CEPH_FILE_MODE_WR)
+    mds->balancer->hit_inode(cur, META_POP_IWR);
+  else
+    mds->balancer->hit_inode(cur, META_POP_IRD);
+
+  // include the dentry in the reply only when the client asked for it
+  CDentry *dn = nullptr;
+  if (req->get_dentry_wanted()) {
+    ceph_assert(mdr->dn[0].size());
+    dn = mdr->dn[0].back();
+  }
+
+  mdr->tracei = cur;
+  mdr->tracedn = dn;
+  respond_to_request(mdr, 0);
+}
+
+// Completion that handle_client_openc() passes to journal_and_reply() for
+// an O_CREAT open.  finish() makes the projected creation of `newi` under
+// `dn` real, marks the new inode dirty, applies the request's projected
+// state and replies to the client.  The statements below are kept in this
+// exact order.
+class C_MDS_openc_finish : public ServerLogContext {
+ CDentry *dn; // dentry the new inode is linked under
+ CInode *newi; // the newly created inode
+public:
+ C_MDS_openc_finish(Server *s, MDRequestRef& r, CDentry *d, CInode *ni) :
+ ServerLogContext(s, r), dn(d), newi(ni) {}
+ void finish(int r) override {
+ // any non-zero result is treated as fatal
+ ceph_assert(r == 0);
+
+ // crash current MDS and the replacing MDS will test the journal
+ ceph_assert(!g_conf()->mds_kill_skip_replaying_inotable);
+
+ // the linkage pushed with push_projected_linkage() becomes the real one
+ dn->pop_projected_linkage();
+
+ // dirty inode, dn, dir
+ newi->mark_dirty(mdr->ls);
+ newi->mark_dirty_parent(mdr->ls, true);
+
+ mdr->apply();
+
+ get_mds()->locker->share_inode_max_size(newi);
+
+ // announce the new link; a null request ref is passed here
+ MDRequestRef null_ref;
+ get_mds()->mdcache->send_dentry_link(dn, null_ref);
+
+ get_mds()->balancer->hit_inode(newi, META_POP_IWR);
+
+ server->respond_to_request(mdr, 0);
+
+ // NOTE(review): presumably a fault-injection point that aborts after the
+ // reply when mds_kill_openc_at is 1 -- confirm against the option's docs.
+ ceph_assert(g_conf()->mds_kill_openc_at != 1);
+ }
+};
+
+/*
+ * Handle a client open with O_CREAT.
+ *
+ * The target dentry is obtained from rdlock_path_xlock_dentry().  If it
+ * already has a linkage and O_EXCL was not given, the request is handed to
+ * handle_client_open().  Otherwise a new regular-file inode is prepared
+ * with a layout built from the directory/default layout plus any client
+ * overrides, linked (projected) under the dentry, caps are issued, and an
+ * EUpdate "openc" is journaled; C_MDS_openc_finish completes the work.
+ *
+ * This function takes responsibility for the passed mdr
+ */
+void Server::handle_client_openc(MDRequestRef& mdr)
+{
+ const cref_t<MClientRequest> &req = mdr->client_request;
+ client_t client = mdr->get_client();
+
+ dout(7) << "open w/ O_CREAT on " << req->get_filepath() << dendl;
+
+ int cmode = ceph_flags_to_mode(req->head.args.open.flags);
+ if (cmode < 0) {
+ respond_to_request(mdr, -CEPHFS_EINVAL);
+ return;
+ }
+
+ bool excl = req->head.args.open.flags & CEPH_O_EXCL;
+ CDentry *dn = rdlock_path_xlock_dentry(mdr, true, !excl, true, true);
+ if (!dn)
+ return;
+
+ CDentry::linkage_t *dnl = dn->get_projected_linkage();
+ if (!excl && !dnl->is_null()) {
+ // it existed.
+ ceph_assert(mdr.get()->is_rdlocked(&dn->lock));
+
+ handle_client_open(mdr);
+ return;
+ }
+
+ // from here on we are creating: the dentry must be empty
+ ceph_assert(dnl->is_null());
+
+ if (req->get_alternate_name().size() > alternate_name_max) {
+ dout(10) << " alternate_name longer than " << alternate_name_max << dendl;
+ respond_to_request(mdr, -CEPHFS_ENAMETOOLONG);
+ return;
+ }
+ dn->set_alternate_name(req->get_alternate_name());
+
+ // set layout
+ // (use mdr->dir_layout unless it is default-constructed, else the
+ // cache-wide default file layout)
+ file_layout_t layout;
+ if (mdr->dir_layout != file_layout_t())
+ layout = mdr->dir_layout;
+ else
+ layout = mdcache->default_file_layout;
+
+ // What kind of client caps are required to complete this operation
+ uint64_t access = MAY_WRITE;
+
+ // snapshot of the inherited layout, compared below against the layout
+ // after client overrides
+ const auto default_layout = layout;
+
+ // fill in any special params from client
+ if (req->head.args.open.stripe_unit)
+ layout.stripe_unit = req->head.args.open.stripe_unit;
+ if (req->head.args.open.stripe_count)
+ layout.stripe_count = req->head.args.open.stripe_count;
+ if (req->head.args.open.object_size)
+ layout.object_size = req->head.args.open.object_size;
+ if (req->get_connection()->has_feature(CEPH_FEATURE_CREATEPOOLID) &&
+ (__s32)req->head.args.open.pool >= 0) {
+ layout.pool_id = req->head.args.open.pool;
+
+ // make sure we have as new a map as the client
+ if (req->get_mdsmap_epoch() > mds->mdsmap->get_epoch()) {
+ mds->wait_for_mdsmap(req->get_mdsmap_epoch(), new C_MDS_RetryRequest(mdcache, mdr));
+ return;
+ }
+ }
+
+ // If client doesn't have capability to modify layout pools, then
+ // only permit this request if the requested pool matches what the
+ // file would have inherited anyway from its parent.
+ if (default_layout != layout) {
+ access |= MAY_SET_VXATTR;
+ }
+
+ if (!layout.is_valid()) {
+ dout(10) << " invalid initial file layout" << dendl;
+ respond_to_request(mdr, -CEPHFS_EINVAL);
+ return;
+ }
+ if (!mds->mdsmap->is_data_pool(layout.pool_id)) {
+ dout(10) << " invalid data pool " << layout.pool_id << dendl;
+ respond_to_request(mdr, -CEPHFS_EINVAL);
+ return;
+ }
+
+ // created null dn.
+ CDir *dir = dn->get_dir();
+ CInode *diri = dir->get_inode();
+ if (!check_access(mdr, diri, access))
+ return;
+ if (!check_fragment_space(mdr, dir))
+ return;
+ if (!check_dir_max_entries(mdr, dir))
+ return;
+
+ // only when the traversal recorded exactly one dentry for this request
+ if (mdr->dn[0].size() == 1)
+ mds->locker->create_lock_cache(mdr, diri, &mdr->dir_layout);
+
+ // create inode.
+ CInode *newi = prepare_new_inode(mdr, dn->get_dir(), inodeno_t(req->head.ino),
+ req->head.args.open.mode | S_IFREG, &layout);
+ ceph_assert(newi);
+
+ // it's a file.
+ dn->push_projected_linkage(newi);
+
+ auto _inode = newi->_get_inode();
+ _inode->version = dn->pre_dirty();
+ // remember the default pool as an old pool when the file uses another one
+ if (layout.pool_id != mdcache->default_file_layout.pool_id)
+ _inode->add_old_pool(mdcache->default_file_layout.pool_id);
+ _inode->update_backtrace();
+ _inode->rstat.rfiles = 1;
+ _inode->accounted_rstat = _inode->rstat;
+
+ SnapRealm *realm = diri->find_snaprealm();
+ snapid_t follows = mdcache->get_global_snaprealm()->get_newest_seq();
+ ceph_assert(follows >= realm->get_newest_seq());
+
+ ceph_assert(dn->first == follows+1);
+ newi->first = dn->first;
+
+ // do the open
+ Capability *cap = mds->locker->issue_new_caps(newi, cmode, mdr, realm);
+ newi->authlock.set_state(LOCK_EXCL);
+ newi->xattrlock.set_state(LOCK_EXCL);
+
+ // a writer gets an initial client range of [0, stripe_unit]
+ if (cap && (cmode & CEPH_FILE_MODE_WR)) {
+ _inode->client_ranges[client].range.first = 0;
+ _inode->client_ranges[client].range.last = _inode->layout.stripe_unit;
+ _inode->client_ranges[client].follows = follows;
+ newi->mark_clientwriteable();
+ cap->mark_clientwriteable();
+ }
+
+ // prepare finisher
+ mdr->ls = mdlog->get_current_segment();
+ EUpdate *le = new EUpdate(mdlog, "openc");
+ mdlog->start_entry(le);
+ le->metablob.add_client_req(req->get_reqid(), req->get_oldest_client_tid());
+ journal_allocated_inos(mdr, &le->metablob);
+ mdcache->predirty_journal_parents(mdr, &le->metablob, newi, dn->get_dir(), PREDIRTY_PRIMARY|PREDIRTY_DIR, 1);
+ le->metablob.add_primary_dentry(dn, newi, true, true, true);
+
+ // make sure this inode gets into the journal
+ le->metablob.add_opened_ino(newi->ino());
+
+ C_MDS_openc_finish *fin = new C_MDS_openc_finish(this, mdr, dn, newi);
+
+ // Extra reply payload: sessions with CEPHFS_FEATURE_DELEG_INO get an
+ // openc_response_t; otherwise connections with
+ // CEPH_FEATURE_REPLY_CREATE_INODE get just the new ino.
+ if (mdr->session->info.has_feature(CEPHFS_FEATURE_DELEG_INO)) {
+ openc_response_t ocresp;
+
+ dout(10) << "adding created_ino and delegated_inos" << dendl;
+ ocresp.created_ino = _inode->ino;
+
+ if (delegate_inos_pct && !req->is_queued_for_replay()) {
+ // Try to delegate some prealloc_inos to the client, if it's down to half the max
+ // NOTE(review): the integer division yields frac == 0 when
+ // delegate_inos_pct > 100, and the "/ frac" below would then divide by
+ // zero -- confirm the option is bounded to <= 100.
+ unsigned frac = 100 / delegate_inos_pct;
+ if (mdr->session->delegated_inos.size() < (unsigned)g_conf()->mds_client_prealloc_inos / frac / 2)
+ mdr->session->delegate_inos(g_conf()->mds_client_prealloc_inos / frac, ocresp.delegated_inos);
+ }
+
+ encode(ocresp, mdr->reply_extra_bl);
+ } else if (mdr->client_request->get_connection()->has_feature(CEPH_FEATURE_REPLY_CREATE_INODE)) {
+ dout(10) << "adding ino to reply to indicate inode was created" << dendl;
+ // add the file created flag onto the reply if create_flags features is supported
+ encode(newi->ino(), mdr->reply_extra_bl);
+ }
+
+ journal_and_reply(mdr, newi, dn, le, fin);
+
+ // We hit_dir (via hit_inode) in our finish callback, but by then we might
+ // have overshot the split size (multiple opencs in flight), so here is
+ // an early chance to split the dir if this openc makes it oversized.
+ mds->balancer->maybe_fragment(dir, false);
+}
+
+
+/*
+ * Finish a readdir reply for `dir` (a dirfrag of `diri`).
+ *
+ * Accounts the returned entries on the session, appends the entry count,
+ * the reply flags and the encoded dentries (dnbl) to dirbl, stores the
+ * result as the request's extra reply payload, records the readdir on the
+ * balancer and replies with diri as the traced inode.
+ *
+ * start/end say whether this chunk begins at the start of the frag and
+ * whether it reaches the frag's end.
+ */
+void Server::_finalize_readdir(MDRequestRef& mdr,
+                               CInode *diri,
+                               CDir* dir,
+                               bool start,
+                               bool end,
+                               __u16 flags,
+                               __u32 numfiles,
+                               bufferlist& dirbl,
+                               bufferlist& dnbl)
+{
+  const cref_t<MClientRequest> &req = mdr->client_request;
+
+  mds->get_session(req)->touch_readdir_cap(numfiles);
+
+  if (end) {
+    // FIXME: what purpose does FRAG_COMPLETE serve
+    flags |= start ? (CEPH_READDIR_FRAG_END | CEPH_READDIR_FRAG_COMPLETE)
+                   : CEPH_READDIR_FRAG_END;
+  }
+
+  // finish final blob: count, flags, then the dentries themselves
+  encode(numfiles, dirbl);
+  encode(flags, dirbl);
+  dirbl.claim_append(dnbl);
+
+  // yay, reply
+  dout(10) << "reply to " << *req << " readdir num=" << numfiles
+           << " bytes=" << dirbl.length()
+           << " start=" << (int)start
+           << " end=" << (int)end
+           << dendl;
+  mdr->reply_extra_bl = dirbl;
+
+  // bump popularity.  NOTE: this doesn't quite capture it.
+  mds->balancer->hit_dir(dir, META_POP_READDIR, numfiles);
+
+  // reply
+  mdr->tracei = diri;
+  respond_to_request(mdr, 0);
+}
+
+void Server::handle_client_readdir(MDRequestRef& mdr)
+{
+ const cref_t<MClientRequest> &req = mdr->client_request;
+ Session *session = mds->get_session(req);
+ client_t client = req->get_source().num();
+ MutationImpl::LockOpVec lov;
+ CInode *diri = rdlock_path_pin_ref(mdr, false, true);
+ if (!diri) return;
+
+ // it's a directory, right?
+ if (!diri->is_dir()) {
+ // not a dir
+ dout(10) << "reply to " << *req << " readdir -CEPHFS_ENOTDIR" << dendl;
+ respond_to_request(mdr, -CEPHFS_ENOTDIR);
+ return;
+ }
+
+ auto num_caps = session->get_num_caps();
+ auto session_cap_acquisition = session->get_cap_acquisition();
+
+ if (num_caps > static_cast<uint64_t>(max_caps_per_client * max_caps_throttle_ratio) && session_cap_acquisition >= cap_acquisition_throttle) {
+ dout(20) << "readdir throttled. max_caps_per_client: " << max_caps_per_client << " num_caps: " << num_caps
+ << " session_cap_acquistion: " << session_cap_acquisition << " cap_acquisition_throttle: " << cap_acquisition_throttle << dendl;
+ if (logger)
+ logger->inc(l_mdss_cap_acquisition_throttle);
+
+ mdr->mark_event("cap_acquisition_throttle");
+ mds->timer.add_event_after(caps_throttle_retry_request_timeout, new C_MDS_RetryRequest(mdcache, mdr));
+ return;
+ }
+
+ lov.add_rdlock(&diri->filelock);
+ lov.add_rdlock(&diri->dirfragtreelock);
+
+ if (!mds->locker->acquire_locks(mdr, lov))
+ return;
+
+ if (!check_access(mdr, diri, MAY_READ))
+ return;
+
+ // which frag?
+ frag_t fg = (__u32)req->head.args.readdir.frag;
+ unsigned req_flags = (__u32)req->head.args.readdir.flags;
+ string offset_str = req->get_path2();
+
+ __u32 offset_hash = 0;
+ if (!offset_str.empty())
+ offset_hash = ceph_frag_value(diri->hash_dentry_name(offset_str));
+ else
+ offset_hash = (__u32)req->head.args.readdir.offset_hash;
+
+ dout(10) << " frag " << fg << " offset '" << offset_str << "'"
+ << " offset_hash " << offset_hash << " flags " << req_flags << dendl;
+
+ // does the frag exist?
+ if (diri->dirfragtree[fg.value()] != fg) {
+ frag_t newfg;
+ if (req_flags & CEPH_READDIR_REPLY_BITFLAGS) {
+ if (fg.contains((unsigned)offset_hash)) {
+ newfg = diri->dirfragtree[offset_hash];
+ } else {
+ // client actually wants next frag
+ newfg = diri->dirfragtree[fg.value()];
+ }
+ } else {
+ offset_str.clear();
+ newfg = diri->dirfragtree[fg.value()];
+ }
+ dout(10) << " adjust frag " << fg << " -> " << newfg << " " << diri->dirfragtree << dendl;
+ fg = newfg;
+ }
+
+ CDir *dir = try_open_auth_dirfrag(diri, fg, mdr);
+ if (!dir) return;
+
+ // ok!
+ dout(10) << "handle_client_readdir on " << *dir << dendl;
+ ceph_assert(dir->is_auth());
+
+ if (!dir->is_complete()) {
+ if (dir->is_frozen()) {
+ dout(7) << "dir is frozen " << *dir << dendl;
+ mds->locker->drop_locks(mdr.get());
+ mdr->drop_local_auth_pins();
+ dir->add_waiter(CDir::WAIT_UNFREEZE, new C_MDS_RetryRequest(mdcache, mdr));
+ return;
+ }
+ // fetch
+ dout(10) << " incomplete dir contents for readdir on " << *dir << ", fetching" << dendl;
+ dir->fetch(new C_MDS_RetryRequest(mdcache, mdr), true);
+ return;
+ }
+
+#ifdef MDS_VERIFY_FRAGSTAT
+ dir->verify_fragstat();
+#endif
+
+ utime_t now = ceph_clock_now();
+ mdr->set_mds_stamp(now);
+
+ snapid_t snapid = mdr->snapid;
+ dout(10) << "snapid " << snapid << dendl;
+
+ SnapRealm *realm = diri->find_snaprealm();
+
+ unsigned max = req->head.args.readdir.max_entries;
+ if (!max)
+ max = dir->get_num_any(); // whatever, something big.
+ unsigned max_bytes = req->head.args.readdir.max_bytes;
+ if (!max_bytes)
+ // make sure at least one item can be encoded
+ max_bytes = (512 << 10) + g_conf()->mds_max_xattr_pairs_size;
+
+ // start final blob
+ bufferlist dirbl;
+ DirStat ds;
+ ds.frag = dir->get_frag();
+ ds.auth = dir->get_dir_auth().first;
+ if (dir->is_auth() && !forward_all_requests_to_auth)
+ dir->get_dist_spec(ds.dist, mds->get_nodeid());
+
+ dir->encode_dirstat(dirbl, mdr->session->info, ds);
+
+ // count bytes available.
+ // this isn't perfect, but we should capture the main variable/unbounded size items!
+ int front_bytes = dirbl.length() + sizeof(__u32) + sizeof(__u8)*2;
+ int bytes_left = max_bytes - front_bytes;
+ bytes_left -= get_snap_trace(session, realm).length();
+
+ // build dir contents
+ bufferlist dnbl;
+ __u32 numfiles = 0;
+ bool start = !offset_hash && offset_str.empty();
+ // skip all dns < dentry_key_t(snapid, offset_str, offset_hash)
+ dentry_key_t skip_key(snapid, offset_str.c_str(), offset_hash);
+ auto it = start ? dir->begin() : dir->lower_bound(skip_key);
+ bool end = (it == dir->end());
+ for (; !end && numfiles < max; end = (it == dir->end())) {
+ CDentry *dn = it->second;
+ ++it;
+
+ if (dn->state_test(CDentry::STATE_PURGING))
+ continue;
+
+ bool dnp = dn->use_projected(client, mdr);
+ CDentry::linkage_t *dnl = dnp ? dn->get_projected_linkage() : dn->get_linkage();
+
+ if (dnl->is_null()) {
+ if (dn->get_num_ref() == 0 && !dn->is_projected())
+ dir->remove_dentry(dn);
+ continue;
+ }
+
+ if (dn->last < snapid || dn->first > snapid) {
+ dout(20) << "skipping non-overlapping snap " << *dn << dendl;
+ continue;
+ }
+
+ if (!start) {
+ dentry_key_t offset_key(dn->last, offset_str.c_str(), offset_hash);
+ if (!(offset_key < dn->key()))
+ continue;
+ }
+
+ CInode *in = dnl->get_inode();
+
+ if (in && in->ino() == CEPH_INO_CEPH)
+ continue;
+
+ // remote link?
+ // better for the MDS to do the work, if we think the client will stat any of these files.
+ if (dnl->is_remote() && !in) {
+ in = mdcache->get_inode(dnl->get_remote_ino());
+ if (in) {
+ dn->link_remote(dnl, in);
+ } else if (dn->state_test(CDentry::STATE_BADREMOTEINO)) {
+ dout(10) << "skipping bad remote ino on " << *dn << dendl;
+ continue;
+ } else {
+ // touch everything i _do_ have
+ for (auto &p : *dir) {
+ if (!p.second->get_linkage()->is_null())
+ mdcache->lru.lru_touch(p.second);
+ }
+
+ // already issued caps and leases, reply immediately.
+ if (dnbl.length() > 0) {
+ mdcache->open_remote_dentry(dn, dnp, new C_MDSInternalNoop);
+ dout(10) << " open remote dentry after caps were issued, stopping at "
+ << dnbl.length() << " < " << bytes_left << dendl;
+ break;
+ }
+
+ mds->locker->drop_locks(mdr.get());
+ mdr->drop_local_auth_pins();
+ mdcache->open_remote_dentry(dn, dnp, new C_MDS_RetryRequest(mdcache, mdr));
+ return;
+ }
+ }
+ ceph_assert(in);
+
+ if ((int)(dnbl.length() + dn->get_name().length() + sizeof(__u32) + sizeof(LeaseStat)) > bytes_left) {
+ dout(10) << " ran out of room, stopping at " << dnbl.length() << " < " << bytes_left << dendl;
+ break;
+ }
+
+ unsigned start_len = dnbl.length();
+
+ // dentry
+ dout(12) << "including dn " << *dn << dendl;
+ encode(dn->get_name(), dnbl);
+ mds->locker->issue_client_lease(dn, in, mdr, now, dnbl);
+
+ // inode
+ dout(12) << "including inode in " << *in << " snap " << snapid << dendl;
+ int r = in->encode_inodestat(dnbl, mdr->session, realm, snapid, bytes_left - (int)dnbl.length());
+ if (r < 0) {
+ // chop off dn->name, lease
+ dout(10) << " ran out of room, stopping at " << start_len << " < " << bytes_left << dendl;
+ bufferlist keep;
+ keep.substr_of(dnbl, 0, start_len);
+ dnbl.swap(keep);
+ break;
+ }
+ ceph_assert(r >= 0);
+ numfiles++;
+
+ // touch dn
+ mdcache->lru.lru_touch(dn);
+ }
+ __u16 flags = 0;
+ // client only understand END and COMPLETE flags ?
+ if (req_flags & CEPH_READDIR_REPLY_BITFLAGS) {
+ flags |= CEPH_READDIR_HASH_ORDER | CEPH_READDIR_OFFSET_HASH;
+ }
+ _finalize_readdir(mdr, diri, dir, start, end, flags, numfiles, dirbl, dnbl);
+}
+
+
+
+// ===============================================================================
+// INODE UPDATES
+
+
+/*
+ * Finisher handed to journal_and_reply() by the basic inode update
+ * paths (setattr, open-truncate, setlayout, ...).
+ *
+ * The optional constructor flags select extra work done in finish():
+ *   sm -> truncating_smaller: start the truncate if one is still pending
+ *   cr -> changed_ranges:     share the inode's max_size after replying
+ *   ar -> adjust_realm:       send snap update + realm notifications
+ */
+class C_MDS_inode_update_finish : public ServerLogContext {
+  CInode *in;
+  bool truncating_smaller, changed_ranges, adjust_realm;
+public:
+  C_MDS_inode_update_finish(Server *s, MDRequestRef& r, CInode *i,
+                            bool sm=false, bool cr=false, bool ar=false) :
+    ServerLogContext(s, r), in(i),
+    truncating_smaller(sm), changed_ranges(cr), adjust_realm(ar) { }
+  void finish(int r) override {
+    ceph_assert(r == 0);
+
+    // pick the snap op from the snaprealm pointer as it is *before* apply()
+    const int snap_op = in->snaprealm ? CEPH_SNAP_OP_UPDATE : CEPH_SNAP_OP_SPLIT;
+
+    // make the projected changes live
+    mdr->apply();
+
+    MDSRank *rank = get_mds();
+
+    // tell clients about a truncate that is still in progress
+    const bool start_truncate =
+      truncating_smaller && in->get_inode()->is_truncating();
+    if (start_truncate) {
+      rank->locker->issue_truncate(in);
+      rank->mdcache->truncate_inode(in, mdr->ls);
+    }
+
+    if (adjust_realm) {
+      rank->mdcache->send_snap_update(in, 0, snap_op);
+      rank->mdcache->do_realm_invalidate_and_update_notify(in, snap_op);
+    }
+
+    rank->balancer->hit_inode(in, META_POP_IWR);
+
+    server->respond_to_request(mdr, 0);
+
+    // max_size is shared only once the reply has been sent
+    if (changed_ranges)
+      rank->locker->share_inode_max_size(in);
+  }
+};
+
+/*
+ * Handle a client request that sets, removes or interrupts an advisory
+ * file lock.  filelock_change.rule selects the flock or fcntl lock state
+ * (the *_INTR variants additionally mark the request as an interrupt);
+ * filelock_change.type selects lock vs. unlock.
+ *
+ * Replies:
+ *   0                     lock added / unlock processed
+ *   -CEPHFS_EOPNOTSUPP    unknown rule
+ *   -CEPHFS_EINTR         we were waiting but the waiting entry is gone
+ *   -CEPHFS_EDEADLK       add_lock() reported a deadlock
+ *   -CEPHFS_EWOULDBLOCK   lock not available and the client won't wait
+ * If the client is willing to wait, the request is parked on
+ * CInode::WAIT_FLOCK and retried later; no reply is sent in that case.
+ */
+void Server::handle_client_file_setlock(MDRequestRef& mdr)
+{
+  const cref_t<MClientRequest> &req = mdr->client_request;
+  MutationImpl::LockOpVec lov;
+
+  // get the inode to operate on, and set up any locks needed for that
+  CInode *cur = rdlock_path_pin_ref(mdr, true);
+  if (!cur)
+    return;
+
+  lov.add_xlock(&cur->flocklock);
+  /* acquire_locks will return true if it gets the locks. If it fails,
+     it will redeliver this request at a later date, so drop the request.
+   */
+  if (!mds->locker->acquire_locks(mdr, lov)) {
+    dout(10) << "handle_client_file_setlock could not get locks!" << dendl;
+    return;
+  }
+
+  // copy the lock change into a ceph_filelock so we can store/apply it
+  ceph_filelock set_lock;
+  set_lock.start = req->head.args.filelock_change.start;
+  set_lock.length = req->head.args.filelock_change.length;
+  set_lock.client = req->get_orig_source().num();
+  set_lock.owner = req->head.args.filelock_change.owner;
+  set_lock.pid = req->head.args.filelock_change.pid;
+  set_lock.type = req->head.args.filelock_change.type;
+  bool will_wait = req->head.args.filelock_change.wait;
+
+  dout(10) << "handle_client_file_setlock: " << set_lock << dendl;
+
+  ceph_lock_state_t *lock_state = nullptr;
+  bool interrupt = false;
+
+  // get the appropriate lock state
+  switch (req->head.args.filelock_change.rule) {
+  case CEPH_LOCK_FLOCK_INTR:
+    interrupt = true;
+    // fall-thru
+  case CEPH_LOCK_FLOCK:
+    lock_state = cur->get_flock_lock_state();
+    break;
+
+  case CEPH_LOCK_FCNTL_INTR:
+    interrupt = true;
+    // fall-thru
+  case CEPH_LOCK_FCNTL:
+    lock_state = cur->get_fcntl_lock_state();
+    break;
+
+  default:
+    // It is the *rule* that was not recognised, not the lock type; log it
+    // as an integer so it is not streamed as a raw character.
+    dout(10) << "got unknown lock rule "
+             << static_cast<int>(req->head.args.filelock_change.rule)
+             << ", rejecting request!" << dendl;
+    respond_to_request(mdr, -CEPHFS_EOPNOTSUPP);
+    return;
+  }
+
+  dout(10) << " state prior to lock change: " << *lock_state << dendl;
+  if (CEPH_LOCK_UNLOCK == set_lock.type) {
+    list<ceph_filelock> activated_locks;
+    MDSContext::vec waiters;
+    if (lock_state->is_waiting(set_lock)) {
+      // the lock was never granted: just cancel the pending wait
+      dout(10) << " unlock removing waiting lock " << set_lock << dendl;
+      lock_state->remove_waiting(set_lock);
+      cur->take_waiting(CInode::WAIT_FLOCK, waiters);
+    } else if (!interrupt) {
+      // a plain unlock of a held lock; an interrupt never removes held locks
+      dout(10) << " unlock attempt on " << set_lock << dendl;
+      lock_state->remove_lock(set_lock, activated_locks);
+      cur->take_waiting(CInode::WAIT_FLOCK, waiters);
+    }
+    mds->queue_waiters(waiters);
+
+    respond_to_request(mdr, 0);
+  } else {
+    dout(10) << " lock attempt on " << set_lock << dendl;
+    bool deadlock = false;
+    if (mdr->more()->flock_was_waiting &&
+        !lock_state->is_waiting(set_lock)) {
+      dout(10) << " was waiting for lock but not anymore, must have been canceled " << set_lock << dendl;
+      respond_to_request(mdr, -CEPHFS_EINTR);
+    } else if (!lock_state->add_lock(set_lock, will_wait, mdr->more()->flock_was_waiting, &deadlock)) {
+      dout(10) << " it failed on this attempt" << dendl;
+      // couldn't set lock right now
+      if (deadlock) {
+        respond_to_request(mdr, -CEPHFS_EDEADLK);
+      } else if (!will_wait) {
+        respond_to_request(mdr, -CEPHFS_EWOULDBLOCK);
+      } else {
+        // park the request: release everything we hold and retry when
+        // the WAIT_FLOCK waiters on this inode are woken
+        dout(10) << " added to waiting list" << dendl;
+        ceph_assert(lock_state->is_waiting(set_lock));
+        mdr->more()->flock_was_waiting = true;
+        mds->locker->drop_locks(mdr.get());
+        mdr->drop_local_auth_pins();
+        mdr->mark_event("failed to add lock, waiting");
+        mdr->mark_nowarn();
+        cur->add_waiter(CInode::WAIT_FLOCK, new C_MDS_RetryRequest(mdcache, mdr));
+      }
+    } else
+      respond_to_request(mdr, 0);
+  }
+  dout(10) << " state after lock change: " << *lock_state << dendl;
+}
+
+/*
+ * Handle a client query about an advisory file lock: build a
+ * ceph_filelock from the request, pass it to look_for_lock() on the
+ * flock or fcntl lock state (selected by filelock_change.rule), and
+ * return the encoded ceph_filelock in the reply's extra bufferlist.
+ *
+ * Replies -CEPHFS_EINVAL for an unknown rule, 0 otherwise.
+ */
+void Server::handle_client_file_readlock(MDRequestRef& mdr)
+{
+  const cref_t<MClientRequest> &req = mdr->client_request;
+  MutationImpl::LockOpVec lov;
+
+  // get the inode to operate on, and set up any locks needed for that
+  CInode *cur = rdlock_path_pin_ref(mdr, true);
+  if (!cur)
+    return;
+
+  /* acquire_locks will return true if it gets the locks. If it fails,
+     it will redeliver this request at a later date, so drop the request.
+  */
+  lov.add_rdlock(&cur->flocklock);
+  if (!mds->locker->acquire_locks(mdr, lov)) {
+    dout(10) << "handle_client_file_readlock could not get locks!" << dendl;
+    return;
+  }
+
+  // copy the lock change into a ceph_filelock so we can store/apply it
+  ceph_filelock checking_lock;
+  checking_lock.start = req->head.args.filelock_change.start;
+  checking_lock.length = req->head.args.filelock_change.length;
+  checking_lock.client = req->get_orig_source().num();
+  checking_lock.owner = req->head.args.filelock_change.owner;
+  checking_lock.pid = req->head.args.filelock_change.pid;
+  checking_lock.type = req->head.args.filelock_change.type;
+
+  // get the appropriate lock state
+  ceph_lock_state_t *lock_state = nullptr;
+  switch (req->head.args.filelock_change.rule) {
+  case CEPH_LOCK_FLOCK:
+    lock_state = cur->get_flock_lock_state();
+    break;
+
+  case CEPH_LOCK_FCNTL:
+    lock_state = cur->get_fcntl_lock_state();
+    break;
+
+  default:
+    // It is the *rule* that was not recognised, not the lock type; log it
+    // as an integer so it is not streamed as a raw character.
+    dout(10) << "got unknown lock rule "
+             << static_cast<int>(req->head.args.filelock_change.rule)
+             << dendl;
+    respond_to_request(mdr, -CEPHFS_EINVAL);
+    return;
+  }
+  // checking_lock is passed by non-const name and encoded afterwards, so
+  // presumably look_for_lock() updates it in place -- confirm in flock.h
+  lock_state->look_for_lock(checking_lock);
+
+  bufferlist lock_bl;
+  encode(checking_lock, lock_bl);
+
+  mdr->reply_extra_bl = lock_bl;
+  respond_to_request(mdr, 0);
+}
+
+/*
+ * Handle CEPH_MDS_OP_SETATTR: apply the attribute changes selected by
+ * setattr.mask (mode, uid, gid, times, size, fscrypt fields) to the
+ * target inode, journal them as a "setattr" EUpdate and reply through
+ * journal_and_reply().
+ *
+ * Early replies:
+ *   -CEPHFS_EROFS   target is a snapshot
+ *   -CEPHFS_EPERM   system inode that is not a base inode, or an old
+ *                   (header version < 6) client touching an fscrypted inode
+ *   -CEPHFS_EINVAL  old client setting fscrypt fields, or oversized
+ *                   fscrypt last-block payload
+ *   -CEPHFS_ENOSPC  size increase while is_full is set
+ *   -CEPHFS_EAGAIN  fscrypt truncate whose change_attr is stale
+ * A shrinking truncate that races with a pending truncate is parked on
+ * CInode::WAIT_TRUNC and retried.
+ */
+void Server::handle_client_setattr(MDRequestRef& mdr)
+{
+  const cref_t<MClientRequest> &req = mdr->client_request;
+  MutationImpl::LockOpVec lov;
+  CInode *cur = rdlock_path_pin_ref(mdr, true);
+  if (!cur) return;
+
+  if (mdr->snapid != CEPH_NOSNAP) {
+    respond_to_request(mdr, -CEPHFS_EROFS);
+    return;
+  }
+  if (cur->ino() < MDS_INO_SYSTEM_BASE && !cur->is_base()) {
+    respond_to_request(mdr, -CEPHFS_EPERM);
+    return;
+  }
+
+  __u32 mask = req->head.args.setattr.mask;
+  __u32 access_mask = MAY_WRITE;
+
+  if (req->get_header().version < 6) {
+    // No changes to fscrypted inodes by downrevved clients
+    if (!cur->get_inode()->fscrypt_auth.empty()) {
+      respond_to_request(mdr, -CEPHFS_EPERM);
+      return;
+    }
+
+    // Only allow fscrypt field changes by capable clients
+    if (mask & (CEPH_SETATTR_FSCRYPT_FILE|CEPH_SETATTR_FSCRYPT_AUTH)) {
+      respond_to_request(mdr, -CEPHFS_EINVAL);
+      return;
+    }
+  }
+
+  // xlock inode
+  if (mask & (CEPH_SETATTR_MODE|CEPH_SETATTR_UID|CEPH_SETATTR_GID|CEPH_SETATTR_BTIME|CEPH_SETATTR_KILL_SGUID|CEPH_SETATTR_FSCRYPT_AUTH|CEPH_SETATTR_KILL_SUID|CEPH_SETATTR_KILL_SGID))
+    lov.add_xlock(&cur->authlock);
+  if (mask & (CEPH_SETATTR_MTIME|CEPH_SETATTR_ATIME|CEPH_SETATTR_SIZE|CEPH_SETATTR_FSCRYPT_FILE))
+    lov.add_xlock(&cur->filelock);
+  if (mask & CEPH_SETATTR_CTIME)
+    lov.add_wrlock(&cur->versionlock);
+
+  if (!mds->locker->acquire_locks(mdr, lov))
+    return;
+
+  // changing the owner/group to a different value needs extra permission
+  if ((mask & CEPH_SETATTR_UID) && (cur->get_inode()->uid != req->head.args.setattr.uid))
+    access_mask |= MAY_CHOWN;
+
+  if ((mask & CEPH_SETATTR_GID) && (cur->get_inode()->gid != req->head.args.setattr.gid))
+    access_mask |= MAY_CHGRP;
+
+  if (!check_access(mdr, cur, access_mask))
+    return;
+
+  // trunc from bigger -> smaller?
+  const auto& pip = cur->get_projected_inode();
+
+  // use the larger of our projected size and the size the client believes
+  uint64_t old_size = std::max<uint64_t>(pip->size, req->head.args.setattr.old_size);
+
+  // CEPHFS_ENOSPC on growing file while full, but allow shrinks.
+  // Only look at setattr.size when the client actually asked for a size
+  // change; otherwise the field carries no meaning for this request.
+  if (is_full && (mask & CEPH_SETATTR_SIZE) &&
+      req->head.args.setattr.size > old_size) {
+    dout(20) << __func__ << ": full, responding CEPHFS_ENOSPC to setattr with larger size" << dendl;
+    respond_to_request(mdr, -CEPHFS_ENOSPC);
+    return;
+  }
+
+  bool truncating_smaller = false;
+  if (mask & CEPH_SETATTR_SIZE) {
+    if (req->get_data().length() >
+        sizeof(struct ceph_fscrypt_last_block_header) + fscrypt_last_block_max_size) {
+      dout(10) << __func__ << ": the last block size is too large" << dendl;
+      respond_to_request(mdr, -CEPHFS_EINVAL);
+      return;
+    }
+
+    // a same-size request that carries a data payload is also handled as
+    // a shrinking truncate
+    truncating_smaller = req->head.args.setattr.size < old_size ||
+      (req->head.args.setattr.size == old_size && req->get_data().length());
+    if (truncating_smaller && pip->is_truncating()) {
+      dout(10) << " waiting for pending truncate from " << pip->truncate_from
+               << " to " << pip->truncate_size << " to complete on " << *cur << dendl;
+      mds->locker->drop_locks(mdr.get());
+      mdr->drop_local_auth_pins();
+      cur->add_waiter(CInode::WAIT_TRUNC, new C_MDS_RetryRequest(mdcache, mdr));
+      return;
+    }
+
+    if (truncating_smaller && req->get_data().length()) {
+      // only change_attr is decoded here; file_offset and block_size stay
+      // zero from the memset and are logged as such below
+      struct ceph_fscrypt_last_block_header header;
+      memset(&header, 0, sizeof(header));
+      auto bl = req->get_data().cbegin();
+      DECODE_START(1, bl);
+      decode(header.change_attr, bl);
+      DECODE_FINISH(bl);
+
+      dout(20) << __func__ << " mdr->retry:" << mdr->retry
+               << " header.change_attr: " << header.change_attr
+               << " header.file_offset: " << header.file_offset
+               << " header.block_size: " << header.block_size
+               << dendl;
+
+      if (header.change_attr != pip->change_attr) {
+        dout(5) << __func__ << ": header.change_attr:" << header.change_attr
+                << " != current change_attr:" << pip->change_attr
+                << ", let client retry it!" << dendl;
+        // flush the journal to make sure the clients will get the latest
+        // change_attr as possible for the next retry
+        mds->mdlog->flush();
+        respond_to_request(mdr, -CEPHFS_EAGAIN);
+        return;
+      }
+    }
+  }
+
+  bool changed_ranges = false;
+
+  // project update
+  mdr->ls = mdlog->get_current_segment();
+  EUpdate *le = new EUpdate(mdlog, "setattr");
+  mdlog->start_entry(le);
+
+  auto pi = cur->project_inode(mdr);
+
+  if (mask & CEPH_SETATTR_UID)
+    pi.inode->uid = req->head.args.setattr.uid;
+  if (mask & CEPH_SETATTR_GID)
+    pi.inode->gid = req->head.args.setattr.gid;
+
+  if (mask & CEPH_SETATTR_MODE)
+    pi.inode->mode = (pi.inode->mode & ~07777) | (req->head.args.setattr.mode & 07777);
+  else if ((mask & (CEPH_SETATTR_UID|CEPH_SETATTR_GID|CEPH_SETATTR_KILL_SGUID|
+                    CEPH_SETATTR_KILL_SUID|CEPH_SETATTR_KILL_SGID)) &&
+           S_ISREG(pi.inode->mode)) {
+    // no explicit mode: strip setuid/setgid from regular files as requested
+    if (mask & (CEPH_SETATTR_UID|CEPH_SETATTR_GID|CEPH_SETATTR_KILL_SGUID) &&
+        (pi.inode->mode & (S_IXUSR|S_IXGRP|S_IXOTH))) {
+      pi.inode->mode &= ~(S_ISUID|S_ISGID);
+    } else {
+      if (mask & CEPH_SETATTR_KILL_SUID) {
+        pi.inode->mode &= ~S_ISUID;
+      }
+      if (mask & CEPH_SETATTR_KILL_SGID) {
+        pi.inode->mode &= ~S_ISGID;
+      }
+    }
+  }
+
+  if (mask & CEPH_SETATTR_MTIME)
+    pi.inode->mtime = req->head.args.setattr.mtime;
+  if (mask & CEPH_SETATTR_ATIME)
+    pi.inode->atime = req->head.args.setattr.atime;
+  if (mask & CEPH_SETATTR_BTIME)
+    pi.inode->btime = req->head.args.setattr.btime;
+  if (mask & (CEPH_SETATTR_ATIME | CEPH_SETATTR_MTIME | CEPH_SETATTR_BTIME))
+    pi.inode->time_warp_seq++;   // maybe not a timewarp, but still a serialization point.
+  if (mask & CEPH_SETATTR_SIZE) {
+    if (truncating_smaller) {
+      pi.inode->truncate(old_size, req->head.args.setattr.size, req->get_data());
+      le->metablob.add_truncate_start(cur->ino());
+    } else {
+      pi.inode->size = req->head.args.setattr.size;
+      pi.inode->rstat.rbytes = pi.inode->size;
+    }
+    // note: this overrides an mtime supplied via CEPH_SETATTR_MTIME above
+    pi.inode->mtime = mdr->get_op_stamp();
+
+    // adjust client's max_size?
+    if (mds->locker->calc_new_client_ranges(cur, pi.inode->size)) {
+      dout(10) << " client_ranges "  << cur->get_previous_projected_inode()->client_ranges
+               << " -> " << pi.inode->client_ranges << dendl;
+      changed_ranges = true;
+    }
+  }
+
+  if (mask & CEPH_SETATTR_FSCRYPT_AUTH)
+    pi.inode->fscrypt_auth = req->fscrypt_auth;
+  if (mask & CEPH_SETATTR_FSCRYPT_FILE)
+    pi.inode->fscrypt_file = req->fscrypt_file;
+
+  pi.inode->version = cur->pre_dirty();
+  pi.inode->ctime = mdr->get_op_stamp();
+  if (mdr->get_op_stamp() > pi.inode->rstat.rctime)
+    pi.inode->rstat.rctime = mdr->get_op_stamp();
+  pi.inode->change_attr++;
+
+  // log + wait
+  le->metablob.add_client_req(req->get_reqid(), req->get_oldest_client_tid());
+  mdcache->predirty_journal_parents(mdr, &le->metablob, cur, 0, PREDIRTY_PRIMARY);
+  mdcache->journal_dirty_inode(mdr.get(), &le->metablob, cur);
+
+  journal_and_reply(mdr, cur, 0, le, new C_MDS_inode_update_finish(this, mdr, cur,
+                                                                   truncating_smaller, changed_ranges));
+
+  // flush immediately if there are readers/writers waiting
+  if (mdr->is_xlocked(&cur->filelock) &&
+      (cur->get_caps_wanted() & (CEPH_CAP_FILE_RD|CEPH_CAP_FILE_WR)))
+    mds->mdlog->flush();
+}
+
+/*
+ * Truncate-on-open: issue caps on mdr->in[0] for open mode `cmode`,
+ * project a truncate to size 0 (when the file is not already empty),
+ * journal it as an "open_truncate" EUpdate and reply through
+ * journal_and_reply().
+ *
+ * Takes responsibility for mdr.
+ */
+void Server::do_open_truncate(MDRequestRef& mdr, int cmode)
+{
+ CInode *in = mdr->in[0];
+ client_t client = mdr->get_client();
+ ceph_assert(in);
+
+ dout(10) << "do_open_truncate " << *in << dendl;
+
+ SnapRealm *realm = in->find_snaprealm();
+ // cap is checked for null further down before it is used
+ Capability *cap = mds->locker->issue_new_caps(in, cmode, mdr, realm);
+
+ mdr->ls = mdlog->get_current_segment();
+ EUpdate *le = new EUpdate(mdlog, "open_truncate");
+ mdlog->start_entry(le);
+
+ // prepare
+ auto pi = in->project_inode(mdr);
+ pi.inode->version = in->pre_dirty();
+ pi.inode->mtime = pi.inode->ctime = mdr->get_op_stamp();
+ if (mdr->get_op_stamp() > pi.inode->rstat.rctime)
+ pi.inode->rstat.rctime = mdr->get_op_stamp();
+ pi.inode->change_attr++;
+
+ // use the larger of the projected size and the size the client reports;
+ // an empty file needs no truncate at all
+ uint64_t old_size = std::max<uint64_t>(pi.inode->size, mdr->client_request->head.args.open.old_size);
+ if (old_size > 0) {
+ pi.inode->truncate(old_size, 0);
+ le->metablob.add_truncate_start(in->ino());
+ }
+
+ // opened for write with a cap: reset this client's writeable range to
+ // [0, one layout size increment]
+ bool changed_ranges = false;
+ if (cap && (cmode & CEPH_FILE_MODE_WR)) {
+ pi.inode->client_ranges[client].range.first = 0;
+ pi.inode->client_ranges[client].range.last = pi.inode->get_layout_size_increment();
+ pi.inode->client_ranges[client].follows = realm->get_newest_seq();
+ changed_ranges = true;
+ in->mark_clientwriteable();
+ cap->mark_clientwriteable();
+ }
+
+ le->metablob.add_client_req(mdr->reqid, mdr->client_request->get_oldest_client_tid());
+
+ mdcache->predirty_journal_parents(mdr, &le->metablob, in, 0, PREDIRTY_PRIMARY);
+ mdcache->journal_dirty_inode(mdr.get(), &le->metablob, in);
+
+ // make sure ino gets into the journal
+ le->metablob.add_opened_ino(in->ino());
+
+ mdr->o_trunc = true;
+
+ // pass the last traversed dentry along only if the client asked for it
+ CDentry *dn = 0;
+ if (mdr->client_request->get_dentry_wanted()) {
+ ceph_assert(mdr->dn[0].size());
+ dn = mdr->dn[0].back();
+ }
+
+ // the finisher's truncating_smaller flag is set only if we projected a truncate
+ journal_and_reply(mdr, in, dn, le, new C_MDS_inode_update_finish(this, mdr, in, old_size > 0,
+ changed_ranges));
+ // Although the `open` part can give an early reply, the truncation won't
+ // happen until our EUpdate is persistent, to give the client a prompt
+ // response we must also flush that event.
+ mdlog->flush();
+}
+
+
+/*
+ * Handle a client request to change the layout of a regular file.
+ * Each non-zero field in setlayout.layout (object size, stripe unit,
+ * stripe count, pool) overrides the corresponding field of the file's
+ * current projected layout; the result is journaled as a "setlayout"
+ * EUpdate.
+ *
+ * Replies:
+ *   -CEPHFS_EROFS      target is a snapshot
+ *   -CEPHFS_EINVAL     not a regular file, invalid layout, or pool is
+ *                      not a data pool of this filesystem
+ *   -CEPHFS_ENOTEMPTY  file has a non-zero size or truncate_seq > 1
+ *
+ * This function cleans up the passed mdr.
+ */
+void Server::handle_client_setlayout(MDRequestRef& mdr)
+{
+ const cref_t<MClientRequest> &req = mdr->client_request;
+ CInode *cur = rdlock_path_pin_ref(mdr, true);
+ if (!cur) return;
+
+ if (mdr->snapid != CEPH_NOSNAP) {
+ respond_to_request(mdr, -CEPHFS_EROFS);
+ return;
+ }
+ if (!cur->is_file()) {
+ respond_to_request(mdr, -CEPHFS_EINVAL);
+ return;
+ }
+ // refuse once the file has a size or has been truncated before
+ if (cur->get_projected_inode()->size ||
+ cur->get_projected_inode()->truncate_seq > 1) {
+ respond_to_request(mdr, -CEPHFS_ENOTEMPTY);
+ return;
+ }
+
+ // validate layout
+ file_layout_t layout = cur->get_projected_inode()->layout;
+ // save existing layout for later
+ const auto old_layout = layout;
+
+ int access = MAY_WRITE;
+
+ // a zero field in the request means "leave this field unchanged"
+ if (req->head.args.setlayout.layout.fl_object_size > 0)
+ layout.object_size = req->head.args.setlayout.layout.fl_object_size;
+ if (req->head.args.setlayout.layout.fl_stripe_unit > 0)
+ layout.stripe_unit = req->head.args.setlayout.layout.fl_stripe_unit;
+ if (req->head.args.setlayout.layout.fl_stripe_count > 0)
+ layout.stripe_count=req->head.args.setlayout.layout.fl_stripe_count;
+ if (req->head.args.setlayout.layout.fl_pg_pool > 0) {
+ layout.pool_id = req->head.args.setlayout.layout.fl_pg_pool;
+
+ // make sure we have as new a map as the client
+ if (req->get_mdsmap_epoch() > mds->mdsmap->get_epoch()) {
+ mds->wait_for_mdsmap(req->get_mdsmap_epoch(), new C_MDS_RetryRequest(mdcache, mdr));
+ return;
+ }
+ }
+
+ // Don't permit layout modifications without 'p' caps
+ if (layout != old_layout) {
+ access |= MAY_SET_VXATTR;
+ }
+
+ if (!layout.is_valid()) {
+ dout(10) << "bad layout" << dendl;
+ respond_to_request(mdr, -CEPHFS_EINVAL);
+ return;
+ }
+ if (!mds->mdsmap->is_data_pool(layout.pool_id)) {
+ dout(10) << " invalid data pool " << layout.pool_id << dendl;
+ respond_to_request(mdr, -CEPHFS_EINVAL);
+ return;
+ }
+
+ MutationImpl::LockOpVec lov;
+ lov.add_xlock(&cur->filelock);
+ if (!mds->locker->acquire_locks(mdr, lov))
+ return;
+
+ if (!check_access(mdr, cur, access))
+ return;
+
+ // project update
+ auto pi = cur->project_inode(mdr);
+ pi.inode->layout = layout;
+ // add the old pool to the inode
+ pi.inode->add_old_pool(old_layout.pool_id);
+ pi.inode->version = cur->pre_dirty();
+ pi.inode->ctime = mdr->get_op_stamp();
+ if (mdr->get_op_stamp() > pi.inode->rstat.rctime)
+ pi.inode->rstat.rctime = mdr->get_op_stamp();
+ pi.inode->change_attr++;
+
+ // log + wait
+ mdr->ls = mdlog->get_current_segment();
+ EUpdate *le = new EUpdate(mdlog, "setlayout");
+ mdlog->start_entry(le);
+ le->metablob.add_client_req(req->get_reqid(), req->get_oldest_client_tid());
+ mdcache->predirty_journal_parents(mdr, &le->metablob, cur, 0, PREDIRTY_PRIMARY);
+ mdcache->journal_dirty_inode(mdr.get(), &le->metablob, cur);
+
+ journal_and_reply(mdr, cur, 0, le, new C_MDS_inode_update_finish(this, mdr, cur));
+}
+
+/*
+ * Take the locks needed to change policy-type state on `in`: xlock the
+ * policylock, and xlock or rdlock the snaplock depending on
+ * xlock_snaplock.  When want_layout is set and `in` carries its own
+ * layout, that layout is stored in mdr->dir_layout; otherwise the request
+ * for a layout is passed on to try_rdlock_snap_layout() on the parent
+ * directory inode.
+ *
+ * Returns true when everything is locked (MutationImpl::ALL_LOCKED is
+ * then set on mdr so a retry returns immediately), false if a lock
+ * could not be taken.
+ */
+bool Server::xlock_policylock(MDRequestRef& mdr, CInode *in, bool want_layout, bool xlock_snaplock)
+{
+  const bool already_locked =
+    (mdr->locking_state & MutationImpl::ALL_LOCKED) != 0;
+  if (already_locked)
+    return true;
+
+  MutationImpl::LockOpVec lock_ops;
+  lock_ops.add_xlock(&in->policylock);
+  if (xlock_snaplock) {
+    lock_ops.add_xlock(&in->snaplock);
+  } else {
+    lock_ops.add_rdlock(&in->snaplock);
+  }
+  if (!mds->locker->acquire_locks(mdr, lock_ops))
+    return false;
+
+  // the inode's own layout, if any, satisfies the caller's layout request
+  if (want_layout) {
+    const auto& projected = in->get_projected_inode();
+    if (projected->has_layout()) {
+      mdr->dir_layout = projected->layout;
+      want_layout = false;
+    }
+  }
+
+  CDentry *parent_dn = in->get_projected_parent_dn();
+  if (parent_dn) {
+    CInode *parent_in = parent_dn->get_dir()->get_inode();
+    if (!mds->locker->try_rdlock_snap_layout(parent_in, mdr, 0, want_layout))
+      return false;
+  }
+
+  mdr->locking_state |= MutationImpl::ALL_LOCKED;
+  return true;
+}
+
+/*
+ * Look up `ino` in the cache and return the inode only if this MDS is
+ * auth for it.  Otherwise the request has already been handled here
+ * (replied -CEPHFS_ESTALE when the inode is missing or being purged,
+ * forwarded to the auth MDS when we are not auth) and nullptr is returned.
+ */
+CInode* Server::try_get_auth_inode(MDRequestRef& mdr, inodeno_t ino)
+{
+  CInode *target = mdcache->get_inode(ino);
+
+  const bool usable = target && !target->state_test(CInode::STATE_PURGING);
+  if (!usable) {
+    respond_to_request(mdr, -CEPHFS_ESTALE);
+    return nullptr;
+  }
+
+  if (target->is_auth())
+    return target;
+
+  // somebody else is auth: hand the request over
+  mdcache->request_forward(mdr, target->authority().first);
+  return nullptr;
+}
+
+/*
+ * Handle a client request to set the layout on a directory.  The
+ * starting layout is the directory's own projected layout if it has
+ * one, else mdr->dir_layout if that is not a default-constructed layout,
+ * else mdcache->default_file_layout.  Non-zero fields of
+ * setlayout.layout override it, and the result is journaled as an
+ * EUpdate.
+ *
+ * Replies:
+ *   -CEPHFS_ENOTDIR  target is not a directory
+ *   -CEPHFS_EINVAL   invalid layout, or pool is not a data pool
+ * (try_get_auth_inode() may already have replied or forwarded.)
+ */
+void Server::handle_client_setdirlayout(MDRequestRef& mdr)
+{
+ const cref_t<MClientRequest> &req = mdr->client_request;
+
+ // can't use rdlock_path_pin_ref because we need to xlock snaplock/policylock
+ CInode *cur = try_get_auth_inode(mdr, req->get_filepath().get_ino());
+ if (!cur)
+ return;
+
+ if (!cur->is_dir()) {
+ respond_to_request(mdr, -CEPHFS_ENOTDIR);
+ return;
+ }
+
+ // want_layout=true: ask for a layout to be recorded in mdr->dir_layout
+ if (!xlock_policylock(mdr, cur, true))
+ return;
+
+ // validate layout
+ const auto& old_pi = cur->get_projected_inode();
+ file_layout_t layout;
+ if (old_pi->has_layout())
+ layout = old_pi->layout;
+ else if (mdr->dir_layout != file_layout_t())
+ layout = mdr->dir_layout;
+ else
+ layout = mdcache->default_file_layout;
+
+ // Level of access required to complete
+ int access = MAY_WRITE;
+
+ const auto old_layout = layout;
+
+ // a zero field in the request means "leave this field unchanged"
+ if (req->head.args.setlayout.layout.fl_object_size > 0)
+ layout.object_size = req->head.args.setlayout.layout.fl_object_size;
+ if (req->head.args.setlayout.layout.fl_stripe_unit > 0)
+ layout.stripe_unit = req->head.args.setlayout.layout.fl_stripe_unit;
+ if (req->head.args.setlayout.layout.fl_stripe_count > 0)
+ layout.stripe_count=req->head.args.setlayout.layout.fl_stripe_count;
+ if (req->head.args.setlayout.layout.fl_pg_pool > 0) {
+ layout.pool_id = req->head.args.setlayout.layout.fl_pg_pool;
+ // make sure we have as new a map as the client
+ if (req->get_mdsmap_epoch() > mds->mdsmap->get_epoch()) {
+ mds->wait_for_mdsmap(req->get_mdsmap_epoch(), new C_MDS_RetryRequest(mdcache, mdr));
+ return;
+ }
+ }
+
+ // an actual change of layout additionally requires MAY_SET_VXATTR
+ if (layout != old_layout) {
+ access |= MAY_SET_VXATTR;
+ }
+
+ if (!layout.is_valid()) {
+ dout(10) << "bad layout" << dendl;
+ respond_to_request(mdr, -CEPHFS_EINVAL);
+ return;
+ }
+ if (!mds->mdsmap->is_data_pool(layout.pool_id)) {
+ dout(10) << " invalid data pool " << layout.pool_id << dendl;
+ respond_to_request(mdr, -CEPHFS_EINVAL);
+ return;
+ }
+
+ if (!check_access(mdr, cur, access))
+ return;
+
+ // project update; only layout and version are touched here
+ auto pi = cur->project_inode(mdr);
+ pi.inode->layout = layout;
+ pi.inode->version = cur->pre_dirty();
+
+ // log + wait
+ mdr->ls = mdlog->get_current_segment();
+ EUpdate *le = new EUpdate(mdlog, "setlayout");
+ mdlog->start_entry(le);
+ le->metablob.add_client_req(req->get_reqid(), req->get_oldest_client_tid());
+ mdcache->predirty_journal_parents(mdr, &le->metablob, cur, 0, PREDIRTY_PRIMARY);
+ mdcache->journal_dirty_inode(mdr.get(), &le->metablob, cur);
+
+ // NOTE(review): presumably suppresses the early (pre-commit) reply for
+ // this request -- confirm against journal_and_reply()
+ mdr->no_early_reply = true;
+ journal_and_reply(mdr, cur, 0, le, new C_MDS_inode_update_finish(this, mdr, cur));
+}
+
+// XATTRS
+/*
+ * Parse the value of the "layout.json" vxattr into *layout.
+ *
+ * The JSON object must contain object_size, stripe_unit and
+ * stripe_count; pool_namespace, pool_id and pool_name are optional.
+ * A non-empty pool_name takes precedence over pool_id, and whichever is
+ * used must exist in `osdmap`.
+ *
+ * @param name    vxattr name; anything other than "layout.json" yields
+ *                -CEPHFS_ENODATA
+ * @param value   JSON text
+ * @param osdmap  map used to resolve / verify the pool
+ * @param layout  updated in place; fields decoded before a failure stay
+ *                modified
+ * @return 0 on success, -CEPHFS_EINVAL on bad JSON, a missing mandatory
+ *         field or an unknown pool, -CEPHFS_ENODATA for an unknown name
+ */
+int Server::parse_layout_vxattr_json(
+  string name, string value, const OSDMap& osdmap, file_layout_t *layout)
+{
+  // Resolve the pool to a valid id, or return -CEPHFS_EINVAL.
+  auto parse_pool = [&](const std::string& pool_name, int64_t pool_id) -> int64_t {
+    if (!pool_name.empty()) {
+      int64_t _pool_id = osdmap.lookup_pg_pool_name(pool_name);
+      if (_pool_id < 0) {
+        dout(10) << __func__ << ": unknown pool name:" << pool_name << dendl;
+        return -CEPHFS_EINVAL;
+      }
+      return _pool_id;
+    } else if (pool_id >= 0) {
+      // bind by reference: taking the result by value would copy the
+      // entire pool map just to perform a single lookup
+      const auto& pools = osdmap.get_pools();
+      if (pools.find(pool_id) == pools.end()) {
+        dout(10) << __func__ << ": unknown pool id:" << pool_id << dendl;
+        return -CEPHFS_EINVAL;
+      }
+      return pool_id;
+    } else {
+      return -CEPHFS_EINVAL;
+    }
+  };
+
+  try {
+    if (name == "layout.json") {
+      JSONParser json_parser;
+      if (json_parser.parse(value.c_str(), value.length()) and json_parser.is_object()) {
+        // `field` tracks what is being decoded so the catch block below
+        // can name the offending key
+        std::string field;
+        try {
+          field = "object_size";
+          JSONDecoder::decode_json("object_size", layout->object_size, &json_parser, true);
+
+          field = "stripe_unit";
+          JSONDecoder::decode_json("stripe_unit", layout->stripe_unit, &json_parser, true);
+
+          field = "stripe_count";
+          JSONDecoder::decode_json("stripe_count", layout->stripe_count, &json_parser, true);
+
+          field = "pool_namespace";
+          JSONDecoder::decode_json("pool_namespace", layout->pool_ns, &json_parser, false);
+
+          field = "pool_id";
+          int64_t pool_id = 0;
+          JSONDecoder::decode_json("pool_id", pool_id, &json_parser, false);
+
+          field = "pool_name";
+          std::string pool_name;
+          JSONDecoder::decode_json("pool_name", pool_name, &json_parser, false);
+
+          pool_id = parse_pool(pool_name, pool_id);
+          if (pool_id < 0) {
+            return (int)pool_id;
+          }
+          layout->pool_id = pool_id;
+        } catch (JSONDecoder::err&) {
+          dout(10) << __func__ << ": json is missing a mandatory field named "
+                   << field << dendl;
+          return -CEPHFS_EINVAL;
+        }
+      } else {
+        dout(10) << __func__ << ": bad json" << dendl;
+        return -CEPHFS_EINVAL;
+      }
+    } else {
+      dout(10) << __func__ << ": unknown layout vxattr " << name << dendl;
+      return -CEPHFS_ENODATA; // no such attribute
+    }
+  } catch (boost::bad_lexical_cast const&) {
+    dout(10) << __func__ << ": bad vxattr value:" << value
+             << ", unable to parse for xattr:" << name << dendl;
+    return -CEPHFS_EINVAL;
+  }
+  return 0;
+}
+
+// Parse an old style (non-JSON) layout vxattr into *layout.
+//
+// `name` is either "layout", whose value is a key/value list that is
+// split and fed back into this function one "layout.<key>" at a time, or
+// a single "layout.<field>" attribute.  Returns 0 on success,
+// -CEPHFS_EINVAL for unparsable values, -CEPHFS_ENOENT when "layout.pool"
+// names an unknown pool and -CEPHFS_ENODATA for an unknown attribute.
+int Server::parse_layout_vxattr_string(
+  string name, string value, const OSDMap& osdmap, file_layout_t *layout)
+{
+  try {
+    if (name == "layout") {
+      string::iterator cursor = value.begin();
+      string::iterator stop = value.end();
+      keys_and_values<string::iterator> grammar;  // create instance of parser
+      std::map<string, string> fields;            // map to receive results
+      if (!qi::parse(cursor, stop, grammar, fields)) {  // returns true if successful
+        return -CEPHFS_EINVAL;
+      }
+      string left(cursor, stop);
+      dout(10) << __func__ << ": parsed " << fields << " left '" << left << "'" << dendl;
+      // trailing unparsed input is an error
+      if (cursor != stop)
+        return -CEPHFS_EINVAL;
+      for (const auto& [key, val] : fields) {
+        // Skip validation on each attr, we do it once at the end (avoid
+        // rejecting intermediate states if the overall result is ok)
+        const int r = parse_layout_vxattr_string(string("layout.") + key, val,
+                                                 osdmap, layout);
+        if (r < 0)
+          return r;
+      }
+      return 0;
+    }
+
+    if (name == "layout.object_size") {
+      layout->object_size = boost::lexical_cast<unsigned>(value);
+      return 0;
+    }
+    if (name == "layout.stripe_unit") {
+      layout->stripe_unit = boost::lexical_cast<unsigned>(value);
+      return 0;
+    }
+    if (name == "layout.stripe_count") {
+      layout->stripe_count = boost::lexical_cast<unsigned>(value);
+      return 0;
+    }
+    if (name == "layout.pool") {
+      // numeric id first; anything non-numeric is treated as a pool name
+      try {
+        layout->pool_id = boost::lexical_cast<unsigned>(value);
+      } catch (boost::bad_lexical_cast const&) {
+        int64_t pool = osdmap.lookup_pg_pool_name(value);
+        if (pool < 0) {
+          dout(10) << __func__ << ": unknown pool " << value << dendl;
+          return -CEPHFS_ENOENT;
+        }
+        layout->pool_id = pool;
+      }
+      return 0;
+    }
+    if (name == "layout.pool_id") {
+      layout->pool_id = boost::lexical_cast<int64_t>(value);
+      return 0;
+    }
+    if (name == "layout.pool_name") {
+      layout->pool_id = osdmap.lookup_pg_pool_name(value);
+      if (layout->pool_id < 0) {
+        dout(10) << __func__ << ": unknown pool " << value << dendl;
+        return -CEPHFS_EINVAL;
+      }
+      return 0;
+    }
+    if (name == "layout.pool_namespace") {
+      layout->pool_ns = value;
+      return 0;
+    }
+
+    dout(10) << __func__ << ": unknown layout vxattr " << name << dendl;
+    return -CEPHFS_ENODATA; // no such attribute
+  } catch (boost::bad_lexical_cast const&) {
+    dout(10) << __func__ << ": bad vxattr value, unable to parse int for "
+             << name << dendl;
+    return -CEPHFS_EINVAL;
+  }
+  return 0;
+}
+
+/*
+ * Parse a layout vxattr (JSON form for "layout.json", old string form
+ * for everything else) into *layout and sanity-check the result.
+ *
+ * Returns the parser's negative error as-is, -CEPHFS_EINVAL when
+ * `validate` is set and the layout is not valid or when the resulting
+ * pool is not a data pool (the pool check is done regardless of
+ * `validate`), and 0 on success.
+ */
+int Server::parse_layout_vxattr(string name, string value, const OSDMap& osdmap,
+                                file_layout_t *layout, bool validate)
+{
+  dout(20) << __func__ << ": name:" << name << " value:'" << value << "'" << dendl;
+
+  const bool is_json = (name == "layout.json");
+  const int r = is_json
+    ? parse_layout_vxattr_json(name, value, osdmap, layout)
+    : parse_layout_vxattr_string(name, value, osdmap, layout);
+  if (r < 0) {
+    return r;
+  }
+
+  const bool layout_ok = !validate || layout->is_valid();
+  if (!layout_ok) {
+    dout(10) << __func__ << ": bad layout" << dendl;
+    return -CEPHFS_EINVAL;
+  }
+
+  const bool pool_ok = mds->mdsmap->is_data_pool(layout->pool_id);
+  if (!pool_ok) {
+    dout(10) << __func__ << ": invalid data pool " << layout->pool_id << dendl;
+    return -CEPHFS_EINVAL;
+  }
+  return 0;
+}
+
+/*
+ * Parse a quota vxattr into *quota.
+ *
+ * `name` is "quota" (a key/value list, applied one "quota.<key>" at a
+ * time; an empty value leaves the quota untouched and returns 0
+ * immediately), "quota.max_bytes" or "quota.max_files".  Limits must
+ * parse as non-negative int64 values.
+ *
+ * Returns 0 on success and -CEPHFS_EINVAL for anything unparsable,
+ * negative, unknown, or resulting in an invalid quota.
+ */
+int Server::parse_quota_vxattr(string name, string value, quota_info_t *quota)
+{
+  dout(20) << "parse_quota_vxattr name " << name << " value '" << value << "'" << dendl;
+
+  // Store `value` into `field` if it is a non-negative integer.  A value
+  // that is not an integer makes lexical_cast throw, which is handled by
+  // the try block around the call sites.
+  auto store_limit = [&value](auto& field) -> int {
+    const int64_t parsed = boost::lexical_cast<int64_t>(value);
+    if (parsed < 0)
+      return -CEPHFS_EINVAL;
+    field = parsed;
+    return 0;
+  };
+
+  try {
+    if (name == "quota") {
+      string::iterator cursor = value.begin();
+      string::iterator stop = value.end();
+      if (cursor == stop) {
+        // keep quota unchanged. (for create_quota_realm())
+        return 0;
+      }
+      keys_and_values<string::iterator> grammar;  // create instance of parser
+      std::map<string, string> fields;            // map to receive results
+      if (!qi::parse(cursor, stop, grammar, fields)) {  // returns true if successful
+        return -CEPHFS_EINVAL;
+      }
+      string left(cursor, stop);
+      dout(10) << " parsed " << fields << " left '" << left << "'" << dendl;
+      if (cursor != stop)
+        return -CEPHFS_EINVAL;
+      for (const auto& [key, val] : fields) {
+        const int r = parse_quota_vxattr(string("quota.") + key, val, quota);
+        if (r < 0)
+          return r;
+      }
+    } else if (name == "quota.max_bytes") {
+      if (const int r = store_limit(quota->max_bytes); r < 0)
+        return r;
+    } else if (name == "quota.max_files") {
+      if (const int r = store_limit(quota->max_files); r < 0)
+        return r;
+    } else {
+      dout(10) << " unknown quota vxattr " << name << dendl;
+      return -CEPHFS_EINVAL;
+    }
+  } catch (boost::bad_lexical_cast const&) {
+    dout(10) << "bad vxattr value, unable to parse int for " << name << dendl;
+    return -CEPHFS_EINVAL;
+  }
+
+  if (quota->is_valid())
+    return 0;
+
+  dout(10) << "bad quota" << dendl;
+  return -CEPHFS_EINVAL;
+}
+
+/*
+ * Send a SETXATTR request for "ceph.quota" with an empty value on `in`
+ * to the MDS that is auth for it.
+ */
+void Server::create_quota_realm(CInode *in)
+{
+  dout(10) << __func__ << " " << *in << dendl;
+
+  auto setxattr = make_message<MClientRequest>(CEPH_MDS_OP_SETXATTR);
+  setxattr->set_filepath(filepath(in->ino()));
+  setxattr->set_string2("ceph.quota");
+  // no data payload is attached: the vxattr value stays empty
+  setxattr->set_tid(mds->issue_tid());
+
+  const auto auth_rank = in->authority().first;
+  mds->send_message_mds(setxattr, auth_rank);
+}
+
+/*
+ * Check that a layout vxattr supplied by the client parses against the
+ * current osdmap, filling in *layout.
+ *
+ * Returns 0 on success.  On any non-zero return this function has taken
+ * responsibility for mdr: either an error reply was sent, or the request
+ * was queued for retry once a newer osdmap is available.
+ */
+int Server::check_layout_vxattr(MDRequestRef& mdr,
+                                string name,
+                                string value,
+                                file_layout_t *layout)
+{
+  const cref_t<MClientRequest> &req = mdr->client_request;
+  epoch_t our_epoch = 0;
+  int r = 0;
+
+  mds->objecter->with_osdmap([&](const OSDMap& osdmap) {
+      r = parse_layout_vxattr(name, value, osdmap, layout);
+      our_epoch = osdmap.get_epoch();
+    });
+
+  const bool pool_missing = (r == -CEPHFS_ENOENT);
+  if (pool_missing) {
+    // The pool is unknown to us; that is only conclusive if our osdmap is
+    // at least as new as the one the client was looking at.
+    const epoch_t client_epoch = req->get_osdmap_epoch();
+
+    if (client_epoch > our_epoch) {
+      // our map is older: retry once we have caught up to the client
+      auto fin = new C_IO_Wrapper(mds, new C_MDS_RetryRequest(mdcache, mdr));
+      mds->objecter->wait_for_map(client_epoch, lambdafy(fin));
+      return r;
+    }
+
+    if (client_epoch == 0 && !mdr->waited_for_osdmap) {
+      // Compatibility with clients that do not send an osdmap epoch:
+      // fetch the latest map once and retry.  This can go away when the
+      // COMPAT_VERSION of MClientRequest is >= 3.
+      mdr->waited_for_osdmap = true;
+      mds->objecter->wait_for_latest_osdmap(std::ref(*new C_IO_Wrapper(
+        mds, new C_MDS_RetryRequest(mdcache, mdr))));
+      return r;
+    }
+  }
+
+  if (r >= 0) {
+    // all is well
+    return 0;
+  }
+
+  // an unknown pool is reported to the client as an invalid argument
+  const int err = pool_missing ? -CEPHFS_EINVAL : r;
+  respond_to_request(mdr, err);
+  return err;
+}
+
+ /*
+  * Handle SETXATTR on a ceph.* virtual xattr of inode 'cur'.
+  *
+  * Names handled here:
+  *   ceph.dir.layout*          layout policy; directories only
+  *   ceph.file.layout*         file layout; regular files with size 0 and
+  *                             truncate_seq <= 1 only
+  *   ceph.quota*               quota; directories only
+  *   ceph.dir.subvolume        bool; marks/clears the subvolume flag
+  *   ceph.dir.pin              export pin rank (negative means "no pin")
+  *   ceph.dir.pin.random       probability in [0, 1], additionally capped by
+  *                             mdcache->export_ephemeral_random_max
+  *   ceph.dir.pin.distributed  bool
+  * Anything else is answered with -CEPHFS_EINVAL.
+  *
+  * Each branch validates the value, takes the locks it needs and projects a
+  * new inode; the common tail then stamps the projected inode and journals
+  * the update. Every early return below happens either right after
+  * respond_to_request(), or after a locking/validation helper returned
+  * failure (presumably having queued or answered the request itself).
+  */
+ void Server::handle_set_vxattr(MDRequestRef& mdr, CInode *cur)
+ {
+   const cref_t<MClientRequest> &req = mdr->client_request;
+   MutationImpl::LockOpVec lov;
+   string name(req->get_path2());
+   bufferlist bl = req->get_data();
+   string value (bl.c_str(), bl.length());
+   dout(10) << "handle_set_vxattr " << name
+            << " val " << value.length()
+            << " bytes on " << *cur
+            << dendl;
+
+   // projected inode being modified; set by whichever branch below matches
+   CInode::mempool_inode *pip = nullptr;
+   string rest;
+
+   if (!check_access(mdr, cur, MAY_SET_VXATTR)) {
+     return;
+   }
+
+   bool adjust_realm = false;
+   if (name.compare(0, 15, "ceph.dir.layout") == 0) {
+     if (!cur->is_dir()) {
+       respond_to_request(mdr, -CEPHFS_EINVAL);
+       return;
+     }
+
+     if (!xlock_policylock(mdr, cur, true))
+       return;
+
+     /* We need 'As' caps for the fscrypt context */
+     lov.add_xlock(&cur->authlock);
+     if (!mds->locker->acquire_locks(mdr, lov)) {
+       return;
+     }
+
+     /* encrypted directories can't have their layout changed */
+     if (!cur->get_inode()->fscrypt_auth.empty()) {
+       respond_to_request(mdr, -CEPHFS_EINVAL);
+       return;
+     }
+
+     // starting point for the edit: the directory's own layout if it has
+     // one, else the layout recorded on the request, else the fs default
+     file_layout_t layout;
+     if (cur->get_projected_inode()->has_layout())
+       layout = cur->get_projected_inode()->layout;
+     else if (mdr->dir_layout != file_layout_t())
+       layout = mdr->dir_layout;
+     else
+       layout = mdcache->default_file_layout;
+
+     // strip the "ceph.dir." prefix: the parser expects "layout[.field]"
+     rest = name.substr(name.find("layout"));
+     if (check_layout_vxattr(mdr, rest, value, &layout) < 0)
+       return;
+
+     auto pi = cur->project_inode(mdr);
+     pi.inode->layout = layout;
+     mdr->no_early_reply = true;
+     pip = pi.inode.get();
+   } else if (name.compare(0, 16, "ceph.file.layout") == 0) {
+     if (!cur->is_file()) {
+       respond_to_request(mdr, -CEPHFS_EINVAL);
+       return;
+     }
+     // the layout can only be changed while the file has never held data
+     if (cur->get_projected_inode()->size ||
+         cur->get_projected_inode()->truncate_seq > 1) {
+       respond_to_request(mdr, -CEPHFS_ENOTEMPTY);
+       return;
+     }
+     file_layout_t layout = cur->get_projected_inode()->layout;
+     // strip the "ceph.file." prefix: the parser expects "layout[.field]"
+     rest = name.substr(name.find("layout"));
+     if (check_layout_vxattr(mdr, rest, value, &layout) < 0)
+       return;
+
+     lov.add_xlock(&cur->filelock);
+     if (!mds->locker->acquire_locks(mdr, lov))
+       return;
+
+     /* encrypted files can't have their layout changed */
+     if (!cur->get_inode()->fscrypt_auth.empty()) {
+       respond_to_request(mdr, -CEPHFS_EINVAL);
+       return;
+     }
+
+     auto pi = cur->project_inode(mdr);
+     // remember the pool of the layout we are replacing
+     int64_t old_pool = pi.inode->layout.pool_id;
+     pi.inode->add_old_pool(old_pool);
+     pi.inode->layout = layout;
+     pip = pi.inode.get();
+   } else if (name.compare(0, 10, "ceph.quota") == 0) {
+     if (!cur->is_dir()) {
+       respond_to_request(mdr, -CEPHFS_EINVAL);
+       return;
+     }
+
+     quota_info_t quota = cur->get_projected_inode()->quota;
+
+     // strip the "ceph." prefix: the parser expects "quota[.field]"
+     rest = name.substr(name.find("quota"));
+     int r = parse_quota_vxattr(rest, value, &quota);
+     if (r < 0) {
+       respond_to_request(mdr, r);
+       return;
+     }
+
+     // an enabled quota on an inode without a projected srnode needs one
+     if (quota.is_enabled() && !cur->get_projected_srnode())
+       adjust_realm = true;
+
+     if (!xlock_policylock(mdr, cur, false, adjust_realm))
+       return;
+
+     // nothing to change: reply without journaling
+     if (cur->get_projected_inode()->quota == quota) {
+       respond_to_request(mdr, 0);
+       return;
+     }
+
+     auto pi = cur->project_inode(mdr, false, adjust_realm);
+     pi.inode->quota = quota;
+
+     if (adjust_realm)
+       pi.snapnode->created = pi.snapnode->seq = cur->find_snaprealm()->get_newest_seq();
+
+     mdr->no_early_reply = true;
+     pip = pi.inode.get();
+
+     client_t exclude_ct = mdr->get_client();
+     mdcache->broadcast_quota_to_client(cur, exclude_ct, true);
+   } else if (name == "ceph.dir.subvolume"sv) {
+     if (!cur->is_dir()) {
+       respond_to_request(mdr, -CEPHFS_EINVAL);
+       return;
+     }
+
+     bool val;
+     try {
+       val = boost::lexical_cast<bool>(value);
+     } catch (boost::bad_lexical_cast const&) {
+       dout(10) << "bad vxattr value, unable to parse bool for " << name << dendl;
+       respond_to_request(mdr, -CEPHFS_EINVAL);
+       return;
+     }
+
+     /* Verify it's not already a subvolume with lighter weight
+      * rdlock.
+      */
+     if (!mdr->more()->rdonly_checks) {
+       if (!(mdr->locking_state & MutationImpl::ALL_LOCKED)) {
+         lov.add_rdlock(&cur->snaplock);
+         if (!mds->locker->acquire_locks(mdr, lov))
+           return;
+         mdr->locking_state |= MutationImpl::ALL_LOCKED;
+       }
+       const auto srnode = cur->get_projected_srnode();
+       if (val == (srnode && srnode->is_subvolume())) {
+         dout(20) << "already marked subvolume" << dendl;
+         respond_to_request(mdr, 0);
+         return;
+       }
+       // remembered on the request so the cheap check is not repeated
+       mdr->more()->rdonly_checks = true;
+     }
+
+     if ((mdr->locking_state & MutationImpl::ALL_LOCKED) && !mdr->is_xlocked(&cur->snaplock)) {
+       /* drop the rdlock and acquire xlocks */
+       dout(20) << "dropping rdlocks" << dendl;
+       mds->locker->drop_locks(mdr.get());
+       if (!xlock_policylock(mdr, cur, false, true))
+         return;
+     }
+
+     /* repeat rdonly checks in case changed between rdlock -> xlock */
+     SnapRealm *realm = cur->find_snaprealm();
+     if (val) {
+       inodeno_t subvol_ino = realm->get_subvolume_ino();
+       // can't create subvolume inside another subvolume
+       if (subvol_ino && subvol_ino != cur->ino()) {
+         respond_to_request(mdr, -CEPHFS_EINVAL);
+         return;
+       }
+     }
+
+     const auto srnode = cur->get_projected_srnode();
+     if (val == (srnode && srnode->is_subvolume())) {
+       respond_to_request(mdr, 0);
+       return;
+     }
+
+     auto pi = cur->project_inode(mdr, false, true);
+     // a snapnode that did not exist before gets its created/seq stamped
+     if (!srnode)
+       pi.snapnode->created = pi.snapnode->seq = realm->get_newest_seq();
+     if (val)
+       pi.snapnode->mark_subvolume();
+     else
+       pi.snapnode->clear_subvolume();
+
+     mdr->no_early_reply = true;
+     pip = pi.inode.get();
+     adjust_realm = true;
+   } else if (name == "ceph.dir.pin"sv) {
+     if (!cur->is_dir() || cur->is_root()) {
+       respond_to_request(mdr, -CEPHFS_EINVAL);
+       return;
+     }
+
+     mds_rank_t rank;
+     try {
+       rank = boost::lexical_cast<mds_rank_t>(value);
+       // any negative value means "no pin"
+       if (rank < 0) rank = MDS_RANK_NONE;
+       else if (rank >= MAX_MDS) {
+         respond_to_request(mdr, -CEPHFS_EDOM);
+         return;
+       }
+     } catch (boost::bad_lexical_cast const&) {
+       dout(10) << "bad vxattr value, unable to parse int for " << name << dendl;
+       respond_to_request(mdr, -CEPHFS_EINVAL);
+       return;
+     }
+
+     if (!xlock_policylock(mdr, cur))
+       return;
+
+     auto pi = cur->project_inode(mdr);
+     cur->set_export_pin(rank);
+     pip = pi.inode.get();
+   } else if (name == "ceph.dir.pin.random"sv) {
+     if (!cur->is_dir() || cur->is_root()) {
+       respond_to_request(mdr, -CEPHFS_EINVAL);
+       return;
+     }
+
+     double val;
+     try {
+       val = boost::lexical_cast<double>(value);
+     } catch (boost::bad_lexical_cast const&) {
+       dout(10) << "bad vxattr value, unable to parse float for " << name << dendl;
+       respond_to_request(mdr, -CEPHFS_EINVAL);
+       return;
+     }
+
+     // Written as a negated in-range test on purpose: lexical_cast<double>
+     // accepts "nan", and every ordered comparison with NaN is false, so a
+     // test of the form "val < 0.0 || 1.0 < val" would let NaN through.
+     if (!(0.0 <= val && val <= 1.0)) {
+       respond_to_request(mdr, -CEPHFS_EDOM);
+       return;
+     } else if (mdcache->export_ephemeral_random_max < val) {
+       respond_to_request(mdr, -CEPHFS_EINVAL);
+       return;
+     }
+
+     if (!xlock_policylock(mdr, cur))
+       return;
+
+     auto pi = cur->project_inode(mdr);
+     cur->setxattr_ephemeral_rand(val);
+     pip = pi.inode.get();
+   } else if (name == "ceph.dir.pin.distributed"sv) {
+     if (!cur->is_dir() || cur->is_root()) {
+       respond_to_request(mdr, -CEPHFS_EINVAL);
+       return;
+     }
+
+     bool val;
+     try {
+       val = boost::lexical_cast<bool>(value);
+     } catch (boost::bad_lexical_cast const&) {
+       dout(10) << "bad vxattr value, unable to parse bool for " << name << dendl;
+       respond_to_request(mdr, -CEPHFS_EINVAL);
+       return;
+     }
+
+     if (!xlock_policylock(mdr, cur))
+       return;
+
+     auto pi = cur->project_inode(mdr);
+     cur->setxattr_ephemeral_dist(val);
+     pip = pi.inode.get();
+   } else {
+     dout(10) << " unknown vxattr " << name << dendl;
+     respond_to_request(mdr, -CEPHFS_EINVAL);
+     return;
+   }
+
+   // common tail: every branch that gets here has set pip
+   pip->change_attr++;
+   pip->ctime = mdr->get_op_stamp();
+   if (mdr->get_op_stamp() > pip->rstat.rctime)
+     pip->rstat.rctime = mdr->get_op_stamp();
+   pip->version = cur->pre_dirty();
+   if (cur->is_file())
+     pip->update_backtrace();
+
+   // log + wait
+   mdr->ls = mdlog->get_current_segment();
+   EUpdate *le = new EUpdate(mdlog, "set vxattr layout");
+   mdlog->start_entry(le);
+   le->metablob.add_client_req(req->get_reqid(), req->get_oldest_client_tid());
+   mdcache->predirty_journal_parents(mdr, &le->metablob, cur, 0, PREDIRTY_PRIMARY);
+   mdcache->journal_dirty_inode(mdr.get(), &le->metablob, cur);
+
+   journal_and_reply(mdr, cur, 0, le, new C_MDS_inode_update_finish(this, mdr, cur,
+                                                                    false, false, adjust_realm));
+   return;
+ }
+
+ /*
+  * Handle RMXATTR on a ceph.* virtual xattr of inode 'cur'.
+  *
+  *  - "ceph.dir.layout": clear the layout policy of a non-root directory
+  *    that currently has one, and journal the change.
+  *  - "ceph.{dir,file}.layout.pool_namespace": forwarded to
+  *    handle_set_vxattr(), which sees the empty payload of the request.
+  *  - anything else: -CEPHFS_ENODATA.
+  */
+ void Server::handle_remove_vxattr(MDRequestRef& mdr, CInode *cur)
+ {
+   const cref_t<MClientRequest> &req = mdr->client_request;
+   string name(req->get_path2());
+
+   dout(10) << __func__ << " " << name << " on " << *cur << dendl;
+
+   if (name != "ceph.dir.layout") {
+     if (name == "ceph.dir.layout.pool_namespace"
+         || name == "ceph.file.layout.pool_namespace") {
+       // Namespace is the only layout field that has a meaningful
+       // null/none value (empty string, means default layout). Is equivalent
+       // to a setxattr with empty string: pass through the empty payload of
+       // the rmxattr request to do this.
+       handle_set_vxattr(mdr, cur);
+     } else {
+       respond_to_request(mdr, -CEPHFS_ENODATA);
+     }
+     return;
+   }
+
+   // from here on we are removing a directory layout policy
+   if (!cur->is_dir()) {
+     respond_to_request(mdr, -CEPHFS_ENODATA);
+     return;
+   }
+   if (cur->is_root()) {
+     dout(10) << "can't remove layout policy on the root directory" << dendl;
+     respond_to_request(mdr, -CEPHFS_EINVAL);
+     return;
+   }
+   if (!cur->get_projected_inode()->has_layout()) {
+     // no layout set on this directory, so there is nothing to remove
+     respond_to_request(mdr, -CEPHFS_ENODATA);
+     return;
+   }
+
+   MutationImpl::LockOpVec locks;
+   locks.add_xlock(&cur->policylock);
+   if (!mds->locker->acquire_locks(mdr, locks))
+     return;
+
+   // project update
+   auto projected = cur->project_inode(mdr);
+   projected.inode->clear_layout();
+   projected.inode->version = cur->pre_dirty();
+
+   // log + wait
+   mdr->ls = mdlog->get_current_segment();
+   EUpdate *le = new EUpdate(mdlog, "remove dir layout vxattr");
+   mdlog->start_entry(le);
+   le->metablob.add_client_req(req->get_reqid(), req->get_oldest_client_tid());
+   mdcache->predirty_journal_parents(mdr, &le->metablob, cur, 0, PREDIRTY_PRIMARY);
+   mdcache->journal_dirty_inode(mdr.get(), &le->metablob, cur);
+
+   mdr->no_early_reply = true;
+   journal_and_reply(mdr, cur, 0, le, new C_MDS_inode_update_finish(this, mdr, cur));
+ }
+
+ // Table of xattr handlers, searched by get_xattr_or_default_handler().
+ // The entry named Server::DEFAULT_HANDLER is the fallback used when no
+ // other entry's xattr_name matches; there must be exactly one such entry.
+ const Server::XattrHandler Server::xattr_handlers[] = {
+   {
+     // fallback for every xattr without a dedicated handler
+     .xattr_name = Server::DEFAULT_HANDLER,
+     .description = "default xattr handler",
+     .validate = &Server::default_xattr_validate,
+     .setxattr = &Server::default_setxattr_handler,
+     .removexattr = &Server::default_removexattr_handler,
+   },
+   {
+     // cephfs mirroring info
+     .xattr_name = "ceph.mirror.info",
+     .description = "mirror info xattr handler",
+     .validate = &Server::mirror_info_xattr_validate,
+     .setxattr = &Server::mirror_info_setxattr_handler,
+     .removexattr = &Server::mirror_info_removexattr_handler
+   },
+ };
+
+ /*
+  * Look up the handler registered for xattr_name in xattr_handlers.
+  *
+  * Returns the first table entry whose name equals xattr_name; if there is
+  * none, returns the entry named Server::DEFAULT_HANDLER. Asserts that the
+  * scanned part of the table holds at most one default entry and, when
+  * falling back, that a default entry exists.
+  */
+ const Server::XattrHandler* Server::get_xattr_or_default_handler(std::string_view xattr_name) {
+   const XattrHandler *fallback = nullptr;
+
+   for (const auto &entry : xattr_handlers) {
+     const bool is_default = (entry.xattr_name == Server::DEFAULT_HANDLER);
+     if (is_default) {
+       // a second default entry would be a bug in the table
+       ceph_assert(fallback == nullptr);
+       fallback = &entry;
+     }
+
+     if (entry.xattr_name == xattr_name) {
+       dout(20) << "handler=" << entry.description << dendl;
+       return &entry;
+     }
+   }
+
+   // no dedicated handler for this name
+   ceph_assert(fallback != nullptr);
+   dout(20) << "handler=" << fallback->description << dendl;
+   return fallback;
+ }
+
+ /*
+  * Check whether operation 'op' on xattr_name is permissible given the
+  * current xattr map (which may be null) and the request flags.
+  *
+  *   SETXATTR: -CEPHFS_EEXIST  if CEPH_XATTR_CREATE is set and the name exists,
+  *             -CEPHFS_ENODATA if CEPH_XATTR_REPLACE is set and it does not
+  *   RMXATTR:  -CEPHFS_ENODATA if the name does not exist
+  *   other op: -CEPHFS_EINVAL
+  *
+  * Returns 0 when the operation may proceed. 'cur' is used for logging only.
+  */
+ int Server::xattr_validate(CInode *cur, const InodeStoreBase::xattr_map_const_ptr xattrs,
+                            const std::string &xattr_name, int op, int flags) {
+   // true iff there is an xattr map and it holds an entry named xattr_name
+   auto already_set = [&]() -> bool {
+     return xattrs && xattrs->count(mempool::mds_co::string(xattr_name));
+   };
+
+   switch (op) {
+   case CEPH_MDS_OP_SETXATTR:
+     if ((flags & CEPH_XATTR_CREATE) && already_set()) {
+       dout(10) << "setxattr '" << xattr_name << "' XATTR_CREATE and CEPHFS_EEXIST on " << *cur << dendl;
+       return -CEPHFS_EEXIST;
+     }
+     if ((flags & CEPH_XATTR_REPLACE) && !already_set()) {
+       dout(10) << "setxattr '" << xattr_name << "' XATTR_REPLACE and CEPHFS_ENODATA on " << *cur << dendl;
+       return -CEPHFS_ENODATA;
+     }
+     return 0;
+
+   case CEPH_MDS_OP_RMXATTR:
+     if (!already_set()) {
+       dout(10) << "removexattr '" << xattr_name << "' and CEPHFS_ENODATA on " << *cur << dendl;
+       return -CEPHFS_ENODATA;
+     }
+     return 0;
+
+   default:
+     break;
+   }
+
+   derr << ": unhandled validation for: " << xattr_name << dendl;
+   return -CEPHFS_EINVAL;
+ }
+
+ /*
+  * Store xattr_value under xattr_name in the given xattr map, overwriting
+  * the value of an entry that already exists. The value is copied into a
+  * freshly created buffer.
+  */
+ void Server::xattr_set(InodeStoreBase::xattr_map_ptr xattrs, const std::string &xattr_name,
+                        const bufferlist &xattr_value) {
+   const size_t nbytes = xattr_value.length();
+   bufferptr payload = buffer::create(nbytes);
+   if (nbytes > 0) {
+     xattr_value.begin().copy(nbytes, payload.c_str());
+   }
+
+   // try to insert a new entry; when the name is already present the map is
+   // left untouched and we overwrite the existing value instead
+   auto [pos, inserted] = xattrs->emplace(std::piecewise_construct,
+                                          std::forward_as_tuple(mempool::mds_co::string(xattr_name)),
+                                          std::forward_as_tuple(payload));
+   if (!inserted) {
+     pos->second = payload;
+   }
+ }
+
+ // Erase the entry named xattr_name from the given xattr map.
+ void Server::xattr_rm(InodeStoreBase::xattr_map_ptr xattrs, const std::string &xattr_name) {
+   const mempool::mds_co::string key(xattr_name);
+   xattrs->erase(key);
+ }
+
+ // Default validation: run the generic xattr_validate() checks on the name,
+ // operation and flags carried by xattr_op.
+ int Server::default_xattr_validate(CInode *cur, const InodeStoreBase::xattr_map_const_ptr xattrs,
+                                    XattrOp *xattr_op) {
+   const XattrOp &request = *xattr_op;
+   return xattr_validate(cur, xattrs, request.xattr_name, request.op, request.flags);
+ }
+
+ // Default setxattr: store the value carried by xattr_op under its own name.
+ // 'cur' is not used.
+ void Server::default_setxattr_handler(CInode *cur, InodeStoreBase::xattr_map_ptr xattrs,
+                                       const XattrOp &xattr_op) {
+   const auto &key = xattr_op.xattr_name;
+   const auto &payload = xattr_op.xattr_value;
+   xattr_set(xattrs, key, payload);
+ }
+
+ // Default removexattr: erase the entry named by xattr_op.
+ // 'cur' is not used.
+ void Server::default_removexattr_handler(CInode *cur, InodeStoreBase::xattr_map_ptr xattrs,
+                                          const XattrOp &xattr_op) {
+   const auto &key = xattr_op.xattr_name;
+   xattr_rm(xattrs, key);
+ }
+
+ // mirror info xattr handlers
+
+ // Pattern a mirror info value must match in full:
+ //   "cluster_id=<8-4-4-4-12 groups of lowercase hex digits> fs_id=<digits>"
+ // Capture group 1 is the cluster id, group 2 the fs id.
+ const std::string Server::MirrorXattrInfo::MIRROR_INFO_REGEX =
+   "^cluster_id=([a-f0-9]{8}-"
+   "[a-f0-9]{4}-[a-f0-9]{4}-"
+   "[a-f0-9]{4}-[a-f0-9]{12})"
+   " fs_id=(\\d+)$";
+
+ // names of the two xattrs the mirror info is stored under
+ const std::string Server::MirrorXattrInfo::CLUSTER_ID = "ceph.mirror.info.cluster_id";
+ const std::string Server::MirrorXattrInfo::FS_ID = "ceph.mirror.info.fs_id";
+ /*
+  * Split a mirror info xattr value into its cluster id and fs id.
+  *
+  * 'value' must match MirrorXattrInfo::MIRROR_INFO_REGEX. On success the two
+  * captured fields are stored in cluster_id and fs_id and 0 is returned;
+  * otherwise -CEPHFS_EINVAL is returned and the outputs are left untouched.
+  * 'name' is used for logging only.
+  */
+ int Server::parse_mirror_info_xattr(const std::string &name, const std::string &value,
+                                     std::string &cluster_id, std::string &fs_id) {
+   dout(20) << "parsing name=" << name << ", value=" << value << dendl;
+
+   // compiled once, on first use
+   static const std::regex regex(Server::MirrorXattrInfo::MIRROR_INFO_REGEX);
+   std::smatch match;
+
+   // a successful match has 3 entries: the whole match plus two groups
+   const bool found = std::regex_search(value, match, regex);
+   if (!found || match.size() != 3) {
+     derr << "mirror info parse error" << dendl;
+     return -CEPHFS_EINVAL;
+   }
+
+   cluster_id = match[1].str();
+   fs_id = match[2].str();
+   dout(20) << " parsed cluster_id=" << cluster_id << ", fs_id=" << fs_id << dendl;
+   return 0;
+ }
+
+ /*
+  * Validate a set/remove of "ceph.mirror.info".
+  *
+  * Only permitted on the root inode. The generic xattr_validate() checks are
+  * run for both underlying xattrs (cluster id and fs id) and must agree. For
+  * a set operation the value is parsed and the result is attached to
+  * xattr_op->xinfo for the setxattr handler.
+  *
+  * Returns 0 on success or a negative CEPHFS_* error code.
+  */
+ int Server::mirror_info_xattr_validate(CInode *cur, const InodeStoreBase::xattr_map_const_ptr xattrs,
+                                        XattrOp *xattr_op) {
+   if (!cur->is_root()) {
+     return -CEPHFS_EINVAL;
+   }
+
+   const auto op = xattr_op->op;
+   const auto flags = xattr_op->flags;
+
+   const int cluster_rc = xattr_validate(cur, xattrs, Server::MirrorXattrInfo::CLUSTER_ID, op, flags);
+   const int fs_rc = xattr_validate(cur, xattrs, Server::MirrorXattrInfo::FS_ID, op, flags);
+   if (cluster_rc != fs_rc) {
+     // the two xattrs are expected to be present or absent together
+     derr << "inconsistent mirror info state (" << cluster_rc << "," << fs_rc << ")" << dendl;
+     return -CEPHFS_EINVAL;
+   }
+   if (cluster_rc < 0) {
+     return cluster_rc;
+   }
+
+   // removal carries no value, so there is nothing to parse
+   if (op == CEPH_MDS_OP_RMXATTR) {
+     return 0;
+   }
+
+   std::string cluster_id;
+   std::string fs_id;
+   if (int r = parse_mirror_info_xattr(xattr_op->xattr_name, xattr_op->xattr_value.to_str(),
+                                       cluster_id, fs_id); r < 0) {
+     return r;
+   }
+
+   xattr_op->xinfo = std::make_unique<MirrorXattrInfo>(cluster_id, fs_id);
+   return 0;
+ }
+
+ /*
+  * Store the mirror info attached to xattr_op (xattr_op.xinfo, which must be
+  * a MirrorXattrInfo) as two separate xattrs: the cluster id under
+  * MirrorXattrInfo::CLUSTER_ID and the fs id under MirrorXattrInfo::FS_ID.
+  * 'cur' is not used.
+  */
+ void Server::mirror_info_setxattr_handler(CInode *cur, InodeStoreBase::xattr_map_ptr xattrs,
+                                           const XattrOp &xattr_op) {
+   // bind by reference: a plain 'auto' here would copy the whole object
+   const auto &mirror_info = dynamic_cast<MirrorXattrInfo&>(*(xattr_op.xinfo));
+
+   bufferlist bl;
+   bl.append(mirror_info.cluster_id.c_str(), mirror_info.cluster_id.length());
+   xattr_set(xattrs, Server::MirrorXattrInfo::CLUSTER_ID, bl);
+
+   // reuse the bufferlist for the second value
+   bl.clear();
+   bl.append(mirror_info.fs_id.c_str(), mirror_info.fs_id.length());
+   xattr_set(xattrs, Server::MirrorXattrInfo::FS_ID, bl);
+ }
+
+ // Remove both xattrs the mirror info is stored under (cluster id first,
+ // then fs id). 'cur' and 'xattr_op' are not used.
+ void Server::mirror_info_removexattr_handler(CInode *cur, InodeStoreBase::xattr_map_ptr xattrs,
+                                              const XattrOp &xattr_op) {
+   const std::string *const mirror_keys[] = {
+     &Server::MirrorXattrInfo::CLUSTER_ID,
+     &Server::MirrorXattrInfo::FS_ID,
+   };
+   for (const std::string *key : mirror_keys) {
+     xattr_rm(xattrs, *key);
+   }
+ }
+
+ /*
+  * Handle a client SETXATTR request.
+  *
+  * ceph virtual xattrs are handed to handle_set_vxattr(). For everything
+  * else: resolve and lock the inode, check write access, enforce the limit
+  * on the total size of the inode's xattr key/value pairs, let the handler
+  * registered for the name validate the operation, then project, apply and
+  * journal the change.
+  */
+ void Server::handle_client_setxattr(MDRequestRef& mdr)
+ {
+   const cref_t<MClientRequest> &req = mdr->client_request;
+   string name(req->get_path2());
+
+   // is a ceph virtual xattr?
+   if (is_ceph_vxattr(name)) {
+     // can't use rdlock_path_pin_ref because we need to xlock snaplock/policylock
+     CInode *cur = try_get_auth_inode(mdr, req->get_filepath().get_ino());
+     if (!cur)
+       return;
+
+     handle_set_vxattr(mdr, cur);
+     return;
+   }
+
+   if (!is_allowed_ceph_xattr(name)) {
+     respond_to_request(mdr, -CEPHFS_EINVAL);
+     return;
+   }
+
+   CInode *cur = rdlock_path_pin_ref(mdr, true);
+   if (!cur)
+     return;
+
+   // snapshots are read-only
+   if (mdr->snapid != CEPH_NOSNAP) {
+     respond_to_request(mdr, -CEPHFS_EROFS);
+     return;
+   }
+
+   int flags = req->head.args.setxattr.flags;
+
+   MutationImpl::LockOpVec lov;
+   lov.add_xlock(&cur->xattrlock);
+   if (!mds->locker->acquire_locks(mdr, lov))
+     return;
+
+   if (!check_access(mdr, cur, MAY_WRITE))
+     return;
+
+   size_t len = req->get_data().length();
+   // bytes this request adds: value plus name
+   size_t inc = len + name.length();
+
+   auto handler = Server::get_xattr_or_default_handler(name);
+   const auto& pxattrs = cur->get_projected_xattrs();
+
+   // check xattrs kv pairs size. The existing total is 0 when the inode has
+   // no xattr map yet; the limit itself is enforced in either case, so that
+   // the very first xattr set on an inode cannot exceed it.
+   size_t cur_xattrs_size = 0;
+   if (pxattrs) {
+     for (const auto& p : *pxattrs) {
+       // with XATTR_REPLACE the old entry of this name goes away
+       if ((flags & CEPH_XATTR_REPLACE) && name.compare(p.first) == 0) {
+         continue;
+       }
+       cur_xattrs_size += p.first.length() + p.second.length();
+     }
+   }
+
+   if ((cur_xattrs_size + inc) > g_conf()->mds_max_xattr_pairs_size) {
+     dout(10) << "xattr kv pairs size too big. cur_xattrs_size "
+              << cur_xattrs_size << ", inc " << inc << dendl;
+     respond_to_request(mdr, -CEPHFS_ENOSPC);
+     return;
+   }
+
+   XattrOp xattr_op(CEPH_MDS_OP_SETXATTR, name, req->get_data(), flags);
+   int r = std::invoke(handler->validate, this, cur, pxattrs, &xattr_op);
+   if (r < 0) {
+     respond_to_request(mdr, r);
+     return;
+   }
+
+   dout(10) << "setxattr '" << name << "' len " << len << " on " << *cur << dendl;
+
+   // project update
+   auto pi = cur->project_inode(mdr, true);
+   pi.inode->version = cur->pre_dirty();
+   pi.inode->ctime = mdr->get_op_stamp();
+   if (mdr->get_op_stamp() > pi.inode->rstat.rctime)
+     pi.inode->rstat.rctime = mdr->get_op_stamp();
+   pi.inode->change_attr++;
+   pi.inode->xattr_version++;
+
+   // a SETXATTR carrying CEPH_XATTR_REMOVE is treated as a removal
+   if ((flags & CEPH_XATTR_REMOVE)) {
+     std::invoke(handler->removexattr, this, cur, pi.xattrs, xattr_op);
+   } else {
+     std::invoke(handler->setxattr, this, cur, pi.xattrs, xattr_op);
+   }
+
+   // log + wait
+   mdr->ls = mdlog->get_current_segment();
+   EUpdate *le = new EUpdate(mdlog, "setxattr");
+   mdlog->start_entry(le);
+   le->metablob.add_client_req(req->get_reqid(), req->get_oldest_client_tid());
+   mdcache->predirty_journal_parents(mdr, &le->metablob, cur, 0, PREDIRTY_PRIMARY);
+   mdcache->journal_dirty_inode(mdr.get(), &le->metablob, cur);
+
+   journal_and_reply(mdr, cur, 0, le, new C_MDS_inode_update_finish(this, mdr, cur));
+ }
+
+ /*
+  * Handle a client RMXATTR request.
+  *
+  * ceph virtual xattrs are handed to handle_remove_vxattr(). For everything
+  * else: resolve and lock the inode, check write access, let the handler
+  * registered for the name validate the removal, then project, apply and
+  * journal the change.
+  */
+ void Server::handle_client_removexattr(MDRequestRef& mdr)
+ {
+   const cref_t<MClientRequest> &req = mdr->client_request;
+   std::string name(req->get_path2());
+
+   // is a ceph virtual xattr?
+   if (is_ceph_vxattr(name)) {
+     // can't use rdlock_path_pin_ref because we need to xlock snaplock/policylock
+     CInode *cur = try_get_auth_inode(mdr, req->get_filepath().get_ino());
+     if (!cur)
+       return;
+
+     handle_remove_vxattr(mdr, cur);
+     return;
+   }
+
+   if (!is_allowed_ceph_xattr(name)) {
+     respond_to_request(mdr, -CEPHFS_EINVAL);
+     return;
+   }
+
+   CInode* cur = rdlock_path_pin_ref(mdr, true);
+   if (!cur)
+     return;
+
+   // snapshots are read-only
+   if (mdr->snapid != CEPH_NOSNAP) {
+     respond_to_request(mdr, -CEPHFS_EROFS);
+     return;
+   }
+
+   MutationImpl::LockOpVec lov;
+   lov.add_xlock(&cur->xattrlock);
+   if (!mds->locker->acquire_locks(mdr, lov))
+     return;
+
+   // removing an xattr modifies the inode just like setting one does, so
+   // require the same write access that handle_client_setxattr() requires
+   // (including for its CEPH_XATTR_REMOVE path)
+   if (!check_access(mdr, cur, MAY_WRITE))
+     return;
+
+   auto handler = Server::get_xattr_or_default_handler(name);
+   // a removal carries no value
+   bufferlist bl;
+   XattrOp xattr_op(CEPH_MDS_OP_RMXATTR, name, bl, 0);
+
+   const auto& pxattrs = cur->get_projected_xattrs();
+   int r = std::invoke(handler->validate, this, cur, pxattrs, &xattr_op);
+   if (r < 0) {
+     respond_to_request(mdr, r);
+     return;
+   }
+
+   dout(10) << "removexattr '" << name << "' on " << *cur << dendl;
+
+   // project update
+   auto pi = cur->project_inode(mdr, true);
+   pi.inode->version = cur->pre_dirty();
+   pi.inode->ctime = mdr->get_op_stamp();
+   if (mdr->get_op_stamp() > pi.inode->rstat.rctime)
+     pi.inode->rstat.rctime = mdr->get_op_stamp();
+   pi.inode->change_attr++;
+   pi.inode->xattr_version++;
+   std::invoke(handler->removexattr, this, cur, pi.xattrs, xattr_op);
+
+   // log + wait
+   mdr->ls = mdlog->get_current_segment();
+   EUpdate *le = new EUpdate(mdlog, "removexattr");
+   mdlog->start_entry(le);
+   le->metablob.add_client_req(req->get_reqid(), req->get_oldest_client_tid());
+   mdcache->predirty_journal_parents(mdr, &le->metablob, cur, 0, PREDIRTY_PRIMARY);
+   mdcache->journal_dirty_inode(mdr.get(), &le->metablob, cur);
+
+   journal_and_reply(mdr, cur, 0, le, new C_MDS_inode_update_finish(this, mdr, cur));
+ }
+
+ /*
+  * Handle a client request to read a ceph virtual xattr.
+  *
+  * Values produced here:
+  *   ceph.{dir,file}.layout.json       layout as JSON, including whether it
+  *                                     is the default, set on this inode, or
+  *                                     inherited from an ancestor
+  *   ceph.{dir,file}.layout.pool_name  name of the layout's pool (empty if
+  *                                     the pool is not in our OSDMap)
+  *   ceph.{dir,file}.layout.pool_id    id of the layout's pool
+  *   ceph.dir.pin                      export pin
+  *   ceph.dir.pin.random               ephemeral random pin
+  *   ceph.dir.pin.distributed          ephemeral distributed pin
+  * Every other name is answered with -CEPHFS_ENODATA. On success the value
+  * is encoded as a string into mdr->reply_extra_bl.
+  */
+ void Server::handle_client_getvxattr(MDRequestRef& mdr)
+ {
+   const auto& req = mdr->client_request;
+   string xattr_name{req->get_path2()};
+
+   // is a ceph virtual xattr?
+   if (!is_ceph_vxattr(xattr_name)) {
+     respond_to_request(mdr, -CEPHFS_ENODATA);
+     return;
+   }
+
+   CInode *cur = rdlock_path_pin_ref(mdr, true, false);
+   if (!cur) {
+     return;
+   }
+
+   // dir vxattrs exist on directories only, file vxattrs on non-directories
+   if (is_ceph_dir_vxattr(xattr_name)) {
+     if (!cur->is_dir()) {
+       respond_to_request(mdr, -CEPHFS_ENODATA);
+       return;
+     }
+   } else if (is_ceph_file_vxattr(xattr_name)) {
+     if (cur->is_dir()) {
+       respond_to_request(mdr, -CEPHFS_ENODATA);
+       return;
+     }
+   }
+
+   CachedStackStringStream css;
+   int r = 0;
+   ceph::bufferlist bl;
+   // handle these vxattrs
+   if ((xattr_name.substr(0, 15) == "ceph.dir.layout"sv) ||
+       (xattr_name.substr(0, 16) == "ceph.file.layout"sv)) {
+     // a layout together with where it came from
+     struct layout_xattr_info_t {
+       enum class InheritanceStatus : uint32_t {
+         DEFAULT = 0,
+         SET = 1,
+         INHERITED = 2
+       };
+
+       const file_layout_t layout;
+       const InheritanceStatus status;
+
+       layout_xattr_info_t(const file_layout_t& l, InheritanceStatus inh)
+         : layout(l), status(inh) { }
+
+       static std::string status_to_string(InheritanceStatus status) {
+         switch (status) {
+           case InheritanceStatus::DEFAULT: return "default"s;
+           case InheritanceStatus::SET: return "set"s;
+           case InheritanceStatus::INHERITED: return "inherited"s;
+           default: return "unknown"s;
+         }
+       }
+     };
+
+     auto is_default_layout = [&](const file_layout_t& layout) -> bool {
+       return (layout == mdcache->default_file_layout);
+     };
+     // Walk from 'cur' towards the root and return the first layout found,
+     // classified as DEFAULT (equal to the fs default layout), SET (found on
+     // the starting inode) or INHERITED (found on an ancestor).
+     auto get_inherited_layout = [&](CInode *cur) -> layout_xattr_info_t {
+       auto orig_in = cur;
+
+       while (cur) {
+         if (cur->get_projected_inode()->has_layout()) {
+           auto& curr_layout = cur->get_projected_inode()->layout;
+           if (is_default_layout(curr_layout)) {
+             return {curr_layout, layout_xattr_info_t::InheritanceStatus::DEFAULT};
+           }
+           if (cur == orig_in) {
+             // we've found a new layout at this inode
+             return {curr_layout, layout_xattr_info_t::InheritanceStatus::SET};
+           } else {
+             return {curr_layout, layout_xattr_info_t::InheritanceStatus::INHERITED};
+           }
+         }
+
+         if (cur->is_root()) {
+           break;
+         }
+
+         cur = cur->get_projected_parent_dir()->get_inode();
+       }
+       mds->clog->error() << "no layout found at root dir!";
+       ceph_abort("no layout found at root dir! something is really messed up with layouts!");
+     };
+
+     // The with_osdmap() callbacks below run before with_osdmap() returns,
+     // so they can refer to the local layout info by reference instead of
+     // taking a copy of it.
+     if (xattr_name == "ceph.dir.layout.json"sv ||
+         xattr_name == "ceph.file.layout.json"sv) {
+       // fetch layout only for valid xattr_name
+       const auto lxi = get_inherited_layout(cur);
+
+       *css << "{\"stripe_unit\": " << lxi.layout.stripe_unit
+            << ", \"stripe_count\": " << lxi.layout.stripe_count
+            << ", \"object_size\": " << lxi.layout.object_size
+            << ", \"pool_name\": ";
+       mds->objecter->with_osdmap([&lxi, &css](const OSDMap& o) {
+           *css << "\"";
+           if (o.have_pg_pool(lxi.layout.pool_id)) {
+             *css << o.get_pool_name(lxi.layout.pool_id);
+           }
+           *css << "\"";
+         });
+       *css << ", \"pool_id\": " << (uint64_t)lxi.layout.pool_id;
+       *css << ", \"pool_namespace\": \"" << lxi.layout.pool_ns << "\"";
+       *css << ", \"inheritance\": \"@"
+            << layout_xattr_info_t::status_to_string(lxi.status) << "\"}";
+     } else if ((xattr_name == "ceph.dir.layout.pool_name"sv) ||
+                (xattr_name == "ceph.file.layout.pool_name"sv)) {
+       // fetch layout only for valid xattr_name
+       const auto lxi = get_inherited_layout(cur);
+       mds->objecter->with_osdmap([&lxi, &css](const OSDMap& o) {
+           if (o.have_pg_pool(lxi.layout.pool_id)) {
+             *css << o.get_pool_name(lxi.layout.pool_id);
+           }
+         });
+     } else if ((xattr_name == "ceph.dir.layout.pool_id"sv) ||
+                (xattr_name == "ceph.file.layout.pool_id"sv)) {
+       // fetch layout only for valid xattr_name
+       const auto lxi = get_inherited_layout(cur);
+       *css << (uint64_t)lxi.layout.pool_id;
+     } else {
+       r = -CEPHFS_ENODATA; // no such attribute
+     }
+   } else if (xattr_name.substr(0, 12) == "ceph.dir.pin"sv) {
+     if (xattr_name == "ceph.dir.pin"sv) {
+       *css << cur->get_projected_inode()->export_pin;
+     } else if (xattr_name == "ceph.dir.pin.random"sv) {
+       *css << cur->get_projected_inode()->export_ephemeral_random_pin;
+     } else if (xattr_name == "ceph.dir.pin.distributed"sv) {
+       *css << cur->get_projected_inode()->export_ephemeral_distributed_pin;
+     } else {
+       // otherwise respond as invalid request
+       // since we only handle ceph vxattrs here
+       r = -CEPHFS_ENODATA; // no such attribute
+     }
+   } else {
+     // otherwise respond as invalid request
+     // since we only handle ceph vxattrs here
+     r = -CEPHFS_ENODATA; // no such attribute
+   }
+
+   // hand the value back to the client as an encoded string
+   if (r == 0) {
+     ENCODE_START(1, 1, bl);
+     encode(css->strv(), bl);
+     ENCODE_FINISH(bl);
+     mdr->reply_extra_bl = bl;
+   }
+
+   respond_to_request(mdr, r);
+ }
+
+// =================================================================
+// DIRECTORY and NAMESPACE OPS
+
+
+// ------------------------------------------------
+
+// MKNOD
+
+ /*
+  * Log context run once the journal entry that creates a new inode has been
+  * handled: links the new inode into its dentry, marks it dirty, applies the
+  * request's projected changes and replies to the client. It also handles
+  * the case where the new inode is a directory.
+  */
+ class C_MDS_mknod_finish : public ServerLogContext {
+   CDentry *dn;   // dentry the new inode is linked under
+   CInode *newi;  // the newly created inode
+ public:
+   C_MDS_mknod_finish(Server *s, MDRequestRef& r, CDentry *d, CInode *ni) :
+     ServerLogContext(s, r), dn(d), newi(ni) {}
+
+   void finish(int r) override {
+     ceph_assert(r == 0);
+
+     // crash current MDS and the replacing MDS will test the journal
+     ceph_assert(!g_conf()->mds_kill_skip_replaying_inotable);
+
+     // link the inode
+     dn->pop_projected_linkage();
+
+     // We didn't bother projecting a new version of the inode since it has
+     // just been created, so it is marked dirty directly.
+     // NOTE(review): an older comment here spoke of decrementing the inode
+     // version to keep mark_dirty() happy; nothing is decremented in this
+     // function.
+     newi->mark_dirty(mdr->ls);
+     newi->mark_dirty_parent(mdr->ls, true);
+
+     // mkdir?
+     const bool created_dir = newi->is_dir();
+     if (created_dir) {
+       CDir *rootfrag = newi->get_dirfrag(frag_t());
+       ceph_assert(rootfrag);
+       rootfrag->mark_dirty(mdr->ls);
+       rootfrag->mark_new(mdr->ls);
+     }
+
+     mdr->apply();
+
+     MDRequestRef no_request;
+     get_mds()->mdcache->send_dentry_link(dn, no_request);
+
+     if (newi->is_file()) {
+       get_mds()->locker->share_inode_max_size(newi);
+     } else if (newi->is_dir()) {
+       // We do this now so that the linkages on the new directory are stable.
+       newi->maybe_ephemeral_rand();
+     }
+
+     // hit pop
+     get_mds()->balancer->hit_inode(newi, META_POP_IWR);
+
+     // reply
+     server->respond_to_request(mdr, 0);
+   }
+ };
+
+
+ /*
+  * Handle a client MKNOD request: create a new inode under an xlocked, null
+  * dentry and journal the creation. A mode without a file type is treated
+  * as a regular file; for regular files the creating client is also issued
+  * a capability and an initial client range.
+  */
+ void Server::handle_client_mknod(MDRequestRef& mdr)
+ {
+   const cref_t<MClientRequest> &req = mdr->client_request;
+   client_t client = mdr->get_client();
+
+   // no file type given: default to a regular file
+   unsigned mode = req->head.args.mknod.mode;
+   if (!(mode & S_IFMT))
+     mode |= S_IFREG;
+
+   mdr->disable_lock_cache();
+   CDentry *dn = rdlock_path_xlock_dentry(mdr, true, false, false, S_ISREG(mode));
+   if (!dn)
+     return;
+
+   CDir *dir = dn->get_dir();
+   CInode *diri = dir->get_inode();
+   // checked in this order; each helper has dealt with mdr when it fails
+   if (!check_access(mdr, diri, MAY_WRITE) ||
+       !check_fragment_space(mdr, dir) ||
+       !check_dir_max_entries(mdr, dir))
+     return;
+
+   ceph_assert(dn->get_projected_linkage()->is_null());
+   if (req->get_alternate_name().size() > alternate_name_max) {
+     dout(10) << " alternate_name longer than " << alternate_name_max << dendl;
+     respond_to_request(mdr, -CEPHFS_ENAMETOOLONG);
+     return;
+   }
+   dn->set_alternate_name(req->get_alternate_name());
+
+   // set layout: the layout recorded on the request if there is one,
+   // otherwise the file system default
+   file_layout_t layout = (mdr->dir_layout != file_layout_t())
+     ? mdr->dir_layout
+     : mdcache->default_file_layout;
+
+   CInode *newi = prepare_new_inode(mdr, dir, inodeno_t(req->head.ino), mode, &layout);
+   ceph_assert(newi);
+
+   dn->push_projected_linkage(newi);
+
+   auto _inode = newi->_get_inode();
+   _inode->version = dn->pre_dirty();
+   _inode->rdev = req->head.args.mknod.rdev;
+   _inode->rstat.rfiles = 1;
+   _inode->accounted_rstat = _inode->rstat;
+   if (layout.pool_id != mdcache->default_file_layout.pool_id)
+     _inode->add_old_pool(mdcache->default_file_layout.pool_id);
+   _inode->update_backtrace();
+
+   snapid_t follows = mdcache->get_global_snaprealm()->get_newest_seq();
+   SnapRealm *realm = dir->inode->find_snaprealm();
+   ceph_assert(follows >= realm->get_newest_seq());
+
+   // if the client created a _regular_ file via MKNOD, it's highly likely they'll
+   // want to write to it (e.g., if they are reexporting NFS)
+   if (S_ISREG(_inode->mode)) {
+     // issue a cap on the file
+     Capability *cap = mds->locker->issue_new_caps(newi, CEPH_FILE_MODE_RDWR, mdr, realm);
+     if (cap) {
+       cap->set_wanted(0);
+
+       // put locks in excl mode
+       newi->filelock.set_state(LOCK_EXCL);
+       newi->authlock.set_state(LOCK_EXCL);
+       newi->xattrlock.set_state(LOCK_EXCL);
+
+       dout(15) << " setting a client_range too, since this is a regular file" << dendl;
+       auto &crange = _inode->client_ranges[client];
+       crange.range.first = 0;
+       crange.range.last = _inode->layout.stripe_unit;
+       crange.follows = follows;
+       newi->mark_clientwriteable();
+       cap->mark_clientwriteable();
+     }
+   }
+
+   ceph_assert(dn->first == follows + 1);
+   newi->first = dn->first;
+
+   dout(10) << "mknod mode " << _inode->mode << " rdev " << _inode->rdev << dendl;
+
+   // prepare finisher
+   mdr->ls = mdlog->get_current_segment();
+   EUpdate *le = new EUpdate(mdlog, "mknod");
+   mdlog->start_entry(le);
+   le->metablob.add_client_req(req->get_reqid(), req->get_oldest_client_tid());
+   journal_allocated_inos(mdr, &le->metablob);
+
+   mdcache->predirty_journal_parents(mdr, &le->metablob, newi, dir,
+                                     PREDIRTY_PRIMARY|PREDIRTY_DIR, 1);
+   le->metablob.add_primary_dentry(dn, newi, true, true, true);
+
+   journal_and_reply(mdr, newi, dn, le, new C_MDS_mknod_finish(this, mdr, dn, newi));
+   mds->balancer->maybe_fragment(dir, false);
+ }
+
+
+
+// MKDIR
+/* This function takes responsibility for the passed mdr*/
+void Server::handle_client_mkdir(MDRequestRef& mdr)
+{
+  const cref_t<MClientRequest> &creq = mdr->client_request;
+
+  // Resolve the path and xlock the dentry that will name the new directory.
+  mdr->disable_lock_cache();
+  CDentry *new_dn = rdlock_path_xlock_dentry(mdr, true);
+  if (!new_dn)
+    return;
+
+  CDir *parent_frag = new_dn->get_dir();
+  CInode *parent_in = parent_frag->get_inode();
+
+  // Write access on the parent, then the fragment-space and max-entries
+  // checks, in that order; the first helper that returns false ends
+  // handling of the request in this function.
+  if (!check_access(mdr, parent_in, MAY_WRITE) ||
+      !check_fragment_space(mdr, parent_frag) ||
+      !check_dir_max_entries(mdr, parent_frag))
+    return;
+
+  ceph_assert(new_dn->get_projected_linkage()->is_null());
+
+  // Reject over-long alternate names before touching the dentry.
+  if (creq->get_alternate_name().size() > alternate_name_max) {
+    dout(10) << " alternate_name longer than " << alternate_name_max << dendl;
+    respond_to_request(mdr, -CEPHFS_ENAMETOOLONG);
+    return;
+  }
+  new_dn->set_alternate_name(creq->get_alternate_name());
+
+  // Keep the requested permission bits but force the file type to directory.
+  unsigned dir_mode = creq->head.args.mkdir.mode;
+  dir_mode = (dir_mode & ~S_IFMT) | S_IFDIR;
+  CInode *new_in = prepare_new_inode(mdr, parent_frag, inodeno_t(creq->head.ino), dir_mode);
+  ceph_assert(new_in);
+
+  // Project the primary linkage dentry -> new inode.
+  new_dn->push_projected_linkage(new_in);
+
+  auto inode_rec = new_in->_get_inode();
+  inode_rec->version = new_dn->pre_dirty();
+  inode_rec->rstat.rsubdirs = 1;
+  inode_rec->accounted_rstat = inode_rec->rstat;
+  inode_rec->update_backtrace();
+
+  // The dentry must start right after the newest global snapshot sequence.
+  snapid_t newest_seq = mdcache->get_global_snaprealm()->get_newest_seq();
+  SnapRealm *parent_realm = new_dn->get_dir()->inode->find_snaprealm();
+  ceph_assert(newest_seq >= parent_realm->get_newest_seq());
+
+  dout(12) << " follows " << newest_seq << dendl;
+  ceph_assert(new_dn->first == newest_seq + 1);
+  new_in->first = new_dn->first;
+
+  // Open the (single, empty) dirfrag of the new directory and mark it
+  // creating + complete.
+  CDir *fresh_frag = new_in->get_or_open_dirfrag(mdcache, frag_t());
+  fresh_frag->state_set(CDir::STATE_CREATING);
+  fresh_frag->mark_complete();
+  fresh_frag->_get_fnode()->version = fresh_frag->pre_dirty();
+
+  // Build the journal event.
+  mdr->ls = mdlog->get_current_segment();
+  EUpdate *ev = new EUpdate(mdlog, "mkdir");
+  mdlog->start_entry(ev);
+  ev->metablob.add_client_req(creq->get_reqid(), creq->get_oldest_client_tid());
+  journal_allocated_inos(mdr, &ev->metablob);
+  mdcache->predirty_journal_parents(mdr, &ev->metablob, new_in, new_dn->get_dir(),
+                                    PREDIRTY_PRIMARY|PREDIRTY_DIR, 1);
+  ev->metablob.add_primary_dentry(new_dn, new_in, true, true);
+  ev->metablob.add_new_dir(fresh_frag); // dirty AND complete AND new
+
+  // Issue a cap on the new directory; when one is granted, clear its wanted
+  // set and put the file/auth/xattr locks into the exclusive state.
+  const int cap_mode = CEPH_FILE_MODE_RDWR;
+  if (Capability *new_cap = mds->locker->issue_new_caps(new_in, cap_mode, mdr, parent_realm)) {
+    new_cap->set_wanted(0);
+
+    new_in->filelock.set_state(LOCK_EXCL);
+    new_in->authlock.set_state(LOCK_EXCL);
+    new_in->xattrlock.set_state(LOCK_EXCL);
+  }
+
+  // Record the inode as opened so that it lands in the journal.
+  ev->metablob.add_opened_ino(new_in->ino());
+
+  journal_and_reply(mdr, new_in, new_dn, ev, new C_MDS_mknod_finish(this, mdr, new_dn, new_in));
+
+  // hit_dir (via hit_inode) only happens in the finish callback; with several
+  // mkdirs in flight the parent may already exceed the split size by then,
+  // so give the balancer an early chance to fragment it.
+  mds->balancer->maybe_fragment(parent_frag, false);
+}
+
+
+// SYMLINK
+
+/*
+ * Handle a client SYMLINK request.
+ *
+ * Resolves and xlocks the new dentry, validates access and directory limits,
+ * allocates a symlink inode whose target is the request's second path, and
+ * journals the creation as an EUpdate("symlink").  The client reply is sent
+ * through journal_and_reply() / C_MDS_mknod_finish, or directly via
+ * respond_to_request() on a validation failure.
+ */
+void Server::handle_client_symlink(MDRequestRef& mdr)
+{
+  const auto& req = mdr->client_request;
+
+  mdr->disable_lock_cache();
+  CDentry *dn = rdlock_path_xlock_dentry(mdr, true);
+  if (!dn)
+    return;
+
+  CDir *dir = dn->get_dir();
+  CInode *diri = dir->get_inode();
+
+  if (!check_access(mdr, diri, MAY_WRITE))
+    return;
+  if (!check_fragment_space(mdr, dir))
+    return;
+  if (!check_dir_max_entries(mdr, dir))
+    return;
+
+  ceph_assert(dn->get_projected_linkage()->is_null());
+  if (req->get_alternate_name().size() > alternate_name_max) {
+    dout(10) << " alternate_name longer than " << alternate_name_max << dendl;
+    respond_to_request(mdr, -CEPHFS_ENAMETOOLONG);
+    // The request has been answered with an error: stop here, exactly as the
+    // mkdir and link handlers do.  Falling through would allocate an inode,
+    // journal the update and reply a second time.
+    return;
+  }
+  dn->set_alternate_name(req->get_alternate_name());
+
+  unsigned mode = S_IFLNK | 0777;
+  CInode *newi = prepare_new_inode(mdr, dir, inodeno_t(req->head.ino), mode);
+  ceph_assert(newi);
+
+  // it's a symlink
+  dn->push_projected_linkage(newi);
+
+  // The link target is the request's second path; the inode size and the
+  // recursive byte count are both set to the target string's length.
+  newi->symlink = req->get_path2();
+  auto _inode = newi->_get_inode();
+  _inode->version = dn->pre_dirty();
+  _inode->size = newi->symlink.length();
+  _inode->rstat.rbytes = _inode->size;
+  _inode->rstat.rfiles = 1;
+  _inode->accounted_rstat = _inode->rstat;
+  _inode->update_backtrace();
+
+  newi->first = dn->first;
+
+  // prepare finisher
+  mdr->ls = mdlog->get_current_segment();
+  EUpdate *le = new EUpdate(mdlog, "symlink");
+  mdlog->start_entry(le);
+  le->metablob.add_client_req(req->get_reqid(), req->get_oldest_client_tid());
+  journal_allocated_inos(mdr, &le->metablob);
+  mdcache->predirty_journal_parents(mdr, &le->metablob, newi, dn->get_dir(), PREDIRTY_PRIMARY|PREDIRTY_DIR, 1);
+  le->metablob.add_primary_dentry(dn, newi, true, true);
+
+  journal_and_reply(mdr, newi, dn, le, new C_MDS_mknod_finish(this, mdr, dn, newi));
+  mds->balancer->maybe_fragment(dir, false);
+
+  // flush the journal as soon as possible
+  // NOTE(review): mds_kill_skip_replaying_inotable looks like a test/fault
+  // injection option -- confirm against the config reference.
+  if (g_conf()->mds_kill_skip_replaying_inotable) {
+    mdlog->flush();
+  }
+}
+
+
+
+
+
+// LINK
+
+void Server::handle_client_link(MDRequestRef& mdr)
+{ /* Handle a client LINK (hard link) request.
+   *
+   * Resolves the target inode and the destination dentry, takes the locks
+   * needed on the target, validates the request, and then dispatches to
+   * _link_local() when this MDS is auth for the target inode or to
+   * _link_remote() otherwise.  Every early return either follows a
+   * respond_to_request() call or follows a helper that returned
+   * null/false.
+   */
+ const cref_t<MClientRequest> &req = mdr->client_request;
+
+ dout(7) << "handle_client_link " << req->get_filepath()
+ << " to " << req->get_filepath2()
+ << dendl;
+
+ mdr->disable_lock_cache();
+
+ CDentry *destdn; // dentry that will become the new link
+ CInode *targeti; // inode being linked to
+
+ if (req->get_filepath2().depth() == 0) { // path2 has no components: target is named by ino only
+ targeti = mdcache->get_inode(req->get_filepath2().get_ino());
+ if (!targeti) { // not in our cache: ask peers to locate the ino; C_MDS_TryFindInode is the completion
+ dout(10) << "CEPHFS_ESTALE on path2, attempting recovery" << dendl;
+ inodeno_t ino = req->get_filepath2().get_ino();
+ mdcache->find_ino_peers(ino, new C_MDS_TryFindInode(this, mdr, mdcache, ino));
+ return;
+ }
+ mdr->pin(targeti);
+
+ if (!(mdr->locking_state & MutationImpl::SNAP2_LOCKED)) { // flag is set below, so this runs once per request
+ CDentry *pdn = targeti->get_projected_parent_dn();
+ if (!pdn) {
+ dout(7) << "target has no parent dn, failing..." << dendl;
+ respond_to_request(mdr, -CEPHFS_EINVAL);
+ return;
+ }
+ if (!mds->locker->try_rdlock_snap_layout(pdn->get_dir()->get_inode(), mdr, 1)) // lock taken on the target's parent directory inode
+ return;
+ mdr->locking_state |= MutationImpl::SNAP2_LOCKED;
+ }
+
+ destdn = rdlock_path_xlock_dentry(mdr, false);
+ if (!destdn)
+ return;
+ } else { // path2 is a real path: lock both paths in one step
+ auto ret = rdlock_two_paths_xlock_destdn(mdr, false); // pair: .first is the destination dentry, .second the dentry the target is read from
+ destdn = ret.first;
+ if (!destdn)
+ return;
+
+ if (!destdn->get_projected_linkage()->is_null()) { // destination name is already linked
+ respond_to_request(mdr, -CEPHFS_EEXIST);
+ return;
+ }
+
+ targeti = ret.second->get_projected_linkage()->get_inode();
+ }
+
+ ceph_assert(destdn->get_projected_linkage()->is_null());
+ if (req->get_alternate_name().size() > alternate_name_max) {
+ dout(10) << " alternate_name longer than " << alternate_name_max << dendl;
+ respond_to_request(mdr, -CEPHFS_ENAMETOOLONG);
+ return;
+ }
+ destdn->set_alternate_name(req->get_alternate_name());
+
+ if (targeti->is_dir()) { // directories cannot be hard-linked here
+ dout(7) << "target is a dir, failing..." << dendl;
+ respond_to_request(mdr, -CEPHFS_EINVAL);
+ return;
+ }
+
+ CDir *dir = destdn->get_dir();
+ dout(7) << "handle_client_link link " << destdn->get_name() << " in " << *dir << dendl;
+ dout(7) << "target is " << *targeti << dendl;
+
+ if (!(mdr->locking_state & MutationImpl::ALL_LOCKED)) { // flag is set below once the xlocks are held
+ MutationImpl::LockOpVec lov;
+ lov.add_xlock(&targeti->snaplock);
+ lov.add_xlock(&targeti->linklock);
+
+ if (!mds->locker->acquire_locks(mdr, lov))
+ return;
+
+ mdr->locking_state |= MutationImpl::ALL_LOCKED;
+ }
+
+ if (targeti->get_projected_inode()->nlink == 0) { // target has no remaining links in its projected state
+ dout(7) << "target has no link, failing..." << dendl;
+ respond_to_request(mdr, -CEPHFS_ENOENT);
+ return;
+ }
+
+ if ((!mdr->has_more() || mdr->more()->witnessed.empty())) { // checks are skipped once any peer has witnessed the request
+ if (!check_access(mdr, targeti, MAY_WRITE))
+ return;
+
+ if (!check_access(mdr, dir->get_inode(), MAY_WRITE))
+ return;
+
+ if (!check_fragment_space(mdr, dir))
+ return;
+
+ if (!check_dir_max_entries(mdr, dir))
+ return;
+ }
+
+ CInode* target_pin = targeti->get_projected_parent_dir()->inode; // inode of the directory currently holding the target
+ SnapRealm *target_realm = target_pin->find_snaprealm();
+ if (target_pin != dir->inode &&
+ target_realm->get_subvolume_ino() !=
+ dir->inode->find_snaprealm()->get_subvolume_ino() &&
+ /* The inode is temporarily located in the stray dir pending reintegration */
+ !target_pin->is_stray()) { // different parent dir, different subvolume ino, and not a stray: refuse with EXDEV
+ dout(7) << "target is in different subvolume, failing..." << dendl;
+ respond_to_request(mdr, -CEPHFS_EXDEV);
+ return;
+ }
+
+ // go!
+ ceph_assert(g_conf()->mds_kill_link_at != 1); // NOTE(review): mds_kill_link_at presumably is a fault-injection knob for tests -- confirm
+
+ // local or remote?
+ if (targeti->is_auth())
+ _link_local(mdr, destdn, targeti, target_realm);
+ else
+ _link_remote(mdr, true, destdn, targeti); // inc=true: this is a link, not an unlink
+ mds->balancer->maybe_fragment(dir, false);
+}
+
+
+/*
+ * Log-completion context for _link_local(): once the journal entry has been
+ * written (r must be 0) it forwards the captured arguments to
+ * Server::_link_local_finish().
+ */
+class C_MDS_link_local_finish : public ServerLogContext {
+public:
+  C_MDS_link_local_finish(Server *s, MDRequestRef& r, CDentry *d, CInode *ti,
+                          version_t dnpv_, version_t tipv_, bool ar)
+    : ServerLogContext(s, r),
+      new_dn(d),
+      target(ti),
+      dn_pv(dnpv_),
+      target_pv(tipv_),
+      realm_adjusted(ar)
+  {}
+
+  void finish(int r) override {
+    ceph_assert(r == 0);
+    server->_link_local_finish(mdr, new_dn, target, dn_pv, target_pv, realm_adjusted);
+  }
+
+private:
+  CDentry *new_dn;        // the dentry being linked
+  CInode *target;         // inode the dentry points at
+  version_t dn_pv;        // pre-dirtied version of the dentry
+  version_t target_pv;    // pre-dirtied version of the target inode
+  bool realm_adjusted;    // whether _link_local() projected a snaprealm change
+};
+
+
+/*
+ * Journal a hard link whose target inode is auth on this MDS.
+ *
+ * Projects nlink+1 (plus ctime/rctime/change_attr/version) on the target,
+ * optionally projects a snaprealm change, writes an EUpdate("link_local")
+ * and replies through C_MDS_link_local_finish.
+ */
+void Server::_link_local(MDRequestRef& mdr, CDentry *dn, CInode *targeti, SnapRealm *target_realm)
+{
+  dout(10) << "_link_local " << *dn << " to " << *targeti << dendl;
+
+  mdr->ls = mdlog->get_current_segment();
+
+  // Pre-dirty the new dentry first, then the target inode; the returned
+  // versions are handed to the finisher.
+  const version_t dn_pv = dn->pre_dirty();
+  const version_t target_pv = targeti->pre_dirty();
+
+  // Projected inode update: one more link, refreshed timestamps.
+  auto projected = targeti->project_inode(mdr);
+  ++projected.inode->nlink;
+  projected.inode->ctime = mdr->get_op_stamp();
+  if (mdr->get_op_stamp() > projected.inode->rstat.rctime) {
+    projected.inode->rstat.rctime = mdr->get_op_stamp();
+  }
+  ++projected.inode->change_attr;
+  projected.inode->version = target_pv;
+
+  // When the target's realm has no subvolume ino and the target's projected
+  // snaprealm is not yet global, project one that is and record the current
+  // parent dentry in it.
+  const bool adjust_realm =
+    !target_realm->get_subvolume_ino() && !targeti->is_projected_snaprealm_global();
+  if (adjust_realm) {
+    sr_t *global_srnode = targeti->project_snaprealm();
+    targeti->mark_snaprealm_global(global_srnode);
+    targeti->record_snaprealm_parent_dentry(global_srnode, target_realm,
+                                            targeti->get_projected_parent_dn(), true);
+  }
+
+  // Build the journal entry.
+  EUpdate *ev = new EUpdate(mdlog, "link_local");
+  mdlog->start_entry(ev);
+  ev->metablob.add_client_req(mdr->reqid, mdr->client_request->get_oldest_client_tid());
+  // parents of the new dentry
+  mdcache->predirty_journal_parents(mdr, &ev->metablob, targeti, dn->get_dir(), PREDIRTY_DIR, 1);
+  // parents of the target inode itself
+  mdcache->predirty_journal_parents(mdr, &ev->metablob, targeti, 0, PREDIRTY_PRIMARY);
+  // the new remote dentry
+  ev->metablob.add_remote_dentry(dn, true, targeti->ino(), targeti->d_type());
+  mdcache->journal_dirty_inode(mdr.get(), &ev->metablob, targeti);
+
+  // Pushed only after the predirty_* calls, to avoid a funky extra dnl arg.
+  dn->push_projected_linkage(targeti->ino(), targeti->d_type());
+
+  journal_and_reply(mdr, targeti, dn, ev,
+                    new C_MDS_link_local_finish(this, mdr, dn, targeti,
+                                                dn_pv, target_pv, adjust_realm));
+}
+
+/*
+ * Completion of _link_local(): apply the projected state, notify replicas
+ * and (if the realm was adjusted) snap clients, bump popularity, and send
+ * the success reply.
+ */
+void Server::_link_local_finish(MDRequestRef& mdr, CDentry *dn, CInode *targeti,
+                                version_t dnpv, version_t tipv, bool adjust_realm)
+{
+  dout(10) << "_link_local_finish " << *dn << " to " << *targeti << dendl;
+
+  // Make the projected linkage of the new dentry current; attach the inode
+  // if the popped linkage does not carry one yet, then dirty the dentry.
+  CDentry::linkage_t *linkage = dn->pop_projected_linkage();
+  if (!linkage->get_inode()) {
+    dn->link_remote(linkage, targeti);
+  }
+  dn->mark_dirty(dnpv, mdr->ls);
+
+  // Apply the rest of the mutation (the target inode's projected update).
+  mdr->apply();
+
+  MDRequestRef no_request;
+  mdcache->send_dentry_link(dn, no_request);
+
+  if (adjust_realm) {
+    const int snap_op = CEPH_SNAP_OP_SPLIT;
+    mds->mdcache->send_snap_update(targeti, 0, snap_op);
+    mds->mdcache->do_realm_invalidate_and_update_notify(targeti, snap_op);
+  }
+
+  // Account the write against both the target inode and the dentry's dir.
+  mds->balancer->hit_inode(targeti, META_POP_IWR);
+  mds->balancer->hit_dir(dn->get_dir(), META_POP_IWR);
+
+  respond_to_request(mdr, 0);
+}
+
+
+// link / unlink remote
+
+/*
+ * Log-completion context for _link_remote(): snapshots the dentry's
+ * projected version at construction time and, once journaled (r must be 0),
+ * calls Server::_link_remote_finish().
+ */
+class C_MDS_link_remote_finish : public ServerLogContext {
+public:
+  C_MDS_link_remote_finish(Server *s, MDRequestRef& r, bool i, CDentry *d, CInode *ti)
+    : ServerLogContext(s, r),
+      is_link(i),
+      the_dn(d),
+      target(ti),
+      dn_pv(d->get_projected_version())
+  {}
+
+  void finish(int r) override {
+    ceph_assert(r == 0);
+    server->_link_remote_finish(mdr, is_link, the_dn, target, dn_pv);
+  }
+
+private:
+  bool is_link;      // true for link, false for unlink
+  CDentry *the_dn;   // dentry being linked or unlinked
+  CInode *target;    // inode the dentry refers to
+  version_t dn_pv;   // projected dentry version captured in the constructor
+};
+
+/*
+ * Leader side of linking (inc == true) or unlinking (inc == false) a remote
+ * dentry whose target inode is auth on another MDS.
+ *
+ * First pass: send an OP_LINKPREP / OP_UNLINKPREP peer request to the
+ * target's auth MDS and return; the function is entered again once that
+ * rank is in the witnessed set.  Second pass: journal the dentry change
+ * locally as an EUpdate and reply through C_MDS_link_remote_finish.
+ */
+void Server::_link_remote(MDRequestRef& mdr, bool inc, CDentry *dn, CInode *targeti)
+{
+  dout(10) << "_link_remote "
+           << (inc ? "link ":"unlink ")
+           << *dn << " to " << *targeti << dendl;
+
+  // 1. the target's auth must journal the nlink change (prepare) first
+  const mds_rank_t target_auth = targeti->authority().first;
+  const bool auth_witnessed = mdr->more()->witnessed.count(target_auth) != 0;
+  if (!auth_witnessed) {
+    if (mds->is_cluster_degraded() &&
+        !mds->mdsmap->is_clientreplay_or_active_or_stopping(target_auth)) {
+      dout(10) << " targeti auth mds." << target_auth << " is not active" << dendl;
+      // Register a retry only if we are not already waiting on some peer.
+      if (mdr->more()->waiting_on_peer.empty()) {
+        mds->wait_for_active_peer(target_auth, new C_MDS_RetryRequest(mdcache, mdr));
+      }
+      return;
+    }
+
+    dout(10) << " targeti auth must prepare nlink++/--" << dendl;
+    int prep_op = MMDSPeerRequest::OP_UNLINKPREP;
+    if (inc) {
+      prep_op = MMDSPeerRequest::OP_LINKPREP;
+    }
+    auto prep = make_message<MMDSPeerRequest>(mdr->reqid, mdr->attempt, prep_op);
+    targeti->set_object_info(prep->get_object_info());
+    prep->op_stamp = mdr->get_op_stamp();
+    // Ship the prepared snaprealm node, if any, along with the request.
+    if (auto& pending_srnode = mdr->more()->desti_srnode) {
+      encode(*pending_srnode, prep->desti_snapbl);
+    }
+    mds->send_message_mds(prep, target_auth);
+
+    ceph_assert(mdr->more()->waiting_on_peer.count(target_auth) == 0);
+    mdr->more()->waiting_on_peer.insert(target_auth);
+    return;
+  }
+  dout(10) << " targeti auth has prepared nlink++/--" << dendl;
+
+  ceph_assert(g_conf()->mds_kill_link_at != 2);
+
+  // The snaprealm node was only needed for the peer request; free it.
+  if (auto& pending_srnode = mdr->more()->desti_srnode) {
+    delete pending_srnode;
+    pending_srnode = nullptr;
+  }
+
+  mdr->set_mds_stamp(ceph_clock_now());
+
+  // 2. journal the local dentry change
+  mdr->ls = mdlog->get_current_segment();
+  EUpdate *ev = new EUpdate(mdlog, inc ? "link_remote":"unlink_remote");
+  mdlog->start_entry(ev);
+  ev->metablob.add_client_req(mdr->reqid, mdr->client_request->get_oldest_client_tid());
+  if (!mdr->more()->witnessed.empty()) {
+    dout(20) << " noting uncommitted_peers " << mdr->more()->witnessed << dendl;
+    ev->reqid = mdr->reqid;
+    ev->had_peers = true;
+    mdcache->add_uncommitted_leader(mdr->reqid, mdr->ls, mdr->more()->witnessed);
+  }
+
+  // Both directions pre-dirty the dentry and its parents; only the sign of
+  // the link count delta and the journaled dentry form differ.
+  dn->pre_dirty();
+  mdcache->predirty_journal_parents(mdr, &ev->metablob, targeti, dn->get_dir(),
+                                    PREDIRTY_DIR, inc ? 1 : -1);
+  if (inc) {
+    ev->metablob.add_remote_dentry(dn, true, targeti->ino(), targeti->d_type()); // new remote
+    dn->push_projected_linkage(targeti->ino(), targeti->d_type());
+  } else {
+    mdcache->journal_cow_dentry(mdr.get(), &ev->metablob, dn);
+    ev->metablob.add_null_dentry(dn, true);
+    dn->push_projected_linkage();
+  }
+
+  journal_and_reply(mdr, (inc ? targeti : nullptr), dn, ev,
+                    new C_MDS_link_remote_finish(this, mdr, inc, dn, targeti));
+}
+
+/*
+ * Completion of _link_remote(): make the journaled dentry change live,
+ * notify replicas, bump popularity and reply to the client.
+ *
+ * inc selects link (true) vs. unlink (false); dpv is the dentry version
+ * captured when the log context was created.
+ */
+void Server::_link_remote_finish(MDRequestRef& mdr, bool inc,
+                                 CDentry *dn, CInode *targeti,
+                                 version_t dpv)
+{
+  dout(10) << "_link_remote_finish "
+           << (inc ? "link ":"unlink ")
+           << *dn << " to " << *targeti << dendl;
+
+  ceph_assert(g_conf()->mds_kill_link_at != 3);
+
+  if (!mdr->more()->witnessed.empty()) {
+    mdcache->logged_leader_update(mdr->reqid);
+  }
+
+  if (!inc) {
+    // Unlink: detach the inode from the main dentry, then dirty the old dentry.
+    dn->get_dir()->unlink_inode(dn);
+    dn->pop_projected_linkage();
+    dn->mark_dirty(dn->get_projected_version(), mdr->ls);
+  } else {
+    // Link: the projected linkage becomes current; attach the target inode
+    // when the popped linkage does not have one yet.
+    CDentry::linkage_t *linkage = dn->pop_projected_linkage();
+    if (!linkage->get_inode()) {
+      dn->link_remote(linkage, targeti);
+    }
+    dn->mark_dirty(dpv, mdr->ls);
+  }
+
+  mdr->apply();
+
+  MDRequestRef no_request;
+  if (inc) {
+    mdcache->send_dentry_link(dn, no_request);
+  } else {
+    mdcache->send_dentry_unlink(dn, nullptr, no_request);
+  }
+
+  // Account the write against the target inode and the dentry's directory.
+  mds->balancer->hit_inode(targeti, META_POP_IWR);
+  mds->balancer->hit_dir(dn->get_dir(), META_POP_IWR);
+
+  respond_to_request(mdr, 0);
+
+  // After an unlink the dentry may have been a new one; let the dir try to
+  // drop it.
+  if (!inc) {
+    dn->get_dir()->try_remove_unlinked_dn(dn);
+  }
+}
+
+
+// remote linking/unlinking
+
+/*
+ * Log-completion context for handle_peer_link_prep(): when the prepare
+ * entry is journaled (r must be 0) it calls Server::_logged_peer_link().
+ */
+class C_MDS_PeerLinkPrep : public ServerLogContext {
+public:
+  C_MDS_PeerLinkPrep(Server *s, MDRequestRef& r, CInode *t, bool ar)
+    : ServerLogContext(s, r),
+      target(t),
+      realm_adjusted(ar)
+  {}
+
+  void finish(int r) override {
+    ceph_assert(r == 0);
+    server->_logged_peer_link(mdr, target, realm_adjusted);
+  }
+
+private:
+  CInode *target;        // inode whose link count is being changed
+  bool realm_adjusted;   // forwarded as adjust_realm
+};
+
+/*
+ * Commit waiter installed by handle_peer_link_prep(): passes its result
+ * code r straight through to Server::_commit_peer_link().
+ */
+class C_MDS_PeerLinkCommit : public ServerContext {
+public:
+  C_MDS_PeerLinkCommit(Server *s, MDRequestRef& r, CInode *t)
+    : ServerContext(s),
+      mdr(r),
+      target(t)
+  {}
+
+  void finish(int r) override {
+    server->_commit_peer_link(mdr, r, target);
+  }
+
+private:
+  MDRequestRef mdr;   // the peer request being committed or rolled back
+  CInode *target;     // inode whose link count was changed
+};
+
+void Server::handle_peer_link_prep(MDRequestRef& mdr)
+{ /* Peer side of a remote link/unlink: prepare the nlink change on the
+   * target inode named by the peer request.
+   *
+   * Projects nlink+1 (OP_LINKPREP) or nlink-1 (any other op) on the target,
+   * records the data needed to undo it in a link_rollback blob, journals
+   * an EPeerUpdate OP_PREPARE entry, installs C_MDS_PeerLinkCommit as the
+   * commit waiter, and continues in _logged_peer_link() once the entry is
+   * logged.
+   */
+ dout(10) << "handle_peer_link_prep " << *mdr
+ << " on " << mdr->peer_request->get_object_info()
+ << dendl;
+
+ ceph_assert(g_conf()->mds_kill_link_at != 4); // NOTE(review): mds_kill_link_at presumably is a fault-injection knob for tests -- confirm
+
+ CInode *targeti = mdcache->get_inode(mdr->peer_request->get_object_info().ino);
+ ceph_assert(targeti);
+ dout(10) << "targeti " << *targeti << dendl;
+ CDentry *dn = targeti->get_parent_dn();
+ CDentry::linkage_t *dnl = dn->get_linkage();
+ ceph_assert(dnl->is_primary());
+
+ mdr->set_op_stamp(mdr->peer_request->op_stamp); // use the stamp carried in the peer request
+
+ mdr->auth_pin(targeti);
+
+ //ceph_abort(); // test hack: make sure leader can handle a peer that fails to prepare...
+ ceph_assert(g_conf()->mds_kill_link_at != 5);
+
+ // journal it
+ mdr->ls = mdlog->get_current_segment();
+ EPeerUpdate *le = new EPeerUpdate(mdlog, "peer_link_prep", mdr->reqid, mdr->peer_to_mds,
+ EPeerUpdate::OP_PREPARE, EPeerUpdate::LINK);
+ mdlog->start_entry(le);
+
+ auto pi = dnl->get_inode()->project_inode(mdr);
+
+ // update journaled target inode
+ bool inc; // true for link, false for unlink
+ bool adjust_realm = false; // set only on the link path when a global snaprealm is projected
+ bool realm_projected = false; // set whenever project_snaprealm() was called below
+ if (mdr->peer_request->get_op() == MMDSPeerRequest::OP_LINKPREP) {
+ inc = true;
+ pi.inode->nlink++;
+
+ CDentry *target_pdn = targeti->get_projected_parent_dn();
+ SnapRealm *target_realm = target_pdn->get_dir()->inode->find_snaprealm();
+ if (!target_realm->get_subvolume_ino() && !targeti->is_projected_snaprealm_global()) {
+ sr_t *newsnap = targeti->project_snaprealm();
+ targeti->mark_snaprealm_global(newsnap);
+ targeti->record_snaprealm_parent_dentry(newsnap, target_realm, target_pdn, true);
+ adjust_realm = true;
+ realm_projected = true;
+ }
+ } else {
+ inc = false;
+ pi.inode->nlink--;
+ if (targeti->is_projected_snaprealm_global()) {
+ ceph_assert(mdr->peer_request->desti_snapbl.length()); // the request must carry an encoded srnode in this case
+ auto p = mdr->peer_request->desti_snapbl.cbegin();
+
+ sr_t *newsnap = targeti->project_snaprealm();
+ decode(*newsnap, p);
+
+ if (pi.inode->nlink == 0)
+ ceph_assert(!newsnap->is_parent_global());
+
+ realm_projected = true;
+ } else {
+ ceph_assert(mdr->peer_request->desti_snapbl.length() == 0);
+ }
+ }
+
+ link_rollback rollback; // values captured here are what do_link_rollback() decodes
+ rollback.reqid = mdr->reqid;
+ rollback.ino = targeti->ino();
+ rollback.old_ctime = targeti->get_inode()->ctime; // we hold versionlock xlock; no concurrent projections
+ const auto& pf = targeti->get_parent_dn()->get_dir()->get_projected_fnode();
+ rollback.old_dir_mtime = pf->fragstat.mtime;
+ rollback.old_dir_rctime = pf->rstat.rctime;
+ rollback.was_inc = inc;
+ if (realm_projected) { // snapbl starts with a bool: whether the inode currently has a snaprealm
+ if (targeti->snaprealm) {
+ encode(true, rollback.snapbl);
+ targeti->encode_snap_blob(rollback.snapbl);
+ } else {
+ encode(false, rollback.snapbl);
+ }
+ }
+ encode(rollback, le->rollback);
+ mdr->more()->rollback_bl = le->rollback; // copy kept on the request; _commit_peer_link() passes it to do_link_rollback()
+
+ pi.inode->ctime = mdr->get_op_stamp();
+ pi.inode->version = targeti->pre_dirty();
+
+ dout(10) << " projected inode " << pi.inode->ino << " v " << pi.inode->version << dendl;
+
+ // commit case
+ mdcache->predirty_journal_parents(mdr, &le->commit, dnl->get_inode(), 0, PREDIRTY_SHALLOW|PREDIRTY_PRIMARY);
+ mdcache->journal_dirty_inode(mdr.get(), &le->commit, targeti);
+ mdcache->add_uncommitted_peer(mdr->reqid, mdr->ls, mdr->peer_to_mds);
+
+ // set up commit waiter
+ mdr->more()->peer_commit = new C_MDS_PeerLinkCommit(this, mdr, targeti);
+
+ mdr->more()->peer_update_journaled = true; // read back in _committed_peer()
+ submit_mdlog_entry(le, new C_MDS_PeerLinkPrep(this, mdr, targeti, adjust_realm),
+ mdr, __func__);
+ mdlog->flush();
+}
+
+/*
+ * Runs on the peer once the link-prepare entry has been journaled: applies
+ * the projected update, optionally announces the snaprealm split, and then
+ * either acks the leader with OP_LINKPREPACK or, if the request was
+ * aborted meanwhile, finishes it.
+ */
+void Server::_logged_peer_link(MDRequestRef& mdr, CInode *targeti, bool adjust_realm)
+{
+  dout(10) << "_logged_peer_link " << *mdr
+           << " " << *targeti << dendl;
+
+  ceph_assert(g_conf()->mds_kill_link_at != 6);
+
+  // Make the projected target state live and count the write.
+  mdr->apply();
+  mds->balancer->hit_inode(targeti, META_POP_IWR);
+
+  // The peer request message is no longer needed.
+  mdr->reset_peer_request();
+
+  if (adjust_realm) {
+    const int snap_op = CEPH_SNAP_OP_SPLIT;
+    mds->mdcache->send_snap_update(targeti, 0, snap_op);
+    mds->mdcache->do_realm_invalidate_and_update_notify(targeti, snap_op);
+  }
+
+  if (mdr->aborted) {
+    dout(10) << " abort flag set, finishing" << dendl;
+    mdcache->request_finish(mdr);
+    return;
+  }
+
+  // ack
+  auto ack = make_message<MMDSPeerRequest>(mdr->reqid, mdr->attempt,
+                                           MMDSPeerRequest::OP_LINKPREPACK);
+  mds->send_message_mds(ack, mdr->peer_to_mds);
+}
+
+
+/*
+ * Log-completion context for the peer commit entry: calls
+ * Server::_committed_peer() for the request; the result code is not
+ * examined.
+ */
+struct C_MDS_CommittedPeer : public ServerLogContext {
+  C_MDS_CommittedPeer(Server *s, MDRequestRef& m)
+    : ServerLogContext(s, m)
+  {}
+
+  void finish(int r) override {
+    server->_committed_peer(mdr);
+  }
+};
+
+/*
+ * Second phase of a peer link/unlink.  r == 0 means commit: journal an
+ * EPeerUpdate OP_COMMIT entry.  Any other r means abort: undo the prepared
+ * change using the rollback blob stored on the request.
+ */
+void Server::_commit_peer_link(MDRequestRef& mdr, int r, CInode *targeti)
+{
+  dout(10) << "_commit_peer_link " << *mdr
+           << " r=" << r
+           << " " << *targeti << dendl;
+
+  ceph_assert(g_conf()->mds_kill_link_at != 7);
+
+  if (r != 0) {
+    do_link_rollback(mdr->more()->rollback_bl, mdr->peer_to_mds, mdr);
+    return;
+  }
+
+  // drop our pins, etc.
+  mdr->cleanup();
+
+  // write a commit to the journal
+  EPeerUpdate *commit_ev = new EPeerUpdate(mdlog, "peer_link_commit", mdr->reqid,
+                                           mdr->peer_to_mds,
+                                           EPeerUpdate::OP_COMMIT, EPeerUpdate::LINK);
+  mdlog->start_entry(commit_ev);
+  submit_mdlog_entry(commit_ev, new C_MDS_CommittedPeer(this, mdr), mdr, __func__);
+  mdlog->flush();
+}
+
+/*
+ * Runs once the peer commit entry is journaled: clears the uncommitted-peer
+ * record, tells the leader OP_COMMITTED, and finishes the request.
+ */
+void Server::_committed_peer(MDRequestRef& mdr)
+{
+  dout(10) << "_committed_peer " << *mdr << dendl;
+
+  ceph_assert(g_conf()->mds_kill_link_at != 8);
+
+  // The record is asserted to exist only if the prepare was journaled.
+  const bool must_exist = mdr->more()->peer_update_journaled;
+  mdcache->finish_uncommitted_peer(mdr->reqid, must_exist);
+
+  auto committed = make_message<MMDSPeerRequest>(mdr->reqid, mdr->attempt,
+                                                 MMDSPeerRequest::OP_COMMITTED);
+  mds->send_message_mds(committed, mdr->peer_to_mds);
+
+  mdcache->request_finish(mdr);
+}
+
+/*
+ * Log-completion context for do_link_rollback(): owns the rollback mutation
+ * and the prepared snap-split messages and hands them to
+ * Server::_link_rollback_finish().
+ */
+struct C_MDS_LoggedLinkRollback : public ServerLogContext {
+  MutationRef mut;                           // mutation holding the projected rollback
+  map<client_t,ref_t<MClientSnap>> splits;   // per-client snap messages to send afterwards
+
+  C_MDS_LoggedLinkRollback(Server *s, MutationRef& m, MDRequestRef& r,
+                           map<client_t,ref_t<MClientSnap>>&& _splits)
+    : ServerLogContext(s, r),
+      mut(m),
+      splits(std::move(_splits))
+  {}
+
+  void finish(int r) override {
+    server->_link_rollback_finish(mut, mdr, splits);
+  }
+};
+
+void Server::do_link_rollback(bufferlist &rbl, mds_rank_t leader, MDRequestRef& mdr)
+{ /* Undo a prepared peer link/unlink.
+   *
+   * rbl    - encoded link_rollback record (built in handle_peer_link_prep)
+   * leader - rank of the leader MDS for the request being rolled back
+   * mdr    - the live peer request; may be null, but only while this MDS is
+   *          in resolve (asserted below)
+   *
+   * Restores ctime and nlink on the target inode, conditionally restores the
+   * parent dirfrag's mtime/rctime, restores or drops the snaprealm, journals
+   * an EPeerUpdate OP_ROLLBACK entry and continues in
+   * _link_rollback_finish().
+   */
+ link_rollback rollback;
+ auto p = rbl.cbegin();
+ decode(rollback, p);
+
+ dout(10) << "do_link_rollback on " << rollback.reqid
+ << (rollback.was_inc ? " inc":" dec")
+ << " ino " << rollback.ino
+ << dendl;
+
+ ceph_assert(g_conf()->mds_kill_link_at != 9); // NOTE(review): mds_kill_link_at presumably is a fault-injection knob for tests -- confirm
+
+ mdcache->add_rollback(rollback.reqid, leader); // need to finish this update before resolve finishes
+ ceph_assert(mdr || mds->is_resolve());
+
+ MutationRef mut(new MutationImpl(nullptr, utime_t(), rollback.reqid)); // separate mutation that carries the undo projections
+ mut->ls = mds->mdlog->get_current_segment();
+
+ CInode *in = mdcache->get_inode(rollback.ino);
+ ceph_assert(in);
+ dout(10) << " target is " << *in << dendl;
+ ceph_assert(!in->is_projected()); // live peer request hold versionlock xlock.
+
+ auto pi = in->project_inode(mut);
+ pi.inode->version = in->pre_dirty();
+
+ // parent dir rctime
+ CDir *parent = in->get_projected_parent_dn()->get_dir();
+ auto pf = parent->project_fnode(mut);
+ pf->version = parent->pre_dirty();
+ if (pf->fragstat.mtime == pi.inode->ctime) { // dir mtime still equals the inode's ctime: restore the saved dir times
+ pf->fragstat.mtime = rollback.old_dir_mtime;
+ if (pf->rstat.rctime == pi.inode->ctime)
+ pf->rstat.rctime = rollback.old_dir_rctime;
+ mut->add_updated_lock(&parent->get_inode()->filelock);
+ mut->add_updated_lock(&parent->get_inode()->nestlock);
+ }
+
+ // inode
+ pi.inode->ctime = rollback.old_ctime;
+ if (rollback.was_inc) // reverse the direction of the prepared change
+ pi.inode->nlink--;
+ else
+ pi.inode->nlink++;
+
+ map<client_t,ref_t<MClientSnap>> splits; // filled by prepare_realm_merge(); sent in _link_rollback_finish()
+ if (rollback.snapbl.length() && in->snaprealm) {
+ bool hadrealm; // first field of snapbl: did the inode have a snaprealm before the prepare?
+ auto p = rollback.snapbl.cbegin();
+ decode(hadrealm, p);
+ if (hadrealm) {
+ if (!mds->is_resolve()) { // outside resolve: project the saved srnode
+ sr_t *new_srnode = new sr_t();
+ decode(*new_srnode, p);
+ in->project_snaprealm(new_srnode);
+ } else { // during resolve: decode straight into the existing realm's srnode
+ decode(in->snaprealm->srnode, p);
+ }
+ } else { // no realm before the prepare: project its removal
+ SnapRealm *realm = parent->get_inode()->find_snaprealm();
+ if (!mds->is_resolve())
+ mdcache->prepare_realm_merge(in->snaprealm, realm, splits);
+ in->project_snaprealm(NULL);
+ }
+ }
+
+ // journal it
+ EPeerUpdate *le = new EPeerUpdate(mdlog, "peer_link_rollback", rollback.reqid, leader,
+ EPeerUpdate::OP_ROLLBACK, EPeerUpdate::LINK);
+ mdlog->start_entry(le);
+ le->commit.add_dir_context(parent);
+ le->commit.add_dir(parent, true);
+ le->commit.add_primary_dentry(in->get_projected_parent_dn(), 0, true);
+
+ submit_mdlog_entry(le, new C_MDS_LoggedLinkRollback(this, mut, mdr, std::move(splits)),
+ mdr, __func__);
+ mdlog->flush();
+}
+
+/*
+ * Runs once the rollback entry is journaled: applies the rollback mutation,
+ * sends the prepared snap messages (unless in resolve), finishes the live
+ * request if there is one, and retires the rollback record.
+ */
+void Server::_link_rollback_finish(MutationRef& mut, MDRequestRef& mdr,
+                                   map<client_t,ref_t<MClientSnap>>& splits)
+{
+  dout(10) << "_link_rollback_finish" << dendl;
+
+  ceph_assert(g_conf()->mds_kill_link_at != 10);
+
+  mut->apply();
+
+  const bool resolving = mds->is_resolve();
+  if (!resolving) {
+    mdcache->send_snaps(splits);
+  }
+
+  // mdr may be null (rollback without a live request).
+  if (mdr) {
+    mdcache->request_finish(mdr);
+  }
+
+  mdcache->finish_rollback(mut->reqid, mdr);
+
+  mut->cleanup();
+}
+
+
+/*
+ * Leader side: a peer acknowledged the link/unlink prepare.  Records the
+ * sender as a journaled witness, removes it from the waiting set (which
+ * must then be empty) and re-dispatches the client request.
+ */
+void Server::handle_peer_link_prep_ack(MDRequestRef& mdr, const cref_t<MMDSPeerRequest> &m)
+{
+  dout(10) << "handle_peer_link_prep_ack " << *mdr
+           << " " << *m << dendl;
+  const mds_rank_t peer_rank = mds_rank_t(m->get_source().num());
+
+  ceph_assert(g_conf()->mds_kill_link_at != 11);
+
+  // Remember the sender as a peer of this request.
+  mdr->more()->peers.insert(peer_rank);
+
+  // It must not have witnessed before; record it now.
+  ceph_assert(!mdr->more()->witnessed.count(peer_rank));
+  mdr->more()->witnessed.insert(peer_rank);
+  ceph_assert(!m->is_not_journaled());
+  mdr->more()->has_journaled_peers = true;
+
+  // We must have been waiting on exactly this peer.
+  ceph_assert(mdr->more()->waiting_on_peer.count(peer_rank) != 0);
+  mdr->more()->waiting_on_peer.erase(peer_rank);
+  ceph_assert(mdr->more()->waiting_on_peer.empty());
+
+  // Run the client request again now that the witness is recorded.
+  dispatch_client_request(mdr);
+}
+
+
+
+
+
+// UNLINK
+
+void Server::handle_client_unlink(MDRequestRef& mdr)
+{ /* Handle a client UNLINK or RMDIR request (chosen by the request op).
+   *
+   * Resolves and xlocks the dentry, checks the rmdir/unlink vs.
+   * directory/non-directory combinations, prepares a stray dentry for
+   * primary links, takes the locks, prepares the snaprealm node, collects
+   * witnesses for subtree-root directories, and finally dispatches to
+   * _link_remote() (remote link whose inode is not auth here) or
+   * _unlink_local().
+   */
+ const cref_t<MClientRequest> &req = mdr->client_request;
+ client_t client = mdr->get_client();
+
+ // rmdir or unlink?
+ bool rmdir = (req->get_op() == CEPH_MDS_OP_RMDIR);
+
+ if (rmdir)
+ mdr->disable_lock_cache(); // the lock cache is disabled for rmdir only
+ CDentry *dn = rdlock_path_xlock_dentry(mdr, false, true);
+ if (!dn)
+ return;
+
+ CDentry::linkage_t *dnl = dn->get_linkage(client, mdr);
+ ceph_assert(!dnl->is_null());
+ CInode *in = dnl->get_inode();
+
+ if (rmdir) {
+ dout(7) << "handle_client_rmdir on " << *dn << dendl;
+ } else {
+ dout(7) << "handle_client_unlink on " << *dn << dendl;
+ }
+ dout(7) << "dn links to " << *in << dendl;
+
+ // rmdir vs is_dir
+ if (in->is_dir()) {
+ if (rmdir) {
+ // do empty directory checks
+ if (_dir_is_nonempty_unlocked(mdr, in)) { // early check before the locks below; repeated with _dir_is_nonempty() once they are held
+ respond_to_request(mdr, -CEPHFS_ENOTEMPTY);
+ return;
+ }
+ } else {
+ dout(7) << "handle_client_unlink on dir " << *in << ", returning error" << dendl;
+ respond_to_request(mdr, -CEPHFS_EISDIR);
+ return;
+ }
+ } else {
+ if (rmdir) {
+ // unlink
+ dout(7) << "handle_client_rmdir on non-dir " << *in << ", returning error" << dendl;
+ respond_to_request(mdr, -CEPHFS_ENOTDIR);
+ return;
+ }
+ }
+
+ CInode *diri = dn->get_dir()->get_inode();
+ if ((!mdr->has_more() || mdr->more()->witnessed.empty())) { // access check is skipped once any peer has witnessed the request
+ if (!check_access(mdr, diri, MAY_WRITE))
+ return;
+ }
+
+ // -- create stray dentry? --
+ CDentry *straydn = NULL;
+ if (dnl->is_primary()) { // primary link: the inode will be moved under a stray dentry
+ straydn = prepare_stray_dentry(mdr, dnl->get_inode());
+ if (!straydn)
+ return;
+ dout(10) << " straydn is " << *straydn << dendl;
+ } else if (mdr->straydn) { // not primary: drop any stray dentry already attached to the request
+ mdr->unpin(mdr->straydn);
+ mdr->straydn = NULL;
+ }
+
+ // lock
+ if (!(mdr->locking_state & MutationImpl::ALL_LOCKED)) { // flag is set below once the locks are held
+ MutationImpl::LockOpVec lov;
+
+ lov.add_xlock(&in->linklock);
+ lov.add_xlock(&in->snaplock);
+ if (in->is_dir())
+ lov.add_rdlock(&in->filelock); // to verify it's empty
+
+ if (straydn) {
+ lov.add_wrlock(&straydn->get_dir()->inode->filelock);
+ lov.add_wrlock(&straydn->get_dir()->inode->nestlock);
+ lov.add_xlock(&straydn->lock);
+ }
+
+ if (!mds->locker->acquire_locks(mdr, lov))
+ return;
+
+ mdr->locking_state |= MutationImpl::ALL_LOCKED;
+ }
+
+ if (in->is_dir() &&
+ _dir_is_nonempty(mdr, in)) {
+ respond_to_request(mdr, -CEPHFS_ENOTEMPTY);
+ return;
+ }
+
+ if (straydn)
+ straydn->first = mdcache->get_global_snaprealm()->get_newest_seq() + 1;
+
+ if (!mdr->more()->desti_srnode) { // prepare the srnode only when the request does not already hold one
+ if (in->is_projected_snaprealm_global()) {
+ sr_t *new_srnode = in->prepare_new_srnode(0);
+ in->record_snaprealm_parent_dentry(new_srnode, nullptr, dn, dnl->is_primary());
+ // dropping the last linkage or dropping the last remote linkage,
+ // detach the inode from global snaprealm
+ auto nlink = in->get_projected_inode()->nlink;
+ if (nlink == 1 ||
+ (nlink == 2 && !dnl->is_primary() &&
+ !in->get_projected_parent_dir()->inode->is_stray()))
+ in->clear_snaprealm_global(new_srnode);
+ mdr->more()->desti_srnode = new_srnode;
+ } else if (dnl->is_primary()) {
+ // prepare snaprealm blob for peer request
+ SnapRealm *realm = in->find_snaprealm();
+ snapid_t follows = realm->get_newest_seq();
+ if (in->snaprealm || follows + 1 > in->get_oldest_snap()) {
+ sr_t *new_srnode = in->prepare_new_srnode(follows);
+ in->record_snaprealm_past_parent(new_srnode, straydn->get_dir()->inode->find_snaprealm());
+ mdr->more()->desti_srnode = new_srnode;
+ }
+ }
+ }
+
+ // yay!
+ if (in->is_dir() && in->has_subtree_root_dirfrag()) {
+ // subtree root auths need to be witnesses
+ set<mds_rank_t> witnesses;
+ in->list_replicas(witnesses);
+ dout(10) << " witnesses " << witnesses << ", have " << mdr->more()->witnessed << dendl;
+
+ for (set<mds_rank_t>::iterator p = witnesses.begin();
+ p != witnesses.end();
+ ++p) {
+ if (mdr->more()->witnessed.count(*p)) {
+ dout(10) << " already witnessed by mds." << *p << dendl;
+ } else if (mdr->more()->waiting_on_peer.count(*p)) {
+ dout(10) << " already waiting on witness mds." << *p << dendl;
+ } else {
+ if (!_rmdir_prepare_witness(mdr, *p, mdr->dn[0], straydn))
+ return;
+ }
+ }
+ if (!mdr->more()->waiting_on_peer.empty())
+ return; // we're waiting for a witness.
+ }
+
+ if (!rmdir && dnl->is_primary() && mdr->dn[0].size() == 1) // plain unlink of a primary link with exactly one entry in mdr->dn[0]
+ mds->locker->create_lock_cache(mdr, diri);
+
+ // ok!
+ if (dnl->is_remote() && !dnl->get_inode()->is_auth())
+ _link_remote(mdr, false, dn, dnl->get_inode()); // inc=false: unlink via the inode's auth MDS
+ else
+ _unlink_local(mdr, dn, straydn);
+}
+
+/*
+ * Log-completion context for _unlink_local(): captures the unlinked
+ * dentry's projected version at construction time and, once journaled
+ * (r must be 0), calls Server::_unlink_local_finish().
+ */
+class C_MDS_unlink_local_finish : public ServerLogContext {
+public:
+  C_MDS_unlink_local_finish(Server *s, MDRequestRef& r, CDentry *d, CDentry *sd)
+    : ServerLogContext(s, r),
+      removed_dn(d),
+      stray(sd),
+      removed_pv(d->get_projected_version())
+  {}
+
+  void finish(int r) override {
+    ceph_assert(r == 0);
+    server->_unlink_local_finish(mdr, removed_dn, stray, removed_pv);
+  }
+
+private:
+  CDentry *removed_dn;    // dentry being unlinked
+  CDentry *stray;         // stray dentry passed by the caller (may be null)
+  version_t removed_pv;   // projected version of the deleted dentry
+};
+
+/*
+ * Journal an unlink handled on this MDS.
+ *
+ * With a stray dentry (primary link) the inode is projected into the stray
+ * directory; without one (remote link) only the inode's link count and the
+ * dentry are updated.  Writes an EUpdate("unlink_local") and replies through
+ * C_MDS_unlink_local_finish.
+ */
+void Server::_unlink_local(MDRequestRef& mdr, CDentry *dn, CDentry *straydn)
+{
+  dout(10) << "_unlink_local " << *dn << dendl;
+
+  CDentry::linkage_t *linkage = dn->get_projected_linkage();
+  CInode *victim = linkage->get_inode();
+
+  mdr->ls = mdlog->get_current_segment();
+
+  // Start the log entry.
+  EUpdate *ev = new EUpdate(mdlog, "unlink_local");
+  mdlog->start_entry(ev);
+  ev->metablob.add_client_req(mdr->reqid, mdr->client_request->get_oldest_client_tid());
+  if (!mdr->more()->witnessed.empty()) {
+    dout(20) << " noting uncommitted_peers " << mdr->more()->witnessed << dendl;
+    ev->reqid = mdr->reqid;
+    ev->had_peers = true;
+    mdcache->add_uncommitted_leader(mdr->reqid, mdr->ls, mdr->more()->witnessed);
+  }
+
+  // A stray dentry is only ever supplied for a primary link.
+  if (straydn) {
+    ceph_assert(linkage->is_primary());
+    straydn->push_projected_linkage(victim);
+  }
+
+  // the unlinked dentry
+  dn->pre_dirty();
+
+  // Projected inode: remember where it used to live, refresh the
+  // timestamps, drop one link.
+  auto projected = victim->project_inode(mdr);
+  {
+    std::string prior_path;
+    dn->make_path_string(prior_path, true);
+    projected.inode->stray_prior_path = std::move(prior_path);
+  }
+  projected.inode->version = victim->pre_dirty();
+  projected.inode->ctime = mdr->get_op_stamp();
+  if (mdr->get_op_stamp() > projected.inode->rstat.rctime) {
+    projected.inode->rstat.rctime = mdr->get_op_stamp();
+  }
+  ++projected.inode->change_attr;
+  --projected.inode->nlink;
+  if (projected.inode->nlink == 0) {
+    victim->state_set(CInode::STATE_ORPHAN);
+  }
+
+  // Hand a prepared snaprealm node over to the inode's projection; the
+  // request no longer refers to it afterwards.
+  if (auto& pending_srnode = mdr->more()->desti_srnode) {
+    victim->project_snaprealm(pending_srnode);
+    pending_srnode = nullptr;
+  }
+
+  if (!straydn) {
+    // remote link. update remote inode.
+    mdcache->predirty_journal_parents(mdr, &ev->metablob, victim, dn->get_dir(), PREDIRTY_DIR, -1);
+    mdcache->predirty_journal_parents(mdr, &ev->metablob, victim, 0, PREDIRTY_PRIMARY);
+    mdcache->journal_dirty_inode(mdr.get(), &ev->metablob, victim);
+  } else {
+    // primary link. add stray dentry; the projected inode is popped
+    // manually later.
+    mdcache->predirty_journal_parents(mdr, &ev->metablob, victim, dn->get_dir(),
+                                      PREDIRTY_PRIMARY|PREDIRTY_DIR, -1);
+    mdcache->predirty_journal_parents(mdr, &ev->metablob, victim, straydn->get_dir(),
+                                      PREDIRTY_PRIMARY|PREDIRTY_DIR, 1);
+
+    projected.inode->update_backtrace();
+    ev->metablob.add_primary_dentry(straydn, victim, true, true);
+  }
+
+  mdcache->journal_cow_dentry(mdr.get(), &ev->metablob, dn);
+  ev->metablob.add_null_dentry(dn, true);
+
+  if (victim->is_dir()) {
+    dout(10) << " noting renamed (unlinked) dir ino " << victim->ino() << " in metablob" << dendl;
+    ev->metablob.renamed_dirino = victim->ino();
+  }
+
+  // Project the dentry as null.
+  dn->push_projected_linkage();
+
+  if (straydn) {
+    ceph_assert(victim->first <= straydn->first);
+    victim->first = straydn->first;
+  }
+
+  // Removing a directory moves its subtree under the stray directory.
+  if (victim->is_dir()) {
+    ceph_assert(straydn);
+    mdcache->project_subtree_rename(victim, dn->get_dir(), straydn->get_dir());
+  }
+
+  journal_and_reply(mdr, nullptr, dn, ev,
+                    new C_MDS_unlink_local_finish(this, mdr, dn, straydn));
+}
+
+// Completion of _unlink_local(): applies the projected unlink to the cache,
+// notifies replicas, replies to the client and lets the cache know about the
+// new stray.
+//
+//  mdr      the client request
+//  dn       dentry that was unlinked
+//  straydn  stray dentry now holding the inode, or null for a remote link
+//  dnpv     projected version of `dn` captured when the journal entry was
+//           prepared; used to mark `dn` dirty
+void Server::_unlink_local_finish(MDRequestRef& mdr,
+ CDentry *dn, CDentry *straydn,
+ version_t dnpv)
+{
+ dout(10) << "_unlink_local_finish " << *dn << dendl;
+
+ if (!mdr->more()->witnessed.empty())
+ mdcache->logged_leader_update(mdr->reqid);
+
+ CInode *strayin = NULL;
+ bool hadrealm = false;
+ if (straydn) {
+ // if there is newly created snaprealm, need to split old snaprealm's
+ // inodes_with_caps. So pop snaprealm before linkage changes.
+ strayin = dn->get_linkage()->get_inode();
+ hadrealm = strayin->snaprealm ? true : false;
+ strayin->early_pop_projected_snaprealm();
+ }
+
+ // unlink main dentry
+ dn->get_dir()->unlink_inode(dn);
+ dn->pop_projected_linkage();
+ dn->mark_dirty(dnpv, mdr->ls);
+
+ // relink as stray? (i.e. was primary link?)
+ if (straydn) {
+ dout(20) << " straydn is " << *straydn << dendl;
+ straydn->pop_projected_linkage();
+ mdcache->touch_dentry_bottom(straydn);
+ }
+
+ mdr->apply();
+
+ mdcache->send_dentry_unlink(dn, straydn, mdr);
+
+ if (straydn) {
+ // update subtree map?
+ if (strayin->is_dir())
+ mdcache->adjust_subtree_after_rename(strayin, dn->get_dir(), true);
+
+ // a snaprealm that did not exist before the pop was created by this
+ // unlink; send the split notification for it
+ if (strayin->snaprealm && !hadrealm)
+ mdcache->do_realm_invalidate_and_update_notify(strayin, CEPH_SNAP_OP_SPLIT, false);
+ }
+
+ // bump pop
+ mds->balancer->hit_dir(dn->get_dir(), META_POP_IWR);
+
+ // reply
+ respond_to_request(mdr, 0);
+
+ // removing a new dn?
+ dn->get_dir()->try_remove_unlinked_dn(dn);
+
+ // clean up ?
+ // respond_to_request() drops locks. So stray reintegration can race with us.
+ if (straydn && !straydn->get_projected_linkage()->is_null()) {
+ // Tip off the MDCache that this dentry is a stray that
+ // might be eligible for purge.
+ mdcache->notify_stray(straydn);
+ }
+}
+
+// Ask peer mds `who` to witness the rmdir of the dentry at the end of `trace`.
+//
+// Returns false without sending anything when the cluster is degraded and the
+// peer is not in a clientreplay/active/stopping state.  Returns true once the
+// OP_RMDIRPREP request has been sent and `who` was added to waiting_on_peer.
+bool Server::_rmdir_prepare_witness(MDRequestRef& mdr, mds_rank_t who, vector<CDentry*>& trace, CDentry *straydn)
+{
+  const bool peer_unavailable =
+    mds->is_cluster_degraded() &&
+    !mds->mdsmap->is_clientreplay_or_active_or_stopping(who);
+  if (peer_unavailable) {
+    dout(10) << "_rmdir_prepare_witness mds." << who << " is not active" << dendl;
+    // a retry waiter is registered only when no peer reply is outstanding
+    if (mdr->more()->waiting_on_peer.empty()) {
+      mds->wait_for_active_peer(who, new C_MDS_RetryRequest(mdcache, mdr));
+    }
+    return false;
+  }
+
+  dout(10) << "_rmdir_prepare_witness mds." << who << dendl;
+  auto req = make_message<MMDSPeerRequest>(mdr->reqid, mdr->attempt,
+                                           MMDSPeerRequest::OP_RMDIRPREP);
+
+  // source path: ino of the first dentry's directory, then every dentry name
+  req->srcdnpath = filepath(trace.front()->get_dir()->ino());
+  for (CDentry *elem : trace) {
+    req->srcdnpath.push_dentry(elem->get_name());
+  }
+
+  mdcache->encode_replica_stray(straydn, who, req->straybl);
+  if (mdr->more()->desti_srnode) {
+    encode(*mdr->more()->desti_srnode, req->desti_snapbl);
+  }
+  req->op_stamp = mdr->get_op_stamp();
+
+  mds->send_message_mds(req, who);
+
+  // the peer must not already be pending; mark it as such now
+  auto& pending = mdr->more()->waiting_on_peer;
+  ceph_assert(pending.count(who) == 0);
+  pending.insert(who);
+  return true;
+}
+
+// Log-completion callback for a peer rmdir prepare; forwards to
+// Server::_logged_peer_rmdir().
+struct C_MDS_PeerRmdirPrep : public ServerLogContext {
+  CDentry *dn;
+  CDentry *straydn;
+
+  C_MDS_PeerRmdirPrep(Server *srv, MDRequestRef& req, CDentry *target, CDentry *stray)
+    : ServerLogContext(srv, req),
+      dn(target),
+      straydn(stray)
+  {}
+
+  void finish(int r) override {
+    // the result code is not inspected here
+    server->_logged_peer_rmdir(mdr, dn, straydn);
+  }
+};
+
+// Commit waiter installed by the peer rmdir prepare.  The result code it is
+// completed with is passed through to Server::_commit_peer_rmdir(), which
+// commits on 0 and rolls back otherwise.
+struct C_MDS_PeerRmdirCommit : public ServerContext {
+  MDRequestRef mdr;
+  CDentry *straydn;
+
+  C_MDS_PeerRmdirCommit(Server *srv, MDRequestRef& req, CDentry *stray)
+    : ServerContext(srv),
+      mdr(req),
+      straydn(stray)
+  {}
+
+  void finish(int r) override {
+    server->_commit_peer_rmdir(mdr, r, straydn);
+  }
+};
+
+// Peer-side handler for an rmdir prepare request.
+//
+// Resolves the source dentry from the request's srcdnpath, records an
+// rmdir_rollback blob, installs the commit waiter and projects the linkage
+// change (inode moves from `dn` to `straydn`).  If this MDS holds no auth
+// subtree root dirfrag under the inode, journaling is skipped and
+// _logged_peer_rmdir() is called directly; otherwise an EPeerUpdate
+// OP_PREPARE entry is submitted and _logged_peer_rmdir() runs from
+// C_MDS_PeerRmdirPrep.
+void Server::handle_peer_rmdir_prep(MDRequestRef& mdr)
+{
+ dout(10) << "handle_peer_rmdir_prep " << *mdr
+ << " " << mdr->peer_request->srcdnpath
+ << " to " << mdr->peer_request->destdnpath
+ << dendl;
+
+ vector<CDentry*> trace;
+ filepath srcpath(mdr->peer_request->srcdnpath);
+ dout(10) << " src " << srcpath << dendl;
+ CInode *in;
+ CF_MDS_RetryRequestFactory cf(mdcache, mdr, false);
+ int r = mdcache->path_traverse(mdr, cf, srcpath,
+ MDS_TRAVERSE_DISCOVER | MDS_TRAVERSE_PATH_LOCKED,
+ &trace, &in);
+ // r > 0: traverse did not complete; presumably the retry factory `cf`
+ // re-drives this request later -- TODO confirm against path_traverse()
+ if (r > 0) return;
+ if (r == -CEPHFS_ESTALE) {
+ mdcache->find_ino_peers(srcpath.get_ino(), new C_MDS_RetryRequest(mdcache, mdr),
+ mdr->peer_to_mds, true);
+ return;
+ }
+ ceph_assert(r == 0);
+ CDentry *dn = trace.back();
+ dout(10) << " dn " << *dn << dendl;
+ mdr->pin(dn);
+
+ // the stray dentry must already be attached to the request
+ ceph_assert(mdr->straydn);
+ CDentry *straydn = mdr->straydn;
+ dout(10) << " straydn " << *straydn << dendl;
+
+ mdr->set_op_stamp(mdr->peer_request->op_stamp);
+
+ // build the rollback record: source and stray dentry locations, plus the
+ // inode's snaprealm state when the leader sent a snap blob
+ rmdir_rollback rollback;
+ rollback.reqid = mdr->reqid;
+ rollback.src_dir = dn->get_dir()->dirfrag();
+ rollback.src_dname = dn->get_name();
+ rollback.dest_dir = straydn->get_dir()->dirfrag();
+ rollback.dest_dname = straydn->get_name();
+ if (mdr->peer_request->desti_snapbl.length()) {
+ if (in->snaprealm) {
+ encode(true, rollback.snapbl);
+ in->encode_snap_blob(rollback.snapbl);
+ } else {
+ encode(false, rollback.snapbl);
+ }
+ }
+ encode(rollback, mdr->more()->rollback_bl);
+ // FIXME: rollback snaprealm
+ dout(20) << " rollback is " << mdr->more()->rollback_bl.length() << " bytes" << dendl;
+
+ // set up commit waiter
+ mdr->more()->peer_commit = new C_MDS_PeerRmdirCommit(this, mdr, straydn);
+
+ // project: inode linked under straydn, dn becomes null
+ straydn->push_projected_linkage(in);
+ dn->push_projected_linkage();
+
+ ceph_assert(straydn->first >= in->first);
+ in->first = straydn->first;
+
+ if (!in->has_subtree_root_dirfrag(mds->get_nodeid())) {
+ dout(10) << " no auth subtree in " << *in << ", skipping journal" << dendl;
+ _logged_peer_rmdir(mdr, dn, straydn);
+ return;
+ }
+
+ mdr->ls = mdlog->get_current_segment();
+ EPeerUpdate *le = new EPeerUpdate(mdlog, "peer_rmdir", mdr->reqid, mdr->peer_to_mds,
+ EPeerUpdate::OP_PREPARE, EPeerUpdate::RMDIR);
+ mdlog->start_entry(le);
+ le->rollback = mdr->more()->rollback_bl;
+
+ le->commit.add_dir_context(straydn->get_dir());
+ le->commit.add_primary_dentry(straydn, in, true);
+ // peer: no need to journal original dentry
+
+ dout(10) << " noting renamed (unlinked) dir ino " << in->ino() << " in metablob" << dendl;
+ le->commit.renamed_dirino = in->ino();
+
+ mdcache->project_subtree_rename(in, dn->get_dir(), straydn->get_dir());
+ mdcache->add_uncommitted_peer(mdr->reqid, mdr->ls, mdr->peer_to_mds);
+
+ // remembered so later stages know a journal entry exists for this update
+ mdr->more()->peer_update_journaled = true;
+ submit_mdlog_entry(le, new C_MDS_PeerRmdirPrep(this, mdr, dn, straydn),
+ mdr, __func__);
+ mdlog->flush();
+}
+
+// Peer side of rmdir, run after the prepare was journaled (or directly when
+// no journaling was needed): applies the projected linkage change to the
+// cache, then either acks the leader or, if the request was aborted, finishes
+// it.
+void Server::_logged_peer_rmdir(MDRequestRef& mdr, CDentry *dn, CDentry *straydn)
+{
+  dout(10) << "_logged_peer_rmdir " << *mdr << " on " << *dn << dendl;
+  CInode *in = dn->get_linkage()->get_inode();
+
+  // if the leader shipped a snap blob, decode it into the inode and note
+  // whether that created a snaprealm that was not there before
+  bool new_realm = false;
+  if (mdr->peer_request->desti_snapbl.length()) {
+    new_realm = !in->snaprealm;
+    in->decode_snap_blob(mdr->peer_request->desti_snapbl);
+    ceph_assert(in->snaprealm);
+  }
+
+  // update our cache now, so we are consistent with what is in the journal
+  // when we journal a subtree map
+  dn->get_dir()->unlink_inode(dn);
+  straydn->pop_projected_linkage();
+  dn->pop_projected_linkage();
+
+  const bool journaled = mdr->more()->peer_update_journaled;
+  mdcache->adjust_subtree_after_rename(in, dn->get_dir(), journaled);
+
+  if (new_realm) {
+    mdcache->do_realm_invalidate_and_update_notify(in, CEPH_SNAP_OP_SPLIT, false);
+  }
+
+  // done.
+  mdr->reset_peer_request();
+  mdr->straydn = nullptr;
+
+  if (mdr->aborted) {
+    dout(10) << " abort flag set, finishing" << dendl;
+    mdcache->request_finish(mdr);
+    return;
+  }
+
+  // tell the leader we are prepared, flagging the case where nothing was
+  // journaled on this side
+  auto reply = make_message<MMDSPeerRequest>(mdr->reqid, mdr->attempt,
+                                             MMDSPeerRequest::OP_RMDIRPREPACK);
+  if (!journaled) {
+    reply->mark_not_journaled();
+  }
+  mds->send_message_mds(reply, mdr->peer_to_mds);
+}
+
+// Leader side: a peer acknowledged our rmdir prepare.  Record it as a peer and
+// witness, and re-dispatch the client request once no acks are outstanding.
+void Server::handle_peer_rmdir_prep_ack(MDRequestRef& mdr, const cref_t<MMDSPeerRequest> &ack)
+{
+  dout(10) << "handle_peer_rmdir_prep_ack " << *mdr
+           << " " << *ack << dendl;
+
+  const mds_rank_t peer = mds_rank_t(ack->get_source().num());
+  auto more = mdr->more();
+
+  more->peers.insert(peer);
+  more->witnessed.insert(peer);
+  if (!ack->is_not_journaled()) {
+    more->has_journaled_peers = true;
+  }
+
+  // remove from waiting list
+  ceph_assert(more->waiting_on_peer.count(peer));
+  more->waiting_on_peer.erase(peer);
+
+  if (!more->waiting_on_peer.empty()) {
+    dout(10) << "still waiting on peers " << more->waiting_on_peer << dendl;
+    return;
+  }
+
+  dispatch_client_request(mdr);  // go again!
+}
+
+// Peer side: resolve a prepared rmdir.  r == 0 commits it (journaling an
+// OP_COMMIT entry when the prepare was journaled); any other value rolls the
+// prepare back via do_rmdir_rollback().
+void Server::_commit_peer_rmdir(MDRequestRef& mdr, int r, CDentry *straydn)
+{
+  dout(10) << "_commit_peer_rmdir " << *mdr << " r=" << r << dendl;
+
+  if (r != 0) {
+    // abort
+    do_rmdir_rollback(mdr->more()->rollback_bl, mdr->peer_to_mds, mdr);
+    return;
+  }
+
+  if (mdr->more()->peer_update_journaled) {
+    CInode *strayin = straydn->get_projected_linkage()->get_inode();
+    const bool clear_dirty = strayin && !strayin->snaprealm;
+    if (clear_dirty) {
+      mdcache->clear_dirty_bits_for_stray(strayin);
+    }
+  }
+
+  mdr->cleanup();
+
+  if (!mdr->more()->peer_update_journaled) {
+    // nothing was journaled for the prepare, so no commit entry is needed
+    _committed_peer(mdr);
+    return;
+  }
+
+  // write a commit to the journal
+  EPeerUpdate *le = new EPeerUpdate(mdlog, "peer_rmdir_commit", mdr->reqid,
+                                    mdr->peer_to_mds, EPeerUpdate::OP_COMMIT,
+                                    EPeerUpdate::RMDIR);
+  mdlog->start_entry(le);
+  submit_mdlog_entry(le, new C_MDS_CommittedPeer(this, mdr), mdr, __func__);
+  mdlog->flush();
+}
+
+// Log-completion callback for an rmdir rollback entry; forwards to
+// Server::_rmdir_rollback_finish().
+struct C_MDS_LoggedRmdirRollback : public ServerLogContext {
+  metareqid_t reqid;
+  CDentry *dn;
+  CDentry *straydn;
+
+  C_MDS_LoggedRmdirRollback(Server *srv, MDRequestRef& req, metareqid_t id,
+                            CDentry *target, CDentry *stray)
+    : ServerLogContext(srv, req),
+      reqid(id),
+      dn(target),
+      straydn(stray)
+  {}
+
+  void finish(int r) override {
+    // the result code is not inspected here
+    server->_rmdir_rollback_finish(mdr, reqid, dn, straydn);
+  }
+};
+
+// Undo a prepared peer rmdir described by the encoded rmdir_rollback in `rbl`.
+//
+//  rbl     encoded rmdir_rollback (as produced by handle_peer_rmdir_prep)
+//  leader  rank of the MDS that led the original request
+//  mdr     the peer request; may be null, in which case this MDS must be in
+//          the resolve state (asserted below)
+//
+// Projects the inode back from the stray dentry to the original dentry.  If
+// the prepare was not journaled the rollback finishes immediately; otherwise
+// an EPeerUpdate OP_ROLLBACK entry is journaled and the rollback finishes in
+// _rmdir_rollback_finish() via C_MDS_LoggedRmdirRollback.
+void Server::do_rmdir_rollback(bufferlist &rbl, mds_rank_t leader, MDRequestRef& mdr)
+{
+ // unlike the other rollback methods, the rmdir rollback is only
+ // needed to record the subtree changes in the journal for inode
+ // replicas who are auth for empty dirfrags. no actual changes to
+ // the file system are taking place here, so there is no Mutation.
+
+ rmdir_rollback rollback;
+ auto p = rbl.cbegin();
+ decode(rollback, p);
+
+ dout(10) << "do_rmdir_rollback on " << rollback.reqid << dendl;
+ mdcache->add_rollback(rollback.reqid, leader); // need to finish this update before resolve finishes
+ ceph_assert(mdr || mds->is_resolve());
+
+ // locate the original dentry; fall back to a lookup by (ino, name) when
+ // the recorded dirfrag is not in cache
+ CDir *dir = mdcache->get_dirfrag(rollback.src_dir);
+ if (!dir)
+ dir = mdcache->get_dirfrag(rollback.src_dir.ino, rollback.src_dname);
+ ceph_assert(dir);
+ CDentry *dn = dir->lookup(rollback.src_dname);
+ ceph_assert(dn);
+ dout(10) << " dn " << *dn << dendl;
+ CDir *straydir = mdcache->get_dirfrag(rollback.dest_dir);
+ ceph_assert(straydir);
+ CDentry *straydn = straydir->lookup(rollback.dest_dname);
+ ceph_assert(straydn);
+ dout(10) << " straydn " << *straydn << dendl;
+ CInode *in = straydn->get_linkage()->get_inode();
+
+ // project the reverse move: inode back under dn, straydn becomes null
+ dn->push_projected_linkage(in);
+ straydn->push_projected_linkage();
+
+ // restore the snaprealm recorded at prepare time: either the saved srnode,
+ // or merge a realm that did not exist before into the parent's realm
+ if (rollback.snapbl.length() && in->snaprealm) {
+ bool hadrealm;
+ // NOTE: this `p` shadows the outer iterator over `rbl`
+ auto p = rollback.snapbl.cbegin();
+ decode(hadrealm, p);
+ if (hadrealm) {
+ decode(in->snaprealm->srnode, p);
+ } else {
+ in->snaprealm->merge_to(dir->get_inode()->find_snaprealm());
+ }
+ }
+
+ // nothing was journaled for the prepare, so nothing to journal here either
+ if (mdr && !mdr->more()->peer_update_journaled) {
+ ceph_assert(!in->has_subtree_root_dirfrag(mds->get_nodeid()));
+
+ _rmdir_rollback_finish(mdr, rollback.reqid, dn, straydn);
+ return;
+ }
+
+
+ EPeerUpdate *le = new EPeerUpdate(mdlog, "peer_rmdir_rollback", rollback.reqid, leader,
+ EPeerUpdate::OP_ROLLBACK, EPeerUpdate::RMDIR);
+ mdlog->start_entry(le);
+
+ le->commit.add_dir_context(dn->get_dir());
+ le->commit.add_primary_dentry(dn, in, true);
+ // peer: no need to journal straydn
+
+ dout(10) << " noting renamed (unlinked) dir ino " << in->ino() << " in metablob" << dendl;
+ le->commit.renamed_dirino = in->ino();
+
+ mdcache->project_subtree_rename(in, straydn->get_dir(), dn->get_dir());
+
+ submit_mdlog_entry(le,
+ new C_MDS_LoggedRmdirRollback(this, mdr,rollback.reqid,
+ dn, straydn),
+ mdr, __func__);
+ mdlog->flush();
+}
+
+// Final step of an rmdir rollback: apply the projected linkage (inode back
+// under `dn`, `straydn` null), fix up subtrees, and complete the rollback.
+// `mdr` may be null (see the assert in do_rmdir_rollback()).
+void Server::_rmdir_rollback_finish(MDRequestRef& mdr, metareqid_t reqid, CDentry *dn, CDentry *straydn)
+{
+  dout(10) << "_rmdir_rollback_finish " << reqid << dendl;
+
+  CDir *straydir = straydn->get_dir();
+  straydir->unlink_inode(straydn);
+  dn->pop_projected_linkage();
+  straydn->pop_projected_linkage();
+
+  CInode *in = dn->get_linkage()->get_inode();
+  // without a request we treat the update as journaled
+  const bool journaled = !mdr || mdr->more()->peer_update_journaled;
+  mdcache->adjust_subtree_after_rename(in, straydir, journaled);
+
+  if (mds->is_resolve()) {
+    CDir *root = mdcache->get_subtree_root(straydir);
+    mdcache->try_trim_non_auth_subtree(root);
+  }
+
+  if (mdr) {
+    mdcache->request_finish(mdr);
+  }
+
+  mdcache->finish_rollback(reqid, mdr);
+}
+
+
+/** _dir_is_nonempty[_unlocked]
+ *
+ * check if a directory is non-empty (i.e. we cannot rmdir it).
+ *
+ * the unlocked variant is a fastpath check. we can't really be
+ * sure until we rdlock the filelock.
+ */
+// Quick emptiness probe for directory inode `in`.  Returns true only when the
+// directory is evidently non-empty: it has snapshots in its own snaprealm, or
+// an auth dirfrag whose projected fragstat counts entries.  A false result is
+// not conclusive.
+bool Server::_dir_is_nonempty_unlocked(MDRequestRef& mdr, CInode *in)
+{
+  dout(10) << "dir_is_nonempty_unlocked " << *in << dendl;
+  ceph_assert(in->is_auth());
+
+  if (in->filelock.is_cached()) {
+    // there can be pending async create/unlink. don't know.
+    return false;
+  }
+
+  if (in->snaprealm && in->snaprealm->srnode.snaps.size()) {
+    // in a snapshot!
+    return true;
+  }
+
+  auto&& frags = in->get_dirfrags();
+  for (const auto& frag : frags) {
+    // only frags we are auth for are consulted
+    if (!frag->is_auth()) {
+      continue;
+    }
+    // is the frag obviously non-empty?
+    if (frag->get_projected_fnode()->fragstat.size()) {
+      dout(10) << "dir_is_nonempty_unlocked dirstat has "
+               << frag->get_projected_fnode()->fragstat.size() << " items " << *frag << dendl;
+      return true;
+    }
+  }
+
+  return false;
+}
+
+// Emptiness check for directory inode `in`; asserts that the requesting
+// client can read the filelock.  Returns true if any dirfrag's projected
+// fragstat counts entries, or if the per-frag stats summed here do not add up
+// to the size recorded in the inode's projected dirstat.
+bool Server::_dir_is_nonempty(MDRequestRef& mdr, CInode *in)
+{
+  dout(10) << "dir_is_nonempty " << *in << dendl;
+  ceph_assert(in->is_auth());
+  ceph_assert(in->filelock.can_read(mdr->get_client()));
+
+  frag_info_t summed;
+  const version_t inode_dirstat_version = in->get_projected_inode()->dirstat.version;
+
+  auto&& frags = in->get_dirfrags();
+  for (const auto& frag : frags) {
+    const auto& pf = frag->get_projected_fnode();
+
+    if (pf->fragstat.size()) {
+      dout(10) << "dir_is_nonempty dirstat has "
+               << pf->fragstat.size() << " items " << *frag << dendl;
+      return true;
+    }
+
+    // use the accounted stats when they match the inode's dirstat version,
+    // the frag's own stats otherwise
+    const bool accounted_is_current =
+      (pf->accounted_fragstat.version == inode_dirstat_version);
+    if (accounted_is_current) {
+      summed.add(pf->accounted_fragstat);
+    } else {
+      summed.add(pf->fragstat);
+    }
+  }
+
+  return summed.size() != in->get_projected_inode()->dirstat.size();
+}
+
+
+// ======================================================
+
+
+// Log-completion callback for a rename on the leader; forwards to
+// Server::_rename_finish() once the journal entry has been logged.
+class C_MDS_rename_finish : public ServerLogContext {
+private:
+  CDentry *srcdn;
+  CDentry *destdn;
+  CDentry *straydn;
+
+public:
+  C_MDS_rename_finish(Server *srv, MDRequestRef& req,
+                      CDentry *source, CDentry *dest, CDentry *stray)
+    : ServerLogContext(srv, req),
+      srcdn(source),
+      destdn(dest),
+      straydn(stray)
+  {}
+
+  void finish(int r) override {
+    // the callback is only expected to run with a zero result
+    ceph_assert(r == 0);
+    server->_rename_finish(mdr, srcdn, destdn, straydn);
+  }
+};
+
+
+/** handle_client_rename
+ *
+ * rename leader is the destdn auth. this is because cached inodes
+ * must remain connected. thus, any replica of srci, must also
+ * replicate destdn, and possibly straydn, so that srci (and
+ * destdn->inode) remain connected during the rename.
+ *
+ * to do this, we freeze srci, then leader (destdn auth) verifies that
+ * all other nodes have also replicated destdn and straydn. note that
+ * destdn replicas need not also replicate srci. this only works when
+ * destdn is leader.
+ *
+ * This function takes responsibility for the passed mdr.
+ *
+ * Many of the early returns below register a retry context or wait for
+ * peer acks, so this function can run several times for the same request;
+ * state that must survive between passes is kept in mdr / mdr->more()
+ * (locking_state, witnessed, waiting_on_peer, srci_srnode, desti_srnode).
+ */
+void Server::handle_client_rename(MDRequestRef& mdr)
+{
+ const auto& req = mdr->client_request;
+ dout(7) << "handle_client_rename " << *req << dendl;
+
+ // note: the primary filepath of the request is the destination, the
+ // second one is the source
+ filepath destpath = req->get_filepath();
+ filepath srcpath = req->get_filepath2();
+ if (srcpath.is_last_dot_or_dotdot() || destpath.is_last_dot_or_dotdot()) {
+ respond_to_request(mdr, -CEPHFS_EBUSY);
+ return;
+ }
+
+ if (req->get_alternate_name().size() > alternate_name_max) {
+ dout(10) << " alternate_name longer than " << alternate_name_max << dendl;
+ respond_to_request(mdr, -CEPHFS_ENAMETOOLONG);
+ return;
+ }
+
+ auto [destdn, srcdn] = rdlock_two_paths_xlock_destdn(mdr, true);
+ if (!destdn)
+ return;
+
+ dout(10) << " destdn " << *destdn << dendl;
+ CDir *destdir = destdn->get_dir();
+ ceph_assert(destdir->is_auth());
+ CDentry::linkage_t *destdnl = destdn->get_projected_linkage();
+
+ dout(10) << " srcdn " << *srcdn << dendl;
+ CDir *srcdir = srcdn->get_dir();
+ CDentry::linkage_t *srcdnl = srcdn->get_projected_linkage();
+ CInode *srci = srcdnl->get_inode();
+ dout(10) << " srci " << *srci << dendl;
+
+ // -- some sanity checks --
+ if (destdn == srcdn) {
+ dout(7) << "rename src=dest, noop" << dendl;
+ respond_to_request(mdr, 0);
+ return;
+ }
+
+ // dest a child of src?
+ // e.g. mv /usr /usr/foo
+ if (srci->is_dir() && srci->is_projected_ancestor_of(destdir->get_inode())) {
+ dout(7) << "cannot rename item to be a child of itself" << dendl;
+ respond_to_request(mdr, -CEPHFS_EINVAL);
+ return;
+ }
+
+ // is this a stray migration, reintegration or merge? (sanity checks!)
+ // an MDS-originated rename must either move between stray dirs or target
+ // a remote dentry that already points at srci
+ if (mdr->reqid.name.is_mds() &&
+ !(MDS_INO_IS_STRAY(srcpath.get_ino()) &&
+ MDS_INO_IS_STRAY(destpath.get_ino())) &&
+ !(destdnl->is_remote() &&
+ destdnl->get_remote_ino() == srci->ino())) {
+ respond_to_request(mdr, -CEPHFS_EINVAL); // actually, this won't reply, but whatev.
+ return;
+ }
+
+ // oldin: the inode currently linked at the destination, if any
+ CInode *oldin = 0;
+ if (!destdnl->is_null()) {
+ //dout(10) << "dest dn exists " << *destdn << dendl;
+ oldin = mdcache->get_dentry_inode(destdn, mdr, true);
+ if (!oldin) return;
+ dout(10) << " oldin " << *oldin << dendl;
+
+ // non-empty dir? do trivial fast unlocked check, do another check later with read locks
+ if (oldin->is_dir() && _dir_is_nonempty_unlocked(mdr, oldin)) {
+ respond_to_request(mdr, -CEPHFS_ENOTEMPTY);
+ return;
+ }
+
+ // mv /some/thing /to/some/existing_other_thing
+ if (oldin->is_dir() && !srci->is_dir()) {
+ respond_to_request(mdr, -CEPHFS_EISDIR);
+ return;
+ }
+ if (!oldin->is_dir() && srci->is_dir()) {
+ respond_to_request(mdr, -CEPHFS_ENOTDIR);
+ return;
+ }
+ if (srci == oldin && !srcdir->inode->is_stray()) {
+ respond_to_request(mdr, 0); // no-op. POSIX makes no sense.
+ return;
+ }
+ if (destdn->get_alternate_name() != req->get_alternate_name()) {
+ /* the dentry exists but the alternate_names do not match, fail... */
+ respond_to_request(mdr, -CEPHFS_EINVAL);
+ return;
+ }
+ }
+
+ vector<CDentry*>& srctrace = mdr->dn[1];
+ vector<CDentry*>& desttrace = mdr->dn[0];
+
+ // src+dest traces _must_ share a common ancestor for locking to prevent orphans
+ if (destpath.get_ino() != srcpath.get_ino() &&
+ !(req->get_source().is_mds() &&
+ MDS_INO_IS_STRAY(srcpath.get_ino()))) { // <-- mds 'rename' out of stray dir is ok!
+ CInode *srcbase = srctrace[0]->get_dir()->get_inode();
+ CInode *destbase = desttrace[0]->get_dir()->get_inode();
+ // ok, extend srctrace toward root until it is an ancestor of desttrace.
+ while (srcbase != destbase &&
+ !srcbase->is_projected_ancestor_of(destbase)) {
+ CDentry *pdn = srcbase->get_projected_parent_dn();
+ srctrace.insert(srctrace.begin(), pdn);
+ dout(10) << "rename prepending srctrace with " << *pdn << dendl;
+ srcbase = pdn->get_dir()->get_inode();
+ }
+
+ // then, extend destpath until it shares the same parent inode as srcpath.
+ while (destbase != srcbase) {
+ CDentry *pdn = destbase->get_projected_parent_dn();
+ desttrace.insert(desttrace.begin(), pdn);
+ dout(10) << "rename prepending desttrace with " << *pdn << dendl;
+ destbase = pdn->get_dir()->get_inode();
+ }
+ dout(10) << "rename src and dest traces now share common ancestor " << *destbase << dendl;
+ }
+
+
+ // linkmerge: source and destination dentries already refer to the same inode
+ bool linkmerge = srcdnl->get_inode() == destdnl->get_inode();
+ if (linkmerge)
+ dout(10) << " this is a link merge" << dendl;
+
+ // -- create stray dentry? --
+ // needed only when the destination's primary inode is being replaced;
+ // otherwise drop any stray dentry left pinned on the request
+ CDentry *straydn = NULL;
+ if (destdnl->is_primary() && !linkmerge) {
+ straydn = prepare_stray_dentry(mdr, destdnl->get_inode());
+ if (!straydn)
+ return;
+ dout(10) << " straydn is " << *straydn << dendl;
+ } else if (mdr->straydn) {
+ mdr->unpin(mdr->straydn);
+ mdr->straydn = NULL;
+ }
+
+
+ // -- locks --
+ // skipped once ALL_LOCKED has been recorded on the request
+ if (!(mdr->locking_state & MutationImpl::ALL_LOCKED)) {
+ MutationImpl::LockOpVec lov;
+
+ // we need to update srci's ctime. xlock its least contended lock to do that...
+ lov.add_xlock(&srci->linklock);
+ lov.add_xlock(&srci->snaplock);
+
+ if (oldin) {
+ // xlock oldin (for nlink--)
+ lov.add_xlock(&oldin->linklock);
+ lov.add_xlock(&oldin->snaplock);
+ if (oldin->is_dir()) {
+ ceph_assert(srci->is_dir());
+ lov.add_rdlock(&oldin->filelock); // to verify it's empty
+
+ // adjust locking order?
+ int cmp = mdr->compare_paths();
+ if (cmp < 0 || (cmp == 0 && oldin->ino() < srci->ino()))
+ std::reverse(lov.begin(), lov.end());
+ } else {
+ ceph_assert(!srci->is_dir());
+ // adjust locking order;
+ if (srci->ino() > oldin->ino())
+ std::reverse(lov.begin(), lov.end());
+ }
+ }
+
+ // straydn?
+ if (straydn) {
+ lov.add_wrlock(&straydn->get_dir()->inode->filelock);
+ lov.add_wrlock(&straydn->get_dir()->inode->nestlock);
+ lov.add_xlock(&straydn->lock);
+ }
+
+ // when srcdn is not ours and holds the primary link, srci is passed to
+ // acquire_locks() as the inode to auth-pin/freeze
+ CInode *auth_pin_freeze = !srcdn->is_auth() && srcdnl->is_primary() ? srci : nullptr;
+ if (!mds->locker->acquire_locks(mdr, lov, auth_pin_freeze))
+ return;
+
+ mdr->locking_state |= MutationImpl::ALL_LOCKED;
+ }
+
+ if (linkmerge)
+ ceph_assert(srcdir->inode->is_stray() && srcdnl->is_primary() && destdnl->is_remote());
+
+ // access and capacity checks are only run while no peer has witnessed yet
+ if ((!mdr->has_more() || mdr->more()->witnessed.empty())) {
+ if (!check_access(mdr, srcdir->get_inode(), MAY_WRITE))
+ return;
+
+ if (!check_access(mdr, destdn->get_dir()->get_inode(), MAY_WRITE))
+ return;
+
+ if (!linkmerge && !check_fragment_space(mdr, destdn->get_dir()))
+ return;
+
+ if (!linkmerge && !check_dir_max_entries(mdr, destdn->get_dir()))
+ return;
+
+ if (!check_access(mdr, srci, MAY_WRITE))
+ return;
+ }
+
+ // with read lock, really verify oldin is empty
+ if (oldin &&
+ oldin->is_dir() &&
+ _dir_is_nonempty(mdr, oldin)) {
+ respond_to_request(mdr, -CEPHFS_ENOTEMPTY);
+ return;
+ }
+
+ /* project_snaprealm_past_parent() will do this job
+ *
+ // moving between snaprealms?
+ if (srcdnl->is_primary() && srci->is_multiversion() && !srci->snaprealm) {
+ SnapRealm *srcrealm = srci->find_snaprealm();
+ SnapRealm *destrealm = destdn->get_dir()->inode->find_snaprealm();
+ if (srcrealm != destrealm &&
+ (srcrealm->get_newest_seq() + 1 > srcdn->first ||
+ destrealm->get_newest_seq() + 1 > srcdn->first)) {
+ dout(10) << " renaming between snaprealms, creating snaprealm for " << *srci << dendl;
+ mdcache->snaprealm_create(mdr, srci);
+ return;
+ }
+ }
+ */
+
+ // renames between realms that belong to different subvolumes are refused
+ SnapRealm *dest_realm = nullptr;
+ SnapRealm *src_realm = nullptr;
+ if (!linkmerge) {
+ dest_realm = destdir->inode->find_snaprealm();
+ if (srcdir->inode == destdir->inode)
+ src_realm = dest_realm;
+ else
+ src_realm = srcdir->inode->find_snaprealm();
+ if (src_realm != dest_realm &&
+ src_realm->get_subvolume_ino() != dest_realm->get_subvolume_ino()) {
+ respond_to_request(mdr, -CEPHFS_EXDEV);
+ return;
+ }
+ }
+
+ ceph_assert(g_conf()->mds_kill_rename_at != 1);
+
+ // -- open all srcdn inode frags, if any --
+ // we need these open so that auth can properly delegate from inode to dirfrags
+ // after the inode is _ours_.
+ if (srcdnl->is_primary() &&
+ !srcdn->is_auth() &&
+ srci->is_dir()) {
+ dout(10) << "srci is remote dir, setting stickydirs and opening all frags" << dendl;
+ mdr->set_stickydirs(srci);
+
+ frag_vec_t leaves;
+ srci->dirfragtree.get_leaves(leaves);
+ for (const auto& leaf : leaves) {
+ CDir *dir = srci->get_dirfrag(leaf);
+ if (!dir) {
+ dout(10) << " opening " << leaf << " under " << *srci << dendl;
+ mdcache->open_remote_dirfrag(srci, leaf, new C_MDS_RetryRequest(mdcache, mdr));
+ return;
+ }
+ }
+ }
+
+ // -- prepare snaprealm ---
+ // new srnodes are stashed in mdr->more() so they are prepared at most once
+ // per request
+
+ if (linkmerge) {
+ if (!mdr->more()->srci_srnode &&
+ srci->get_projected_inode()->nlink == 1 &&
+ srci->is_projected_snaprealm_global()) {
+ sr_t *new_srnode = srci->prepare_new_srnode(0);
+ srci->record_snaprealm_parent_dentry(new_srnode, nullptr, destdn, false);
+
+ srci->clear_snaprealm_global(new_srnode);
+ mdr->more()->srci_srnode = new_srnode;
+ }
+ } else {
+ if (oldin && !mdr->more()->desti_srnode) {
+ if (oldin->is_projected_snaprealm_global()) {
+ sr_t *new_srnode = oldin->prepare_new_srnode(0);
+ oldin->record_snaprealm_parent_dentry(new_srnode, dest_realm, destdn, destdnl->is_primary());
+ // dropping the last linkage or dropping the last remote linkage,
+ // detach the inode from global snaprealm
+ auto nlink = oldin->get_projected_inode()->nlink;
+ if (nlink == 1 ||
+ (nlink == 2 && !destdnl->is_primary() &&
+ !oldin->get_projected_parent_dir()->inode->is_stray()))
+ oldin->clear_snaprealm_global(new_srnode);
+ mdr->more()->desti_srnode = new_srnode;
+ } else if (destdnl->is_primary()) {
+ snapid_t follows = dest_realm->get_newest_seq();
+ if (oldin->snaprealm || follows + 1 > oldin->get_oldest_snap()) {
+ sr_t *new_srnode = oldin->prepare_new_srnode(follows);
+ oldin->record_snaprealm_past_parent(new_srnode, straydn->get_dir()->inode->find_snaprealm());
+ mdr->more()->desti_srnode = new_srnode;
+ }
+ }
+ }
+ if (!mdr->more()->srci_srnode) {
+ if (srci->is_projected_snaprealm_global()) {
+ sr_t *new_srnode = srci->prepare_new_srnode(0);
+ srci->record_snaprealm_parent_dentry(new_srnode, src_realm, srcdn, srcdnl->is_primary());
+ mdr->more()->srci_srnode = new_srnode;
+ } else if (srcdnl->is_primary()) {
+ snapid_t follows = src_realm->get_newest_seq();
+ if (src_realm != dest_realm &&
+ (srci->snaprealm || follows + 1 > srci->get_oldest_snap())) {
+ sr_t *new_srnode = srci->prepare_new_srnode(follows);
+ srci->record_snaprealm_past_parent(new_srnode, dest_realm);
+ mdr->more()->srci_srnode = new_srnode;
+ }
+ }
+ }
+ }
+
+ // -- prepare witnesses --
+
+ /*
+ * NOTE: we use _all_ replicas as witnesses.
+ * this probably isn't totally necessary (esp for file renames),
+ * but if/when we change that, we have to make sure rejoin is
+ * sufficiently robust to handle strong rejoins from survivors
+ * with totally wrong dentry->inode linkage.
+ * (currently, it can ignore rename effects, because the resolve
+ * stage will sort them out.)
+ */
+ set<mds_rank_t> witnesses = mdr->more()->extra_witnesses;
+ if (srcdn->is_auth())
+ srcdn->list_replicas(witnesses);
+ else
+ witnesses.insert(srcdn->authority().first);
+ if (srcdnl->is_remote() && !srci->is_auth())
+ witnesses.insert(srci->authority().first);
+ destdn->list_replicas(witnesses);
+ if (destdnl->is_remote() && !oldin->is_auth())
+ witnesses.insert(oldin->authority().first);
+ dout(10) << " witnesses " << witnesses << ", have " << mdr->more()->witnessed << dendl;
+
+ if (!witnesses.empty()) {
+ // Replicas can't see projected dentry linkages and will get confused.
+ // We have taken snaplocks on ancestor inodes. Later rename/rmdir requests
+ // can't project these inodes' linkages.
+ bool need_flush = false;
+ for (auto& dn : srctrace) {
+ if (dn->is_projected()) {
+ need_flush = true;
+ break;
+ }
+ }
+ if (!need_flush) {
+ // walk from destdn up through the projected parent dentries
+ CDentry *dn = destdn;
+ do {
+ if (dn->is_projected()) {
+ need_flush = true;
+ break;
+ }
+ CInode *diri = dn->get_dir()->get_inode();
+ dn = diri->get_projected_parent_dn();
+ } while (dn);
+ }
+ if (need_flush) {
+ mdlog->wait_for_safe(
+ new MDSInternalContextWrapper(mds,
+ new C_MDS_RetryRequest(mdcache, mdr)));
+ mdlog->flush();
+ return;
+ }
+ }
+
+ // do srcdn auth last
+ mds_rank_t last = MDS_RANK_NONE;
+ if (!srcdn->is_auth()) {
+ last = srcdn->authority().first;
+ mdr->more()->srcdn_auth_mds = last;
+ // ask auth of srci to mark srci as ambiguous auth if more than two MDS
+ // are involved in the rename operation.
+ if (srcdnl->is_primary() && !mdr->more()->is_ambiguous_auth) {
+ dout(10) << " preparing ambiguous auth for srci" << dendl;
+ ceph_assert(mdr->more()->is_remote_frozen_authpin);
+ ceph_assert(mdr->more()->rename_inode == srci);
+ _rename_prepare_witness(mdr, last, witnesses, srctrace, desttrace, straydn);
+ return;
+ }
+ }
+
+ for (set<mds_rank_t>::iterator p = witnesses.begin();
+ p != witnesses.end();
+ ++p) {
+ if (*p == last) continue; // do it last!
+ if (mdr->more()->witnessed.count(*p)) {
+ dout(10) << " already witnessed by mds." << *p << dendl;
+ } else if (mdr->more()->waiting_on_peer.count(*p)) {
+ dout(10) << " already waiting on witness mds." << *p << dendl;
+ } else {
+ if (!_rename_prepare_witness(mdr, *p, witnesses, srctrace, desttrace, straydn))
+ return;
+ }
+ }
+ if (!mdr->more()->waiting_on_peer.empty())
+ return; // we're waiting for a witness.
+
+ if (last != MDS_RANK_NONE && mdr->more()->witnessed.count(last) == 0) {
+ dout(10) << " preparing last witness (srcdn auth)" << dendl;
+ ceph_assert(mdr->more()->waiting_on_peer.count(last) == 0);
+ _rename_prepare_witness(mdr, last, witnesses, srctrace, desttrace, straydn);
+ return;
+ }
+
+ // test hack: bail after peer does prepare, so we can verify it's _live_ rollback.
+ if (!mdr->more()->peers.empty() && !srci->is_dir())
+ ceph_assert(g_conf()->mds_kill_rename_at != 3);
+ if (!mdr->more()->peers.empty() && srci->is_dir())
+ ceph_assert(g_conf()->mds_kill_rename_at != 4);
+
+ // -- declare now --
+ mdr->set_mds_stamp(ceph_clock_now());
+
+ // -- prepare journal entry --
+ mdr->ls = mdlog->get_current_segment();
+ EUpdate *le = new EUpdate(mdlog, "rename");
+ mdlog->start_entry(le);
+ le->metablob.add_client_req(mdr->reqid, req->get_oldest_client_tid());
+ if (!mdr->more()->witnessed.empty()) {
+ dout(20) << " noting uncommitted_peers " << mdr->more()->witnessed << dendl;
+
+ le->reqid = mdr->reqid;
+ le->had_peers = true;
+
+ mdcache->add_uncommitted_leader(mdr->reqid, mdr->ls, mdr->more()->witnessed);
+ // no need to send frozen auth pin to recovering auth MDS of srci
+ mdr->more()->is_remote_frozen_authpin = false;
+ }
+
+ _rename_prepare(mdr, &le->metablob, &le->client_map, srcdn, destdn, req->get_alternate_name(), straydn);
+ if (le->client_map.length())
+ le->cmapv = mds->sessionmap.get_projected();
+
+ // -- commit locally --
+ C_MDS_rename_finish *fin = new C_MDS_rename_finish(this, mdr, srcdn, destdn, straydn);
+
+ journal_and_reply(mdr, srci, destdn, le, fin);
+ mds->balancer->maybe_fragment(destdn->get_dir(), false);
+}
+
+
+// Completion of a rename on the leader, run once the "rename" journal entry
+// is logged: applies the rename to the cache, notifies replicas of the new
+// link, replies to the client and hands a resulting stray to the cache.
+void Server::_rename_finish(MDRequestRef& mdr, CDentry *srcdn, CDentry *destdn, CDentry *straydn)
+{
+  dout(10) << "_rename_finish " << *mdr << dendl;
+
+  if (!mdr->more()->witnessed.empty()) {
+    mdcache->logged_leader_update(mdr->reqid);
+  }
+
+  // apply
+  _rename_apply(mdr, srcdn, destdn, straydn);
+
+  mdcache->send_dentry_link(destdn, mdr);
+
+  CDentry::linkage_t *destdnl = destdn->get_linkage();
+  CInode *in = destdnl->get_inode();
+  const bool need_eval = mdr->more()->cap_imports.count(in);
+
+  // test hack: test peer commit
+  if (!mdr->more()->peers.empty()) {
+    if (in->is_dir()) {
+      ceph_assert(g_conf()->mds_kill_rename_at != 6);
+    } else {
+      ceph_assert(g_conf()->mds_kill_rename_at != 5);
+    }
+  }
+
+  // bump popularity
+  mds->balancer->hit_dir(srcdn->get_dir(), META_POP_IWR);
+  if (destdnl->is_remote() && in->is_auth()) {
+    mds->balancer->hit_inode(in, META_POP_IWR);
+  }
+
+  // did we import srci?  if so, explicitly ack that import, before we unlock and reply.
+
+  ceph_assert(g_conf()->mds_kill_rename_at != 7);
+
+  // reply
+  respond_to_request(mdr, 0);
+
+  if (need_eval) {
+    mds->locker->eval(in, CEPH_CAP_LOCKS, true);
+  }
+
+  // clean up?
+  // respond_to_request() drops locks. So stray reintegration can race with us.
+  const bool stray_still_linked =
+    straydn && !straydn->get_projected_linkage()->is_null();
+  if (stray_still_linked) {
+    mdcache->notify_stray(straydn);
+  }
+}
+
+
+
+// helpers
+
+/**
+ * Ask MDS rank @p who to act as a witness for this rename by sending it an
+ * OP_RENAMEPREP peer request.
+ *
+ * @param mdr       the client rename request (must carry a client_request)
+ * @param who       rank to contact
+ * @param witnesse  witness set copied into the request
+ * @param srctrace  dentry trace of the source; used to build srcdnpath
+ * @param dsttrace  dentry trace of the destination; used to build destdnpath
+ * @param straydn   if non-null, a replica of it is encoded into the request
+ * @return false if @p who cannot be contacted yet (cluster degraded and the
+ *         rank is not clientreplay/active/stopping); true once the request
+ *         has been sent and @p who was added to waiting_on_peer
+ */
+bool Server::_rename_prepare_witness(MDRequestRef& mdr, mds_rank_t who, set<mds_rank_t> &witnesse,
+ vector<CDentry*>& srctrace, vector<CDentry*>& dsttrace, CDentry *straydn)
+{
+  const auto& creq = mdr->client_request;
+  ceph_assert(creq);
+
+  const bool witness_unavailable =
+    mds->is_cluster_degraded() &&
+    !mds->mdsmap->is_clientreplay_or_active_or_stopping(who);
+  if (witness_unavailable) {
+    dout(10) << "_rename_prepare_witness mds." << who << " is not active" << dendl;
+    // only queue a retry if no other peer reply is already pending
+    if (mdr->more()->waiting_on_peer.empty()) {
+      mds->wait_for_active_peer(who, new C_MDS_RetryRequest(mdcache, mdr));
+    }
+    return false;
+  }
+
+  dout(10) << "_rename_prepare_witness mds." << who << dendl;
+  auto prep = make_message<MMDSPeerRequest>(mdr->reqid, mdr->attempt, MMDSPeerRequest::OP_RENAMEPREP);
+
+  // source path: base dir ino followed by every dentry name of the trace
+  prep->srcdnpath = filepath(srctrace.front()->get_dir()->ino());
+  for (auto src_dn : srctrace) {
+    prep->srcdnpath.push_dentry(src_dn->get_name());
+  }
+
+  // destination path, built the same way
+  prep->destdnpath = filepath(dsttrace.front()->get_dir()->ino());
+  for (auto dst_dn : dsttrace) {
+    prep->destdnpath.push_dentry(dst_dn->get_name());
+  }
+
+  prep->alternate_name = creq->alternate_name;
+
+  if (straydn) {
+    mdcache->encode_replica_stray(straydn, who, prep->straybl);
+  }
+
+  // forward any prepared snaprealm nodes for source / destination inode
+  if (mdr->more()->srci_srnode) {
+    encode(*mdr->more()->srci_srnode, prep->srci_snapbl);
+  }
+  if (mdr->more()->desti_srnode) {
+    encode(*mdr->more()->desti_srnode, prep->desti_snapbl);
+  }
+
+  prep->srcdn_auth = mdr->more()->srcdn_auth_mds;
+
+  // srcdn auth will verify our current witness list is sufficient
+  prep->witnesses = witnesse;
+
+  prep->op_stamp = mdr->get_op_stamp();
+  mds->send_message_mds(prep, who);
+
+  // record that a reply from this rank is now outstanding
+  auto& pending = mdr->more()->waiting_on_peer;
+  ceph_assert(pending.count(who) == 0);
+  pending.insert(who);
+  return true;
+}
+
+/**
+ * Decode the inode import payload stored in mdr->more()->inode_import while
+ * preparing a rename whose source inode is being imported.
+ *
+ * The client map and client metadata map are decoded from the payload,
+ * passed to prepare_force_open_sessions() and re-encoded into
+ * @p client_map_bl; then the inode itself is decoded onto @p srcdn.
+ *
+ * @param mdr            the rename request
+ * @param srcdn          source dentry the imported inode is decoded onto
+ * @param client_map_bl  output buffer receiving the re-encoded client maps
+ * @return mdr->more()->inode_import_v as it was on entry
+ */
+version_t Server::_rename_prepare_import(MDRequestRef& mdr, CDentry *srcdn, bufferlist *client_map_bl)
+{
+  const version_t prev_pv = mdr->more()->inode_import_v;
+
+  CDentry::linkage_t *src_linkage = srcdn->get_linkage();
+
+  // cursor over the import payload
+  auto cursor = mdr->more()->inode_import.cbegin();
+
+  // sessions of clients whose caps come along with the inode
+  map<client_t,entity_inst_t> sessions;
+  map<client_t, client_metadata_t> session_meta;
+  decode(sessions, cursor);
+  decode(session_meta, cursor);
+  prepare_force_open_sessions(sessions, session_meta,
+                              mdr->more()->imported_session_map);
+  encode(sessions, *client_map_bl, mds->mdsmap->get_up_features());
+  encode(session_meta, *client_map_bl);
+
+  // the inode itself
+  list<ScatterLock*> scatterlocks;
+  mdcache->migrator->decode_import_inode(srcdn, cursor, srcdn->authority().first, mdr->ls,
+                                         mdr->more()->cap_imports, scatterlocks);
+
+  // hack: force back to !auth and clean, temporarily
+  CInode *imported = src_linkage->get_inode();
+  imported->state_clear(CInode::STATE_AUTH);
+  imported->mark_clean();
+
+  return prev_pv;
+}
+
+/**
+ * Decide whether a rename involving directory inode @p diri must be
+ * journaled here because of auth subtrees at or beneath it.
+ *
+ * @param diri   directory inode whose dirfrags are inspected
+ * @param empty  when true, only the dirfrags themselves are checked for
+ *               being subtree roots whose dir_auth.first is this rank;
+ *               otherwise every known subtree contained in one of the
+ *               dirfrags is checked
+ * @return true as soon as one matching dirfrag/subtree is found
+ */
+bool Server::_need_force_journal(CInode *diri, bool empty)
+{
+  auto&& frags = diri->get_dirfrags();
+
+  // does the .first of the dirfrag's dir_auth name this rank?
+  auto auth_is_me = [this](const auto& d) {
+    return d->get_dir_auth().first == mds->get_nodeid();
+  };
+
+  if (empty) {
+    for (const auto& frag : frags) {
+      if (!(frag->is_subtree_root() && auth_is_me(frag))) {
+        dout(20) << " frag " << frag->get_frag() << " is not auth subtree dirfrag" << dendl;
+        continue;
+      }
+      dout(10) << " frag " << frag->get_frag() << " is auth subtree dirfrag, will force journal" << dendl;
+      return true;
+    }
+    return false;
+  }
+
+  // see if any children of our frags are auth subtrees.
+  std::vector<CDir*> bounds;
+  mdcache->get_subtrees(bounds);
+  dout(10) << " subtrees " << bounds << " frags " << frags << dendl;
+  for (const auto& frag : frags) {
+    for (const auto& st : bounds) {
+      if (!frag->contains(st)) {
+        dout(20) << " frag " << frag->get_frag() << " does not contain " << *st << dendl;
+        continue;
+      }
+      if (!auth_is_me(st)) {
+        dout(20) << " frag " << frag->get_frag() << " contains but isn't auth for " << *st << dendl;
+        continue;
+      }
+      dout(10) << " frag " << frag->get_frag() << " contains (maybe) auth subtree, will force journal "
+               << *st << dendl;
+      return true;
+    }
+  }
+  return false;
+}
+
+/**
+ * Project a rename onto the cache and fill in the journal metablob.
+ *
+ * Pushes the projected linkages (null on srcdn, the renamed inode or remote
+ * link on destdn, the overwritten inode on straydn), projects the inodes
+ * this MDS is auth for, records pre-dirty versions in mdr->more()->pvmap,
+ * predirties the affected parents and adds the touched dentries and dirs
+ * to @p metablob. Nothing is applied here; see _rename_apply().
+ *
+ * @param mdr             request being processed; when mdr->peer_request is
+ *                        set, snaprealm blobs are taken from it
+ * @param metablob        journal blob to populate
+ * @param client_map_bl   receives the encoded client maps when the source
+ *                        inode is imported (see _rename_prepare_import)
+ * @param srcdn           source dentry
+ * @param destdn          destination dentry
+ * @param alternate_name  stored on destdn only if its projected linkage is null
+ * @param straydn         stray dentry for an overwritten primary target;
+ *                        asserted non-null when destdn is primary
+ */
+void Server::_rename_prepare(MDRequestRef& mdr,
+ EMetaBlob *metablob, bufferlist *client_map_bl,
+ CDentry *srcdn, CDentry *destdn, std::string_view alternate_name,
+ CDentry *straydn)
+{
+ dout(10) << "_rename_prepare " << *mdr << " " << *srcdn << " " << *destdn << dendl;
+ if (straydn)
+ dout(10) << " straydn " << *straydn << dendl;
+
+ CDentry::linkage_t *srcdnl = srcdn->get_projected_linkage();
+ CDentry::linkage_t *destdnl = destdn->get_projected_linkage();
+ CInode *srci = srcdnl->get_inode();
+ CInode *oldin = destdnl->get_inode();
+
+ // primary+remote link merge?
+ bool linkmerge = (srci == oldin);
+ if (linkmerge)
+ ceph_assert(srcdnl->is_primary() && destdnl->is_remote());
+ // source dentry lives in a stray directory: skip ctime/nlink/dir mtime updates below
+ bool silent = srcdn->get_dir()->inode->is_stray();
+
+ bool force_journal_dest = false;
+ if (srci->is_dir() && !destdn->is_auth()) {
+ if (srci->is_auth()) {
+ // if we are auth for srci and exporting it, force journal because journal replay needs
+ // the source inode to create auth subtrees.
+ dout(10) << " we are exporting srci, will force journal destdn" << dendl;
+ force_journal_dest = true;
+ } else
+ force_journal_dest = _need_force_journal(srci, false);
+ }
+
+ bool force_journal_stray = false;
+ if (oldin && oldin->is_dir() && straydn && !straydn->is_auth())
+ force_journal_stray = _need_force_journal(oldin, true);
+
+ if (linkmerge)
+ dout(10) << " merging remote and primary links to the same inode" << dendl;
+ if (silent)
+ dout(10) << " reintegrating stray; will avoid changing nlink or dir mtime" << dendl;
+ if (force_journal_dest)
+ dout(10) << " forcing journal destdn because we (will) have auth subtrees nested beneath it" << dendl;
+ if (force_journal_stray)
+ dout(10) << " forcing journal straydn because we (will) have auth subtrees nested beneath it" << dendl;
+
+ if (srci->is_dir() && (destdn->is_auth() || force_journal_dest)) {
+ dout(10) << " noting renamed dir ino " << srci->ino() << " in metablob" << dendl;
+ metablob->renamed_dirino = srci->ino();
+ } else if (oldin && oldin->is_dir() && force_journal_stray) {
+ dout(10) << " noting rename target dir " << oldin->ino() << " in metablob" << dendl;
+ metablob->renamed_dirino = oldin->ino();
+ }
+
+ // prepare
+ // spi/tpi stay null unless the corresponding inode is projected below
+ CInode::mempool_inode *spi = 0; // renamed inode
+ CInode::mempool_inode *tpi = 0; // target/overwritten inode
+
+ // target inode
+ if (!linkmerge) {
+ if (destdnl->is_primary()) {
+ ceph_assert(straydn); // moving to straydn.
+ // link--, and move.
+ if (destdn->is_auth()) {
+ auto pi= oldin->project_inode(mdr); //project_snaprealm
+ pi.inode->version = straydn->pre_dirty(pi.inode->version);
+ pi.inode->update_backtrace();
+ tpi = pi.inode.get();
+ }
+ straydn->push_projected_linkage(oldin);
+ } else if (destdnl->is_remote()) {
+ // nlink-- targeti
+ if (oldin->is_auth()) {
+ auto pi = oldin->project_inode(mdr);
+ pi.inode->version = oldin->pre_dirty();
+ tpi = pi.inode.get();
+ }
+ }
+ }
+
+ // dest
+ if (destdnl->is_null()) {
+ /* handle_client_rename checks that alternate_name matches for existing destdn */
+ destdn->set_alternate_name(alternate_name);
+ }
+ if (srcdnl->is_remote()) {
+ if (!linkmerge) {
+ // destdn
+ if (destdn->is_auth())
+ mdr->more()->pvmap[destdn] = destdn->pre_dirty();
+ destdn->push_projected_linkage(srcdnl->get_remote_ino(), srcdnl->get_remote_d_type());
+ // srci
+ if (srci->is_auth()) {
+ auto pi = srci->project_inode(mdr);
+ pi.inode->version = srci->pre_dirty();
+ spi = pi.inode.get();
+ }
+ } else {
+ dout(10) << " will merge remote onto primary link" << dendl;
+ if (destdn->is_auth()) {
+ auto pi = oldin->project_inode(mdr);
+ pi.inode->version = mdr->more()->pvmap[destdn] = destdn->pre_dirty(oldin->get_version());
+ spi = pi.inode.get();
+ }
+ }
+ } else { // primary
+ if (destdn->is_auth()) {
+ version_t oldpv;
+ if (srcdn->is_auth())
+ oldpv = srci->get_projected_version();
+ else {
+ // srcdn is not ours: decode the inode import carried by the request
+ oldpv = _rename_prepare_import(mdr, srcdn, client_map_bl);
+
+ // note which dirfrags have child subtrees in the journal
+ // event, so that we can open those (as bounds) during replay.
+ if (srci->is_dir()) {
+ auto&& ls = srci->get_dirfrags();
+ for (const auto& dir : ls) {
+ if (!dir->is_auth())
+ metablob->renamed_dir_frags.push_back(dir->get_frag());
+ }
+ dout(10) << " noting renamed dir open frags " << metablob->renamed_dir_frags << dendl;
+ }
+ }
+ auto pi = srci->project_inode(mdr); // project snaprealm if srcdnl->is_primary
+ // & srcdnl->snaprealm
+ pi.inode->version = mdr->more()->pvmap[destdn] = destdn->pre_dirty(oldpv);
+ pi.inode->update_backtrace();
+ spi = pi.inode.get();
+ }
+ destdn->push_projected_linkage(srci);
+ }
+
+ // src
+ if (srcdn->is_auth())
+ mdr->more()->pvmap[srcdn] = srcdn->pre_dirty();
+ srcdn->push_projected_linkage(); // push null linkage
+
+ // timestamp / link-count updates on the projected inodes (skipped for stray reintegration)
+ if (!silent) {
+ if (spi) {
+ spi->ctime = mdr->get_op_stamp();
+ if (mdr->get_op_stamp() > spi->rstat.rctime)
+ spi->rstat.rctime = mdr->get_op_stamp();
+ spi->change_attr++;
+ if (linkmerge)
+ spi->nlink--;
+ }
+ if (tpi) {
+ tpi->ctime = mdr->get_op_stamp();
+ if (mdr->get_op_stamp() > tpi->rstat.rctime)
+ tpi->rstat.rctime = mdr->get_op_stamp();
+ tpi->change_attr++;
+ {
+ // remember the path the overwritten inode had at destdn
+ std::string t;
+ destdn->make_path_string(t, true);
+ tpi->stray_prior_path = std::move(t);
+ }
+ tpi->nlink--;
+ if (tpi->nlink == 0)
+ oldin->state_set(CInode::STATE_ORPHAN);
+ }
+ }
+
+ // prepare nesting, mtime updates
+ int predirty_dir = silent ? 0:PREDIRTY_DIR;
+
+ // guarantee stray dir is processed first during journal replay. unlink the old inode,
+ // then link the source inode to destdn
+ if (destdnl->is_primary()) {
+ ceph_assert(straydn);
+ if (straydn->is_auth()) {
+ metablob->add_dir_context(straydn->get_dir());
+ metablob->add_dir(straydn->get_dir(), true);
+ }
+ }
+
+ if (!linkmerge && destdnl->is_remote() && oldin->is_auth()) {
+ CDir *oldin_dir = oldin->get_projected_parent_dir();
+ if (oldin_dir != srcdn->get_dir() && oldin_dir != destdn->get_dir())
+ mdcache->predirty_journal_parents(mdr, metablob, oldin, oldin_dir, PREDIRTY_PRIMARY);
+ }
+
+ // sub off target
+ if (destdn->is_auth() && !destdnl->is_null()) {
+ mdcache->predirty_journal_parents(mdr, metablob, oldin, destdn->get_dir(),
+ (destdnl->is_primary() ? PREDIRTY_PRIMARY:0)|predirty_dir, -1);
+ if (destdnl->is_primary()) {
+ ceph_assert(straydn);
+ mdcache->predirty_journal_parents(mdr, metablob, oldin, straydn->get_dir(),
+ PREDIRTY_PRIMARY|PREDIRTY_DIR, 1);
+ }
+ }
+
+ if (srcdnl->is_remote() && srci->is_auth()) {
+ CDir *srci_dir = srci->get_projected_parent_dir();
+ if (srci_dir != srcdn->get_dir() && srci_dir != destdn->get_dir())
+ mdcache->predirty_journal_parents(mdr, metablob, srci, srci_dir, PREDIRTY_PRIMARY);
+ }
+
+ // move srcdn
+ // PREDIRTY_PRIMARY only when a primary link actually changes directory
+ int predirty_primary = (srcdnl->is_primary() && srcdn->get_dir() != destdn->get_dir()) ? PREDIRTY_PRIMARY:0;
+ int flags = predirty_dir | predirty_primary;
+ if (srcdn->is_auth())
+ mdcache->predirty_journal_parents(mdr, metablob, srci, srcdn->get_dir(), PREDIRTY_SHALLOW|flags, -1);
+ if (destdn->is_auth())
+ mdcache->predirty_journal_parents(mdr, metablob, srci, destdn->get_dir(), flags, 1);
+
+ // add it all to the metablob
+ // target inode
+ if (!linkmerge) {
+ if (destdnl->is_primary()) {
+ ceph_assert(straydn);
+ if (destdn->is_auth()) {
+ // project snaprealm, too
+ if (auto& desti_srnode = mdr->more()->desti_srnode) {
+ oldin->project_snaprealm(desti_srnode);
+ if (tpi->nlink == 0)
+ ceph_assert(!desti_srnode->is_parent_global());
+ // pointer was handed to project_snaprealm(); clear our copy
+ desti_srnode = NULL;
+ }
+ straydn->first = mdcache->get_global_snaprealm()->get_newest_seq() + 1;
+ metablob->add_primary_dentry(straydn, oldin, true, true);
+ } else if (force_journal_stray) {
+ dout(10) << " forced journaling straydn " << *straydn << dendl;
+ metablob->add_dir_context(straydn->get_dir());
+ metablob->add_primary_dentry(straydn, oldin, true);
+ }
+ } else if (destdnl->is_remote()) {
+ if (oldin->is_auth()) {
+ // snaprealm node comes from the peer request blob when acting as peer,
+ // otherwise from the one prepared on the request itself
+ sr_t *new_srnode = NULL;
+ if (mdr->peer_request) {
+ if (mdr->peer_request->desti_snapbl.length() > 0) {
+ new_srnode = new sr_t();
+ auto p = mdr->peer_request->desti_snapbl.cbegin();
+ decode(*new_srnode, p);
+ }
+ } else if (auto& desti_srnode = mdr->more()->desti_srnode) {
+ new_srnode = desti_srnode;
+ desti_srnode = NULL;
+ }
+ if (new_srnode) {
+ oldin->project_snaprealm(new_srnode);
+ if (tpi->nlink == 0)
+ ceph_assert(!new_srnode->is_parent_global());
+ }
+ // auth for targeti
+ CDentry *oldin_pdn = oldin->get_projected_parent_dn();
+ mdcache->journal_cow_dentry(mdr.get(), metablob, oldin_pdn);
+ metablob->add_primary_dentry(oldin_pdn, oldin, true);
+ }
+ }
+ }
+
+ // dest
+ if (srcdnl->is_remote()) {
+ ceph_assert(!linkmerge);
+ if (destdn->is_auth() && !destdnl->is_null())
+ mdcache->journal_cow_dentry(mdr.get(), metablob, destdn, CEPH_NOSNAP, 0, destdnl);
+ else
+ destdn->first = mdcache->get_global_snaprealm()->get_newest_seq() + 1;
+
+ if (destdn->is_auth())
+ metablob->add_remote_dentry(destdn, true, srcdnl->get_remote_ino(), srcdnl->get_remote_d_type());
+
+ if (srci->is_auth() ) { // it's remote
+ if (mdr->peer_request) {
+ if (mdr->peer_request->srci_snapbl.length() > 0) {
+ sr_t *new_srnode = new sr_t();
+ auto p = mdr->peer_request->srci_snapbl.cbegin();
+ decode(*new_srnode, p);
+ srci->project_snaprealm(new_srnode);
+ }
+ } else if (auto& srci_srnode = mdr->more()->srci_srnode) {
+ srci->project_snaprealm(srci_srnode);
+ srci_srnode = NULL;
+ }
+
+ CDentry *srci_pdn = srci->get_projected_parent_dn();
+ mdcache->journal_cow_dentry(mdr.get(), metablob, srci_pdn);
+ metablob->add_primary_dentry(srci_pdn, srci, true);
+ }
+ } else if (srcdnl->is_primary()) {
+ // project snap parent update?
+ if (destdn->is_auth()) {
+ if (auto& srci_srnode = mdr->more()->srci_srnode) {
+ srci->project_snaprealm(srci_srnode);
+ srci_srnode = NULL;
+ }
+ }
+
+ if (destdn->is_auth() && !destdnl->is_null())
+ mdcache->journal_cow_dentry(mdr.get(), metablob, destdn, CEPH_NOSNAP, 0, destdnl);
+
+ destdn->first = mdcache->get_global_snaprealm()->get_newest_seq() + 1;
+ {
+ // fault injection: with probability inject_rename_corrupt_dentry_first,
+ // deliberately overwrite destdn->first with a bogus value
+ auto do_corruption = inject_rename_corrupt_dentry_first;
+ if (unlikely(do_corruption > 0.0)) {
+ auto r = ceph::util::generate_random_number(0.0, 1.0);
+ if (r < do_corruption) {
+ dout(0) << "corrupting dn: " << *destdn << dendl;
+ destdn->first = -10;
+ }
+ }
+ }
+
+ if (destdn->is_auth())
+ metablob->add_primary_dentry(destdn, srci, true, true);
+ else if (force_journal_dest) {
+ dout(10) << " forced journaling destdn " << *destdn << dendl;
+ metablob->add_dir_context(destdn->get_dir());
+ metablob->add_primary_dentry(destdn, srci, true);
+ if (srcdn->is_auth() && srci->is_dir()) {
+ // journal new subtrees root dirfrags
+ auto&& ls = srci->get_dirfrags();
+ for (const auto& dir : ls) {
+ if (dir->is_auth())
+ metablob->add_dir(dir, true);
+ }
+ }
+ }
+ }
+
+ // src
+ if (srcdn->is_auth()) {
+ dout(10) << " journaling srcdn " << *srcdn << dendl;
+ mdcache->journal_cow_dentry(mdr.get(), metablob, srcdn, CEPH_NOSNAP, 0, srcdnl);
+ // also journal the inode in case we need do peer rename rollback. It is Ok to add
+ // both primary and NULL dentries. Because during journal replay, null dentry is
+ // processed after primary dentry.
+ if (srcdnl->is_primary() && !srci->is_dir() && !destdn->is_auth())
+ metablob->add_primary_dentry(srcdn, srci, true);
+ metablob->add_null_dentry(srcdn, true);
+ } else
+ dout(10) << " NOT journaling srcdn " << *srcdn << dendl;
+
+ // make renamed inode first track the dn
+ if (srcdnl->is_primary() && destdn->is_auth()) {
+ ceph_assert(srci->first <= destdn->first);
+ srci->first = destdn->first;
+ }
+ // make stray inode first track the straydn
+ if (straydn && straydn->is_auth()) {
+ ceph_assert(oldin->first <= straydn->first);
+ oldin->first = straydn->first;
+ }
+
+ // project the subtree moves for any directory that changes parent
+ if (oldin && oldin->is_dir()) {
+ ceph_assert(straydn);
+ mdcache->project_subtree_rename(oldin, destdn->get_dir(), straydn->get_dir());
+ }
+ if (srci->is_dir())
+ mdcache->project_subtree_rename(srci, srcdn->get_dir(), destdn->get_dir());
+
+}
+
+
+/**
+ * Apply a previously projected rename to the live cache.
+ *
+ * Unlinks the old target from destdn (moving a primary target to straydn),
+ * unlinks the source inode from srcdn, pops the projected linkages and
+ * inodes, marks the dentries this MDS is auth for dirty with the versions
+ * recorded in mdr->more()->pvmap, finishes the inode/cap import when the
+ * source inode moved to this MDS, and finally adjusts the subtree map and
+ * sends snaprealm split notifications where a new snaprealm appeared.
+ *
+ * @param mdr      the rename request (leader or peer)
+ * @param srcdn    source dentry
+ * @param destdn   destination dentry
+ * @param straydn  stray dentry for an overwritten primary target;
+ *                 asserted non-null when destdn is primary
+ */
+void Server::_rename_apply(MDRequestRef& mdr, CDentry *srcdn, CDentry *destdn, CDentry *straydn)
+{
+ dout(10) << "_rename_apply " << *mdr << " " << *srcdn << " " << *destdn << dendl;
+ dout(10) << " pvs " << mdr->more()->pvmap << dendl;
+
+ // current (not projected) linkages: the rename has not been applied yet
+ CDentry::linkage_t *srcdnl = srcdn->get_linkage();
+ CDentry::linkage_t *destdnl = destdn->get_linkage();
+
+ CInode *oldin = destdnl->get_inode();
+
+ // primary+remote link merge?
+ bool linkmerge = (srcdnl->get_inode() == oldin);
+ if (linkmerge)
+ ceph_assert(srcdnl->is_primary() && destdnl->is_remote());
+
+ // set when the renamed / overwritten inode gains a snaprealm it did not have before
+ bool new_in_snaprealm = false;
+ bool new_oldin_snaprealm = false;
+
+ // target inode
+ if (!linkmerge) {
+ if (destdnl->is_primary()) {
+ ceph_assert(straydn);
+ dout(10) << "straydn is " << *straydn << dendl;
+
+ // if there is newly created snaprealm, need to split old snaprealm's
+ // inodes_with_caps. So pop snaprealm before linkage changes.
+ if (destdn->is_auth()) {
+ bool hadrealm = (oldin->snaprealm ? true : false);
+ oldin->early_pop_projected_snaprealm();
+ new_oldin_snaprealm = (oldin->snaprealm && !hadrealm);
+ } else {
+ ceph_assert(mdr->peer_request);
+ if (mdr->peer_request->desti_snapbl.length()) {
+ new_oldin_snaprealm = !oldin->snaprealm;
+ oldin->decode_snap_blob(mdr->peer_request->desti_snapbl);
+ ceph_assert(oldin->snaprealm);
+ }
+ }
+
+ destdn->get_dir()->unlink_inode(destdn, false);
+
+ straydn->pop_projected_linkage();
+ if (mdr->is_peer() && !mdr->more()->peer_update_journaled)
+ ceph_assert(!straydn->is_projected()); // no other projected
+
+ // nlink-- targeti
+ if (destdn->is_auth())
+ oldin->pop_and_dirty_projected_inode(mdr->ls, mdr);
+
+ mdcache->touch_dentry_bottom(straydn); // drop dn as quickly as possible.
+ } else if (destdnl->is_remote()) {
+ destdn->get_dir()->unlink_inode(destdn, false);
+ if (oldin->is_auth()) {
+ oldin->pop_and_dirty_projected_inode(mdr->ls, mdr);
+ } else if (mdr->peer_request) {
+ if (mdr->peer_request->desti_snapbl.length() > 0) {
+ ceph_assert(oldin->snaprealm);
+ oldin->decode_snap_blob(mdr->peer_request->desti_snapbl);
+ }
+ } else if (auto& desti_srnode = mdr->more()->desti_srnode) {
+ // prepared srnode was never projected here; free it
+ delete desti_srnode;
+ desti_srnode = NULL;
+ }
+ }
+ }
+
+ // unlink src before we relink it at dest
+ CInode *in = srcdnl->get_inode();
+ ceph_assert(in);
+
+ // capture this before srcdn is unlinked below
+ bool srcdn_was_remote = srcdnl->is_remote();
+ if (!srcdn_was_remote) {
+ // if there is newly created snaprealm, need to split old snaprealm's
+ // inodes_with_caps. So pop snaprealm before linkage changes.
+ if (destdn->is_auth()) {
+ bool hadrealm = (in->snaprealm ? true : false);
+ in->early_pop_projected_snaprealm();
+ new_in_snaprealm = (in->snaprealm && !hadrealm);
+ } else {
+ ceph_assert(mdr->peer_request);
+ if (mdr->peer_request->srci_snapbl.length()) {
+ new_in_snaprealm = !in->snaprealm;
+ in->decode_snap_blob(mdr->peer_request->srci_snapbl);
+ ceph_assert(in->snaprealm);
+ }
+ }
+ }
+
+ srcdn->get_dir()->unlink_inode(srcdn);
+
+ // dest
+ if (srcdn_was_remote) {
+ if (!linkmerge) {
+ // destdn
+ destdnl = destdn->pop_projected_linkage();
+ if (mdr->is_peer() && !mdr->more()->peer_update_journaled)
+ ceph_assert(!destdn->is_projected()); // no other projected
+
+ destdn->link_remote(destdnl, in);
+ if (destdn->is_auth())
+ destdn->mark_dirty(mdr->more()->pvmap[destdn], mdr->ls);
+ // in
+ if (in->is_auth()) {
+ in->pop_and_dirty_projected_inode(mdr->ls, mdr);
+ } else if (mdr->peer_request) {
+ if (mdr->peer_request->srci_snapbl.length() > 0) {
+ ceph_assert(in->snaprealm);
+ in->decode_snap_blob(mdr->peer_request->srci_snapbl);
+ }
+ } else if (auto& srci_srnode = mdr->more()->srci_srnode) {
+ // prepared srnode was never projected here; free it
+ delete srci_srnode;
+ srci_srnode = NULL;
+ }
+ } else {
+ dout(10) << "merging remote onto primary link" << dendl;
+ oldin->pop_and_dirty_projected_inode(mdr->ls, mdr);
+ }
+ } else { // primary
+ if (linkmerge) {
+ dout(10) << "merging primary onto remote link" << dendl;
+ destdn->get_dir()->unlink_inode(destdn, false);
+ }
+ destdnl = destdn->pop_projected_linkage();
+ if (mdr->is_peer() && !mdr->more()->peer_update_journaled)
+ ceph_assert(!destdn->is_projected()); // no other projected
+
+ // srcdn inode import?
+ if (!srcdn->is_auth() && destdn->is_auth()) {
+ ceph_assert(mdr->more()->inode_import.length() > 0);
+
+ map<client_t,Capability::Import> imported_caps;
+
+ // finish cap imports
+ finish_force_open_sessions(mdr->more()->imported_session_map);
+ if (mdr->more()->cap_imports.count(destdnl->get_inode())) {
+ mdcache->migrator->finish_import_inode_caps(destdnl->get_inode(),
+ mdr->more()->srcdn_auth_mds, true,
+ mdr->more()->imported_session_map,
+ mdr->more()->cap_imports[destdnl->get_inode()],
+ imported_caps);
+ }
+
+ // the import payload has been consumed; reuse the buffer to carry the
+ // encoded imported_caps
+ mdr->more()->inode_import.clear();
+ encode(imported_caps, mdr->more()->inode_import);
+
+ /* hack: add an auth pin for each xlock we hold. These were
+ * remote xlocks previously but now they're local and
+ * we're going to try and unpin when we xlock_finish. */
+
+ // walk the request's locks starting at the inode's versionlock and stop
+ // at the first lock whose parent is not this inode
+ for (auto i = mdr->locks.lower_bound(&destdnl->get_inode()->versionlock);
+ i != mdr->locks.end();
+ ++i) {
+ SimpleLock *lock = i->lock;
+ if (lock->get_parent() != destdnl->get_inode())
+ break;
+ if (i->is_xlock() && !lock->is_locallock())
+ mds->locker->xlock_import(lock);
+ }
+
+ // hack: fix auth bit
+ in->state_set(CInode::STATE_AUTH);
+
+ mdr->clear_ambiguous_auth();
+ }
+
+ if (destdn->is_auth())
+ in->pop_and_dirty_projected_inode(mdr->ls, mdr);
+ }
+
+ // src
+ if (srcdn->is_auth())
+ srcdn->mark_dirty(mdr->more()->pvmap[srcdn], mdr->ls);
+ srcdn->pop_projected_linkage();
+ if (mdr->is_peer() && !mdr->more()->peer_update_journaled)
+ ceph_assert(!srcdn->is_projected()); // no other projected
+
+ // apply remaining projected inodes (nested)
+ mdr->apply();
+
+ // update subtree map?
+ if (destdnl->is_primary() && in->is_dir())
+ mdcache->adjust_subtree_after_rename(in, srcdn->get_dir(), true);
+
+ if (straydn && oldin->is_dir())
+ mdcache->adjust_subtree_after_rename(oldin, destdn->get_dir(), true);
+
+ // notify about snaprealms that came into existence with this rename
+ if (new_oldin_snaprealm)
+ mdcache->do_realm_invalidate_and_update_notify(oldin, CEPH_SNAP_OP_SPLIT, false);
+ if (new_in_snaprealm)
+ mdcache->do_realm_invalidate_and_update_notify(in, CEPH_SNAP_OP_SPLIT, true);
+
+ // removing a new dn?
+ if (srcdn->is_auth())
+ srcdn->get_dir()->try_remove_unlinked_dn(srcdn);
+}
+
+
+
+// ------------
+// PEER
+
+/**
+ * Log-completion context for a peer rename prepare: once fired it calls
+ * Server::_logged_peer_rename() with the dentries captured at construction.
+ */
+class C_MDS_PeerRenamePrep : public ServerLogContext {
+  CDentry *srcdn;
+  CDentry *destdn;
+  CDentry *straydn;
+
+public:
+  C_MDS_PeerRenamePrep(Server *srv, MDRequestRef& req,
+                       CDentry *src, CDentry *dest, CDentry *stray)
+    : ServerLogContext(srv, req),
+      srcdn(src),
+      destdn(dest),
+      straydn(stray)
+  {}
+
+  // the completion code is not used
+  void finish(int r) override
+  {
+    server->_logged_peer_rename(mdr, srcdn, destdn, straydn);
+  }
+};
+
+/**
+ * Commit waiter for a peer rename: forwards the completion code together
+ * with the captured request and dentries to Server::_commit_peer_rename().
+ */
+class C_MDS_PeerRenameCommit : public ServerContext {
+  MDRequestRef mdr;
+  CDentry *srcdn;
+  CDentry *destdn;
+  CDentry *straydn;
+
+public:
+  C_MDS_PeerRenameCommit(Server *srv, MDRequestRef& req,
+                         CDentry *src, CDentry *dest, CDentry *stray)
+    : ServerContext(srv),
+      mdr(req),
+      srcdn(src),
+      destdn(dest),
+      straydn(stray)
+  {}
+
+  // r is passed straight through to the commit handler
+  void finish(int r) override
+  {
+    server->_commit_peer_rename(mdr, r, srcdn, destdn, straydn);
+  }
+};
+
+/**
+ * Finisher for the client-session flush gather started while preparing a
+ * peer rename; calls Server::_peer_rename_sessions_flushed() for the request.
+ */
+class C_MDS_PeerRenameSessionsFlushed : public ServerContext {
+  MDRequestRef mdr;
+
+public:
+  C_MDS_PeerRenameSessionsFlushed(Server *srv, MDRequestRef& req)
+    : ServerContext(srv),
+      mdr(req)
+  {}
+
+  // the completion code is not used
+  void finish(int r) override
+  {
+    server->_peer_rename_sessions_flushed(mdr);
+  }
+};
+
+/**
+ * Peer-side handler for an OP_RENAMEPREP request.
+ *
+ * Steps, in order:
+ *  - answer an interrupted request with a no-op OP_RENAMEPREPACK;
+ *  - traverse to destdn and srcdn (the function returns early and is
+ *    retried if a traversal cannot complete yet);
+ *  - if this MDS is auth for srcdn: freeze a primary source inode, mark it
+ *    ambiguous-auth, send OP_RENAMENOTIFY to the other srcdn replicas,
+ *    flush the sessions of clients in the export client set, and check that
+ *    the leader's witness list covers every srcdn replica (replying with
+ *    the replica list if it does not);
+ *  - encode the rollback state, run _rename_prepare() into an EPeerUpdate
+ *    and either journal it or, if the metablob is empty, go straight to
+ *    _logged_peer_rename().
+ *
+ * @param mdr  peer request state; mdr->peer_request holds the incoming message
+ */
+void Server::handle_peer_rename_prep(MDRequestRef& mdr)
+{
+ dout(10) << "handle_peer_rename_prep " << *mdr
+ << " " << mdr->peer_request->srcdnpath
+ << " to " << mdr->peer_request->destdnpath
+ << dendl;
+
+ if (mdr->peer_request->is_interrupted()) {
+ dout(10) << " peer request interrupted, sending noop reply" << dendl;
+ auto reply = make_message<MMDSPeerRequest>(mdr->reqid, mdr->attempt, MMDSPeerRequest::OP_RENAMEPREPACK);
+ reply->mark_interrupted();
+ mds->send_message_mds(reply, mdr->peer_to_mds);
+ mdr->reset_peer_request();
+ return;
+ }
+
+ // discover destdn
+ filepath destpath(mdr->peer_request->destdnpath);
+ dout(10) << " dest " << destpath << dendl;
+ vector<CDentry*> trace;
+ CF_MDS_RetryRequestFactory cf(mdcache, mdr, false);
+ int r = mdcache->path_traverse(mdr, cf, destpath,
+ MDS_TRAVERSE_DISCOVER | MDS_TRAVERSE_PATH_LOCKED | MDS_TRAVERSE_WANT_DENTRY,
+ &trace);
+ // NOTE(review): r > 0 presumably means the traversal is waiting and a retry was queued via cf -- confirm
+ if (r > 0) return;
+ if (r == -CEPHFS_ESTALE) {
+ // base inode not found here: ask peers for it, then retry the request
+ mdcache->find_ino_peers(destpath.get_ino(), new C_MDS_RetryRequest(mdcache, mdr),
+ mdr->peer_to_mds, true);
+ return;
+ }
+ ceph_assert(r == 0); // we shouldn't get an error here!
+
+ CDentry *destdn = trace.back();
+ CDentry::linkage_t *destdnl = destdn->get_projected_linkage();
+ dout(10) << " destdn " << *destdn << dendl;
+ mdr->pin(destdn);
+
+ // discover srcdn
+ filepath srcpath(mdr->peer_request->srcdnpath);
+ dout(10) << " src " << srcpath << dendl;
+ CInode *srci = nullptr;
+ r = mdcache->path_traverse(mdr, cf, srcpath,
+ MDS_TRAVERSE_DISCOVER | MDS_TRAVERSE_PATH_LOCKED,
+ &trace, &srci);
+ if (r > 0) return;
+ ceph_assert(r == 0);
+
+ CDentry *srcdn = trace.back();
+ CDentry::linkage_t *srcdnl = srcdn->get_projected_linkage();
+ dout(10) << " srcdn " << *srcdn << dendl;
+ mdr->pin(srcdn);
+ mdr->pin(srci);
+
+ // stray?
+ bool linkmerge = srcdnl->get_inode() == destdnl->get_inode();
+ if (linkmerge)
+ ceph_assert(srcdnl->is_primary() && destdnl->is_remote());
+ CDentry *straydn = mdr->straydn;
+ if (destdnl->is_primary() && !linkmerge)
+ ceph_assert(straydn);
+
+ // adopt the leader's op stamp
+ mdr->set_op_stamp(mdr->peer_request->op_stamp);
+ mdr->more()->srcdn_auth_mds = srcdn->authority().first;
+
+ // set up commit waiter (early, to clean up any freezing etc we do)
+ if (!mdr->more()->peer_commit)
+ mdr->more()->peer_commit = new C_MDS_PeerRenameCommit(this, mdr, srcdn, destdn, straydn);
+
+ // am i srcdn auth?
+ if (srcdn->is_auth()) {
+ set<mds_rank_t> srcdnrep;
+ srcdn->list_replicas(srcdnrep);
+
+ bool reply_witness = false;
+ // skipped on a retry once the inode has already been marked ambiguous-auth
+ if (srcdnl->is_primary() && !srcdnl->get_inode()->state_test(CInode::STATE_AMBIGUOUSAUTH)) {
+ // freeze?
+ // we need this to
+ // - avoid conflicting lock state changes
+ // - avoid concurrent updates to the inode
+ // (this could also be accomplished with the versionlock)
+ int allowance = 3; // 1 for the mdr auth_pin, 1 for the link lock, 1 for the snap lock
+ dout(10) << " freezing srci " << *srcdnl->get_inode() << " with allowance " << allowance << dendl;
+ bool frozen_inode = srcdnl->get_inode()->freeze_inode(allowance);
+
+ // unfreeze auth pin after freezing the inode to avoid queueing waiters
+ if (srcdnl->get_inode()->is_frozen_auth_pin())
+ mdr->unfreeze_auth_pin();
+
+ if (!frozen_inode) {
+ // not frozen yet: retry this request once the inode reports WAIT_FROZEN
+ srcdnl->get_inode()->add_waiter(CInode::WAIT_FROZEN, new C_MDS_RetryRequest(mdcache, mdr));
+ return;
+ }
+
+ /*
+ * set ambiguous auth for srci
+ * NOTE: we don't worry about ambiguous cache expire as we do
+ * with subtree migrations because all peers will pin
+ * srcdn->get_inode() for duration of this rename.
+ */
+ mdr->set_ambiguous_auth(srcdnl->get_inode());
+
+ // just mark the source inode as ambiguous auth if more than two MDS are involved.
+ // the leader will send another OP_RENAMEPREP peer request later.
+ if (mdr->peer_request->witnesses.size() > 1) {
+ dout(10) << " set srci ambiguous auth; providing srcdn replica list" << dendl;
+ reply_witness = true;
+ }
+
+ // make sure bystanders have received all lock related messages
+ for (set<mds_rank_t>::iterator p = srcdnrep.begin(); p != srcdnrep.end(); ++p) {
+ // skip the leader itself and, in a degraded cluster, ranks that are not usable
+ if (*p == mdr->peer_to_mds ||
+ (mds->is_cluster_degraded() &&
+ !mds->mdsmap->is_clientreplay_or_active_or_stopping(*p)))
+ continue;
+ auto notify = make_message<MMDSPeerRequest>(mdr->reqid, mdr->attempt, MMDSPeerRequest::OP_RENAMENOTIFY);
+ mds->send_message_mds(notify, *p);
+ mdr->more()->waiting_on_peer.insert(*p);
+ }
+
+ // make sure clients have received all cap related messages
+ set<client_t> export_client_set;
+ mdcache->migrator->get_export_client_set(srcdnl->get_inode(), export_client_set);
+
+ MDSGatherBuilder gather(g_ceph_context);
+ flush_client_sessions(export_client_set, gather);
+ if (gather.has_subs()) {
+ // MDS_RANK_NONE is inserted as a placeholder while the session flush is outstanding
+ mdr->more()->waiting_on_peer.insert(MDS_RANK_NONE);
+ gather.set_finisher(new C_MDS_PeerRenameSessionsFlushed(this, mdr));
+ gather.activate();
+ }
+ }
+
+ // is witness list sufficient?
+ for (set<mds_rank_t>::iterator p = srcdnrep.begin(); p != srcdnrep.end(); ++p) {
+ if (*p == mdr->peer_to_mds ||
+ mdr->peer_request->witnesses.count(*p)) continue;
+ dout(10) << " witness list insufficient; providing srcdn replica list" << dendl;
+ reply_witness = true;
+ break;
+ }
+
+ if (reply_witness) {
+ // hand the full replica list back to the leader and drop this peer request
+ ceph_assert(!srcdnrep.empty());
+ auto reply = make_message<MMDSPeerRequest>(mdr->reqid, mdr->attempt, MMDSPeerRequest::OP_RENAMEPREPACK);
+ reply->witnesses.swap(srcdnrep);
+ mds->send_message_mds(reply, mdr->peer_to_mds);
+ mdr->reset_peer_request();
+ return;
+ }
+ dout(10) << " witness list sufficient: includes all srcdn replicas" << dendl;
+ if (!mdr->more()->waiting_on_peer.empty()) {
+ dout(10) << " still waiting for rename notify acks from "
+ << mdr->more()->waiting_on_peer << dendl;
+ return;
+ }
+ } else if (srcdnl->is_primary() && srcdn->authority() != destdn->authority()) {
+ // set ambiguous auth for srci on witnesses
+ mdr->set_ambiguous_auth(srcdnl->get_inode());
+ }
+
+ // encode everything we'd need to roll this back... basically, just the original state.
+ rename_rollback rollback;
+
+ rollback.reqid = mdr->reqid;
+
+ rollback.orig_src.dirfrag = srcdn->get_dir()->dirfrag();
+ rollback.orig_src.dirfrag_old_mtime = srcdn->get_dir()->get_projected_fnode()->fragstat.mtime;
+ rollback.orig_src.dirfrag_old_rctime = srcdn->get_dir()->get_projected_fnode()->rstat.rctime;
+ rollback.orig_src.dname = srcdn->get_name();
+ if (srcdnl->is_primary())
+ rollback.orig_src.ino = srcdnl->get_inode()->ino();
+ else {
+ ceph_assert(srcdnl->is_remote());
+ rollback.orig_src.remote_ino = srcdnl->get_remote_ino();
+ rollback.orig_src.remote_d_type = srcdnl->get_remote_d_type();
+ }
+
+ rollback.orig_dest.dirfrag = destdn->get_dir()->dirfrag();
+ rollback.orig_dest.dirfrag_old_mtime = destdn->get_dir()->get_projected_fnode()->fragstat.mtime;
+ rollback.orig_dest.dirfrag_old_rctime = destdn->get_dir()->get_projected_fnode()->rstat.rctime;
+ rollback.orig_dest.dname = destdn->get_name();
+ // a null destdn leaves both ino and remote_ino at their defaults
+ if (destdnl->is_primary())
+ rollback.orig_dest.ino = destdnl->get_inode()->ino();
+ else if (destdnl->is_remote()) {
+ rollback.orig_dest.remote_ino = destdnl->get_remote_ino();
+ rollback.orig_dest.remote_d_type = destdnl->get_remote_d_type();
+ }
+
+ if (straydn) {
+ rollback.stray.dirfrag = straydn->get_dir()->dirfrag();
+ rollback.stray.dirfrag_old_mtime = straydn->get_dir()->get_projected_fnode()->fragstat.mtime;
+ rollback.stray.dirfrag_old_rctime = straydn->get_dir()->get_projected_fnode()->rstat.rctime;
+ rollback.stray.dname = straydn->get_name();
+ }
+ // snap blobs for rollback: a leading bool records whether the inode had a
+ // snaprealm, followed by the encoded snap blob if it did
+ if (mdr->peer_request->desti_snapbl.length()) {
+ CInode *oldin = destdnl->get_inode();
+ if (oldin->snaprealm) {
+ encode(true, rollback.desti_snapbl);
+ oldin->encode_snap_blob(rollback.desti_snapbl);
+ } else {
+ encode(false, rollback.desti_snapbl);
+ }
+ }
+ if (mdr->peer_request->srci_snapbl.length()) {
+ if (srci->snaprealm) {
+ encode(true, rollback.srci_snapbl);
+ srci->encode_snap_blob(rollback.srci_snapbl);
+ } else {
+ encode(false, rollback.srci_snapbl);
+ }
+ }
+ encode(rollback, mdr->more()->rollback_bl);
+ // FIXME: rollback snaprealm
+ dout(20) << " rollback is " << mdr->more()->rollback_bl.length() << " bytes" << dendl;
+
+ // journal.
+ mdr->ls = mdlog->get_current_segment();
+ EPeerUpdate *le = new EPeerUpdate(mdlog, "peer_rename_prep", mdr->reqid, mdr->peer_to_mds,
+ EPeerUpdate::OP_PREPARE, EPeerUpdate::RENAME);
+ mdlog->start_entry(le);
+ le->rollback = mdr->more()->rollback_bl;
+
+ bufferlist blah; // inode import data... obviously not used if we're the peer
+ _rename_prepare(mdr, &le->commit, &blah, srcdn, destdn, mdr->peer_request->alternate_name, straydn);
+
+ if (le->commit.empty()) {
+ // nothing to journal on this peer: cancel the entry and continue immediately
+ dout(10) << " empty metablob, skipping journal" << dendl;
+ mdlog->cancel_entry(le);
+ mdr->ls = NULL;
+ _logged_peer_rename(mdr, srcdn, destdn, straydn);
+ } else {
+ mdcache->add_uncommitted_peer(mdr->reqid, mdr->ls, mdr->peer_to_mds);
+ mdr->more()->peer_update_journaled = true;
+ submit_mdlog_entry(le, new C_MDS_PeerRenamePrep(this, mdr, srcdn, destdn, straydn),
+ mdr, __func__);
+ mdlog->flush();
+ }
+}
+/**
+ * Finish the peer side of a rename prepare: build the OP_RENAMEPREPACK
+ * reply, export the source inode if this rank is auth for a primary srcdn,
+ * apply the rename locally and send the ack.
+ *
+ * The prepare path calls this directly when the metablob turned out empty;
+ * presumably it is also the completion of C_MDS_PeerRenamePrep once the
+ * prepare event is journaled (that context is defined outside this view).
+ *
+ * When mdr->aborted is set no reply is built: the rename is still applied,
+ * but the request is finished locally instead of being acked.
+ *
+ * @param mdr     peer request being processed
+ * @param srcdn   source dentry of the rename
+ * @param destdn  destination dentry of the rename
+ * @param straydn stray dentry handed to _rename_apply (may be null)
+ */
+void Server::_logged_peer_rename(MDRequestRef& mdr,
+ CDentry *srcdn, CDentry *destdn, CDentry *straydn)
+{
+ dout(10) << "_logged_peer_rename " << *mdr << dendl;
+
+ // prepare ack (reply stays null when the request was aborted)
+ ref_t<MMDSPeerRequest> reply;
+ if (!mdr->aborted) {
+ reply = make_message<MMDSPeerRequest>(mdr->reqid, mdr->attempt, MMDSPeerRequest::OP_RENAMEPREPACK);
+ if (!mdr->more()->peer_update_journaled)
+ reply->mark_not_journaled();
+ }
+
+ CDentry::linkage_t *srcdnl = srcdn->get_linkage();
+ //CDentry::linkage_t *straydnl = straydn ? straydn->get_linkage() : 0;
+
+ // export srci?
+ if (srcdn->is_auth() && srcdnl->is_primary()) {
+ // set export bounds for CInode::encode_export()
+ if (reply) {
+ std::vector<CDir*> bounds;
+ if (srcdnl->get_inode()->is_dir()) {
+ srcdnl->get_inode()->get_dirfrags(bounds);
+ for (const auto& bound : bounds) {
+ bound->state_set(CDir::STATE_EXPORTBOUND);
+ }
+ }
+
+ map<client_t,entity_inst_t> exported_client_map;
+ map<client_t, client_metadata_t> exported_client_metadata_map;
+ bufferlist inodebl;
+ mdcache->migrator->encode_export_inode(srcdnl->get_inode(), inodebl,
+ exported_client_map,
+ exported_client_metadata_map);
+
+ for (const auto& bound : bounds) { // the bound flag is only needed while encoding
+ bound->state_clear(CDir::STATE_EXPORTBOUND);
+ }
+
+ encode(exported_client_map, reply->inode_export, mds->mdsmap->get_up_features());
+ encode(exported_client_metadata_map, reply->inode_export);
+ reply->inode_export.claim_append(inodebl);
+ reply->inode_export_v = srcdnl->get_inode()->get_version();
+ }
+
+ // remove mdr auth pin
+ mdr->auth_unpin(srcdnl->get_inode());
+ mdr->more()->is_inode_exporter = true;
+
+ if (srcdnl->get_inode()->is_dirty())
+ srcdnl->get_inode()->mark_clean();
+
+ dout(10) << " exported srci " << *srcdnl->get_inode() << dendl;
+ }
+
+ // apply
+ _rename_apply(mdr, srcdn, destdn, straydn);
+
+ CDentry::linkage_t *destdnl = destdn->get_linkage(); // read after apply, so this is the post-rename linkage
+
+ // bump popularity
+ mds->balancer->hit_dir(srcdn->get_dir(), META_POP_IWR);
+ if (destdnl->get_inode() && destdnl->get_inode()->is_auth())
+ mds->balancer->hit_inode(destdnl->get_inode(), META_POP_IWR);
+
+ // done.
+ mdr->reset_peer_request();
+ mdr->straydn = 0;
+
+ if (reply) {
+ mds->send_message_mds(reply, mdr->peer_to_mds);
+ } else {
+ ceph_assert(mdr->aborted);
+ dout(10) << " abort flag set, finishing" << dendl;
+ mdcache->request_finish(mdr);
+ }
+}
+/**
+ * Peer-side resolution of a prepared rename.
+ *
+ * r == 0 is handled as a commit: if this rank exported the source inode the
+ * export is finalised (exported xlocks handed over, caps decoded from
+ * inode_import, inode unfrozen), ambiguous auth is cleared, and an
+ * OP_COMMIT EPeerUpdate is journaled when the prepare had been journaled.
+ *
+ * Any other r is handled as an abort: if rollback_bl holds data the rename
+ * is undone through do_rename_rollback(); otherwise only the freeze and
+ * ambiguous-auth state are released and the request is finished.
+ *
+ * @param mdr     peer request
+ * @param r       0 to commit, non-zero to abort
+ * @param srcdn   source dentry
+ * @param destdn  destination dentry
+ * @param straydn stray dentry, may be null
+ */
+void Server::_commit_peer_rename(MDRequestRef& mdr, int r,
+ CDentry *srcdn, CDentry *destdn, CDentry *straydn)
+{
+ dout(10) << "_commit_peer_rename " << *mdr << " r=" << r << dendl;
+
+ CInode *in = destdn->get_linkage()->get_inode();
+
+ inodeno_t migrated_stray; // stays unset unless srcdn lives in a stray dir we are auth for
+ if (srcdn->is_auth() && srcdn->get_dir()->inode->is_stray())
+ migrated_stray = in->ino();
+
+ MDSContext::vec finished;
+ if (r == 0) {
+ // unfreeze+singleauth inode
+ // hmm, do i really need to delay this?
+ if (mdr->more()->is_inode_exporter) {
+ // drop our pins
+ // we exported, clear out any xlocks that we moved to another MDS
+
+ for (auto i = mdr->locks.lower_bound(&in->versionlock);
+ i != mdr->locks.end(); ) {
+ SimpleLock *lock = i->lock;
+ if (lock->get_parent() != in)
+ break;
+ // we only care about xlocks on the exported inode
+ if (i->is_xlock() && !lock->is_locallock())
+ mds->locker->xlock_export(i++, mdr.get()); // post-increment: advance before the entry is handed off
+ else
+ ++i;
+ }
+
+ map<client_t,Capability::Import> peer_imported;
+ auto bp = mdr->more()->inode_import.cbegin();
+ decode(peer_imported, bp);
+
+ dout(10) << " finishing inode export on " << *in << dendl;
+ mdcache->migrator->finish_export_inode(in, mdr->peer_to_mds, peer_imported, finished);
+ mds->queue_waiters(finished); // this includes SINGLEAUTH waiters.
+
+ // unfreeze
+ ceph_assert(in->is_frozen_inode());
+ in->unfreeze_inode(finished);
+ }
+
+ // singleauth
+ if (mdr->more()->is_ambiguous_auth) {
+ mdr->more()->rename_inode->clear_ambiguous_auth(finished);
+ mdr->more()->is_ambiguous_auth = false;
+ }
+
+ if (straydn && mdr->more()->peer_update_journaled) {
+ CInode *strayin = straydn->get_projected_linkage()->get_inode();
+ if (strayin && !strayin->snaprealm)
+ mdcache->clear_dirty_bits_for_stray(strayin);
+ }
+
+ mds->queue_waiters(finished);
+ mdr->cleanup();
+
+ if (mdr->more()->peer_update_journaled) {
+ // write a commit to the journal
+ EPeerUpdate *le = new EPeerUpdate(mdlog, "peer_rename_commit", mdr->reqid,
+ mdr->peer_to_mds, EPeerUpdate::OP_COMMIT,
+ EPeerUpdate::RENAME);
+ mdlog->start_entry(le);
+ submit_mdlog_entry(le, new C_MDS_CommittedPeer(this, mdr), mdr, __func__);
+ mdlog->flush();
+ } else {
+ _committed_peer(mdr); // nothing was journaled for the prepare, so finish without a commit event
+ }
+ } else {
+
+ // abort
+ // rollback_bl may be empty if we froze the inode but had to provide an expanded
+ // witness list from the leader, and they failed before we tried prep again.
+ if (mdr->more()->rollback_bl.length()) {
+ if (mdr->more()->is_inode_exporter) {
+ dout(10) << " reversing inode export of " << *in << dendl;
+ in->abort_export();
+ }
+ if (mdcache->is_ambiguous_peer_update(mdr->reqid, mdr->peer_to_mds)) {
+ mdcache->remove_ambiguous_peer_update(mdr->reqid, mdr->peer_to_mds);
+ // rollback but preserve the peer request
+ do_rename_rollback(mdr->more()->rollback_bl, mdr->peer_to_mds, mdr, false);
+ mdr->more()->rollback_bl.clear();
+ } else
+ do_rename_rollback(mdr->more()->rollback_bl, mdr->peer_to_mds, mdr, true);
+ } else {
+ dout(10) << " rollback_bl empty, not rollback back rename (leader failed after getting extra witnesses?)" << dendl;
+ // singleauth
+ if (mdr->more()->is_ambiguous_auth) {
+ if (srcdn->is_auth())
+ mdr->more()->rename_inode->unfreeze_inode(finished);
+
+ mdr->more()->rename_inode->clear_ambiguous_auth(finished);
+ mdr->more()->is_ambiguous_auth = false;
+ }
+ mds->queue_waiters(finished);
+ mdcache->request_finish(mdr);
+ }
+ }
+
+ if (migrated_stray && mds->is_stopping())
+ mdcache->shutdown_export_stray_finish(migrated_stray);
+}
+
+/**
+ * Re-account one restored dentry in the projected fnode of @p dir while a
+ * rename is being rolled back.
+ *
+ * @param mut    mutation that owns the projected fnode and the updated locks
+ * @param dir    directory fragment receiving the dentry again
+ * @param r      rollback record for that dentry (old mtime/rctime, ino)
+ * @param ctime  ctime stamped by the rename being undone
+ * @param isdir  true if the restored dentry refers to a directory
+ * @param rstat  recursive stats added back when the record has a primary ino
+ */
+static void _rollback_repair_dir(MutationRef& mut, CDir *dir,
+                                 rename_rollback::drec &r, utime_t ctime,
+                                 bool isdir, const nest_info_t &rstat)
+{
+  auto fnode = dir->project_fnode(mut);
+  fnode->version = dir->pre_dirty();
+
+  auto &frag = fnode->fragstat;
+  auto &nest = fnode->rstat;
+
+  // the dentry counts once more, as a subdir or as a file
+  if (!isdir)
+    frag.nfiles += 1;
+  else
+    frag.nsubdirs += 1;
+
+  // recursive stats are only added back when the record carries an ino
+  if (r.ino) {
+    nest.rbytes += rstat.rbytes;
+    nest.rfiles += rstat.rfiles;
+    nest.rsubdirs += rstat.rsubdirs;
+    nest.rsnaps += rstat.rsnaps;
+  }
+
+  // only rewind timestamps that still carry the rename's own ctime
+  const bool mtime_from_rename = (frag.mtime == ctime);
+  if (mtime_from_rename) {
+    frag.mtime = r.dirfrag_old_mtime;
+    if (nest.rctime == ctime) {
+      nest.rctime = r.dirfrag_old_rctime;
+    }
+  }
+
+  mut->add_updated_lock(&dir->get_inode()->filelock);
+  mut->add_updated_lock(&dir->get_inode()->nestlock);
+}
+
+/**
+ * Log-completion context for a rename rollback: carries everything
+ * Server::_rename_rollback_finish() needs and forwards it from finish().
+ * The two snap-split maps are taken over from the caller by swapping.
+ */
+struct C_MDS_LoggedRenameRollback : public ServerLogContext {
+  MutationRef mut;
+  CDentry *srcdn;
+  version_t srcdnpv;
+  CDentry *destdn;
+  CDentry *straydn;
+  map<client_t,ref_t<MClientSnap>> splits[2];
+  bool finish_mdr;
+
+  C_MDS_LoggedRenameRollback(Server *s, MutationRef& m, MDRequestRef& r,
+                             CDentry *sd, version_t pv, CDentry *dd, CDentry *st,
+                             map<client_t,ref_t<MClientSnap>> _splits[2], bool f)
+    : ServerLogContext(s, r),
+      mut(m),
+      srcdn(sd),
+      srcdnpv(pv),
+      destdn(dd),
+      straydn(st),
+      finish_mdr(f)
+  {
+    // steal both split maps; the caller's arrays are left holding ours (empty)
+    for (int idx = 0; idx < 2; ++idx) {
+      splits[idx].swap(_splits[idx]);
+    }
+  }
+
+  void finish(int r) override {
+    server->_rename_rollback_finish(mut, mdr, srcdn, srcdnpv,
+                                    destdn, straydn, splits, finish_mdr);
+  }
+};
+/**
+ * Undo a prepared peer rename from its encoded rename_rollback record.
+ *
+ * The record is decoded from @p rbl and registered with the cache through
+ * add_rollback(). The dentries and inodes it names are then looked up; any
+ * of them may be missing from the cache and is skipped in that case. For
+ * what is found, the original linkages are pushed back as projected
+ * linkages, ctime/nlink and snaprealm state are restored, and the source
+ * dirfrag accounting is repaired when this rank is auth for srcdn.
+ *
+ * The result is journaled as an OP_ROLLBACK EPeerUpdate and completed in
+ * _rename_rollback_finish(). If @p mdr is set and its prepare was never
+ * journaled, the log entry is cancelled and the finish step runs inline.
+ *
+ * @param rbl        encoded rename_rollback
+ * @param leader     rank that led the rename
+ * @param mdr        peer request; the code tolerates a null reference
+ * @param finish_mdr passed through to _rename_rollback_finish()
+ */
+void Server::do_rename_rollback(bufferlist &rbl, mds_rank_t leader, MDRequestRef& mdr,
+ bool finish_mdr)
+{
+ rename_rollback rollback;
+ auto p = rbl.cbegin();
+ decode(rollback, p);
+
+ dout(10) << "do_rename_rollback on " << rollback.reqid << dendl;
+ // need to finish this update before sending resolve to claim the subtree
+ mdcache->add_rollback(rollback.reqid, leader);
+
+ MutationRef mut(new MutationImpl(nullptr, utime_t(), rollback.reqid));
+ mut->ls = mds->mdlog->get_current_segment();
+
+ CDentry *srcdn = NULL;
+ CDir *srcdir = mdcache->get_dirfrag(rollback.orig_src.dirfrag);
+ if (!srcdir) // exact frag not cached: retry the lookup by directory ino + dentry name
+ srcdir = mdcache->get_dirfrag(rollback.orig_src.dirfrag.ino, rollback.orig_src.dname);
+ if (srcdir) {
+ dout(10) << " srcdir " << *srcdir << dendl;
+ srcdn = srcdir->lookup(rollback.orig_src.dname);
+ if (srcdn) {
+ dout(10) << " srcdn " << *srcdn << dendl;
+ ceph_assert(srcdn->get_linkage()->is_null());
+ } else
+ dout(10) << " srcdn not found" << dendl;
+ } else
+ dout(10) << " srcdir not found" << dendl;
+
+ CDentry *destdn = NULL;
+ CDir *destdir = mdcache->get_dirfrag(rollback.orig_dest.dirfrag);
+ if (!destdir)
+ destdir = mdcache->get_dirfrag(rollback.orig_dest.dirfrag.ino, rollback.orig_dest.dname);
+ if (destdir) {
+ dout(10) << " destdir " << *destdir << dendl;
+ destdn = destdir->lookup(rollback.orig_dest.dname);
+ if (destdn)
+ dout(10) << " destdn " << *destdn << dendl;
+ else
+ dout(10) << " destdn not found" << dendl;
+ } else
+ dout(10) << " destdir not found" << dendl;
+
+ CInode *in = NULL; // the renamed inode (primary ino or remote link target), if cached
+ if (rollback.orig_src.ino) {
+ in = mdcache->get_inode(rollback.orig_src.ino);
+ if (in && in->is_dir())
+ ceph_assert(srcdn && destdn);
+ } else
+ in = mdcache->get_inode(rollback.orig_src.remote_ino);
+
+ CDir *straydir = NULL;
+ CDentry *straydn = NULL;
+ if (rollback.stray.dirfrag.ino) {
+ straydir = mdcache->get_dirfrag(rollback.stray.dirfrag);
+ if (straydir) {
+ dout(10) << "straydir " << *straydir << dendl;
+ straydn = straydir->lookup(rollback.stray.dname);
+ if (straydn) {
+ dout(10) << " straydn " << *straydn << dendl;
+ ceph_assert(straydn->get_linkage()->is_primary());
+ } else
+ dout(10) << " straydn not found" << dendl;
+ } else
+ dout(10) << "straydir not found" << dendl;
+ }
+
+ CInode *target = NULL; // the inode destdn referred to before the rename, if cached
+ if (rollback.orig_dest.ino) {
+ target = mdcache->get_inode(rollback.orig_dest.ino);
+ if (target)
+ ceph_assert(destdn && straydn);
+ } else if (rollback.orig_dest.remote_ino)
+ target = mdcache->get_inode(rollback.orig_dest.remote_ino);
+
+ // can't use is_auth() in the resolve stage
+ mds_rank_t whoami = mds->get_nodeid();
+ // peer: this rank must not be the authority for destdn or straydn
+ ceph_assert(!destdn || destdn->authority().first != whoami);
+ ceph_assert(!straydn || straydn->authority().first != whoami);
+
+ bool force_journal_src = false;
+ bool force_journal_dest = false;
+ if (in && in->is_dir() && srcdn->authority().first != whoami)
+ force_journal_src = _need_force_journal(in, false);
+ if (in && target && target->is_dir())
+ force_journal_dest = _need_force_journal(in, true);
+
+ version_t srcdnpv = 0;
+ // repair src: project the original linkage back onto srcdn
+ if (srcdn) {
+ if (srcdn->authority().first == whoami)
+ srcdnpv = srcdn->pre_dirty();
+ if (rollback.orig_src.ino) {
+ ceph_assert(in);
+ srcdn->push_projected_linkage(in);
+ } else
+ srcdn->push_projected_linkage(rollback.orig_src.remote_ino,
+ rollback.orig_src.remote_d_type);
+ }
+
+ map<client_t,ref_t<MClientSnap>> splits[2]; // [0] is filled for the source inode, [1] for the target
+
+ const CInode::mempool_inode *pip = nullptr;
+ if (in) {
+ bool projected;
+ CDir *pdir = in->get_projected_parent_dir();
+ if (pdir->authority().first == whoami) {
+ auto pi = in->project_inode(mut);
+ pi.inode->version = in->pre_dirty();
+ if (pdir != srcdir) {
+ auto pf = pdir->project_fnode(mut);
+ pf->version = pdir->pre_dirty();
+ }
+ if (pi.inode->ctime == rollback.ctime)
+ pi.inode->ctime = rollback.orig_src.old_ctime;
+ projected = true;
+ } else {
+ if (in->get_inode()->ctime == rollback.ctime) { // not auth for the parent: patch a copy of the inode in place
+ auto _inode = CInode::allocate_inode(*in->get_inode());
+ _inode->ctime = rollback.orig_src.old_ctime;
+ in->reset_inode(_inode);
+ }
+ projected = false;
+ }
+ pip = in->get_projected_inode().get();
+
+ if (rollback.srci_snapbl.length() && in->snaprealm) {
+ bool hadrealm;
+ auto p = rollback.srci_snapbl.cbegin();
+ decode(hadrealm, p);
+ if (hadrealm) {
+ if (projected && !mds->is_resolve()) {
+ sr_t *new_srnode = new sr_t();
+ decode(*new_srnode, p);
+ in->project_snaprealm(new_srnode);
+ } else
+ decode(in->snaprealm->srnode, p);
+ } else {
+ SnapRealm *realm;
+ if (rollback.orig_src.ino) {
+ ceph_assert(srcdir);
+ realm = srcdir->get_inode()->find_snaprealm();
+ } else {
+ realm = in->snaprealm->parent;
+ }
+ if (!mds->is_resolve())
+ mdcache->prepare_realm_merge(in->snaprealm, realm, splits[0]);
+ if (projected)
+ in->project_snaprealm(NULL);
+ else
+ in->snaprealm->merge_to(realm);
+ }
+ }
+ }
+
+ // repair dest: project the pre-rename linkage (primary, remote or null) back onto destdn
+ if (destdn) {
+ if (rollback.orig_dest.ino && target) {
+ destdn->push_projected_linkage(target);
+ } else if (rollback.orig_dest.remote_ino) {
+ destdn->push_projected_linkage(rollback.orig_dest.remote_ino,
+ rollback.orig_dest.remote_d_type);
+ } else {
+ // the dentry will be trimmed soon, it's ok to have wrong linkage
+ if (rollback.orig_dest.ino)
+ ceph_assert(mds->is_resolve());
+ destdn->push_projected_linkage();
+ }
+ }
+
+ if (straydn)
+ straydn->push_projected_linkage();
+
+ if (target) {
+ bool projected;
+ CInode::inode_ptr ti;
+ CDir *pdir = target->get_projected_parent_dir();
+ if (pdir->authority().first == whoami) {
+ auto pi = target->project_inode(mut);
+ pi.inode->version = target->pre_dirty();
+ if (pdir != srcdir) {
+ auto pf = pdir->project_fnode(mut);
+ pf->version = pdir->pre_dirty();
+ }
+ ti = pi.inode;
+ projected = true;
+ } else {
+ ti = CInode::allocate_inode(*target->get_inode());
+ projected = false;
+ }
+
+ if (ti->ctime == rollback.ctime)
+ ti->ctime = rollback.orig_dest.old_ctime;
+ if (MDS_INO_IS_STRAY(rollback.orig_src.dirfrag.ino)) {
+ if (MDS_INO_IS_STRAY(rollback.orig_dest.dirfrag.ino))
+ ceph_assert(!rollback.orig_dest.ino && !rollback.orig_dest.remote_ino);
+ else
+ ceph_assert(rollback.orig_dest.remote_ino &&
+ rollback.orig_dest.remote_ino == rollback.orig_src.ino);
+ } else
+ ti->nlink++;
+
+ if (!projected)
+ target->reset_inode(ti);
+
+ if (rollback.desti_snapbl.length() && target->snaprealm) {
+ bool hadrealm;
+ auto p = rollback.desti_snapbl.cbegin();
+ decode(hadrealm, p);
+ if (hadrealm) {
+ if (projected && !mds->is_resolve()) {
+ sr_t *new_srnode = new sr_t();
+ decode(*new_srnode, p);
+ target->project_snaprealm(new_srnode);
+ } else
+ decode(target->snaprealm->srnode, p);
+ } else {
+ SnapRealm *realm;
+ if (rollback.orig_dest.ino) {
+ ceph_assert(destdir);
+ realm = destdir->get_inode()->find_snaprealm();
+ } else {
+ realm = target->snaprealm->parent;
+ }
+ if (!mds->is_resolve())
+ mdcache->prepare_realm_merge(target->snaprealm, realm, splits[1]);
+ if (projected)
+ target->project_snaprealm(NULL);
+ else
+ target->snaprealm->merge_to(realm);
+ }
+ }
+ }
+
+ if (srcdn && srcdn->authority().first == whoami) {
+ nest_info_t blah; // empty stats used when the source inode is not in cache
+ _rollback_repair_dir(mut, srcdir, rollback.orig_src, rollback.ctime,
+ in && in->is_dir(), pip ? pip->accounted_rstat : blah);
+ }
+
+ if (srcdn)
+ dout(0) << " srcdn back to " << *srcdn << dendl;
+ if (in)
+ dout(0) << " srci back to " << *in << dendl;
+ if (destdn)
+ dout(0) << " destdn back to " << *destdn << dendl;
+ if (target)
+ dout(0) << " desti back to " << *target << dendl;
+
+ // journal it
+ EPeerUpdate *le = new EPeerUpdate(mdlog, "peer_rename_rollback", rollback.reqid, leader,
+ EPeerUpdate::OP_ROLLBACK, EPeerUpdate::RENAME);
+ mdlog->start_entry(le);
+
+ if (srcdn && (srcdn->authority().first == whoami || force_journal_src)) {
+ le->commit.add_dir_context(srcdir);
+ if (rollback.orig_src.ino)
+ le->commit.add_primary_dentry(srcdn, 0, true);
+ else
+ le->commit.add_remote_dentry(srcdn, true);
+ }
+
+ if (!rollback.orig_src.ino && // remote linkage
+ in && in->authority().first == whoami) {
+ le->commit.add_dir_context(in->get_projected_parent_dir());
+ le->commit.add_primary_dentry(in->get_projected_parent_dn(), in, true);
+ }
+
+ if (force_journal_dest) {
+ ceph_assert(rollback.orig_dest.ino);
+ le->commit.add_dir_context(destdir);
+ le->commit.add_primary_dentry(destdn, 0, true);
+ }
+
+ // peer: no need to journal straydn
+
+ if (target && target != in && target->authority().first == whoami) {
+ ceph_assert(rollback.orig_dest.remote_ino);
+ le->commit.add_dir_context(target->get_projected_parent_dir());
+ le->commit.add_primary_dentry(target->get_projected_parent_dn(), target, true);
+ }
+
+ if (in && in->is_dir() && (srcdn->authority().first == whoami || force_journal_src)) {
+ dout(10) << " noting renamed dir ino " << in->ino() << " in metablob" << dendl;
+ le->commit.renamed_dirino = in->ino();
+ if (srcdn->authority().first == whoami) {
+ auto&& ls = in->get_dirfrags();
+ for (const auto& dir : ls) {
+ if (!dir->is_auth())
+ le->commit.renamed_dir_frags.push_back(dir->get_frag());
+ }
+ dout(10) << " noting renamed dir open frags " << le->commit.renamed_dir_frags << dendl;
+ }
+ } else if (force_journal_dest) {
+ dout(10) << " noting rename target ino " << target->ino() << " in metablob" << dendl;
+ le->commit.renamed_dirino = target->ino();
+ }
+
+ if (target && target->is_dir()) {
+ ceph_assert(destdn);
+ mdcache->project_subtree_rename(target, straydir, destdir);
+ }
+
+ if (in && in->is_dir()) {
+ ceph_assert(srcdn);
+ mdcache->project_subtree_rename(in, destdir, srcdir);
+ }
+
+ if (mdr && !mdr->more()->peer_update_journaled) { // prepare was never journaled, so the rollback is not journaled either
+ ceph_assert(le->commit.empty());
+ mdlog->cancel_entry(le);
+ mut->ls = NULL;
+ _rename_rollback_finish(mut, mdr, srcdn, srcdnpv, destdn, straydn, splits, finish_mdr);
+ } else {
+ ceph_assert(!le->commit.empty());
+ if (mdr)
+ mdr->more()->peer_update_journaled = false;
+ MDSLogContextBase *fin = new C_MDS_LoggedRenameRollback(this, mut, mdr,
+ srcdn, srcdnpv, destdn, straydn,
+ splits, finish_mdr);
+ submit_mdlog_entry(le, fin, mdr, __func__);
+ mdlog->flush();
+ }
+}
+/**
+ * Apply a rename rollback prepared by do_rename_rollback().
+ *
+ * Unlinks whatever straydn/destdn currently hold and pops the projected
+ * linkages pushed during the prepare step, applies the mutation, fixes up
+ * the subtree map for renamed directories, and then either trims non-auth
+ * subtrees (while the MDS is in resolve) or sends the collected snap
+ * splits. Finally releases the freeze/ambiguous-auth state held by @p mdr
+ * (if any) and tells the cache the rollback is finished.
+ *
+ * @param mut        mutation carrying the projected changes
+ * @param mdr        peer request; the code tolerates a null reference
+ * @param srcdn      source dentry, may be null
+ * @param srcdnpv    pre-dirtied version for srcdn (used when we are its auth)
+ * @param destdn     destination dentry, may be null
+ * @param straydn    stray dentry, may be null
+ * @param splits     snap split messages; [0] and [1] are both sent outside resolve
+ * @param finish_mdr finish the request instead of keeping it for reuse
+ */
+void Server::_rename_rollback_finish(MutationRef& mut, MDRequestRef& mdr, CDentry *srcdn,
+ version_t srcdnpv, CDentry *destdn, CDentry *straydn,
+ map<client_t,ref_t<MClientSnap>> splits[2], bool finish_mdr)
+{
+ dout(10) << "_rename_rollback_finish " << mut->reqid << dendl;
+
+ if (straydn) {
+ straydn->get_dir()->unlink_inode(straydn);
+ straydn->pop_projected_linkage();
+ }
+ if (destdn) {
+ destdn->get_dir()->unlink_inode(destdn);
+ destdn->pop_projected_linkage();
+ }
+ if (srcdn) {
+ srcdn->pop_projected_linkage();
+ if (srcdn->authority().first == mds->get_nodeid()) {
+ srcdn->mark_dirty(srcdnpv, mut->ls);
+ if (srcdn->get_linkage()->is_primary())
+ srcdn->get_linkage()->get_inode()->state_set(CInode::STATE_AUTH);
+ }
+ }
+
+ mut->apply();
+
+ if (srcdn && srcdn->get_linkage()->is_primary()) {
+ CInode *in = srcdn->get_linkage()->get_inode();
+ if (in && in->is_dir()) {
+ ceph_assert(destdn);
+ mdcache->adjust_subtree_after_rename(in, destdn->get_dir(), true);
+ }
+ }
+
+ if (destdn) {
+ CInode *oldin = destdn->get_linkage()->get_inode();
+ // update subtree map?
+ if (oldin && oldin->is_dir()) {
+ ceph_assert(straydn);
+ mdcache->adjust_subtree_after_rename(oldin, straydn->get_dir(), true);
+ }
+ }
+
+ if (mds->is_resolve()) {
+ CDir *root = NULL;
+ if (straydn)
+ root = mdcache->get_subtree_root(straydn->get_dir());
+ else if (destdn)
+ root = mdcache->get_subtree_root(destdn->get_dir());
+ if (root)
+ mdcache->try_trim_non_auth_subtree(root);
+ } else {
+ mdcache->send_snaps(splits[1]);
+ mdcache->send_snaps(splits[0]);
+ }
+
+ if (mdr) {
+ MDSContext::vec finished;
+ if (mdr->more()->is_ambiguous_auth) {
+ if (srcdn->is_auth())
+ mdr->more()->rename_inode->unfreeze_inode(finished);
+
+ mdr->more()->rename_inode->clear_ambiguous_auth(finished);
+ mdr->more()->is_ambiguous_auth = false;
+ }
+ mds->queue_waiters(finished);
+ if (finish_mdr || mdr->aborted)
+ mdcache->request_finish(mdr);
+ else
+ mdr->more()->peer_rolling_back = false; // request is kept alive; only the rolling-back flag is cleared
+ }
+
+ mdcache->finish_rollback(mut->reqid, mdr);
+
+ mut->cleanup();
+}
+
+/**
+ * Handle a peer's acknowledgement of a rename prepare.
+ *
+ * Records the sender as a peer, marks the renamed inode ambiguous-auth when
+ * the srcdn auth acks a remotely frozen auth pin, and then either notes the
+ * sender as a witness or stores the extra witnesses it asked for. Any inode
+ * export blob in the ack is kept for the later import. Once no more acks
+ * are outstanding the client request is dispatched again.
+ *
+ * @param mdr  request waiting on the ack
+ * @param ack  the peer's reply message
+ */
+void Server::handle_peer_rename_prep_ack(MDRequestRef& mdr, const cref_t<MMDSPeerRequest> &ack)
+{
+  dout(10) << "handle_peer_rename_prep_ack " << *mdr
+           << " witnessed by " << ack->get_source()
+           << " " << *ack << dendl;
+  const mds_rank_t sender = mds_rank_t(ack->get_source().num());
+  auto rename_state = mdr->more();
+
+  // note peer
+  rename_state->peers.insert(sender);
+  const bool from_srcdn_auth = (rename_state->srcdn_auth_mds == sender);
+  if (from_srcdn_auth &&
+      rename_state->is_remote_frozen_authpin &&
+      !rename_state->is_ambiguous_auth) {
+    mdr->set_ambiguous_auth(rename_state->rename_inode);
+  }
+
+  // a rank may only ack as witness once
+  ceph_assert(rename_state->witnessed.count(sender) == 0);
+  if (ack->is_interrupted()) {
+    dout(10) << " peer request interrupted, noop" << dendl;
+  } else if (!ack->witnesses.empty()) {
+    // the peer wants more witnesses before it will prepare
+    dout(10) << " extra witnesses (srcdn replicas) are " << ack->witnesses << dendl;
+    rename_state->extra_witnesses = ack->witnesses;
+    rename_state->extra_witnesses.erase(mds->get_nodeid());  // not me!
+  } else {
+    rename_state->witnessed.insert(sender);
+    if (!ack->is_not_journaled()) {
+      rename_state->has_journaled_peers = true;
+    }
+  }
+
+  // srci import?
+  if (ack->inode_export.length()) {
+    dout(10) << " got srci import" << dendl;
+    rename_state->inode_import.share(ack->inode_export);
+    rename_state->inode_import_v = ack->inode_export_v;
+  }
+
+  // remove from waiting list
+  ceph_assert(rename_state->waiting_on_peer.count(sender));
+  rename_state->waiting_on_peer.erase(sender);
+
+  if (!rename_state->waiting_on_peer.empty()) {
+    dout(10) << "still waiting on peers " << rename_state->waiting_on_peer << dendl;
+    return;
+  }
+  dispatch_client_request(mdr);  // go again!
+}
+
+/**
+ * Handle a rename-notify ack on the peer side. The sender is removed from
+ * the set of outstanding acks; when that set becomes empty and a peer
+ * request is still attached, the peer request is dispatched again. Acks
+ * from ranks we are not waiting on are ignored.
+ */
+void Server::handle_peer_rename_notify_ack(MDRequestRef& mdr, const cref_t<MMDSPeerRequest> &ack)
+{
+  dout(10) << "handle_peer_rename_notify_ack " << *mdr << " from mds."
+           << ack->get_source() << dendl;
+  ceph_assert(mdr->is_peer());
+  const mds_rank_t sender = mds_rank_t(ack->get_source().num());
+
+  auto &pending = mdr->more()->waiting_on_peer;
+  if (!pending.count(sender))
+    return;
+  pending.erase(sender);
+
+  if (!pending.empty()) {
+    dout(10) << " still waiting for rename notify acks from "
+             << mdr->more()->waiting_on_peer << dendl;
+    return;
+  }
+
+  if (mdr->peer_request) {
+    dispatch_peer_request(mdr);
+  }
+}
+
+/**
+ * Completion for the session flush a peer rename waits on. The flush is
+ * tracked in waiting_on_peer under the placeholder rank MDS_RANK_NONE;
+ * removing it may unblock the peer request, which is then dispatched again.
+ */
+void Server::_peer_rename_sessions_flushed(MDRequestRef& mdr)
+{
+  dout(10) << "_peer_rename_sessions_flushed " << *mdr << dendl;
+
+  auto &pending = mdr->more()->waiting_on_peer;
+  if (!pending.count(MDS_RANK_NONE))
+    return;
+  pending.erase(MDS_RANK_NONE);
+
+  if (!pending.empty()) {
+    dout(10) << " still waiting for rename notify acks from "
+             << mdr->more()->waiting_on_peer << dendl;
+    return;
+  }
+
+  if (mdr->peer_request) {
+    dispatch_peer_request(mdr);
+  }
+}
+
+// snaps
+/* This function takes responsibility for the passed mdr*/
+/**
+ * Handle a client LSSNAP request: list the snapshots visible on a
+ * directory, encoded like a readdir reply.
+ *
+ * The directory is taken from the request's filepath ino and must be an
+ * auth directory inode the caller may read. Listing resumes after the snap
+ * named in path2 (if any) and stops at max_entries entries or when the
+ * next entry would not fit in the byte budget.
+ *
+ * Reply payload (mdr->reply_extra_bl): an empty DirStat, a __u32 entry
+ * count, __u16 flags, then per snapshot its name, an infinite lease and
+ * the inode stat as of that snapid.
+ *
+ * Responds -CEPHFS_ENOTDIR if the inode is not a directory; lock or access
+ * failures return early through the respective helpers.
+ */
+void Server::handle_client_lssnap(MDRequestRef& mdr)
+{
+  const cref_t<MClientRequest> &req = mdr->client_request;
+
+  // traverse to path
+  CInode *diri = try_get_auth_inode(mdr, req->get_filepath().get_ino());
+  if (!diri)
+    return;
+
+  if (!diri->is_dir()) {
+    respond_to_request(mdr, -CEPHFS_ENOTDIR);
+    return;
+  }
+  dout(10) << "lssnap on " << *diri << dendl;
+
+  // lock snap
+  if (!mds->locker->try_rdlock_snap_layout(diri, mdr))
+    return;
+
+  if (!check_access(mdr, diri, MAY_READ))
+    return;
+
+  SnapRealm *realm = diri->find_snaprealm();
+  map<snapid_t,const SnapInfo*> infomap;
+  realm->get_snap_info(infomap, diri->get_oldest_snap());
+
+  unsigned max_entries = req->head.args.readdir.max_entries;
+  if (!max_entries)
+    max_entries = infomap.size();
+  int max_bytes = req->head.args.readdir.max_bytes;
+  if (!max_bytes)
+    // make sure at least one item can be encoded
+    max_bytes = (512 << 10) + g_conf()->mds_max_xattr_pairs_size;
+
+  // resume after the snapshot named by path2, if the client gave one
+  __u64 last_snapid = 0;
+  const string &offset_str = req->get_path2();
+  if (!offset_str.empty())
+    last_snapid = realm->resolve_snapname(offset_str, diri->ino());
+
+  //Empty DirStat
+  bufferlist dirbl;
+  static DirStat empty;
+  CDir::encode_dirstat(dirbl, mdr->session->info, empty);
+
+  // Reserve room for the reply header: the dirstat just encoded plus the
+  // entry count (__u32) and flags (__u16) appended after the loop.  The
+  // previous expression subtracted sizeof(__u32) from the reservation
+  // instead of adding it, so the header was under-reserved by 8 bytes.
+  max_bytes -= static_cast<int>(dirbl.length() + sizeof(__u32) + sizeof(__u16));
+
+  __u32 num = 0;
+  bufferlist dnbl;
+  auto p = infomap.upper_bound(last_snapid);
+  for (; p != infomap.end() && num < max_entries; ++p) {
+    dout(10) << p->first << " -> " << *p->second << dendl;
+
+    // actual
+    // snaps owned by this dir use their short name, others the long name
+    string snap_name;
+    if (p->second->ino == diri->ino())
+      snap_name = p->second->name;
+    else
+      snap_name = p->second->get_long_name();
+
+    // stop before encoding an entry whose name + lease cannot fit
+    unsigned start_len = dnbl.length();
+    if (int(start_len + snap_name.length() + sizeof(__u32) + sizeof(LeaseStat)) > max_bytes)
+      break;
+
+    encode(snap_name, dnbl);
+    //infinite lease
+    LeaseStat e(CEPH_LEASE_VALID, -1, 0);
+    mds->locker->encode_lease(dnbl, mdr->session->info, e);
+    dout(20) << "encode_infinite_lease" << dendl;
+
+    int r = diri->encode_inodestat(dnbl, mdr->session, realm, p->first, max_bytes - (int)dnbl.length());
+    if (r < 0) {
+      // inode stat did not fit: drop the partial entry and stop
+      bufferlist keep;
+      keep.substr_of(dnbl, 0, start_len);
+      dnbl.swap(keep);
+      break;
+    }
+    ++num;
+  }
+
+  encode(num, dirbl);
+  __u16 flags = 0;
+  if (p == infomap.end()) {
+    // reached the end of the listing; it is complete only if we started at the beginning
+    flags = CEPH_READDIR_FRAG_END;
+    if (last_snapid == 0)
+      flags |= CEPH_READDIR_FRAG_COMPLETE;
+  }
+  encode(flags, dirbl);
+  dirbl.claim_append(dnbl);
+
+  mdr->reply_extra_bl = dirbl;
+  mdr->tracei = diri;
+  respond_to_request(mdr, 0);
+}
+
+
+// MKSNAP
+
+/**
+ * Log-completion context for mksnap: keeps the directory inode and a copy
+ * of the new SnapInfo, and hands both to Server::_mksnap_finish().
+ */
+struct C_MDS_mksnap_finish : public ServerLogContext {
+  CInode *diri;
+  SnapInfo info;
+
+  C_MDS_mksnap_finish(Server *s, MDRequestRef& r, CInode *di, SnapInfo &i)
+    : ServerLogContext(s, r),
+      diri(di),
+      info(i)
+  {}
+
+  void finish(int r) override
+  {
+    server->_mksnap_finish(mdr, diri, info);
+  }
+};
+
+/* This function takes responsibility for the passed mdr.
+ *
+ * Handle a client MKSNAP request: create a snapshot named by the last
+ * dentry of the request's filepath on the directory given by its ino.
+ *
+ * The function is re-entered through C_MDS_RetryRequest (waiting for a
+ * newer mdsmap, and after the snap table prepare), so every step must be
+ * safe to repeat; lock acquisition is guarded by the ALL_LOCKED flag and
+ * the table prepare by mdr->more()->stid.
+ *
+ * Error replies issued here:
+ *  -CEPHFS_EPERM   snapshots disabled, system dir other than root, caller
+ *                  uid outside [mds_snap_min_uid, mds_snap_max_uid], or the
+ *                  dir is below a subvolume dir
+ *  -CEPHFS_ENOTDIR target is not a directory
+ *  -CEPHFS_EMLINK  the dir's realm already holds max_snaps_per_dir snaps
+ *  -CEPHFS_EEXIST  a snapshot with that name exists
+ *  -CEPHFS_EINVAL  name is empty, too long, or starts with '_'
+ *
+ * On success the inode and snaprealm changes are projected, journaled in
+ * an EUpdate, and completed in _mksnap_finish().
+ */
+void Server::handle_client_mksnap(MDRequestRef& mdr)
+{
+ const cref_t<MClientRequest> &req = mdr->client_request;
+ // make sure we have as new a map as the client
+ if (req->get_mdsmap_epoch() > mds->mdsmap->get_epoch()) {
+ mds->wait_for_mdsmap(req->get_mdsmap_epoch(), new C_MDS_RetryRequest(mdcache, mdr));
+ return;
+ }
+ if (!mds->mdsmap->allows_snaps()) {
+ // you can't make snapshots until you set an option right now
+ dout(5) << "new snapshots are disabled for this fs" << dendl;
+ respond_to_request(mdr, -CEPHFS_EPERM);
+ return;
+ }
+
+ CInode *diri = try_get_auth_inode(mdr, req->get_filepath().get_ino());
+ if (!diri)
+ return;
+
+ // dir only
+ if (!diri->is_dir()) {
+ respond_to_request(mdr, -CEPHFS_ENOTDIR);
+ return;
+ }
+ if (diri->is_system() && !diri->is_root()) {
+ // no snaps in system dirs (root is ok)
+ dout(5) << "is an internal system dir" << dendl;
+ respond_to_request(mdr, -CEPHFS_EPERM);
+ return;
+ }
+
+ std::string_view snapname = req->get_filepath().last_dentry();
+
+ if (mdr->client_request->get_caller_uid() < g_conf()->mds_snap_min_uid || mdr->client_request->get_caller_uid() > g_conf()->mds_snap_max_uid) {
+ dout(20) << "mksnap " << snapname << " on " << *diri << " denied to uid " << mdr->client_request->get_caller_uid() << dendl;
+ respond_to_request(mdr, -CEPHFS_EPERM);
+ return;
+ }
+
+ dout(10) << "mksnap " << snapname << " on " << *diri << dendl;
+
+ // lock snap
+ if (!(mdr->locking_state & MutationImpl::ALL_LOCKED)) { // skipped on retries once all locks are held
+ MutationImpl::LockOpVec lov;
+ lov.add_xlock(&diri->snaplock);
+ if (!mds->locker->acquire_locks(mdr, lov))
+ return;
+
+ if (CDentry *pdn = diri->get_projected_parent_dn(); pdn) {
+ if (!mds->locker->try_rdlock_snap_layout(pdn->get_dir()->get_inode(), mdr))
+ return;
+ }
+ mdr->locking_state |= MutationImpl::ALL_LOCKED;
+ }
+
+ if (!check_access(mdr, diri, MAY_WRITE|MAY_SNAPSHOT))
+ return;
+
+ if (inodeno_t subvol_ino = diri->find_snaprealm()->get_subvolume_ino();
+ (subvol_ino && subvol_ino != diri->ino())) {
+ dout(5) << "is a descendent of a subvolume dir" << dendl;
+ respond_to_request(mdr, -CEPHFS_EPERM);
+ return;
+ }
+
+ // check if we can create any more snapshots
+ // we don't allow any more if we are already at or beyond the limit
+ if (diri->snaprealm &&
+ diri->snaprealm->get_snaps().size() >= max_snaps_per_dir) {
+ respond_to_request(mdr, -CEPHFS_EMLINK);
+ return;
+ }
+
+ // make sure name is unique
+ if (diri->snaprealm &&
+ diri->snaprealm->exists(snapname)) {
+ respond_to_request(mdr, -CEPHFS_EEXIST);
+ return;
+ }
+ if (snapname.length() == 0 ||
+ snapname.length() > snapshot_name_max ||
+ snapname[0] == '_') {
+ respond_to_request(mdr, -CEPHFS_EINVAL);
+ return;
+ }
+
+ // allocate a snapid
+ if (!mdr->more()->stid) {
+ // prepare an stid
+ mds->snapclient->prepare_create(diri->ino(), snapname,
+ mdr->get_mds_stamp(),
+ &mdr->more()->stid, &mdr->more()->snapidbl,
+ new C_MDS_RetryRequest(mdcache, mdr));
+ return;
+ }
+
+ version_t stid = mdr->more()->stid;
+ snapid_t snapid;
+ auto p = mdr->more()->snapidbl.cbegin();
+ decode(snapid, p);
+ dout(10) << " stid " << stid << " snapid " << snapid << dendl;
+
+ ceph_assert(mds->snapclient->get_cached_version() >= stid);
+
+ SnapPayload payload;
+ if (req->get_data().length()) {
+ try {
+ auto iter = req->get_data().cbegin();
+ decode(payload, iter);
+ } catch (const ceph::buffer::error &e) {
+ // backward compat -- client sends xattr bufferlist. however,
+ // that is not used anywhere -- so (log and) ignore.
+ dout(20) << ": no metadata in payload (old client?)" << dendl;
+ }
+ }
+
+ // journal
+ SnapInfo info;
+ info.ino = diri->ino();
+ info.snapid = snapid;
+ info.name = snapname;
+ info.stamp = mdr->get_op_stamp();
+ info.metadata = payload.metadata;
+
+ auto pi = diri->project_inode(mdr, false, true);
+ pi.inode->ctime = info.stamp;
+ if (info.stamp > pi.inode->rstat.rctime)
+ pi.inode->rstat.rctime = info.stamp;
+ pi.inode->rstat.rsnaps++;
+ pi.inode->version = diri->pre_dirty();
+
+ // project the snaprealm
+ auto &newsnap = *pi.snapnode;
+ newsnap.created = snapid;
+ auto em = newsnap.snaps.emplace(std::piecewise_construct, std::forward_as_tuple(snapid), std::forward_as_tuple(info));
+ if (!em.second) // an entry for this snapid already existed: overwrite it
+ em.first->second = info;
+ newsnap.seq = snapid;
+ newsnap.last_created = snapid;
+ newsnap.last_modified = info.stamp;
+ newsnap.change_attr++;
+
+ // journal the inode changes
+ mdr->ls = mdlog->get_current_segment();
+ EUpdate *le = new EUpdate(mdlog, "mksnap");
+ mdlog->start_entry(le);
+
+ le->metablob.add_client_req(req->get_reqid(), req->get_oldest_client_tid());
+ le->metablob.add_table_transaction(TABLE_SNAP, stid);
+ mdcache->predirty_journal_parents(mdr, &le->metablob, diri, 0, PREDIRTY_PRIMARY, false);
+ mdcache->journal_dirty_inode(mdr.get(), &le->metablob, diri);
+
+ // journal the snaprealm changes
+ submit_mdlog_entry(le, new C_MDS_mksnap_finish(this, mdr, diri, info),
+ mdr, __func__);
+ mdlog->flush();
+}
+
+/**
+ * Complete a journaled mksnap: apply the projected changes, commit the
+ * snap table transaction, notify other MDSs and clients of the realm
+ * change, and reply to the client.
+ *
+ * @param mdr   the mksnap request
+ * @param diri  directory the snapshot was taken on
+ * @param info  description of the new snapshot
+ */
+void Server::_mksnap_finish(MDRequestRef& mdr, CInode *diri, SnapInfo &info)
+{
+  dout(10) << "_mksnap_finish " << *mdr << " " << info << dendl;
+
+  // The op is chosen from whether diri already has a snaprealm at this
+  // point, i.e. before mdr->apply() runs.
+  int snap_op;
+  if (diri->snaprealm)
+    snap_op = CEPH_SNAP_OP_CREATE;
+  else
+    snap_op = CEPH_SNAP_OP_SPLIT;
+
+  mdr->apply();
+
+  auto req_state = mdr->more();
+  mds->snapclient->commit(req_state->stid, mdr->ls);
+
+  // create snap
+  dout(10) << "snaprealm now " << *diri->snaprealm << dendl;
+
+  // tell the other MDS ranks, then invalidate and notify for the realm
+  mdcache->send_snap_update(diri, req_state->stid, snap_op);
+  mdcache->do_realm_invalidate_and_update_notify(diri, snap_op);
+
+  // reply, tracing the snapshotted directory
+  mdr->in[0] = diri;
+  mdr->snapid = info.snapid;
+  mdr->tracei = diri;
+  respond_to_request(mdr, 0);
+}
+
+
+// RMSNAP
+
+/*
+ * Log completion context for rmsnap: remembers the directory inode and the
+ * snapid being removed and forwards them to Server::_rmsnap_finish().
+ */
+struct C_MDS_rmsnap_finish : public ServerLogContext {
+  CInode *diri;
+  snapid_t snapid;
+
+  C_MDS_rmsnap_finish(Server *srv, MDRequestRef& req, CInode *dir_inode, snapid_t sid)
+    : ServerLogContext(srv, req),
+      diri(dir_inode),
+      snapid(sid)
+  {}
+
+  // The result code is not inspected; the finish routine is always run.
+  void finish(int r) override {
+    server->_rmsnap_finish(mdr, diri, snapid);
+  }
+};
+
+/* This function takes responsibility for the passed mdr.
+ *
+ * Handles a client request to remove a snapshot. The directory is looked up
+ * by the base ino of the request filepath and the snapshot name is the last
+ * dentry of that filepath.
+ *
+ * Replies -CEPHFS_ENOTDIR if the inode is not a directory, -CEPHFS_EPERM if
+ * the caller uid is outside [mds_snap_min_uid, mds_snap_max_uid],
+ * -CEPHFS_EINVAL for an empty or '_'-prefixed name and -CEPHFS_ENOENT if the
+ * directory has no such snapshot.
+ *
+ * The function returns without replying when a helper (inode lookup, lock
+ * acquisition, access check, snap table prepare) reports that it cannot
+ * proceed yet; a C_MDS_RetryRequest is handed to prepare_destroy, so this
+ * function is presumably re-entered once the table transaction id is known.
+ * On the final pass the inode/snaprealm changes are projected and journaled
+ * and C_MDS_rmsnap_finish is registered as the log completion.
+ */
+void Server::handle_client_rmsnap(MDRequestRef& mdr)
+{
+ const cref_t<MClientRequest> &req = mdr->client_request;
+
+ CInode *diri = try_get_auth_inode(mdr, req->get_filepath().get_ino());
+ if (!diri) // no inode available here; nothing more is done in this pass
+ return;
+
+ if (!diri->is_dir()) {
+ respond_to_request(mdr, -CEPHFS_ENOTDIR);
+ return;
+ }
+
+ std::string_view snapname = req->get_filepath().last_dentry();
+
+ if (mdr->client_request->get_caller_uid() < g_conf()->mds_snap_min_uid || mdr->client_request->get_caller_uid() > g_conf()->mds_snap_max_uid) {
+ dout(20) << "rmsnap " << snapname << " on " << *diri << " denied to uid " << mdr->client_request->get_caller_uid() << dendl;
+ respond_to_request(mdr, -CEPHFS_EPERM);
+ return;
+ }
+
+ dout(10) << "rmsnap " << snapname << " on " << *diri << dendl;
+
+ // does snap exist?
+ if (snapname.length() == 0 || snapname[0] == '_') {
+ respond_to_request(mdr, -CEPHFS_EINVAL); // can't prune a parent snap, currently.
+ return;
+ }
+ if (!diri->snaprealm || !diri->snaprealm->exists(snapname)) {
+ respond_to_request(mdr, -CEPHFS_ENOENT);
+ return;
+ }
+ snapid_t snapid = diri->snaprealm->resolve_snapname(snapname, diri->ino());
+ dout(10) << " snapname " << snapname << " is " << snapid << dendl;
+ if (!(mdr->locking_state & MutationImpl::ALL_LOCKED)) { // locks are taken only once per request
+ MutationImpl::LockOpVec lov;
+ lov.add_xlock(&diri->snaplock);
+ if (!mds->locker->acquire_locks(mdr, lov))
+ return;
+ if (CDentry *pdn = diri->get_projected_parent_dn(); pdn) { // only when the dir has a projected parent dentry
+ if (!mds->locker->try_rdlock_snap_layout(pdn->get_dir()->get_inode(), mdr))
+ return;
+ }
+ mdr->locking_state |= MutationImpl::ALL_LOCKED;
+ }
+
+ if (!check_access(mdr, diri, MAY_WRITE|MAY_SNAPSHOT))
+ return;
+
+ // prepare
+ if (!mdr->more()->stid) { // no table transaction yet: request one and come back later
+ mds->snapclient->prepare_destroy(diri->ino(), snapid,
+ &mdr->more()->stid, &mdr->more()->snapidbl,
+ new C_MDS_RetryRequest(mdcache, mdr));
+ return;
+ }
+ version_t stid = mdr->more()->stid;
+ auto p = mdr->more()->snapidbl.cbegin();
+ snapid_t seq; // decoded from the buffer filled in by prepare_destroy
+ decode(seq, p);
+ dout(10) << " stid is " << stid << ", seq is " << seq << dendl;
+
+ ceph_assert(mds->snapclient->get_cached_version() >= stid);
+
+ // journal
+ auto pi = diri->project_inode(mdr, false, true);
+ pi.inode->version = diri->pre_dirty();
+ pi.inode->ctime = mdr->get_op_stamp();
+ if (mdr->get_op_stamp() > pi.inode->rstat.rctime) // rctime only ever moves forward
+ pi.inode->rstat.rctime = mdr->get_op_stamp();
+ pi.inode->rstat.rsnaps--;
+
+ mdr->ls = mdlog->get_current_segment();
+ EUpdate *le = new EUpdate(mdlog, "rmsnap");
+ mdlog->start_entry(le);
+
+ // project the snaprealm
+ auto &newnode = *pi.snapnode;
+ newnode.snaps.erase(snapid);
+ newnode.seq = seq;
+ newnode.last_destroyed = seq;
+ newnode.last_modified = mdr->get_op_stamp();
+ newnode.change_attr++;
+
+ le->metablob.add_client_req(req->get_reqid(), req->get_oldest_client_tid());
+ le->metablob.add_table_transaction(TABLE_SNAP, stid);
+ mdcache->predirty_journal_parents(mdr, &le->metablob, diri, 0, PREDIRTY_PRIMARY, false);
+ mdcache->journal_dirty_inode(mdr.get(), &le->metablob, diri);
+
+ submit_mdlog_entry(le, new C_MDS_rmsnap_finish(this, mdr, diri, snapid), // completion calls _rmsnap_finish
+ mdr, __func__);
+ mdlog->flush();
+}
+
+/*
+ * Completion step of rmsnap: applies the projected state, commits the snap
+ * table transaction, notifies other MDSs and clients, replies with 0 and
+ * finally purges snapshot data that is stale with respect to the realm's
+ * remaining snaps.
+ *
+ * @param mdr    the client request being completed
+ * @param diri   directory inode the snapshot was removed from
+ * @param snapid id of the removed snapshot (reported back via mdr->snapid)
+ */
+void Server::_rmsnap_finish(MDRequestRef& mdr, CInode *diri, snapid_t snapid)
+{
+  dout(10) << "_rmsnap_finish " << *mdr << " " << snapid << dendl;
+  // This is a snap table transaction id, not a snapshot id, so hold it as
+  // version_t (the type handle_client_rmsnap uses for the same field).
+  const version_t stid = mdr->more()->stid;
+
+  mdr->apply();
+
+  mds->snapclient->commit(stid, mdr->ls);
+
+  dout(10) << "snaprealm now " << *diri->snaprealm << dendl;
+
+  // notify other mds
+  mdcache->send_snap_update(diri, stid, CEPH_SNAP_OP_DESTROY);
+
+  mdcache->do_realm_invalidate_and_update_notify(diri, CEPH_SNAP_OP_DESTROY);
+
+  // yay
+  mdr->in[0] = diri;
+  mdr->tracei = diri;
+  mdr->snapid = snapid;
+  respond_to_request(mdr, 0);
+
+  // purge snapshot data
+  diri->purge_stale_snap_data(diri->snaprealm->get_snaps());
+}
+
+/*
+ * Log completion context for renamesnap: remembers the directory inode and
+ * the snapid being renamed and forwards them to Server::_renamesnap_finish().
+ */
+struct C_MDS_renamesnap_finish : public ServerLogContext {
+  CInode *diri;
+  snapid_t snapid;
+
+  C_MDS_renamesnap_finish(Server *srv, MDRequestRef& req, CInode *dir_inode, snapid_t sid)
+    : ServerLogContext(srv, req),
+      diri(dir_inode),
+      snapid(sid)
+  {}
+
+  // The result code is not inspected; the finish routine is always run.
+  void finish(int r) override {
+    server->_renamesnap_finish(mdr, diri, snapid);
+  }
+};
+
+/* This function takes responsibility for the passed mdr.
+ *
+ * Handles a client request to rename a snapshot of a directory. The
+ * destination name is the last dentry of the request's first filepath and
+ * the source name is the last dentry of the second filepath; both paths
+ * must share the same base ino.
+ *
+ * Replies -CEPHFS_EINVAL if the two base inos differ or either name is
+ * empty or '_'-prefixed, -CEPHFS_ENOTDIR if the inode is not a directory,
+ * -CEPHFS_EPERM if the caller uid is outside
+ * [mds_snap_min_uid, mds_snap_max_uid], -CEPHFS_ENOENT if the source
+ * snapshot does not exist and -CEPHFS_EEXIST if the destination name is
+ * already taken.
+ *
+ * The function returns without replying when a helper (inode lookup, lock
+ * acquisition, access check, snap table prepare) reports that it cannot
+ * proceed yet; a C_MDS_RetryRequest is handed to prepare_update, so this
+ * function is presumably re-entered once the table transaction id is known.
+ * On the final pass the change is projected and journaled and
+ * C_MDS_renamesnap_finish is registered as the log completion.
+ */
+void Server::handle_client_renamesnap(MDRequestRef& mdr)
+{
+ const cref_t<MClientRequest> &req = mdr->client_request;
+ if (req->get_filepath().get_ino() != req->get_filepath2().get_ino()) { // source and destination must be in the same dir
+ respond_to_request(mdr, -CEPHFS_EINVAL);
+ return;
+ }
+
+ CInode *diri = try_get_auth_inode(mdr, req->get_filepath().get_ino());
+ if (!diri) // no inode available here; nothing more is done in this pass
+ return;
+
+ if (!diri->is_dir()) { // dir only
+ respond_to_request(mdr, -CEPHFS_ENOTDIR);
+ return;
+ }
+
+ if (mdr->client_request->get_caller_uid() < g_conf()->mds_snap_min_uid ||
+ mdr->client_request->get_caller_uid() > g_conf()->mds_snap_max_uid) {
+ respond_to_request(mdr, -CEPHFS_EPERM);
+ return;
+ }
+
+ std::string_view dstname = req->get_filepath().last_dentry(); // new name comes from the first path
+ std::string_view srcname = req->get_filepath2().last_dentry(); // existing name comes from the second path
+ dout(10) << "renamesnap " << srcname << "->" << dstname << " on " << *diri << dendl;
+
+ if (srcname.length() == 0 || srcname[0] == '_') {
+ respond_to_request(mdr, -CEPHFS_EINVAL); // can't rename a parent snap.
+ return;
+ }
+ if (!diri->snaprealm || !diri->snaprealm->exists(srcname)) {
+ respond_to_request(mdr, -CEPHFS_ENOENT);
+ return;
+ }
+ if (dstname.length() == 0 || dstname[0] == '_') {
+ respond_to_request(mdr, -CEPHFS_EINVAL);
+ return;
+ }
+ if (diri->snaprealm->exists(dstname)) {
+ respond_to_request(mdr, -CEPHFS_EEXIST);
+ return;
+ }
+
+ snapid_t snapid = diri->snaprealm->resolve_snapname(srcname, diri->ino());
+
+ dout(10) << " snapname " << srcname << " is " << snapid << dendl;
+
+ // lock snap
+ if (!(mdr->locking_state & MutationImpl::ALL_LOCKED)) { // locks are taken only once per request
+ MutationImpl::LockOpVec lov;
+ lov.add_xlock(&diri->snaplock);
+ if (!mds->locker->acquire_locks(mdr, lov))
+ return;
+ if (CDentry *pdn = diri->get_projected_parent_dn(); pdn) { // only when the dir has a projected parent dentry
+ if (!mds->locker->try_rdlock_snap_layout(pdn->get_dir()->get_inode(), mdr))
+ return;
+ }
+ mdr->locking_state |= MutationImpl::ALL_LOCKED;
+ }
+
+ if (!check_access(mdr, diri, MAY_WRITE|MAY_SNAPSHOT))
+ return;
+
+ // prepare
+ if (!mdr->more()->stid) { // no table transaction yet: request one and come back later
+ mds->snapclient->prepare_update(diri->ino(), snapid, dstname, utime_t(),
+ &mdr->more()->stid,
+ new C_MDS_RetryRequest(mdcache, mdr));
+ return;
+ }
+
+ version_t stid = mdr->more()->stid;
+ dout(10) << " stid is " << stid << dendl;
+
+ ceph_assert(mds->snapclient->get_cached_version() >= stid);
+
+ // journal
+ auto pi = diri->project_inode(mdr, false, true);
+ pi.inode->ctime = mdr->get_op_stamp();
+ if (mdr->get_op_stamp() > pi.inode->rstat.rctime) // rctime only ever moves forward
+ pi.inode->rstat.rctime = mdr->get_op_stamp();
+ pi.inode->version = diri->pre_dirty();
+
+ // project the snaprealm
+ auto &newsnap = *pi.snapnode;
+ auto it = newsnap.snaps.find(snapid);
+ ceph_assert(it != newsnap.snaps.end()); // the snap was resolved above, so it must be in the projected node
+ it->second.name = dstname;
+ newsnap.last_modified = mdr->get_op_stamp();
+ newsnap.change_attr++;
+
+ // journal the inode changes
+ mdr->ls = mdlog->get_current_segment();
+ EUpdate *le = new EUpdate(mdlog, "renamesnap");
+ mdlog->start_entry(le);
+
+ le->metablob.add_client_req(req->get_reqid(), req->get_oldest_client_tid());
+ le->metablob.add_table_transaction(TABLE_SNAP, stid);
+ mdcache->predirty_journal_parents(mdr, &le->metablob, diri, 0, PREDIRTY_PRIMARY, false);
+ mdcache->journal_dirty_inode(mdr.get(), &le->metablob, diri);
+
+ // journal the snaprealm changes
+ submit_mdlog_entry(le, new C_MDS_renamesnap_finish(this, mdr, diri, snapid), // completion calls _renamesnap_finish
+ mdr, __func__);
+ mdlog->flush();
+}
+
+/*
+ * Completion step of renamesnap: applies the projected state, commits the
+ * snap table transaction, notifies other MDSs and clients with
+ * CEPH_SNAP_OP_UPDATE and replies with 0.
+ */
+void Server::_renamesnap_finish(MDRequestRef& mdr, CInode *diri, snapid_t snapid)
+{
+  dout(10) << "_renamesnap_finish " << *mdr << " " << snapid << dendl;
+
+  mdr->apply();
+
+  const auto table_tid = mdr->more()->stid;
+  mds->snapclient->commit(table_tid, mdr->ls);
+
+  dout(10) << "snaprealm now " << *diri->snaprealm << dendl;
+
+  const int snap_op = CEPH_SNAP_OP_UPDATE;
+
+  // notify other mds
+  mdcache->send_snap_update(diri, table_tid, snap_op);
+
+  mdcache->do_realm_invalidate_and_update_notify(diri, snap_op);
+
+  // fill in the trace inode and snapid for the reply, then answer
+  mdr->snapid = snapid;
+  mdr->tracei = diri;
+  mdr->in[0] = diri;
+  respond_to_request(mdr, 0);
+}
+
+/*
+ * Handles a client "readdir snapdiff" request: lists the entries of one
+ * directory fragment that differ between mdr->snapid and the snapshot given
+ * in head.args.snapdiff.snap_other.
+ *
+ * Replies -CEPHFS_ENOTDIR if the target is not a directory and
+ * -CEPHFS_EINVAL if the two snapids are equal or either one is CEPH_NOSNAP.
+ * Returns without replying when a helper reports it cannot proceed yet
+ * (path/lock acquisition, access check, dirfrag open), when the request is
+ * throttled on cap acquisition, or when the dirfrag is frozen or still has
+ * to be fetched; in the latter cases a C_MDS_RetryRequest is queued.
+ * Otherwise the reply is produced by _readdir_diff().
+ */
+void Server::handle_client_readdir_snapdiff(MDRequestRef& mdr)
+{
+  const cref_t<MClientRequest>& req = mdr->client_request;
+  Session* session = mds->get_session(req);
+  MutationImpl::LockOpVec lov;
+  CInode* diri = rdlock_path_pin_ref(mdr, false, true);
+  if (!diri) return;
+
+  // it's a directory, right?
+  if (!diri->is_dir()) {
+    // not a dir
+    dout(10) << "reply to " << *req << " snapdiff -CEPHFS_ENOTDIR" << dendl;
+    respond_to_request(mdr, -CEPHFS_ENOTDIR);
+    return;
+  }
+
+  auto num_caps = session->get_num_caps();
+  auto session_cap_acquisition = session->get_cap_acquisition();
+
+  // Throttle sessions that hold many caps and keep acquiring them quickly:
+  // the request is retried after caps_throttle_retry_request_timeout.
+  if (num_caps > static_cast<uint64_t>(max_caps_per_client * max_caps_throttle_ratio) && session_cap_acquisition >= cap_acquisition_throttle) {
+    dout(20) << "snapdiff throttled. max_caps_per_client: " << max_caps_per_client << " num_caps: " << num_caps
+             << " session_cap_acquistion: " << session_cap_acquisition << " cap_acquisition_throttle: " << cap_acquisition_throttle << dendl;
+    if (logger)
+      logger->inc(l_mdss_cap_acquisition_throttle);
+
+    mds->timer.add_event_after(caps_throttle_retry_request_timeout, new C_MDS_RetryRequest(mdcache, mdr));
+    return;
+  }
+
+  lov.add_rdlock(&diri->filelock);
+  lov.add_rdlock(&diri->dirfragtreelock);
+
+  if (!mds->locker->acquire_locks(mdr, lov))
+    return;
+
+  if (!check_access(mdr, diri, MAY_READ))
+    return;
+
+  // which frag?
+  frag_t fg = (__u32)req->head.args.snapdiff.frag;
+  unsigned req_flags = (__u32)req->head.args.snapdiff.flags;
+  string offset_str = req->get_path2();
+
+  // A non-empty offset name takes precedence over the hash sent by the client.
+  __u32 offset_hash = 0;
+  if (!offset_str.empty()) {
+    offset_hash = ceph_frag_value(diri->hash_dentry_name(offset_str));
+  } else {
+    offset_hash = (__u32)req->head.args.snapdiff.offset_hash;
+  }
+
+  dout(10) << " frag " << fg << " offset '" << offset_str << "'"
+           << " offset_hash " << offset_hash << " flags " << req_flags << dendl;
+
+  // does the frag exist?
+  if (diri->dirfragtree[fg.value()] != fg) {
+    frag_t newfg;
+    if (req_flags & CEPH_READDIR_REPLY_BITFLAGS) {
+      if (fg.contains((unsigned)offset_hash)) {
+        newfg = diri->dirfragtree[offset_hash];
+      } else {
+        // client actually wants next frag
+        newfg = diri->dirfragtree[fg.value()];
+      }
+    } else {
+      offset_str.clear();
+      newfg = diri->dirfragtree[fg.value()];
+    }
+    dout(10) << " adjust frag " << fg << " -> " << newfg << " " << diri->dirfragtree << dendl;
+    fg = newfg;
+  }
+
+  CDir* dir = try_open_auth_dirfrag(diri, fg, mdr);
+  if (!dir) return;
+
+  // ok!
+  dout(10) << __func__<< " on " << *dir << dendl;
+  ceph_assert(dir->is_auth());
+
+  if (!dir->is_complete()) {
+    if (dir->is_frozen()) {
+      dout(7) << "dir is frozen " << *dir << dendl;
+      mds->locker->drop_locks(mdr.get());
+      mdr->drop_local_auth_pins();
+      dir->add_waiter(CDir::WAIT_UNFREEZE, new C_MDS_RetryRequest(mdcache, mdr));
+      return;
+    }
+    // fetch
+    dout(10) << " incomplete dir contents for snapdiff on " << *dir << ", fetching" << dendl;
+    dir->fetch(new C_MDS_RetryRequest(mdcache, mdr), true);
+    return;
+  }
+
+#ifdef MDS_VERIFY_FRAGSTAT
+  dir->verify_fragstat();
+#endif
+
+  utime_t now = ceph_clock_now();
+  mdr->set_mds_stamp(now);
+
+  mdr->snapid_diff_other = (uint64_t)req->head.args.snapdiff.snap_other;
+  if (mdr->snapid_diff_other == mdr->snapid ||
+      mdr->snapid == CEPH_NOSNAP ||
+      mdr->snapid_diff_other == CEPH_NOSNAP) {
+    dout(10) << "reply to " << *req << " snapdiff -CEPHFS_EINVAL" << dendl;
+    respond_to_request(mdr, -CEPHFS_EINVAL);
+    // The request has been answered; stop here so that no second reply is
+    // built and sent for it below.
+    return;
+  }
+
+  dout(10) << __func__
+           << " snap " << mdr->snapid
+           << " vs. snap " << mdr->snapid_diff_other
+           << dendl;
+
+  SnapRealm* realm = diri->find_snaprealm();
+
+  unsigned max = req->head.args.snapdiff.max_entries;
+  if (!max)
+    max = dir->get_num_any(); // whatever, something big.
+  unsigned max_bytes = req->head.args.snapdiff.max_bytes;
+  if (!max_bytes)
+    // make sure at least one item can be encoded
+    max_bytes = (512 << 10) + g_conf()->mds_max_xattr_pairs_size;
+
+  // start final blob
+  bufferlist dirbl;
+  DirStat ds;
+  ds.frag = dir->get_frag();
+  ds.auth = dir->get_dir_auth().first;
+  if (dir->is_auth() && !forward_all_requests_to_auth)
+    dir->get_dist_spec(ds.dist, mds->get_nodeid());
+
+  dir->encode_dirstat(dirbl, mdr->session->info, ds);
+
+  // count bytes available.
+  // this isn't perfect, but we should capture the main variable/unbounded size items!
+  int front_bytes = dirbl.length() + sizeof(__u32) + sizeof(__u8) * 2;
+  int bytes_left = max_bytes - front_bytes;
+  bytes_left -= get_snap_trace(session, realm).length();
+
+  _readdir_diff(
+    now,
+    mdr,
+    diri,
+    dir,
+    realm,
+    max,
+    bytes_left,
+    offset_str,
+    offset_hash,
+    req_flags,
+    dirbl);
+}
+
+
+/**
+ * Tell whether client @c c is still listed in client_reconnect_gather,
+ * i.e. the server is in state RECONNECT and this client has not yet
+ * reconnected.
+ */
+bool Server::waiting_for_reconnect(client_t c) const
+{
+  const auto pending = client_reconnect_gather.find(c);
+  return pending != client_reconnect_gather.end();
+}
+
+/*
+ * Dump the set of clients still being waited on for reconnect into the
+ * formatter, as a "reconnect_status" object with a single
+ * "client_reconnect_gather" stream field.
+ */
+void Server::dump_reconnect_status(Formatter *f) const
+{
+  f->open_object_section("reconnect_status");
+  {
+    std::ostream& gather_out = f->dump_stream("client_reconnect_gather");
+    gather_out << client_reconnect_gather;
+  }
+  f->close_section();
+}
+
+/*
+ * Pick the snap trace encoding matching the session's feature set: the
+ * "new" trace when the session advertises
+ * CEPHFS_FEATURE_NEW_SNAPREALM_INFO, the regular one otherwise.
+ * Both pointers must be non-null (asserted).
+ */
+const bufferlist& Server::get_snap_trace(Session *session, SnapRealm *realm) const {
+  ceph_assert(session);
+  ceph_assert(realm);
+
+  const bool wants_new_format =
+    session->info.has_feature(CEPHFS_FEATURE_NEW_SNAPREALM_INFO);
+  if (!wants_new_format) {
+    return realm->get_snap_trace();
+  }
+  return realm->get_snap_trace_new();
+}
+
+/*
+ * Convenience overload: looks the client's session up in the session map
+ * and delegates to the Session* overload (which asserts it is non-null).
+ */
+const bufferlist& Server::get_snap_trace(client_t client, SnapRealm *realm) const {
+  const auto client_name = entity_name_t::CLIENT(client.v);
+  return get_snap_trace(mds->sessionmap.get_session(client_name), realm);
+}
+
+void Server::_readdir_diff(
+ utime_t now,
+ MDRequestRef& mdr,
+ CInode* diri,
+ CDir* dir,
+ SnapRealm* realm,
+ unsigned max_entries,
+ int bytes_left,
+ const string& offset_str,
+ uint32_t offset_hash,
+ unsigned req_flags,
+ bufferlist& dirbl)
+{
+ /* build dir contents
+  *
+  * Encodes into dnbl the entries of dir that build_snap_diff() reports as
+  * differing between mdr->snapid and mdr->snapid_diff_other, then hands
+  * dirbl and dnbl to _finalize_readdir(). offset_str/offset_hash give the
+  * position to resume from (both empty/zero means "from the beginning"),
+  * bytes_left is the encoding budget for dnbl and now is passed to the
+  * lease issuing call. max_entries is not consulted in this function.
+  */
+ bufferlist dnbl;
+ __u32 numfiles = 0; // number of entries the callback below accepted
+
+ snapid_t snapid = mdr->snapid;
+ snapid_t snapid_prev = mdr->snapid_diff_other;
+ if (snapid < snapid_prev) { // normalize so that snapid_prev <= snapid
+ std::swap(snapid, snapid_prev);
+ }
+ bool from_the_beginning = !offset_hash && offset_str.empty();
+ // skip all dns < dentry_key_t(snapid_prev, offset_str, offset_hash)
+ dentry_key_t skip_key(snapid_prev, offset_str.c_str(), offset_hash);
+
+ bool end = build_snap_diff(
+ mdr,
+ dir,
+ bytes_left,
+ from_the_beginning ? nullptr : & skip_key,
+ snapid_prev,
+ snapid,
+ dnbl,
+ [&](CDentry* dn, CInode* in, bool exists) { // returns false once dnbl cannot take another entry
+ string name;
+ snapid_t effective_snapid;
+ const auto& dn_name = dn->get_name();
+ // provide the first snapid for removed entries and
+ // the last one for existent ones
+ effective_snapid = exists ? snapid : snapid_prev;
+ name.append(dn_name);
+ if ((int)(dnbl.length() + name.length() + sizeof(__u32) + sizeof(LeaseStat)) > bytes_left) {
+ dout(10) << " ran out of room, stopping at " << dnbl.length() << " < " << bytes_left << dendl;
+ return false;
+ }
+
+ auto diri = dir->get_inode(); // note: shadows the diri parameter inside the lambda
+ auto hash = ceph_frag_value(diri->hash_dentry_name(dn_name));
+ unsigned start_len = dnbl.length(); // remembered so a partial entry can be cut off again
+ dout(10) << "inc dn " << *dn << " as " << name
+ << std::hex << " hash 0x" << hash << std::dec
+ << dendl;
+ encode(name, dnbl);
+ mds->locker->issue_client_lease(dn, in, mdr, now, dnbl);
+
+ // inode
+ dout(10) << "inc inode " << *in << " snap " << effective_snapid << dendl;
+ int r = in->encode_inodestat(dnbl, mdr->session, realm, effective_snapid, bytes_left - (int)dnbl.length());
+ if (r < 0) {
+ // chop off dn->name, lease
+ dout(10) << " ran out of room, stopping at "
+ << start_len << " < " << bytes_left << dendl;
+ bufferlist keep;
+ keep.substr_of(dnbl, 0, start_len);
+ dnbl.swap(keep);
+ return false;
+ }
+
+ // touch dn
+ mdcache->lru.lru_touch(dn);
+ ++numfiles;
+ return true;
+ });
+
+ __u16 flags = 0;
+ if (req_flags & CEPH_READDIR_REPLY_BITFLAGS) { // echo hash-order/offset-hash flags only when the client asked for bitflags
+ flags |= CEPH_READDIR_HASH_ORDER | CEPH_READDIR_OFFSET_HASH;
+ }
+
+ std::swap(mdr->snapid, mdr->snapid_diff_other); // we want opponent snapid to be used for tracei
+
+ _finalize_readdir(mdr, diri, dir, from_the_beginning, end, flags, numfiles,
+ dirbl, dnbl);
+}
+
+/*
+ * Walk the dentries of dir and report, through add_result_cb, those that
+ * differ between snapid_prev and snapid (callers pass snapid_prev <= snapid).
+ *
+ * @param mdr           request on whose behalf the walk is done
+ * @param dir           directory fragment to scan
+ * @param bytes_left    encoding budget; only used for logging here
+ * @param skip_key      if non-null, position to resume after; its snapid
+ *                      field is overwritten while scanning
+ * @param snapid_prev   older snapshot of the pair
+ * @param snapid        newer snapshot of the pair
+ * @param dnbl          reply buffer filled by the callback; only its length
+ *                      is inspected here
+ * @param add_result_cb called as (dentry, inode, exists); returns false when
+ *                      the entry could not be added and the scan must stop
+ * @return true only if every dentry of dir was visited and every reported
+ *         entry was accepted by the callback. false if the callback refused
+ *         an entry or if a remote inode had to be opened first (in which
+ *         case the request is either retried or the partial result is
+ *         returned).
+ */
+bool Server::build_snap_diff(
+  MDRequestRef& mdr,
+  CDir* dir,
+  int bytes_left,
+  dentry_key_t* skip_key,
+  snapid_t snapid_prev,
+  snapid_t snapid,
+  const bufferlist& dnbl,
+  std::function<bool (CDentry*, CInode*, bool)> add_result_cb)
+{
+  client_t client = mdr->client_request->get_source().num();
+
+  // Remembers a file dentry valid at snapid_prev until the following
+  // dentry shows whether the file was deleted or modified.
+  struct EntryInfo {
+    CDentry* dn = nullptr;
+    CInode* in = nullptr;
+    utime_t mtime;
+
+    void reset() {
+      *this = EntryInfo();
+    }
+  } before;
+
+  // Set when the callback refused an entry. The iterator is advanced before
+  // an entry is processed, so "it == dir->end()" alone cannot tell whether
+  // the last entry actually made it into the result.
+  bool truncated = false;
+
+  auto insert_deleted = [&](EntryInfo& ei) {
+    dout(20) << "build_snap_diff deleted file " << ei.dn->get_name() << " "
+             << ei.dn->first << "/" << ei.dn->last << dendl;
+    bool r = add_result_cb(ei.dn, ei.in, false);
+    ei.reset();
+    return r;
+  };
+
+  auto it = !skip_key ? dir->begin() : dir->lower_bound(*skip_key);
+
+  while(it != dir->end()) {
+    CDentry* dn = it->second;
+    dout(20) << __func__ << " " << it->first << "->" << *dn << dendl;
+    ++it;
+    if (dn->state_test(CDentry::STATE_PURGING))
+      continue;
+
+    bool dnp = dn->use_projected(client, mdr);
+    CDentry::linkage_t* dnl = dnp ? dn->get_projected_linkage() : dn->get_linkage();
+
+    if (dnl->is_null()) {
+      dout(20) << __func__ << " linkage is null, skipping" << dendl;
+      continue;
+    }
+
+    if (dn->last < snapid_prev || dn->first > snapid) {
+      dout(20) << __func__ << " not in range, skipping" << dendl;
+      continue;
+    }
+    if (skip_key) {
+      skip_key->snapid = dn->last;
+      if (!(*skip_key < dn->key()))
+        continue;
+    }
+
+    CInode* in = dnl->get_inode();
+    if (in && in->ino() == CEPH_INO_CEPH)
+      continue;
+
+    // remote link?
+    // better for the MDS to do the work, if we think the client will stat any of these files.
+    if (dnl->is_remote() && !in) {
+      in = mdcache->get_inode(dnl->get_remote_ino());
+      if (in) {
+        // only log the inode once we know the lookup succeeded
+        dout(20) << __func__ << " remote in: " << *in << " ino " << std::hex << dnl->get_remote_ino() << std::dec << dendl;
+        dn->link_remote(dnl, in);
+      } else if (dn->state_test(CDentry::STATE_BADREMOTEINO)) {
+        dout(10) << "skipping bad remote ino on " << *dn << dendl;
+        continue;
+      } else {
+        // touch everything i _do_ have
+        for (auto& p : *dir) {
+          if (!p.second->get_linkage()->is_null())
+            mdcache->lru.lru_touch(p.second);
+        }
+
+        // already issued caps and leases, reply immediately.
+        if (dnbl.length() > 0) {
+          mdcache->open_remote_dentry(dn, dnp, new C_MDSInternalNoop);
+          dout(10) << " open remote dentry after caps were issued, stopping at "
+                   << dnbl.length() << " < " << bytes_left << dendl;
+        } else {
+          mds->locker->drop_locks(mdr.get());
+          mdr->drop_local_auth_pins();
+          mdcache->open_remote_dentry(dn, dnp, new C_MDS_RetryRequest(mdcache, mdr));
+        }
+        return false;
+      }
+    }
+    ceph_assert(in);
+
+    utime_t mtime = in->get_inode()->mtime;
+
+    if (in->is_dir()) {
+
+      // we need to maintain the order of entries (determined by their name hashes)
+      // hence need to insert the previous entry if any immediately.
+      if (before.dn) {
+        if (!insert_deleted(before)) {
+          truncated = true;
+          break;
+        }
+      }
+
+      bool exists = true;
+      if (snapid_prev < dn->first && dn->last < snapid) {
+        dout(20) << __func__ << " skipping inner " << dn->get_name() << " "
+                 << dn->first << "/" << dn->last << dendl;
+        continue;
+      } else if (dn->first <= snapid_prev && dn->last < snapid) {
+        // dir deleted
+        dout(20) << __func__ << " deleted dir " << dn->get_name() << " "
+                 << dn->first << "/" << dn->last << dendl;
+        exists = false;
+      }
+      bool r = add_result_cb(dn, in, exists);
+      if (!r) {
+        truncated = true;
+        break;
+      }
+    } else {
+      if (snapid_prev >= dn->first && snapid <= dn->last) {
+        dout(20) << __func__ << " skipping unchanged " << dn->get_name() << " "
+                 << dn->first << "/" << dn->last << dendl;
+        continue;
+      } else if (snapid_prev < dn->first && snapid > dn->last) {
+        dout(20) << __func__ << " skipping inner modification " << dn->get_name() << " "
+                 << dn->first << "/" << dn->last << dendl;
+        continue;
+      }
+      string_view name_before =
+        before.dn ? string_view(before.dn->get_name()) : string_view();
+      // A pending "before" entry with a different name has no successor
+      // version, so it is reported as deleted.
+      if (before.dn && dn->get_name() != name_before) {
+        if (!insert_deleted(before)) {
+          truncated = true;
+          break;
+        }
+        before.reset();
+      }
+      if (snapid_prev >= dn->first && snapid_prev <= dn->last) {
+        dout(30) << __func__ << " dn_before " << dn->get_name() << " "
+                 << dn->first << "/" << dn->last << dendl;
+        before = EntryInfo {dn, in, mtime};
+        continue;
+      } else {
+        if (before.dn && dn->get_name() == name_before) {
+          if (mtime == before.mtime) {
+            dout(30) << __func__ << " timestamp not changed " << dn->get_name() << " "
+                     << dn->first << "/" << dn->last
+                     << " " << mtime
+                     << dendl;
+            before.reset();
+            continue;
+          } else {
+            dout(30) << __func__ << " timestamp changed " << dn->get_name() << " "
+                     << dn->first << "/" << dn->last
+                     << " " << before.mtime << " vs. " << mtime
+                     << dendl;
+            before.reset();
+          }
+        }
+        dout(20) << __func__ << " new file " << dn->get_name() << " "
+                 << dn->first << "/" << dn->last
+                 << dendl;
+        ceph_assert(snapid >= dn->first && snapid <= dn->last);
+      }
+      if (!add_result_cb(dn, in, true)) {
+        truncated = true;
+        break;
+      }
+    }
+  }
+  // A "before" entry still pending after the scan was deleted; if it does
+  // not fit either, the listing is not complete.
+  if (before.dn && !insert_deleted(before)) {
+    truncated = true;
+  }
+  return !truncated && it == dir->end();
+}